author     Ian Lance Taylor <iant@golang.org>    2020-01-02 15:05:27 -0800
committer  Ian Lance Taylor <iant@golang.org>    2020-01-21 23:53:22 -0800
commit     5a8ea165926cb0737ab03bc48c18dc5198ab5305 (patch)
tree       962dc3357c57f019f85658f99e2e753e30201c27 /libgo/go/runtime
parent     6ac6529e155c9baa0aaaed7aca06bd38ebda5b43 (diff)
libgo: update to Go1.14beta1
Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/214297
Diffstat (limited to 'libgo/go/runtime')
151 files changed, 12420 insertions, 3499 deletions
diff --git a/libgo/go/runtime/alg.go b/libgo/go/runtime/alg.go index e802fdd..101402c 100644 --- a/libgo/go/runtime/alg.go +++ b/libgo/go/runtime/alg.go @@ -281,6 +281,7 @@ func efaceeq(x, y eface) bool { } return eq(x.data, y.data) } + func ifaceeq(x, y iface) bool { xtab := x.tab if xtab == nil && y.tab == nil { @@ -463,7 +464,6 @@ var hashkey [4]uintptr func alginit() { // Install AES hash algorithms if the instructions needed are present. if (GOARCH == "386" || GOARCH == "amd64") && - GOOS != "nacl" && support_aes && cpu.X86.HasAES && // AESENC cpu.X86.HasSSSE3 && // PSHUFB @@ -488,7 +488,7 @@ func initAlgAES() { getRandomData(aeskeysched[:]) } -// Note: These routines perform the read with an native endianness. +// Note: These routines perform the read with a native endianness. func readUnaligned32(p unsafe.Pointer) uint32 { q := (*[4]byte)(p) if sys.BigEndian { diff --git a/libgo/go/runtime/callers_test.go b/libgo/go/runtime/callers_test.go index ad83f99..26a6f3a 100644 --- a/libgo/go/runtime/callers_test.go +++ b/libgo/go/runtime/callers_test.go @@ -5,25 +5,26 @@ package runtime_test import ( + "reflect" "runtime" "strings" "testing" ) func f1(pan bool) []uintptr { - return f2(pan) // line 14 + return f2(pan) // line 15 } func f2(pan bool) []uintptr { - return f3(pan) // line 18 + return f3(pan) // line 19 } func f3(pan bool) []uintptr { if pan { - panic("f3") // line 23 + panic("f3") // line 24 } ret := make([]uintptr, 20) - return ret[:runtime.Callers(0, ret)] // line 26 + return ret[:runtime.Callers(0, ret)] // line 27 } func testCallers(t *testing.T, pcs []uintptr, pan bool) { @@ -47,16 +48,16 @@ func testCallers(t *testing.T, pcs []uintptr, pan bool) { var f3Line int if pan { - f3Line = 23 + f3Line = 24 } else { - f3Line = 26 + f3Line = 27 } want := []struct { name string line int }{ - {"f1", 14}, - {"f2", 18}, + {"f1", 15}, + {"f2", 19}, {"f3", f3Line}, } for _, w := range want { @@ -66,11 +67,38 @@ func testCallers(t *testing.T, pcs []uintptr, pan bool) { } } +func testCallersEqual(t *testing.T, pcs []uintptr, want []string) { + got := make([]string, 0, len(want)) + + frames := runtime.CallersFrames(pcs) + for { + frame, more := frames.Next() + if !more || len(got) >= len(want) { + break + } + got = append(got, frame.Function) + } + if !reflect.DeepEqual(want, got) { + t.Fatalf("wanted %v, got %v", want, got) + } +} + func TestCallers(t *testing.T) { testCallers(t, f1(false), false) } func TestCallersPanic(t *testing.T) { + // Make sure we don't have any extra frames on the stack (due to + // open-coded defer processing) + want := []string{"runtime.Callers", "runtime_test.TestCallersPanic.func1", + "runtime.gopanic", "runtime_test.f3", "runtime_test.f2", "runtime_test.f1", + "runtime_test.TestCallersPanic"} + if runtime.Compiler == "gccgo" { + want = []string{"runtime.Callers", "runtime_test.TestCallersPanic..func1", + "runtime.gopanic", "runtime_test.f3", "runtime_test.f2", "runtime_test.f1", + "runtime_test.TestCallersPanic"} + } + defer func() { if r := recover(); r == nil { t.Fatal("did not panic") @@ -78,6 +106,240 @@ func TestCallersPanic(t *testing.T) { pcs := make([]uintptr, 20) pcs = pcs[:runtime.Callers(0, pcs)] testCallers(t, pcs, true) + testCallersEqual(t, pcs, want) }() f1(true) } + +func TestCallersDoublePanic(t *testing.T) { + // Make sure we don't have any extra frames on the stack (due to + // open-coded defer processing) + want := []string{"runtime.Callers", "runtime_test.TestCallersDoublePanic.func1.1", + "runtime.gopanic", 
"runtime_test.TestCallersDoublePanic.func1", "runtime.gopanic", "runtime_test.TestCallersDoublePanic"} + if runtime.Compiler == "gccgo" { + want = []string{"runtime.Callers", "runtime_test.TestCallersDoublePanic..func2", + "runtime.gopanic", "runtime_test.TestCallersDoublePanic..func1", "runtime.gopanic", "runtime_test.TestCallersDoublePanic"} + } + + defer func() { + defer func() { + pcs := make([]uintptr, 20) + pcs = pcs[:runtime.Callers(0, pcs)] + if recover() == nil { + t.Fatal("did not panic") + } + testCallersEqual(t, pcs, want) + }() + if recover() == nil { + t.Fatal("did not panic") + } + panic(2) + }() + panic(1) +} + +// Test that a defer after a successful recovery looks like it is called directly +// from the function with the defers. +func TestCallersAfterRecovery(t *testing.T) { + want := []string{"runtime.Callers", "runtime_test.TestCallersAfterRecovery.func1", "runtime_test.TestCallersAfterRecovery"} + if runtime.Compiler == "gccgo" { + want = []string{"runtime.Callers", "runtime_test.TestCallersAfterRecovery..func1", "runtime_test.TestCallersAfterRecovery"} + } + + defer func() { + pcs := make([]uintptr, 20) + pcs = pcs[:runtime.Callers(0, pcs)] + testCallersEqual(t, pcs, want) + }() + defer func() { + if recover() == nil { + t.Fatal("did not recover from panic") + } + }() + panic(1) +} + +func TestCallersAbortedPanic(t *testing.T) { + want := []string{"runtime.Callers", "runtime_test.TestCallersAbortedPanic.func2", "runtime_test.TestCallersAbortedPanic"} + if runtime.Compiler == "gccgo" { + want = []string{"runtime.Callers", "runtime_test.TestCallersAbortedPanic..func2", "runtime_test.TestCallersAbortedPanic"} + } + + defer func() { + r := recover() + if r != nil { + t.Fatalf("should be no panic remaining to recover") + } + }() + + defer func() { + // panic2 was aborted/replaced by panic1, so when panic2 was + // recovered, there is no remaining panic on the stack. 
+ pcs := make([]uintptr, 20) + pcs = pcs[:runtime.Callers(0, pcs)] + testCallersEqual(t, pcs, want) + }() + defer func() { + r := recover() + if r != "panic2" { + t.Fatalf("got %v, wanted %v", r, "panic2") + } + }() + defer func() { + // panic2 aborts/replaces panic1, because it is a recursive panic + // that is not recovered within the defer function called by + // panic1 panicking sequence + panic("panic2") + }() + panic("panic1") +} + +func TestCallersAbortedPanic2(t *testing.T) { + want := []string{"runtime.Callers", "runtime_test.TestCallersAbortedPanic2.func2", "runtime_test.TestCallersAbortedPanic2"} + if runtime.Compiler == "gccgo" { + want = []string{"runtime.Callers", "runtime_test.TestCallersAbortedPanic2..func2", "runtime_test.TestCallersAbortedPanic2"} + } + defer func() { + r := recover() + if r != nil { + t.Fatalf("should be no panic remaining to recover") + } + }() + defer func() { + pcs := make([]uintptr, 20) + pcs = pcs[:runtime.Callers(0, pcs)] + testCallersEqual(t, pcs, want) + }() + func() { + defer func() { + r := recover() + if r != "panic2" { + t.Fatalf("got %v, wanted %v", r, "panic2") + } + }() + func() { + defer func() { + // Again, panic2 aborts/replaces panic1 + panic("panic2") + }() + panic("panic1") + }() + }() +} + +func TestCallersNilPointerPanic(t *testing.T) { + // Make sure we don't have any extra frames on the stack (due to + // open-coded defer processing) + want := []string{"runtime.Callers", "runtime_test.TestCallersNilPointerPanic.func1", + "runtime.gopanic", "runtime.panicmem", "runtime.sigpanic", + "runtime_test.TestCallersNilPointerPanic"} + if runtime.Compiler == "gccgo" { + want = []string{"runtime.Callers", "runtime_test.TestCallersNilPointerPanic..func1", + "runtime.gopanic", "runtime.panicmem", "runtime.sigpanic", + "runtime_test.TestCallersNilPointerPanic"} + } + + defer func() { + if r := recover(); r == nil { + t.Fatal("did not panic") + } + pcs := make([]uintptr, 20) + pcs = pcs[:runtime.Callers(0, pcs)] + testCallersEqual(t, pcs, want) + }() + var p *int + if *p == 3 { + t.Fatal("did not see nil pointer panic") + } +} + +func TestCallersDivZeroPanic(t *testing.T) { + // Make sure we don't have any extra frames on the stack (due to + // open-coded defer processing) + want := []string{"runtime.Callers", "runtime_test.TestCallersDivZeroPanic.func1", + "runtime.gopanic", "runtime.panicdivide", + "runtime_test.TestCallersDivZeroPanic"} + if runtime.Compiler == "gccgo" { + want = []string{"runtime.Callers", "runtime_test.TestCallersDivZeroPanic..func1", + "runtime.gopanic", "runtime.panicdivide", + "runtime_test.TestCallersDivZeroPanic"} + } + + defer func() { + if r := recover(); r == nil { + t.Fatal("did not panic") + } + pcs := make([]uintptr, 20) + pcs = pcs[:runtime.Callers(0, pcs)] + testCallersEqual(t, pcs, want) + }() + var n int + if 5/n == 1 { + t.Fatal("did not see divide-by-sizer panic") + } +} + +func TestCallersDeferNilFuncPanic(t *testing.T) { + // Make sure we don't have any extra frames on the stack. We cut off the check + // at runtime.sigpanic, because non-open-coded defers (which may be used in + // non-opt or race checker mode) include an extra 'deferreturn' frame (which is + // where the nil pointer deref happens). 
+ state := 1 + want := []string{"runtime.Callers", "runtime_test.TestCallersDeferNilFuncPanic.func1", + "runtime.gopanic", "runtime.panicmem", "runtime.sigpanic"} + if runtime.Compiler == "gccgo" { + want = []string{"runtime.Callers", "runtime_test.TestCallersDeferNilFuncPanic..func1", + "runtime.gopanic", "runtime.panicmem", "runtime.sigpanic"} + } + + defer func() { + if r := recover(); r == nil { + t.Fatal("did not panic") + } + pcs := make([]uintptr, 20) + pcs = pcs[:runtime.Callers(0, pcs)] + testCallersEqual(t, pcs, want) + if state == 1 { + t.Fatal("nil defer func panicked at defer time rather than function exit time") + } + + }() + var f func() + defer f() + // Use the value of 'state' to make sure nil defer func f causes panic at + // function exit, rather than at the defer statement. + state = 2 +} + +// Same test, but forcing non-open-coded defer by putting the defer in a loop. See +// issue #36050 +func TestCallersDeferNilFuncPanicWithLoop(t *testing.T) { + state := 1 + want := []string{"runtime.Callers", "runtime_test.TestCallersDeferNilFuncPanicWithLoop.func1", + "runtime.gopanic", "runtime.panicmem", "runtime.sigpanic", "runtime.deferreturn", "runtime_test.TestCallersDeferNilFuncPanicWithLoop"} + if runtime.Compiler == "gccgo" { + want = []string{"runtime.Callers", "runtime_test.TestCallersDeferNilFuncPanicWithLoop..func1", + "runtime.gopanic", "runtime.panicmem", "runtime.sigpanic", "runtime_test.TestCallersDeferNilFuncPanicWithLoop"} + } + + defer func() { + if r := recover(); r == nil { + t.Fatal("did not panic") + } + pcs := make([]uintptr, 20) + pcs = pcs[:runtime.Callers(0, pcs)] + testCallersEqual(t, pcs, want) + if state == 1 { + t.Fatal("nil defer func panicked at defer time rather than function exit time") + } + + }() + + for i := 0; i < 1; i++ { + var f func() + defer f() + } + // Use the value of 'state' to make sure nil defer func f causes panic at + // function exit, rather than at the defer statement. + state = 2 +} diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go index 587001c..efd0e24 100644 --- a/libgo/go/runtime/cgocall.go +++ b/libgo/go/runtime/cgocall.go @@ -47,24 +47,24 @@ import ( // cgoCheckPointer checks if the argument contains a Go pointer that // points to a Go pointer, and panics if it does. -func cgoCheckPointer(ptr interface{}, args ...interface{}) { +func cgoCheckPointer(ptr interface{}, arg interface{}) { if debug.cgocheck == 0 { return } - ep := (*eface)(unsafe.Pointer(&ptr)) + ep := efaceOf(&ptr) t := ep._type top := true - if len(args) > 0 && (t.kind&kindMask == kindPtr || t.kind&kindMask == kindUnsafePointer) { + if arg != nil && (t.kind&kindMask == kindPtr || t.kind&kindMask == kindUnsafePointer) { p := ep.data if t.kind&kindDirectIface == 0 { p = *(*unsafe.Pointer)(p) } - if !cgoIsGoPointer(p) { + if p == nil || !cgoIsGoPointer(p) { return } - aep := (*eface)(unsafe.Pointer(&args[0])) + aep := efaceOf(&arg) switch aep._type.kind & kindMask { case kindBool: if t.kind&kindMask == kindUnsafePointer { @@ -101,7 +101,7 @@ const cgoResultFail = "cgo result has Go pointer" // depending on indir. The top parameter is whether we are at the top // level, where Go pointers are allowed. func cgoCheckArg(t *_type, p unsafe.Pointer, indir, top bool, msg string) { - if t.ptrdata == 0 { + if t.ptrdata == 0 || p == nil { // If the type has no pointers there is nothing to do. 
return } @@ -158,7 +158,7 @@ func cgoCheckArg(t *_type, p unsafe.Pointer, indir, top bool, msg string) { st := (*slicetype)(unsafe.Pointer(t)) s := (*slice)(p) p = s.array - if !cgoIsGoPointer(p) { + if p == nil || !cgoIsGoPointer(p) { return } if !top { @@ -189,11 +189,17 @@ func cgoCheckArg(t *_type, p unsafe.Pointer, indir, top bool, msg string) { return } for _, f := range st.fields { + if f.typ.ptrdata == 0 { + continue + } cgoCheckArg(f.typ, add(p, f.offset()), true, top, msg) } case kindPtr, kindUnsafePointer: if indir { p = *(*unsafe.Pointer)(p) + if p == nil { + return + } } if !cgoIsGoPointer(p) { @@ -298,7 +304,7 @@ func cgoCheckResult(val interface{}) { return } - ep := (*eface)(unsafe.Pointer(&val)) + ep := efaceOf(&val) t := ep._type cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, false, cgoResultFail) } diff --git a/libgo/go/runtime/cgocheck.go b/libgo/go/runtime/cgocheck.go index 130db29..c03bafe 100644 --- a/libgo/go/runtime/cgocheck.go +++ b/libgo/go/runtime/cgocheck.go @@ -134,7 +134,7 @@ func cgoCheckTypedBlock(typ *_type, src unsafe.Pointer, off, size uintptr) { } s := spanOfUnchecked(uintptr(src)) - if s.state == mSpanManual { + if s.state.get() == mSpanManual { // There are no heap bits for value stored on the stack. // For a channel receive src might be on the stack of some // other goroutine, so we can't unwind the stack even if diff --git a/libgo/go/runtime/chan.go b/libgo/go/runtime/chan.go index 291fe00..549e566 100644 --- a/libgo/go/runtime/chan.go +++ b/libgo/go/runtime/chan.go @@ -133,6 +133,21 @@ func chanbuf(c *hchan, i uint) unsafe.Pointer { return add(c.buf, uintptr(i)*uintptr(c.elemsize)) } +// full reports whether a send on c would block (that is, the channel is full). +// It uses a single word-sized read of mutable state, so although +// the answer is instantaneously true, the correct answer may have changed +// by the time the calling function receives the return value. +func full(c *hchan) bool { + // c.dataqsiz is immutable (never written after the channel is created) + // so it is safe to read at any time during channel operation. + if c.dataqsiz == 0 { + // Assumes that a pointer read is relaxed-atomic. + return c.recvq.first == nil + } + // Assumes that a uint read is relaxed-atomic. + return c.qcount == c.dataqsiz +} + // entry point for c <- x from compiled code //go:nosplit func chansend1(c *hchan, elem unsafe.Pointer) { @@ -177,7 +192,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { // // After observing that the channel is not closed, we observe that the channel is // not ready for sending. Each of these observations is a single word-sized read - // (first c.closed and second c.recvq.first or c.qcount depending on kind of channel). + // (first c.closed and second full()). // Because a closed channel cannot transition from 'ready for sending' to // 'not ready for sending', even if the channel is closed between the two observations, // they imply a moment between the two when the channel was both not yet closed @@ -186,9 +201,10 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { // // It is okay if the reads are reordered here: if we observe that the channel is not // ready for sending and then observe that it is not closed, that implies that the - // channel wasn't closed during the first observation. 
- if !block && c.closed == 0 && ((c.dataqsiz == 0 && c.recvq.first == nil) || - (c.dataqsiz > 0 && c.qcount == c.dataqsiz)) { + // channel wasn't closed during the first observation. However, nothing here + // guarantees forward progress. We rely on the side effects of lock release in + // chanrecv() and closechan() to update this thread's view of c.closed and full(). + if !block && c.closed == 0 && full(c) { return false } @@ -250,7 +266,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { gp.waiting = mysg gp.param = nil c.sendq.enqueue(mysg) - goparkunlock(&c.lock, waitReasonChanSend, traceEvGoBlockSend, 3) + gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanSend, traceEvGoBlockSend, 2) // Ensure the value being sent is kept alive until the // receiver copies it out. The sudog has a pointer to the // stack object, but sudogs aren't considered as roots of the @@ -262,6 +278,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { throw("G waiting list is corrupted") } gp.waiting = nil + gp.activeStackChans = false if gp.param == nil { if c.closed == 0 { throw("chansend: spurious wakeup") @@ -417,6 +434,16 @@ func closechan(c *hchan) { } } +// empty reports whether a read from c would block (that is, the channel is +// empty). It uses a single atomic read of mutable state. +func empty(c *hchan) bool { + // c.dataqsiz is immutable. + if c.dataqsiz == 0 { + return atomic.Loadp(unsafe.Pointer(&c.sendq.first)) == nil + } + return atomic.Loaduint(&c.qcount) == 0 +} + // entry points for <- c from compiled code //go:nosplit func chanrecv1(c *hchan, elem unsafe.Pointer) { @@ -457,21 +484,33 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) } // Fast path: check for failed non-blocking operation without acquiring the lock. - // - // After observing that the channel is not ready for receiving, we observe that the - // channel is not closed. Each of these observations is a single word-sized read - // (first c.sendq.first or c.qcount, and second c.closed). - // Because a channel cannot be reopened, the later observation of the channel - // being not closed implies that it was also not closed at the moment of the - // first observation. We behave as if we observed the channel at that moment - // and report that the receive cannot proceed. - // - // The order of operations is important here: reversing the operations can lead to - // incorrect behavior when racing with a close. - if !block && (c.dataqsiz == 0 && c.sendq.first == nil || - c.dataqsiz > 0 && atomic.Loaduint(&c.qcount) == 0) && - atomic.Load(&c.closed) == 0 { - return + if !block && empty(c) { + // After observing that the channel is not ready for receiving, we observe whether the + // channel is closed. + // + // Reordering of these checks could lead to incorrect behavior when racing with a close. + // For example, if the channel was open and not empty, was closed, and then drained, + // reordered reads could incorrectly indicate "open and empty". To prevent reordering, + // we use atomic loads for both checks, and rely on emptying and closing to happen in + // separate critical sections under the same lock. This assumption fails when closing + // an unbuffered channel with a blocked send, but that is an error condition anyway. 
+ if atomic.Load(&c.closed) == 0 { + // Because a channel cannot be reopened, the later observation of the channel + // being not closed implies that it was also not closed at the moment of the + // first observation. We behave as if we observed the channel at that moment + // and report that the receive cannot proceed. + return + } + // The channel is irreversibly closed. Re-check whether the channel has any pending data + // to receive, which could have arrived between the empty and closed checks above. + // Sequential consistency is also required here, when racing with such a send. + if empty(c) { + // The channel is irreversibly closed and empty. + if ep != nil { + typedmemclr(c.elemtype, ep) + } + return true, false + } } var t0 int64 @@ -543,13 +582,14 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) mysg.c = c gp.param = nil c.recvq.enqueue(mysg) - goparkunlock(&c.lock, waitReasonChanReceive, traceEvGoBlockRecv, 3) + gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanReceive, traceEvGoBlockRecv, 2) // someone woke us up if mysg != gp.waiting { throw("G waiting list is corrupted") } gp.waiting = nil + gp.activeStackChans = false if mysg.releasetime > 0 { blockevent(mysg.releasetime-t0, 2) } @@ -616,6 +656,14 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { goready(gp, skip+1) } +func chanparkcommit(gp *g, chanLock unsafe.Pointer) bool { + // There are unlocked sudogs that point into gp's stack. Stack + // copying must lock the channels of those sudogs. + gp.activeStackChans = true + unlock((*mutex)(chanLock)) + return true +} + // compiler implements // // select { diff --git a/libgo/go/runtime/chan_test.go b/libgo/go/runtime/chan_test.go index 29fb321..ac81d40 100644 --- a/libgo/go/runtime/chan_test.go +++ b/libgo/go/runtime/chan_test.go @@ -485,11 +485,11 @@ func TestSelectFairness(t *testing.T) { // If the select in the goroutine is fair, // cnt1 and cnt2 should be about the same value. // With 10,000 trials, the expected margin of error at - // a confidence level of five nines is 4.4172 / (2 * Sqrt(10000)). + // a confidence level of six nines is 4.891676 / (2 * Sqrt(10000)). r := float64(cnt1) / trials e := math.Abs(r - 0.5) t.Log(cnt1, cnt2, r, e) - if e > 4.4172/(2*math.Sqrt(trials)) { + if e > 4.891676/(2*math.Sqrt(trials)) { t.Errorf("unfair select: in %d trials, results were %d, %d", trials, cnt1, cnt2) } close(done) @@ -724,6 +724,7 @@ func TestSelectStackAdjust(t *testing.T) { if after.NumGC-before.NumGC >= 2 { goto done } + runtime.Gosched() } t.Fatal("failed to trigger concurrent GC") done: @@ -1131,6 +1132,20 @@ func BenchmarkChanPopular(b *testing.B) { wg.Wait() } +func BenchmarkChanClosed(b *testing.B) { + c := make(chan struct{}) + close(c) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + select { + case <-c: + default: + b.Error("Unreachable") + } + } + }) +} + var ( alwaysFalse = false workSink = 0 diff --git a/libgo/go/runtime/checkptr.go b/libgo/go/runtime/checkptr.go new file mode 100644 index 0000000..f478ddd --- /dev/null +++ b/libgo/go/runtime/checkptr.go @@ -0,0 +1,106 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build ignore + +package runtime + +import "unsafe" + +type ptrAlignError struct { + ptr unsafe.Pointer + elem *_type + n uintptr +} + +func (e ptrAlignError) RuntimeError() {} + +func (e ptrAlignError) Error() string { + return "runtime error: unsafe pointer conversion" +} + +func checkptrAlignment(p unsafe.Pointer, elem *_type, n uintptr) { + // Check that (*[n]elem)(p) is appropriately aligned. + // TODO(mdempsky): What about fieldAlign? + if uintptr(p)&(uintptr(elem.align)-1) != 0 { + panic(ptrAlignError{p, elem, n}) + } + + // Check that (*[n]elem)(p) doesn't straddle multiple heap objects. + if size := n * elem.size; size > 1 && checkptrBase(p) != checkptrBase(add(p, size-1)) { + panic(ptrAlignError{p, elem, n}) + } +} + +type ptrArithError struct { + ptr unsafe.Pointer + originals []unsafe.Pointer +} + +func (e ptrArithError) RuntimeError() {} + +func (e ptrArithError) Error() string { + return "runtime error: unsafe pointer arithmetic" +} + +func checkptrArithmetic(p unsafe.Pointer, originals []unsafe.Pointer) { + if 0 < uintptr(p) && uintptr(p) < minLegalPointer { + panic(ptrArithError{p, originals}) + } + + // Check that if the computed pointer p points into a heap + // object, then one of the original pointers must have pointed + // into the same object. + base := checkptrBase(p) + if base == 0 { + return + } + + for _, original := range originals { + if base == checkptrBase(original) { + return + } + } + + panic(ptrArithError{p, originals}) +} + +// checkptrBase returns the base address for the allocation containing +// the address p. +// +// Importantly, if p1 and p2 point into the same variable, then +// checkptrBase(p1) == checkptrBase(p2). However, the converse/inverse +// is not necessarily true as allocations can have trailing padding, +// and multiple variables may be packed into a single allocation. +func checkptrBase(p unsafe.Pointer) uintptr { + // stack + if gp := getg(); gp.stack.lo <= uintptr(p) && uintptr(p) < gp.stack.hi { + // TODO(mdempsky): Walk the stack to identify the + // specific stack frame or even stack object that p + // points into. + // + // In the mean time, use "1" as a pseudo-address to + // represent the stack. This is an invalid address on + // all platforms, so it's guaranteed to be distinct + // from any of the addresses we might return below. + return 1 + } + + // heap (must check after stack because of #35068) + if base, _, _ := findObject(uintptr(p), 0, 0); base != 0 { + return base + } + + // data or bss + for _, datap := range activeModules() { + if datap.data <= uintptr(p) && uintptr(p) < datap.edata { + return datap.data + } + if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss { + return datap.bss + } + } + + return 0 +} diff --git a/libgo/go/runtime/crash_nonunix_test.go b/libgo/go/runtime/crash_nonunix_test.go index bf349a5..06c197e 100644 --- a/libgo/go/runtime/crash_nonunix_test.go +++ b/libgo/go/runtime/crash_nonunix_test.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-// +build windows plan9 nacl js,wasm +// +build windows plan9 js,wasm package runtime_test diff --git a/libgo/go/runtime/crash_test.go b/libgo/go/runtime/crash_test.go index 6343e8e..6268f2e 100644 --- a/libgo/go/runtime/crash_test.go +++ b/libgo/go/runtime/crash_test.go @@ -104,8 +104,6 @@ func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) t.Skip("-quick") } - checkStaleRuntime(t) - testprog.Lock() defer testprog.Unlock() if testprog.dir == "" { @@ -143,34 +141,12 @@ func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) return exe, nil } -var ( - staleRuntimeOnce sync.Once // guards init of staleRuntimeErr - staleRuntimeErr error -) - -func checkStaleRuntime(t *testing.T) { - staleRuntimeOnce.Do(func() { - if runtime.Compiler == "gccgo" { - return - } - // 'go run' uses the installed copy of runtime.a, which may be out of date. - out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.Stale}}", "runtime")).CombinedOutput() - if err != nil { - staleRuntimeErr = fmt.Errorf("failed to execute 'go list': %v\n%v", err, string(out)) - return - } - if string(out) != "false\n" { - t.Logf("go list -f {{.Stale}} runtime:\n%s", out) - out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.StaleReason}}", "runtime")).CombinedOutput() - if err != nil { - t.Logf("go list -f {{.StaleReason}} failed: %v", err) - } - t.Logf("go list -f {{.StaleReason}} runtime:\n%s", out) - staleRuntimeErr = fmt.Errorf("Stale runtime.a. Run 'go install runtime'.") - } - }) - if staleRuntimeErr != nil { - t.Fatal(staleRuntimeErr) +func TestVDSO(t *testing.T) { + t.Parallel() + output := runTestProg(t, "testprog", "SignalInVDSO") + want := "success\n" + if output != want { + t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want) } } @@ -231,9 +207,23 @@ func TestStackOverflow(t *testing.T) { t.Skip("gccgo does not do stack overflow checking") } output := runTestProg(t, "testprog", "StackOverflow") - want := "runtime: goroutine stack exceeds 1474560-byte limit\nfatal error: stack overflow" - if !strings.HasPrefix(output, want) { - t.Fatalf("output does not start with %q:\n%s", want, output) + want := []string{ + "runtime: goroutine stack exceeds 1474560-byte limit\n", + "fatal error: stack overflow", + // information about the current SP and stack bounds + "runtime: sp=", + "stack=[", + } + if !strings.HasPrefix(output, want[0]) { + t.Errorf("output does not start with %q", want[0]) + } + for _, s := range want[1:] { + if !strings.Contains(output, s) { + t.Errorf("output does not contain %q", s) + } + } + if t.Failed() { + t.Logf("output:\n%s", output) } } @@ -257,6 +247,41 @@ panic: again } +func TestRecursivePanic2(t *testing.T) { + output := runTestProg(t, "testprog", "RecursivePanic2") + want := `first panic +second panic +panic: third panic + +` + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } + +} + +func TestRecursivePanic3(t *testing.T) { + output := runTestProg(t, "testprog", "RecursivePanic3") + want := `panic: first panic + +` + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } + +} + +func TestRecursivePanic4(t *testing.T) { + output := runTestProg(t, "testprog", "RecursivePanic4") + want := `panic: first panic [recovered] + panic: second panic +` + if !strings.HasPrefix(output, want) { + t.Fatalf("output 
does not start with %q:\n%s", want, output) + } + +} + func TestGoexitCrash(t *testing.T) { output := runTestProg(t, "testprog", "GoexitExit") want := "no goroutines (main called runtime.Goexit) - deadlock!" @@ -389,26 +414,32 @@ func TestRecoveredPanicAfterGoexit(t *testing.T) { } func TestRecoverBeforePanicAfterGoexit(t *testing.T) { - // 1. defer a function that recovers - // 2. defer a function that panics - // 3. call goexit - // Goexit should run the #2 defer. Its panic - // should be caught by the #1 defer, and execution - // should resume in the caller. Like the Goexit - // never happened! - defer func() { - r := recover() - if r == nil { - panic("bad recover") - } - }() - defer func() { - panic("hello") - }() - runtime.Goexit() + t.Parallel() + output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit") + want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + +func TestRecoverBeforePanicAfterGoexit2(t *testing.T) { + t.Parallel() + output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit2") + want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } } func TestNetpollDeadlock(t *testing.T) { + if os.Getenv("GO_BUILDER_NAME") == "darwin-amd64-10_12" { + // A suspected kernel bug in macOS 10.12 occasionally results in + // an apparent deadlock when dialing localhost. The errors have not + // been observed on newer versions of the OS, so we don't plan to work + // around them. See https://golang.org/issue/22019. + testenv.SkipFlaky(t, 22019) + } + t.Parallel() output := runTestProg(t, "testprognet", "NetpollDeadlock") want := "done\n" @@ -420,7 +451,7 @@ func TestNetpollDeadlock(t *testing.T) { func TestPanicTraceback(t *testing.T) { t.Parallel() output := runTestProg(t, "testprog", "PanicTraceback") - want := "panic: hello" + want := "panic: hello\n\tpanic: panic pt2\n\tpanic: panic pt1\n" if !strings.HasPrefix(output, want) { t.Fatalf("output does not start with %q:\n%s", want, output) } diff --git a/libgo/go/runtime/crash_unix_test.go b/libgo/go/runtime/crash_unix_test.go index b4b015e..7ce5bb2 100644 --- a/libgo/go/runtime/crash_unix_test.go +++ b/libgo/go/runtime/crash_unix_test.go @@ -15,9 +15,11 @@ import ( "os/exec" "path/filepath" "runtime" - "strings" + "sync" "syscall" "testing" + "time" + "unsafe" ) // sigquit is the signal to send to kill a hanging testdata program. @@ -33,6 +35,29 @@ func init() { } } +func TestBadOpen(t *testing.T) { + // make sure we get the correct error code if open fails. Same for + // read/write/close on the resulting -1 fd. See issue 10052. 
+ nonfile := []byte("/notreallyafile") + fd := runtime.Open(&nonfile[0], 0, 0) + if fd != -1 { + t.Errorf("open(%q)=%d, want -1", nonfile, fd) + } + var buf [32]byte + r := runtime.Read(-1, unsafe.Pointer(&buf[0]), int32(len(buf))) + if got, want := r, -int32(syscall.EBADF); got != want { + t.Errorf("read()=%d, want %d", got, want) + } + w := runtime.Write(^uintptr(0), unsafe.Pointer(&buf[0]), int32(len(buf))) + if got, want := w, -int32(syscall.EBADF); got != want { + t.Errorf("write()=%d, want %d", got, want) + } + c := runtime.Close(-1) + if c != -1 { + t.Errorf("close()=%d, want -1", c) + } +} + func TestCrashDumpsAllThreads(t *testing.T) { if *flagQuick { t.Skip("-quick") @@ -53,8 +78,6 @@ func TestCrashDumpsAllThreads(t *testing.T) { testenv.MustHaveGoBuild(t) - checkStaleRuntime(t) - t.Parallel() dir, err := ioutil.TempDir("", "go-build") @@ -76,18 +99,17 @@ func TestCrashDumpsAllThreads(t *testing.T) { cmd = exec.Command(filepath.Join(dir, "a.exe")) cmd = testenv.CleanCmdEnv(cmd) - cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") - - // Set GOGC=off. Because of golang.org/issue/10958, the tight - // loops in the test program are not preemptible. If GC kicks - // in, it may lock up and prevent main from saying it's ready. - newEnv := []string{} - for _, s := range cmd.Env { - if !strings.HasPrefix(s, "GOGC=") { - newEnv = append(newEnv, s) - } - } - cmd.Env = append(newEnv, "GOGC=off") + cmd.Env = append(cmd.Env, + "GOTRACEBACK=crash", + // Set GOGC=off. Because of golang.org/issue/10958, the tight + // loops in the test program are not preemptible. If GC kicks + // in, it may lock up and prevent main from saying it's ready. + "GOGC=off", + // Set GODEBUG=asyncpreemptoff=1. If a thread is preempted + // when it receives SIGQUIT, it won't show the expected + // stack trace. See issue 35356. + "GODEBUG=asyncpreemptoff=1", + ) var outbuf bytes.Buffer cmd.Stdout = &outbuf @@ -288,3 +310,51 @@ func TestSignalDuringExec(t *testing.T) { t.Fatalf("want %s, got %s\n", want, output) } } + +func TestSignalM(t *testing.T) { + if runtime.Compiler == "gccgo" { + t.Skip("no signalM for gccgo") + } + + r, w, errno := runtime.Pipe() + if errno != 0 { + t.Fatal(syscall.Errno(errno)) + } + defer func() { + runtime.Close(r) + runtime.Close(w) + }() + runtime.Closeonexec(r) + runtime.Closeonexec(w) + + var want, got int64 + var wg sync.WaitGroup + ready := make(chan *runtime.M) + wg.Add(1) + go func() { + runtime.LockOSThread() + want, got = runtime.WaitForSigusr1(r, w, func(mp *runtime.M) { + ready <- mp + }) + runtime.UnlockOSThread() + wg.Done() + }() + waitingM := <-ready + runtime.SendSigusr1(waitingM) + + timer := time.AfterFunc(time.Second, func() { + // Write 1 to tell WaitForSigusr1 that we timed out. 
+ bw := byte(1) + if n := runtime.Write(uintptr(w), unsafe.Pointer(&bw), 1); n != 1 { + t.Errorf("pipe write failed: %d", n) + } + }) + defer timer.Stop() + + wg.Wait() + if got == -1 { + t.Fatal("signalM signal not received") + } else if want != got { + t.Fatalf("signal sent to M %d, but received on M %d", want, got) + } +} diff --git a/libgo/go/runtime/debug.go b/libgo/go/runtime/debug.go index e480466..1202e36 100644 --- a/libgo/go/runtime/debug.go +++ b/libgo/go/runtime/debug.go @@ -26,12 +26,12 @@ func GOMAXPROCS(n int) int { return ret } - stopTheWorld("GOMAXPROCS") + stopTheWorldGC("GOMAXPROCS") // newprocs will be processed by startTheWorld newprocs = int32(n) - startTheWorld() + startTheWorldGC() return ret } diff --git a/libgo/go/runtime/debug/heapdump_test.go b/libgo/go/runtime/debug/heapdump_test.go index c986efc..de1ec27 100644 --- a/libgo/go/runtime/debug/heapdump_test.go +++ b/libgo/go/runtime/debug/heapdump_test.go @@ -13,7 +13,7 @@ import ( ) func TestWriteHeapDumpNonempty(t *testing.T) { - if runtime.GOOS == "nacl" || runtime.GOOS == "js" { + if runtime.GOOS == "js" { t.Skipf("WriteHeapDump is not available on %s.", runtime.GOOS) } f, err := ioutil.TempFile("", "heapdumptest") @@ -42,7 +42,7 @@ func objfin(x *Obj) { } func TestWriteHeapDumpFinalizers(t *testing.T) { - if runtime.GOOS == "nacl" || runtime.GOOS == "js" { + if runtime.GOOS == "js" { t.Skipf("WriteHeapDump is not available on %s.", runtime.GOOS) } f, err := ioutil.TempFile("", "heapdumptest") diff --git a/libgo/go/runtime/debug/mod.go b/libgo/go/runtime/debug/mod.go index 58c6ae0..c283928 100644 --- a/libgo/go/runtime/debug/mod.go +++ b/libgo/go/runtime/debug/mod.go @@ -23,7 +23,7 @@ func ReadBuildInfo() (info *BuildInfo, ok bool) { // the running binary. type BuildInfo struct { Path string // The main package path - Main Module // The main module information + Main Module // The module containing the main package Deps []*Module // Module dependencies } diff --git a/libgo/go/runtime/debug_test.go b/libgo/go/runtime/debug_test.go index 12d93de..967477d 100644 --- a/libgo/go/runtime/debug_test.go +++ b/libgo/go/runtime/debug_test.go @@ -127,7 +127,7 @@ func TestDebugCall(t *testing.T) { return x + 1 } args.x = 42 - if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill); err != nil { + if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill, false); err != nil { t.Fatal(err) } if args.yRet != 43 { @@ -156,7 +156,7 @@ func TestDebugCallLarge(t *testing.T) { args.in[i] = i want[i] = i + 1 } - if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill); err != nil { + if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill, false); err != nil { t.Fatal(err) } if want != args.out { @@ -169,7 +169,7 @@ func TestDebugCallGC(t *testing.T) { defer after() // Inject a call that performs a GC. - if _, err := runtime.InjectDebugCall(g, runtime.GC, nil, debugCallTKill); err != nil { + if _, err := runtime.InjectDebugCall(g, runtime.GC, nil, debugCallTKill, false); err != nil { t.Fatal(err) } } @@ -180,7 +180,7 @@ func TestDebugCallGrowStack(t *testing.T) { // Inject a call that grows the stack. debugCallWorker checks // for stack pointer breakage. 
- if _, err := runtime.InjectDebugCall(g, func() { growStack(nil) }, nil, debugCallTKill); err != nil { + if _, err := runtime.InjectDebugCall(g, func() { growStack(nil) }, nil, debugCallTKill, false); err != nil { t.Fatal(err) } } @@ -216,7 +216,7 @@ func TestDebugCallUnsafePoint(t *testing.T) { runtime.Gosched() } - _, err := runtime.InjectDebugCall(g, func() {}, nil, debugCallTKill) + _, err := runtime.InjectDebugCall(g, func() {}, nil, debugCallTKill, true) if msg := "call not at safe point"; err == nil || err.Error() != msg { t.Fatalf("want %q, got %s", msg, err) } @@ -240,7 +240,7 @@ func TestDebugCallPanic(t *testing.T) { }() g := <-ready - p, err := runtime.InjectDebugCall(g, func() { panic("test") }, nil, debugCallTKill) + p, err := runtime.InjectDebugCall(g, func() { panic("test") }, nil, debugCallTKill, false) if err != nil { t.Fatal(err) } diff --git a/libgo/go/runtime/debuglog.go b/libgo/go/runtime/debuglog.go index 4f4109f..404d057 100644 --- a/libgo/go/runtime/debuglog.go +++ b/libgo/go/runtime/debuglog.go @@ -803,7 +803,7 @@ func printDebugLog() { func printDebugLogPC(pc uintptr) { print(hex(pc)) - name, file, line, _ := funcfileline(pc, -1) + name, file, line, _ := funcfileline(pc, -1, false) if name == "" { print(" [unknown PC]") } else { diff --git a/libgo/go/runtime/defer_test.go b/libgo/go/runtime/defer_test.go new file mode 100644 index 0000000..3d8f812 --- /dev/null +++ b/libgo/go/runtime/defer_test.go @@ -0,0 +1,283 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "fmt" + "reflect" + "runtime" + "testing" +) + +// Make sure open-coded defer exit code is not lost, even when there is an +// unconditional panic (hence no return from the function) +func TestUnconditionalPanic(t *testing.T) { + defer func() { + if recover() != "testUnconditional" { + t.Fatal("expected unconditional panic") + } + }() + panic("testUnconditional") +} + +var glob int = 3 + +// Test an open-coded defer and non-open-coded defer - make sure both defers run +// and call recover() +func TestOpenAndNonOpenDefers(t *testing.T) { + for { + // Non-open defer because in a loop + defer func(n int) { + if recover() != "testNonOpenDefer" { + t.Fatal("expected testNonOpen panic") + } + }(3) + if glob > 2 { + break + } + } + testOpen(t, 47) + panic("testNonOpenDefer") +} + +//go:noinline +func testOpen(t *testing.T, arg int) { + defer func(n int) { + if recover() != "testOpenDefer" { + t.Fatal("expected testOpen panic") + } + }(4) + if arg > 2 { + panic("testOpenDefer") + } +} + +// Test a non-open-coded defer and an open-coded defer - make sure both defers run +// and call recover() +func TestNonOpenAndOpenDefers(t *testing.T) { + testOpen(t, 47) + for { + // Non-open defer because in a loop + defer func(n int) { + if recover() != "testNonOpenDefer" { + t.Fatal("expected testNonOpen panic") + } + }(3) + if glob > 2 { + break + } + } + panic("testNonOpenDefer") +} + +var list []int + +// Make sure that conditional open-coded defers are activated correctly and run in +// the correct order. 
+func TestConditionalDefers(t *testing.T) { + list = make([]int, 0, 10) + + defer func() { + if recover() != "testConditional" { + t.Fatal("expected panic") + } + want := []int{4, 2, 1} + if !reflect.DeepEqual(want, list) { + t.Fatal(fmt.Sprintf("wanted %v, got %v", want, list)) + } + + }() + testConditionalDefers(8) +} + +func testConditionalDefers(n int) { + doappend := func(i int) { + list = append(list, i) + } + + defer doappend(1) + if n > 5 { + defer doappend(2) + if n > 8 { + defer doappend(3) + } else { + defer doappend(4) + } + } + panic("testConditional") +} + +// Test that there is no compile-time or run-time error if an open-coded defer +// call is removed by constant propagation and dead-code elimination. +func TestDisappearingDefer(t *testing.T) { + switch runtime.GOOS { + case "invalidOS": + defer func() { + t.Fatal("Defer shouldn't run") + }() + } +} + +// This tests an extra recursive panic behavior that is only specified in the +// code. Suppose a first panic P1 happens and starts processing defer calls. If a +// second panic P2 happens while processing defer call D in frame F, then defer +// call processing is restarted (with some potentially new defer calls created by +// D or its callees). If the defer processing reaches the started defer call D +// again in the defer stack, then the original panic P1 is aborted and cannot +// continue panic processing or be recovered. If the panic P2 does a recover at +// some point, it will naturally remove the original panic P1 from the stack +// (since the original panic had to be in frame F or a descendant of F). +func TestAbortedPanic(t *testing.T) { + defer func() { + r := recover() + if r != nil { + t.Fatal(fmt.Sprintf("wanted nil recover, got %v", r)) + } + }() + defer func() { + r := recover() + if r != "panic2" { + t.Fatal(fmt.Sprintf("wanted %v, got %v", "panic2", r)) + } + }() + defer func() { + panic("panic2") + }() + panic("panic1") +} + +// This tests that recover() does not succeed unless it is called directly from a +// defer function that is directly called by the panic. Here, we first call it +// from a defer function that is created by the defer function called directly by +// the panic. In +func TestRecoverMatching(t *testing.T) { + defer func() { + r := recover() + if r != "panic1" { + t.Fatal(fmt.Sprintf("wanted %v, got %v", "panic1", r)) + } + }() + defer func() { + defer func() { + // Shouldn't succeed, even though it is called directly + // from a defer function, since this defer function was + // not directly called by the panic. + r := recover() + if r != nil { + t.Fatal(fmt.Sprintf("wanted nil recover, got %v", r)) + } + }() + }() + panic("panic1") +} + +type nonSSAable [128]byte + +type bigStruct struct { + x, y, z, w, p, q int64 +} + +type containsBigStruct struct { + element bigStruct +} + +func mknonSSAable() nonSSAable { + globint1++ + return nonSSAable{0, 0, 0, 0, 5} +} + +var globint1, globint2, globint3 int + +//go:noinline +func sideeffect(n int64) int64 { + globint2++ + return n +} + +func sideeffect2(in containsBigStruct) containsBigStruct { + globint3++ + return in +} + +// Test that nonSSAable arguments to defer are handled correctly and only evaluated once. 
+func TestNonSSAableArgs(t *testing.T) { + globint1 = 0 + globint2 = 0 + globint3 = 0 + var save1 byte + var save2 int64 + var save3 int64 + var save4 int64 + + defer func() { + if globint1 != 1 { + t.Fatal(fmt.Sprintf("globint1: wanted: 1, got %v", globint1)) + } + if save1 != 5 { + t.Fatal(fmt.Sprintf("save1: wanted: 5, got %v", save1)) + } + if globint2 != 1 { + t.Fatal(fmt.Sprintf("globint2: wanted: 1, got %v", globint2)) + } + if save2 != 2 { + t.Fatal(fmt.Sprintf("save2: wanted: 2, got %v", save2)) + } + if save3 != 4 { + t.Fatal(fmt.Sprintf("save3: wanted: 4, got %v", save3)) + } + if globint3 != 1 { + t.Fatal(fmt.Sprintf("globint3: wanted: 1, got %v", globint3)) + } + if save4 != 4 { + t.Fatal(fmt.Sprintf("save1: wanted: 4, got %v", save4)) + } + }() + + // Test function returning a non-SSAable arg + defer func(n nonSSAable) { + save1 = n[4] + }(mknonSSAable()) + // Test composite literal that is not SSAable + defer func(b bigStruct) { + save2 = b.y + }(bigStruct{1, 2, 3, 4, 5, sideeffect(6)}) + + // Test struct field reference that is non-SSAable + foo := containsBigStruct{} + foo.element.z = 4 + defer func(element bigStruct) { + save3 = element.z + }(foo.element) + defer func(element bigStruct) { + save4 = element.z + }(sideeffect2(foo).element) +} + +//go:noinline +func doPanic() { + panic("Test panic") +} + +func TestDeferForFuncWithNoExit(t *testing.T) { + cond := 1 + defer func() { + if cond != 2 { + t.Fatal(fmt.Sprintf("cond: wanted 2, got %v", cond)) + } + if recover() != "Test panic" { + t.Fatal("Didn't find expected panic") + } + }() + x := 0 + // Force a stack copy, to make sure that the &cond pointer passed to defer + // function is properly updated. + growStackIter(&x, 1000) + cond = 2 + doPanic() + + // This function has no exit/return, since it ends with an infinite loop + for { + } +} diff --git a/libgo/go/runtime/env_posix.go b/libgo/go/runtime/env_posix.go index c725bd93..bf8996c 100644 --- a/libgo/go/runtime/env_posix.go +++ b/libgo/go/runtime/env_posix.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build aix darwin dragonfly freebsd hurd js,wasm linux nacl netbsd openbsd solaris windows +// +build aix darwin dragonfly freebsd hurd js,wasm linux netbsd openbsd solaris windows package runtime diff --git a/libgo/go/runtime/error.go b/libgo/go/runtime/error.go index 0c7f631..6cc46bf 100644 --- a/libgo/go/runtime/error.go +++ b/libgo/go/runtime/error.go @@ -129,7 +129,7 @@ func (e plainError) Error() string { return string(e) } -// An boundsError represents a an indexing or slicing operation gone wrong. +// A boundsError represents an indexing or slicing operation gone wrong. type boundsError struct { x int64 y int diff --git a/libgo/go/runtime/export_debug_test.go b/libgo/go/runtime/export_debug_test.go index 608d756..769ad55 100644 --- a/libgo/go/runtime/export_debug_test.go +++ b/libgo/go/runtime/export_debug_test.go @@ -21,7 +21,7 @@ import ( // // On success, InjectDebugCall returns the panic value of fn or nil. // If fn did not panic, its results will be available in args. 
-func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error) (interface{}, error) { +func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error, returnOnUnsafePoint bool) (interface{}, error) { if gp.lockedm == 0 { return nil, plainError("goroutine not locked to thread") } @@ -65,9 +65,16 @@ func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error) (in notetsleepg(&h.done, -1) if h.err != "" { switch h.err { - case "retry _Grunnable", "executing on Go runtime stack": + case "call not at safe point": + if returnOnUnsafePoint { + // This is for TestDebugCallUnsafePoint. + return nil, h.err + } + fallthrough + case "retry _Grunnable", "executing on Go runtime stack", "call from within the Go runtime": // These are transient states. Try to get out of them. if i < 100 { + usleep(100) Gosched() continue } diff --git a/libgo/go/runtime/export_linux_test.go b/libgo/go/runtime/export_linux_test.go index 96ff1c7..1f8e633 100644 --- a/libgo/go/runtime/export_linux_test.go +++ b/libgo/go/runtime/export_linux_test.go @@ -10,6 +10,9 @@ import "unsafe" // var NewOSProc0 = newosproc0 // var Mincore = mincore +var Add = add + +type EpollEvent epollevent func Epollctl(epfd, op, fd int32, ev unsafe.Pointer) int32 { return epollctl(epfd, op, fd, (*epollevent)(ev)) diff --git a/libgo/go/runtime/export_mmap_test.go b/libgo/go/runtime/export_mmap_test.go index 5f3e99a..000948b 100644 --- a/libgo/go/runtime/export_mmap_test.go +++ b/libgo/go/runtime/export_mmap_test.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build aix darwin dragonfly freebsd hurd linux nacl netbsd openbsd solaris +// +build aix darwin dragonfly freebsd hurd linux netbsd openbsd solaris // Export guts for testing. diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go index 10890d3..9a977d8 100644 --- a/libgo/go/runtime/export_test.go +++ b/libgo/go/runtime/export_test.go @@ -35,9 +35,18 @@ var Atoi = atoi var Atoi32 = atoi32 var Nanotime = nanotime +var NetpollBreak = netpollBreak +var Usleep = usleep +var PhysPageSize = physPageSize var PhysHugePageSize = physHugePageSize +var NetpollGenericInit = netpollGenericInit + +var ParseRelease = parseRelease + +const PreemptMSupported = preemptMSupported + type LFNode struct { Next uint64 Pushcnt uintptr @@ -51,6 +60,12 @@ func LFStackPop(head *uint64) *LFNode { return (*LFNode)(unsafe.Pointer((*lfstack)(head).pop())) } +func Netpoll(delta int64) { + systemstack(func() { + netpoll(delta) + }) +} + func GCMask(x interface{}) (ret []byte) { return nil } @@ -241,7 +256,7 @@ func CountPagesInUse() (pagesInUse, counted uintptr) { pagesInUse = uintptr(mheap_.pagesInUse) for _, s := range mheap_.allspans { - if s.state == mSpanInUse { + if s.state.get() == mSpanInUse { counted += s.npages } } @@ -303,7 +318,7 @@ func ReadMemStatsSlow() (base, slow MemStats) { // Add up current allocations in spans. 
for _, s := range mheap_.allspans { - if s.state != mSpanInUse { + if s.state.get() != mSpanInUse { continue } if sizeclass := s.spanclass.sizeclass(); sizeclass == 0 { @@ -336,10 +351,18 @@ func ReadMemStatsSlow() (base, slow MemStats) { slow.BySize[i].Frees = bySize[i].Frees } - for i := mheap_.free.start(0, 0); i.valid(); i = i.next() { - slow.HeapReleased += uint64(i.span().released()) + for i := mheap_.pages.start; i < mheap_.pages.end; i++ { + pg := mheap_.pages.chunkOf(i).scavenged.popcntRange(0, pallocChunkPages) + slow.HeapReleased += uint64(pg) * pageSize + } + for _, p := range allp { + pg := sys.OnesCount64(p.pcache.scav) + slow.HeapReleased += uint64(pg) * pageSize } + // Unused space in the current arena also counts as released space. + slow.HeapReleased += uint64(mheap_.curArena.end - mheap_.curArena.base) + getg().m.mallocing-- }) @@ -512,200 +535,410 @@ func MapTombstoneCheck(m map[int]int) { } } -// UnscavHugePagesSlow returns the value of mheap_.freeHugePages -// and the number of unscavenged huge pages calculated by -// scanning the heap. -func UnscavHugePagesSlow() (uintptr, uintptr) { - var base, slow uintptr - // Run on the system stack to avoid deadlock from stack growth - // trying to acquire the heap lock. - systemstack(func() { - lock(&mheap_.lock) - base = mheap_.free.unscavHugePages - for _, s := range mheap_.allspans { - if s.state == mSpanFree && !s.scavenged { - slow += s.hugePages() - } - } - unlock(&mheap_.lock) - }) - return base, slow -} +func RunGetgThreadSwitchTest() { + // Test that getg works correctly with thread switch. + // With gccgo, if we generate getg inlined, the backend + // may cache the address of the TLS variable, which + // will become invalid after a thread switch. This test + // checks that the bad caching doesn't happen. -// Span is a safe wrapper around an mspan, whose memory -// is managed manually. -type Span struct { - *mspan + ch := make(chan int) + go func(ch chan int) { + ch <- 5 + LockOSThread() + }(ch) + + g1 := getg() + + // Block on a receive. This is likely to get us a thread + // switch. If we yield to the sender goroutine, it will + // lock the thread, forcing us to resume on a different + // thread. + <-ch + + g2 := getg() + if g1 != g2 { + panic("g1 != g2") + } + + // Also test getg after some control flow, as the + // backend is sensitive to control flow. + g3 := getg() + if g1 != g3 { + panic("g1 != g3") + } } -func AllocSpan(base, npages uintptr, scavenged bool) Span { - var s *mspan - systemstack(func() { - lock(&mheap_.lock) - s = (*mspan)(mheap_.spanalloc.alloc()) - unlock(&mheap_.lock) - }) - s.init(base, npages) - s.scavenged = scavenged - return Span{s} +const ( + PageSize = pageSize + PallocChunkPages = pallocChunkPages + PageAlloc64Bit = pageAlloc64Bit +) + +// Expose pallocSum for testing. +type PallocSum pallocSum + +func PackPallocSum(start, max, end uint) PallocSum { return PallocSum(packPallocSum(start, max, end)) } +func (m PallocSum) Start() uint { return pallocSum(m).start() } +func (m PallocSum) Max() uint { return pallocSum(m).max() } +func (m PallocSum) End() uint { return pallocSum(m).end() } + +// Expose pallocBits for testing. 
+type PallocBits pallocBits + +func (b *PallocBits) Find(npages uintptr, searchIdx uint) (uint, uint) { + return (*pallocBits)(b).find(npages, searchIdx) } +func (b *PallocBits) AllocRange(i, n uint) { (*pallocBits)(b).allocRange(i, n) } +func (b *PallocBits) Free(i, n uint) { (*pallocBits)(b).free(i, n) } +func (b *PallocBits) Summarize() PallocSum { return PallocSum((*pallocBits)(b).summarize()) } +func (b *PallocBits) PopcntRange(i, n uint) uint { return (*pageBits)(b).popcntRange(i, n) } -func (s *Span) Free() { - systemstack(func() { - lock(&mheap_.lock) - mheap_.spanalloc.free(unsafe.Pointer(s.mspan)) - unlock(&mheap_.lock) - }) - s.mspan = nil +// SummarizeSlow is a slow but more obviously correct implementation +// of (*pallocBits).summarize. Used for testing. +func SummarizeSlow(b *PallocBits) PallocSum { + var start, max, end uint + + const N = uint(len(b)) * 64 + for start < N && (*pageBits)(b).get(start) == 0 { + start++ + } + for end < N && (*pageBits)(b).get(N-end-1) == 0 { + end++ + } + run := uint(0) + for i := uint(0); i < N; i++ { + if (*pageBits)(b).get(i) == 0 { + run++ + } else { + run = 0 + } + if run > max { + max = run + } + } + return PackPallocSum(start, max, end) } -func (s Span) Base() uintptr { - return s.mspan.base() +// Expose non-trivial helpers for testing. +func FindBitRange64(c uint64, n uint) uint { return findBitRange64(c, n) } + +// Given two PallocBits, returns a set of bit ranges where +// they differ. +func DiffPallocBits(a, b *PallocBits) []BitRange { + ba := (*pageBits)(a) + bb := (*pageBits)(b) + + var d []BitRange + base, size := uint(0), uint(0) + for i := uint(0); i < uint(len(ba))*64; i++ { + if ba.get(i) != bb.get(i) { + if size == 0 { + base = i + } + size++ + } else { + if size != 0 { + d = append(d, BitRange{base, size}) + } + size = 0 + } + } + if size != 0 { + d = append(d, BitRange{base, size}) + } + return d +} + +// StringifyPallocBits gets the bits in the bit range r from b, +// and returns a string containing the bits as ASCII 0 and 1 +// characters. +func StringifyPallocBits(b *PallocBits, r BitRange) string { + str := "" + for j := r.I; j < r.I+r.N; j++ { + if (*pageBits)(b).get(j) != 0 { + str += "1" + } else { + str += "0" + } + } + return str } -func (s Span) Pages() uintptr { - return s.mspan.npages +// Expose pallocData for testing. +type PallocData pallocData + +func (d *PallocData) FindScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) { + return (*pallocData)(d).findScavengeCandidate(searchIdx, min, max) +} +func (d *PallocData) AllocRange(i, n uint) { (*pallocData)(d).allocRange(i, n) } +func (d *PallocData) ScavengedSetRange(i, n uint) { + (*pallocData)(d).scavenged.setRange(i, n) +} +func (d *PallocData) PallocBits() *PallocBits { + return (*PallocBits)(&(*pallocData)(d).pallocBits) +} +func (d *PallocData) Scavenged() *PallocBits { + return (*PallocBits)(&(*pallocData)(d).scavenged) } -type TreapIterType treapIterType +// Expose fillAligned for testing. +func FillAligned(x uint64, m uint) uint64 { return fillAligned(x, m) } -const ( - TreapIterScav TreapIterType = TreapIterType(treapIterScav) - TreapIterHuge = TreapIterType(treapIterHuge) - TreapIterBits = treapIterBits -) +// Expose pageCache for testing. 
+type PageCache pageCache -type TreapIterFilter treapIterFilter +const PageCachePages = pageCachePages -func TreapFilter(mask, match TreapIterType) TreapIterFilter { - return TreapIterFilter(treapFilter(treapIterType(mask), treapIterType(match))) +func NewPageCache(base uintptr, cache, scav uint64) PageCache { + return PageCache(pageCache{base: base, cache: cache, scav: scav}) } - -func (s Span) MatchesIter(mask, match TreapIterType) bool { - return treapFilter(treapIterType(mask), treapIterType(match)).matches(s.treapFilter()) +func (c *PageCache) Empty() bool { return (*pageCache)(c).empty() } +func (c *PageCache) Base() uintptr { return (*pageCache)(c).base } +func (c *PageCache) Cache() uint64 { return (*pageCache)(c).cache } +func (c *PageCache) Scav() uint64 { return (*pageCache)(c).scav } +func (c *PageCache) Alloc(npages uintptr) (uintptr, uintptr) { + return (*pageCache)(c).alloc(npages) } - -type TreapIter struct { - treapIter +func (c *PageCache) Flush(s *PageAlloc) { + (*pageCache)(c).flush((*pageAlloc)(s)) } -func (t TreapIter) Span() Span { - return Span{t.span()} +// Expose chunk index type. +type ChunkIdx chunkIdx + +// Expose pageAlloc for testing. Note that because pageAlloc is +// not in the heap, so is PageAlloc. +type PageAlloc pageAlloc + +func (p *PageAlloc) Alloc(npages uintptr) (uintptr, uintptr) { + return (*pageAlloc)(p).alloc(npages) +} +func (p *PageAlloc) AllocToCache() PageCache { + return PageCache((*pageAlloc)(p).allocToCache()) +} +func (p *PageAlloc) Free(base, npages uintptr) { + (*pageAlloc)(p).free(base, npages) +} +func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) { + return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end) +} +func (p *PageAlloc) Scavenge(nbytes uintptr, locked bool) (r uintptr) { + systemstack(func() { + r = (*pageAlloc)(p).scavenge(nbytes, locked) + }) + return +} +func (p *PageAlloc) InUse() []AddrRange { + ranges := make([]AddrRange, 0, len(p.inUse.ranges)) + for _, r := range p.inUse.ranges { + ranges = append(ranges, AddrRange{ + Base: r.base, + Limit: r.limit, + }) + } + return ranges } -func (t TreapIter) Valid() bool { - return t.valid() +// Returns nil if the PallocData's L2 is missing. +func (p *PageAlloc) PallocData(i ChunkIdx) *PallocData { + ci := chunkIdx(i) + l2 := (*pageAlloc)(p).chunks[ci.l1()] + if l2 == nil { + return nil + } + return (*PallocData)(&l2[ci.l2()]) } -func (t TreapIter) Next() TreapIter { - return TreapIter{t.next()} +// AddrRange represents a range over addresses. +// Specifically, it represents the range [Base, Limit). +type AddrRange struct { + Base, Limit uintptr } -func (t TreapIter) Prev() TreapIter { - return TreapIter{t.prev()} +// BitRange represents a range over a bitmap. +type BitRange struct { + I, N uint // bit index and length in bits } -// Treap is a safe wrapper around mTreap for testing. +// NewPageAlloc creates a new page allocator for testing and +// initializes it with the scav and chunks maps. Each key in these maps +// represents a chunk index and each value is a series of bit ranges to +// set within each bitmap's chunk. // -// It must never be heap-allocated because mTreap is -// notinheap. +// The initialization of the pageAlloc preserves the invariant that if a +// scavenged bit is set the alloc bit is necessarily unset, so some +// of the bits described by scav may be cleared in the final bitmap if +// ranges in chunks overlap with them. 
// -//go:notinheap -type Treap struct { - mTreap -} +// scav is optional, and if nil, the scavenged bitmap will be cleared +// (as opposed to all 1s, which it usually is). Furthermore, every +// chunk index in scav must appear in chunks; ones that do not are +// ignored. +func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { + p := new(pageAlloc) + + // We've got an entry, so initialize the pageAlloc. + p.init(new(mutex), nil) + p.test = true + + for i, init := range chunks { + addr := chunkBase(chunkIdx(i)) + + // Mark the chunk's existence in the pageAlloc. + p.grow(addr, pallocChunkBytes) + + // Initialize the bitmap and update pageAlloc metadata. + chunk := p.chunkOf(chunkIndex(addr)) + + // Clear all the scavenged bits which grow set. + chunk.scavenged.clearRange(0, pallocChunkPages) + + // Apply scavenge state if applicable. + if scav != nil { + if scvg, ok := scav[i]; ok { + for _, s := range scvg { + // Ignore the case of s.N == 0. setRange doesn't handle + // it and it's a no-op anyway. + if s.N != 0 { + chunk.scavenged.setRange(s.I, s.N) + } + } + } + } + p.resetScavengeAddr() + + // Apply alloc state. + for _, s := range init { + // Ignore the case of s.N == 0. allocRange doesn't handle + // it and it's a no-op anyway. + if s.N != 0 { + chunk.allocRange(s.I, s.N) + } + } -func (t *Treap) Start(mask, match TreapIterType) TreapIter { - return TreapIter{t.start(treapIterType(mask), treapIterType(match))} + // Update heap metadata for the allocRange calls above. + p.update(addr, pallocChunkPages, false, false) + } + return (*PageAlloc)(p) } -func (t *Treap) End(mask, match TreapIterType) TreapIter { - return TreapIter{t.end(treapIterType(mask), treapIterType(match))} -} +// FreePageAlloc releases hard OS resources owned by the pageAlloc. Once this +// is called the pageAlloc may no longer be used. The object itself will be +// collected by the garbage collector once it is no longer live. +func FreePageAlloc(pp *PageAlloc) { + p := (*pageAlloc)(pp) -func (t *Treap) Insert(s Span) { - // mTreap uses a fixalloc in mheap_ for treapNode - // allocation which requires the mheap_ lock to manipulate. - // Locking here is safe because the treap itself never allocs - // or otherwise ends up grabbing this lock. - systemstack(func() { - lock(&mheap_.lock) - t.insert(s.mspan) - unlock(&mheap_.lock) - }) - t.CheckInvariants() + // Free all the mapped space for the summary levels. + if pageAlloc64Bit != 0 { + for l := 0; l < summaryLevels; l++ { + sysFree(unsafe.Pointer(&p.summary[l][0]), uintptr(cap(p.summary[l]))*pallocSumBytes, nil) + } + } else { + resSize := uintptr(0) + for _, s := range p.summary { + resSize += uintptr(cap(s)) * pallocSumBytes + } + sysFree(unsafe.Pointer(&p.summary[0][0]), alignUp(resSize, physPageSize), nil) + } + + // Free the mapped space for chunks. + for i := range p.chunks { + if x := p.chunks[i]; x != nil { + p.chunks[i] = nil + // This memory comes from sysAlloc and will always be page-aligned. + sysFree(unsafe.Pointer(x), unsafe.Sizeof(*p.chunks[0]), nil) + } + } } -func (t *Treap) Find(npages uintptr) TreapIter { - return TreapIter{t.find(npages)} +// BaseChunkIdx is a convenient chunkIdx value which works on both +// 64 bit and 32 bit platforms, allowing the tests to share code +// between the two. +// +// On AIX, the arenaBaseOffset is 0x0a00000000000000. However, this +// constant can't be used here because it is negative and will cause +// a constant overflow. 
+// +// This should not be higher than 0x100*pallocChunkBytes to support +// mips and mipsle, which only have 31-bit address spaces. +var BaseChunkIdx = ChunkIdx(chunkIndex(((0xc000*pageAlloc64Bit + 0x100*pageAlloc32Bit) * pallocChunkBytes) + 0x0a00000000000000*sys.GoosAix)) + +// PageBase returns an address given a chunk index and a page index +// relative to that chunk. +func PageBase(c ChunkIdx, pageIdx uint) uintptr { + return chunkBase(chunkIdx(c)) + uintptr(pageIdx)*pageSize } -func (t *Treap) Erase(i TreapIter) { - // mTreap uses a fixalloc in mheap_ for treapNode - // freeing which requires the mheap_ lock to manipulate. - // Locking here is safe because the treap itself never allocs - // or otherwise ends up grabbing this lock. - systemstack(func() { - lock(&mheap_.lock) - t.erase(i.treapIter) - unlock(&mheap_.lock) - }) - t.CheckInvariants() +type BitsMismatch struct { + Base uintptr + Got, Want uint64 } -func (t *Treap) RemoveSpan(s Span) { - // See Erase about locking. +func CheckScavengedBitsCleared(mismatches []BitsMismatch) (n int, ok bool) { + ok = true + + // Run on the system stack to avoid stack growth allocation. systemstack(func() { + getg().m.mallocing++ + + // Lock so that we can safely access the bitmap. lock(&mheap_.lock) - t.removeSpan(s.mspan) + chunkLoop: + for i := mheap_.pages.start; i < mheap_.pages.end; i++ { + chunk := mheap_.pages.chunkOf(i) + for j := 0; j < pallocChunkPages/64; j++ { + // Run over each 64-bit bitmap section and ensure + // scavenged is being cleared properly on allocation. + // If a used bit and scavenged bit are both set, that's + // an error, and could indicate a larger problem, or + // an accounting problem. + want := chunk.scavenged[j] &^ chunk.pallocBits[j] + got := chunk.scavenged[j] + if want != got { + ok = false + if n >= len(mismatches) { + break chunkLoop + } + mismatches[n] = BitsMismatch{ + Base: chunkBase(i) + uintptr(j)*64*pageSize, + Got: got, + Want: want, + } + n++ + } + } + } unlock(&mheap_.lock) - }) - t.CheckInvariants() -} -func (t *Treap) Size() int { - i := 0 - t.mTreap.treap.walkTreap(func(t *treapNode) { - i++ + getg().m.mallocing-- }) - return i + return } -func (t *Treap) CheckInvariants() { - t.mTreap.treap.walkTreap(checkTreapNode) - t.mTreap.treap.validateInvariants() -} +func PageCachePagesLeaked() (leaked uintptr) { + stopTheWorld("PageCachePagesLeaked") -func RunGetgThreadSwitchTest() { - // Test that getg works correctly with thread switch. - // With gccgo, if we generate getg inlined, the backend - // may cache the address of the TLS variable, which - // will become invalid after a thread switch. This test - // checks that the bad caching doesn't happen. - - ch := make(chan int) - go func(ch chan int) { - ch <- 5 - LockOSThread() - }(ch) - - g1 := getg() + // Walk over destroyed Ps and look for unflushed caches. + deadp := allp[len(allp):cap(allp)] + for _, p := range deadp { + // Since we're going past len(allp) we may see nil Ps. + // Just ignore them. + if p != nil { + leaked += uintptr(sys.OnesCount64(p.pcache.cache)) + } + } - // Block on a receive. This is likely to get us a thread - // switch. If we yield to the sender goroutine, it will - // lock the thread, forcing us to resume on a different - // thread. - <-ch + startTheWorld() + return +} - g2 := getg() - if g1 != g2 { - panic("g1 != g2") - } +var Semacquire = semacquire +var Semrelease1 = semrelease1 - // Also test getg after some control flow, as the - // backend is sensitive to control flow. 
- g3 := getg() - if g1 != g3 { - panic("g1 != g3") - } +func SemNwait(addr *uint32) uint32 { + root := semroot(addr) + return atomic.Load(&root.nwait) } + +var Pusestackmaps = &usestackmaps diff --git a/libgo/go/runtime/export_unix_test.go b/libgo/go/runtime/export_unix_test.go index 064d2b2..9d0f0d8 100644 --- a/libgo/go/runtime/export_unix_test.go +++ b/libgo/go/runtime/export_unix_test.go @@ -6,6 +6,13 @@ package runtime +import "unsafe" + +var NonblockingPipe = nonblockingPipe +var Pipe = pipe +var SetNonblock = setNonblock +var Closeonexec = closeonexec + func sigismember(mask *sigset, i int) bool { clear := *mask sigdelset(&clear, i) @@ -17,3 +24,71 @@ func Sigisblocked(i int) bool { sigprocmask(_SIG_SETMASK, nil, &sigmask) return sigismember(&sigmask, i) } + +type M = m + +var waitForSigusr1 struct { + rdpipe int32 + wrpipe int32 + mID int64 +} + +// WaitForSigusr1 blocks until a SIGUSR1 is received. It calls ready +// when it is set up to receive SIGUSR1. The ready function should +// cause a SIGUSR1 to be sent. The r and w arguments are a pipe that +// the signal handler can use to report when the signal is received. +// +// Once SIGUSR1 is received, it returns the ID of the current M and +// the ID of the M the SIGUSR1 was received on. If the caller writes +// a non-zero byte to w, WaitForSigusr1 returns immediately with -1, -1. +func WaitForSigusr1(r, w int32, ready func(mp *M)) (int64, int64) { + lockOSThread() + // Make sure we can receive SIGUSR1. + unblocksig(_SIGUSR1) + + waitForSigusr1.rdpipe = r + waitForSigusr1.wrpipe = w + + mp := getg().m + testSigusr1 = waitForSigusr1Callback + ready(mp) + + // Wait for the signal. We use a pipe rather than a note + // because write is always async-signal-safe. + entersyscallblock() + var b byte + read(waitForSigusr1.rdpipe, noescape(unsafe.Pointer(&b)), 1) + exitsyscall() + + gotM := waitForSigusr1.mID + testSigusr1 = nil + + unlockOSThread() + + if b != 0 { + // timeout signal from caller + return -1, -1 + } + return mp.id, gotM +} + +// waitForSigusr1Callback is called from the signal handler during +// WaitForSigusr1. It must not have write barriers because there may +// not be a P. +// +//go:nowritebarrierrec +func waitForSigusr1Callback(gp *g) bool { + if gp == nil || gp.m == nil { + waitForSigusr1.mID = -1 + } else { + waitForSigusr1.mID = gp.m.id + } + b := byte(0) + write(uintptr(waitForSigusr1.wrpipe), noescape(unsafe.Pointer(&b)), 1) + return true +} + +// SendSigusr1 sends SIGUSR1 to mp. +func SendSigusr1(mp *M) { + panic("SendSigusr1") +} diff --git a/libgo/go/runtime/extern.go b/libgo/go/runtime/extern.go index 9dbf057..e2e601f 100644 --- a/libgo/go/runtime/extern.go +++ b/libgo/go/runtime/extern.go @@ -125,6 +125,13 @@ It is a comma-separated list of name=val pairs setting these named variables: IDs will refer to the ID of the goroutine at the time of creation; it's possible for this ID to be reused for another goroutine. Setting N to 0 will report no ancestry information. + asyncpreemptoff: asyncpreemptoff=1 disables signal-based + asynchronous goroutine preemption. This makes some loops + non-preemptible for long periods, which may delay GC and + goroutine scheduling. This is useful for debugging GC issues + because it also disables the conservative stack scanning used + for asynchronously preempted goroutines. + The net, net/http, and crypto/tls packages also refer to debugging variables in GODEBUG. See the documentation for those packages for details. 
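The asyncpreemptoff knob documented above is read from GODEBUG like any other name=val pair, so the usual way to exercise it is to set that variable in the environment of the process under test. The sketch below is illustrative only and is not part of this commit: the child binary name "./testprog" and its "GCSys" argument are assumptions chosen for the example, not anything defined by the patch.

// Illustrative sketch (not part of the commit): re-run a hypothetical
// test binary with signal-based asynchronous goroutine preemption
// disabled via the GODEBUG setting documented above.
package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	// Binary name and argument are placeholders for whatever program
	// you want to run without async preemption.
	cmd := exec.Command("./testprog", "GCSys")
	// Append the setting to the inherited environment. If GODEBUG is
	// already set in the parent environment, the comma-separated pairs
	// should be merged into one value instead of appending a second
	// GODEBUG entry; this sketch leaves that to the caller.
	cmd.Env = append(os.Environ(), "GODEBUG=asyncpreemptoff=1")
	out, err := cmd.CombinedOutput()
	if err != nil {
		fmt.Fprintf(os.Stderr, "testprog failed: %v\n%s", err, out)
		os.Exit(1)
	}
	fmt.Printf("%s", out)
}

With preemption disabled this way, tight loops become non-preemptible for long stretches, which is exactly the trade-off the documentation above warns about; it is useful mainly when debugging GC or conservative stack-scanning issues.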
diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go index 3eb01bf..8f14bf9 100644 --- a/libgo/go/runtime/gc_test.go +++ b/libgo/go/runtime/gc_test.go @@ -22,12 +22,6 @@ func TestGcSys(t *testing.T) { if os.Getenv("GOGC") == "off" { t.Skip("skipping test; GOGC=off in environment") } - if runtime.GOOS == "windows" { - t.Skip("skipping test; GOOS=windows http://golang.org/issue/27156") - } - if runtime.GOOS == "linux" && runtime.GOARCH == "arm64" { - t.Skip("skipping test; GOOS=linux GOARCH=arm64 https://github.com/golang/go/issues/27636") - } got := runTestProg(t, "testprog", "GCSys") want := "OK\n" if got != want { @@ -473,25 +467,6 @@ func TestReadMemStats(t *testing.T) { } } -func TestUnscavHugePages(t *testing.T) { - // Allocate 20 MiB and immediately free it a few times to increase - // the chance that unscavHugePages isn't zero and that some kind of - // accounting had to happen in the runtime. - for j := 0; j < 3; j++ { - var large [][]byte - for i := 0; i < 5; i++ { - large = append(large, make([]byte, runtime.PhysHugePageSize)) - } - runtime.KeepAlive(large) - runtime.GC() - } - base, slow := runtime.UnscavHugePagesSlow() - if base != slow { - logDiff(t, "unscavHugePages", reflect.ValueOf(base), reflect.ValueOf(slow)) - t.Fatal("unscavHugePages mismatch") - } -} - func logDiff(t *testing.T, prefix string, got, want reflect.Value) { typ := got.Type() switch typ.Kind() { diff --git a/libgo/go/runtime/gcinfo_test.go b/libgo/go/runtime/gcinfo_test.go index 89144d5..fc24f04 100644 --- a/libgo/go/runtime/gcinfo_test.go +++ b/libgo/go/runtime/gcinfo_test.go @@ -173,14 +173,6 @@ func infoBigStruct() []byte { typeScalar, typeScalar, typeScalar, // t int; y uint16; u uint64 typePointer, typeScalar, // i string } - case "amd64p32": - return []byte{ - typePointer, // q *int - typeScalar, typeScalar, typeScalar, typeScalar, typeScalar, // w byte; e [17]byte - typePointer, typeScalar, typeScalar, // r []byte - typeScalar, typeScalar, typeScalar, typeScalar, typeScalar, // t int; y uint16; u uint64 - typePointer, typeScalar, // i string - } default: panic("unknown arch") } diff --git a/libgo/go/runtime/hash64.go b/libgo/go/runtime/hash64.go index 3f94256..cff663a 100644 --- a/libgo/go/runtime/hash64.go +++ b/libgo/go/runtime/hash64.go @@ -25,8 +25,7 @@ const ( ) func memhash(p unsafe.Pointer, seed, s uintptr) uintptr { - if (GOARCH == "amd64" || GOARCH == "arm64") && - GOOS != "nacl" && useAeshash { + if (GOARCH == "amd64" || GOARCH == "arm64") && useAeshash { return aeshash(p, seed, s) } h := uint64(seed + s*hashkey[0]) diff --git a/libgo/go/runtime/heapdump.go b/libgo/go/runtime/heapdump.go index c968ab3..e8f16e9 100644 --- a/libgo/go/runtime/heapdump.go +++ b/libgo/go/runtime/heapdump.go @@ -313,7 +313,7 @@ func finq_callback(fn *funcval, obj unsafe.Pointer, ft *functype, ot *ptrtype) { func dumproots() { // MSpan.types for _, s := range mheap_.allspans { - if s.state == mSpanInUse { + if s.state.get() == mSpanInUse { // Finalizers for sp := s.specials; sp != nil; sp = sp.next { if sp.kind != _KindSpecialFinalizer { @@ -336,7 +336,7 @@ var freemark [_PageSize / 8]bool func dumpobjs() { for _, s := range mheap_.allspans { - if s.state != mSpanInUse { + if s.state.get() != mSpanInUse { continue } p := s.base() @@ -445,7 +445,7 @@ func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, dumpint(uint64(nstk)) for i := uintptr(0); i < nstk; i++ { pc := stk[i] - fn, file, line, _ := funcfileline(pc, -1) + fn, file, line, _ := funcfileline(pc, -1, false) if fn 
== "" { var buf [64]byte n := len(buf) @@ -483,7 +483,7 @@ func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, func dumpmemprof() { iterate_memprof(dumpmemprof_callback) for _, s := range mheap_.allspans { - if s.state != mSpanInUse { + if s.state.get() != mSpanInUse { continue } for sp := s.specials; sp != nil; sp = sp.next { @@ -504,7 +504,7 @@ var dumphdr = []byte("go1.7 heap dump\n") func mdump() { // make sure we're done sweeping for _, s := range mheap_.allspans { - if s.state == mSpanInUse { + if s.state.get() == mSpanInUse { s.ensureSwept() } } diff --git a/libgo/go/runtime/internal/atomic/atomic.c b/libgo/go/runtime/internal/atomic/atomic.c index 17c83a2..8ae4d7b 100644 --- a/libgo/go/runtime/internal/atomic/atomic.c +++ b/libgo/go/runtime/internal/atomic/atomic.c @@ -26,6 +26,16 @@ Loadp (void *ptr) return __atomic_load_n ((void **) ptr, __ATOMIC_SEQ_CST); } +uint8_t Load8 (uint8_t *ptr) + __asm__ (GOSYM_PREFIX "runtime..z2finternal..z2fatomic.Load8") + __attribute__ ((no_split_stack)); + +uint8_t +Load8 (uint8_t *ptr) +{ + return __atomic_load_n (ptr, __ATOMIC_SEQ_CST); +} + uint64_t Load64 (uint64_t *ptr) __asm__ (GOSYM_PREFIX "runtime..z2finternal..z2fatomic.Load64") __attribute__ ((no_split_stack)); @@ -238,6 +248,16 @@ Store (uint32_t *ptr, uint32_t val) __atomic_store_n (ptr, val, __ATOMIC_SEQ_CST); } +void Store8 (uint8_t *ptr, uint8_t val) + __asm__ (GOSYM_PREFIX "runtime..z2finternal..z2fatomic.Store8") + __attribute__ ((no_split_stack)); + +void +Store8 (uint8_t *ptr, uint8_t val) +{ + __atomic_store_n (ptr, val, __ATOMIC_SEQ_CST); +} + void Store64 (uint64_t *ptr, uint64_t val) __asm__ (GOSYM_PREFIX "runtime..z2finternal..z2fatomic.Store64") __attribute__ ((no_split_stack)); diff --git a/libgo/go/runtime/internal/atomic/atomic_test.go b/libgo/go/runtime/internal/atomic/atomic_test.go index 0ba7544..0c1125c 100644 --- a/libgo/go/runtime/internal/atomic/atomic_test.go +++ b/libgo/go/runtime/internal/atomic/atomic_test.go @@ -86,14 +86,8 @@ func TestUnaligned64(t *testing.T) { // a continual source of pain. Test that on 32-bit systems they crash // instead of failing silently. - switch runtime.GOARCH { - default: - if unsafe.Sizeof(int(0)) != 4 { - t.Skip("test only runs on 32-bit systems") - } - case "amd64p32": - // amd64p32 can handle unaligned atomics. - t.Skipf("test not needed on %v", runtime.GOARCH) + if unsafe.Sizeof(int(0)) != 4 { + t.Skip("test only runs on 32-bit systems") } x := make([]uint32, 4) @@ -109,3 +103,120 @@ func TestUnaligned64(t *testing.T) { shouldPanic(t, "Xchg64", func() { atomic.Xchg64(up64, 1) }) shouldPanic(t, "Cas64", func() { atomic.Cas64(up64, 1, 2) }) } + +func TestAnd8(t *testing.T) { + // Basic sanity check. + x := uint8(0xff) + for i := uint8(0); i < 8; i++ { + atomic.And8(&x, ^(1 << i)) + if r := uint8(0xff) << (i + 1); x != r { + t.Fatalf("clearing bit %#x: want %#x, got %#x", uint8(1<<i), r, x) + } + } + + // Set every bit in array to 1. + a := make([]uint8, 1<<12) + for i := range a { + a[i] = 0xff + } + + // Clear array bit-by-bit in different goroutines. + done := make(chan bool) + for i := 0; i < 8; i++ { + m := ^uint8(1 << i) + go func() { + for i := range a { + atomic.And8(&a[i], m) + } + done <- true + }() + } + for i := 0; i < 8; i++ { + <-done + } + + // Check that the array has been totally cleared. + for i, v := range a { + if v != 0 { + t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint8(0), v) + } + } +} + +func TestOr8(t *testing.T) { + // Basic sanity check. 
+ x := uint8(0) + for i := uint8(0); i < 8; i++ { + atomic.Or8(&x, 1<<i) + if r := (uint8(1) << (i + 1)) - 1; x != r { + t.Fatalf("setting bit %#x: want %#x, got %#x", uint8(1)<<i, r, x) + } + } + + // Start with every bit in array set to 0. + a := make([]uint8, 1<<12) + + // Set every bit in array bit-by-bit in different goroutines. + done := make(chan bool) + for i := 0; i < 8; i++ { + m := uint8(1 << i) + go func() { + for i := range a { + atomic.Or8(&a[i], m) + } + done <- true + }() + } + for i := 0; i < 8; i++ { + <-done + } + + // Check that the array has been totally set. + for i, v := range a { + if v != 0xff { + t.Fatalf("a[%v] not fully set: want %#x, got %#x", i, uint8(0xff), v) + } + } +} + +func TestBitwiseContended(t *testing.T) { + // Start with every bit in array set to 0. + a := make([]uint8, 16) + + // Iterations to try. + N := 1 << 16 + if testing.Short() { + N = 1 << 10 + } + + // Set and then clear every bit in the array bit-by-bit in different goroutines. + done := make(chan bool) + for i := 0; i < 8; i++ { + m := uint8(1 << i) + go func() { + for n := 0; n < N; n++ { + for i := range a { + atomic.Or8(&a[i], m) + if atomic.Load8(&a[i])&m != m { + t.Errorf("a[%v] bit %#x not set", i, m) + } + atomic.And8(&a[i], ^m) + if atomic.Load8(&a[i])&m != 0 { + t.Errorf("a[%v] bit %#x not clear", i, m) + } + } + } + done <- true + }() + } + for i := 0; i < 8; i++ { + <-done + } + + // Check that the array has been totally cleared. + for i, v := range a { + if v != 0 { + t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint8(0), v) + } + } +} diff --git a/libgo/go/runtime/internal/atomic/bench_test.go b/libgo/go/runtime/internal/atomic/bench_test.go index 083a75c..de71b0f 100644 --- a/libgo/go/runtime/internal/atomic/bench_test.go +++ b/libgo/go/runtime/internal/atomic/bench_test.go @@ -43,6 +43,46 @@ func BenchmarkAtomicStore(b *testing.B) { } } +func BenchmarkAnd8(b *testing.B) { + var x [512]uint8 // give byte its own cache line + sink = &x + for i := 0; i < b.N; i++ { + atomic.And8(&x[255], uint8(i)) + } +} + +func BenchmarkAnd8Parallel(b *testing.B) { + var x [512]uint8 // give byte its own cache line + sink = &x + b.RunParallel(func(pb *testing.PB) { + i := uint8(0) + for pb.Next() { + atomic.And8(&x[255], i) + i++ + } + }) +} + +func BenchmarkOr8(b *testing.B) { + var x [512]uint8 // give byte its own cache line + sink = &x + for i := 0; i < b.N; i++ { + atomic.Or8(&x[255], uint8(i)) + } +} + +func BenchmarkOr8Parallel(b *testing.B) { + var x [512]uint8 // give byte its own cache line + sink = &x + b.RunParallel(func(pb *testing.PB) { + i := uint8(0) + for pb.Next() { + atomic.Or8(&x[255], i) + i++ + } + }) +} + func BenchmarkXadd(b *testing.B) { var x uint32 ptr := &x diff --git a/libgo/go/runtime/internal/atomic/gccgo.go b/libgo/go/runtime/internal/atomic/gccgo.go index e5edbfb..4df8346 100644 --- a/libgo/go/runtime/internal/atomic/gccgo.go +++ b/libgo/go/runtime/internal/atomic/gccgo.go @@ -15,6 +15,9 @@ func Load(ptr *uint32) uint32 func Loadp(ptr unsafe.Pointer) unsafe.Pointer //go:noescape +func Load8(ptr *uint8) uint8 + +//go:noescape func Load64(ptr *uint64) uint64 //go:noescape @@ -56,6 +59,9 @@ func CasRel(ptr *uint32, old, new uint32) bool func Store(ptr *uint32, val uint32) //go:noescape +func Store8(ptr *uint8, val uint8) + +//go:noescape func Store64(ptr *uint64, val uint64) //go:noescape diff --git a/libgo/go/runtime/internal/sys/intrinsics.go b/libgo/go/runtime/internal/sys/intrinsics.go index 6906938..d25f350 100644 --- 
a/libgo/go/runtime/internal/sys/intrinsics.go +++ b/libgo/go/runtime/internal/sys/intrinsics.go @@ -37,25 +37,6 @@ func Ctz8(x uint8) int { return int(ntz8tab[x]) } -var ntz8tab = [256]uint8{ - 0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, - 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, -} - //extern __builtin_bswap64 func bswap64(uint64) uint64 diff --git a/libgo/go/runtime/internal/sys/intrinsics_common.go b/libgo/go/runtime/internal/sys/intrinsics_common.go new file mode 100644 index 0000000..818d75e --- /dev/null +++ b/libgo/go/runtime/internal/sys/intrinsics_common.go @@ -0,0 +1,143 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sys + +// Copied from math/bits to avoid dependence. 
+ +var len8tab = [256]uint8{ + 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, +} + +var ntz8tab = [256]uint8{ + 0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, + 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, +} + +// len64 returns the minimum number of bits required to represent x; the result is 0 for x == 0. 
+func Len64(x uint64) (n int) { + if x >= 1<<32 { + x >>= 32 + n = 32 + } + if x >= 1<<16 { + x >>= 16 + n += 16 + } + if x >= 1<<8 { + x >>= 8 + n += 8 + } + return n + int(len8tab[x]) +} + +// --- OnesCount --- + +const m0 = 0x5555555555555555 // 01010101 ... +const m1 = 0x3333333333333333 // 00110011 ... +const m2 = 0x0f0f0f0f0f0f0f0f // 00001111 ... + +// OnesCount64 returns the number of one bits ("population count") in x. +func OnesCount64(x uint64) int { + // Implementation: Parallel summing of adjacent bits. + // See "Hacker's Delight", Chap. 5: Counting Bits. + // The following pattern shows the general approach: + // + // x = x>>1&(m0&m) + x&(m0&m) + // x = x>>2&(m1&m) + x&(m1&m) + // x = x>>4&(m2&m) + x&(m2&m) + // x = x>>8&(m3&m) + x&(m3&m) + // x = x>>16&(m4&m) + x&(m4&m) + // x = x>>32&(m5&m) + x&(m5&m) + // return int(x) + // + // Masking (& operations) can be left away when there's no + // danger that a field's sum will carry over into the next + // field: Since the result cannot be > 64, 8 bits is enough + // and we can ignore the masks for the shifts by 8 and up. + // Per "Hacker's Delight", the first line can be simplified + // more, but it saves at best one instruction, so we leave + // it alone for clarity. + const m = 1<<64 - 1 + x = x>>1&(m0&m) + x&(m0&m) + x = x>>2&(m1&m) + x&(m1&m) + x = (x>>4 + x) & (m2 & m) + x += x >> 8 + x += x >> 16 + x += x >> 32 + return int(x) & (1<<7 - 1) +} + +var deBruijn64tab = [64]byte{ + 0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4, + 62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5, + 63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11, + 54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6, +} + +const deBruijn64 = 0x03f79d71b4ca8b09 + +// TrailingZeros64 returns the number of trailing zero bits in x; the result is 64 for x == 0. +func TrailingZeros64(x uint64) int { + if x == 0 { + return 64 + } + // If popcount is fast, replace code below with return popcount(^x & (x - 1)). + // + // x & -x leaves only the right-most bit set in the word. Let k be the + // index of that bit. Since only a single bit is set, the value is two + // to the power of k. Multiplying by a power of two is equivalent to + // left shifting, in this case by k bits. The de Bruijn (64 bit) constant + // is such that all six bit, consecutive substrings are distinct. + // Therefore, if we have a left shifted version of this constant we can + // find by how many bits it was shifted by looking at which six bit + // substring ended up at the top of the word. + // (Knuth, volume 4, section 7.3.1) + return int(deBruijn64tab[(x&-x)*deBruijn64>>(64-6)]) +} + +// LeadingZeros64 returns the number of leading zero bits in x; the result is 64 for x == 0. +func LeadingZeros64(x uint64) int { return 64 - Len64(x) } + +// LeadingZeros8 returns the number of leading zero bits in x; the result is 8 for x == 0. +func LeadingZeros8(x uint8) int { return 8 - Len8(x) } + +// TrailingZeros8 returns the number of trailing zero bits in x; the result is 8 for x == 0. +func TrailingZeros8(x uint8) int { + return int(ntz8tab[x]) +} + +// Len8 returns the minimum number of bits required to represent x; the result is 0 for x == 0. 
+func Len8(x uint8) int { + return int(len8tab[x]) +} diff --git a/libgo/go/runtime/lfstack_32bit.go b/libgo/go/runtime/lfstack_32bit.go index f50c508..6da037e 100644 --- a/libgo/go/runtime/lfstack_32bit.go +++ b/libgo/go/runtime/lfstack_32bit.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build 386 amd64p32 arm nacl armbe m68k mips mipsle mips64p32 mips64p32le nios2 ppc s390 sh shbe sparc +// +build 386 amd64p32 arm armbe m68k mips mipsle mips64p32 mips64p32le nios2 ppc s390 sh shbe sparc package runtime diff --git a/libgo/go/runtime/libfuzzer.go b/libgo/go/runtime/libfuzzer.go new file mode 100644 index 0000000..0161955 --- /dev/null +++ b/libgo/go/runtime/libfuzzer.go @@ -0,0 +1,75 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build libfuzzer + +package runtime + +import _ "unsafe" // for go:linkname + +func libfuzzerCall(fn *byte, arg0, arg1 uintptr) + +func libfuzzerTraceCmp1(arg0, arg1 uint8) { + libfuzzerCall(&__sanitizer_cov_trace_cmp1, uintptr(arg0), uintptr(arg1)) +} + +func libfuzzerTraceCmp2(arg0, arg1 uint16) { + libfuzzerCall(&__sanitizer_cov_trace_cmp2, uintptr(arg0), uintptr(arg1)) +} + +func libfuzzerTraceCmp4(arg0, arg1 uint32) { + libfuzzerCall(&__sanitizer_cov_trace_cmp4, uintptr(arg0), uintptr(arg1)) +} + +func libfuzzerTraceCmp8(arg0, arg1 uint64) { + libfuzzerCall(&__sanitizer_cov_trace_cmp8, uintptr(arg0), uintptr(arg1)) +} + +func libfuzzerTraceConstCmp1(arg0, arg1 uint8) { + libfuzzerCall(&__sanitizer_cov_trace_const_cmp1, uintptr(arg0), uintptr(arg1)) +} + +func libfuzzerTraceConstCmp2(arg0, arg1 uint16) { + libfuzzerCall(&__sanitizer_cov_trace_const_cmp2, uintptr(arg0), uintptr(arg1)) +} + +func libfuzzerTraceConstCmp4(arg0, arg1 uint32) { + libfuzzerCall(&__sanitizer_cov_trace_const_cmp4, uintptr(arg0), uintptr(arg1)) +} + +func libfuzzerTraceConstCmp8(arg0, arg1 uint64) { + libfuzzerCall(&__sanitizer_cov_trace_const_cmp8, uintptr(arg0), uintptr(arg1)) +} + +//go:linkname __sanitizer_cov_trace_cmp1 __sanitizer_cov_trace_cmp1 +//go:cgo_import_static __sanitizer_cov_trace_cmp1 +var __sanitizer_cov_trace_cmp1 byte + +//go:linkname __sanitizer_cov_trace_cmp2 __sanitizer_cov_trace_cmp2 +//go:cgo_import_static __sanitizer_cov_trace_cmp2 +var __sanitizer_cov_trace_cmp2 byte + +//go:linkname __sanitizer_cov_trace_cmp4 __sanitizer_cov_trace_cmp4 +//go:cgo_import_static __sanitizer_cov_trace_cmp4 +var __sanitizer_cov_trace_cmp4 byte + +//go:linkname __sanitizer_cov_trace_cmp8 __sanitizer_cov_trace_cmp8 +//go:cgo_import_static __sanitizer_cov_trace_cmp8 +var __sanitizer_cov_trace_cmp8 byte + +//go:linkname __sanitizer_cov_trace_const_cmp1 __sanitizer_cov_trace_const_cmp1 +//go:cgo_import_static __sanitizer_cov_trace_const_cmp1 +var __sanitizer_cov_trace_const_cmp1 byte + +//go:linkname __sanitizer_cov_trace_const_cmp2 __sanitizer_cov_trace_const_cmp2 +//go:cgo_import_static __sanitizer_cov_trace_const_cmp2 +var __sanitizer_cov_trace_const_cmp2 byte + +//go:linkname __sanitizer_cov_trace_const_cmp4 __sanitizer_cov_trace_const_cmp4 +//go:cgo_import_static __sanitizer_cov_trace_const_cmp4 +var __sanitizer_cov_trace_const_cmp4 byte + +//go:linkname __sanitizer_cov_trace_const_cmp8 __sanitizer_cov_trace_const_cmp8 +//go:cgo_import_static __sanitizer_cov_trace_const_cmp8 +var __sanitizer_cov_trace_const_cmp8 byte diff --git a/libgo/go/runtime/lock_futex.go 
b/libgo/go/runtime/lock_futex.go index 6f86e91..f672efd 100644 --- a/libgo/go/runtime/lock_futex.go +++ b/libgo/go/runtime/lock_futex.go @@ -241,7 +241,7 @@ func notetsleepg(n *note, ns int64) bool { return ok } -func beforeIdle() bool { +func beforeIdle(int64) bool { return false } diff --git a/libgo/go/runtime/lock_js.go b/libgo/go/runtime/lock_js.go index c038499..3168c86 100644 --- a/libgo/go/runtime/lock_js.go +++ b/libgo/go/runtime/lock_js.go @@ -111,6 +111,8 @@ func notetsleepg(n *note, ns int64) bool { gopark(nil, nil, waitReasonSleep, traceEvNone, 1) clearTimeoutEvent(id) // note might have woken early, clear timeout + clearIdleID() + mp = acquirem() delete(notes, n) delete(notesWithTimeout, n) @@ -144,33 +146,65 @@ func checkTimeouts() { } } -var returnedEventHandler *g +// events is a stack of calls from JavaScript into Go. +var events []*event + +type event struct { + // g was the active goroutine when the call from JavaScript occurred. + // It needs to be active when returning to JavaScript. + gp *g + // returned reports whether the event handler has returned. + // When all goroutines are idle and the event handler has returned, + // then g gets resumed and returns the execution to JavaScript. + returned bool +} -func init() { - // At the toplevel we need an extra goroutine that handles asynchronous events. - initg := getg() - go func() { - returnedEventHandler = getg() - goready(initg, 1) +// The timeout event started by beforeIdle. +var idleID int32 - gopark(nil, nil, waitReasonZero, traceEvNone, 1) - returnedEventHandler = nil +// beforeIdle gets called by the scheduler if no goroutine is awake. +// If we are not already handling an event, then we pause for an async event. +// If an event handler returned, we resume it and it will pause the execution. +func beforeIdle(delay int64) bool { + if delay > 0 { + clearIdleID() + if delay < 1e6 { + delay = 1 + } else if delay < 1e15 { + delay = delay / 1e6 + } else { + // An arbitrary cap on how long to wait for a timer. + // 1e9 ms == ~11.5 days. + delay = 1e9 + } + idleID = scheduleTimeoutEvent(delay) + } - pause(getcallersp() - 16) - }() - gopark(nil, nil, waitReasonZero, traceEvNone, 1) -} + if len(events) == 0 { + go handleAsyncEvent() + return true + } -// beforeIdle gets called by the scheduler if no goroutine is awake. -// We resume the event handler (if available) which will pause the execution. -func beforeIdle() bool { - if returnedEventHandler != nil { - goready(returnedEventHandler, 1) + e := events[len(events)-1] + if e.returned { + goready(e.gp, 1) return true } return false } +func handleAsyncEvent() { + pause(getcallersp() - 16) +} + +// clearIdleID clears our record of the timeout started by beforeIdle. +func clearIdleID() { + if idleID != 0 { + clearTimeoutEvent(idleID) + idleID = 0 + } +} + // pause sets SP to newsp and pauses the execution of Go's WebAssembly code until an event is triggered. func pause(newsp uintptr) @@ -181,18 +215,29 @@ func scheduleTimeoutEvent(ms int64) int32 // clearTimeoutEvent clears a timeout event scheduled by scheduleTimeoutEvent. func clearTimeoutEvent(id int32) +// handleEvent gets invoked on a call from JavaScript into Go. It calls the event handler of the syscall/js package +// and then parks the handler goroutine to allow other goroutines to run before giving execution back to JavaScript. +// When no other goroutine is awake any more, beforeIdle resumes the handler goroutine. 
Now that the same goroutine +// is running as was running when the call came in from JavaScript, execution can be safely passed back to JavaScript. func handleEvent() { - prevReturnedEventHandler := returnedEventHandler - returnedEventHandler = nil + e := &event{ + gp: getg(), + returned: false, + } + events = append(events, e) - checkTimeouts() eventHandler() - returnedEventHandler = getg() + clearIdleID() + + // wait until all goroutines are idle + e.returned = true gopark(nil, nil, waitReasonZero, traceEvNone, 1) - returnedEventHandler = prevReturnedEventHandler + events[len(events)-1] = nil + events = events[:len(events)-1] + // return execution to JavaScript pause(getcallersp() - 16) } diff --git a/libgo/go/runtime/lock_sema.go b/libgo/go/runtime/lock_sema.go index 5cf2406..63a6014 100644 --- a/libgo/go/runtime/lock_sema.go +++ b/libgo/go/runtime/lock_sema.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build aix darwin hurd nacl netbsd openbsd plan9 solaris windows +// +build aix darwin hurd netbsd openbsd plan9 solaris windows package runtime @@ -300,7 +300,7 @@ func notetsleepg(n *note, ns int64) bool { return ok } -func beforeIdle() bool { +func beforeIdle(int64) bool { return false } diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go index 0eee55e..fda2273 100644 --- a/libgo/go/runtime/malloc.go +++ b/libgo/go/runtime/malloc.go @@ -19,7 +19,7 @@ // fixalloc: a free-list allocator for fixed-size off-heap objects, // used to manage storage used by the allocator. // mheap: the malloc heap, managed at page (8192-byte) granularity. -// mspan: a run of pages managed by the mheap. +// mspan: a run of in-use pages managed by the mheap. // mcentral: collects all spans of a given size class. // mcache: a per-P cache of mspans with free space. // mstats: allocation statistics. @@ -56,13 +56,8 @@ // it is placed on the mcentral free list for the mspan's size // class. // -// 3. Otherwise, if all objects in the mspan are free, the mspan -// is now "idle", so it is returned to the mheap and no longer -// has a size class. -// This may coalesce it with adjacent idle mspans. -// -// 4. If an mspan remains idle for long enough, return its pages -// to the operating system. +// 3. Otherwise, if all objects in the mspan are free, the mspan's +// pages are returned to the mheap and the mspan is now dead. // // Allocating and freeing a large object uses the mheap // directly, bypassing the mcache and mcentral. @@ -207,17 +202,21 @@ const ( // exceed Go's 48 bit limit, it's extremely unlikely in // practice. // - // On aix/ppc64, the limits is increased to 1<<60 to accept addresses - // returned by mmap syscall. These are in range: - // 0x0a00000000000000 - 0x0afffffffffffff - // // On 32-bit platforms, we accept the full 32-bit address // space because doing so is cheap. // mips32 only has access to the low 2GB of virtual memory, so // we further limit it to 31 bits. // + // On darwin/arm64, although 64-bit pointers are presumably + // available, pointers are truncated to 33 bits. Furthermore, + // only the top 4 GiB of the address space are actually available + // to the application, but we allow the whole 33 bits anyway for + // simplicity. + // TODO(mknyszek): Consider limiting it to 32 bits and using + // arenaBaseOffset to offset into the top 4 GiB. + // // WebAssembly currently has a limit of 4GB linear memory. 
- heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosAix))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 60*(sys.GoosAix*_64bit) + heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosDarwin*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosDarwin*sys.GoarchArm64 // maxAlloc is the maximum size of an allocation. On 64-bit, // it's theoretically possible to allocate 1<<heapAddrBits bytes. On @@ -236,7 +235,6 @@ const ( // Platform Addr bits Arena size L1 entries L2 entries // -------------- --------- ---------- ---------- ----------- // */64-bit 48 64MB 1 4M (32MB) - // aix/64-bit 60 256MB 4096 4M (32MB) // windows/64-bit 48 4MB 64 1M (8MB) // */32-bit 32 4MB 1 1024 (4KB) // */mips(le) 31 4MB 1 512 (2KB) @@ -258,7 +256,7 @@ const ( // logHeapArenaBytes is log_2 of heapArenaBytes. For clarity, // prefer using heapArenaBytes where possible (we need the // constant to compute some other constants). - logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)*(1-sys.GoosAix)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + (8+20)*(sys.GoosAix*_64bit) + logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)*(1-sys.GoarchWasm)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + (2+20)*sys.GoarchWasm // heapArenaBitmapBytes is the size of each heap arena's bitmap. heapArenaBitmapBytes = heapArenaBytes / (sys.PtrSize * 8 / 2) @@ -278,10 +276,7 @@ const ( // We use the L1 map on 64-bit Windows because the arena size // is small, but the address space is still 48 bits, and // there's a high cost to having a large L2. - // - // We use the L1 map on aix/ppc64 to keep the same L2 value - // as on Linux. - arenaL1Bits = 6*(_64bit*sys.GoosWindows) + 12*(sys.GoosAix*_64bit) + arenaL1Bits = 6 * (_64bit * sys.GoosWindows) // arenaL2Bits is the number of bits of the arena number // covered by the second level arena index. @@ -308,9 +303,15 @@ const ( // bits. This offset lets us handle "negative" addresses (or // high addresses if viewed as unsigned). // + // On aix/ppc64, this offset allows to keep the heapAddrBits to + // 48. Otherwize, it would be 60 in order to handle mmap addresses + // (in range 0x0a00000000000000 - 0x0afffffffffffff). But in this + // case, the memory reserved in (s *pageAlloc).init for chunks + // is causing important slowdowns. + // // On other platforms, the user address space is contiguous // and starts at 0, so no offset is necessary. - arenaBaseOffset uintptr = sys.GoarchAmd64 * (1 << 47) + arenaBaseOffset = sys.GoarchAmd64*(1<<47) + (^0x0a00000000000000+1)&uintptrMask*sys.GoosAix // Max number of threads to run garbage collection. // 2, 3, and 4 are all plausible maximums depending @@ -444,6 +445,10 @@ func mallocinit() { // The OS init code failed to fetch the physical page size. throw("failed to get system page size") } + if physPageSize > maxPhysPageSize { + print("system page size (", physPageSize, ") is larger than maximum page size (", maxPhysPageSize, ")\n") + throw("bad system page size") + } if physPageSize < minPhysPageSize { print("system page size (", physPageSize, ") is smaller than minimum page size (", minPhysPageSize, ")\n") throw("bad system page size") @@ -456,6 +461,13 @@ func mallocinit() { print("system huge page size (", physHugePageSize, ") must be a power of 2\n") throw("bad system huge page size") } + if physHugePageSize > maxPhysHugePageSize { + // physHugePageSize is greater than the maximum supported huge page size. 
+ // Don't throw here, like in the other cases, since a system configured + // in this way isn't wrong, we just don't have the code to support them. + // Instead, silently set the huge page size to zero. + physHugePageSize = 0 + } if physHugePageSize != 0 { // Since physHugePageSize is a power of 2, it suffices to increase // physHugePageShift until 1<<physHugePageShift == physHugePageSize. @@ -579,7 +591,7 @@ func mallocinit() { if mheap_.heapArenaAlloc.next <= p && p < mheap_.heapArenaAlloc.end { p = mheap_.heapArenaAlloc.end } - p = round(p+(256<<10), heapArenaBytes) + p = alignUp(p+(256<<10), heapArenaBytes) // Because we're worried about fragmentation on // 32-bit, we try to make a large initial reservation. arenaSizes := [...]uintptr{ @@ -612,7 +624,7 @@ func mallocinit() { // // h must be locked. func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) { - n = round(n, heapArenaBytes) + n = alignUp(n, heapArenaBytes) // First, try the arena pre-reservation. v = h.arena.alloc(n, heapArenaBytes, &memstats.heap_sys) @@ -795,7 +807,7 @@ retry: // re-reserve the aligned sub-region. This may race, // so we may have to try again. sysFree(unsafe.Pointer(p), size+align, nil) - p = round(p, align) + p = alignUp(p, align) p2 := sysReserve(unsafe.Pointer(p), size) if p != uintptr(p2) { // Must have raced. Try again. @@ -809,7 +821,7 @@ retry: return p2, size default: // Trim off the unaligned parts. - pAligned := round(p, align) + pAligned := alignUp(p, align) sysFree(unsafe.Pointer(p), pAligned-p, nil) end := pAligned + size endLen := (p + size + align) - end @@ -998,11 +1010,11 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { off := c.tinyoffset // Align tiny pointer for required (conservative) alignment. if size&7 == 0 { - off = round(off, 8) + off = alignUp(off, 8) } else if size&3 == 0 { - off = round(off, 4) + off = alignUp(off, 4) } else if size&1 == 0 { - off = round(off, 2) + off = alignUp(off, 2) } if off+size <= maxTinySize && c.tiny != 0 { // The object fits into existing tiny block. @@ -1160,7 +1172,7 @@ func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { // pays the debt down to npage pages. 
deductSweepCredit(npages*_PageSize, npages) - s := mheap_.alloc(npages, makeSpanClass(0, noscan), true, needzero) + s := mheap_.alloc(npages, makeSpanClass(0, noscan), needzero) if s == nil { throw("out of memory") } @@ -1338,7 +1350,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { lock(&globalAlloc.mutex) persistent = &globalAlloc.persistentAlloc } - persistent.off = round(persistent.off, align) + persistent.off = alignUp(persistent.off, align) if persistent.off+size > persistentChunkSize || persistent.base == nil { persistent.base = (*notInHeap)(sysAlloc(persistentChunkSize, &memstats.other_sys)) if persistent.base == nil { @@ -1356,7 +1368,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { break } } - persistent.off = round(sys.PtrSize, align) + persistent.off = alignUp(sys.PtrSize, align) } p := persistent.base.add(persistent.off) persistent.off += size @@ -1402,12 +1414,12 @@ func (l *linearAlloc) init(base, size uintptr) { } func (l *linearAlloc) alloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { - p := round(l.next, align) + p := alignUp(l.next, align) if p+size > l.end { return nil } l.next = p + size - if pEnd := round(l.next-1, physPageSize); pEnd > l.mapped { + if pEnd := alignUp(l.next-1, physPageSize); pEnd > l.mapped { // Transition from Reserved to Prepared to Ready. sysMap(unsafe.Pointer(l.mapped), pEnd-l.mapped, sysStat) sysUsed(unsafe.Pointer(l.mapped), pEnd-l.mapped) diff --git a/libgo/go/runtime/malloc_test.go b/libgo/go/runtime/malloc_test.go index c9282ba..bd30bc1 100644 --- a/libgo/go/runtime/malloc_test.go +++ b/libgo/go/runtime/malloc_test.go @@ -170,6 +170,14 @@ func TestTinyAlloc(t *testing.T) { } } +func TestPageCacheLeak(t *testing.T) { + defer GOMAXPROCS(GOMAXPROCS(1)) + leaked := PageCachePagesLeaked() + if leaked != 0 { + t.Fatalf("found %d leaked pages in page caches", leaked) + } +} + func TestPhysicalMemoryUtilization(t *testing.T) { got := runTestProg(t, "testprog", "GCPhys") want := "OK\n" @@ -178,6 +186,19 @@ func TestPhysicalMemoryUtilization(t *testing.T) { } } +func TestScavengedBitsCleared(t *testing.T) { + var mismatches [128]BitsMismatch + if n, ok := CheckScavengedBitsCleared(mismatches[:]); !ok { + t.Errorf("uncleared scavenged bits") + for _, m := range mismatches[:n] { + t.Logf("\t@ address 0x%x", m.Base) + t.Logf("\t| got: %064b", m.Got) + t.Logf("\t| want: %064b", m.Want) + } + t.FailNow() + } +} + type acLink struct { x [1 << 20]byte } diff --git a/libgo/go/runtime/map.go b/libgo/go/runtime/map.go index 3672908..6667fe7 100644 --- a/libgo/go/runtime/map.go +++ b/libgo/go/runtime/map.go @@ -1429,5 +1429,5 @@ func reflectlite_maplen(h *hmap) int { return h.count } -const maxZero = 1024 // must match value in cmd/compile/internal/gc/walk.go +const maxZero = 1024 // must match value in cmd/compile/internal/gc/walk.go:zeroValSize var zeroVal [maxZero]byte diff --git a/libgo/go/runtime/map_benchmark_test.go b/libgo/go/runtime/map_benchmark_test.go index cf04ead..bae1aa0 100644 --- a/libgo/go/runtime/map_benchmark_test.go +++ b/libgo/go/runtime/map_benchmark_test.go @@ -251,7 +251,7 @@ func BenchmarkMapLast(b *testing.B) { } func BenchmarkMapCycle(b *testing.B) { - // Arrange map entries to be a permuation, so that + // Arrange map entries to be a permutation, so that // we hit all entries, and one lookup is data dependent // on the previous lookup. 
const N = 3127 diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go index b84fe0f..457da13 100644 --- a/libgo/go/runtime/mbitmap.go +++ b/libgo/go/runtime/mbitmap.go @@ -243,6 +243,10 @@ func (s *mspan) nextFreeIndex() uintptr { } // isFree reports whether the index'th object in s is unallocated. +// +// The caller must ensure s.state is mSpanInUse, and there must have +// been no preemption points since ensuring this (which could allow a +// GC transition, which would allow the state to change). func (s *mspan) isFree(index uintptr) bool { if index < s.freeindex { return false @@ -349,6 +353,33 @@ func heapBitsForAddr(addr uintptr) (h heapBits) { return } +// badPointer throws bad pointer in heap panic. +func badPointer(s *mspan, p, refBase, refOff uintptr) { + // Typically this indicates an incorrect use + // of unsafe or cgo to store a bad pointer in + // the Go heap. It may also indicate a runtime + // bug. + // + // TODO(austin): We could be more aggressive + // and detect pointers to unallocated objects + // in allocated spans. + printlock() + print("runtime: pointer ", hex(p)) + state := s.state.get() + if state != mSpanInUse { + print(" to unallocated span") + } else { + print(" to unused region of span") + } + print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", state, "\n") + if refBase != 0 { + print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n") + gcDumpObject("object", refBase, refOff) + } + getg().m.traceback = 2 + throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)") +} + // findObject returns the base address for the heap object containing // the address p, the object's span, and the index of the object in s. // If p does not point into a heap object, it returns base == 0. @@ -362,42 +393,30 @@ func heapBitsForAddr(addr uintptr) (h heapBits) { // refBase and refOff optionally give the base address of the object // in which the pointer p was found and the byte offset at which it // was found. These are used for error reporting. +// +// It is nosplit so it is safe for p to be a pointer to the current goroutine's stack. +// Since p is a uintptr, it would not be adjusted if the stack were to move. +//go:nosplit func findObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, s *mspan, objIndex uintptr) { s = spanOf(p) + // If s is nil, the virtual address has never been part of the heap. + // This pointer may be to some mmap'd region, so we allow it. + if s == nil { + return + } // If p is a bad pointer, it may not be in s's bounds. - if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse { - if s == nil || s.state == mSpanManual || forStack { - // If s is nil, the virtual address has never been part of the heap. - // This pointer may be to some mmap'd region, so we allow it. - // Pointers into stacks are also ok, the runtime manages these explicitly. + // + // Check s.state to synchronize with span initialization + // before checking other fields. See also spanOfHeap. + if state := s.state.get(); state != mSpanInUse || p < s.base() || p >= s.limit { + // Pointers into stacks are also ok, the runtime manages these explicitly. + if state == mSpanManual || forStack { return } - // The following ensures that we are rigorous about what data // structures hold valid pointers. if debug.invalidptr != 0 { - // Typically this indicates an incorrect use - // of unsafe or cgo to store a bad pointer in - // the Go heap. It may also indicate a runtime - // bug. 
- // - // TODO(austin): We could be more aggressive - // and detect pointers to unallocated objects - // in allocated spans. - printlock() - print("runtime: pointer ", hex(p)) - if s.state != mSpanInUse { - print(" to unallocated span") - } else { - print(" to unused region of span") - } - print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n") - if refBase != 0 { - print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n") - gcDumpObject("object", refBase, refOff) - } - getg().m.traceback = 2 - throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)") + badPointer(s, p, refBase, refOff) } return } @@ -629,7 +648,7 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { } } return - } else if s.state != mSpanInUse || dst < s.base() || s.limit <= dst { + } else if s.state.get() != mSpanInUse || dst < s.base() || s.limit <= dst { // dst was heap memory at some point, but isn't now. // It can't be a global. It must be either our stack, // or in the case of direct channel sends, it could be @@ -801,29 +820,19 @@ func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) { // words to pointer/scan. // Otherwise, it initializes all words to scalar/dead. func (h heapBits) initSpan(s *mspan) { - size, n, total := s.layout() - - // Init the markbit structures - s.freeindex = 0 - s.allocCache = ^uint64(0) // all 1s indicating all free. - s.nelems = n - s.allocBits = nil - s.gcmarkBits = nil - s.gcmarkBits = newMarkBits(s.nelems) - s.allocBits = newAllocBits(s.nelems) - // Clear bits corresponding to objects. - nw := total / sys.PtrSize + nw := (s.npages << _PageShift) / sys.PtrSize if nw%wordsPerBitmapByte != 0 { throw("initSpan: unaligned length") } if h.shift != 0 { throw("initSpan: unaligned base") } + isPtrs := sys.PtrSize == 8 && s.elemsize == sys.PtrSize for nw > 0 { hNext, anw := h.forwardOrBoundary(nw) nbyte := anw / wordsPerBitmapByte - if sys.PtrSize == 8 && size == sys.PtrSize { + if isPtrs { bitp := h.bitp for i := uintptr(0); i < nbyte; i++ { *bitp = bitPointerAll | bitScanAll diff --git a/libgo/go/runtime/mcentral.go b/libgo/go/runtime/mcentral.go index cd59010..78a3ae6 100644 --- a/libgo/go/runtime/mcentral.go +++ b/libgo/go/runtime/mcentral.go @@ -243,7 +243,7 @@ func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool { c.nonempty.remove(s) unlock(&c.lock) - mheap_.freeSpan(s, false) + mheap_.freeSpan(s) return true } @@ -252,7 +252,7 @@ func (c *mcentral) grow() *mspan { npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) size := uintptr(class_to_size[c.spanclass.sizeclass()]) - s := mheap_.alloc(npages, c.spanclass, false, true) + s := mheap_.alloc(npages, c.spanclass, true) if s == nil { return nil } diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go index 46b7334..b0040f9 100644 --- a/libgo/go/runtime/mgc.go +++ b/libgo/go/runtime/mgc.go @@ -139,6 +139,10 @@ const ( _ConcurrentSweep = true _FinBlockSize = 4 * 1024 + // debugScanConservative enables debug logging for stack + // frames that are scanned conservatively. + debugScanConservative = false + // sweepMinHeapDistance is a lower bound on the heap distance // (in bytes) reserved for concurrent sweeping between GC // cycles. @@ -231,6 +235,8 @@ func setGCPercent(in int32) (out int32) { gcSetTriggerRatio(memstats.triggerRatio) unlock(&mheap_.lock) }) + // Pacing changed, so the scavenger should be awoken. 
+ wakeScavenger() // If we just disabled GC, wait for any concurrent GC mark to // finish so we always return with no GC running. @@ -490,25 +496,25 @@ func (c *gcControllerState) revise() { } live := atomic.Load64(&memstats.heap_live) - var heapGoal, scanWorkExpected int64 - if live <= memstats.next_gc { - // We're under the soft goal. Pace GC to complete at - // next_gc assuming the heap is in steady-state. - heapGoal = int64(memstats.next_gc) + // Assume we're under the soft goal. Pace GC to complete at + // next_gc assuming the heap is in steady-state. + heapGoal := int64(memstats.next_gc) - // Compute the expected scan work remaining. - // - // This is estimated based on the expected - // steady-state scannable heap. For example, with - // GOGC=100, only half of the scannable heap is - // expected to be live, so that's what we target. - // - // (This is a float calculation to avoid overflowing on - // 100*heap_scan.) - scanWorkExpected = int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent)) - } else { - // We're past the soft goal. Pace GC so that in the - // worst case it will complete by the hard goal. + // Compute the expected scan work remaining. + // + // This is estimated based on the expected + // steady-state scannable heap. For example, with + // GOGC=100, only half of the scannable heap is + // expected to be live, so that's what we target. + // + // (This is a float calculation to avoid overflowing on + // 100*heap_scan.) + scanWorkExpected := int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent)) + + if live > memstats.next_gc || c.scanWork > scanWorkExpected { + // We're past the soft goal, or we've already done more scan + // work than we expected. Pace GC so that in the worst case it + // will complete by the hard goal. const maxOvershoot = 1.1 heapGoal = int64(float64(memstats.next_gc) * maxOvershoot) @@ -520,7 +526,7 @@ func (c *gcControllerState) revise() { // // Note that we currently count allocations during GC as both // scannable heap (heap_scan) and scan work completed - // (scanWork), so allocation will change this difference will + // (scanWork), so allocation will change this difference // slowly in the soft regime and not at all in the hard // regime. scanWorkRemaining := scanWorkExpected - c.scanWork @@ -765,11 +771,25 @@ func gcSetTriggerRatio(triggerRatio float64) { goal = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100 } + // If we let triggerRatio go too low, then if the application + // is allocating very rapidly we might end up in a situation + // where we're allocating black during a nearly always-on GC. + // The result of this is a growing heap and ultimately an + // increase in RSS. By capping us at a point >0, we're essentially + // saying that we're OK using more CPU during the GC to prevent + // this growth in RSS. + // + // The current constant was chosen empirically: given a sufficiently + // fast/scalable allocator with 48 Ps that could drive the trigger ratio + // to <0.05, this constant causes applications to retain the same peak + // RSS compared to not having this allocator. + const minTriggerRatio = 0.6 + // Set the trigger ratio, capped to reasonable bounds. - if triggerRatio < 0 { + if triggerRatio < minTriggerRatio { // This can happen if the mutator is allocating very // quickly or the GC is scanning very slowly. - triggerRatio = 0 + triggerRatio = minTriggerRatio } else if gcpercent >= 0 { // Ensure there's always a little margin so that the // mutator assist ratio isn't infinity. 
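The revised pacer above chooses between a soft and a hard heap goal: with GOGC=100 only about half of the scannable heap is expected to be live, so the expected remaining scan work is heap_scan*100/(100+GOGC), and once the live heap passes next_gc, or more scan work than expected has already been done, pacing switches to next_gc*1.1. The following arithmetic sketch is not from this patch and all byte counts in it are invented; it only reproduces the visible formulas.

package main

import "fmt"

func main() {
    const (
        gcpercent    = 100 // GOGC
        maxOvershoot = 1.1 // the hard goal is 10% past the soft goal
    )
    var (
        nextGC   = uint64(200 << 20) // soft heap goal (bytes), hypothetical
        heapScan = uint64(120 << 20) // scannable heap (bytes), hypothetical
        live     = uint64(210 << 20) // current live heap, already past nextGC
        scanWork = int64(30 << 20)   // scan work done so far
    )

    heapGoal := int64(nextGC)
    scanWorkExpected := int64(float64(heapScan) * 100 / float64(100+gcpercent))

    if live > nextGC || scanWork > scanWorkExpected {
        // Past the soft goal (or ahead of the expected scan work):
        // pace against the hard goal instead.
        heapGoal = int64(float64(nextGC) * maxOvershoot)
    }

    fmt.Printf("scanWorkExpected=%d MiB heapGoal=%d MiB\n",
        scanWorkExpected>>20, heapGoal>>20)
}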
@@ -847,7 +867,8 @@ func gcSetTriggerRatio(triggerRatio float64) { heapDistance = _PageSize } pagesSwept := atomic.Load64(&mheap_.pagesSwept) - sweepDistancePages := int64(mheap_.pagesInUse) - int64(pagesSwept) + pagesInUse := atomic.Load64(&mheap_.pagesInUse) + sweepDistancePages := int64(pagesInUse) - int64(pagesSwept) if sweepDistancePages <= 0 { mheap_.sweepPagesPerByte = 0 } else { @@ -1250,6 +1271,7 @@ func gcStart(trigger gcTrigger) { } // Ok, we're doing it! Stop everybody else + semacquire(&gcsema) semacquire(&worldsema) if trace.enabled { @@ -1348,6 +1370,13 @@ func gcStart(trigger gcTrigger) { work.pauseNS += now - work.pauseStart work.tMark = now }) + + // Release the world sema before Gosched() in STW mode + // because we will need to reacquire it later but before + // this goroutine becomes runnable again, and we could + // self-deadlock otherwise. + semrelease(&worldsema) + // In STW mode, we could block the instant systemstack // returns, so don't do anything important here. Make sure we // block rather than returning to user code. @@ -1417,6 +1446,10 @@ top: return } + // forEachP needs worldsema to execute, and we'll need it to + // stop the world later, so acquire worldsema now. + semacquire(&worldsema) + // Flush all local buffers and collect flushedWork flags. gcMarkDoneFlushed = 0 systemstack(func() { @@ -1477,6 +1510,7 @@ top: // work to do. Keep going. It's possible the // transition condition became true again during the // ragged barrier, so re-check it. + semrelease(&worldsema) goto top } @@ -1553,6 +1587,7 @@ top: now := startTheWorldWithSema(true) work.pauseNS += now - work.pauseStart }) + semrelease(&worldsema) goto top } } @@ -1651,9 +1686,16 @@ func gcMarkTermination(nextTriggerRatio float64) { throw("gc done but gcphase != _GCoff") } + // Record next_gc and heap_inuse for scavenger. + memstats.last_next_gc = memstats.next_gc + memstats.last_heap_inuse = memstats.heap_inuse + // Update GC trigger and pacing for the next cycle. gcSetTriggerRatio(nextTriggerRatio) + // Pacing changed, so the scavenger should be awoken. + wakeScavenger() + // Update timing memstats now := nanotime() sec, nsec, _ := time_now() @@ -1760,6 +1802,7 @@ func gcMarkTermination(nextTriggerRatio float64) { } semrelease(&worldsema) + semrelease(&gcsema) // Careful: another GC cycle may start now. releasem(mp) @@ -2152,8 +2195,7 @@ func gcResetMarkState() { // allgs doesn't change. lock(&allglock) for _, gp := range allgs { - gp.gcscandone = false // set to true in gcphasework - gp.gcscanvalid = false // stack has not been scanned + gp.gcscandone = false // set to true in gcphasework gp.gcAssistBytes = 0 } unlock(&allglock) diff --git a/libgo/go/runtime/mgc_gccgo.go b/libgo/go/runtime/mgc_gccgo.go index d7ae260..21539eb 100644 --- a/libgo/go/runtime/mgc_gccgo.go +++ b/libgo/go/runtime/mgc_gccgo.go @@ -145,40 +145,15 @@ func registerGCRoots(r *gcRootList) { // and carries on. func checkPreempt() { gp := getg() - if !gp.preempt || gp != gp.m.curg || gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" || gp.m.incgo { + if !gp.preempt || gp != gp.m.curg || !canPreemptM(gp.m) { return } - // Synchronize with scang. - gp.scanningself = true - casgstatus(gp, _Grunning, _Gwaiting) - if gp.preemptscan { - for !castogscanstatus(gp, _Gwaiting, _Gscanwaiting) { - // Likely to be racing with the GC as - // it sees a _Gwaiting and does the - // stack scan. If so, gcworkdone will - // be set and gcphasework will simply - // return. 
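The gcStart change above releases worldsema before the STW-mode Gosched() because the goroutine will need to reacquire it later and could otherwise self-deadlock. Here is a small, hedged sketch of that ordering concern only; worldsema below is just a buffered channel standing in for a counting semaphore, not the runtime's semaphore implementation.

package main

import "fmt"

type sema chan struct{}

func (s sema) acquire() { s <- struct{}{} }
func (s sema) release() { <-s }

func main() {
    worldsema := make(sema, 1)

    worldsema.acquire()
    // ... do the work that required worldsema ...
    worldsema.release() // release *before* handing off; the next stage reacquires

    done := make(chan struct{})
    go func() {
        worldsema.acquire() // would block forever if the semaphore were still held above
        defer worldsema.release()
        close(done)
    }()
    <-done
    fmt.Println("no self-deadlock")
}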
- } - if !gp.gcscandone { - mp := acquirem() - gcw := &gp.m.p.ptr().gcw - scanstack(gp, gcw) - releasem(mp) - gp.gcscandone = true - } - gp.preemptscan = false - gp.preempt = false - casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting) - // This clears gcscanvalid. - casgstatus(gp, _Gwaiting, _Grunning) - gp.scanningself = false - return + if gp.preemptStop { + mcall(preemptPark) } // Act like goroutine called runtime.Gosched. - casgstatus(gp, _Gwaiting, _Grunning) - gp.scanningself = false mcall(gopreempt_m) } diff --git a/libgo/go/runtime/mgclarge.go b/libgo/go/runtime/mgclarge.go deleted file mode 100644 index 414db10..0000000 --- a/libgo/go/runtime/mgclarge.go +++ /dev/null @@ -1,657 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Page heap. -// -// See malloc.go for the general overview. -// -// Allocation policy is the subject of this file. All free spans live in -// a treap for most of their time being free. See -// https://en.wikipedia.org/wiki/Treap or -// https://faculty.washington.edu/aragon/pubs/rst89.pdf for an overview. -// sema.go also holds an implementation of a treap. -// -// Each treapNode holds a single span. The treap is sorted by base address -// and each span necessarily has a unique base address. -// Spans are returned based on a first-fit algorithm, acquiring the span -// with the lowest base address which still satisfies the request. -// -// The first-fit algorithm is possible due to an augmentation of each -// treapNode to maintain the size of the largest span in the subtree rooted -// at that treapNode. Below we refer to this invariant as the maxPages -// invariant. -// -// The primary routines are -// insert: adds a span to the treap -// remove: removes the span from that treap that best fits the required size -// removeSpan: which removes a specific span from the treap -// -// Whenever a pointer to a span which is owned by the treap is acquired, that -// span must not be mutated. To mutate a span in the treap, remove it first. -// -// mheap_.lock must be held when manipulating this data structure. - -package runtime - -import ( - "unsafe" -) - -//go:notinheap -type mTreap struct { - treap *treapNode - unscavHugePages uintptr // number of unscavenged huge pages in the treap -} - -//go:notinheap -type treapNode struct { - right *treapNode // all treapNodes > this treap node - left *treapNode // all treapNodes < this treap node - parent *treapNode // direct parent of this node, nil if root - key uintptr // base address of the span, used as primary sort key - span *mspan // span at base address key - maxPages uintptr // the maximum size of any span in this subtree, including the root - priority uint32 // random number used by treap algorithm to keep tree probabilistically balanced - types treapIterFilter // the types of spans available in this subtree -} - -// updateInvariants is a helper method which has a node recompute its own -// maxPages and types values by looking at its own span as well as the -// values of its direct children. -// -// Returns true if anything changed. 
-func (t *treapNode) updateInvariants() bool { - m, i := t.maxPages, t.types - t.maxPages = t.span.npages - t.types = t.span.treapFilter() - if t.left != nil { - t.types |= t.left.types - if t.maxPages < t.left.maxPages { - t.maxPages = t.left.maxPages - } - } - if t.right != nil { - t.types |= t.right.types - if t.maxPages < t.right.maxPages { - t.maxPages = t.right.maxPages - } - } - return m != t.maxPages || i != t.types -} - -// findMinimal finds the minimal (lowest base addressed) node in the treap -// which matches the criteria set out by the filter f and returns nil if -// none exists. -// -// This algorithm is functionally the same as (*mTreap).find, so see that -// method for more details. -func (t *treapNode) findMinimal(f treapIterFilter) *treapNode { - if t == nil || !f.matches(t.types) { - return nil - } - for t != nil { - if t.left != nil && f.matches(t.left.types) { - t = t.left - } else if f.matches(t.span.treapFilter()) { - break - } else if t.right != nil && f.matches(t.right.types) { - t = t.right - } else { - println("runtime: f=", f) - throw("failed to find minimal node matching filter") - } - } - return t -} - -// findMaximal finds the maximal (highest base addressed) node in the treap -// which matches the criteria set out by the filter f and returns nil if -// none exists. -// -// This algorithm is the logical inversion of findMinimal and just changes -// the order of the left and right tests. -func (t *treapNode) findMaximal(f treapIterFilter) *treapNode { - if t == nil || !f.matches(t.types) { - return nil - } - for t != nil { - if t.right != nil && f.matches(t.right.types) { - t = t.right - } else if f.matches(t.span.treapFilter()) { - break - } else if t.left != nil && f.matches(t.left.types) { - t = t.left - } else { - println("runtime: f=", f) - throw("failed to find minimal node matching filter") - } - } - return t -} - -// pred returns the predecessor of t in the treap subject to the criteria -// specified by the filter f. Returns nil if no such predecessor exists. -func (t *treapNode) pred(f treapIterFilter) *treapNode { - if t.left != nil && f.matches(t.left.types) { - // The node has a left subtree which contains at least one matching - // node, find the maximal matching node in that subtree. - return t.left.findMaximal(f) - } - // Lacking a left subtree, look to the parents. - p := t // previous node - t = t.parent - for t != nil { - // Walk up the tree until we find a node that has a left subtree - // that we haven't already visited. - if t.right == p { - if f.matches(t.span.treapFilter()) { - // If this node matches, then it's guaranteed to be the - // predecessor since everything to its left is strictly - // greater. - return t - } else if t.left != nil && f.matches(t.left.types) { - // Failing the root of this subtree, if its left subtree has - // something, that's where we'll find our predecessor. - return t.left.findMaximal(f) - } - } - p = t - t = t.parent - } - // If the parent is nil, then we've hit the root without finding - // a suitable left subtree containing the node (and the predecessor - // wasn't on the path). Thus, there's no predecessor, so just return - // nil. - return nil -} - -// succ returns the successor of t in the treap subject to the criteria -// specified by the filter f. Returns nil if no such successor exists. -func (t *treapNode) succ(f treapIterFilter) *treapNode { - // See pred. This method is just the logical inversion of it. 
- if t.right != nil && f.matches(t.right.types) { - return t.right.findMinimal(f) - } - p := t - t = t.parent - for t != nil { - if t.left == p { - if f.matches(t.span.treapFilter()) { - return t - } else if t.right != nil && f.matches(t.right.types) { - return t.right.findMinimal(f) - } - } - p = t - t = t.parent - } - return nil -} - -// isSpanInTreap is handy for debugging. One should hold the heap lock, usually -// mheap_.lock(). -func (t *treapNode) isSpanInTreap(s *mspan) bool { - if t == nil { - return false - } - return t.span == s || t.left.isSpanInTreap(s) || t.right.isSpanInTreap(s) -} - -// walkTreap is handy for debugging and testing. -// Starting at some treapnode t, for example the root, do a depth first preorder walk of -// the tree executing fn at each treap node. One should hold the heap lock, usually -// mheap_.lock(). -func (t *treapNode) walkTreap(fn func(tn *treapNode)) { - if t == nil { - return - } - fn(t) - t.left.walkTreap(fn) - t.right.walkTreap(fn) -} - -// checkTreapNode when used in conjunction with walkTreap can usually detect a -// poorly formed treap. -func checkTreapNode(t *treapNode) { - if t == nil { - return - } - if t.span.next != nil || t.span.prev != nil || t.span.list != nil { - throw("span may be on an mSpanList while simultaneously in the treap") - } - if t.span.base() != t.key { - println("runtime: checkTreapNode treapNode t=", t, " t.key=", t.key, - "t.span.base()=", t.span.base()) - throw("why does span.base() and treap.key do not match?") - } - if t.left != nil && t.key < t.left.key { - throw("found out-of-order spans in treap (left child has greater base address)") - } - if t.right != nil && t.key > t.right.key { - throw("found out-of-order spans in treap (right child has lesser base address)") - } -} - -// validateInvariants is handy for debugging and testing. -// It ensures that the various invariants on each treap node are -// appropriately maintained throughout the treap by walking the -// treap in a post-order manner. -func (t *treapNode) validateInvariants() (uintptr, treapIterFilter) { - if t == nil { - return 0, 0 - } - leftMax, leftTypes := t.left.validateInvariants() - rightMax, rightTypes := t.right.validateInvariants() - max := t.span.npages - if leftMax > max { - max = leftMax - } - if rightMax > max { - max = rightMax - } - if max != t.maxPages { - println("runtime: t.maxPages=", t.maxPages, "want=", max) - throw("maxPages invariant violated in treap") - } - typ := t.span.treapFilter() | leftTypes | rightTypes - if typ != t.types { - println("runtime: t.types=", t.types, "want=", typ) - throw("types invariant violated in treap") - } - return max, typ -} - -// treapIterType represents the type of iteration to perform -// over the treap. Each different flag is represented by a bit -// in the type, and types may be combined together by a bitwise -// or operation. -// -// Note that only 5 bits are available for treapIterType, do not -// use the 3 higher-order bits. This constraint is to allow for -// expansion into a treapIterFilter, which is a uint32. -type treapIterType uint8 - -const ( - treapIterScav treapIterType = 1 << iota // scavenged spans - treapIterHuge // spans containing at least one huge page - treapIterBits = iota -) - -// treapIterFilter is a bitwise filter of different spans by binary -// properties. Each bit of a treapIterFilter represents a unique -// combination of bits set in a treapIterType, in other words, it -// represents the power set of a treapIterType. 
-// -// The purpose of this representation is to allow the existence of -// a specific span type to bubble up in the treap (see the types -// field on treapNode). -// -// More specifically, any treapIterType may be transformed into a -// treapIterFilter for a specific combination of flags via the -// following operation: 1 << (0x1f&treapIterType). -type treapIterFilter uint32 - -// treapFilterAll represents the filter which allows all spans. -const treapFilterAll = ^treapIterFilter(0) - -// treapFilter creates a new treapIterFilter from two treapIterTypes. -// mask represents a bitmask for which flags we should check against -// and match for the expected result after applying the mask. -func treapFilter(mask, match treapIterType) treapIterFilter { - allow := treapIterFilter(0) - for i := treapIterType(0); i < 1<<treapIterBits; i++ { - if mask&i == match { - allow |= 1 << i - } - } - return allow -} - -// matches returns true if m and f intersect. -func (f treapIterFilter) matches(m treapIterFilter) bool { - return f&m != 0 -} - -// treapFilter returns the treapIterFilter exactly matching this span, -// i.e. popcount(result) == 1. -func (s *mspan) treapFilter() treapIterFilter { - have := treapIterType(0) - if s.scavenged { - have |= treapIterScav - } - if s.hugePages() > 0 { - have |= treapIterHuge - } - return treapIterFilter(uint32(1) << (0x1f & have)) -} - -// treapIter is a bidirectional iterator type which may be used to iterate over a -// an mTreap in-order forwards (increasing order) or backwards (decreasing order). -// Its purpose is to hide details about the treap from users when trying to iterate -// over it. -// -// To create iterators over the treap, call start or end on an mTreap. -type treapIter struct { - f treapIterFilter - t *treapNode -} - -// span returns the span at the current position in the treap. -// If the treap is not valid, span will panic. -func (i *treapIter) span() *mspan { - return i.t.span -} - -// valid returns whether the iterator represents a valid position -// in the mTreap. -func (i *treapIter) valid() bool { - return i.t != nil -} - -// next moves the iterator forward by one. Once the iterator -// ceases to be valid, calling next will panic. -func (i treapIter) next() treapIter { - i.t = i.t.succ(i.f) - return i -} - -// prev moves the iterator backwards by one. Once the iterator -// ceases to be valid, calling prev will panic. -func (i treapIter) prev() treapIter { - i.t = i.t.pred(i.f) - return i -} - -// start returns an iterator which points to the start of the treap (the -// left-most node in the treap) subject to mask and match constraints. -func (root *mTreap) start(mask, match treapIterType) treapIter { - f := treapFilter(mask, match) - return treapIter{f, root.treap.findMinimal(f)} -} - -// end returns an iterator which points to the end of the treap (the -// right-most node in the treap) subject to mask and match constraints. -func (root *mTreap) end(mask, match treapIterType) treapIter { - f := treapFilter(mask, match) - return treapIter{f, root.treap.findMaximal(f)} -} - -// mutate allows one to mutate the span without removing it from the treap via a -// callback. The span's base and size are allowed to change as long as the span -// remains in the same order relative to its predecessor and successor. -// -// Note however that any operation that causes a treap rebalancing inside of fn -// is strictly forbidden, as that may cause treap node metadata to go -// out-of-sync. 
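The treapIterFilter being deleted above encodes the power set of the flag bits, so "spans matching (mask, match)" can bubble up the treap as a single uint32. This standalone sketch is not part of the patch; it copies the two small helpers from the removed code (renamed iterScav/iterHuge, filter, matches to keep the example self-contained) to show the encoding for the two flags used here.

package main

import "fmt"

type iterType uint8

const (
    iterScav iterType = 1 << iota // scavenged spans
    iterHuge                      // spans containing at least one huge page
    iterBits = iota               // == 2
)

type iterFilter uint32

// filter marks every flag combination i whose masked bits equal match.
func filter(mask, match iterType) iterFilter {
    allow := iterFilter(0)
    for i := iterType(0); i < 1<<iterBits; i++ {
        if mask&i == match {
            allow |= 1 << i
        }
    }
    return allow
}

// matches reports whether two filters intersect.
func (f iterFilter) matches(m iterFilter) bool { return f&m != 0 }

func main() {
    // Filter selecting unscavenged spans, huge or not: combinations 0b00 and 0b10.
    unscav := filter(iterScav, 0)
    // A span that is huge but not scavenged maps to exactly one bit of the power set.
    span := iterFilter(1) << (0x1f & iterHuge)
    fmt.Printf("filter=%04b span=%04b matches=%v\n", unscav, span, unscav.matches(span))
    // Prints: filter=0101 span=0100 matches=true
}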
-func (root *mTreap) mutate(i treapIter, fn func(span *mspan)) { - s := i.span() - // Save some state about the span for later inspection. - hpages := s.hugePages() - scavenged := s.scavenged - // Call the mutator. - fn(s) - // Update unscavHugePages appropriately. - if !scavenged { - mheap_.free.unscavHugePages -= hpages - } - if !s.scavenged { - mheap_.free.unscavHugePages += s.hugePages() - } - // Update the key in case the base changed. - i.t.key = s.base() - // Updating invariants up the tree needs to happen if - // anything changed at all, so just go ahead and do it - // unconditionally. - // - // If it turns out nothing changed, it'll exit quickly. - t := i.t - for t != nil && t.updateInvariants() { - t = t.parent - } -} - -// insert adds span to the large span treap. -func (root *mTreap) insert(span *mspan) { - if !span.scavenged { - root.unscavHugePages += span.hugePages() - } - base := span.base() - var last *treapNode - pt := &root.treap - for t := *pt; t != nil; t = *pt { - last = t - if t.key < base { - pt = &t.right - } else if t.key > base { - pt = &t.left - } else { - throw("inserting span already in treap") - } - } - - // Add t as new leaf in tree of span size and unique addrs. - // The balanced tree is a treap using priority as the random heap priority. - // That is, it is a binary tree ordered according to the key, - // but then among the space of possible binary trees respecting those - // keys, it is kept balanced on average by maintaining a heap ordering - // on the priority: s.priority <= both s.right.priority and s.right.priority. - // https://en.wikipedia.org/wiki/Treap - // https://faculty.washington.edu/aragon/pubs/rst89.pdf - - t := (*treapNode)(mheap_.treapalloc.alloc()) - t.key = span.base() - t.priority = fastrand() - t.span = span - t.maxPages = span.npages - t.types = span.treapFilter() - t.parent = last - *pt = t // t now at a leaf. - - // Update the tree to maintain the various invariants. - i := t - for i.parent != nil && i.parent.updateInvariants() { - i = i.parent - } - - // Rotate up into tree according to priority. - for t.parent != nil && t.parent.priority > t.priority { - if t != nil && t.span.base() != t.key { - println("runtime: insert t=", t, "t.key=", t.key) - println("runtime: t.span=", t.span, "t.span.base()=", t.span.base()) - throw("span and treap node base addresses do not match") - } - if t.parent.left == t { - root.rotateRight(t.parent) - } else { - if t.parent.right != t { - throw("treap insert finds a broken treap") - } - root.rotateLeft(t.parent) - } - } -} - -func (root *mTreap) removeNode(t *treapNode) { - if !t.span.scavenged { - root.unscavHugePages -= t.span.hugePages() - } - if t.span.base() != t.key { - throw("span and treap node base addresses do not match") - } - // Rotate t down to be leaf of tree for removal, respecting priorities. - for t.right != nil || t.left != nil { - if t.right == nil || t.left != nil && t.left.priority < t.right.priority { - root.rotateRight(t) - } else { - root.rotateLeft(t) - } - } - // Remove t, now a leaf. - if t.parent != nil { - p := t.parent - if p.left == t { - p.left = nil - } else { - p.right = nil - } - // Walk up the tree updating invariants until no updates occur. - for p != nil && p.updateInvariants() { - p = p.parent - } - } else { - root.treap = nil - } - // Return the found treapNode's span after freeing the treapNode. 
- mheap_.treapalloc.free(unsafe.Pointer(t)) -} - -// find searches for, finds, and returns the treap iterator over all spans -// representing the position of the span with the smallest base address which is -// at least npages in size. If no span has at least npages it returns an invalid -// iterator. -// -// This algorithm is as follows: -// * If there's a left child and its subtree can satisfy this allocation, -// continue down that subtree. -// * If there's no such left child, check if the root of this subtree can -// satisfy the allocation. If so, we're done. -// * If the root cannot satisfy the allocation either, continue down the -// right subtree if able. -// * Else, break and report that we cannot satisfy the allocation. -// -// The preference for left, then current, then right, results in us getting -// the left-most node which will contain the span with the lowest base -// address. -// -// Note that if a request cannot be satisfied the fourth case will be -// reached immediately at the root, since neither the left subtree nor -// the right subtree will have a sufficient maxPages, whilst the root -// node is also unable to satisfy it. -func (root *mTreap) find(npages uintptr) treapIter { - t := root.treap - for t != nil { - if t.span == nil { - throw("treap node with nil span found") - } - // Iterate over the treap trying to go as far left - // as possible while simultaneously ensuring that the - // subtrees we choose always have a span which can - // satisfy the allocation. - if t.left != nil && t.left.maxPages >= npages { - t = t.left - } else if t.span.npages >= npages { - // Before going right, if this span can satisfy the - // request, stop here. - break - } else if t.right != nil && t.right.maxPages >= npages { - t = t.right - } else { - t = nil - } - } - return treapIter{treapFilterAll, t} -} - -// removeSpan searches for, finds, deletes span along with -// the associated treap node. If the span is not in the treap -// then t will eventually be set to nil and the t.span -// will throw. -func (root *mTreap) removeSpan(span *mspan) { - base := span.base() - t := root.treap - for t.span != span { - if t.key < base { - t = t.right - } else if t.key > base { - t = t.left - } - } - root.removeNode(t) -} - -// erase removes the element referred to by the current position of the -// iterator. This operation consumes the given iterator, so it should no -// longer be used. It is up to the caller to get the next or previous -// iterator before calling erase, if need be. -func (root *mTreap) erase(i treapIter) { - root.removeNode(i.t) -} - -// rotateLeft rotates the tree rooted at node x. -// turning (x a (y b c)) into (y (x a b) c). -func (root *mTreap) rotateLeft(x *treapNode) { - // p -> (x a (y b c)) - p := x.parent - a, y := x.left, x.right - b, c := y.left, y.right - - y.left = x - x.parent = y - y.right = c - if c != nil { - c.parent = y - } - x.left = a - if a != nil { - a.parent = x - } - x.right = b - if b != nil { - b.parent = x - } - - y.parent = p - if p == nil { - root.treap = y - } else if p.left == x { - p.left = y - } else { - if p.right != x { - throw("large span treap rotateLeft") - } - p.right = y - } - - x.updateInvariants() - y.updateInvariants() -} - -// rotateRight rotates the tree rooted at node y. -// turning (y (x a b) c) into (x a (y b c)). 
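The find() method being removed above describes a first-fit search that prefers the left child whenever its cached maxPages can satisfy the request, so the lowest-base span wins. The toy tree below is not from the patch (it uses plain page counts instead of *mspan and builds the tree by hand); it only illustrates how the maxPages augmentation drives that left/current/right preference.

package main

import "fmt"

type node struct {
    left, right *node
    base        uintptr // sort key
    npages      uintptr // size of the span at this node
    maxPages    uintptr // largest npages in the subtree rooted here
}

// find returns the lowest-base node whose npages >= want, or nil.
func find(t *node, want uintptr) *node {
    for t != nil {
        if t.left != nil && t.left.maxPages >= want {
            t = t.left // something on the left can satisfy it: go lower
        } else if t.npages >= want {
            return t // this node is the lowest-base fit
        } else if t.right != nil && t.right.maxPages >= want {
            t = t.right
        } else {
            return nil // nothing in this subtree is big enough
        }
    }
    return nil
}

func main() {
    // base:npages pairs 0x1000:1, 0x2000:4, 0x3000:2, kept in BST order by base.
    leaf1 := &node{base: 0x1000, npages: 1, maxPages: 1}
    leaf3 := &node{base: 0x3000, npages: 2, maxPages: 2}
    root := &node{left: leaf1, right: leaf3, base: 0x2000, npages: 4, maxPages: 4}

    if n := find(root, 2); n != nil {
        fmt.Printf("first fit for 2 pages: base=%#x npages=%d\n", n.base, n.npages)
    }
    // Prints base=0x2000: the left subtree's maxPages (1) cannot satisfy the
    // request, so the root is the lowest-base span with at least 2 pages.
}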
-func (root *mTreap) rotateRight(y *treapNode) { - // p -> (y (x a b) c) - p := y.parent - x, c := y.left, y.right - a, b := x.left, x.right - - x.left = a - if a != nil { - a.parent = x - } - x.right = y - y.parent = x - y.left = b - if b != nil { - b.parent = y - } - y.right = c - if c != nil { - c.parent = y - } - - x.parent = p - if p == nil { - root.treap = x - } else if p.left == y { - p.left = x - } else { - if p.right != y { - throw("large span treap rotateRight") - } - p.right = x - } - - y.updateInvariants() - x.updateInvariants() -} diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go index b6b69dd..a5af5d2 100644 --- a/libgo/go/runtime/mgcmark.go +++ b/libgo/go/runtime/mgcmark.go @@ -46,8 +46,6 @@ const ( // gcMarkRootPrepare queues root scanning jobs (stacks, globals, and // some miscellany) and initializes scanning-related state. // -// The caller must have call gcCopySpans(). -// // The world must be stopped. // //go:nowritebarrier @@ -111,8 +109,7 @@ func gcMarkRootCheck() { fail: println("gp", gp, "goid", gp.goid, "status", readgstatus(gp), - "gcscandone", gp.gcscandone, - "gcscanvalid", gp.gcscanvalid) + "gcscandone", gp.gcscandone) unlock(&allglock) // Avoid self-deadlock with traceback. throw("scan missed a g") } @@ -182,7 +179,7 @@ func markroot(gcw *gcWork, i uint32) { gp.waitsince = work.tstart } - // scang must be done on the system stack in case + // scanstack must be done on the system stack in case // we're trying to scan our own stack. systemstack(func() { // If this is a self-scan, put the user G in @@ -196,14 +193,24 @@ func markroot(gcw *gcWork, i uint32) { userG.waitreason = waitReasonGarbageCollectionScan } - // TODO: scang blocks until gp's stack has - // been scanned, which may take a while for + // TODO: suspendG blocks (and spins) until gp + // stops, which may take a while for // running goroutines. Consider doing this in // two phases where the first is non-blocking: // we scan the stacks we can and ask running // goroutines to scan themselves; and the // second blocks. - scang(gp, gcw) + stopped := suspendG(gp) + if stopped.dead { + gp.gcscandone = true + return + } + if gp.gcscandone { + throw("g already scanned") + } + scanstack(gp, gcw) + gp.gcscandone = true + resumeG(stopped) if selfScan { casgstatus(userG, _Gwaiting, _Grunning) @@ -242,13 +249,21 @@ func markrootSpans(gcw *gcWork, shard int) { sg := mheap_.sweepgen spans := mheap_.sweepSpans[mheap_.sweepgen/2%2].block(shard) // Note that work.spans may not include spans that were - // allocated between entering the scan phase and now. This is - // okay because any objects with finalizers in those spans - // must have been allocated and given finalizers after we - // entered the scan phase, so addfinalizer will have ensured - // the above invariants for them. - for _, s := range spans { - if s.state != mSpanInUse { + // allocated between entering the scan phase and now. We may + // also race with spans being added into sweepSpans when they're + // just created, and as a result we may see nil pointers in the + // spans slice. This is okay because any objects with finalizers + // in those spans must have been allocated and given finalizers + // after we entered the scan phase, so addfinalizer will have + // ensured the above invariants for them. + for i := 0; i < len(spans); i++ { + // sweepBuf.block requires that we read pointers from the block atomically. + // It also requires that we ignore nil pointers. 
+ s := (*mspan)(atomic.Loadp(unsafe.Pointer(&spans[i]))) + + // This is racing with spans being initialized, so + // check the state carefully. + if s == nil || s.state.get() != mSpanInUse { continue } // Check that this span was swept (it may be cached or uncached). @@ -600,16 +615,16 @@ func doscanstackswitch(*g, *g) // scanstack scans gp's stack, greying all pointers found on the stack. // +// scanstack will also shrink the stack if it is safe to do so. If it +// is not, it schedules a stack shrink for the next synchronous safe +// point. +// // scanstack is marked go:systemstack because it must not be preempted // while using a workbuf. // //go:nowritebarrier //go:systemstack func scanstack(gp *g, gcw *gcWork) { - if gp.gcscanvalid { - return - } - if readgstatus(gp)&_Gscan == 0 { print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n") throw("scanstack - bad status") @@ -622,17 +637,9 @@ func scanstack(gp *g, gcw *gcWork) { case _Gdead: return case _Grunning: - // ok for gccgo, though not for gc. - if usestackmaps { - print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") - throw("scanstack: goroutine not stopped") - } - case _Gsyscall: - if usestackmaps { - print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") - throw("scanstack: goroutine in syscall") - } - case _Grunnable, _Gwaiting: + print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") + throw("scanstack: goroutine not stopped") + case _Grunnable, _Gsyscall, _Gwaiting: // ok } @@ -644,6 +651,8 @@ func scanstack(gp *g, gcw *gcWork) { doscanstack(gp, gcw) } else if gp.entry != nil { // This is a newly created g that hasn't run. No stack to scan. + } else if readgstatus(gp)&^_Gscan == _Gsyscall { + scanSyscallStack(gp, gcw) } else { // Scanning another g's stack. We need to switch to that g // to unwind its stack. And switch back after scan. @@ -661,8 +670,6 @@ func scanstack(gp *g, gcw *gcWork) { // This is necessary as it uses stack objects (a.k.a. stack tracing). // We don't (yet) do stack objects, and regular stack/heap scan // will take care of defer records just fine. - - gp.gcscanvalid = true } // scanstackswitch scans gp's stack by switching (gogo) to gp and @@ -700,6 +707,38 @@ func scanstackswitch(gp *g, gcw *gcWork) { releasem(mp) } +// scanSyscallStack scans the stack of a goroutine blocked in a +// syscall by waking it up and asking it to scan its own stack. +func scanSyscallStack(gp *g, gcw *gcWork) { + if gp.scanningself { + // We've suspended the goroutine by setting the _Gscan bit, + // so this shouldn't be possible. + throw("scanSyscallStack: scanningself") + } + if gp.gcscandone { + // We've suspended the goroutine by setting the _Gscan bit, + // so this shouldn't be possible. + + throw("scanSyscallStack: gcscandone") + } + + gp.gcScannedSyscallStack = false + for { + mp := gp.m + noteclear(&mp.scannote) + gp.scangcw = uintptr(unsafe.Pointer(gcw)) + tgkill(getpid(), _pid_t(mp.procid), _SIGURG) + // Wait for gp to scan its own stack. + notesleep(&mp.scannote) + if gp.gcScannedSyscallStack { + return + } + + // The signal was delivered at a bad time. Try again. 
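The markrootSpans change above reads span pointers from the sweep buffer with atomic.Loadp and skips entries that are still nil. Outside the runtime the same idea is expressed with sync/atomic; the sketch below is not from this patch, and the buf/publish/item names are made up, but the LoadPointer/StorePointer pattern is the standard one for racing pointer slots.

package main

import (
    "fmt"
    "sync/atomic"
    "unsafe"
)

type item struct{ id int }

// load reads slot i of buf atomically; writers publish entries with
// atomic.StorePointer, so a reader sees either nil or a fully built item.
func load(buf []*item, i int) *item {
    return (*item)(atomic.LoadPointer((*unsafe.Pointer)(unsafe.Pointer(&buf[i]))))
}

func publish(buf []*item, i int, it *item) {
    atomic.StorePointer((*unsafe.Pointer)(unsafe.Pointer(&buf[i])), unsafe.Pointer(it))
}

func main() {
    buf := make([]*item, 4)
    publish(buf, 2, &item{id: 42})

    for i := range buf {
        it := load(buf, i)
        if it == nil {
            continue // not yet published; ignore, as markrootSpans now does
        }
        fmt.Println("slot", i, "id", it.id)
    }
}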
+ osyield() + } +} + type gcDrainFlags int const ( @@ -1087,10 +1126,10 @@ func scanstackblockwithmap(pc, b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) { if obj != 0 { o, span, objIndex := findObject(obj, b, i, false) if obj < minPhysPageSize || - span != nil && span.state != mSpanManual && - (obj < span.base() || obj >= span.limit || span.state != mSpanInUse) { + span != nil && span.state.get() != mSpanManual && + (obj < span.base() || obj >= span.limit || span.state.get() != mSpanInUse) { print("runtime: found in object at *(", hex(b), "+", hex(i), ") = ", hex(obj), ", pc=", hex(pc), "\n") - name, file, line, _ := funcfileline(pc, -1) + name, file, line, _ := funcfileline(pc, -1, false) print(name, "\n", file, ":", line, "\n") //gcDumpObject("object", b, i) throw("found bad pointer in Go stack (incorrect use of unsafe or cgo?)") @@ -1218,15 +1257,15 @@ func gcDumpObject(label string, obj, off uintptr) { return } print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.spanclass=", s.spanclass, " s.elemsize=", s.elemsize, " s.state=") - if 0 <= s.state && int(s.state) < len(mSpanStateNames) { - print(mSpanStateNames[s.state], "\n") + if state := s.state.get(); 0 <= state && int(state) < len(mSpanStateNames) { + print(mSpanStateNames[state], "\n") } else { - print("unknown(", s.state, ")\n") + print("unknown(", state, ")\n") } skipped := false size := s.elemsize - if s.state == mSpanManual && size == 0 { + if s.state.get() == mSpanManual && size == 0 { // We're printing something from a stack frame. We // don't know how big it is, so just show up to an // including off. @@ -1314,7 +1353,7 @@ var useCheckmark = false func initCheckmarks() { useCheckmark = true for _, s := range mheap_.allspans { - if s.state == mSpanInUse { + if s.state.get() == mSpanInUse { heapBitsForAddr(s.base()).initCheckmarkSpan(s.layout()) } } @@ -1323,7 +1362,7 @@ func initCheckmarks() { func clearCheckmarks() { useCheckmark = false for _, s := range mheap_.allspans { - if s.state == mSpanInUse { + if s.state.get() == mSpanInUse { heapBitsForAddr(s.base()).clearCheckmarkSpan(s.layout()) } } diff --git a/libgo/go/runtime/mgcscavenge.go b/libgo/go/runtime/mgcscavenge.go index 9f8c472..f3856db 100644 --- a/libgo/go/runtime/mgcscavenge.go +++ b/libgo/go/runtime/mgcscavenge.go @@ -17,7 +17,29 @@ // scavenger's primary goal is to bring the estimated heap RSS of the // application down to a goal. // -// That goal is defined as (retainExtraPercent+100) / 100 * next_gc. +// That goal is defined as: +// (retainExtraPercent+100) / 100 * (next_gc / last_next_gc) * last_heap_inuse +// +// Essentially, we wish to have the application's RSS track the heap goal, but +// the heap goal is defined in terms of bytes of objects, rather than pages like +// RSS. As a result, we need to take into account for fragmentation internal to +// spans. next_gc / last_next_gc defines the ratio between the current heap goal +// and the last heap goal, which tells us by how much the heap is growing and +// shrinking. We estimate what the heap will grow to in terms of pages by taking +// this ratio and multiplying it by heap_inuse at the end of the last GC, which +// allows us to account for this additional fragmentation. Note that this +// procedure makes the assumption that the degree of fragmentation won't change +// dramatically over the next GC cycle. Overestimating the amount of +// fragmentation simply results in higher memory use, which will be accounted +// for by the next pacing up date. 
Underestimating the fragmentation however +// could lead to performance degradation. Handling this case is not within the +// scope of the scavenger. Situations where the amount of fragmentation balloons +// over the course of a single GC cycle should be considered pathologies, +// flagged as bugs, and fixed appropriately. +// +// An additional factor of retainExtraPercent is added as a buffer to help ensure +// that there's more unscavenged memory to allocate out of, since each allocation +// out of scavenged memory incurs a potentially expensive page fault. // // The goal is updated after each GC and the scavenger's pacing parameters // (which live in mheap_) are updated to match. The pacing parameters work much @@ -33,25 +55,18 @@ package runtime +import ( + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + const ( // The background scavenger is paced according to these parameters. // // scavengePercent represents the portion of mutator time we're willing // to spend on scavenging in percent. - // - // scavengePageLatency is a worst-case estimate (order-of-magnitude) of - // the time it takes to scavenge one (regular-sized) page of memory. - // scavengeHugePageLatency is the same but for huge pages. - // - // scavengePagePeriod is derived from scavengePercent and scavengePageLatency, - // and represents the average time between scavenging one page that we're - // aiming for. scavengeHugePagePeriod is the same but for huge pages. - // These constants are core to the scavenge pacing algorithm. - scavengePercent = 1 // 1% - scavengePageLatency = 10e3 // 10µs - scavengeHugePageLatency = 10e3 // 10µs - scavengePagePeriod = scavengePageLatency / (scavengePercent / 100.0) - scavengeHugePagePeriod = scavengePageLatency / (scavengePercent / 100.0) + scavengePercent = 1 // 1% // retainExtraPercent represents the amount of memory over the heap goal // that the scavenger should keep as a buffer space for the allocator. @@ -61,34 +76,46 @@ const ( // incurs an additional cost), to account for heap fragmentation and // the ever-changing layout of the heap. retainExtraPercent = 10 + + // maxPagesPerPhysPage is the maximum number of supported runtime pages per + // physical page, based on maxPhysPageSize. + maxPagesPerPhysPage = maxPhysPageSize / pageSize ) // heapRetained returns an estimate of the current heap RSS. -// -// mheap_.lock must be held or the world must be stopped. func heapRetained() uint64 { - return memstats.heap_sys - memstats.heap_released + return atomic.Load64(&memstats.heap_sys) - atomic.Load64(&memstats.heap_released) } // gcPaceScavenger updates the scavenger's pacing, particularly // its rate and RSS goal. // // The RSS goal is based on the current heap goal with a small overhead -// to accomodate non-determinism in the allocator. +// to accommodate non-determinism in the allocator. // // The pacing is based on scavengePageRate, which applies to both regular and // huge pages. See that constant for more information. // // mheap_.lock must be held or the world must be stopped. func gcPaceScavenger() { - // Compute our scavenging goal and align it to a physical page boundary - // to make the following calculations more exact. - retainedGoal := memstats.next_gc + // If we're called before the first GC completed, disable scavenging. + // We never scavenge before the 2nd GC cycle anyway (we don't have enough + // information about the heap yet) so this is fine, and avoids a fault + // or garbage data later. 
+ if memstats.last_next_gc == 0 { + mheap_.scavengeGoal = ^uint64(0) + return + } + // Compute our scavenging goal. + goalRatio := float64(memstats.next_gc) / float64(memstats.last_next_gc) + retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio) // Add retainExtraPercent overhead to retainedGoal. This calculation // looks strange but the purpose is to arrive at an integer division // (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8) // that also avoids the overflow from a multiplication. retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0)) + // Align it to a physical page boundary to make the following calculations + // a bit more exact. retainedGoal = (retainedGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1) // Represents where we are now in the heap's contribution to RSS in bytes. @@ -104,86 +131,31 @@ func gcPaceScavenger() { // physical page. retainedNow := heapRetained() - // If we're already below our goal, publish the goal in case it changed - // then disable the background scavenger. - if retainedNow <= retainedGoal { - mheap_.scavengeRetainedGoal = retainedGoal - mheap_.scavengeBytesPerNS = 0 + // If we're already below our goal, or within one page of our goal, then disable + // the background scavenger. We disable the background scavenger if there's + // less than one physical page of work to do because it's not worth it. + if retainedNow <= retainedGoal || retainedNow-retainedGoal < uint64(physPageSize) { + mheap_.scavengeGoal = ^uint64(0) return } - - // Now we start to compute the total amount of work necessary and the total - // amount of time we're willing to give the scavenger to complete this work. - // This will involve calculating how much of the work consists of huge pages - // and how much consists of regular pages since the former can let us scavenge - // more memory in the same time. - totalWork := retainedNow - retainedGoal - - // On systems without huge page support, all work is regular work. - regularWork := totalWork - hugeTime := uint64(0) - - // On systems where we have huge pages, we want to do as much of the - // scavenging work as possible on huge pages, because the costs are the - // same per page, but we can give back more more memory in a shorter - // period of time. - if physHugePageSize != 0 { - // Start by computing the amount of free memory we have in huge pages - // in total. Trivially, this is all the huge page work we need to do. - hugeWork := uint64(mheap_.free.unscavHugePages) << physHugePageShift - - // ...but it could turn out that there's more huge work to do than - // total work, so cap it at total work. This might happen for very large - // heaps where the additional factor of retainExtraPercent can make it so - // that there are free chunks of memory larger than a huge page that we don't want - // to scavenge. - if hugeWork >= totalWork { - hugePages := totalWork >> physHugePageShift - hugeWork = hugePages << physHugePageShift - } - // Everything that's not huge work is regular work. At this point we - // know huge work so we can calculate how much time that will take - // based on scavengePageRate (which applies to pages of any size). - regularWork = totalWork - hugeWork - hugeTime = (hugeWork >> physHugePageShift) * scavengeHugePagePeriod - } - // Finally, we can compute how much time it'll take to do the regular work - // and the total time to do all the work. 
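The new scavenge goal above scales the heap_inuse observed at the end of the last GC by how much the heap goal grew or shrank, adds the retainExtraPercent buffer through an integer division, and rounds up to a physical page. The arithmetic sketch below is not from the patch; the byte counts are invented, and only the visible formulas are reproduced.

package main

import "fmt"

func main() {
    const retainExtraPercent = 10
    var (
        nextGC        = uint64(220 << 20) // current heap goal, hypothetical
        lastNextGC    = uint64(200 << 20) // previous heap goal, hypothetical
        lastHeapInuse = uint64(180 << 20) // heap_inuse at the end of the last GC
        physPageSize  = uint64(4096)
    )

    goalRatio := float64(nextGC) / float64(lastNextGC)
    retainedGoal := uint64(float64(lastHeapInuse) * goalRatio)

    // Add a 10% buffer, written as an integer division (the divisor is 10 here).
    retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0))

    // Round up to a physical page boundary.
    retainedGoal = (retainedGoal + physPageSize - 1) &^ (physPageSize - 1)

    fmt.Printf("retained goal ~= %d MiB\n", retainedGoal>>20)
}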
- regularTime := regularWork / uint64(physPageSize) * scavengePagePeriod - totalTime := hugeTime + regularTime - - now := nanotime() - - lock(&scavenge.lock) - - // Update all the pacing parameters in mheap with scavenge.lock held, - // so that scavenge.gen is kept in sync with the updated values. - mheap_.scavengeRetainedGoal = retainedGoal - mheap_.scavengeRetainedBasis = retainedNow - mheap_.scavengeTimeBasis = now - mheap_.scavengeBytesPerNS = float64(totalWork) / float64(totalTime) - scavenge.gen++ // increase scavenge generation - - // Wake up background scavenger if needed, since the pacing was just updated. - wakeScavengerLocked() - - unlock(&scavenge.lock) + mheap_.scavengeGoal = retainedGoal + mheap_.pages.resetScavengeAddr() } -// State of the background scavenger. +// Sleep/wait state of the background scavenger. var scavenge struct { lock mutex g *g parked bool timer *timer - gen uint32 // read with either lock or mheap_.lock, write with both } -// wakeScavengerLocked unparks the scavenger if necessary. It must be called +// wakeScavenger unparks the scavenger if necessary. It must be called // after any pacing update. // -// scavenge.lock must be held. -func wakeScavengerLocked() { +// mheap_.lock and scavenge.lock must not be held. +func wakeScavenger() { + lock(&scavenge.lock) if scavenge.parked { // Try to stop the timer but we don't really care if we succeed. // It's possible that either a timer was never started, or that @@ -194,45 +166,44 @@ func wakeScavengerLocked() { stopTimer(scavenge.timer) // Unpark the goroutine and tell it that there may have been a pacing - // change. + // change. Note that we skip the scheduler's runnext slot because we + // want to avoid having the scavenger interfere with the fair + // scheduling of user goroutines. In effect, this schedules the + // scavenger at a "lower priority" but that's OK because it'll + // catch up on the work it missed when it does get scheduled. scavenge.parked = false - ready(scavenge.g, 0, true) + systemstack(func() { + ready(scavenge.g, 0, false) + }) } + unlock(&scavenge.lock) } // scavengeSleep attempts to put the scavenger to sleep for ns. -// It also checks to see if gen != scavenge.gen before going to sleep, -// and aborts if true (meaning an update had occurred). // // Note that this function should only be called by the scavenger. // // The scavenger may be woken up earlier by a pacing change, and it may not go // to sleep at all if there's a pending pacing change. // -// Returns false if awoken early (i.e. true means a complete sleep). -func scavengeSleep(gen uint32, ns int64) bool { +// Returns the amount of time actually slept. +func scavengeSleep(ns int64) int64 { lock(&scavenge.lock) - // If there was an update, just abort the sleep. - if scavenge.gen != gen { - unlock(&scavenge.lock) - return false - } - // Set the timer. - now := nanotime() - scavenge.timer.when = now + ns - startTimer(scavenge.timer) - - // Park the goroutine. It's fine that we don't publish the - // fact that the timer was set; even if the timer wakes up - // and fire scavengeReady before we park, it'll block on - // scavenge.lock. + // + // This must happen here instead of inside gopark + // because we can't close over any variables without + // failing escape analysis. + start := nanotime() + resetTimer(scavenge.timer, start+ns) + + // Mark ourself as asleep and go to sleep. scavenge.parked = true goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2) - // Return true if we completed the full sleep. 
- return (nanotime() - now) >= ns + // Return how long we actually slept for. + return nanotime() - start } // Background scavenger. @@ -250,118 +221,543 @@ func bgscavenge(c chan int) { scavenge.timer = new(timer) scavenge.timer.f = func(_ interface{}, _ uintptr) { - lock(&scavenge.lock) - wakeScavengerLocked() - unlock(&scavenge.lock) + wakeScavenger() } c <- 1 goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1) - // Parameters for sleeping. - // - // If we end up doing more work than we need, we should avoid spinning - // until we have more work to do: instead, we know exactly how much time - // until more work will need to be done, so we sleep. - // - // We should avoid sleeping for less than minSleepNS because Gosched() - // overheads among other things will work out better in that case. + // Exponentially-weighted moving average of the fraction of time this + // goroutine spends scavenging (that is, percent of a single CPU). + // It represents a measure of scheduling overheads which might extend + // the sleep or the critical time beyond what's expected. Assume no + // overhead to begin with. // - // There's no reason to set a maximum on sleep time because we'll always - // get woken up earlier if there's any kind of update that could change - // the scavenger's pacing. - // - // retryDelayNS tracks how much to sleep next time we fail to do any - // useful work. - const minSleepNS = int64(100 * 1000) // 100 µs - - retryDelayNS := minSleepNS + // TODO(mknyszek): Consider making this based on total CPU time of the + // application (i.e. scavengePercent * GOMAXPROCS). This isn't really + // feasible now because the scavenger acquires the heap lock over the + // scavenging operation, which means scavenging effectively blocks + // allocators and isn't scalable. However, given a scalable allocator, + // it makes sense to also make the scavenger scale with it; if you're + // allocating more frequently, then presumably you're also generating + // more work for the scavenger. + const idealFraction = scavengePercent / 100.0 + scavengeEWMA := float64(idealFraction) for { released := uintptr(0) - park := false - ttnext := int64(0) - gen := uint32(0) + + // Time in scavenging critical section. + crit := int64(0) // Run on the system stack since we grab the heap lock, // and a stack growth with the heap lock means a deadlock. systemstack(func() { lock(&mheap_.lock) - gen = scavenge.gen - // If background scavenging is disabled or if there's no work to do just park. - retained := heapRetained() - if mheap_.scavengeBytesPerNS == 0 || retained <= mheap_.scavengeRetainedGoal { + retained, goal := heapRetained(), mheap_.scavengeGoal + if retained <= goal { unlock(&mheap_.lock) - park = true return } - - // Calculate how big we want the retained heap to be - // at this point in time. - // - // The formula is for that of a line, y = b - mx - // We want y (want), - // m = scavengeBytesPerNS (> 0) - // x = time between scavengeTimeBasis and now - // b = scavengeRetainedBasis - rate := mheap_.scavengeBytesPerNS - tdist := nanotime() - mheap_.scavengeTimeBasis - rdist := uint64(rate * float64(tdist)) - want := mheap_.scavengeRetainedBasis - rdist - - // If we're above the line, scavenge to get below the - // line. - if retained > want { - released = mheap_.scavengeLocked(uintptr(retained - want)) - } unlock(&mheap_.lock) - // If we over-scavenged a bit, calculate how much time it'll - // take at the current rate for us to make that up. 
We definitely - // won't have any work to do until at least that amount of time - // passes. - if released > uintptr(retained-want) { - extra := released - uintptr(retained-want) - ttnext = int64(float64(extra) / rate) - } + // Scavenge one page, and measure the amount of time spent scavenging. + start := nanotime() + released = mheap_.pages.scavengeOne(physPageSize, false) + crit = nanotime() - start }) - if park { + if debug.gctrace > 0 { + if released > 0 { + print("scvg: ", released>>10, " KB released\n") + } + print("scvg: inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n") + } + + if released == 0 { lock(&scavenge.lock) scavenge.parked = true goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1) continue } - if debug.gctrace > 0 { - if released > 0 { - print("scvg: ", released>>20, " MB released\n") - } - print("scvg: inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n") + // If we spent more than 10 ms (for example, if the OS scheduled us away, or someone + // put their machine to sleep) in the critical section, bound the time we use to + // calculate at 10 ms to avoid letting the sleep time get arbitrarily high. + const maxCrit = 10e6 + if crit > maxCrit { + crit = maxCrit } - if released == 0 { - // If we were unable to release anything this may be because there's - // no free memory available to scavenge. Go to sleep and try again. - if scavengeSleep(gen, retryDelayNS) { - // If we successfully slept through the delay, back off exponentially. - retryDelayNS *= 2 + // Compute the amount of time to sleep, assuming we want to use at most + // scavengePercent of CPU time. Take into account scheduling overheads + // that may extend the length of our sleep by multiplying by how far + // off we are from the ideal ratio. For example, if we're sleeping too + // much, then scavengeEMWA < idealFraction, so we'll adjust the sleep time + // down. + adjust := scavengeEWMA / idealFraction + sleepTime := int64(adjust * float64(crit) / (scavengePercent / 100.0)) + + // Go to sleep. + slept := scavengeSleep(sleepTime) + + // Compute the new ratio. + fraction := float64(crit) / float64(crit+slept) + + // Set a lower bound on the fraction. + // Due to OS-related anomalies we may "sleep" for an inordinate amount + // of time. Let's avoid letting the ratio get out of hand by bounding + // the sleep time we use in our EWMA. + const minFraction = 1 / 1000 + if fraction < minFraction { + fraction = minFraction + } + + // Update scavengeEWMA by merging in the new crit/slept ratio. + const alpha = 0.5 + scavengeEWMA = alpha*fraction + (1-alpha)*scavengeEWMA + } +} + +// scavenge scavenges nbytes worth of free pages, starting with the +// highest address first. Successive calls continue from where it left +// off until the heap is exhausted. Call resetScavengeAddr to bring it +// back to the top of the heap. +// +// Returns the amount of memory scavenged in bytes. +// +// If locked == false, s.mheapLock must not be locked. If locked == true, +// s.mheapLock must be locked. +// +// Must run on the system stack because scavengeOne must run on the +// system stack. 
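The rewritten bgscavenge loop above aims to spend scavengePercent (1%) of one CPU on scavenging: it sleeps for the measured critical-section time scaled by the inverse of that fraction, then folds the observed crit/(crit+slept) ratio into an exponentially-weighted moving average to correct for scheduling overhead. The sketch below is not from the patch and its nanosecond figures are made up; it only walks through that arithmetic once.

package main

import "fmt"

func main() {
    const (
        scavengePercent = 1
        idealFraction   = scavengePercent / 100.0 // 0.01
        alpha           = 0.5                     // EWMA smoothing factor
    )

    scavengeEWMA := float64(idealFraction) // assume no overhead to start
    crit := int64(50_000)                  // 50µs spent scavenging one page (hypothetical)

    // Sleep long enough that crit is about 1% of (crit + sleep), adjusted by
    // how far the EWMA has drifted from the ideal fraction.
    adjust := scavengeEWMA / idealFraction
    sleepTime := int64(adjust * float64(crit) / (scavengePercent / 100.0))
    fmt.Println("sleepTime(ns):", sleepTime) // about 5ms of sleep for 50µs of work

    // Suppose the scheduler let us oversleep a little.
    slept := sleepTime + 500_000
    fraction := float64(crit) / float64(crit+slept)

    // Fold the observation into the EWMA.
    scavengeEWMA = alpha*fraction + (1-alpha)*scavengeEWMA
    fmt.Printf("observed fraction=%.4f new EWMA=%.4f\n", fraction, scavengeEWMA)
}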
+// +//go:systemstack +func (s *pageAlloc) scavenge(nbytes uintptr, locked bool) uintptr { + released := uintptr(0) + for released < nbytes { + r := s.scavengeOne(nbytes-released, locked) + if r == 0 { + // Nothing left to scavenge! Give up. + break + } + released += r + } + return released +} + +// resetScavengeAddr sets the scavenge start address to the top of the heap's +// address space. This should be called each time the scavenger's pacing +// changes. +// +// s.mheapLock must be held. +func (s *pageAlloc) resetScavengeAddr() { + s.scavAddr = chunkBase(s.end) - 1 +} + +// scavengeOne starts from s.scavAddr and walks down the heap until it finds +// a contiguous run of pages to scavenge. It will try to scavenge at most +// max bytes at once, but may scavenge more to avoid breaking huge pages. Once +// it scavenges some memory it returns how much it scavenged and updates s.scavAddr +// appropriately. s.scavAddr must be reset manually and externally. +// +// Should it exhaust the heap, it will return 0 and set s.scavAddr to minScavAddr. +// +// If locked == false, s.mheapLock must not be locked. +// If locked == true, s.mheapLock must be locked. +// +// Must be run on the system stack because it either acquires the heap lock +// or executes with the heap lock acquired. +// +//go:systemstack +func (s *pageAlloc) scavengeOne(max uintptr, locked bool) uintptr { + // Calculate the maximum number of pages to scavenge. + // + // This should be alignUp(max, pageSize) / pageSize but max can and will + // be ^uintptr(0), so we need to be very careful not to overflow here. + // Rather than use alignUp, calculate the number of pages rounded down + // first, then add back one if necessary. + maxPages := max / pageSize + if max%pageSize != 0 { + maxPages++ + } + + // Calculate the minimum number of pages we can scavenge. + // + // Because we can only scavenge whole physical pages, we must + // ensure that we scavenge at least minPages each time, aligned + // to minPages*pageSize. + minPages := physPageSize / pageSize + if minPages < 1 { + minPages = 1 + } + + // Helpers for locking and unlocking only if locked == false. + lockHeap := func() { + if !locked { + lock(s.mheapLock) + } + } + unlockHeap := func() { + if !locked { + unlock(s.mheapLock) + } + } + + lockHeap() + ci := chunkIndex(s.scavAddr) + if ci < s.start { + unlockHeap() + return 0 + } + + // Check the chunk containing the scav addr, starting at the addr + // and see if there are any free and unscavenged pages. + if s.summary[len(s.summary)-1][ci].max() >= uint(minPages) { + // We only bother looking for a candidate if there at least + // minPages free pages at all. It's important that we only + // continue if the summary says we can because that's how + // we can tell if parts of the address space are unused. + // See the comment on s.chunks in mpagealloc.go. + base, npages := s.chunkOf(ci).findScavengeCandidate(chunkPageIndex(s.scavAddr), minPages, maxPages) + + // If we found something, scavenge it and return! + if npages != 0 { + s.scavengeRangeLocked(ci, base, npages) + unlockHeap() + return uintptr(npages) * pageSize + } + } + + // getInUseRange returns the highest range in the + // intersection of [0, addr] and s.inUse. + // + // s.mheapLock must be held. + getInUseRange := func(addr uintptr) addrRange { + top := s.inUse.findSucc(addr) + if top == 0 { + return addrRange{} + } + r := s.inUse.ranges[top-1] + // addr is inclusive, so treat it as such when + // updating the limit, which is exclusive. 
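At the top of scavengeOne above, max can be ^uintptr(0), so rounding it up to whole pages with the usual alignUp formula could overflow; dividing first and adding one for a remainder avoids that, and minPages floors the pages-per-physical-page ratio at 1. The helper below is not from the patch (its name and the page sizes are examples), but it reproduces that arithmetic.

package main

import "fmt"

func pagesToScavenge(max, pageSize, physPageSize uintptr) (maxPages, minPages uintptr) {
    // Round max up to whole pages without overflowing when max == ^uintptr(0).
    maxPages = max / pageSize
    if max%pageSize != 0 {
        maxPages++
    }
    // Whole physical pages only: scavenge at least physPageSize/pageSize pages.
    minPages = physPageSize / pageSize
    if minPages < 1 {
        minPages = 1
    }
    return
}

func main() {
    const pageSize, physPageSize = 8192, 4096 // example sizes
    maxPages, minPages := pagesToScavenge(^uintptr(0), pageSize, physPageSize)
    fmt.Println(maxPages, minPages) // 2251799813685248 1 on 64-bit
}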
+ if r.limit > addr+1 { + r.limit = addr + 1 + } + return r + } + + // Slow path: iterate optimistically over the in-use address space + // looking for any free and unscavenged page. If we think we see something, + // lock and verify it! + // + // We iterate over the address space by taking ranges from inUse. +newRange: + for { + r := getInUseRange(s.scavAddr) + if r.size() == 0 { + break + } + unlockHeap() + + // Iterate over all of the chunks described by r. + // Note that r.limit is the exclusive upper bound, but what + // we want is the top chunk instead, inclusive, so subtract 1. + bot, top := chunkIndex(r.base), chunkIndex(r.limit-1) + for i := top; i >= bot; i-- { + // If this chunk is totally in-use or has no unscavenged pages, don't bother + // doing a more sophisticated check. + // + // Note we're accessing the summary and the chunks without a lock, but + // that's fine. We're being optimistic anyway. + + // Check quickly if there are enough free pages at all. + if s.summary[len(s.summary)-1][i].max() < uint(minPages) { + continue } - continue + + // Run over the chunk looking harder for a candidate. Again, we could + // race with a lot of different pieces of code, but we're just being + // optimistic. Make sure we load the l2 pointer atomically though, to + // avoid races with heap growth. It may or may not be possible to also + // see a nil pointer in this case if we do race with heap growth, but + // just defensively ignore the nils. This operation is optimistic anyway. + l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&s.chunks[i.l1()]))) + if l2 == nil || !l2[i.l2()].hasScavengeCandidate(minPages) { + continue + } + + // We found a candidate, so let's lock and verify it. + lockHeap() + + // Find, verify, and scavenge if we can. + chunk := s.chunkOf(i) + base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages) + if npages > 0 { + // We found memory to scavenge! Mark the bits and report that up. + // scavengeRangeLocked will update scavAddr for us, also. + s.scavengeRangeLocked(i, base, npages) + unlockHeap() + return uintptr(npages) * pageSize + } + + // We were fooled, let's take this opportunity to move the scavAddr + // all the way down to where we searched as scavenged for future calls + // and keep iterating. Then, go get a new range. + s.scavAddr = chunkBase(i-1) + pallocChunkPages*pageSize - 1 + continue newRange } - retryDelayNS = minSleepNS + lockHeap() + + // Move the scavenger down the heap, past everything we just searched. + // Since we don't check if scavAddr moved while twe let go of the heap lock, + // it's possible that it moved down and we're moving it up here. This + // raciness could result in us searching parts of the heap unnecessarily. + // TODO(mknyszek): Remove this racy behavior through explicit address + // space reservations, which are difficult to do with just scavAddr. + s.scavAddr = r.base - 1 + } + // We reached the end of the in-use address space and couldn't find anything, + // so signal that there's nothing left to scavenge. + s.scavAddr = minScavAddr + unlockHeap() - if ttnext > 0 && ttnext > minSleepNS { - // If there's an appreciable amount of time until the next scavenging - // goal, just sleep. We'll get woken up if anything changes and this - // way we avoid spinning. - scavengeSleep(gen, ttnext) - continue + return 0 +} + +// scavengeRangeLocked scavenges the given region of memory. +// +// s.mheapLock must be held. 
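The slow path above reads the summaries optimistically and only takes the heap lock to verify a likely candidate. Below is a generic standalone sketch of that check-then-lock pattern; the allocator type and its counters are invented for illustration and are not the runtime's radix summaries.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type allocator struct {
	mu      sync.Mutex
	summary []int64 // free-page counts per chunk (written under mu, peeked racily)
}

// findCandidate scans from high indices down, peeking without the lock and
// only locking to confirm a promising chunk, like scavengeOne's slow path.
func (a *allocator) findCandidate(min int64) int {
	for i := len(a.summary) - 1; i >= 0; i-- {
		// Optimistic, lock-free peek; may be stale.
		if atomic.LoadInt64(&a.summary[i]) < min {
			continue
		}
		// Looks promising: lock and verify before committing to it.
		a.mu.Lock()
		ok := a.summary[i] >= min
		a.mu.Unlock()
		if ok {
			return i
		}
		// We were fooled by a stale read; keep scanning downwards.
	}
	return -1
}

func main() {
	a := &allocator{summary: []int64{4, 0, 7, 2}}
	fmt.Println(a.findCandidate(3)) // 2
}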
+func (s *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) { + s.chunkOf(ci).scavenged.setRange(base, npages) + + // Compute the full address for the start of the range. + addr := chunkBase(ci) + uintptr(base)*pageSize + + // Update the scav pointer. + s.scavAddr = addr - 1 + + // Only perform the actual scavenging if we're not in a test. + // It's dangerous to do so otherwise. + if s.test { + return + } + sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) + + // Update global accounting only when not in test, otherwise + // the runtime's accounting will be wrong. + mSysStatInc(&memstats.heap_released, uintptr(npages)*pageSize) +} + +// fillAligned returns x but with all zeroes in m-aligned +// groups of m bits set to 1 if any bit in the group is non-zero. +// +// For example, fillAligned(0x0100a3, 8) == 0xff00ff. +// +// Note that if m == 1, this is a no-op. +// +// m must be a power of 2 <= maxPagesPerPhysPage. +func fillAligned(x uint64, m uint) uint64 { + apply := func(x uint64, c uint64) uint64 { + // The technique used it here is derived from + // https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord + // and extended for more than just bytes (like nibbles + // and uint16s) by using an appropriate constant. + // + // To summarize the technique, quoting from that page: + // "[It] works by first zeroing the high bits of the [8] + // bytes in the word. Subsequently, it adds a number that + // will result in an overflow to the high bit of a byte if + // any of the low bits were initially set. Next the high + // bits of the original word are ORed with these values; + // thus, the high bit of a byte is set iff any bit in the + // byte was set. Finally, we determine if any of these high + // bits are zero by ORing with ones everywhere except the + // high bits and inverting the result." + return ^((((x & c) + c) | x) | c) + } + // Transform x to contain a 1 bit at the top of each m-aligned + // group of m zero bits. + switch m { + case 1: + return x + case 2: + x = apply(x, 0x5555555555555555) + case 4: + x = apply(x, 0x7777777777777777) + case 8: + x = apply(x, 0x7f7f7f7f7f7f7f7f) + case 16: + x = apply(x, 0x7fff7fff7fff7fff) + case 32: + x = apply(x, 0x7fffffff7fffffff) + case 64: // == maxPagesPerPhysPage + x = apply(x, 0x7fffffffffffffff) + default: + throw("bad m value") + } + // Now, the top bit of each m-aligned group in x is set + // that group was all zero in the original x. + + // From each group of m bits subtract 1. + // Because we know only the top bits of each + // m-aligned group are set, we know this will + // set each group to have all the bits set except + // the top bit, so just OR with the original + // result to set all the bits. + return ^((x - (x >> (m - 1))) | x) +} + +// hasScavengeCandidate returns true if there's any min-page-aligned groups of +// min pages of free-and-unscavenged memory in the region represented by this +// pallocData. +// +// min must be a non-zero power of 2 <= maxPagesPerPhysPage. +func (m *pallocData) hasScavengeCandidate(min uintptr) bool { + if min&(min-1) != 0 || min == 0 { + print("runtime: min = ", min, "\n") + throw("min must be a non-zero power of 2") + } else if min > maxPagesPerPhysPage { + print("runtime: min = ", min, "\n") + throw("min too large") + } + + // The goal of this search is to see if the chunk contains any free and unscavenged memory. 
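fillAligned's doc example can be checked against a slow reference, in the same spirit as the fillAlignedSlow helper in the new mgcscavenge_test.go further down. A standalone sketch:

package main

import "fmt"

// Slow reference version of fillAligned: any m-aligned group of m bits that
// contains a 1 becomes all 1s.
func fillAlignedSlow(x uint64, m uint) uint64 {
	if m == 1 {
		return x
	}
	out := uint64(0)
	for i := uint(0); i < 64; i += m {
		group := (x >> i) & ((uint64(1) << m) - 1)
		if group != 0 {
			out |= ((uint64(1) << m) - 1) << i
		}
	}
	return out
}

func main() {
	// Matches the example in fillAligned's doc comment:
	// fillAligned(0x0100a3, 8) == 0xff00ff.
	fmt.Printf("%#x\n", fillAlignedSlow(0x0100a3, 8)) // 0xff00ff
}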
+ for i := len(m.scavenged) - 1; i >= 0; i-- { + // 1s are scavenged OR non-free => 0s are unscavenged AND free + // + // TODO(mknyszek): Consider splitting up fillAligned into two + // functions, since here we technically could get by with just + // the first half of its computation. It'll save a few instructions + // but adds some additional code complexity. + x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min)) + + // Quickly skip over chunks of non-free or scavenged pages. + if x != ^uint64(0) { + return true + } + } + return false +} + +// findScavengeCandidate returns a start index and a size for this pallocData +// segment which represents a contiguous region of free and unscavenged memory. +// +// searchIdx indicates the page index within this chunk to start the search, but +// note that findScavengeCandidate searches backwards through the pallocData. As a +// a result, it will return the highest scavenge candidate in address order. +// +// min indicates a hard minimum size and alignment for runs of pages. That is, +// findScavengeCandidate will not return a region smaller than min pages in size, +// or that is min pages or greater in size but not aligned to min. min must be +// a non-zero power of 2 <= maxPagesPerPhysPage. +// +// max is a hint for how big of a region is desired. If max >= pallocChunkPages, then +// findScavengeCandidate effectively returns entire free and unscavenged regions. +// If max < pallocChunkPages, it may truncate the returned region such that size is +// max. However, findScavengeCandidate may still return a larger region if, for +// example, it chooses to preserve huge pages, or if max is not aligned to min (it +// will round up). That is, even if max is small, the returned size is not guaranteed +// to be equal to max. max is allowed to be less than min, in which case it is as if +// max == min. +func (m *pallocData) findScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) { + if min&(min-1) != 0 || min == 0 { + print("runtime: min = ", min, "\n") + throw("min must be a non-zero power of 2") + } else if min > maxPagesPerPhysPage { + print("runtime: min = ", min, "\n") + throw("min too large") + } + // max may not be min-aligned, so we might accidentally truncate to + // a max value which causes us to return a non-min-aligned value. + // To prevent this, align max up to a multiple of min (which is always + // a power of 2). This also prevents max from ever being less than + // min, unless it's zero, so handle that explicitly. + if max == 0 { + max = min + } else { + max = alignUp(max, min) + } + + i := int(searchIdx / 64) + // Start by quickly skipping over blocks of non-free or scavenged pages. + for ; i >= 0; i-- { + // 1s are scavenged OR non-free => 0s are unscavenged AND free + x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min)) + if x != ^uint64(0) { + break + } + } + if i < 0 { + // Failed to find any free/unscavenged pages. + return 0, 0 + } + // We have something in the 64-bit chunk at i, but it could + // extend further. Loop until we find the extent of it. + + // 1s are scavenged OR non-free => 0s are unscavenged AND free + x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min)) + z1 := uint(sys.LeadingZeros64(^x)) + run, end := uint(0), uint(i)*64+(64-z1) + if x<<z1 != 0 { + // After shifting out z1 bits, we still have 1s, + // so the run ends inside this word. + run = uint(sys.LeadingZeros64(x << z1)) + } else { + // After shifting out z1 bits, we have no more 1s. 
+ // This means the run extends to the bottom of the + // word so it may extend into further words. + run = 64 - z1 + for j := i - 1; j >= 0; j-- { + x := fillAligned(m.scavenged[j]|m.pallocBits[j], uint(min)) + run += uint(sys.LeadingZeros64(x)) + if x != 0 { + // The run stopped in this word. + break + } } + } - // Give something else a chance to run, no locks are held. - Gosched() + // Split the run we found if it's larger than max but hold on to + // our original length, since we may need it later. + size := run + if size > uint(max) { + size = uint(max) + } + start := end - size + + // Each huge page is guaranteed to fit in a single palloc chunk. + // + // TODO(mknyszek): Support larger huge page sizes. + // TODO(mknyszek): Consider taking pages-per-huge-page as a parameter + // so we can write tests for this. + if physHugePageSize > pageSize && physHugePageSize > physPageSize { + // We have huge pages, so let's ensure we don't break one by scavenging + // over a huge page boundary. If the range [start, start+size) overlaps with + // a free-and-unscavenged huge page, we want to grow the region we scavenge + // to include that huge page. + + // Compute the huge page boundary above our candidate. + pagesPerHugePage := uintptr(physHugePageSize / pageSize) + hugePageAbove := uint(alignUp(uintptr(start), pagesPerHugePage)) + + // If that boundary is within our current candidate, then we may be breaking + // a huge page. + if hugePageAbove <= end { + // Compute the huge page boundary below our candidate. + hugePageBelow := uint(alignDown(uintptr(start), pagesPerHugePage)) + + if hugePageBelow >= end-run { + // We're in danger of breaking apart a huge page since start+size crosses + // a huge page boundary and rounding down start to the nearest huge + // page boundary is included in the full run we found. Include the entire + // huge page in the bound by rounding down to the huge page size. + size = size + (start - hugePageBelow) + start = hugePageBelow + } + } } + return start, size } diff --git a/libgo/go/runtime/mgcscavenge_test.go b/libgo/go/runtime/mgcscavenge_test.go new file mode 100644 index 0000000..518d5ab --- /dev/null +++ b/libgo/go/runtime/mgcscavenge_test.go @@ -0,0 +1,419 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "fmt" + "math/rand" + . "runtime" + "testing" +) + +// makePallocData produces an initialized PallocData by setting +// the ranges of described in alloc and scavenge. +func makePallocData(alloc, scavenged []BitRange) *PallocData { + b := new(PallocData) + for _, v := range alloc { + if v.N == 0 { + // Skip N==0. It's harmless and allocRange doesn't + // handle this case. + continue + } + b.AllocRange(v.I, v.N) + } + for _, v := range scavenged { + if v.N == 0 { + // See the previous loop. 
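A worked example of the huge-page preservation step above, using invented numbers (8 KiB runtime pages and 2 MiB huge pages, so 256 pages per huge page):

package main

import "fmt"

func alignUp(n, a uint) uint   { return (n + a - 1) &^ (a - 1) }
func alignDown(n, a uint) uint { return n &^ (a - 1) }

func main() {
	// Suppose the free-and-unscavenged run covers pages [20, 520) and the
	// caller only asked for max = 10 pages.
	const pagesPerHugePage = 256
	end, run := uint(520), uint(500)
	size := uint(10)
	start := end - size // candidate is [510, 520)

	hugePageAbove := alignUp(start, pagesPerHugePage) // 512
	if hugePageAbove <= end {
		// The candidate straddles the huge page boundary at 512, so scavenging
		// just [510, 520) would break up the huge page at [256, 512).
		hugePageBelow := alignDown(start, pagesPerHugePage) // 256
		if hugePageBelow >= end-run {
			// That whole huge page lies inside the free run, so grow the
			// candidate down to its base and scavenge it in one piece.
			size += start - hugePageBelow
			start = hugePageBelow
		}
	}
	fmt.Println(start, size) // 256 264: scavenge [256, 520)
}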
+ continue + } + b.ScavengedSetRange(v.I, v.N) + } + return b +} + +func TestFillAligned(t *testing.T) { + fillAlignedSlow := func(x uint64, m uint) uint64 { + if m == 1 { + return x + } + out := uint64(0) + for i := uint(0); i < 64; i += m { + for j := uint(0); j < m; j++ { + if x&(uint64(1)<<(i+j)) != 0 { + out |= ((uint64(1) << m) - 1) << i + break + } + } + } + return out + } + check := func(x uint64, m uint) { + want := fillAlignedSlow(x, m) + if got := FillAligned(x, m); got != want { + t.Logf("got: %064b", got) + t.Logf("want: %064b", want) + t.Errorf("bad fillAligned(%016x, %d)", x, m) + } + } + for m := uint(1); m <= 64; m *= 2 { + tests := []uint64{ + 0x0000000000000000, + 0x00000000ffffffff, + 0xffffffff00000000, + 0x8000000000000001, + 0xf00000000000000f, + 0xf00000010050000f, + 0xffffffffffffffff, + 0x0000000000000001, + 0x0000000000000002, + 0x0000000000000008, + uint64(1) << (m - 1), + uint64(1) << m, + // Try a few fixed arbitrary examples. + 0xb02b9effcf137016, + 0x3975a076a9fbff18, + 0x0f8c88ec3b81506e, + 0x60f14d80ef2fa0e6, + } + for _, test := range tests { + check(test, m) + } + for i := 0; i < 1000; i++ { + // Try a pseudo-random numbers. + check(rand.Uint64(), m) + + if m > 1 { + // For m != 1, let's construct a slightly more interesting + // random test. Generate a bitmap which is either 0 or + // randomly set bits for each m-aligned group of m bits. + val := uint64(0) + for n := uint(0); n < 64; n += m { + // For each group of m bits, flip a coin: + // * Leave them as zero. + // * Set them randomly. + if rand.Uint64()%2 == 0 { + val |= (rand.Uint64() & ((1 << m) - 1)) << n + } + } + check(val, m) + } + } + } +} + +func TestPallocDataFindScavengeCandidate(t *testing.T) { + type test struct { + alloc, scavenged []BitRange + min, max uintptr + want BitRange + } + tests := map[string]test{ + "MixedMin1": { + alloc: []BitRange{{0, 40}, {42, PallocChunkPages - 42}}, + scavenged: []BitRange{{0, 41}, {42, PallocChunkPages - 42}}, + min: 1, + max: PallocChunkPages, + want: BitRange{41, 1}, + }, + "MultiMin1": { + alloc: []BitRange{{0, 63}, {65, 20}, {87, PallocChunkPages - 87}}, + scavenged: []BitRange{{86, 1}}, + min: 1, + max: PallocChunkPages, + want: BitRange{85, 1}, + }, + } + // Try out different page minimums. 
+ for m := uintptr(1); m <= 64; m *= 2 { + suffix := fmt.Sprintf("Min%d", m) + tests["AllFree"+suffix] = test{ + min: m, + max: PallocChunkPages, + want: BitRange{0, PallocChunkPages}, + } + tests["AllScavenged"+suffix] = test{ + scavenged: []BitRange{{0, PallocChunkPages}}, + min: m, + max: PallocChunkPages, + want: BitRange{0, 0}, + } + tests["NoneFree"+suffix] = test{ + alloc: []BitRange{{0, PallocChunkPages}}, + scavenged: []BitRange{{PallocChunkPages / 2, PallocChunkPages / 2}}, + min: m, + max: PallocChunkPages, + want: BitRange{0, 0}, + } + tests["StartFree"+suffix] = test{ + alloc: []BitRange{{uint(m), PallocChunkPages - uint(m)}}, + min: m, + max: PallocChunkPages, + want: BitRange{0, uint(m)}, + } + tests["StartFree"+suffix] = test{ + alloc: []BitRange{{uint(m), PallocChunkPages - uint(m)}}, + min: m, + max: PallocChunkPages, + want: BitRange{0, uint(m)}, + } + tests["EndFree"+suffix] = test{ + alloc: []BitRange{{0, PallocChunkPages - uint(m)}}, + min: m, + max: PallocChunkPages, + want: BitRange{PallocChunkPages - uint(m), uint(m)}, + } + tests["Straddle64"+suffix] = test{ + alloc: []BitRange{{0, 64 - uint(m)}, {64 + uint(m), PallocChunkPages - (64 + uint(m))}}, + min: m, + max: 2 * m, + want: BitRange{64 - uint(m), 2 * uint(m)}, + } + tests["BottomEdge64WithFull"+suffix] = test{ + alloc: []BitRange{{64, 64}, {128 + 3*uint(m), PallocChunkPages - (128 + 3*uint(m))}}, + scavenged: []BitRange{{1, 10}}, + min: m, + max: 3 * m, + want: BitRange{128, 3 * uint(m)}, + } + tests["BottomEdge64WithPocket"+suffix] = test{ + alloc: []BitRange{{64, 62}, {127, 1}, {128 + 3*uint(m), PallocChunkPages - (128 + 3*uint(m))}}, + scavenged: []BitRange{{1, 10}}, + min: m, + max: 3 * m, + want: BitRange{128, 3 * uint(m)}, + } + tests["Max0"+suffix] = test{ + scavenged: []BitRange{{0, PallocChunkPages - uint(m)}}, + min: m, + max: 0, + want: BitRange{PallocChunkPages - uint(m), uint(m)}, + } + if m <= 8 { + tests["OneFree"] = test{ + alloc: []BitRange{{0, 40}, {40 + uint(m), PallocChunkPages - (40 + uint(m))}}, + min: m, + max: PallocChunkPages, + want: BitRange{40, uint(m)}, + } + tests["OneScavenged"] = test{ + alloc: []BitRange{{0, 40}, {40 + uint(m), PallocChunkPages - (40 + uint(m))}}, + scavenged: []BitRange{{40, 1}}, + min: m, + max: PallocChunkPages, + want: BitRange{0, 0}, + } + } + if m > 1 { + tests["MaxUnaligned"+suffix] = test{ + scavenged: []BitRange{{0, PallocChunkPages - uint(m*2-1)}}, + min: m, + max: m - 2, + want: BitRange{PallocChunkPages - uint(m), uint(m)}, + } + tests["SkipSmall"+suffix] = test{ + alloc: []BitRange{{0, 64 - uint(m)}, {64, 5}, {70, 11}, {82, PallocChunkPages - 82}}, + min: m, + max: m, + want: BitRange{64 - uint(m), uint(m)}, + } + tests["SkipMisaligned"+suffix] = test{ + alloc: []BitRange{{0, 64 - uint(m)}, {64, 63}, {127 + uint(m), PallocChunkPages - (127 + uint(m))}}, + min: m, + max: m, + want: BitRange{64 - uint(m), uint(m)}, + } + tests["MaxLessThan"+suffix] = test{ + scavenged: []BitRange{{0, PallocChunkPages - uint(m)}}, + min: m, + max: 1, + want: BitRange{PallocChunkPages - uint(m), uint(m)}, + } + } + } + if PhysHugePageSize > uintptr(PageSize) { + // Check hugepage preserving behavior. + bits := uint(PhysHugePageSize / uintptr(PageSize)) + tests["PreserveHugePageBottom"] = test{ + alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}}, + min: 1, + max: 3, // Make it so that max would have us try to break the huge page. 
+ want: BitRange{0, bits + 2}, + } + if 3*bits < PallocChunkPages { + // We need at least 3 huge pages in a chunk for this test to make sense. + tests["PreserveHugePageMiddle"] = test{ + alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}}, + min: 1, + max: 12, // Make it so that max would have us try to break the huge page. + want: BitRange{bits, bits + 10}, + } + } + tests["PreserveHugePageTop"] = test{ + alloc: []BitRange{{0, PallocChunkPages - bits}}, + min: 1, + max: 1, // Even one page would break a huge page in this case. + want: BitRange{PallocChunkPages - bits, bits}, + } + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + b := makePallocData(v.alloc, v.scavenged) + start, size := b.FindScavengeCandidate(PallocChunkPages-1, v.min, v.max) + got := BitRange{start, size} + if !(got.N == 0 && v.want.N == 0) && got != v.want { + t.Fatalf("candidate mismatch: got %v, want %v", got, v.want) + } + }) + } +} + +// Tests end-to-end scavenging on a pageAlloc. +func TestPageAllocScavenge(t *testing.T) { + type test struct { + request, expect uintptr + } + minPages := PhysPageSize / PageSize + if minPages < 1 { + minPages = 1 + } + tests := map[string]struct { + beforeAlloc map[ChunkIdx][]BitRange + beforeScav map[ChunkIdx][]BitRange + expect []test + afterScav map[ChunkIdx][]BitRange + }{ + "AllFreeUnscavExhaust": { + beforeAlloc: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + BaseChunkIdx + 2: {}, + }, + beforeScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + BaseChunkIdx + 2: {}, + }, + expect: []test{ + {^uintptr(0), 3 * PallocChunkPages * PageSize}, + }, + afterScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + }, + }, + "NoneFreeUnscavExhaust": { + beforeAlloc: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + }, + beforeScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {}, + }, + expect: []test{ + {^uintptr(0), 0}, + }, + afterScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {}, + }, + }, + "ScavHighestPageFirst": { + beforeAlloc: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + beforeScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{uint(minPages), PallocChunkPages - uint(2*minPages)}}, + }, + expect: []test{ + {1, minPages * PageSize}, + }, + afterScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{uint(minPages), PallocChunkPages - uint(minPages)}}, + }, + }, + "ScavMultiple": { + beforeAlloc: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + beforeScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{uint(minPages), PallocChunkPages - uint(2*minPages)}}, + }, + expect: []test{ + {minPages * PageSize, minPages * PageSize}, + {minPages * PageSize, minPages * PageSize}, + }, + afterScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + }, + "ScavMultiple2": { + beforeAlloc: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + }, + beforeScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{uint(minPages), PallocChunkPages - uint(2*minPages)}}, + BaseChunkIdx + 1: {{0, PallocChunkPages - uint(2*minPages)}}, + }, + expect: []test{ + {2 * minPages * PageSize, 2 * minPages * PageSize}, + {minPages * PageSize, minPages * 
PageSize}, + {minPages * PageSize, minPages * PageSize}, + }, + afterScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + }, + }, + "ScavDiscontiguous": { + beforeAlloc: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 0xe: {}, + }, + beforeScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{uint(minPages), PallocChunkPages - uint(2*minPages)}}, + BaseChunkIdx + 0xe: {{uint(2 * minPages), PallocChunkPages - uint(2*minPages)}}, + }, + expect: []test{ + {2 * minPages * PageSize, 2 * minPages * PageSize}, + {^uintptr(0), 2 * minPages * PageSize}, + {^uintptr(0), 0}, + }, + afterScav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 0xe: {{0, PallocChunkPages}}, + }, + }, + } + for name, v := range tests { + v := v + runTest := func(t *testing.T, locked bool) { + b := NewPageAlloc(v.beforeAlloc, v.beforeScav) + defer FreePageAlloc(b) + + for iter, h := range v.expect { + if got := b.Scavenge(h.request, locked); got != h.expect { + t.Fatalf("bad scavenge #%d: want %d, got %d", iter+1, h.expect, got) + } + } + want := NewPageAlloc(v.beforeAlloc, v.afterScav) + defer FreePageAlloc(want) + + checkPageAlloc(t, want, b) + } + t.Run(name, func(t *testing.T) { + runTest(t, false) + }) + t.Run(name+"Locked", func(t *testing.T) { + runTest(t, true) + }) + } +} diff --git a/libgo/go/runtime/mgcsweep.go b/libgo/go/runtime/mgcsweep.go index c1c6e65..1e959a4 100644 --- a/libgo/go/runtime/mgcsweep.go +++ b/libgo/go/runtime/mgcsweep.go @@ -116,12 +116,12 @@ func sweepone() uintptr { atomic.Store(&mheap_.sweepdone, 1) break } - if s.state != mSpanInUse { + if state := s.state.get(); state != mSpanInUse { // This can happen if direct sweeping already // swept this span, but in that case the sweep // generation should always be up-to-date. if !(s.sweepgen == sg || s.sweepgen == sg+3) { - print("runtime: bad span s.state=", s.state, " s.sweepgen=", s.sweepgen, " sweepgen=", sg, "\n") + print("runtime: bad span s.state=", state, " s.sweepgen=", s.sweepgen, " sweepgen=", sg, "\n") throw("non in-use span in unswept list") } continue @@ -213,8 +213,8 @@ func (s *mspan) sweep(preserve bool) bool { throw("mspan.sweep: m is not locked") } sweepgen := mheap_.sweepgen - if s.state != mSpanInUse || s.sweepgen != sweepgen-1 { - print("mspan.sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 { + print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") throw("mspan.sweep: bad span state") } @@ -353,8 +353,8 @@ func (s *mspan) sweep(preserve bool) bool { if freeToHeap || nfreed == 0 { // The span must be in our exclusive ownership until we update sweepgen, // check for potential races. - if s.state != mSpanInUse || s.sweepgen != sweepgen-1 { - print("mspan.sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 { + print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") throw("mspan.sweep: bad span state after sweep") } // Serialization point. 
@@ -388,7 +388,7 @@ func (s *mspan) sweep(preserve bool) bool { s.limit = 0 // prevent mlookup from finding this span sysFault(unsafe.Pointer(s.base()), size) } else { - mheap_.freeSpan(s, true) + mheap_.freeSpan(s) } c.local_nlargefree++ c.local_largefree += size diff --git a/libgo/go/runtime/mgcsweepbuf.go b/libgo/go/runtime/mgcsweepbuf.go index 0491f7c..7828822 100644 --- a/libgo/go/runtime/mgcsweepbuf.go +++ b/libgo/go/runtime/mgcsweepbuf.go @@ -111,8 +111,9 @@ retry: unlock(&b.spineLock) } - // We have a block. Insert the span. - block.spans[bottom] = s + // We have a block. Insert the span atomically, since there may be + // concurrent readers via the block API. + atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), unsafe.Pointer(s)) } // pop removes and returns a span from buffer b, or nil if b is empty. @@ -147,7 +148,9 @@ func (b *gcSweepBuf) numBlocks() int { } // block returns the spans in the i'th block of buffer b. block is -// safe to call concurrently with push. +// safe to call concurrently with push. The block may contain nil +// pointers that must be ignored, and each entry in the block must be +// loaded atomically. func (b *gcSweepBuf) block(i int) []*mspan { // Perform bounds check before loading spine address since // push ensures the allocated length is at least spineLen. @@ -169,11 +172,5 @@ func (b *gcSweepBuf) block(i int) []*mspan { } else { spans = block.spans[:bottom] } - - // push may have reserved a slot but not filled it yet, so - // trim away unused entries. - for len(spans) > 0 && spans[len(spans)-1] == nil { - spans = spans[:len(spans)-1] - } return spans } diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go index 89d1e0e..a1b61ad 100644 --- a/libgo/go/runtime/mgcwork.go +++ b/libgo/go/runtime/mgcwork.go @@ -126,12 +126,12 @@ func (w *gcWork) checkPut(ptr uintptr, ptrs []uintptr) { if debugCachedWork { alreadyFailed := w.putGen == w.pauseGen w.putGen = w.pauseGen - if m := getg().m; m.locks > 0 || m.mallocing != 0 || m.preemptoff != "" || m.p.ptr().status != _Prunning { + if !canPreemptM(getg().m) { // If we were to spin, the runtime may - // deadlock: the condition above prevents - // preemption (see newstack), which could - // prevent gcMarkDone from finishing the - // ragged barrier and releasing the spin. + // deadlock. Since we can't be preempted, the + // spin could prevent gcMarkDone from + // finishing the ragged barrier, which is what + // releases us from the spin. return } for atomic.Load(&gcWorkPauseGen) == w.pauseGen { diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go index cd01b3f..f40589a 100644 --- a/libgo/go/runtime/mheap.go +++ b/libgo/go/runtime/mheap.go @@ -15,10 +15,19 @@ import ( "unsafe" ) -// minPhysPageSize is a lower-bound on the physical page size. The -// true physical page size may be larger than this. In contrast, -// sys.PhysPageSize is an upper-bound on the physical page size. -const minPhysPageSize = 4096 +const ( + // minPhysPageSize is a lower-bound on the physical page size. The + // true physical page size may be larger than this. In contrast, + // sys.PhysPageSize is an upper-bound on the physical page size. + minPhysPageSize = 4096 + + // maxPhysPageSize is the maximum page size the runtime supports. + maxPhysPageSize = 512 << 10 + + // maxPhysHugePageSize sets an upper-bound on the maximum huge page size + // that the runtime supports. + maxPhysHugePageSize = pallocChunkBytes +) // Main malloc heap. 
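The gcSweepBuf changes above shift responsibility to readers: block entries are published with an atomic store, so callers of block must load each slot atomically and skip nils. A standalone sketch of that reader contract; the span type and plain slice here are stand-ins, not the runtime's spine/block structures.

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

// Stand-in for *mspan; real entries are published by push with an atomic
// pointer store (atomic.StorepNoWB in the hunk above).
type span struct{ id int }

// readBlock loads each slot atomically and ignores slots that are still nil.
func readBlock(slots []unsafe.Pointer) []*span {
	var out []*span
	for i := range slots {
		p := atomic.LoadPointer(&slots[i])
		if p == nil {
			continue // reserved by a concurrent push but not filled in yet
		}
		out = append(out, (*span)(p))
	}
	return out
}

func main() {
	slots := make([]unsafe.Pointer, 4)
	atomic.StorePointer(&slots[0], unsafe.Pointer(&span{id: 1}))
	atomic.StorePointer(&slots[2], unsafe.Pointer(&span{id: 3}))
	for _, s := range readBlock(slots) {
		fmt.Println(s.id) // 1, then 3
	}
}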
// The heap itself is the "free" and "scav" treaps, @@ -32,10 +41,10 @@ type mheap struct { // lock must only be acquired on the system stack, otherwise a g // could self-deadlock if its stack grows with the lock held. lock mutex - free mTreap // free spans - sweepgen uint32 // sweep generation, see comment in mspan - sweepdone uint32 // all spans are swept - sweepers uint32 // number of active sweepone calls + pages pageAlloc // page allocation data structure + sweepgen uint32 // sweep generation, see comment in mspan; written during STW + sweepdone uint32 // all spans are swept + sweepers uint32 // number of active sweepone calls // allspans is a slice of all mspans ever created. Each mspan // appears exactly once. @@ -81,7 +90,7 @@ type mheap struct { // accounting for current progress. If we could only adjust // the slope, it would create a discontinuity in debt if any // progress has already been made. - pagesInUse uint64 // pages of spans in stats mSpanInUse; R/W with mheap.lock + pagesInUse uint64 // pages of spans in stats mSpanInUse; updated atomically pagesSwept uint64 // pages swept this cycle; updated atomically pagesSweptBasis uint64 // pagesSwept to use as the origin of the sweep ratio; updated atomically sweepHeapLiveBasis uint64 // value of heap_live to use as the origin of sweep ratio; written with lock, read without @@ -89,24 +98,10 @@ type mheap struct { // TODO(austin): pagesInUse should be a uintptr, but the 386 // compiler can't 8-byte align fields. - // Scavenger pacing parameters - // - // The two basis parameters and the scavenge ratio parallel the proportional - // sweeping implementation, the primary differences being that: - // * Scavenging concerns itself with RSS, estimated as heapRetained() - // * Rather than pacing the scavenger to the GC, it is paced to a - // time-based rate computed in gcPaceScavenger. - // - // scavengeRetainedGoal represents our goal RSS. - // - // All fields must be accessed with lock. - // - // TODO(mknyszek): Consider abstracting the basis fields and the scavenge ratio - // into its own type so that this logic may be shared with proportional sweeping. - scavengeTimeBasis int64 - scavengeRetainedBasis uint64 - scavengeBytesPerNS float64 - scavengeRetainedGoal uint64 + // scavengeGoal is the amount of total retained heap memory (measured by + // heapRetained) that the runtime will try to maintain by returning memory + // to the OS. + scavengeGoal uint64 // Page reclaimer state @@ -185,6 +180,12 @@ type mheap struct { // simply blocking GC (by disabling preemption). sweepArenas []arenaIdx + // curArena is the arena that the heap is currently growing + // into. This should always be physPageSize-aligned. + curArena struct { + base, end uintptr + } + _ uint32 // ensure 64-bit alignment of central // central free lists for small size classes. @@ -199,7 +200,6 @@ type mheap struct { spanalloc fixalloc // allocator for span* cachealloc fixalloc // allocator for mcache* - treapalloc fixalloc // allocator for treapNodes* specialfinalizeralloc fixalloc // allocator for specialfinalizer* specialprofilealloc fixalloc // allocator for specialprofile* speciallock mutex // lock for special record allocators. @@ -213,10 +213,6 @@ var mheap_ mheap // A heapArena stores metadata for a heap arena. heapArenas are stored // outside of the Go heap and accessed via the mheap_.arenas index. // -// This gets allocated directly from the OS, so ideally it should be a -// multiple of the system page size. For example, avoid adding small -// fields. 
-// //go:notinheap type heapArena struct { // bitmap stores the pointer/scalar bitmap for the words in @@ -242,7 +238,7 @@ type heapArena struct { // but only the bit corresponding to the first page in each // span is used. // - // Writes are protected by mheap_.lock. + // Reads and writes are atomic. pageInUse [pagesPerArena / 8]uint8 // pageMarks is a bitmap that indicates which spans have any @@ -259,6 +255,18 @@ type heapArena struct { // faster scanning, but we don't have 64-bit atomic bit // operations. pageMarks [pagesPerArena / 8]uint8 + + // zeroedBase marks the first byte of the first page in this + // arena which hasn't been used yet and is therefore already + // zero. zeroedBase is relative to the arena base. + // Increases monotonically until it hits heapArenaBytes. + // + // This field is sufficient to determine if an allocation + // needs to be zeroed because the page allocator follows an + // address-ordered first-fit policy. + // + // Read atomically and written with an atomic CAS. + zeroedBase uintptr } // arenaHint is a hint for where to grow the heap arenas. See @@ -298,13 +306,20 @@ type arenaHint struct { // * During GC (gcphase != _GCoff), a span *must not* transition from // manual or in-use to free. Because concurrent GC may read a pointer // and then look up its span, the span state must be monotonic. +// +// Setting mspan.state to mSpanInUse or mSpanManual must be done +// atomically and only after all other span fields are valid. +// Likewise, if inspecting a span is contingent on it being +// mSpanInUse, the state should be loaded atomically and checked +// before depending on other fields. This allows the garbage collector +// to safely deal with potentially invalid pointers, since resolving +// such pointers may race with a span being allocated. type mSpanState uint8 const ( mSpanDead mSpanState = iota mSpanInUse // allocated for garbage collected heap mSpanManual // allocated for manual management (e.g., stack allocator) - mSpanFree ) // mSpanStateNames are the names of the span states, indexed by @@ -316,6 +331,21 @@ var mSpanStateNames = []string{ "mSpanFree", } +// mSpanStateBox holds an mSpanState and provides atomic operations on +// it. This is a separate type to disallow accidental comparison or +// assignment with mSpanState. +type mSpanStateBox struct { + s mSpanState +} + +func (b *mSpanStateBox) set(s mSpanState) { + atomic.Store8((*uint8)(&b.s), uint8(s)) +} + +func (b *mSpanStateBox) get() mSpanState { + return mSpanState(atomic.Load8((*uint8)(&b.s))) +} + // mSpanList heads a linked list of spans. // //go:notinheap @@ -397,19 +427,18 @@ type mspan struct { // h->sweepgen is incremented by 2 after every GC sweepgen uint32 - divMul uint16 // for divide by elemsize - divMagic.mul - baseMask uint16 // if non-0, elemsize is a power of 2, & this will get object allocation base - allocCount uint16 // number of allocated objects - spanclass spanClass // size class and noscan (uint8) - state mSpanState // mspaninuse etc - needzero uint8 // needs to be zeroed before allocation - divShift uint8 // for divide by elemsize - divMagic.shift - divShift2 uint8 // for divide by elemsize - divMagic.shift2 - scavenged bool // whether this span has had its pages released to the OS - elemsize uintptr // computed from sizeclass or from npages - limit uintptr // end of data in span - speciallock mutex // guards specials list - specials *special // linked list of special records sorted by offset. 
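A standalone sketch of the mSpanStateBox accessors introduced above and of how callers load the state once before acting on it. The runtime uses an 8-bit atomic store on the real type; uint32 is used here only because sync/atomic has no 8-bit operations.

package main

import (
	"fmt"
	"sync/atomic"
)

type spanState uint32

const (
	spanDead spanState = iota
	spanInUse
	spanManual
)

// spanStateBox mirrors mSpanStateBox: the state is only touched through
// atomic accessors, so concurrent inspectors see either the old state or the
// fully published new one, never something in between.
type spanStateBox struct{ s uint32 }

func (b *spanStateBox) set(s spanState) { atomic.StoreUint32(&b.s, uint32(s)) }
func (b *spanStateBox) get() spanState  { return spanState(atomic.LoadUint32(&b.s)) }

func main() {
	var st spanStateBox
	st.set(spanInUse)
	// Load once into a local, then check and report the same value, mirroring
	// "if state := s.state.get(); state != mSpanInUse { ... }" in the sweeper.
	if state := st.get(); state != spanInUse {
		fmt.Println("bad span state:", state)
	} else {
		fmt.Println("span is in use")
	}
}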
+ divMul uint16 // for divide by elemsize - divMagic.mul + baseMask uint16 // if non-0, elemsize is a power of 2, & this will get object allocation base + allocCount uint16 // number of allocated objects + spanclass spanClass // size class and noscan (uint8) + state mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods) + needzero uint8 // needs to be zeroed before allocation + divShift uint8 // for divide by elemsize - divMagic.shift + divShift2 uint8 // for divide by elemsize - divMagic.shift2 + elemsize uintptr // computed from sizeclass or from npages + limit uintptr // end of data in span + speciallock mutex // guards specials list + specials *special // linked list of special records sorted by offset. } func (s *mspan) base() uintptr { @@ -425,181 +454,6 @@ func (s *mspan) layout() (size, n, total uintptr) { return } -// physPageBounds returns the start and end of the span -// rounded in to the physical page size. -func (s *mspan) physPageBounds() (uintptr, uintptr) { - start := s.base() - end := start + s.npages<<_PageShift - if physPageSize > _PageSize { - // Round start and end in. - start = (start + physPageSize - 1) &^ (physPageSize - 1) - end &^= physPageSize - 1 - } - return start, end -} - -func (h *mheap) coalesce(s *mspan) { - // merge is a helper which merges other into s, deletes references to other - // in heap metadata, and then discards it. other must be adjacent to s. - merge := func(a, b, other *mspan) { - // Caller must ensure a.startAddr < b.startAddr and that either a or - // b is s. a and b must be adjacent. other is whichever of the two is - // not s. - - if pageSize < physPageSize && a.scavenged && b.scavenged { - // If we're merging two scavenged spans on systems where - // pageSize < physPageSize, then their boundary should always be on - // a physical page boundary, due to the realignment that happens - // during coalescing. Throw if this case is no longer true, which - // means the implementation should probably be changed to scavenge - // along the boundary. - _, start := a.physPageBounds() - end, _ := b.physPageBounds() - if start != end { - println("runtime: a.base=", hex(a.base()), "a.npages=", a.npages) - println("runtime: b.base=", hex(b.base()), "b.npages=", b.npages) - println("runtime: physPageSize=", physPageSize, "pageSize=", pageSize) - throw("neighboring scavenged spans boundary is not a physical page boundary") - } - } - - // Adjust s via base and npages and also in heap metadata. - s.npages += other.npages - s.needzero |= other.needzero - if a == s { - h.setSpan(s.base()+s.npages*pageSize-1, s) - } else { - s.startAddr = other.startAddr - h.setSpan(s.base(), s) - } - - // The size is potentially changing so the treap needs to delete adjacent nodes and - // insert back as a combined node. - h.free.removeSpan(other) - other.state = mSpanDead - h.spanalloc.free(unsafe.Pointer(other)) - } - - // realign is a helper which shrinks other and grows s such that their - // boundary is on a physical page boundary. - realign := func(a, b, other *mspan) { - // Caller must ensure a.startAddr < b.startAddr and that either a or - // b is s. a and b must be adjacent. other is whichever of the two is - // not s. - - // If pageSize >= physPageSize then spans are always aligned - // to physical page boundaries, so just exit. - if pageSize >= physPageSize { - return - } - // Since we're resizing other, we must remove it from the treap. 
- h.free.removeSpan(other) - - // Round boundary to the nearest physical page size, toward the - // scavenged span. - boundary := b.startAddr - if a.scavenged { - boundary &^= (physPageSize - 1) - } else { - boundary = (boundary + physPageSize - 1) &^ (physPageSize - 1) - } - a.npages = (boundary - a.startAddr) / pageSize - b.npages = (b.startAddr + b.npages*pageSize - boundary) / pageSize - b.startAddr = boundary - - h.setSpan(boundary-1, a) - h.setSpan(boundary, b) - - // Re-insert other now that it has a new size. - h.free.insert(other) - } - - hpMiddle := s.hugePages() - - // Coalesce with earlier, later spans. - var hpBefore uintptr - if before := spanOf(s.base() - 1); before != nil && before.state == mSpanFree { - if s.scavenged == before.scavenged { - hpBefore = before.hugePages() - merge(before, s, before) - } else { - realign(before, s, before) - } - } - - // Now check to see if next (greater addresses) span is free and can be coalesced. - var hpAfter uintptr - if after := spanOf(s.base() + s.npages*pageSize); after != nil && after.state == mSpanFree { - if s.scavenged == after.scavenged { - hpAfter = after.hugePages() - merge(s, after, after) - } else { - realign(s, after, after) - } - } - if !s.scavenged && s.hugePages() > hpBefore+hpMiddle+hpAfter { - // If s has grown such that it now may contain more huge pages than it - // and its now-coalesced neighbors did before, then mark the whole region - // as huge-page-backable. - // - // Otherwise, on systems where we break up huge pages (like Linux) - // s may not be backed by huge pages because it could be made up of - // pieces which are broken up in the underlying VMA. The primary issue - // with this is that it can lead to a poor estimate of the amount of - // free memory backed by huge pages for determining the scavenging rate. - // - // TODO(mknyszek): Measure the performance characteristics of sysHugePage - // and determine whether it makes sense to only sysHugePage on the pages - // that matter, or if it's better to just mark the whole region. - sysHugePage(unsafe.Pointer(s.base()), s.npages*pageSize) - } -} - -// hugePages returns the number of aligned physical huge pages in the memory -// regioned owned by this mspan. -func (s *mspan) hugePages() uintptr { - if physHugePageSize == 0 || s.npages < physHugePageSize/pageSize { - return 0 - } - start := s.base() - end := start + s.npages*pageSize - if physHugePageSize > pageSize { - // Round start and end in. - start = (start + physHugePageSize - 1) &^ (physHugePageSize - 1) - end &^= physHugePageSize - 1 - } - if start < end { - return (end - start) >> physHugePageShift - } - return 0 -} - -func (s *mspan) scavenge() uintptr { - // start and end must be rounded in, otherwise madvise - // will round them *out* and release more memory - // than we want. - start, end := s.physPageBounds() - if end <= start { - // start and end don't span a whole physical page. - return 0 - } - released := end - start - memstats.heap_released += uint64(released) - s.scavenged = true - sysUnused(unsafe.Pointer(start), released) - return released -} - -// released returns the number of bytes in this span -// which were returned back to the OS. -func (s *mspan) released() uintptr { - if !s.scavenged { - return 0 - } - start, end := s.physPageBounds() - return end - start -} - // recordspan adds a newly allocated span to h.allspans. 
// // This only happens the first time a span is allocated from @@ -726,7 +580,7 @@ func inHeapOrStack(b uintptr) bool { if s == nil || b < s.base() { return false } - switch s.state { + switch s.state.get() { case mSpanInUse, mSpanManual: return b < s.limit default: @@ -793,9 +647,12 @@ func spanOfUnchecked(p uintptr) *mspan { //go:nosplit func spanOfHeap(p uintptr) *mspan { s := spanOf(p) - // If p is not allocated, it may point to a stale span, so we - // have to check the span's bounds and state. - if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse { + // s is nil if it's never been allocated. Otherwise, we check + // its state first because we don't trust this pointer, so we + // have to synchronize with span initialization. Then, it's + // still possible we picked up a stale span pointer, so we + // have to check the span's bounds. + if s == nil || s.state.get() != mSpanInUse || p < s.base() || p >= s.limit { return nil } return s @@ -813,7 +670,6 @@ func pageIndexOf(p uintptr) (arena *heapArena, pageIdx uintptr, pageMask uint8) // Initialize the heap. func (h *mheap) init() { - h.treapalloc.init(unsafe.Sizeof(treapNode{}), nil, nil, &memstats.other_sys) h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys) h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys) h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys) @@ -834,6 +690,8 @@ func (h *mheap) init() { for i := range h.central { h.central[i].mcentral.init(spanClass(i)) } + + h.pages.init(&h.lock, &memstats.gc_sys) } // reclaim sweeps and reclaims at least npage pages into the heap. @@ -954,7 +812,7 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr { // Scan this bitmap chunk for spans that are in-use // but have no marked objects on them. for i := range inUse { - inUseUnmarked := inUse[i] &^ marked[i] + inUseUnmarked := atomic.Load8(&inUse[i]) &^ marked[i] if inUseUnmarked == 0 { continue } @@ -973,7 +831,7 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr { // spans were freed when we dropped the // lock and we don't want to get stale // pointers from the spans array. - inUseUnmarked = inUse[i] &^ marked[i] + inUseUnmarked = atomic.Load8(&inUse[i]) &^ marked[i] } } } @@ -990,100 +848,23 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr { return nFreed } -// alloc_m is the internal implementation of mheap.alloc. -// -// alloc_m must run on the system stack because it locks the heap, so -// any stack growth during alloc_m would self-deadlock. -// -//go:systemstack -func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan { - _g_ := getg() - - // To prevent excessive heap growth, before allocating n pages - // we need to sweep and reclaim at least n pages. - if h.sweepdone == 0 { - h.reclaim(npage) - } - - lock(&h.lock) - // transfer stats from cache to global - memstats.heap_scan += uint64(_g_.m.mcache.local_scan) - _g_.m.mcache.local_scan = 0 - memstats.tinyallocs += uint64(_g_.m.mcache.local_tinyallocs) - _g_.m.mcache.local_tinyallocs = 0 - - s := h.allocSpanLocked(npage, &memstats.heap_inuse) - if s != nil { - // Record span info, because gc needs to be - // able to map interior pointer to containing span. - atomic.Store(&s.sweepgen, h.sweepgen) - h.sweepSpans[h.sweepgen/2%2].push(s) // Add to swept in-use list. 
- s.state = mSpanInUse - s.allocCount = 0 - s.spanclass = spanclass - if sizeclass := spanclass.sizeclass(); sizeclass == 0 { - s.elemsize = s.npages << _PageShift - s.divShift = 0 - s.divMul = 0 - s.divShift2 = 0 - s.baseMask = 0 - } else { - s.elemsize = uintptr(class_to_size[sizeclass]) - m := &class_to_divmagic[sizeclass] - s.divShift = m.shift - s.divMul = m.mul - s.divShift2 = m.shift2 - s.baseMask = m.baseMask - } - - // Mark in-use span in arena page bitmap. - arena, pageIdx, pageMask := pageIndexOf(s.base()) - arena.pageInUse[pageIdx] |= pageMask - - // update stats, sweep lists - h.pagesInUse += uint64(npage) - if large { - memstats.heap_objects++ - mheap_.largealloc += uint64(s.elemsize) - mheap_.nlargealloc++ - atomic.Xadd64(&memstats.heap_live, int64(npage<<_PageShift)) - } - } - // heap_scan and heap_live were updated. - if gcBlackenEnabled != 0 { - gcController.revise() - } - - if trace.enabled { - traceHeapAlloc() - } - - // h.spans is accessed concurrently without synchronization - // from other threads. Hence, there must be a store/store - // barrier here to ensure the writes to h.spans above happen - // before the caller can publish a pointer p to an object - // allocated from s. As soon as this happens, the garbage - // collector running on another processor could read p and - // look up s in h.spans. The unlock acts as the barrier to - // order these writes. On the read side, the data dependency - // between p and the index in h.spans orders the reads. - unlock(&h.lock) - return s -} - // alloc allocates a new span of npage pages from the GC'd heap. // -// Either large must be true or spanclass must indicates the span's -// size class and scannability. +// spanclass indicates the span's size class and scannability. // // If needzero is true, the memory for the returned span will be zeroed. -func (h *mheap) alloc(npage uintptr, spanclass spanClass, large bool, needzero bool) *mspan { +func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan { // Don't do any operations that lock the heap on the G stack. // It might trigger stack growth, and the stack growth code needs // to be able to allocate heap. var s *mspan systemstack(func() { - s = h.alloc_m(npage, spanclass, large) + // To prevent excessive heap growth, before allocating n pages + // we need to sweep and reclaim at least n pages. + if h.sweepdone == 0 { + h.reclaim(npages) + } + s = h.allocSpan(npages, false, spanclass, &memstats.heap_inuse) }) if s != nil { @@ -1105,35 +886,12 @@ func (h *mheap) alloc(npage uintptr, spanclass spanClass, large bool, needzero b // The memory backing the returned span may not be zeroed if // span.needzero is set. // -// allocManual must be called on the system stack because it acquires -// the heap lock. See mheap for details. +// allocManual must be called on the system stack because it may +// acquire the heap lock via allocSpan. See mheap for details. // //go:systemstack -func (h *mheap) allocManual(npage uintptr, stat *uint64) *mspan { - lock(&h.lock) - s := h.allocSpanLocked(npage, stat) - if s != nil { - s.state = mSpanManual - s.manualFreeList = 0 - s.allocCount = 0 - s.spanclass = 0 - s.nelems = 0 - s.elemsize = 0 - s.limit = s.base() + s.npages<<_PageShift - // Manually managed memory doesn't count toward heap_sys. - memstats.heap_sys -= uint64(s.npages << _PageShift) - } - - // This unlock acts as a release barrier. See mheap.alloc_m. - unlock(&h.lock) - - return s -} - -// setSpan modifies the span map so spanOf(base) is s. 
-func (h *mheap) setSpan(base uintptr, s *mspan) { - ai := arenaIndex(base) - h.arenas[ai.l1()][ai.l2()].spans[(base/pageSize)%pagesPerArena] = s +func (h *mheap) allocManual(npages uintptr, stat *uint64) *mspan { + return h.allocSpan(npages, true, 0, stat) } // setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize)) @@ -1152,93 +910,357 @@ func (h *mheap) setSpans(base, npage uintptr, s *mspan) { } } -// Allocates a span of the given size. h must be locked. -// The returned span has been removed from the -// free structures, but its state is still mSpanFree. -func (h *mheap) allocSpanLocked(npage uintptr, stat *uint64) *mspan { - t := h.free.find(npage) - if t.valid() { - goto HaveSpan +// allocNeedsZero checks if the region of address space [base, base+npage*pageSize), +// assumed to be allocated, needs to be zeroed, updating heap arena metadata for +// future allocations. +// +// This must be called each time pages are allocated from the heap, even if the page +// allocator can otherwise prove the memory it's allocating is already zero because +// they're fresh from the operating system. It updates heapArena metadata that is +// critical for future page allocations. +// +// There are no locking constraints on this method. +func (h *mheap) allocNeedsZero(base, npage uintptr) (needZero bool) { + for npage > 0 { + ai := arenaIndex(base) + ha := h.arenas[ai.l1()][ai.l2()] + + zeroedBase := atomic.Loaduintptr(&ha.zeroedBase) + arenaBase := base % heapArenaBytes + if arenaBase < zeroedBase { + // We extended into the non-zeroed part of the + // arena, so this region needs to be zeroed before use. + // + // zeroedBase is monotonically increasing, so if we see this now then + // we can be sure we need to zero this memory region. + // + // We still need to update zeroedBase for this arena, and + // potentially more arenas. + needZero = true + } + // We may observe arenaBase > zeroedBase if we're racing with one or more + // allocations which are acquiring memory directly before us in the address + // space. But, because we know no one else is acquiring *this* memory, it's + // still safe to not zero. + + // Compute how far into the arena we extend into, capped + // at heapArenaBytes. + arenaLimit := arenaBase + npage*pageSize + if arenaLimit > heapArenaBytes { + arenaLimit = heapArenaBytes + } + // Increase ha.zeroedBase so it's >= arenaLimit. + // We may be racing with other updates. + for arenaLimit > zeroedBase { + if atomic.Casuintptr(&ha.zeroedBase, zeroedBase, arenaLimit) { + break + } + zeroedBase = atomic.Loaduintptr(&ha.zeroedBase) + // Sanity check zeroedBase. + if zeroedBase <= arenaLimit && zeroedBase > arenaBase { + // The zeroedBase moved into the space we were trying to + // claim. That's very bad, and indicates someone allocated + // the same region we did. + throw("potentially overlapping in-use allocations detected") + } + } + + // Move base forward and subtract from npage to move into + // the next arena, or finish. + base += arenaLimit - arenaBase + npage -= (arenaLimit - arenaBase) / pageSize } - if !h.grow(npage) { + return +} + +// tryAllocMSpan attempts to allocate an mspan object from +// the P-local cache, but may fail. +// +// h need not be locked. +// +// This caller must ensure that its P won't change underneath +// it during this function. Currently to ensure that we enforce +// that the function is run on the system stack, because that's +// the only place it is used now. 
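The zeroedBase logic in allocNeedsZero above reduces to a small standalone sketch: a per-arena high-water mark raised with a CAS loop. The arena size below is an arbitrary example, and the patch's extra cross-check for overlapping allocations is omitted.

package main

import (
	"fmt"
	"sync/atomic"
)

const arenaBytes = 1 << 20 // example arena size, standing in for heapArenaBytes

var zeroedBase uintptr // high-water mark of memory handed out, arena-relative

// allocNeedsZero reports whether [arenaOff, arenaOff+size) may hold stale data
// and raises zeroedBase with a CAS loop so racing allocators stay consistent.
func allocNeedsZero(arenaOff, size uintptr) bool {
	zb := atomic.LoadUintptr(&zeroedBase)
	needZero := arenaOff < zb // below the mark => previously used memory

	limit := arenaOff + size
	if limit > arenaBytes {
		limit = arenaBytes
	}
	for limit > zb {
		if atomic.CompareAndSwapUintptr(&zeroedBase, zb, limit) {
			break
		}
		zb = atomic.LoadUintptr(&zeroedBase)
	}
	return needZero
}

func main() {
	fmt.Println(allocNeedsZero(0, 4096))    // false: fresh from the OS
	fmt.Println(allocNeedsZero(4096, 4096)) // false: still above the mark
	fmt.Println(allocNeedsZero(0, 4096))    // true: reused address space
}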
In the future, this requirement +// may be relaxed if its use is necessary elsewhere. +// +//go:systemstack +func (h *mheap) tryAllocMSpan() *mspan { + pp := getg().m.p.ptr() + // If we don't have a p or the cache is empty, we can't do + // anything here. + if pp == nil || pp.mspancache.len == 0 { return nil } - t = h.free.find(npage) - if t.valid() { - goto HaveSpan + // Pull off the last entry in the cache. + s := pp.mspancache.buf[pp.mspancache.len-1] + pp.mspancache.len-- + return s +} + +// allocMSpanLocked allocates an mspan object. +// +// h must be locked. +// +// allocMSpanLocked must be called on the system stack because +// its caller holds the heap lock. See mheap for details. +// Running on the system stack also ensures that we won't +// switch Ps during this function. See tryAllocMSpan for details. +// +//go:systemstack +func (h *mheap) allocMSpanLocked() *mspan { + pp := getg().m.p.ptr() + if pp == nil { + // We don't have a p so just do the normal thing. + return (*mspan)(h.spanalloc.alloc()) + } + // Refill the cache if necessary. + if pp.mspancache.len == 0 { + const refillCount = len(pp.mspancache.buf) / 2 + for i := 0; i < refillCount; i++ { + pp.mspancache.buf[i] = (*mspan)(h.spanalloc.alloc()) + } + pp.mspancache.len = refillCount } - throw("grew heap, but no adequate free span found") + // Pull off the last entry in the cache. + s := pp.mspancache.buf[pp.mspancache.len-1] + pp.mspancache.len-- + return s +} -HaveSpan: - s := t.span() - if s.state != mSpanFree { - throw("candidate mspan for allocation is not free") - } - - // First, subtract any memory that was released back to - // the OS from s. We will add back what's left if necessary. - memstats.heap_released -= uint64(s.released()) - - if s.npages == npage { - h.free.erase(t) - } else if s.npages > npage { - // Trim off the lower bits and make that our new span. - // Do this in-place since this operation does not - // affect the original span's location in the treap. - n := (*mspan)(h.spanalloc.alloc()) - h.free.mutate(t, func(s *mspan) { - n.init(s.base(), npage) - s.npages -= npage - s.startAddr = s.base() + npage*pageSize - h.setSpan(s.base()-1, n) - h.setSpan(s.base(), s) - h.setSpan(n.base(), n) - n.needzero = s.needzero - // n may not be big enough to actually be scavenged, but that's fine. - // We still want it to appear to be scavenged so that we can do the - // right bookkeeping later on in this function (i.e. sysUsed). - n.scavenged = s.scavenged - // Check if s is still scavenged. - if s.scavenged { - start, end := s.physPageBounds() - if start < end { - memstats.heap_released += uint64(end - start) - } else { - s.scavenged = false - } +// freeMSpanLocked free an mspan object. +// +// h must be locked. +// +// freeMSpanLocked must be called on the system stack because +// its caller holds the heap lock. See mheap for details. +// Running on the system stack also ensures that we won't +// switch Ps during this function. See tryAllocMSpan for details. +// +//go:systemstack +func (h *mheap) freeMSpanLocked(s *mspan) { + pp := getg().m.p.ptr() + // First try to free the mspan directly to the cache. + if pp != nil && pp.mspancache.len < len(pp.mspancache.buf) { + pp.mspancache.buf[pp.mspancache.len] = s + pp.mspancache.len++ + return + } + // Failing that (or if we don't have a p), just free it to + // the heap. + h.spanalloc.free(unsafe.Pointer(s)) +} + +// allocSpan allocates an mspan which owns npages worth of memory. 
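A standalone sketch of the per-P mspan object cache above: a fixed-size stack consumed without the heap lock and refilled to half capacity under it. The object type and the 128-entry buffer size are assumptions for illustration.

package main

import "fmt"

type object struct{ n int }

type cache struct {
	buf [128]*object
	len int
}

// tryAlloc mirrors tryAllocMSpan: lock-free, and may fail by returning nil.
func (c *cache) tryAlloc() *object {
	if c.len == 0 {
		return nil
	}
	c.len--
	return c.buf[c.len]
}

// refill mirrors the allocMSpanLocked refill path: top the cache up to half
// capacity using the slower allocator (which the runtime calls under the
// heap lock).
func (c *cache) refill(alloc func() *object) {
	refillCount := len(c.buf) / 2
	for c.len < refillCount {
		c.buf[c.len] = alloc()
		c.len++
	}
}

// free mirrors freeMSpanLocked: prefer returning the object to the cache.
func (c *cache) free(o *object, slowFree func(*object)) {
	if c.len < len(c.buf) {
		c.buf[c.len] = o
		c.len++
		return
	}
	slowFree(o)
}

func main() {
	var c cache
	next := 0
	c.refill(func() *object { next++; return &object{n: next} })
	s := c.tryAlloc()
	fmt.Println(s.n, c.len) // 64 63
	c.free(s, func(*object) {})
	fmt.Println(c.len) // 64
}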
+// +// If manual == false, allocSpan allocates a heap span of class spanclass +// and updates heap accounting. If manual == true, allocSpan allocates a +// manually-managed span (spanclass is ignored), and the caller is +// responsible for any accounting related to its use of the span. Either +// way, allocSpan will atomically add the bytes in the newly allocated +// span to *sysStat. +// +// The returned span is fully initialized. +// +// h must not be locked. +// +// allocSpan must be called on the system stack both because it acquires +// the heap lock and because it must block GC transitions. +// +//go:systemstack +func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysStat *uint64) (s *mspan) { + // Function-global state. + gp := getg() + base, scav := uintptr(0), uintptr(0) + + // If the allocation is small enough, try the page cache! + pp := gp.m.p.ptr() + if pp != nil && npages < pageCachePages/4 { + c := &pp.pcache + + // If the cache is empty, refill it. + if c.empty() { + lock(&h.lock) + *c = h.pages.allocToCache() + unlock(&h.lock) + } + + // Try to allocate from the cache. + base, scav = c.alloc(npages) + if base != 0 { + s = h.tryAllocMSpan() + + if s != nil && gcBlackenEnabled == 0 && (manual || spanclass.sizeclass() != 0) { + goto HaveSpan } - }) - s = n - } else { - throw("candidate mspan for allocation is too small") + // We're either running duing GC, failed to acquire a mspan, + // or the allocation is for a large object. This means we + // have to lock the heap and do a bunch of extra work, + // so go down the HaveBaseLocked path. + // + // We must do this during GC to avoid skew with heap_scan + // since we flush mcache stats whenever we lock. + // + // TODO(mknyszek): It would be nice to not have to + // lock the heap if it's a large allocation, but + // it's fine for now. The critical section here is + // short and large object allocations are relatively + // infrequent. + } } - // "Unscavenge" s only AFTER splitting so that - // we only sysUsed whatever we actually need. - if s.scavenged { + + // For one reason or another, we couldn't get the + // whole job done without the heap lock. + lock(&h.lock) + + if base == 0 { + // Try to acquire a base address. + base, scav = h.pages.alloc(npages) + if base == 0 { + if !h.grow(npages) { + unlock(&h.lock) + return nil + } + base, scav = h.pages.alloc(npages) + if base == 0 { + throw("grew heap, but no adequate free space found") + } + } + } + if s == nil { + // We failed to get an mspan earlier, so grab + // one now that we have the heap lock. + s = h.allocMSpanLocked() + } + if !manual { + // This is a heap span, so we should do some additional accounting + // which may only be done with the heap locked. + + // Transfer stats from mcache to global. + memstats.heap_scan += uint64(gp.m.mcache.local_scan) + gp.m.mcache.local_scan = 0 + memstats.tinyallocs += uint64(gp.m.mcache.local_tinyallocs) + gp.m.mcache.local_tinyallocs = 0 + + // Do some additional accounting if it's a large allocation. + if spanclass.sizeclass() == 0 { + mheap_.largealloc += uint64(npages * pageSize) + mheap_.nlargealloc++ + atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize)) + } + + // Either heap_live or heap_scan could have been updated. + if gcBlackenEnabled != 0 { + gcController.revise() + } + } + unlock(&h.lock) + +HaveSpan: + // At this point, both s != nil and base != 0, and the heap + // lock is no longer held. Initialize the span. 
+ s.init(base, npages) + if h.allocNeedsZero(base, npages) { + s.needzero = 1 + } + nbytes := npages * pageSize + if manual { + s.manualFreeList = 0 + s.nelems = 0 + s.limit = s.base() + s.npages*pageSize + // Manually managed memory doesn't count toward heap_sys. + mSysStatDec(&memstats.heap_sys, s.npages*pageSize) + s.state.set(mSpanManual) + } else { + // We must set span properties before the span is published anywhere + // since we're not holding the heap lock. + s.spanclass = spanclass + if sizeclass := spanclass.sizeclass(); sizeclass == 0 { + s.elemsize = nbytes + s.nelems = 1 + + s.divShift = 0 + s.divMul = 0 + s.divShift2 = 0 + s.baseMask = 0 + } else { + s.elemsize = uintptr(class_to_size[sizeclass]) + s.nelems = nbytes / s.elemsize + + m := &class_to_divmagic[sizeclass] + s.divShift = m.shift + s.divMul = m.mul + s.divShift2 = m.shift2 + s.baseMask = m.baseMask + } + + // Initialize mark and allocation structures. + s.freeindex = 0 + s.allocCache = ^uint64(0) // all 1s indicating all free. + s.gcmarkBits = newMarkBits(s.nelems) + s.allocBits = newAllocBits(s.nelems) + + // It's safe to access h.sweepgen without the heap lock because it's + // only ever updated with the world stopped and we run on the + // systemstack which blocks a STW transition. + atomic.Store(&s.sweepgen, h.sweepgen) + + // Now that the span is filled in, set its state. This + // is a publication barrier for the other fields in + // the span. While valid pointers into this span + // should never be visible until the span is returned, + // if the garbage collector finds an invalid pointer, + // access to the span may race with initialization of + // the span. We resolve this race by atomically + // setting the state after the span is fully + // initialized, and atomically checking the state in + // any situation where a pointer is suspect. + s.state.set(mSpanInUse) + } + + // Commit and account for any scavenged memory that the span now owns. + if scav != 0 { // sysUsed all the pages that are actually available - // in the span. Note that we don't need to decrement - // heap_released since we already did so earlier. - sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift) - s.scavenged = false - - // Since we allocated out of a scavenged span, we just - // grew the RSS. Mitigate this by scavenging enough free - // space to make up for it but only if we need to. - // - // scavengeLocked may cause coalescing, so prevent - // coalescing with s by temporarily changing its state. - s.state = mSpanManual - h.scavengeIfNeededLocked(s.npages * pageSize) - s.state = mSpanFree + // in the span since some of them might be scavenged. + sysUsed(unsafe.Pointer(base), nbytes) + mSysStatDec(&memstats.heap_released, scav) } + // Update stats. + mSysStatInc(sysStat, nbytes) + mSysStatDec(&memstats.heap_idle, nbytes) - h.setSpans(s.base(), npage, s) + // Publish the span in various locations. - *stat += uint64(npage << _PageShift) - memstats.heap_idle -= uint64(npage << _PageShift) + // This is safe to call without the lock held because the slots + // related to this span will only every be read or modified by + // this thread until pointers into the span are published or + // pageInUse is updated. + h.setSpans(s.base(), npages, s) - if s.inList() { - throw("still in list") + if !manual { + // Add to swept in-use list. + // + // This publishes the span to root marking. + // + // h.sweepgen is guaranteed to only change during STW, + // and preemption is disabled in the page allocator. 
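The s.state.set(mSpanInUse) above is the publication barrier: every other field is written first, and the atomic state update is what makes the span trustworthy to concurrent readers such as the garbage collector chasing a suspect pointer. A minimal sketch of that pattern with a hypothetical span type:

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
)

const (
	stateDead  = 0
	stateInUse = 1
)

// span stands in for mspan: ordinary fields plus an atomically managed
// state that doubles as the publication barrier.
type span struct {
	state    uint32 // accessed atomically
	base     uintptr
	elemsize uintptr
}

func main() {
	s := &span{state: stateDead}
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		// An observer must check the state atomically and ignore the
		// span until it has been published.
		for atomic.LoadUint32(&s.state) != stateInUse {
			runtime.Gosched()
		}
		// Having observed stateInUse, the fields written before the
		// atomic store are guaranteed to be visible.
		fmt.Println("observer sees base:", s.base, "elemsize:", s.elemsize)
	}()

	// Writer: initialize every field first...
	s.base = 0x1000
	s.elemsize = 64
	// ...and only then set the state, atomically, to publish the span.
	atomic.StoreUint32(&s.state, stateInUse)
	wg.Wait()
}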
+ h.sweepSpans[h.sweepgen/2%2].push(s) + + // Mark in-use span in arena page bitmap. + // + // This publishes the span to the page sweeper, so + // it's imperative that the span be completely initialized + // prior to this line. + arena, pageIdx, pageMask := pageIndexOf(s.base()) + atomic.Or8(&arena.pageInUse[pageIdx], pageMask) + + // Update related page sweeper stats. + atomic.Xadd64(&h.pagesInUse, int64(npages)) + + if trace.enabled { + // Trace that a heap alloc occurred. + traceHeapAlloc() + } } return s } @@ -1248,37 +1270,73 @@ HaveSpan: // // h must be locked. func (h *mheap) grow(npage uintptr) bool { - ask := npage << _PageShift - v, size := h.sysAlloc(ask) - if v == nil { - print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n") - return false - } + // We must grow the heap in whole palloc chunks. + ask := alignUp(npage, pallocChunkPages) * pageSize + + totalGrowth := uintptr(0) + nBase := alignUp(h.curArena.base+ask, physPageSize) + if nBase > h.curArena.end { + // Not enough room in the current arena. Allocate more + // arena space. This may not be contiguous with the + // current arena, so we have to request the full ask. + av, asize := h.sysAlloc(ask) + if av == nil { + print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n") + return false + } + + if uintptr(av) == h.curArena.end { + // The new space is contiguous with the old + // space, so just extend the current space. + h.curArena.end = uintptr(av) + asize + } else { + // The new space is discontiguous. Track what + // remains of the current space and switch to + // the new space. This should be rare. + if size := h.curArena.end - h.curArena.base; size != 0 { + h.pages.grow(h.curArena.base, size) + totalGrowth += size + } + // Switch to the new space. + h.curArena.base = uintptr(av) + h.curArena.end = uintptr(av) + asize + } - // Create a fake "in use" span and free it, so that the - // right accounting and coalescing happens. - s := (*mspan)(h.spanalloc.alloc()) - s.init(uintptr(v), size/pageSize) - h.setSpans(s.base(), s.npages, s) - s.state = mSpanFree - memstats.heap_idle += uint64(size) - // (*mheap).sysAlloc returns untouched/uncommitted memory. - s.scavenged = true - // s is always aligned to the heap arena size which is always > physPageSize, - // so its totally safe to just add directly to heap_released. Coalescing, - // if possible, will also always be correct in terms of accounting, because - // s.base() must be a physical page boundary. - memstats.heap_released += uint64(size) - h.coalesce(s) - h.free.insert(s) + // The memory just allocated counts as both released + // and idle, even though it's not yet backed by spans. + // + // The allocation is always aligned to the heap arena + // size which is always > physPageSize, so its safe to + // just add directly to heap_released. + mSysStatInc(&memstats.heap_released, asize) + mSysStatInc(&memstats.heap_idle, asize) + + // Recalculate nBase + nBase = alignUp(h.curArena.base+ask, physPageSize) + } + + // Grow into the current arena. + v := h.curArena.base + h.curArena.base = nBase + h.pages.grow(v, nBase-v) + totalGrowth += nBase - v + + // We just caused a heap growth, so scavenge down what will soon be used. + // By scavenging inline we deal with the failure to allocate out of + // memory fragments by scavenging the memory fragments that are least + // likely to be re-used. 
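grow only ever extends the page allocator in whole palloc chunks, so the request is rounded up with the usual power-of-two alignment helpers before any memory is mapped. A small sketch of that arithmetic; the pageSize value is an assumption (8 KiB pages) and pallocChunkPages matches the constant defined later in mpagealloc.go:

package main

import "fmt"

// alignUp rounds n up to a multiple of a; alignDown rounds it down.
// a must be a power of two, as with the runtime's helpers.
func alignUp(n, a uintptr) uintptr   { return (n + a - 1) &^ (a - 1) }
func alignDown(n, a uintptr) uintptr { return n &^ (a - 1) }

const (
	pageSize         = 8192 // assumed 8 KiB runtime page
	pallocChunkPages = 512  // pages per bitmap chunk
)

func main() {
	// grow must extend the page allocator in whole palloc chunks, so the
	// request is rounded up to a chunk multiple before asking for memory.
	for _, npages := range []uintptr{1, 300, 513, 1024} {
		ask := alignUp(npages, pallocChunkPages) * pageSize
		fmt.Printf("npages=%4d -> ask=%d bytes (%d chunks)\n",
			npages, ask, ask/(pallocChunkPages*pageSize))
	}
	_ = alignDown // alignDown is the mirror helper used elsewhere in grow
}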
+ if retained := heapRetained(); retained+uint64(totalGrowth) > h.scavengeGoal { + todo := totalGrowth + if overage := uintptr(retained + uint64(totalGrowth) - h.scavengeGoal); todo > overage { + todo = overage + } + h.pages.scavenge(todo, true) + } return true } // Free the span back into the heap. -// -// large must match the value of large passed to mheap.alloc. This is -// used for accounting. -func (h *mheap) freeSpan(s *mspan, large bool) { +func (h *mheap) freeSpan(s *mspan) { systemstack(func() { mp := getg().m lock(&h.lock) @@ -1292,10 +1350,6 @@ func (h *mheap) freeSpan(s *mspan, large bool) { bytes := s.npages << _PageShift msanfree(base, bytes) } - if large { - // Match accounting done in mheap.alloc. - memstats.heap_objects-- - } if gcBlackenEnabled != 0 { // heap_scan changed. gcController.revise() @@ -1319,14 +1373,14 @@ func (h *mheap) freeSpan(s *mspan, large bool) { func (h *mheap) freeManual(s *mspan, stat *uint64) { s.needzero = 1 lock(&h.lock) - *stat -= uint64(s.npages << _PageShift) - memstats.heap_sys += uint64(s.npages << _PageShift) + mSysStatDec(stat, s.npages*pageSize) + mSysStatInc(&memstats.heap_sys, s.npages*pageSize) h.freeSpanLocked(s, false, true) unlock(&h.lock) } func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) { - switch s.state { + switch s.state.get() { case mSpanManual: if s.allocCount != 0 { throw("mheap.freeSpanLocked - invalid stack free") @@ -1336,140 +1390,28 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) { print("mheap.freeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") throw("mheap.freeSpanLocked - invalid free") } - h.pagesInUse -= uint64(s.npages) + atomic.Xadd64(&h.pagesInUse, -int64(s.npages)) // Clear in-use bit in arena page bitmap. arena, pageIdx, pageMask := pageIndexOf(s.base()) - arena.pageInUse[pageIdx] &^= pageMask + atomic.And8(&arena.pageInUse[pageIdx], ^pageMask) default: throw("mheap.freeSpanLocked - invalid span state") } if acctinuse { - memstats.heap_inuse -= uint64(s.npages << _PageShift) + mSysStatDec(&memstats.heap_inuse, s.npages*pageSize) } if acctidle { - memstats.heap_idle += uint64(s.npages << _PageShift) + mSysStatInc(&memstats.heap_idle, s.npages*pageSize) } - s.state = mSpanFree - - // Coalesce span with neighbors. - h.coalesce(s) - // Insert s into the treap. - h.free.insert(s) -} + // Mark the space as free. + h.pages.free(s.base(), s.npages) -// scavengeSplit takes t.span() and attempts to split off a span containing size -// (in bytes) worth of physical pages from the back. -// -// The split point is only approximately defined by size since the split point -// is aligned to physPageSize and pageSize every time. If physHugePageSize is -// non-zero and the split point would break apart a huge page in the span, then -// the split point is also aligned to physHugePageSize. -// -// If the desired split point ends up at the base of s, or if size is obviously -// much larger than s, then a split is not possible and this method returns nil. -// Otherwise if a split occurred it returns the newly-created span. -func (h *mheap) scavengeSplit(t treapIter, size uintptr) *mspan { - s := t.span() - start, end := s.physPageBounds() - if end <= start || end-start <= size { - // Size covers the whole span. - return nil - } - // The span is bigger than what we need, so compute the base for the new - // span if we decide to split. 
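Publishing a span sets its bit in the owning arena's pageInUse bitmap with an atomic OR, and freeSpanLocked clears it again with an atomic AND of the inverted mask. A simplified, non-atomic sketch of the index/mask bookkeeping behind that (page numbers instead of addresses; the runtime's pageIndexOf also resolves the owning heapArena, and the real updates go through the internal atomic.Or8/And8):

package main

import "fmt"

// pageBitmap is a toy in-use bitmap: one bit per page, eight pages per
// byte, mirroring the pageIdx/pageMask pair pageIndexOf hands back.
type pageBitmap []byte

func pageIndexOf(page uintptr) (idx uintptr, mask byte) {
	return page / 8, 1 << (page % 8)
}

func (b pageBitmap) markInUse(page uintptr) {
	idx, mask := pageIndexOf(page)
	b[idx] |= mask
}

func (b pageBitmap) markFree(page uintptr) {
	idx, mask := pageIndexOf(page)
	b[idx] &^= mask
}

func (b pageBitmap) inUse(page uintptr) bool {
	idx, mask := pageIndexOf(page)
	return b[idx]&mask != 0
}

func main() {
	b := make(pageBitmap, 4) // 32 pages
	b.markInUse(10)
	fmt.Println(b.inUse(10), b.inUse(11)) // true false
	b.markFree(10)
	fmt.Println(b.inUse(10)) // false
}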
- base := end - size - // Round down to the next physical or logical page, whichever is bigger. - base &^= (physPageSize - 1) | (pageSize - 1) - if base <= start { - return nil - } - if physHugePageSize > pageSize && base&^(physHugePageSize-1) >= start { - // We're in danger of breaking apart a huge page, so include the entire - // huge page in the bound by rounding down to the huge page size. - // base should still be aligned to pageSize. - base &^= physHugePageSize - 1 - } - if base == start { - // After all that we rounded base down to s.base(), so no need to split. - return nil - } - if base < start { - print("runtime: base=", base, ", s.npages=", s.npages, ", s.base()=", s.base(), ", size=", size, "\n") - print("runtime: physPageSize=", physPageSize, ", physHugePageSize=", physHugePageSize, "\n") - throw("bad span split base") - } - - // Split s in-place, removing from the back. - n := (*mspan)(h.spanalloc.alloc()) - nbytes := s.base() + s.npages*pageSize - base - h.free.mutate(t, func(s *mspan) { - n.init(base, nbytes/pageSize) - s.npages -= nbytes / pageSize - h.setSpan(n.base()-1, s) - h.setSpan(n.base(), n) - h.setSpan(n.base()+nbytes-1, n) - n.needzero = s.needzero - n.state = s.state - }) - return n -} - -// scavengeLocked scavenges nbytes worth of spans in the free treap by -// starting from the span with the highest base address and working down. -// It then takes those spans and places them in scav. -// -// Returns the amount of memory scavenged in bytes. h must be locked. -func (h *mheap) scavengeLocked(nbytes uintptr) uintptr { - released := uintptr(0) - // Iterate over spans with huge pages first, then spans without. - const mask = treapIterScav | treapIterHuge - for _, match := range []treapIterType{treapIterHuge, 0} { - // Iterate over the treap backwards (from highest address to lowest address) - // scavenging spans until we've reached our quota of nbytes. - for t := h.free.end(mask, match); released < nbytes && t.valid(); { - s := t.span() - start, end := s.physPageBounds() - if start >= end { - // This span doesn't cover at least one physical page, so skip it. - t = t.prev() - continue - } - n := t.prev() - if span := h.scavengeSplit(t, nbytes-released); span != nil { - s = span - } else { - h.free.erase(t) - } - released += s.scavenge() - // Now that s is scavenged, we must eagerly coalesce it - // with its neighbors to prevent having two spans with - // the same scavenged state adjacent to each other. - h.coalesce(s) - t = n - h.free.insert(s) - } - } - return released -} - -// scavengeIfNeededLocked calls scavengeLocked if we're currently above the -// scavenge goal in order to prevent the mutator from out-running the -// the scavenger. -// -// h must be locked. -func (h *mheap) scavengeIfNeededLocked(size uintptr) { - if r := heapRetained(); r+uint64(size) > h.scavengeRetainedGoal { - todo := uint64(size) - // If we're only going to go a little bit over, just request what - // we actually need done. - if overage := r + uint64(size) - h.scavengeRetainedGoal; overage < todo { - todo = overage - } - h.scavengeLocked(uintptr(todo)) - } + // Free the span structure. We no longer have a use for it. + s.state.set(mSpanDead) + h.freeMSpanLocked(s) } // scavengeAll visits each node in the free treap and scavenges the @@ -1477,12 +1419,14 @@ func (h *mheap) scavengeIfNeededLocked(size uintptr) { // unscav and adds it into scav before continuing. func (h *mheap) scavengeAll() { // Disallow malloc or panic while holding the heap lock. 
We do - // this here because this is an non-mallocgc entry-point to + // this here because this is a non-mallocgc entry-point to // the mheap API. gp := getg() gp.m.mallocing++ lock(&h.lock) - released := h.scavengeLocked(^uintptr(0)) + // Reset the scavenger address so we have access to the whole heap. + h.pages.resetScavengeAddr() + released := h.pages.scavenge(^uintptr(0), true) unlock(&h.lock) gp.m.mallocing-- @@ -1511,14 +1455,13 @@ func (span *mspan) init(base uintptr, npages uintptr) { span.allocCount = 0 span.spanclass = 0 span.elemsize = 0 - span.state = mSpanDead - span.scavenged = false span.speciallock.key = 0 span.specials = nil span.needzero = 0 span.freeindex = 0 span.allocBits = nil span.gcmarkBits = nil + span.state.set(mSpanDead) } func (span *mspan) inList() bool { diff --git a/libgo/go/runtime/mkpreempt.go b/libgo/go/runtime/mkpreempt.go new file mode 100644 index 0000000..615ec18 --- /dev/null +++ b/libgo/go/runtime/mkpreempt.go @@ -0,0 +1,522 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +// mkpreempt generates the asyncPreempt functions for each +// architecture. +package main + +import ( + "flag" + "fmt" + "io" + "log" + "os" + "strings" +) + +// Copied from cmd/compile/internal/ssa/gen/*Ops.go + +var regNames386 = []string{ + "AX", + "CX", + "DX", + "BX", + "SP", + "BP", + "SI", + "DI", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", +} + +var regNamesAMD64 = []string{ + "AX", + "CX", + "DX", + "BX", + "SP", + "BP", + "SI", + "DI", + "R8", + "R9", + "R10", + "R11", + "R12", + "R13", + "R14", + "R15", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + "X9", + "X10", + "X11", + "X12", + "X13", + "X14", + "X15", +} + +var out io.Writer + +var arches = map[string]func(){ + "386": gen386, + "amd64": genAMD64, + "arm": genARM, + "arm64": genARM64, + "mips64x": func() { genMIPS(true) }, + "mipsx": func() { genMIPS(false) }, + "ppc64x": genPPC64, + "s390x": genS390X, + "wasm": genWasm, +} +var beLe = map[string]bool{"mips64x": true, "mipsx": true, "ppc64x": true} + +func main() { + flag.Parse() + if flag.NArg() > 0 { + out = os.Stdout + for _, arch := range flag.Args() { + gen, ok := arches[arch] + if !ok { + log.Fatalf("unknown arch %s", arch) + } + header(arch) + gen() + } + return + } + + for arch, gen := range arches { + f, err := os.Create(fmt.Sprintf("preempt_%s.s", arch)) + if err != nil { + log.Fatal(err) + } + out = f + header(arch) + gen() + if err := f.Close(); err != nil { + log.Fatal(err) + } + } +} + +func header(arch string) { + fmt.Fprintf(out, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n") + if beLe[arch] { + base := arch[:len(arch)-1] + fmt.Fprintf(out, "// +build %s %sle\n\n", base, base) + } + fmt.Fprintf(out, "#include \"go_asm.h\"\n") + fmt.Fprintf(out, "#include \"textflag.h\"\n\n") + fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n") +} + +func p(f string, args ...interface{}) { + fmted := fmt.Sprintf(f, args...) + fmt.Fprintf(out, "\t%s\n", strings.Replace(fmted, "\n", "\n\t", -1)) +} + +func label(l string) { + fmt.Fprintf(out, "%s\n", l) +} + +type layout struct { + stack int + regs []regPos + sp string // stack pointer register +} + +type regPos struct { + pos int + + op string + reg string + + // If this register requires special save and restore, these + // give those operations with a %d placeholder for the stack + // offset. 
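The layout/regPos pair is the heart of the generator: add hands each register the next free stack offset, and save/restore later emit one move per slot, with restore walking the slots in reverse. A stripped-down sketch of that bookkeeping and the kind of assembly text it prints (amd64-style mnemonics and register names chosen only as an example):

package main

import "fmt"

// slot records where one register lives in the preemption frame.
type slot struct {
	op, reg string
	pos     int
}

// layout accumulates slots, assigning each the next free offset.
type layout struct {
	stack int
	slots []slot
}

func (l *layout) add(op, reg string, size int) {
	l.slots = append(l.slots, slot{op, reg, l.stack})
	l.stack += size
}

func main() {
	var l layout
	for _, r := range []string{"AX", "CX", "DX"} {
		l.add("MOVQ", r, 8)
	}
	fmt.Printf("ADJSP $%d\n", l.stack)
	for _, s := range l.slots { // save
		fmt.Printf("%s %s, %d(SP)\n", s.op, s.reg, s.pos)
	}
	for i := len(l.slots) - 1; i >= 0; i-- { // restore, in reverse
		s := l.slots[i]
		fmt.Printf("%s %d(SP), %s\n", s.op, s.pos, s.reg)
	}
	fmt.Printf("ADJSP $%d\n", -l.stack)
}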
+ save, restore string +} + +func (l *layout) add(op, reg string, size int) { + l.regs = append(l.regs, regPos{op: op, reg: reg, pos: l.stack}) + l.stack += size +} + +func (l *layout) addSpecial(save, restore string, size int) { + l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack}) + l.stack += size +} + +func (l *layout) save() { + for _, reg := range l.regs { + if reg.save != "" { + p(reg.save, reg.pos) + } else { + p("%s %s, %d(%s)", reg.op, reg.reg, reg.pos, l.sp) + } + } +} + +func (l *layout) restore() { + for i := len(l.regs) - 1; i >= 0; i-- { + reg := l.regs[i] + if reg.restore != "" { + p(reg.restore, reg.pos) + } else { + p("%s %d(%s), %s", reg.op, reg.pos, l.sp, reg.reg) + } + } +} + +func gen386() { + p("PUSHFL") + + // Save general purpose registers. + var l = layout{sp: "SP"} + for _, reg := range regNames386 { + if reg == "SP" || strings.HasPrefix(reg, "X") { + continue + } + l.add("MOVL", reg, 4) + } + + // Save the 387 state. + l.addSpecial( + "FSAVE %d(SP)\nFLDCW runtime·controlWord64(SB)", + "FRSTOR %d(SP)", + 108) + + // Save SSE state only if supported. + lSSE := layout{stack: l.stack, sp: "SP"} + for i := 0; i < 8; i++ { + lSSE.add("MOVUPS", fmt.Sprintf("X%d", i), 16) + } + + p("ADJSP $%d", lSSE.stack) + p("NOP SP") + l.save() + p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse") + lSSE.save() + label("nosse:") + p("CALL ·asyncPreempt2(SB)") + p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse2") + lSSE.restore() + label("nosse2:") + l.restore() + p("ADJSP $%d", -lSSE.stack) + + p("POPFL") + p("RET") +} + +func genAMD64() { + // Assign stack offsets. + var l = layout{sp: "SP"} + for _, reg := range regNamesAMD64 { + if reg == "SP" || reg == "BP" { + continue + } + if strings.HasPrefix(reg, "X") { + l.add("MOVUPS", reg, 16) + } else { + l.add("MOVQ", reg, 8) + } + } + + // TODO: MXCSR register? + + p("PUSHQ BP") + p("MOVQ SP, BP") + p("// Save flags before clobbering them") + p("PUSHFQ") + p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP") + p("ADJSP $%d", l.stack) + p("// But vet doesn't know ADJSP, so suppress vet stack checking") + p("NOP SP") + l.save() + p("CALL ·asyncPreempt2(SB)") + l.restore() + p("ADJSP $%d", -l.stack) + p("POPFQ") + p("POPQ BP") + p("RET") +} + +func genARM() { + // Add integer registers R0-R12. + // R13 (SP), R14 (LR), R15 (PC) are special and not saved here. + var l = layout{sp: "R13", stack: 4} // add LR slot + for i := 0; i <= 12; i++ { + reg := fmt.Sprintf("R%d", i) + if i == 10 { + continue // R10 is g register, no need to save/restore + } + l.add("MOVW", reg, 4) + } + // Add flag register. + l.addSpecial( + "MOVW CPSR, R0\nMOVW R0, %d(R13)", + "MOVW %d(R13), R0\nMOVW R0, CPSR", + 4) + + // Add floating point registers F0-F15 and flag register. + var lfp = layout{stack: l.stack, sp: "R13"} + lfp.addSpecial( + "MOVW FPCR, R0\nMOVW R0, %d(R13)", + "MOVW %d(R13), R0\nMOVW R0, FPCR", + 4) + for i := 0; i <= 15; i++ { + reg := fmt.Sprintf("F%d", i) + lfp.add("MOVD", reg, 8) + } + + p("MOVW.W R14, -%d(R13)", lfp.stack) // allocate frame, save LR + l.save() + p("MOVB ·goarm(SB), R0\nCMP $6, R0\nBLT nofp") // test goarm, and skip FP registers if goarm=5. + lfp.save() + label("nofp:") + p("CALL ·asyncPreempt2(SB)") + p("MOVB ·goarm(SB), R0\nCMP $6, R0\nBLT nofp2") // test goarm, and skip FP registers if goarm=5. 
+ lfp.restore() + label("nofp2:") + l.restore() + + p("MOVW %d(R13), R14", lfp.stack) // sigctxt.pushCall pushes LR on stack, restore it + p("MOVW.P %d(R13), R15", lfp.stack+4) // load PC, pop frame (including the space pushed by sigctxt.pushCall) + p("UNDEF") // shouldn't get here +} + +func genARM64() { + // Add integer registers R0-R26 + // R27 (REGTMP), R28 (g), R29 (FP), R30 (LR), R31 (SP) are special + // and not saved here. + var l = layout{sp: "RSP", stack: 8} // add slot to save PC of interrupted instruction + for i := 0; i <= 26; i++ { + if i == 18 { + continue // R18 is not used, skip + } + reg := fmt.Sprintf("R%d", i) + l.add("MOVD", reg, 8) + } + // Add flag registers. + l.addSpecial( + "MOVD NZCV, R0\nMOVD R0, %d(RSP)", + "MOVD %d(RSP), R0\nMOVD R0, NZCV", + 8) + l.addSpecial( + "MOVD FPSR, R0\nMOVD R0, %d(RSP)", + "MOVD %d(RSP), R0\nMOVD R0, FPSR", + 8) + // TODO: FPCR? I don't think we'll change it, so no need to save. + // Add floating point registers F0-F31. + for i := 0; i <= 31; i++ { + reg := fmt.Sprintf("F%d", i) + l.add("FMOVD", reg, 8) + } + if l.stack%16 != 0 { + l.stack += 8 // SP needs 16-byte alignment + } + + // allocate frame, save PC of interrupted instruction (in LR) + p("MOVD R30, %d(RSP)", -l.stack) + p("SUB $%d, RSP", l.stack) + p("#ifdef GOOS_linux") + p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux) + p("SUB $8, RSP, R29") // set up new frame pointer + p("#endif") + // On darwin, save the LR again after decrementing SP. We run the + // signal handler on the G stack (as it doesn't support SA_ONSTACK), + // so any writes below SP may be clobbered. + p("#ifdef GOOS_darwin") + p("MOVD R30, (RSP)") + p("#endif") + + l.save() + p("CALL ·asyncPreempt2(SB)") + l.restore() + + p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it + p("#ifdef GOOS_linux") + p("MOVD -8(RSP), R29") // restore frame pointer + p("#endif") + p("MOVD (RSP), R27") // load PC to REGTMP + p("ADD $%d, RSP", l.stack+16) // pop frame (including the space pushed by sigctxt.pushCall) + p("JMP (R27)") +} + +func genMIPS(_64bit bool) { + mov := "MOVW" + movf := "MOVF" + add := "ADD" + sub := "SUB" + r28 := "R28" + regsize := 4 + if _64bit { + mov = "MOVV" + movf = "MOVD" + add = "ADDV" + sub = "SUBV" + r28 = "RSB" + regsize = 8 + } + + // Add integer registers R1-R22, R24-R25, R28 + // R0 (zero), R23 (REGTMP), R29 (SP), R30 (g), R31 (LR) are special, + // and not saved here. R26 and R27 are reserved by kernel and not used. + var l = layout{sp: "R29", stack: regsize} // add slot to save PC of interrupted instruction (in LR) + for i := 1; i <= 25; i++ { + if i == 23 { + continue // R23 is REGTMP + } + reg := fmt.Sprintf("R%d", i) + l.add(mov, reg, regsize) + } + l.add(mov, r28, regsize) + l.addSpecial( + mov+" HI, R1\n"+mov+" R1, %d(R29)", + mov+" %d(R29), R1\n"+mov+" R1, HI", + regsize) + l.addSpecial( + mov+" LO, R1\n"+mov+" R1, %d(R29)", + mov+" %d(R29), R1\n"+mov+" R1, LO", + regsize) + // Add floating point control/status register FCR31 (FCR0-FCR30 are irrelevant) + l.addSpecial( + mov+" FCR31, R1\n"+mov+" R1, %d(R29)", + mov+" %d(R29), R1\n"+mov+" R1, FCR31", + regsize) + // Add floating point registers F0-F31. 
+ for i := 0; i <= 31; i++ { + reg := fmt.Sprintf("F%d", i) + l.add(movf, reg, regsize) + } + + // allocate frame, save PC of interrupted instruction (in LR) + p(mov+" R31, -%d(R29)", l.stack) + p(sub+" $%d, R29", l.stack) + + l.save() + p("CALL ·asyncPreempt2(SB)") + l.restore() + + p(mov+" %d(R29), R31", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it + p(mov + " (R29), R23") // load PC to REGTMP + p(add+" $%d, R29", l.stack+regsize) // pop frame (including the space pushed by sigctxt.pushCall) + p("JMP (R23)") +} + +func genPPC64() { + // Add integer registers R3-R29 + // R0 (zero), R1 (SP), R30 (g) are special and not saved here. + // R2 (TOC pointer in PIC mode), R12 (function entry address in PIC mode) have been saved in sigctxt.pushCall. + // R31 (REGTMP) will be saved manually. + var l = layout{sp: "R1", stack: 32 + 8} // MinFrameSize on PPC64, plus one word for saving R31 + for i := 3; i <= 29; i++ { + if i == 12 || i == 13 { + // R12 has been saved in sigctxt.pushCall. + // R13 is TLS pointer, not used by Go code. we must NOT + // restore it, otherwise if we parked and resumed on a + // different thread we'll mess up TLS addresses. + continue + } + reg := fmt.Sprintf("R%d", i) + l.add("MOVD", reg, 8) + } + l.addSpecial( + "MOVW CR, R31\nMOVW R31, %d(R1)", + "MOVW %d(R1), R31\nMOVFL R31, $0xff", // this is MOVW R31, CR + 8) // CR is 4-byte wide, but just keep the alignment + l.addSpecial( + "MOVD XER, R31\nMOVD R31, %d(R1)", + "MOVD %d(R1), R31\nMOVD R31, XER", + 8) + // Add floating point registers F0-F31. + for i := 0; i <= 31; i++ { + reg := fmt.Sprintf("F%d", i) + l.add("FMOVD", reg, 8) + } + // Add floating point control/status register FPSCR. + l.addSpecial( + "MOVFL FPSCR, F0\nFMOVD F0, %d(R1)", + "FMOVD %d(R1), F0\nMOVFL F0, FPSCR", + 8) + + p("MOVD R31, -%d(R1)", l.stack-32) // save R31 first, we'll use R31 for saving LR + p("MOVD LR, R31") + p("MOVDU R31, -%d(R1)", l.stack) // allocate frame, save PC of interrupted instruction (in LR) + + l.save() + p("CALL ·asyncPreempt2(SB)") + l.restore() + + p("MOVD %d(R1), R31", l.stack) // sigctxt.pushCall has pushed LR, R2, R12 (at interrupt) on stack, restore them + p("MOVD R31, LR") + p("MOVD %d(R1), R2", l.stack+8) + p("MOVD %d(R1), R12", l.stack+16) + p("MOVD (R1), R31") // load PC to CTR + p("MOVD R31, CTR") + p("MOVD 32(R1), R31") // restore R31 + p("ADD $%d, R1", l.stack+32) // pop frame (including the space pushed by sigctxt.pushCall) + p("JMP (CTR)") +} + +func genS390X() { + // Add integer registers R0-R12 + // R13 (g), R14 (LR), R15 (SP) are special, and not saved here. + // Saving R10 (REGTMP) is not necessary, but it is saved anyway. + var l = layout{sp: "R15", stack: 16} // add slot to save PC of interrupted instruction and flags + l.addSpecial( + "STMG R0, R12, %d(R15)", + "LMG %d(R15), R0, R12", + 13*8) + // Add floating point registers F0-F31. 
+ for i := 0; i <= 15; i++ { + reg := fmt.Sprintf("F%d", i) + l.add("FMOVD", reg, 8) + } + + // allocate frame, save PC of interrupted instruction (in LR) and flags (condition code) + p("IPM R10") // save flags upfront, as ADD will clobber flags + p("MOVD R14, -%d(R15)", l.stack) + p("ADD $-%d, R15", l.stack) + p("MOVW R10, 8(R15)") // save flags + + l.save() + p("CALL ·asyncPreempt2(SB)") + l.restore() + + p("MOVD %d(R15), R14", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it + p("ADD $%d, R15", l.stack+8) // pop frame (including the space pushed by sigctxt.pushCall) + p("MOVWZ -%d(R15), R10", l.stack) // load flags to REGTMP + p("TMLH R10, $(3<<12)") // restore flags + p("MOVD -%d(R15), R10", l.stack+8) // load PC to REGTMP + p("JMP (R10)") +} + +func genWasm() { + p("// No async preemption on wasm") + p("UNDEF") +} + +func notImplemented() { + p("// Not implemented yet") + p("JMP ·abort(SB)") +} diff --git a/libgo/go/runtime/mpagealloc.go b/libgo/go/runtime/mpagealloc.go new file mode 100644 index 0000000..572e6a9 --- /dev/null +++ b/libgo/go/runtime/mpagealloc.go @@ -0,0 +1,938 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Page allocator. +// +// The page allocator manages mapped pages (defined by pageSize, NOT +// physPageSize) for allocation and re-use. It is embedded into mheap. +// +// Pages are managed using a bitmap that is sharded into chunks. +// In the bitmap, 1 means in-use, and 0 means free. The bitmap spans the +// process's address space. Chunks are managed in a sparse-array-style structure +// similar to mheap.arenas, since the bitmap may be large on some systems. +// +// The bitmap is efficiently searched by using a radix tree in combination +// with fast bit-wise intrinsics. Allocation is performed using an address-ordered +// first-fit approach. +// +// Each entry in the radix tree is a summary that describes three properties of +// a particular region of the address space: the number of contiguous free pages +// at the start and end of the region it represents, and the maximum number of +// contiguous free pages found anywhere in that region. +// +// Each level of the radix tree is stored as one contiguous array, which represents +// a different granularity of subdivision of the processes' address space. Thus, this +// radix tree is actually implicit in these large arrays, as opposed to having explicit +// dynamically-allocated pointer-based node structures. Naturally, these arrays may be +// quite large for system with large address spaces, so in these cases they are mapped +// into memory as needed. The leaf summaries of the tree correspond to a bitmap chunk. +// +// The root level (referred to as L0 and index 0 in pageAlloc.summary) has each +// summary represent the largest section of address space (16 GiB on 64-bit systems), +// with each subsequent level representing successively smaller subsections until we +// reach the finest granularity at the leaves, a chunk. +// +// More specifically, each summary in each level (except for leaf summaries) +// represents some number of entries in the following level. 
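To make the fan-out concrete, the sketch below reconstructs the level geometry those constants imply for a 48-bit address space with 8 KiB pages; summaryLevels and pageShift are per-architecture values defined outside this excerpt, so the exact numbers are assumptions:

package main

import "fmt"

const (
	heapAddrBits        = 48 // assumed 64-bit configuration
	logPallocChunkPages = 9
	pageShift           = 13 // assumed 8 KiB pages
	logPallocChunkBytes = logPallocChunkPages + pageShift // 4 MiB chunks
	summaryLevels       = 5 // assumed value for 64-bit platforms
	summaryLevelBits    = 3
	summaryL0Bits       = heapAddrBits - logPallocChunkBytes -
		(summaryLevels-1)*summaryLevelBits
)

func main() {
	bits := summaryL0Bits
	for l := 0; l < summaryLevels; l++ {
		entries := 1 << bits
		region := uint64(1) << (heapAddrBits - bits)
		fmt.Printf("level %d: %8d entries, %5d MiB of address space each\n",
			l, entries, region>>20)
		bits += summaryLevelBits
	}
}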
For example, each +// summary in the root level may represent a 16 GiB region of address space, +// and in the next level there could be 8 corresponding entries which represent 2 +// GiB subsections of that 16 GiB region, each of which could correspond to 8 +// entries in the next level which each represent 256 MiB regions, and so on. +// +// Thus, this design only scales to heaps so large, but can always be extended to +// larger heaps by simply adding levels to the radix tree, which mostly costs +// additional virtual address space. The choice of managing large arrays also means +// that a large amount of virtual address space may be reserved by the runtime. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +const ( + // The size of a bitmap chunk, i.e. the amount of bits (that is, pages) to consider + // in the bitmap at once. + pallocChunkPages = 1 << logPallocChunkPages + pallocChunkBytes = pallocChunkPages * pageSize + logPallocChunkPages = 9 + logPallocChunkBytes = logPallocChunkPages + pageShift + + // The number of radix bits for each level. + // + // The value of 3 is chosen such that the block of summaries we need to scan at + // each level fits in 64 bytes (2^3 summaries * 8 bytes per summary), which is + // close to the L1 cache line width on many systems. Also, a value of 3 fits 4 tree + // levels perfectly into the 21-bit pallocBits summary field at the root level. + // + // The following equation explains how each of the constants relate: + // summaryL0Bits + (summaryLevels-1)*summaryLevelBits + logPallocChunkBytes = heapAddrBits + // + // summaryLevels is an architecture-dependent value defined in mpagealloc_*.go. + summaryLevelBits = 3 + summaryL0Bits = heapAddrBits - logPallocChunkBytes - (summaryLevels-1)*summaryLevelBits + + // pallocChunksL2Bits is the number of bits of the chunk index number + // covered by the second level of the chunks map. + // + // See (*pageAlloc).chunks for more details. Update the documentation + // there should this change. + pallocChunksL2Bits = heapAddrBits - logPallocChunkBytes - pallocChunksL1Bits + pallocChunksL1Shift = pallocChunksL2Bits + + // Maximum searchAddr value, which indicates that the heap has no free space. + // + // We subtract arenaBaseOffset because we want this to represent the maximum + // value in the shifted address space, but searchAddr is stored as a regular + // memory address. See arenaBaseOffset for details. + maxSearchAddr = ^uintptr(0) - arenaBaseOffset + + // Minimum scavAddr value, which indicates that the scavenger is done. + // + // minScavAddr + arenaBaseOffset == 0 + minScavAddr = (^arenaBaseOffset + 1) & uintptrMask +) + +// Global chunk index. +// +// Represents an index into the leaf level of the radix tree. +// Similar to arenaIndex, except instead of arenas, it divides the address +// space into chunks. +type chunkIdx uint + +// chunkIndex returns the global index of the palloc chunk containing the +// pointer p. +func chunkIndex(p uintptr) chunkIdx { + return chunkIdx((p + arenaBaseOffset) / pallocChunkBytes) +} + +// chunkIndex returns the base address of the palloc chunk at index ci. +func chunkBase(ci chunkIdx) uintptr { + return uintptr(ci)*pallocChunkBytes - arenaBaseOffset +} + +// chunkPageIndex computes the index of the page that contains p, +// relative to the chunk which contains p. +func chunkPageIndex(p uintptr) uint { + return uint(p % pallocChunkBytes / pageSize) +} + +// l1 returns the index into the first level of (*pageAlloc).chunks. 
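A quick illustration of the chunk arithmetic above, using uint64 instead of uintptr and with arenaBaseOffset assumed to be zero (it is non-zero only on some platforms); the pageSize value is likewise an assumption:

package main

import "fmt"

const (
	pageSize         = 8192
	pallocChunkPages = 512
	pallocChunkBytes = pallocChunkPages * pageSize // 4 MiB
)

func chunkIndex(p uint64) uint64     { return p / pallocChunkBytes }
func chunkBase(ci uint64) uint64     { return ci * pallocChunkBytes }
func chunkPageIndex(p uint64) uint64 { return p % pallocChunkBytes / pageSize }

func main() {
	p := uint64(0xc000124000) // a sample page-aligned heap address
	ci := chunkIndex(p)
	fmt.Printf("addr %#x -> chunk %d (base %#x), page %d within the chunk\n",
		p, ci, chunkBase(ci), chunkPageIndex(p))
}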
+func (i chunkIdx) l1() uint { + if pallocChunksL1Bits == 0 { + // Let the compiler optimize this away if there's no + // L1 map. + return 0 + } else { + return uint(i) >> pallocChunksL1Shift + } +} + +// l2 returns the index into the second level of (*pageAlloc).chunks. +func (i chunkIdx) l2() uint { + if pallocChunksL1Bits == 0 { + return uint(i) + } else { + return uint(i) & (1<<pallocChunksL2Bits - 1) + } +} + +// addrsToSummaryRange converts base and limit pointers into a range +// of entries for the given summary level. +// +// The returned range is inclusive on the lower bound and exclusive on +// the upper bound. +func addrsToSummaryRange(level int, base, limit uintptr) (lo int, hi int) { + // This is slightly more nuanced than just a shift for the exclusive + // upper-bound. Note that the exclusive upper bound may be within a + // summary at this level, meaning if we just do the obvious computation + // hi will end up being an inclusive upper bound. Unfortunately, just + // adding 1 to that is too broad since we might be on the very edge of + // of a summary's max page count boundary for this level + // (1 << levelLogPages[level]). So, make limit an inclusive upper bound + // then shift, then add 1, so we get an exclusive upper bound at the end. + lo = int((base + arenaBaseOffset) >> levelShift[level]) + hi = int(((limit-1)+arenaBaseOffset)>>levelShift[level]) + 1 + return +} + +// blockAlignSummaryRange aligns indices into the given level to that +// level's block width (1 << levelBits[level]). It assumes lo is inclusive +// and hi is exclusive, and so aligns them down and up respectively. +func blockAlignSummaryRange(level int, lo, hi int) (int, int) { + e := uintptr(1) << levelBits[level] + return int(alignDown(uintptr(lo), e)), int(alignUp(uintptr(hi), e)) +} + +type pageAlloc struct { + // Radix tree of summaries. + // + // Each slice's cap represents the whole memory reservation. + // Each slice's len reflects the allocator's maximum known + // mapped heap address for that level. + // + // The backing store of each summary level is reserved in init + // and may or may not be committed in grow (small address spaces + // may commit all the memory in init). + // + // The purpose of keeping len <= cap is to enforce bounds checks + // on the top end of the slice so that instead of an unknown + // runtime segmentation fault, we get a much friendlier out-of-bounds + // error. + // + // To iterate over a summary level, use inUse to determine which ranges + // are currently available. Otherwise one might try to access + // memory which is only Reserved which may result in a hard fault. + // + // We may still get segmentation faults < len since some of that + // memory may not be committed yet. + summary [summaryLevels][]pallocSum + + // chunks is a slice of bitmap chunks. + // + // The total size of chunks is quite large on most 64-bit platforms + // (O(GiB) or more) if flattened, so rather than making one large mapping + // (which has problems on some platforms, even when PROT_NONE) we use a + // two-level sparse array approach similar to the arena index in mheap. + // + // To find the chunk containing a memory address `a`, do: + // chunkOf(chunkIndex(a)) + // + // Below is a table describing the configuration for chunks for various + // heapAddrBits supported by the runtime. 
+ // + // heapAddrBits | L1 Bits | L2 Bits | L2 Entry Size + // ------------------------------------------------ + // 32 | 0 | 10 | 128 KiB + // 33 (iOS) | 0 | 11 | 256 KiB + // 48 | 13 | 13 | 1 MiB + // + // There's no reason to use the L1 part of chunks on 32-bit, the + // address space is small so the L2 is small. For platforms with a + // 48-bit address space, we pick the L1 such that the L2 is 1 MiB + // in size, which is a good balance between low granularity without + // making the impact on BSS too high (note the L1 is stored directly + // in pageAlloc). + // + // To iterate over the bitmap, use inUse to determine which ranges + // are currently available. Otherwise one might iterate over unused + // ranges. + // + // TODO(mknyszek): Consider changing the definition of the bitmap + // such that 1 means free and 0 means in-use so that summaries and + // the bitmaps align better on zero-values. + chunks [1 << pallocChunksL1Bits]*[1 << pallocChunksL2Bits]pallocData + + // The address to start an allocation search with. + // + // When added with arenaBaseOffset, we guarantee that + // all valid heap addresses (when also added with + // arenaBaseOffset) below this value are allocated and + // not worth searching. + // + // Note that adding in arenaBaseOffset transforms addresses + // to a new address space with a linear view of the full address + // space on architectures with segmented address spaces. + searchAddr uintptr + + // The address to start a scavenge candidate search with. + scavAddr uintptr + + // start and end represent the chunk indices + // which pageAlloc knows about. It assumes + // chunks in the range [start, end) are + // currently ready to use. + start, end chunkIdx + + // inUse is a slice of ranges of address space which are + // known by the page allocator to be currently in-use (passed + // to grow). + // + // This field is currently unused on 32-bit architectures but + // is harmless to track. We care much more about having a + // contiguous heap in these cases and take additional measures + // to ensure that, so in nearly all cases this should have just + // 1 element. + // + // All access is protected by the mheapLock. + inUse addrRanges + + // mheap_.lock. This level of indirection makes it possible + // to test pageAlloc indepedently of the runtime allocator. + mheapLock *mutex + + // sysStat is the runtime memstat to update when new system + // memory is committed by the pageAlloc for allocation metadata. + sysStat *uint64 + + // Whether or not this struct is being used in tests. + test bool +} + +func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) { + if levelLogPages[0] > logMaxPackedValue { + // We can't represent 1<<levelLogPages[0] pages, the maximum number + // of pages we need to represent at the root level, in a summary, which + // is a big problem. Throw. + print("runtime: root level max pages = ", 1<<levelLogPages[0], "\n") + print("runtime: summary max pages = ", maxPackedValue, "\n") + throw("root level max pages doesn't fit in summary") + } + s.sysStat = sysStat + + // Initialize s.inUse. + s.inUse.init(sysStat) + + // System-dependent initialization. + s.sysInit() + + // Start with the searchAddr in a state indicating there's no free memory. + s.searchAddr = maxSearchAddr + + // Start with the scavAddr in a state indicating there's nothing more to do. + s.scavAddr = minScavAddr + + // Set the mheapLock. 
+ s.mheapLock = mheapLock +} + +// compareSearchAddrTo compares an address against s.searchAddr in a linearized +// view of the address space on systems with discontinuous process address spaces. +// This linearized view is the same one generated by chunkIndex and arenaIndex, +// done by adding arenaBaseOffset. +// +// On systems without a discontinuous address space, it's just a normal comparison. +// +// Returns < 0 if addr is less than s.searchAddr in the linearized address space. +// Returns > 0 if addr is greater than s.searchAddr in the linearized address space. +// Returns 0 if addr and s.searchAddr are equal. +func (s *pageAlloc) compareSearchAddrTo(addr uintptr) int { + // Compare with arenaBaseOffset added because it gives us a linear, contiguous view + // of the heap on architectures with signed address spaces. + lAddr := addr + arenaBaseOffset + lSearchAddr := s.searchAddr + arenaBaseOffset + if lAddr < lSearchAddr { + return -1 + } else if lAddr > lSearchAddr { + return 1 + } + return 0 +} + +// chunkOf returns the chunk at the given chunk index. +func (s *pageAlloc) chunkOf(ci chunkIdx) *pallocData { + return &s.chunks[ci.l1()][ci.l2()] +} + +// grow sets up the metadata for the address range [base, base+size). +// It may allocate metadata, in which case *s.sysStat will be updated. +// +// s.mheapLock must be held. +func (s *pageAlloc) grow(base, size uintptr) { + // Round up to chunks, since we can't deal with increments smaller + // than chunks. Also, sysGrow expects aligned values. + limit := alignUp(base+size, pallocChunkBytes) + base = alignDown(base, pallocChunkBytes) + + // Grow the summary levels in a system-dependent manner. + // We just update a bunch of additional metadata here. + s.sysGrow(base, limit) + + // Update s.start and s.end. + // If no growth happened yet, start == 0. This is generally + // safe since the zero page is unmapped. + firstGrowth := s.start == 0 + start, end := chunkIndex(base), chunkIndex(limit) + if firstGrowth || start < s.start { + s.start = start + } + if end > s.end { + s.end = end + } + // Note that [base, limit) will never overlap with any existing + // range inUse because grow only ever adds never-used memory + // regions to the page allocator. + s.inUse.add(addrRange{base, limit}) + + // A grow operation is a lot like a free operation, so if our + // chunk ends up below the (linearized) s.searchAddr, update + // s.searchAddr to the new address, just like in free. + if s.compareSearchAddrTo(base) < 0 { + s.searchAddr = base + } + + // Add entries into chunks, which is sparse, if needed. Then, + // initialize the bitmap. + // + // Newly-grown memory is always considered scavenged. + // Set all the bits in the scavenged bitmaps high. + for c := chunkIndex(base); c < chunkIndex(limit); c++ { + if s.chunks[c.l1()] == nil { + // Create the necessary l2 entry. + // + // Store it atomically to avoid races with readers which + // don't acquire the heap lock. + r := sysAlloc(unsafe.Sizeof(*s.chunks[0]), s.sysStat) + atomic.StorepNoWB(unsafe.Pointer(&s.chunks[c.l1()]), r) + } + s.chunkOf(c).scavenged.setRange(0, pallocChunkPages) + } + + // Update summaries accordingly. The grow acts like a free, so + // we need to ensure this newly-free memory is visible in the + // summaries. + s.update(base, size/pageSize, true, false) +} + +// update updates heap metadata. It must be called each time the bitmap +// is updated. 
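compareSearchAddrTo works because adding arenaBaseOffset turns a segmented (effectively signed) address space into one linear range, so plain unsigned comparison gives true address order. A sketch with an assumed 48-bit signed layout; the offset and sample addresses are illustrations, not the constants of any particular port:

package main

import "fmt"

// On platforms whose address space is effectively signed (say 48 bits
// spanning [-2^47, 2^47)), raw unsigned comparison orders the upper
// ("negative") half after the lower half. Adding a constant offset
// first, as compareSearchAddrTo does with arenaBaseOffset, restores
// address order.
const arenaBaseOffset uint64 = 1 << 47 // assumed offset for illustration

func compareLinear(a, b uint64) int {
	la, lb := a+arenaBaseOffset, b+arenaBaseOffset
	switch {
	case la < lb:
		return -1
	case la > lb:
		return 1
	}
	return 0
}

func main() {
	lowHalf := uint64(0x0000_1000_0000_0000)  // a lower-half address
	highHalf := uint64(0xffff_9000_0000_0000) // an upper-half ("negative") address
	fmt.Println(highHalf < lowHalf)                   // false: raw order puts it last
	fmt.Println(compareLinear(highHalf, lowHalf) < 0) // true: linearized order puts it first
}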
+// +// If contig is true, update does some optimizations assuming that there was +// a contiguous allocation or free between addr and addr+npages. alloc indicates +// whether the operation performed was an allocation or a free. +// +// s.mheapLock must be held. +func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) { + // base, limit, start, and end are inclusive. + limit := base + npages*pageSize - 1 + sc, ec := chunkIndex(base), chunkIndex(limit) + + // Handle updating the lowest level first. + if sc == ec { + // Fast path: the allocation doesn't span more than one chunk, + // so update this one and if the summary didn't change, return. + x := s.summary[len(s.summary)-1][sc] + y := s.chunkOf(sc).summarize() + if x == y { + return + } + s.summary[len(s.summary)-1][sc] = y + } else if contig { + // Slow contiguous path: the allocation spans more than one chunk + // and at least one summary is guaranteed to change. + summary := s.summary[len(s.summary)-1] + + // Update the summary for chunk sc. + summary[sc] = s.chunkOf(sc).summarize() + + // Update the summaries for chunks in between, which are + // either totally allocated or freed. + whole := s.summary[len(s.summary)-1][sc+1 : ec] + if alloc { + // Should optimize into a memclr. + for i := range whole { + whole[i] = 0 + } + } else { + for i := range whole { + whole[i] = freeChunkSum + } + } + + // Update the summary for chunk ec. + summary[ec] = s.chunkOf(ec).summarize() + } else { + // Slow general path: the allocation spans more than one chunk + // and at least one summary is guaranteed to change. + // + // We can't assume a contiguous allocation happened, so walk over + // every chunk in the range and manually recompute the summary. + summary := s.summary[len(s.summary)-1] + for c := sc; c <= ec; c++ { + summary[c] = s.chunkOf(c).summarize() + } + } + + // Walk up the radix tree and update the summaries appropriately. + changed := true + for l := len(s.summary) - 2; l >= 0 && changed; l-- { + // Update summaries at level l from summaries at level l+1. + changed = false + + // "Constants" for the previous level which we + // need to compute the summary from that level. + logEntriesPerBlock := levelBits[l+1] + logMaxPages := levelLogPages[l+1] + + // lo and hi describe all the parts of the level we need to look at. + lo, hi := addrsToSummaryRange(l, base, limit+1) + + // Iterate over each block, updating the corresponding summary in the less-granular level. + for i := lo; i < hi; i++ { + children := s.summary[l+1][i<<logEntriesPerBlock : (i+1)<<logEntriesPerBlock] + sum := mergeSummaries(children, logMaxPages) + old := s.summary[l][i] + if old != sum { + changed = true + s.summary[l][i] = sum + } + } + } +} + +// allocRange marks the range of memory [base, base+npages*pageSize) as +// allocated. It also updates the summaries to reflect the newly-updated +// bitmap. +// +// Returns the amount of scavenged memory in bytes present in the +// allocated range. +// +// s.mheapLock must be held. +func (s *pageAlloc) allocRange(base, npages uintptr) uintptr { + limit := base + npages*pageSize - 1 + sc, ec := chunkIndex(base), chunkIndex(limit) + si, ei := chunkPageIndex(base), chunkPageIndex(limit) + + scav := uint(0) + if sc == ec { + // The range doesn't cross any chunk boundaries. + chunk := s.chunkOf(sc) + scav += chunk.scavenged.popcntRange(si, ei+1-si) + chunk.allocRange(si, ei+1-si) + } else { + // The range crosses at least one chunk boundary. 
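When an update spans several chunks, the parent summaries are recomputed from their children as the walk moves up the tree; the interesting part is how the (start, max, end) triple for a parent falls out of the children's triples, in particular free runs that straddle a child boundary. A simplified, unpacked version of that merge (the runtime keeps the triple packed in a pallocSum):

package main

import "fmt"

// sum summarizes one child region of maxPages pages: the free run at
// its start, the largest free run anywhere inside it, and the free run
// at its end.
type sum struct{ start, max, end uint }

// merge folds the summaries of consecutive equal-sized children into
// their parent's summary.
func merge(children []sum, maxPages uint) sum {
	out := children[0]
	for i := 1; i < len(children); i++ {
		c := children[i]
		// The parent's leading free run keeps growing only while every
		// child seen so far was completely free.
		if out.start == uint(i)*maxPages {
			out.start += c.start
		}
		// The largest run is either inside some child or straddles a
		// boundary: the previous trailing run plus this leading run.
		if out.end+c.start > out.max {
			out.max = out.end + c.start
		}
		if c.max > out.max {
			out.max = c.max
		}
		// The trailing free run extends through this child only if the
		// child is completely free.
		if c.end == maxPages {
			out.end += maxPages
		} else {
			out.end = c.end
		}
	}
	return out
}

func main() {
	children := []sum{
		{start: 0, max: 100, end: 80},   // 80 free pages at its end
		{start: 150, max: 200, end: 40}, // 150 free pages at its start
	}
	// The 80+150 page run straddling the boundary becomes the parent's max.
	fmt.Printf("%+v\n", merge(children, 512)) // {start:0 max:230 end:40}
}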
+ chunk := s.chunkOf(sc) + scav += chunk.scavenged.popcntRange(si, pallocChunkPages-si) + chunk.allocRange(si, pallocChunkPages-si) + for c := sc + 1; c < ec; c++ { + chunk := s.chunkOf(c) + scav += chunk.scavenged.popcntRange(0, pallocChunkPages) + chunk.allocAll() + } + chunk = s.chunkOf(ec) + scav += chunk.scavenged.popcntRange(0, ei+1) + chunk.allocRange(0, ei+1) + } + s.update(base, npages, true, true) + return uintptr(scav) * pageSize +} + +// find searches for the first (address-ordered) contiguous free region of +// npages in size and returns a base address for that region. +// +// It uses s.searchAddr to prune its search and assumes that no palloc chunks +// below chunkIndex(s.searchAddr) contain any free memory at all. +// +// find also computes and returns a candidate s.searchAddr, which may or +// may not prune more of the address space than s.searchAddr already does. +// +// find represents the slow path and the full radix tree search. +// +// Returns a base address of 0 on failure, in which case the candidate +// searchAddr returned is invalid and must be ignored. +// +// s.mheapLock must be held. +func (s *pageAlloc) find(npages uintptr) (uintptr, uintptr) { + // Search algorithm. + // + // This algorithm walks each level l of the radix tree from the root level + // to the leaf level. It iterates over at most 1 << levelBits[l] of entries + // in a given level in the radix tree, and uses the summary information to + // find either: + // 1) That a given subtree contains a large enough contiguous region, at + // which point it continues iterating on the next level, or + // 2) That there are enough contiguous boundary-crossing bits to satisfy + // the allocation, at which point it knows exactly where to start + // allocating from. + // + // i tracks the index into the current level l's structure for the + // contiguous 1 << levelBits[l] entries we're actually interested in. + // + // NOTE: Technically this search could allocate a region which crosses + // the arenaBaseOffset boundary, which when arenaBaseOffset != 0, is + // a discontinuity. However, the only way this could happen is if the + // page at the zero address is mapped, and this is impossible on + // every system we support where arenaBaseOffset != 0. So, the + // discontinuity is already encoded in the fact that the OS will never + // map the zero page for us, and this function doesn't try to handle + // this case in any way. + + // i is the beginning of the block of entries we're searching at the + // current level. + i := 0 + + // firstFree is the region of address space that we are certain to + // find the first free page in the heap. base and bound are the inclusive + // bounds of this window, and both are addresses in the linearized, contiguous + // view of the address space (with arenaBaseOffset pre-added). At each level, + // this window is narrowed as we find the memory region containing the + // first free page of memory. To begin with, the range reflects the + // full process address space. + // + // firstFree is updated by calling foundFree each time free space in the + // heap is discovered. + // + // At the end of the search, base-arenaBaseOffset is the best new + // searchAddr we could deduce in this search. + firstFree := struct { + base, bound uintptr + }{ + base: 0, + bound: (1<<heapAddrBits - 1), + } + // foundFree takes the given address range [addr, addr+size) and + // updates firstFree if it is a narrower range. 
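allocRange reports how much of the newly allocated range was scavenged by popcounting the scavenged bitmap over the affected pages; that figure is what allocSpan later uses to sysUsed the memory and adjust heap_released. A simplified bit-at-a-time sketch of such a range popcount (the runtime's popcntRange works a whole word at a time):

package main

import "fmt"

// bitmap is a little-endian bitmap over uint64 words, in the spirit of
// pallocBits: one bit per page within a chunk.
type bitmap []uint64

func (b bitmap) set(i uint) { b[i/64] |= 1 << (i % 64) }

// popcntRange counts the set bits in [i, i+n).
func (b bitmap) popcntRange(i, n uint) uint {
	count := uint(0)
	for j := i; j < i+n; j++ {
		if b[j/64]&(1<<(j%64)) != 0 {
			count++
		}
	}
	return count
}

func main() {
	b := make(bitmap, 8) // 512 bits: one palloc chunk's worth of pages
	for _, page := range []uint{3, 4, 5, 200} {
		b.set(page) // pretend these pages are marked scavenged
	}
	fmt.Println(b.popcntRange(0, 512)) // 4
	fmt.Println(b.popcntRange(4, 100)) // 2 (pages 4 and 5)
}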
The input range must + // either be fully contained within firstFree or not overlap with it + // at all. + // + // This way, we'll record the first summary we find with any free + // pages on the root level and narrow that down if we descend into + // that summary. But as soon as we need to iterate beyond that summary + // in a level to find a large enough range, we'll stop narrowing. + foundFree := func(addr, size uintptr) { + if firstFree.base <= addr && addr+size-1 <= firstFree.bound { + // This range fits within the current firstFree window, so narrow + // down the firstFree window to the base and bound of this range. + firstFree.base = addr + firstFree.bound = addr + size - 1 + } else if !(addr+size-1 < firstFree.base || addr > firstFree.bound) { + // This range only partially overlaps with the firstFree range, + // so throw. + print("runtime: addr = ", hex(addr), ", size = ", size, "\n") + print("runtime: base = ", hex(firstFree.base), ", bound = ", hex(firstFree.bound), "\n") + throw("range partially overlaps") + } + } + + // lastSum is the summary which we saw on the previous level that made us + // move on to the next level. Used to print additional information in the + // case of a catastrophic failure. + // lastSumIdx is that summary's index in the previous level. + lastSum := packPallocSum(0, 0, 0) + lastSumIdx := -1 + +nextLevel: + for l := 0; l < len(s.summary); l++ { + // For the root level, entriesPerBlock is the whole level. + entriesPerBlock := 1 << levelBits[l] + logMaxPages := levelLogPages[l] + + // We've moved into a new level, so let's update i to our new + // starting index. This is a no-op for level 0. + i <<= levelBits[l] + + // Slice out the block of entries we care about. + entries := s.summary[l][i : i+entriesPerBlock] + + // Determine j0, the first index we should start iterating from. + // The searchAddr may help us eliminate iterations if we followed the + // searchAddr on the previous level or we're on the root leve, in which + // case the searchAddr should be the same as i after levelShift. + j0 := 0 + if searchIdx := int((s.searchAddr + arenaBaseOffset) >> levelShift[l]); searchIdx&^(entriesPerBlock-1) == i { + j0 = searchIdx & (entriesPerBlock - 1) + } + + // Run over the level entries looking for + // a contiguous run of at least npages either + // within an entry or across entries. + // + // base contains the page index (relative to + // the first entry's first page) of the currently + // considered run of consecutive pages. + // + // size contains the size of the currently considered + // run of consecutive pages. + var base, size uint + for j := j0; j < len(entries); j++ { + sum := entries[j] + if sum == 0 { + // A full entry means we broke any streak and + // that we should skip it altogether. + size = 0 + continue + } + + // We've encountered a non-zero summary which means + // free memory, so update firstFree. + foundFree(uintptr((i+j)<<levelShift[l]), (uintptr(1)<<logMaxPages)*pageSize) + + s := sum.start() + if size+s >= uint(npages) { + // If size == 0 we don't have a run yet, + // which means base isn't valid. So, set + // base to the first page in this block. + if size == 0 { + base = uint(j) << logMaxPages + } + // We hit npages; we're done! + size += s + break + } + if sum.max() >= uint(npages) { + // The entry itself contains npages contiguous + // free pages, so continue on the next level + // to find that run. 
+ i += j + lastSumIdx = i + lastSum = sum + continue nextLevel + } + if size == 0 || s < 1<<logMaxPages { + // We either don't have a current run started, or this entry + // isn't totally free (meaning we can't continue the current + // one), so try to begin a new run by setting size and base + // based on sum.end. + size = sum.end() + base = uint(j+1)<<logMaxPages - size + continue + } + // The entry is completely free, so continue the run. + size += 1 << logMaxPages + } + if size >= uint(npages) { + // We found a sufficiently large run of free pages straddling + // some boundary, so compute the address and return it. + addr := uintptr(i<<levelShift[l]) - arenaBaseOffset + uintptr(base)*pageSize + return addr, firstFree.base - arenaBaseOffset + } + if l == 0 { + // We're at level zero, so that means we've exhausted our search. + return 0, maxSearchAddr + } + + // We're not at level zero, and we exhausted the level we were looking in. + // This means that either our calculations were wrong or the level above + // lied to us. In either case, dump some useful state and throw. + print("runtime: summary[", l-1, "][", lastSumIdx, "] = ", lastSum.start(), ", ", lastSum.max(), ", ", lastSum.end(), "\n") + print("runtime: level = ", l, ", npages = ", npages, ", j0 = ", j0, "\n") + print("runtime: s.searchAddr = ", hex(s.searchAddr), ", i = ", i, "\n") + print("runtime: levelShift[level] = ", levelShift[l], ", levelBits[level] = ", levelBits[l], "\n") + for j := 0; j < len(entries); j++ { + sum := entries[j] + print("runtime: summary[", l, "][", i+j, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n") + } + throw("bad summary data") + } + + // Since we've gotten to this point, that means we haven't found a + // sufficiently-sized free region straddling some boundary (chunk or larger). + // This means the last summary we inspected must have had a large enough "max" + // value, so look inside the chunk to find a suitable run. + // + // After iterating over all levels, i must contain a chunk index which + // is what the final level represents. + ci := chunkIdx(i) + j, searchIdx := s.chunkOf(ci).find(npages, 0) + if j < 0 { + // We couldn't find any space in this chunk despite the summaries telling + // us it should be there. There's likely a bug, so dump some state and throw. + sum := s.summary[len(s.summary)-1][i] + print("runtime: summary[", len(s.summary)-1, "][", i, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n") + print("runtime: npages = ", npages, "\n") + throw("bad summary data") + } + + // Compute the address at which the free space starts. + addr := chunkBase(ci) + uintptr(j)*pageSize + + // Since we actually searched the chunk, we may have + // found an even narrower free window. + searchAddr := chunkBase(ci) + uintptr(searchIdx)*pageSize + foundFree(searchAddr+arenaBaseOffset, chunkBase(ci+1)-searchAddr) + return addr, firstFree.base - arenaBaseOffset +} + +// alloc allocates npages worth of memory from the page heap, returning the base +// address for the allocation and the amount of scavenged memory in bytes +// contained in the region [base address, base address + npages*pageSize). +// +// Returns a 0 base address on failure, in which case other returned values +// should be ignored. +// +// s.mheapLock must be held. +func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) { + // If the searchAddr refers to a region which has a higher address than + // any known chunk, then we know we're out of memory. 
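For readers tracing the loop above, the following standalone sketch reproduces the same run-accumulation bookkeeping over a slice of (start, max, end) summaries. It is a simplified illustration, not the runtime's code: the sum struct, findRun, and the toy page counts are invented for the example, and where the real search would descend to the next radix level this version merely reports which entry it would descend into.

package main

import "fmt"

// sum mirrors the idea of a packed summary for a block of pagesPerEntry
// pages: start is the number of free pages at its head, max is the longest
// free run anywhere inside it, and end is the number of free pages at its
// tail. (Toy struct, not the runtime's packed uint64.)
type sum struct{ start, max, end uint }

// findRun scans entries for a run of npages free pages that straddles
// entry boundaries, mirroring the base/size bookkeeping in the loop above.
// On success it returns the page index where the run starts. If a single
// entry's max alone could satisfy the request, a real allocator would
// descend into that entry; here we just return its index in descend.
func findRun(entries []sum, npages, pagesPerEntry uint) (base uint, ok bool, descend int) {
	var size uint
	for j, s := range entries {
		if s.max == 0 {
			size = 0 // fully allocated entry: any streak is broken
			continue
		}
		if size+s.start >= npages {
			if size == 0 {
				base = uint(j) * pagesPerEntry // run starts at this entry's first page
			}
			return base, true, -1
		}
		if s.max >= npages {
			return 0, false, j // run lives wholly inside entry j
		}
		if size == 0 || s.start < pagesPerEntry {
			// No current run, or this entry isn't free all the way to its
			// head: restart a candidate run ending at this entry's tail.
			size = s.end
			base = uint(j+1)*pagesPerEntry - size
			continue
		}
		size += pagesPerEntry // entry is completely free; extend the run
	}
	return 0, false, -1
}

func main() {
	// Two 8-page entries: 3 free pages at the tail of the first, then a
	// completely free second entry.
	entries := []sum{{0, 3, 3}, {8, 8, 8}}
	fmt.Println(findRun(entries, 5, 8))
}

Running it prints "5 true -1": a five-page run starting at page 5, straddling the boundary between the two eight-page entries.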
+ if chunkIndex(s.searchAddr) >= s.end { + return 0, 0 + } + + // If npages has a chance of fitting in the chunk where the searchAddr is, + // search it directly. + searchAddr := uintptr(0) + if pallocChunkPages-chunkPageIndex(s.searchAddr) >= uint(npages) { + // npages is guaranteed to be no greater than pallocChunkPages here. + i := chunkIndex(s.searchAddr) + if max := s.summary[len(s.summary)-1][i].max(); max >= uint(npages) { + j, searchIdx := s.chunkOf(i).find(npages, chunkPageIndex(s.searchAddr)) + if j < 0 { + print("runtime: max = ", max, ", npages = ", npages, "\n") + print("runtime: searchIdx = ", chunkPageIndex(s.searchAddr), ", s.searchAddr = ", hex(s.searchAddr), "\n") + throw("bad summary data") + } + addr = chunkBase(i) + uintptr(j)*pageSize + searchAddr = chunkBase(i) + uintptr(searchIdx)*pageSize + goto Found + } + } + // We failed to use a searchAddr for one reason or another, so try + // the slow path. + addr, searchAddr = s.find(npages) + if addr == 0 { + if npages == 1 { + // We failed to find a single free page, the smallest unit + // of allocation. This means we know the heap is completely + // exhausted. Otherwise, the heap still might have free + // space in it, just not enough contiguous space to + // accommodate npages. + s.searchAddr = maxSearchAddr + } + return 0, 0 + } +Found: + // Go ahead and actually mark the bits now that we have an address. + scav = s.allocRange(addr, npages) + + // If we found a higher (linearized) searchAddr, we know that all the + // heap memory before that searchAddr in a linear address space is + // allocated, so bump s.searchAddr up to the new one. + if s.compareSearchAddrTo(searchAddr) > 0 { + s.searchAddr = searchAddr + } + return addr, scav +} + +// free returns npages worth of memory starting at base back to the page heap. +// +// s.mheapLock must be held. +func (s *pageAlloc) free(base, npages uintptr) { + // If we're freeing pages below the (linearized) s.searchAddr, update searchAddr. + if s.compareSearchAddrTo(base) < 0 { + s.searchAddr = base + } + if npages == 1 { + // Fast path: we're clearing a single bit, and we know exactly + // where it is, so mark it directly. + i := chunkIndex(base) + s.chunkOf(i).free1(chunkPageIndex(base)) + } else { + // Slow path: we're clearing more bits so we may need to iterate. + limit := base + npages*pageSize - 1 + sc, ec := chunkIndex(base), chunkIndex(limit) + si, ei := chunkPageIndex(base), chunkPageIndex(limit) + + if sc == ec { + // The range doesn't cross any chunk boundaries. + s.chunkOf(sc).free(si, ei+1-si) + } else { + // The range crosses at least one chunk boundary. + s.chunkOf(sc).free(si, pallocChunkPages-si) + for c := sc + 1; c < ec; c++ { + s.chunkOf(c).freeAll() + } + s.chunkOf(ec).free(0, ei+1) + } + } + s.update(base, npages, true, false) +} + +const ( + pallocSumBytes = unsafe.Sizeof(pallocSum(0)) + + // maxPackedValue is the maximum value that any of the three fields in + // the pallocSum may take on. + maxPackedValue = 1 << logMaxPackedValue + logMaxPackedValue = logPallocChunkPages + (summaryLevels-1)*summaryLevelBits + + freeChunkSum = pallocSum(uint64(pallocChunkPages) | + uint64(pallocChunkPages<<logMaxPackedValue) | + uint64(pallocChunkPages<<(2*logMaxPackedValue))) +) + +// pallocSum is a packed summary type which packs three numbers: start, max, +// and end into a single 8-byte value. Each of these values are a summary of +// a bitmap and are thus counts, each of which may have a maximum value of +// 2^21 - 1, or all three may be equal to 2^21. 
The latter case is represented +// by just setting the 64th bit. +type pallocSum uint64 + +// packPallocSum takes a start, max, and end value and produces a pallocSum. +func packPallocSum(start, max, end uint) pallocSum { + if max == maxPackedValue { + return pallocSum(uint64(1 << 63)) + } + return pallocSum((uint64(start) & (maxPackedValue - 1)) | + ((uint64(max) & (maxPackedValue - 1)) << logMaxPackedValue) | + ((uint64(end) & (maxPackedValue - 1)) << (2 * logMaxPackedValue))) +} + +// start extracts the start value from a packed sum. +func (p pallocSum) start() uint { + if uint64(p)&uint64(1<<63) != 0 { + return maxPackedValue + } + return uint(uint64(p) & (maxPackedValue - 1)) +} + +// max extracts the max value from a packed sum. +func (p pallocSum) max() uint { + if uint64(p)&uint64(1<<63) != 0 { + return maxPackedValue + } + return uint((uint64(p) >> logMaxPackedValue) & (maxPackedValue - 1)) +} + +// end extracts the end value from a packed sum. +func (p pallocSum) end() uint { + if uint64(p)&uint64(1<<63) != 0 { + return maxPackedValue + } + return uint((uint64(p) >> (2 * logMaxPackedValue)) & (maxPackedValue - 1)) +} + +// unpack unpacks all three values from the summary. +func (p pallocSum) unpack() (uint, uint, uint) { + if uint64(p)&uint64(1<<63) != 0 { + return maxPackedValue, maxPackedValue, maxPackedValue + } + return uint(uint64(p) & (maxPackedValue - 1)), + uint((uint64(p) >> logMaxPackedValue) & (maxPackedValue - 1)), + uint((uint64(p) >> (2 * logMaxPackedValue)) & (maxPackedValue - 1)) +} + +// mergeSummaries merges consecutive summaries which may each represent at +// most 1 << logMaxPagesPerSum pages each together into one. +func mergeSummaries(sums []pallocSum, logMaxPagesPerSum uint) pallocSum { + // Merge the summaries in sums into one. + // + // We do this by keeping a running summary representing the merged + // summaries of sums[:i] in start, max, and end. + start, max, end := sums[0].unpack() + for i := 1; i < len(sums); i++ { + // Merge in sums[i]. + si, mi, ei := sums[i].unpack() + + // Merge in sums[i].start only if the running summary is + // completely free, otherwise this summary's start + // plays no role in the combined sum. + if start == uint(i)<<logMaxPagesPerSum { + start += si + } + + // Recompute the max value of the running sum by looking + // across the boundary between the running sum and sums[i] + // and at the max sums[i], taking the greatest of those two + // and the max of the running sum. + if end+si > max { + max = end + si + } + if mi > max { + max = mi + } + + // Merge in end by checking if this new summary is totally + // free. If it is, then we want to extend the running sum's + // end by the new summary. If not, then we have some alloc'd + // pages in there and we just want to take the end value in + // sums[i]. + if ei == 1<<logMaxPagesPerSum { + end += 1 << logMaxPagesPerSum + } else { + end = ei + } + } + return packPallocSum(start, max, end) +} diff --git a/libgo/go/runtime/mpagealloc_32bit.go b/libgo/go/runtime/mpagealloc_32bit.go new file mode 100644 index 0000000..d18970c --- /dev/null +++ b/libgo/go/runtime/mpagealloc_32bit.go @@ -0,0 +1,116 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
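Before moving on to the per-architecture files, here is a small self-contained illustration of the packed-summary encoding described above: three counts in one uint64, with the top bit reserved for the all-free case. The 21-bit field width matches the comment above, but the helper names and demo values are invented for the example and are not the runtime's API.

package main

import "fmt"

const logMaxPacked = 21 // each of start, max, end fits in 21 bits here

// pack mirrors the shape of packPallocSum: three counts in one uint64,
// with bit 63 reserved for the case where all three equal the maximum
// packed value (a completely free block).
func pack(start, max, end uint64) uint64 {
	if max == 1<<logMaxPacked {
		return 1 << 63
	}
	return start&(1<<logMaxPacked-1) |
		(max&(1<<logMaxPacked-1))<<logMaxPacked |
		(end&(1<<logMaxPacked-1))<<(2*logMaxPacked)
}

// unpack recovers the three counts, saturating all of them when the
// all-free bit is set.
func unpack(s uint64) (start, max, end uint64) {
	if s&(1<<63) != 0 {
		return 1 << logMaxPacked, 1 << logMaxPacked, 1 << logMaxPacked
	}
	return s & (1<<logMaxPacked - 1),
		s >> logMaxPacked & (1<<logMaxPacked - 1),
		s >> (2 * logMaxPacked) & (1<<logMaxPacked - 1)
}

func main() {
	fmt.Println(unpack(pack(5, 100, 17))) // 5 100 17
	fmt.Println(unpack(pack(1<<logMaxPacked, 1<<logMaxPacked, 1<<logMaxPacked)))
}

The first print shows 5, 100, 17 round-tripping through the packed form; the second shows the saturated all-free encoding.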
+ +// +build 386 arm mips mipsle wasm darwin,arm64 amd64p32 armbe m68k mips64p32 mips64p32le nios2 ppc s390 sh shbe sparc + +// wasm is a treated as a 32-bit architecture for the purposes of the page +// allocator, even though it has 64-bit pointers. This is because any wasm +// pointer always has its top 32 bits as zero, so the effective heap address +// space is only 2^32 bytes in size (see heapAddrBits). + +// darwin/arm64 is treated as a 32-bit architecture for the purposes of the +// page allocator, even though it has 64-bit pointers and a 33-bit address +// space (see heapAddrBits). The 33 bit address space cannot be rounded up +// to 64 bits because there are too many summary levels to fit in just 33 +// bits. + +package runtime + +import "unsafe" + +const ( + // The number of levels in the radix tree. + summaryLevels = 4 + + // Constants for testing. + pageAlloc32Bit = 1 + pageAlloc64Bit = 0 + + // Number of bits needed to represent all indices into the L1 of the + // chunks map. + // + // See (*pageAlloc).chunks for more details. Update the documentation + // there should this number change. + pallocChunksL1Bits = 0 +) + +// See comment in mpagealloc_64bit.go. +var levelBits = [summaryLevels]uint{ + summaryL0Bits, + summaryLevelBits, + summaryLevelBits, + summaryLevelBits, +} + +// See comment in mpagealloc_64bit.go. +var levelShift = [summaryLevels]uint{ + heapAddrBits - summaryL0Bits, + heapAddrBits - summaryL0Bits - 1*summaryLevelBits, + heapAddrBits - summaryL0Bits - 2*summaryLevelBits, + heapAddrBits - summaryL0Bits - 3*summaryLevelBits, +} + +// See comment in mpagealloc_64bit.go. +var levelLogPages = [summaryLevels]uint{ + logPallocChunkPages + 3*summaryLevelBits, + logPallocChunkPages + 2*summaryLevelBits, + logPallocChunkPages + 1*summaryLevelBits, + logPallocChunkPages, +} + +// See mpagealloc_64bit.go for details. +func (s *pageAlloc) sysInit() { + // Calculate how much memory all our entries will take up. + // + // This should be around 12 KiB or less. + totalSize := uintptr(0) + for l := 0; l < summaryLevels; l++ { + totalSize += (uintptr(1) << (heapAddrBits - levelShift[l])) * pallocSumBytes + } + totalSize = alignUp(totalSize, physPageSize) + + // Reserve memory for all levels in one go. There shouldn't be much for 32-bit. + reservation := sysReserve(nil, totalSize) + if reservation == nil { + throw("failed to reserve page summary memory") + } + // There isn't much. Just map it and mark it as used immediately. + sysMap(reservation, totalSize, s.sysStat) + sysUsed(reservation, totalSize) + + // Iterate over the reservation and cut it up into slices. + // + // Maintain i as the byte offset from reservation where + // the new slice should start. + for l, shift := range levelShift { + entries := 1 << (heapAddrBits - shift) + + // Put this reservation into a slice. + sl := notInHeapSlice{(*notInHeap)(reservation), 0, entries} + s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) + + reservation = add(reservation, uintptr(entries)*pallocSumBytes) + } +} + +// See mpagealloc_64bit.go for details. +func (s *pageAlloc) sysGrow(base, limit uintptr) { + if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 { + print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n") + throw("sysGrow bounds not aligned to pallocChunkBytes") + } + + // Walk up the tree and update the summary slices. + for l := len(s.summary) - 1; l >= 0; l-- { + // Figure out what part of the summary array this new address space needs. 
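To make the levelShift indexing concrete, the toy program below computes the summary index of an address at each level via p >> levelShift[l] and checks that each level's index refines its parent's, which is the invariant behind find's i <<= levelBits[l] step. The constants here (a 20-bit address space, three levels, a 16-entry root) are deliberately tiny stand-ins, not the runtime's values.

package main

import "fmt"

// Hypothetical toy configuration: a 20-bit address space and three summary
// levels; the remaining 12 address bits index bytes within a leaf block.
const (
	heapAddrBits = 20
	levels       = 3
)

var (
	levelBits  = [levels]uint{4, 2, 2} // 16-entry root, 4-entry fanout below
	levelShift [levels]uint
)

func init() {
	shift := uint(heapAddrBits)
	for l := 0; l < levels; l++ {
		shift -= levelBits[l]
		levelShift[l] = shift
	}
}

func main() {
	p := uintptr(0x4B2C0) // an arbitrary address inside the toy space
	prev := -1
	for l := 0; l < levels; l++ {
		idx := int(p >> levelShift[l])
		// A child index shifted right by its level's radix width must give
		// back the parent's index, so walking the levels never "jumps".
		if prev >= 0 && idx>>levelBits[l] != prev {
			panic("child index does not refine parent index")
		}
		fmt.Printf("level %d: index %d\n", l, idx)
		prev = idx
	}
}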
+ // Note that we need to align the ranges to the block width (1<<levelBits[l]) + // at this level because the full block is needed to compute the summary for + // the next level. + lo, hi := addrsToSummaryRange(l, base, limit) + _, hi = blockAlignSummaryRange(l, lo, hi) + if hi > len(s.summary[l]) { + s.summary[l] = s.summary[l][:hi] + } + } +} diff --git a/libgo/go/runtime/mpagealloc_64bit.go b/libgo/go/runtime/mpagealloc_64bit.go new file mode 100644 index 0000000..dd44da1 --- /dev/null +++ b/libgo/go/runtime/mpagealloc_64bit.go @@ -0,0 +1,180 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 !darwin,arm64 mips64 mips64le ppc64 ppc64le s390x arm64be alpha sparc64 ia64 riscv64 + +// See mpagealloc_32bit.go for why darwin/arm64 is excluded here. + +package runtime + +import "unsafe" + +const ( + // The number of levels in the radix tree. + summaryLevels = 5 + + // Constants for testing. + pageAlloc32Bit = 0 + pageAlloc64Bit = 1 + + // Number of bits needed to represent all indices into the L1 of the + // chunks map. + // + // See (*pageAlloc).chunks for more details. Update the documentation + // there should this number change. + pallocChunksL1Bits = 13 +) + +// levelBits is the number of bits in the radix for a given level in the super summary +// structure. +// +// The sum of all the entries of levelBits should equal heapAddrBits. +var levelBits = [summaryLevels]uint{ + summaryL0Bits, + summaryLevelBits, + summaryLevelBits, + summaryLevelBits, + summaryLevelBits, +} + +// levelShift is the number of bits to shift to acquire the radix for a given level +// in the super summary structure. +// +// With levelShift, one can compute the index of the summary at level l related to a +// pointer p by doing: +// p >> levelShift[l] +var levelShift = [summaryLevels]uint{ + heapAddrBits - summaryL0Bits, + heapAddrBits - summaryL0Bits - 1*summaryLevelBits, + heapAddrBits - summaryL0Bits - 2*summaryLevelBits, + heapAddrBits - summaryL0Bits - 3*summaryLevelBits, + heapAddrBits - summaryL0Bits - 4*summaryLevelBits, +} + +// levelLogPages is log2 the maximum number of runtime pages in the address space +// a summary in the given level represents. +// +// The leaf level always represents exactly log2 of 1 chunk's worth of pages. +var levelLogPages = [summaryLevels]uint{ + logPallocChunkPages + 4*summaryLevelBits, + logPallocChunkPages + 3*summaryLevelBits, + logPallocChunkPages + 2*summaryLevelBits, + logPallocChunkPages + 1*summaryLevelBits, + logPallocChunkPages, +} + +// sysInit performs architecture-dependent initialization of fields +// in pageAlloc. pageAlloc should be uninitialized except for sysStat +// if any runtime statistic should be updated. +func (s *pageAlloc) sysInit() { + // Reserve memory for each level. This will get mapped in + // as R/W by setArenas. + for l, shift := range levelShift { + entries := 1 << (heapAddrBits - shift) + + // Reserve b bytes of memory anywhere in the address space. + b := alignUp(uintptr(entries)*pallocSumBytes, physPageSize) + r := sysReserve(nil, b) + if r == nil { + throw("failed to reserve page summary memory") + } + + // Put this reservation into a slice. + sl := notInHeapSlice{(*notInHeap)(r), 0, entries} + s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) + } +} + +// sysGrow performs architecture-dependent operations on heap +// growth for the page allocator, such as mapping in new memory +// for summaries. 
It also updates the length of the slices in +// s.summary. +// +// base is the base of the newly-added heap memory and limit is +// the first address past the end of the newly-added heap memory. +// Both must be aligned to pallocChunkBytes. +// +// The caller must update s.start and s.end after calling sysGrow. +func (s *pageAlloc) sysGrow(base, limit uintptr) { + if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 { + print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n") + throw("sysGrow bounds not aligned to pallocChunkBytes") + } + + // addrRangeToSummaryRange converts a range of addresses into a range + // of summary indices which must be mapped to support those addresses + // in the summary range. + addrRangeToSummaryRange := func(level int, r addrRange) (int, int) { + sumIdxBase, sumIdxLimit := addrsToSummaryRange(level, r.base, r.limit) + return blockAlignSummaryRange(level, sumIdxBase, sumIdxLimit) + } + + // summaryRangeToSumAddrRange converts a range of indices in any + // level of s.summary into page-aligned addresses which cover that + // range of indices. + summaryRangeToSumAddrRange := func(level, sumIdxBase, sumIdxLimit int) addrRange { + baseOffset := alignDown(uintptr(sumIdxBase)*pallocSumBytes, physPageSize) + limitOffset := alignUp(uintptr(sumIdxLimit)*pallocSumBytes, physPageSize) + base := unsafe.Pointer(&s.summary[level][0]) + return addrRange{ + uintptr(add(base, baseOffset)), + uintptr(add(base, limitOffset)), + } + } + + // addrRangeToSumAddrRange is a convienience function that converts + // an address range r to the address range of the given summary level + // that stores the summaries for r. + addrRangeToSumAddrRange := func(level int, r addrRange) addrRange { + sumIdxBase, sumIdxLimit := addrRangeToSummaryRange(level, r) + return summaryRangeToSumAddrRange(level, sumIdxBase, sumIdxLimit) + } + + // Find the first inUse index which is strictly greater than base. + // + // Because this function will never be asked remap the same memory + // twice, this index is effectively the index at which we would insert + // this new growth, and base will never overlap/be contained within + // any existing range. + // + // This will be used to look at what memory in the summary array is already + // mapped before and after this new range. + inUseIndex := s.inUse.findSucc(base) + + // Walk up the radix tree and map summaries in as needed. + for l := range s.summary { + // Figure out what part of the summary array this new address space needs. + needIdxBase, needIdxLimit := addrRangeToSummaryRange(l, addrRange{base, limit}) + + // Update the summary slices with a new upper-bound. This ensures + // we get tight bounds checks on at least the top bound. + // + // We must do this regardless of whether we map new memory. + if needIdxLimit > len(s.summary[l]) { + s.summary[l] = s.summary[l][:needIdxLimit] + } + + // Compute the needed address range in the summary array for level l. + need := summaryRangeToSumAddrRange(l, needIdxBase, needIdxLimit) + + // Prune need down to what needs to be newly mapped. Some parts of it may + // already be mapped by what inUse describes due to page alignment requirements + // for mapping. prune's invariants are guaranteed by the fact that this + // function will never be asked to remap the same memory twice. 
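The pruning below hinges on subtracting one address range from another. The runtime has its own addrRange helpers for this; the standalone sketch that follows implements just enough of the idea (trimming an overlapping prefix or suffix) to show why subtracting the neighbours' already-mapped summary ranges leaves only the bytes that still need to be mapped. The rng type and its methods are illustrative stand-ins, not the runtime's definitions.

package main

import "fmt"

// rng is a half-open address range [base, limit).
type rng struct{ base, limit uintptr }

func (r rng) size() uintptr { return r.limit - r.base }

// subtract trims the part of r that overlaps o, assuming o overlaps r at
// most at one end (which is the situation for a neighbouring, already
// mapped block). If o covers r entirely, the result is empty.
func (r rng) subtract(o rng) rng {
	if o.base <= r.base && o.limit >= r.limit {
		return rng{} // fully covered: nothing left to map
	}
	if o.base <= r.base && o.limit > r.base {
		r.base = o.limit // o overlaps r's front
	} else if o.base < r.limit && o.limit >= r.limit {
		r.limit = o.base // o overlaps r's back
	}
	return r
}

func main() {
	need := rng{0x1000, 0x5000}        // summary bytes the new heap needs
	alreadyLeft := rng{0x0000, 0x2000} // mapped for the range just below
	alreadyRight := rng{0x4000, 0x6000} // mapped for the range just above
	need = need.subtract(alreadyLeft)
	need = need.subtract(alreadyRight)
	fmt.Printf("map [%#x, %#x), %d bytes\n", need.base, need.limit, need.size())
}

With these assumed ranges the demo leaves [0x2000, 0x4000), that is, 8 KiB still to map.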
+ if inUseIndex > 0 { + need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex-1])) + } + if inUseIndex < len(s.inUse.ranges) { + need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex])) + } + // It's possible that after our pruning above, there's nothing new to map. + if need.size() == 0 { + continue + } + + // Map and commit need. + sysMap(unsafe.Pointer(need.base), need.size(), s.sysStat) + sysUsed(unsafe.Pointer(need.base), need.size()) + } +} diff --git a/libgo/go/runtime/mpagealloc_test.go b/libgo/go/runtime/mpagealloc_test.go new file mode 100644 index 0000000..6c48296 --- /dev/null +++ b/libgo/go/runtime/mpagealloc_test.go @@ -0,0 +1,921 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "fmt" + . "runtime" + "testing" +) + +func checkPageAlloc(t *testing.T, want, got *PageAlloc) { + // Ensure start and end are correct. + wantStart, wantEnd := want.Bounds() + gotStart, gotEnd := got.Bounds() + if gotStart != wantStart { + t.Fatalf("start values not equal: got %d, want %d", gotStart, wantStart) + } + if gotEnd != wantEnd { + t.Fatalf("end values not equal: got %d, want %d", gotEnd, wantEnd) + } + + for i := gotStart; i < gotEnd; i++ { + // Check the bitmaps. Note that we may have nil data. + gb, wb := got.PallocData(i), want.PallocData(i) + if gb == nil && wb == nil { + continue + } + if (gb == nil && wb != nil) || (gb != nil && wb == nil) { + t.Errorf("chunk %d nilness mismatch", i) + } + if !checkPallocBits(t, gb.PallocBits(), wb.PallocBits()) { + t.Logf("in chunk %d (mallocBits)", i) + } + if !checkPallocBits(t, gb.Scavenged(), wb.Scavenged()) { + t.Logf("in chunk %d (scavenged)", i) + } + } + // TODO(mknyszek): Verify summaries too? 
+} + +func TestPageAllocGrow(t *testing.T) { + type test struct { + chunks []ChunkIdx + inUse []AddrRange + } + tests := map[string]test{ + "One": { + chunks: []ChunkIdx{ + BaseChunkIdx, + }, + inUse: []AddrRange{ + {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, + }, + }, + "Contiguous2": { + chunks: []ChunkIdx{ + BaseChunkIdx, + BaseChunkIdx + 1, + }, + inUse: []AddrRange{ + {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)}, + }, + }, + "Contiguous5": { + chunks: []ChunkIdx{ + BaseChunkIdx, + BaseChunkIdx + 1, + BaseChunkIdx + 2, + BaseChunkIdx + 3, + BaseChunkIdx + 4, + }, + inUse: []AddrRange{ + {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)}, + }, + }, + "Discontiguous": { + chunks: []ChunkIdx{ + BaseChunkIdx, + BaseChunkIdx + 2, + BaseChunkIdx + 4, + }, + inUse: []AddrRange{ + {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, + {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)}, + {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)}, + }, + }, + "Mixed": { + chunks: []ChunkIdx{ + BaseChunkIdx, + BaseChunkIdx + 1, + BaseChunkIdx + 2, + BaseChunkIdx + 4, + }, + inUse: []AddrRange{ + {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)}, + {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)}, + }, + }, + "WildlyDiscontiguous": { + chunks: []ChunkIdx{ + BaseChunkIdx, + BaseChunkIdx + 1, + BaseChunkIdx + 0x10, + BaseChunkIdx + 0x21, + }, + inUse: []AddrRange{ + {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)}, + {PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)}, + {PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)}, + }, + }, + "ManyDiscontiguous": { + // The initial cap is 16. Test 33 ranges, to exercise the growth path (twice). + chunks: []ChunkIdx{ + BaseChunkIdx, BaseChunkIdx + 2, BaseChunkIdx + 4, BaseChunkIdx + 6, + BaseChunkIdx + 8, BaseChunkIdx + 10, BaseChunkIdx + 12, BaseChunkIdx + 14, + BaseChunkIdx + 16, BaseChunkIdx + 18, BaseChunkIdx + 20, BaseChunkIdx + 22, + BaseChunkIdx + 24, BaseChunkIdx + 26, BaseChunkIdx + 28, BaseChunkIdx + 30, + BaseChunkIdx + 32, BaseChunkIdx + 34, BaseChunkIdx + 36, BaseChunkIdx + 38, + BaseChunkIdx + 40, BaseChunkIdx + 42, BaseChunkIdx + 44, BaseChunkIdx + 46, + BaseChunkIdx + 48, BaseChunkIdx + 50, BaseChunkIdx + 52, BaseChunkIdx + 54, + BaseChunkIdx + 56, BaseChunkIdx + 58, BaseChunkIdx + 60, BaseChunkIdx + 62, + BaseChunkIdx + 64, + }, + inUse: []AddrRange{ + {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, + {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)}, + {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)}, + {PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)}, + {PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)}, + {PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)}, + {PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)}, + {PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)}, + {PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)}, + {PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)}, + {PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)}, + {PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)}, + {PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)}, + {PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)}, + {PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)}, + {PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)}, + {PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)}, + {PageBase(BaseChunkIdx+34, 0), 
PageBase(BaseChunkIdx+35, 0)}, + {PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)}, + {PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)}, + {PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)}, + {PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)}, + {PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)}, + {PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)}, + {PageBase(BaseChunkIdx+48, 0), PageBase(BaseChunkIdx+49, 0)}, + {PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)}, + {PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)}, + {PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)}, + {PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)}, + {PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)}, + {PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)}, + {PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)}, + {PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)}, + }, + }, + } + if PageAlloc64Bit != 0 { + tests["ExtremelyDiscontiguous"] = test{ + chunks: []ChunkIdx{ + BaseChunkIdx, + BaseChunkIdx + 0x100000, // constant translates to O(TiB) + }, + inUse: []AddrRange{ + {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)}, + {PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)}, + }, + } + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + // By creating a new pageAlloc, we will + // grow it for each chunk defined in x. + x := make(map[ChunkIdx][]BitRange) + for _, c := range v.chunks { + x[c] = []BitRange{} + } + b := NewPageAlloc(x, nil) + defer FreePageAlloc(b) + + got := b.InUse() + want := v.inUse + + // Check for mismatches. + if len(got) != len(want) { + t.Fail() + } else { + for i := range want { + if want[i] != got[i] { + t.Fail() + break + } + } + } + if t.Failed() { + t.Logf("found inUse mismatch") + t.Logf("got:") + for i, r := range got { + t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit) + } + t.Logf("want:") + for i, r := range want { + t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit) + } + } + }) + } +} + +func TestPageAllocAlloc(t *testing.T) { + type hit struct { + npages, base, scav uintptr + } + tests := map[string]struct { + scav map[ChunkIdx][]BitRange + before map[ChunkIdx][]BitRange + after map[ChunkIdx][]BitRange + hits []hit + }{ + "AllFree1": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 1}, {2, 2}}, + }, + hits: []hit{ + {1, PageBase(BaseChunkIdx, 0), PageSize}, + {1, PageBase(BaseChunkIdx, 1), 0}, + {1, PageBase(BaseChunkIdx, 2), PageSize}, + {1, PageBase(BaseChunkIdx, 3), PageSize}, + {1, PageBase(BaseChunkIdx, 4), 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 5}}, + }, + }, + "ManyArena1": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages - 1}}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + }, + hits: []hit{ + {1, PageBase(BaseChunkIdx+2, PallocChunkPages-1), PageSize}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + }, + }, + "NotContiguous1": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 0xff: {{0, 0}}, 
+ }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 0xff: {{0, PallocChunkPages}}, + }, + hits: []hit{ + {1, PageBase(BaseChunkIdx+0xff, 0), PageSize}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 0xff: {{0, 1}}, + }, + }, + "AllFree2": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 3}, {7, 1}}, + }, + hits: []hit{ + {2, PageBase(BaseChunkIdx, 0), 2 * PageSize}, + {2, PageBase(BaseChunkIdx, 2), PageSize}, + {2, PageBase(BaseChunkIdx, 4), 0}, + {2, PageBase(BaseChunkIdx, 6), PageSize}, + {2, PageBase(BaseChunkIdx, 8), 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 10}}, + }, + }, + "Straddle2": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages - 1}}, + BaseChunkIdx + 1: {{1, PallocChunkPages - 1}}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{PallocChunkPages - 1, 1}}, + BaseChunkIdx + 1: {}, + }, + hits: []hit{ + {2, PageBase(BaseChunkIdx, PallocChunkPages-1), PageSize}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + }, + }, + "AllFree5": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 8}, {9, 1}, {17, 5}}, + }, + hits: []hit{ + {5, PageBase(BaseChunkIdx, 0), 5 * PageSize}, + {5, PageBase(BaseChunkIdx, 5), 4 * PageSize}, + {5, PageBase(BaseChunkIdx, 10), 0}, + {5, PageBase(BaseChunkIdx, 15), 3 * PageSize}, + {5, PageBase(BaseChunkIdx, 20), 2 * PageSize}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 25}}, + }, + }, + "AllFree64": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{21, 1}, {63, 65}}, + }, + hits: []hit{ + {64, PageBase(BaseChunkIdx, 0), 2 * PageSize}, + {64, PageBase(BaseChunkIdx, 64), 64 * PageSize}, + {64, PageBase(BaseChunkIdx, 128), 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 192}}, + }, + }, + "AllFree65": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{129, 1}}, + }, + hits: []hit{ + {65, PageBase(BaseChunkIdx, 0), 0}, + {65, PageBase(BaseChunkIdx, 65), PageSize}, + {65, PageBase(BaseChunkIdx, 130), 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 195}}, + }, + }, + // TODO(mknyszek): Add tests close to the chunk size. 
+ "ExhaustPallocChunkPages-3": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{10, 1}}, + }, + hits: []hit{ + {PallocChunkPages - 3, PageBase(BaseChunkIdx, 0), PageSize}, + {PallocChunkPages - 3, 0, 0}, + {1, PageBase(BaseChunkIdx, PallocChunkPages-3), 0}, + {2, PageBase(BaseChunkIdx, PallocChunkPages-2), 0}, + {1, 0, 0}, + {PallocChunkPages - 3, 0, 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + }, + "AllFreePallocChunkPages": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 1}, {PallocChunkPages - 1, 1}}, + }, + hits: []hit{ + {PallocChunkPages, PageBase(BaseChunkIdx, 0), 2 * PageSize}, + {PallocChunkPages, 0, 0}, + {1, 0, 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + }, + "StraddlePallocChunkPages": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages / 2}}, + BaseChunkIdx + 1: {{PallocChunkPages / 2, PallocChunkPages / 2}}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {{3, 100}}, + }, + hits: []hit{ + {PallocChunkPages, PageBase(BaseChunkIdx, PallocChunkPages/2), 100 * PageSize}, + {PallocChunkPages, 0, 0}, + {1, 0, 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + }, + }, + "StraddlePallocChunkPages+1": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages / 2}}, + BaseChunkIdx + 1: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + }, + hits: []hit{ + {PallocChunkPages + 1, PageBase(BaseChunkIdx, PallocChunkPages/2), (PallocChunkPages + 1) * PageSize}, + {PallocChunkPages, 0, 0}, + {1, PageBase(BaseChunkIdx+1, PallocChunkPages/2+1), PageSize}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages/2 + 2}}, + }, + }, + "AllFreePallocChunkPages*2": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + }, + hits: []hit{ + {PallocChunkPages * 2, PageBase(BaseChunkIdx, 0), 0}, + {PallocChunkPages * 2, 0, 0}, + {1, 0, 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + }, + }, + "NotContiguousPallocChunkPages*2": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 0x40: {}, + BaseChunkIdx + 0x41: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 0x40: {}, + BaseChunkIdx + 0x41: {}, + }, + hits: []hit{ + {PallocChunkPages * 2, PageBase(BaseChunkIdx+0x40, 0), 0}, + {21, PageBase(BaseChunkIdx, 0), 21 * PageSize}, + {1, PageBase(BaseChunkIdx, 21), PageSize}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 22}}, + BaseChunkIdx + 0x40: {{0, PallocChunkPages}}, + BaseChunkIdx + 0x41: {{0, PallocChunkPages}}, + }, + }, + "StraddlePallocChunkPages*2": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages / 2}}, + BaseChunkIdx + 1: {}, + BaseChunkIdx + 2: {{PallocChunkPages / 2, PallocChunkPages / 2}}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 7}}, + BaseChunkIdx + 1: {{3, 5}, {121, 10}}, + BaseChunkIdx + 2: {{PallocChunkPages/2 + 12, 2}}, + }, + hits: []hit{ + {PallocChunkPages * 
2, PageBase(BaseChunkIdx, PallocChunkPages/2), 15 * PageSize}, + {PallocChunkPages * 2, 0, 0}, + {1, 0, 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + }, + }, + "StraddlePallocChunkPages*5/4": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages * 3 / 4}}, + BaseChunkIdx + 2: {{0, PallocChunkPages * 3 / 4}}, + BaseChunkIdx + 3: {{0, 0}}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{PallocChunkPages / 2, PallocChunkPages/4 + 1}}, + BaseChunkIdx + 2: {{PallocChunkPages / 3, 1}}, + BaseChunkIdx + 3: {{PallocChunkPages * 2 / 3, 1}}, + }, + hits: []hit{ + {PallocChunkPages * 5 / 4, PageBase(BaseChunkIdx+2, PallocChunkPages*3/4), PageSize}, + {PallocChunkPages * 5 / 4, 0, 0}, + {1, PageBase(BaseChunkIdx+1, PallocChunkPages*3/4), PageSize}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages*3/4 + 1}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + BaseChunkIdx + 3: {{0, PallocChunkPages}}, + }, + }, + "AllFreePallocChunkPages*7+5": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + BaseChunkIdx + 2: {}, + BaseChunkIdx + 3: {}, + BaseChunkIdx + 4: {}, + BaseChunkIdx + 5: {}, + BaseChunkIdx + 6: {}, + BaseChunkIdx + 7: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{50, 1}}, + BaseChunkIdx + 1: {{31, 1}}, + BaseChunkIdx + 2: {{7, 1}}, + BaseChunkIdx + 3: {{200, 1}}, + BaseChunkIdx + 4: {{3, 1}}, + BaseChunkIdx + 5: {{51, 1}}, + BaseChunkIdx + 6: {{20, 1}}, + BaseChunkIdx + 7: {{1, 1}}, + }, + hits: []hit{ + {PallocChunkPages*7 + 5, PageBase(BaseChunkIdx, 0), 8 * PageSize}, + {PallocChunkPages*7 + 5, 0, 0}, + {1, PageBase(BaseChunkIdx+7, 5), 0}, + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + BaseChunkIdx + 3: {{0, PallocChunkPages}}, + BaseChunkIdx + 4: {{0, PallocChunkPages}}, + BaseChunkIdx + 5: {{0, PallocChunkPages}}, + BaseChunkIdx + 6: {{0, PallocChunkPages}}, + BaseChunkIdx + 7: {{0, 6}}, + }, + }, + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + b := NewPageAlloc(v.before, v.scav) + defer FreePageAlloc(b) + + for iter, i := range v.hits { + a, s := b.Alloc(i.npages) + if a != i.base { + t.Fatalf("bad alloc #%d: want base 0x%x, got 0x%x", iter+1, i.base, a) + } + if s != i.scav { + t.Fatalf("bad alloc #%d: want scav %d, got %d", iter+1, i.scav, s) + } + } + want := NewPageAlloc(v.after, v.scav) + defer FreePageAlloc(want) + + checkPageAlloc(t, want, b) + }) + } +} + +func TestPageAllocExhaust(t *testing.T) { + for _, npages := range []uintptr{1, 2, 3, 4, 5, 8, 16, 64, 1024, 1025, 2048, 2049} { + npages := npages + t.Run(fmt.Sprintf("%d", npages), func(t *testing.T) { + // Construct b. + bDesc := make(map[ChunkIdx][]BitRange) + for i := ChunkIdx(0); i < 4; i++ { + bDesc[BaseChunkIdx+i] = []BitRange{} + } + b := NewPageAlloc(bDesc, nil) + defer FreePageAlloc(b) + + // Allocate into b with npages until we've exhausted the heap. 
+ nAlloc := (PallocChunkPages * 4) / int(npages) + for i := 0; i < nAlloc; i++ { + addr := PageBase(BaseChunkIdx, uint(i)*uint(npages)) + if a, _ := b.Alloc(npages); a != addr { + t.Fatalf("bad alloc #%d: want 0x%x, got 0x%x", i+1, addr, a) + } + } + + // Check to make sure the next allocation fails. + if a, _ := b.Alloc(npages); a != 0 { + t.Fatalf("bad alloc #%d: want 0, got 0x%x", nAlloc, a) + } + + // Construct what we want the heap to look like now. + allocPages := nAlloc * int(npages) + wantDesc := make(map[ChunkIdx][]BitRange) + for i := ChunkIdx(0); i < 4; i++ { + if allocPages >= PallocChunkPages { + wantDesc[BaseChunkIdx+i] = []BitRange{{0, PallocChunkPages}} + allocPages -= PallocChunkPages + } else if allocPages > 0 { + wantDesc[BaseChunkIdx+i] = []BitRange{{0, uint(allocPages)}} + allocPages = 0 + } else { + wantDesc[BaseChunkIdx+i] = []BitRange{} + } + } + want := NewPageAlloc(wantDesc, nil) + defer FreePageAlloc(want) + + // Check to make sure the heap b matches what we want. + checkPageAlloc(t, want, b) + }) + } +} + +func TestPageAllocFree(t *testing.T) { + tests := map[string]struct { + before map[ChunkIdx][]BitRange + after map[ChunkIdx][]BitRange + npages uintptr + frees []uintptr + }{ + "Free1": { + npages: 1, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, 0), + PageBase(BaseChunkIdx, 1), + PageBase(BaseChunkIdx, 2), + PageBase(BaseChunkIdx, 3), + PageBase(BaseChunkIdx, 4), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{5, PallocChunkPages - 5}}, + }, + }, + "ManyArena1": { + npages: 1, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, PallocChunkPages/2), + PageBase(BaseChunkIdx+1, 0), + PageBase(BaseChunkIdx+2, PallocChunkPages-1), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages / 2}, {PallocChunkPages/2 + 1, PallocChunkPages/2 - 1}}, + BaseChunkIdx + 1: {{1, PallocChunkPages - 1}}, + BaseChunkIdx + 2: {{0, PallocChunkPages - 1}}, + }, + }, + "Free2": { + npages: 2, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, 0), + PageBase(BaseChunkIdx, 2), + PageBase(BaseChunkIdx, 4), + PageBase(BaseChunkIdx, 6), + PageBase(BaseChunkIdx, 8), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{10, PallocChunkPages - 10}}, + }, + }, + "Straddle2": { + npages: 2, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{PallocChunkPages - 1, 1}}, + BaseChunkIdx + 1: {{0, 1}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, PallocChunkPages-1), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + }, + }, + "Free5": { + npages: 5, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, 0), + PageBase(BaseChunkIdx, 5), + PageBase(BaseChunkIdx, 10), + PageBase(BaseChunkIdx, 15), + PageBase(BaseChunkIdx, 20), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{25, PallocChunkPages - 25}}, + }, + }, + "Free64": { + npages: 64, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, 0), + PageBase(BaseChunkIdx, 64), + PageBase(BaseChunkIdx, 128), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{192, PallocChunkPages - 192}}, + }, + }, + "Free65": { 
+ npages: 65, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, 0), + PageBase(BaseChunkIdx, 65), + PageBase(BaseChunkIdx, 130), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{195, PallocChunkPages - 195}}, + }, + }, + "FreePallocChunkPages": { + npages: PallocChunkPages, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, 0), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + }, + "StraddlePallocChunkPages": { + npages: PallocChunkPages, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{PallocChunkPages / 2, PallocChunkPages / 2}}, + BaseChunkIdx + 1: {{0, PallocChunkPages / 2}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, PallocChunkPages/2), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + }, + }, + "StraddlePallocChunkPages+1": { + npages: PallocChunkPages + 1, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, PallocChunkPages/2), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages / 2}}, + BaseChunkIdx + 1: {{PallocChunkPages/2 + 1, PallocChunkPages/2 - 1}}, + }, + }, + "FreePallocChunkPages*2": { + npages: PallocChunkPages * 2, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, 0), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + }, + }, + "StraddlePallocChunkPages*2": { + npages: PallocChunkPages * 2, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, PallocChunkPages/2), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages / 2}}, + BaseChunkIdx + 1: {}, + BaseChunkIdx + 2: {{PallocChunkPages / 2, PallocChunkPages / 2}}, + }, + }, + "AllFreePallocChunkPages*7+5": { + npages: PallocChunkPages*7 + 5, + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + BaseChunkIdx + 3: {{0, PallocChunkPages}}, + BaseChunkIdx + 4: {{0, PallocChunkPages}}, + BaseChunkIdx + 5: {{0, PallocChunkPages}}, + BaseChunkIdx + 6: {{0, PallocChunkPages}}, + BaseChunkIdx + 7: {{0, PallocChunkPages}}, + }, + frees: []uintptr{ + PageBase(BaseChunkIdx, 0), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + BaseChunkIdx + 2: {}, + BaseChunkIdx + 3: {}, + BaseChunkIdx + 4: {}, + BaseChunkIdx + 5: {}, + BaseChunkIdx + 6: {}, + BaseChunkIdx + 7: {{5, PallocChunkPages - 5}}, + }, + }, + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + b := NewPageAlloc(v.before, nil) + defer FreePageAlloc(b) + + for _, addr := range v.frees { + b.Free(addr, v.npages) + } + want := NewPageAlloc(v.after, nil) + defer FreePageAlloc(want) + + checkPageAlloc(t, want, b) + }) + } +} + +func TestPageAllocAllocAndFree(t *testing.T) { + type hit struct { + alloc bool + npages uintptr + base uintptr + } + tests := map[string]struct { + init map[ChunkIdx][]BitRange + hits []hit + }{ + // TODO(mknyszek): Write more tests here. 
+ "Chunks8": { + init: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + BaseChunkIdx + 1: {}, + BaseChunkIdx + 2: {}, + BaseChunkIdx + 3: {}, + BaseChunkIdx + 4: {}, + BaseChunkIdx + 5: {}, + BaseChunkIdx + 6: {}, + BaseChunkIdx + 7: {}, + }, + hits: []hit{ + {true, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)}, + {false, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)}, + {true, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)}, + {false, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)}, + {true, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)}, + {false, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)}, + {true, 1, PageBase(BaseChunkIdx, 0)}, + {false, 1, PageBase(BaseChunkIdx, 0)}, + {true, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)}, + }, + }, + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + b := NewPageAlloc(v.init, nil) + defer FreePageAlloc(b) + + for iter, i := range v.hits { + if i.alloc { + if a, _ := b.Alloc(i.npages); a != i.base { + t.Fatalf("bad alloc #%d: want 0x%x, got 0x%x", iter+1, i.base, a) + } + } else { + b.Free(i.base, i.npages) + } + } + }) + } +} diff --git a/libgo/go/runtime/mpagecache.go b/libgo/go/runtime/mpagecache.go new file mode 100644 index 0000000..9fc338b --- /dev/null +++ b/libgo/go/runtime/mpagecache.go @@ -0,0 +1,156 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/sys" + "unsafe" +) + +const pageCachePages = 8 * unsafe.Sizeof(pageCache{}.cache) + +// pageCache represents a per-p cache of pages the allocator can +// allocate from without a lock. More specifically, it represents +// a pageCachePages*pageSize chunk of memory with 0 or more free +// pages in it. +type pageCache struct { + base uintptr // base address of the chunk + cache uint64 // 64-bit bitmap representing free pages (1 means free) + scav uint64 // 64-bit bitmap representing scavenged pages (1 means scavenged) +} + +// empty returns true if the pageCache has any free pages, and false +// otherwise. +func (c *pageCache) empty() bool { + return c.cache == 0 +} + +// alloc allocates npages from the page cache and is the main entry +// point for allocation. +// +// Returns a base address and the amount of scavenged memory in the +// allocated region in bytes. +// +// Returns a base address of zero on failure, in which case the +// amount of scavenged memory should be ignored. +func (c *pageCache) alloc(npages uintptr) (uintptr, uintptr) { + if c.cache == 0 { + return 0, 0 + } + if npages == 1 { + i := uintptr(sys.TrailingZeros64(c.cache)) + scav := (c.scav >> i) & 1 + c.cache &^= 1 << i // set bit to mark in-use + c.scav &^= 1 << i // clear bit to mark unscavenged + return c.base + i*pageSize, uintptr(scav) * pageSize + } + return c.allocN(npages) +} + +// allocN is a helper which attempts to allocate npages worth of pages +// from the cache. It represents the general case for allocating from +// the page cache. +// +// Returns a base address and the amount of scavenged memory in the +// allocated region in bytes. 
+func (c *pageCache) allocN(npages uintptr) (uintptr, uintptr) { + i := findBitRange64(c.cache, uint(npages)) + if i >= 64 { + return 0, 0 + } + mask := ((uint64(1) << npages) - 1) << i + scav := sys.OnesCount64(c.scav & mask) + c.cache &^= mask // mark in-use bits + c.scav &^= mask // clear scavenged bits + return c.base + uintptr(i*pageSize), uintptr(scav) * pageSize +} + +// flush empties out unallocated free pages in the given cache +// into s. Then, it clears the cache, such that empty returns +// true. +// +// s.mheapLock must be held or the world must be stopped. +func (c *pageCache) flush(s *pageAlloc) { + if c.empty() { + return + } + ci := chunkIndex(c.base) + pi := chunkPageIndex(c.base) + + // This method is called very infrequently, so just do the + // slower, safer thing by iterating over each bit individually. + for i := uint(0); i < 64; i++ { + if c.cache&(1<<i) != 0 { + s.chunkOf(ci).free1(pi + i) + } + if c.scav&(1<<i) != 0 { + s.chunkOf(ci).scavenged.setRange(pi+i, 1) + } + } + // Since this is a lot like a free, we need to make sure + // we update the searchAddr just like free does. + if s.compareSearchAddrTo(c.base) < 0 { + s.searchAddr = c.base + } + s.update(c.base, pageCachePages, false, false) + *c = pageCache{} +} + +// allocToCache acquires a pageCachePages-aligned chunk of free pages which +// may not be contiguous, and returns a pageCache structure which owns the +// chunk. +// +// s.mheapLock must be held. +func (s *pageAlloc) allocToCache() pageCache { + // If the searchAddr refers to a region which has a higher address than + // any known chunk, then we know we're out of memory. + if chunkIndex(s.searchAddr) >= s.end { + return pageCache{} + } + c := pageCache{} + ci := chunkIndex(s.searchAddr) // chunk index + if s.summary[len(s.summary)-1][ci] != 0 { + // Fast path: there's free pages at or near the searchAddr address. + chunk := s.chunkOf(ci) + j, _ := chunk.find(1, chunkPageIndex(s.searchAddr)) + if j < 0 { + throw("bad summary data") + } + c = pageCache{ + base: chunkBase(ci) + alignDown(uintptr(j), 64)*pageSize, + cache: ^chunk.pages64(j), + scav: chunk.scavenged.block64(j), + } + } else { + // Slow path: the searchAddr address had nothing there, so go find + // the first free page the slow way. + addr, _ := s.find(1) + if addr == 0 { + // We failed to find adequate free space, so mark the searchAddr as OoM + // and return an empty pageCache. + s.searchAddr = maxSearchAddr + return pageCache{} + } + ci := chunkIndex(addr) + chunk := s.chunkOf(ci) + c = pageCache{ + base: alignDown(addr, 64*pageSize), + cache: ^chunk.pages64(chunkPageIndex(addr)), + scav: chunk.scavenged.block64(chunkPageIndex(addr)), + } + } + + // Set the bits as allocated and clear the scavenged bits. + s.allocRange(c.base, pageCachePages) + + // Update as an allocation, but note that it's not contiguous. + s.update(c.base, pageCachePages, false, true) + + // We're always searching for the first free page, and we always know the + // up to pageCache size bits will be allocated, so we can always move the + // searchAddr past the cache. + s.searchAddr = c.base + pageSize*pageCachePages + return c +} diff --git a/libgo/go/runtime/mpagecache_test.go b/libgo/go/runtime/mpagecache_test.go new file mode 100644 index 0000000..6fdaa04 --- /dev/null +++ b/libgo/go/runtime/mpagecache_test.go @@ -0,0 +1,364 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
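The per-P page cache above boils down to a base address plus a 64-bit free bitmap. The standalone sketch below re-creates the core allocation step with the standard math/bits package so it can run outside the runtime: the findRun helper stands in for the runtime's findBitRange64, scavenged-bit handling is omitted, and the 8 KiB page size is an assumption chosen for the demo.

package main

import (
	"fmt"
	"math/bits"
)

const pageSize = 8192 // assumed page size for the demo

// cache mimics pageCache: bit i of free is 1 when page i of the 64-page
// block starting at base is free.
type cache struct {
	base uintptr
	free uint64
}

// findRun returns the lowest bit index at which n consecutive 1 bits start
// in c, or 64 if there is no such run (a stand-in for findBitRange64).
func findRun(c uint64, n uint) uint {
	for i := uint(0); i+n <= 64; i++ {
		if c>>i&(1<<n-1) == 1<<n-1 {
			return i
		}
	}
	return 64
}

// alloc takes npages consecutive free pages out of the cache and returns
// their base address, or 0 if the cache cannot satisfy the request.
func (c *cache) alloc(npages uint) uintptr {
	if c.free == 0 {
		return 0
	}
	var i uint
	if npages == 1 {
		i = uint(bits.TrailingZeros64(c.free)) // fast path: first free page
	} else if i = findRun(c.free, npages); i >= 64 {
		return 0
	}
	c.free &^= (1<<npages - 1) << i // mark the pages allocated
	return c.base + uintptr(i)*pageSize
}

func main() {
	c := &cache{base: 0x100000, free: 0b1111_0110}
	fmt.Printf("%#x\n", c.alloc(2)) // pages 1-2  -> 0x102000
	fmt.Printf("%#x\n", c.alloc(3)) // pages 4-6  -> 0x108000
	fmt.Printf("%#x\n", c.alloc(3)) // no 3-page run left -> 0x0
}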
+ +package runtime_test + +import ( + "math/rand" + . "runtime" + "testing" +) + +func checkPageCache(t *testing.T, got, want PageCache) { + if got.Base() != want.Base() { + t.Errorf("bad pageCache base: got 0x%x, want 0x%x", got.Base(), want.Base()) + } + if got.Cache() != want.Cache() { + t.Errorf("bad pageCache bits: got %016x, want %016x", got.Base(), want.Base()) + } + if got.Scav() != want.Scav() { + t.Errorf("bad pageCache scav: got %016x, want %016x", got.Scav(), want.Scav()) + } +} + +func TestPageCacheAlloc(t *testing.T) { + base := PageBase(BaseChunkIdx, 0) + type hit struct { + npages uintptr + base uintptr + scav uintptr + } + tests := map[string]struct { + cache PageCache + hits []hit + }{ + "Empty": { + cache: NewPageCache(base, 0, 0), + hits: []hit{ + {1, 0, 0}, + {2, 0, 0}, + {3, 0, 0}, + {4, 0, 0}, + {5, 0, 0}, + {11, 0, 0}, + {12, 0, 0}, + {16, 0, 0}, + {27, 0, 0}, + {32, 0, 0}, + {43, 0, 0}, + {57, 0, 0}, + {64, 0, 0}, + {121, 0, 0}, + }, + }, + "Lo1": { + cache: NewPageCache(base, 0x1, 0x1), + hits: []hit{ + {1, base, PageSize}, + {1, 0, 0}, + {10, 0, 0}, + }, + }, + "Hi1": { + cache: NewPageCache(base, 0x1<<63, 0x1), + hits: []hit{ + {1, base + 63*PageSize, 0}, + {1, 0, 0}, + {10, 0, 0}, + }, + }, + "Swiss1": { + cache: NewPageCache(base, 0x20005555, 0x5505), + hits: []hit{ + {2, 0, 0}, + {1, base, PageSize}, + {1, base + 2*PageSize, PageSize}, + {1, base + 4*PageSize, 0}, + {1, base + 6*PageSize, 0}, + {1, base + 8*PageSize, PageSize}, + {1, base + 10*PageSize, PageSize}, + {1, base + 12*PageSize, PageSize}, + {1, base + 14*PageSize, PageSize}, + {1, base + 29*PageSize, 0}, + {1, 0, 0}, + {10, 0, 0}, + }, + }, + "Lo2": { + cache: NewPageCache(base, 0x3, 0x2<<62), + hits: []hit{ + {2, base, 0}, + {2, 0, 0}, + {1, 0, 0}, + }, + }, + "Hi2": { + cache: NewPageCache(base, 0x3<<62, 0x3<<62), + hits: []hit{ + {2, base + 62*PageSize, 2 * PageSize}, + {2, 0, 0}, + {1, 0, 0}, + }, + }, + "Swiss2": { + cache: NewPageCache(base, 0x3333<<31, 0x3030<<31), + hits: []hit{ + {2, base + 31*PageSize, 0}, + {2, base + 35*PageSize, 2 * PageSize}, + {2, base + 39*PageSize, 0}, + {2, base + 43*PageSize, 2 * PageSize}, + {2, 0, 0}, + }, + }, + "Hi53": { + cache: NewPageCache(base, ((uint64(1)<<53)-1)<<10, ((uint64(1)<<16)-1)<<10), + hits: []hit{ + {53, base + 10*PageSize, 16 * PageSize}, + {53, 0, 0}, + {1, 0, 0}, + }, + }, + "Full53": { + cache: NewPageCache(base, ^uint64(0), ((uint64(1)<<16)-1)<<10), + hits: []hit{ + {53, base, 16 * PageSize}, + {53, 0, 0}, + {1, base + 53*PageSize, 0}, + }, + }, + "Full64": { + cache: NewPageCache(base, ^uint64(0), ^uint64(0)), + hits: []hit{ + {64, base, 64 * PageSize}, + {64, 0, 0}, + {1, 0, 0}, + }, + }, + "FullMixed": { + cache: NewPageCache(base, ^uint64(0), ^uint64(0)), + hits: []hit{ + {5, base, 5 * PageSize}, + {7, base + 5*PageSize, 7 * PageSize}, + {1, base + 12*PageSize, 1 * PageSize}, + {23, base + 13*PageSize, 23 * PageSize}, + {63, 0, 0}, + {3, base + 36*PageSize, 3 * PageSize}, + {3, base + 39*PageSize, 3 * PageSize}, + {3, base + 42*PageSize, 3 * PageSize}, + {12, base + 45*PageSize, 12 * PageSize}, + {11, 0, 0}, + {4, base + 57*PageSize, 4 * PageSize}, + {4, 0, 0}, + {6, 0, 0}, + {36, 0, 0}, + {2, base + 61*PageSize, 2 * PageSize}, + {3, 0, 0}, + {1, base + 63*PageSize, 1 * PageSize}, + {4, 0, 0}, + {2, 0, 0}, + {62, 0, 0}, + {1, 0, 0}, + }, + }, + } + for name, test := range tests { + test := test + t.Run(name, func(t *testing.T) { + c := test.cache + for i, h := range test.hits { + b, s := c.Alloc(h.npages) + if b != h.base { + 
t.Fatalf("bad alloc base #%d: got 0x%x, want 0x%x", i, b, h.base) + } + if s != h.scav { + t.Fatalf("bad alloc scav #%d: got %d, want %d", i, s, h.scav) + } + } + }) + } +} + +func TestPageCacheFlush(t *testing.T) { + bits64ToBitRanges := func(bits uint64, base uint) []BitRange { + var ranges []BitRange + start, size := uint(0), uint(0) + for i := 0; i < 64; i++ { + if bits&(1<<i) != 0 { + if size == 0 { + start = uint(i) + base + } + size++ + } else { + if size != 0 { + ranges = append(ranges, BitRange{start, size}) + size = 0 + } + } + } + if size != 0 { + ranges = append(ranges, BitRange{start, size}) + } + return ranges + } + runTest := func(t *testing.T, base uint, cache, scav uint64) { + // Set up the before state. + beforeAlloc := map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{base, 64}}, + } + beforeScav := map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + } + b := NewPageAlloc(beforeAlloc, beforeScav) + defer FreePageAlloc(b) + + // Create and flush the cache. + c := NewPageCache(PageBase(BaseChunkIdx, base), cache, scav) + c.Flush(b) + if !c.Empty() { + t.Errorf("pageCache flush did not clear cache") + } + + // Set up the expected after state. + afterAlloc := map[ChunkIdx][]BitRange{ + BaseChunkIdx: bits64ToBitRanges(^cache, base), + } + afterScav := map[ChunkIdx][]BitRange{ + BaseChunkIdx: bits64ToBitRanges(scav, base), + } + want := NewPageAlloc(afterAlloc, afterScav) + defer FreePageAlloc(want) + + // Check to see if it worked. + checkPageAlloc(t, want, b) + } + + // Empty. + runTest(t, 0, 0, 0) + + // Full. + runTest(t, 0, ^uint64(0), ^uint64(0)) + + // Random. + for i := 0; i < 100; i++ { + // Generate random valid base within a chunk. + base := uint(rand.Intn(PallocChunkPages/64)) * 64 + + // Generate random cache. + cache := rand.Uint64() + scav := rand.Uint64() & cache + + // Run the test. 
+ runTest(t, base, cache, scav) + } +} + +func TestPageAllocAllocToCache(t *testing.T) { + tests := map[string]struct { + before map[ChunkIdx][]BitRange + scav map[ChunkIdx][]BitRange + hits []PageCache // expected base addresses and patterns + after map[ChunkIdx][]BitRange + }{ + "AllFree": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{1, 1}, {64, 64}}, + }, + hits: []PageCache{ + NewPageCache(PageBase(BaseChunkIdx, 0), ^uint64(0), 0x2), + NewPageCache(PageBase(BaseChunkIdx, 64), ^uint64(0), ^uint64(0)), + NewPageCache(PageBase(BaseChunkIdx, 128), ^uint64(0), 0), + NewPageCache(PageBase(BaseChunkIdx, 192), ^uint64(0), 0), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 256}}, + }, + }, + "ManyArena": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages - 64}}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {}, + }, + hits: []PageCache{ + NewPageCache(PageBase(BaseChunkIdx+2, PallocChunkPages-64), ^uint64(0), 0), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 1: {{0, PallocChunkPages}}, + BaseChunkIdx + 2: {{0, PallocChunkPages}}, + }, + }, + "NotContiguous": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 0xff: {{0, 0}}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 0xff: {{31, 67}}, + }, + hits: []PageCache{ + NewPageCache(PageBase(BaseChunkIdx+0xff, 0), ^uint64(0), ((uint64(1)<<33)-1)<<31), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + BaseChunkIdx + 0xff: {{0, 64}}, + }, + }, + "First": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 32}, {33, 31}, {96, 32}}, + }, + scav: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{1, 4}, {31, 5}, {66, 2}}, + }, + hits: []PageCache{ + NewPageCache(PageBase(BaseChunkIdx, 0), 1<<32, 1<<32), + NewPageCache(PageBase(BaseChunkIdx, 64), (uint64(1)<<32)-1, 0x3<<2), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, 128}}, + }, + }, + "Fail": { + before: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + hits: []PageCache{ + NewPageCache(0, 0, 0), + NewPageCache(0, 0, 0), + NewPageCache(0, 0, 0), + }, + after: map[ChunkIdx][]BitRange{ + BaseChunkIdx: {{0, PallocChunkPages}}, + }, + }, + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + b := NewPageAlloc(v.before, v.scav) + defer FreePageAlloc(b) + + for _, expect := range v.hits { + checkPageCache(t, b.AllocToCache(), expect) + if t.Failed() { + return + } + } + want := NewPageAlloc(v.after, v.scav) + defer FreePageAlloc(want) + + checkPageAlloc(t, want, b) + }) + } +} diff --git a/libgo/go/runtime/mpallocbits.go b/libgo/go/runtime/mpallocbits.go new file mode 100644 index 0000000..9d01ff8 --- /dev/null +++ b/libgo/go/runtime/mpallocbits.go @@ -0,0 +1,394 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/sys" +) + +// pageBits is a bitmap representing one bit per page in a palloc chunk. +type pageBits [pallocChunkPages / 64]uint64 + +// get returns the value of the i'th bit in the bitmap. 
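The accessors that follow all share the same addressing scheme: bit i of the chunk lives in word i/64 at bit position i%64, and a run that stays within one word can be set with a single shifted mask. A minimal standalone sketch (bitmap is a toy type invented here, not the runtime's pageBits):

package main

import "fmt"

// A toy 256-bit bitmap using the same word/bit indexing as pageBits:
// bit i lives in word i/64 at position i%64.
type bitmap [4]uint64

func (b *bitmap) get(i uint) uint { return uint((b[i/64] >> (i % 64)) & 1) }
func (b *bitmap) set(i uint)      { b[i/64] |= 1 << (i % 64) }

// setRangeOneWord sets n bits starting at i, assuming the run does not cross
// a 64-bit word boundary (the general setRange also handles leading, middle,
// and trailing words).
func (b *bitmap) setRangeOneWord(i, n uint) {
    b[i/64] |= ((uint64(1) << n) - 1) << (i % 64)
}

func main() {
    var b bitmap
    b.set(70)                 // word 1, bit 6
    b.setRangeOneWord(130, 3) // word 2, bits 2..4
    fmt.Println(b.get(70), b.get(130), b.get(132), b.get(133)) // 1 1 1 0
}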
+func (b *pageBits) get(i uint) uint { + return uint((b[i/64] >> (i % 64)) & 1) +} + +// block64 returns the 64-bit aligned block of bits containing the i'th bit. +func (b *pageBits) block64(i uint) uint64 { + return b[i/64] +} + +// set sets bit i of pageBits. +func (b *pageBits) set(i uint) { + b[i/64] |= 1 << (i % 64) +} + +// setRange sets bits in the range [i, i+n). +func (b *pageBits) setRange(i, n uint) { + _ = b[i/64] + if n == 1 { + // Fast path for the n == 1 case. + b.set(i) + return + } + // Set bits [i, j]. + j := i + n - 1 + if i/64 == j/64 { + b[i/64] |= ((uint64(1) << n) - 1) << (i % 64) + return + } + _ = b[j/64] + // Set leading bits. + b[i/64] |= ^uint64(0) << (i % 64) + for k := i/64 + 1; k < j/64; k++ { + b[k] = ^uint64(0) + } + // Set trailing bits. + b[j/64] |= (uint64(1) << (j%64 + 1)) - 1 +} + +// setAll sets all the bits of b. +func (b *pageBits) setAll() { + for i := range b { + b[i] = ^uint64(0) + } +} + +// clear clears bit i of pageBits. +func (b *pageBits) clear(i uint) { + b[i/64] &^= 1 << (i % 64) +} + +// clearRange clears bits in the range [i, i+n). +func (b *pageBits) clearRange(i, n uint) { + _ = b[i/64] + if n == 1 { + // Fast path for the n == 1 case. + b.clear(i) + return + } + // Clear bits [i, j]. + j := i + n - 1 + if i/64 == j/64 { + b[i/64] &^= ((uint64(1) << n) - 1) << (i % 64) + return + } + _ = b[j/64] + // Clear leading bits. + b[i/64] &^= ^uint64(0) << (i % 64) + for k := i/64 + 1; k < j/64; k++ { + b[k] = 0 + } + // Clear trailing bits. + b[j/64] &^= (uint64(1) << (j%64 + 1)) - 1 +} + +// clearAll frees all the bits of b. +func (b *pageBits) clearAll() { + for i := range b { + b[i] = 0 + } +} + +// popcntRange counts the number of set bits in the +// range [i, i+n). +func (b *pageBits) popcntRange(i, n uint) (s uint) { + if n == 1 { + return uint((b[i/64] >> (i % 64)) & 1) + } + _ = b[i/64] + j := i + n - 1 + if i/64 == j/64 { + return uint(sys.OnesCount64((b[i/64] >> (i % 64)) & ((1 << n) - 1))) + } + _ = b[j/64] + s += uint(sys.OnesCount64(b[i/64] >> (i % 64))) + for k := i/64 + 1; k < j/64; k++ { + s += uint(sys.OnesCount64(b[k])) + } + s += uint(sys.OnesCount64(b[j/64] & ((1 << (j%64 + 1)) - 1))) + return +} + +// pallocBits is a bitmap that tracks page allocations for at most one +// palloc chunk. +// +// The precise representation is an implementation detail, but for the +// sake of documentation, 0s are free pages and 1s are allocated pages. +type pallocBits pageBits + +// consec8tab is a table containing the number of consecutive +// zero bits for any uint8 value. +// +// The table is generated by calling consec8(i) for each +// possible uint8 value, which is defined as: +// +// // consec8 counts the maximum number of consecutive 0 bits +// // in a uint8. 
+// func consec8(n uint8) int { +// n = ^n +// i := 0 +// for n != 0 { +// n &= (n << 1) +// i++ +// } +// return i +// } +var consec8tab = [256]uint{ + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, + 4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, + 6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, + 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, + 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1, + 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, + 7, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, + 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1, + 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, + 6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, + 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, + 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1, + 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 0, +} + +// summarize returns a packed summary of the bitmap in pallocBits. +func (b *pallocBits) summarize() pallocSum { + // TODO(mknyszek): There may be something more clever to be done + // here to make the summarize operation more efficient. For example, + // we can compute start and end with 64-bit wide operations easily, + // but max is a bit more complex. Perhaps there exists some way to + // leverage the 64-bit start and end to our advantage? + var start, max, end uint + for i := 0; i < len(b); i++ { + a := b[i] + for j := 0; j < 64; j += 8 { + k := uint8(a >> j) + + // Compute start. + si := uint(sys.TrailingZeros8(k)) + if start == uint(i*64+j) { + start += si + } + + // Compute max. + if end+si > max { + max = end + si + } + if mi := consec8tab[k]; mi > max { + max = mi + } + + // Compute end. + if k == 0 { + end += 8 + } else { + end = uint(sys.LeadingZeros8(k)) + } + } + } + return packPallocSum(start, max, end) +} + +// find searches for npages contiguous free pages in pallocBits and returns +// the index where that run starts, as well as the index of the first free page +// it found in the search. searchIdx represents the first known free page and +// where to begin the search from. +// +// If find fails to find any free space, it returns an index of ^uint(0) and +// the new searchIdx should be ignored. +// +// The returned searchIdx is always the index of the first free page found +// in this bitmap during the search, except if npages == 1, in which +// case it will be the index just after the first free page, because the +// index returned as the first result is assumed to be allocated and so +// represents a minor optimization for that case. +func (b *pallocBits) find(npages uintptr, searchIdx uint) (uint, uint) { + if npages == 1 { + addr := b.find1(searchIdx) + // Return a searchIdx of addr + 1 since we assume addr will be + // allocated. + return addr, addr + 1 + } else if npages <= 64 { + return b.findSmallN(npages, searchIdx) + } + return b.findLargeN(npages, searchIdx) +} + +// find1 is a helper for find which searches for a single free page +// in the pallocBits and returns the index. +// +// See find for an explanation of the searchIdx parameter. 
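The consec8 reference routine quoted in the comment above is runnable as-is; the n &= n << 1 step shortens every run of 1 bits by one per iteration, so the number of iterations is the longest run length. A self-contained version for experimentation:

package main

import "fmt"

// consecZero8 is the reference routine described in the comment above: the
// length of the longest run of 0 bits in an 8-bit value, computed by
// repeatedly ANDing the inverted value with itself shifted left by one.
func consecZero8(n uint8) uint {
    n = ^n // now count the longest run of 1 bits
    i := uint(0)
    for n != 0 {
        n &= n << 1 // each pass trims one bit off every run of 1s
        i++
    }
    return i
}

func main() {
    fmt.Println(consecZero8(0x00)) // 8
    fmt.Println(consecZero8(0xa5)) // 2 (0xa5 = 10100101)
    fmt.Println(consecZero8(0xff)) // 0
}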
+func (b *pallocBits) find1(searchIdx uint) uint { + for i := searchIdx / 64; i < uint(len(b)); i++ { + x := b[i] + if x == ^uint64(0) { + continue + } + return i*64 + uint(sys.TrailingZeros64(^x)) + } + return ^uint(0) +} + +// findSmallN is a helper for find which searches for npages contiguous free pages +// in this pallocBits and returns the index where that run of contiguous pages +// starts as well as the index of the first free page it finds in its search. +// +// See find for an explanation of the searchIdx parameter. +// +// Returns a ^uint(0) index on failure and the new searchIdx should be ignored. +// +// findSmallN assumes npages <= 64, where any such contiguous run of pages +// crosses at most one aligned 64-bit boundary in the bits. +func (b *pallocBits) findSmallN(npages uintptr, searchIdx uint) (uint, uint) { + end, newSearchIdx := uint(0), ^uint(0) + for i := searchIdx / 64; i < uint(len(b)); i++ { + bi := b[i] + if bi == ^uint64(0) { + end = 0 + continue + } + // First see if we can pack our allocation in the trailing + // zeros plus the end of the last 64 bits. + start := uint(sys.TrailingZeros64(bi)) + if newSearchIdx == ^uint(0) { + // The new searchIdx is going to be at these 64 bits after any + // 1s we file, so count trailing 1s. + newSearchIdx = i*64 + uint(sys.TrailingZeros64(^bi)) + } + if end+start >= uint(npages) { + return i*64 - end, newSearchIdx + } + // Next, check the interior of the 64-bit chunk. + j := findBitRange64(^bi, uint(npages)) + if j < 64 { + return i*64 + j, newSearchIdx + } + end = uint(sys.LeadingZeros64(bi)) + } + return ^uint(0), newSearchIdx +} + +// findLargeN is a helper for find which searches for npages contiguous free pages +// in this pallocBits and returns the index where that run starts, as well as the +// index of the first free page it found it its search. +// +// See alloc for an explanation of the searchIdx parameter. +// +// Returns a ^uint(0) index on failure and the new searchIdx should be ignored. +// +// findLargeN assumes npages > 64, where any such run of free pages +// crosses at least one aligned 64-bit boundary in the bits. +func (b *pallocBits) findLargeN(npages uintptr, searchIdx uint) (uint, uint) { + start, size, newSearchIdx := ^uint(0), uint(0), ^uint(0) + for i := searchIdx / 64; i < uint(len(b)); i++ { + x := b[i] + if x == ^uint64(0) { + size = 0 + continue + } + if newSearchIdx == ^uint(0) { + // The new searchIdx is going to be at these 64 bits after any + // 1s we file, so count trailing 1s. + newSearchIdx = i*64 + uint(sys.TrailingZeros64(^x)) + } + if size == 0 { + size = uint(sys.LeadingZeros64(x)) + start = i*64 + 64 - size + continue + } + s := uint(sys.TrailingZeros64(x)) + if s+size >= uint(npages) { + size += s + return start, newSearchIdx + } + if s < 64 { + size = uint(sys.LeadingZeros64(x)) + start = i*64 + 64 - size + continue + } + size += 64 + } + if size < uint(npages) { + return ^uint(0), newSearchIdx + } + return start, newSearchIdx +} + +// allocRange allocates the range [i, i+n). +func (b *pallocBits) allocRange(i, n uint) { + (*pageBits)(b).setRange(i, n) +} + +// allocAll allocates all the bits of b. +func (b *pallocBits) allocAll() { + (*pageBits)(b).setAll() +} + +// free1 frees a single page in the pallocBits at i. +func (b *pallocBits) free1(i uint) { + (*pageBits)(b).clear(i) +} + +// free frees the range [i, i+n) of pages in the pallocBits. +func (b *pallocBits) free(i, n uint) { + (*pageBits)(b).clearRange(i, n) +} + +// freeAll frees all the bits of b. 
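find, findSmallN, and findLargeN above all answer the same question with different amounts of cleverness: where does the first run of npages free (zero) pages start? A brute-force reference, useful for checking intuition but deliberately not how the runtime does it (findFree and its bit-at-a-time scan are illustration only):

package main

import "fmt"

// findFree returns the index of the first run of n consecutive 0 bits in the
// words, or ^uint(0) if no such run exists.
func findFree(words []uint64, n uint) uint {
    total := uint(len(words)) * 64
    run := uint(0)
    for i := uint(0); i < total; i++ {
        if words[i/64]&(1<<(i%64)) != 0 {
            run = 0 // an allocated page breaks the run
            continue
        }
        run++
        if run == n {
            return i + 1 - n
        }
    }
    return ^uint(0)
}

func main() {
    // Word 0 is fully allocated except its top 2 bits; word 1 is free.
    words := []uint64{^uint64(0) >> 2, 0}
    fmt.Println(findFree(words, 4))  // 62: the run crosses from word 0 into word 1
    fmt.Println(findFree(words, 70)) // no such run: ^uint(0)
}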
+func (b *pallocBits) freeAll() { + (*pageBits)(b).clearAll() +} + +// pages64 returns a 64-bit bitmap representing a block of 64 pages aligned +// to 64 pages. The returned block of pages is the one containing the i'th +// page in this pallocBits. Each bit represents whether the page is in-use. +func (b *pallocBits) pages64(i uint) uint64 { + return (*pageBits)(b).block64(i) +} + +// findBitRange64 returns the bit index of the first set of +// n consecutive 1 bits. If no consecutive set of 1 bits of +// size n may be found in c, then it returns an integer >= 64. +func findBitRange64(c uint64, n uint) uint { + i := uint(0) + cont := uint(sys.TrailingZeros64(^c)) + for cont < n && i < 64 { + i += cont + i += uint(sys.TrailingZeros64(c >> i)) + cont = uint(sys.TrailingZeros64(^(c >> i))) + } + return i +} + +// pallocData encapsulates pallocBits and a bitmap for +// whether or not a given page is scavenged in a single +// structure. It's effectively a pallocBits with +// additional functionality. +// +// Update the comment on (*pageAlloc).chunks should this +// structure change. +type pallocData struct { + pallocBits + scavenged pageBits +} + +// allocRange sets bits [i, i+n) in the bitmap to 1 and +// updates the scavenged bits appropriately. +func (m *pallocData) allocRange(i, n uint) { + // Clear the scavenged bits when we alloc the range. + m.pallocBits.allocRange(i, n) + m.scavenged.clearRange(i, n) +} + +// allocAll sets every bit in the bitmap to 1 and updates +// the scavenged bits appropriately. +func (m *pallocData) allocAll() { + // Clear the scavenged bits when we alloc the range. + m.pallocBits.allocAll() + m.scavenged.clearAll() +} diff --git a/libgo/go/runtime/mpallocbits_test.go b/libgo/go/runtime/mpallocbits_test.go new file mode 100644 index 0000000..71a29f3 --- /dev/null +++ b/libgo/go/runtime/mpallocbits_test.go @@ -0,0 +1,510 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "fmt" + "math/rand" + . "runtime" + "testing" +) + +// Ensures that got and want are the same, and if not, reports +// detailed diff information. +func checkPallocBits(t *testing.T, got, want *PallocBits) bool { + d := DiffPallocBits(got, want) + if len(d) != 0 { + t.Errorf("%d range(s) different", len(d)) + for _, bits := range d { + t.Logf("\t@ bit index %d", bits.I) + t.Logf("\t| got: %s", StringifyPallocBits(got, bits)) + t.Logf("\t| want: %s", StringifyPallocBits(want, bits)) + } + return false + } + return true +} + +// makePallocBits produces an initialized PallocBits by setting +// the ranges in s to 1 and the rest to zero. +func makePallocBits(s []BitRange) *PallocBits { + b := new(PallocBits) + for _, v := range s { + b.AllocRange(v.I, v.N) + } + return b +} + +// Ensures that PallocBits.AllocRange works, which is a fundamental +// method used for testing and initialization since it's used by +// makePallocBits. 
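findBitRange64 above scans a single word by alternating trailing-zero counts over the value and its complement. A standalone copy of that logic, using math/bits in place of runtime/internal/sys:

package main

import (
    "fmt"
    "math/bits"
)

// bitRange64 mirrors findBitRange64: the index of the first run of n
// consecutive 1 bits in c, or a value >= 64 if there is none.
func bitRange64(c uint64, n uint) uint {
    i := uint(0)
    cont := uint(bits.TrailingZeros64(^c)) // length of the run of 1s at bit 0
    for cont < n && i < 64 {
        i += cont                               // skip the too-short run of 1s
        i += uint(bits.TrailingZeros64(c >> i)) // skip the following run of 0s
        cont = uint(bits.TrailingZeros64(^(c >> i)))
    }
    return i
}

func main() {
    fmt.Println(bitRange64(0b1111_0110, 2)) // 1
    fmt.Println(bitRange64(0b1111_0110, 3)) // 4
    fmt.Println(bitRange64(0b1111_0110, 5)) // 72 (>= 64, so no run of 5)
}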
+func TestPallocBitsAllocRange(t *testing.T) { + test := func(t *testing.T, i, n uint, want *PallocBits) { + checkPallocBits(t, makePallocBits([]BitRange{{i, n}}), want) + } + t.Run("OneLow", func(t *testing.T) { + want := new(PallocBits) + want[0] = 0x1 + test(t, 0, 1, want) + }) + t.Run("OneHigh", func(t *testing.T) { + want := new(PallocBits) + want[PallocChunkPages/64-1] = 1 << 63 + test(t, PallocChunkPages-1, 1, want) + }) + t.Run("Inner", func(t *testing.T) { + want := new(PallocBits) + want[2] = 0x3e + test(t, 129, 5, want) + }) + t.Run("Aligned", func(t *testing.T) { + want := new(PallocBits) + want[2] = ^uint64(0) + want[3] = ^uint64(0) + test(t, 128, 128, want) + }) + t.Run("Begin", func(t *testing.T) { + want := new(PallocBits) + want[0] = ^uint64(0) + want[1] = ^uint64(0) + want[2] = ^uint64(0) + want[3] = ^uint64(0) + want[4] = ^uint64(0) + want[5] = 0x1 + test(t, 0, 321, want) + }) + t.Run("End", func(t *testing.T) { + want := new(PallocBits) + want[PallocChunkPages/64-1] = ^uint64(0) + want[PallocChunkPages/64-2] = ^uint64(0) + want[PallocChunkPages/64-3] = ^uint64(0) + want[PallocChunkPages/64-4] = 1 << 63 + test(t, PallocChunkPages-(64*3+1), 64*3+1, want) + }) + t.Run("All", func(t *testing.T) { + want := new(PallocBits) + for i := range want { + want[i] = ^uint64(0) + } + test(t, 0, PallocChunkPages, want) + }) +} + +// Inverts every bit in the PallocBits. +func invertPallocBits(b *PallocBits) { + for i := range b { + b[i] = ^b[i] + } +} + +// Ensures two packed summaries are identical, and reports a detailed description +// of the difference if they're not. +func checkPallocSum(t *testing.T, got, want PallocSum) { + if got.Start() != want.Start() { + t.Errorf("inconsistent start: got %d, want %d", got.Start(), want.Start()) + } + if got.Max() != want.Max() { + t.Errorf("inconsistent max: got %d, want %d", got.Max(), want.Max()) + } + if got.End() != want.End() { + t.Errorf("inconsistent end: got %d, want %d", got.End(), want.End()) + } +} + +func TestMallocBitsPopcntRange(t *testing.T) { + type test struct { + i, n uint // bit range to popcnt over. + want uint // expected popcnt result on that range. + } + tests := map[string]struct { + init []BitRange // bit ranges to set to 1 in the bitmap. + tests []test // a set of popcnt tests to run over the bitmap. 
+ }{ + "None": { + tests: []test{ + {0, 1, 0}, + {5, 3, 0}, + {2, 11, 0}, + {PallocChunkPages/4 + 1, PallocChunkPages / 2, 0}, + {0, PallocChunkPages, 0}, + }, + }, + "All": { + init: []BitRange{{0, PallocChunkPages}}, + tests: []test{ + {0, 1, 1}, + {5, 3, 3}, + {2, 11, 11}, + {PallocChunkPages/4 + 1, PallocChunkPages / 2, PallocChunkPages / 2}, + {0, PallocChunkPages, PallocChunkPages}, + }, + }, + "Half": { + init: []BitRange{{PallocChunkPages / 2, PallocChunkPages / 2}}, + tests: []test{ + {0, 1, 0}, + {5, 3, 0}, + {2, 11, 0}, + {PallocChunkPages/2 - 1, 1, 0}, + {PallocChunkPages / 2, 1, 1}, + {PallocChunkPages/2 + 10, 1, 1}, + {PallocChunkPages/2 - 1, 2, 1}, + {PallocChunkPages / 4, PallocChunkPages / 4, 0}, + {PallocChunkPages / 4, PallocChunkPages/4 + 1, 1}, + {PallocChunkPages/4 + 1, PallocChunkPages / 2, PallocChunkPages/4 + 1}, + {0, PallocChunkPages, PallocChunkPages / 2}, + }, + }, + "OddBound": { + init: []BitRange{{0, 111}}, + tests: []test{ + {0, 1, 1}, + {5, 3, 3}, + {2, 11, 11}, + {110, 2, 1}, + {99, 50, 12}, + {110, 1, 1}, + {111, 1, 0}, + {99, 1, 1}, + {120, 1, 0}, + {PallocChunkPages / 2, PallocChunkPages / 2, 0}, + {0, PallocChunkPages, 111}, + }, + }, + "Scattered": { + init: []BitRange{ + {1, 3}, {5, 1}, {7, 1}, {10, 2}, {13, 1}, {15, 4}, + {21, 1}, {23, 1}, {26, 2}, {30, 5}, {36, 2}, {40, 3}, + {44, 6}, {51, 1}, {53, 2}, {58, 3}, {63, 1}, {67, 2}, + {71, 10}, {84, 1}, {89, 7}, {99, 2}, {103, 1}, {107, 2}, + {111, 1}, {113, 1}, {115, 1}, {118, 1}, {120, 2}, {125, 5}, + }, + tests: []test{ + {0, 11, 6}, + {0, 64, 39}, + {13, 64, 40}, + {64, 64, 34}, + {0, 128, 73}, + {1, 128, 74}, + {0, PallocChunkPages, 75}, + }, + }, + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + b := makePallocBits(v.init) + for _, h := range v.tests { + if got := b.PopcntRange(h.i, h.n); got != h.want { + t.Errorf("bad popcnt (i=%d, n=%d): got %d, want %d", h.i, h.n, got, h.want) + } + } + }) + } +} + +// Ensures computing bit summaries works as expected by generating random +// bitmaps and checking against a reference implementation. +func TestPallocBitsSummarizeRandom(t *testing.T) { + b := new(PallocBits) + for i := 0; i < 1000; i++ { + // Randomize bitmap. + for i := range b { + b[i] = rand.Uint64() + } + // Check summary against reference implementation. + checkPallocSum(t, b.Summarize(), SummarizeSlow(b)) + } +} + +// Ensures computing bit summaries works as expected. +func TestPallocBitsSummarize(t *testing.T) { + var emptySum = PackPallocSum(PallocChunkPages, PallocChunkPages, PallocChunkPages) + type test struct { + free []BitRange // Ranges of free (zero) bits. 
+ hits []PallocSum + } + tests := make(map[string]test) + tests["NoneFree"] = test{ + free: []BitRange{}, + hits: []PallocSum{ + PackPallocSum(0, 0, 0), + }, + } + tests["OnlyStart"] = test{ + free: []BitRange{{0, 10}}, + hits: []PallocSum{ + PackPallocSum(10, 10, 0), + }, + } + tests["OnlyEnd"] = test{ + free: []BitRange{{PallocChunkPages - 40, 40}}, + hits: []PallocSum{ + PackPallocSum(0, 40, 40), + }, + } + tests["StartAndEnd"] = test{ + free: []BitRange{{0, 11}, {PallocChunkPages - 23, 23}}, + hits: []PallocSum{ + PackPallocSum(11, 23, 23), + }, + } + tests["StartMaxEnd"] = test{ + free: []BitRange{{0, 4}, {50, 100}, {PallocChunkPages - 4, 4}}, + hits: []PallocSum{ + PackPallocSum(4, 100, 4), + }, + } + tests["OnlyMax"] = test{ + free: []BitRange{{1, 20}, {35, 241}, {PallocChunkPages - 50, 30}}, + hits: []PallocSum{ + PackPallocSum(0, 241, 0), + }, + } + tests["MultiMax"] = test{ + free: []BitRange{{35, 2}, {40, 5}, {100, 5}}, + hits: []PallocSum{ + PackPallocSum(0, 5, 0), + }, + } + tests["One"] = test{ + free: []BitRange{{2, 1}}, + hits: []PallocSum{ + PackPallocSum(0, 1, 0), + }, + } + tests["AllFree"] = test{ + free: []BitRange{{0, PallocChunkPages}}, + hits: []PallocSum{ + emptySum, + }, + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + b := makePallocBits(v.free) + // In the PallocBits we create 1's represent free spots, but in our actual + // PallocBits 1 means not free, so invert. + invertPallocBits(b) + for _, h := range v.hits { + checkPallocSum(t, b.Summarize(), h) + } + }) + } +} + +// Benchmarks how quickly we can summarize a PallocBits. +func BenchmarkPallocBitsSummarize(b *testing.B) { + buf0 := new(PallocBits) + buf1 := new(PallocBits) + for i := 0; i < len(buf1); i++ { + buf1[i] = ^uint64(0) + } + bufa := new(PallocBits) + for i := 0; i < len(bufa); i++ { + bufa[i] = 0xaa + } + for _, buf := range []*PallocBits{buf0, buf1, bufa} { + b.Run(fmt.Sprintf("Unpacked%02X", buf[0]), func(b *testing.B) { + for i := 0; i < b.N; i++ { + buf.Summarize() + } + }) + } +} + +// Ensures page allocation works. 
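The PackPallocSum triples in the cases above encode (start, max, end): free pages at the front of the chunk, the longest free run anywhere, and free pages at the back. A reference computation over a plain bool slice (true meaning allocated); the real summarize works a byte at a time with the consec8 table instead:

package main

import "fmt"

// summary computes the (start, max, end) triple that summarize packs into a
// pallocSum, using a bool-per-page representation to keep the sketch
// independent of the runtime types.
func summary(alloc []bool) (start, max, end uint) {
    run := uint(0)
    for i, a := range alloc {
        if a {
            run = 0
            continue
        }
        run++
        if run > max {
            max = run
        }
        if uint(i+1) == run {
            start = run // still inside the leading free run
        }
    }
    end = run // the trailing free run, if any
    return
}

func main() {
    //               free   free   alloc free   free   free   alloc free
    alloc := []bool{false, false, true, false, false, false, true, false}
    fmt.Println(summary(alloc)) // 2 3 1
}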
+func TestPallocBitsAlloc(t *testing.T) { + tests := map[string]struct { + before []BitRange + after []BitRange + npages uintptr + hits []uint + }{ + "AllFree1": { + npages: 1, + hits: []uint{0, 1, 2, 3, 4, 5}, + after: []BitRange{{0, 6}}, + }, + "AllFree2": { + npages: 2, + hits: []uint{0, 2, 4, 6, 8, 10}, + after: []BitRange{{0, 12}}, + }, + "AllFree5": { + npages: 5, + hits: []uint{0, 5, 10, 15, 20}, + after: []BitRange{{0, 25}}, + }, + "AllFree64": { + npages: 64, + hits: []uint{0, 64, 128}, + after: []BitRange{{0, 192}}, + }, + "AllFree65": { + npages: 65, + hits: []uint{0, 65, 130}, + after: []BitRange{{0, 195}}, + }, + "SomeFree64": { + before: []BitRange{{0, 32}, {64, 32}, {100, PallocChunkPages - 100}}, + npages: 64, + hits: []uint{^uint(0)}, + after: []BitRange{{0, 32}, {64, 32}, {100, PallocChunkPages - 100}}, + }, + "NoneFree1": { + before: []BitRange{{0, PallocChunkPages}}, + npages: 1, + hits: []uint{^uint(0), ^uint(0)}, + after: []BitRange{{0, PallocChunkPages}}, + }, + "NoneFree2": { + before: []BitRange{{0, PallocChunkPages}}, + npages: 2, + hits: []uint{^uint(0), ^uint(0)}, + after: []BitRange{{0, PallocChunkPages}}, + }, + "NoneFree5": { + before: []BitRange{{0, PallocChunkPages}}, + npages: 5, + hits: []uint{^uint(0), ^uint(0)}, + after: []BitRange{{0, PallocChunkPages}}, + }, + "NoneFree65": { + before: []BitRange{{0, PallocChunkPages}}, + npages: 65, + hits: []uint{^uint(0), ^uint(0)}, + after: []BitRange{{0, PallocChunkPages}}, + }, + "ExactFit1": { + before: []BitRange{{0, PallocChunkPages/2 - 3}, {PallocChunkPages/2 - 2, PallocChunkPages/2 + 2}}, + npages: 1, + hits: []uint{PallocChunkPages/2 - 3, ^uint(0)}, + after: []BitRange{{0, PallocChunkPages}}, + }, + "ExactFit2": { + before: []BitRange{{0, PallocChunkPages/2 - 3}, {PallocChunkPages/2 - 1, PallocChunkPages/2 + 1}}, + npages: 2, + hits: []uint{PallocChunkPages/2 - 3, ^uint(0)}, + after: []BitRange{{0, PallocChunkPages}}, + }, + "ExactFit5": { + before: []BitRange{{0, PallocChunkPages/2 - 3}, {PallocChunkPages/2 + 2, PallocChunkPages/2 - 2}}, + npages: 5, + hits: []uint{PallocChunkPages/2 - 3, ^uint(0)}, + after: []BitRange{{0, PallocChunkPages}}, + }, + "ExactFit65": { + before: []BitRange{{0, PallocChunkPages/2 - 31}, {PallocChunkPages/2 + 34, PallocChunkPages/2 - 34}}, + npages: 65, + hits: []uint{PallocChunkPages/2 - 31, ^uint(0)}, + after: []BitRange{{0, PallocChunkPages}}, + }, + "SomeFree161": { + before: []BitRange{{0, 185}, {331, 1}}, + npages: 161, + hits: []uint{332}, + after: []BitRange{{0, 185}, {331, 162}}, + }, + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + b := makePallocBits(v.before) + for iter, i := range v.hits { + a, _ := b.Find(v.npages, 0) + if i != a { + t.Fatalf("find #%d picked wrong index: want %d, got %d", iter+1, i, a) + } + if i != ^uint(0) { + b.AllocRange(a, uint(v.npages)) + } + } + want := makePallocBits(v.after) + checkPallocBits(t, b, want) + }) + } +} + +// Ensures page freeing works. 
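These allocation tests drive the raw bitmap, but in the allocator proper the same operation runs on pallocData, which pairs the allocation bits with a scavenged bitmap and clears the latter on allocation. A simplified stand-in (chunkPages is an invented name, and a single 64-page word stands in for a chunk):

package main

import "fmt"

// chunkPages pairs an allocation bitmap with a scavenged bitmap; allocating a
// range also clears its scavenged bits, since allocated memory is by
// definition no longer returned to the OS.
type chunkPages struct {
    alloc, scavenged uint64
}

func (c *chunkPages) allocRange(i, n uint) {
    mask := ((uint64(1) << n) - 1) << i // assumes i+n <= 64 for this sketch
    c.alloc |= mask
    c.scavenged &^= mask
}

func main() {
    c := chunkPages{scavenged: 0xff} // pages 0..7 had been scavenged
    c.allocRange(4, 8)               // allocate pages 4..11
    fmt.Printf("alloc=%#x scavenged=%#x\n", c.alloc, c.scavenged)
    // alloc=0xff0 scavenged=0xf
}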
+func TestPallocBitsFree(t *testing.T) { + tests := map[string]struct { + beforeInv []BitRange + afterInv []BitRange + frees []uint + npages uintptr + }{ + "SomeFree": { + npages: 1, + beforeInv: []BitRange{{0, 32}, {64, 32}, {100, 1}}, + frees: []uint{32}, + afterInv: []BitRange{{0, 33}, {64, 32}, {100, 1}}, + }, + "NoneFree1": { + npages: 1, + frees: []uint{0, 1, 2, 3, 4, 5}, + afterInv: []BitRange{{0, 6}}, + }, + "NoneFree2": { + npages: 2, + frees: []uint{0, 2, 4, 6, 8, 10}, + afterInv: []BitRange{{0, 12}}, + }, + "NoneFree5": { + npages: 5, + frees: []uint{0, 5, 10, 15, 20}, + afterInv: []BitRange{{0, 25}}, + }, + "NoneFree64": { + npages: 64, + frees: []uint{0, 64, 128}, + afterInv: []BitRange{{0, 192}}, + }, + "NoneFree65": { + npages: 65, + frees: []uint{0, 65, 130}, + afterInv: []BitRange{{0, 195}}, + }, + } + for name, v := range tests { + v := v + t.Run(name, func(t *testing.T) { + b := makePallocBits(v.beforeInv) + invertPallocBits(b) + for _, i := range v.frees { + b.Free(i, uint(v.npages)) + } + want := makePallocBits(v.afterInv) + invertPallocBits(want) + checkPallocBits(t, b, want) + }) + } +} + +func TestFindBitRange64(t *testing.T) { + check := func(x uint64, n uint, result uint) { + i := FindBitRange64(x, n) + if result == ^uint(0) && i < 64 { + t.Errorf("case (%016x, %d): got %d, want failure", x, n, i) + } else if result != ^uint(0) && i != result { + t.Errorf("case (%016x, %d): got %d, want %d", x, n, i, result) + } + } + for i := uint(0); i <= 64; i++ { + check(^uint64(0), i, 0) + } + check(0, 0, 0) + for i := uint(1); i <= 64; i++ { + check(0, i, ^uint(0)) + } + check(0x8000000000000000, 1, 63) + check(0xc000010001010000, 2, 62) + check(0xc000010001030000, 2, 16) + check(0xe000030001030000, 3, 61) + check(0xe000030001070000, 3, 16) + check(0xffff03ff01070000, 16, 48) + check(0xffff03ff0107ffff, 16, 0) + check(0x0fff03ff01079fff, 16, ^uint(0)) +} diff --git a/libgo/go/runtime/mprof.go b/libgo/go/runtime/mprof.go index 132c2ff..dd257d1 100644 --- a/libgo/go/runtime/mprof.go +++ b/libgo/go/runtime/mprof.go @@ -614,10 +614,20 @@ func fixupStack(stk []uintptr, skip int, canonStack *[maxStack]uintptr, size uin // Increase the skip count to take into account the frames corresponding // to runtime.callersRaw and to the C routine that it invokes. skip += 2 + sawSigtramp := false for _, pc := range stk { // Subtract 1 from PC to undo the 1 we added in callback in // go-callers.c. - function, file, _, frames := funcfileline(pc-1, -1) + function, file, _, frames := funcfileline(pc-1, -1, false) + + // Skip an unnamed function above sigtramp, as it is + // likely the signal handler. + if sawSigtramp { + sawSigtramp = false + if function == "" { + continue + } + } // Skip split-stack functions (match by function name) skipFrame := false @@ -630,11 +640,18 @@ func fixupStack(stk []uintptr, skip int, canonStack *[maxStack]uintptr, size uin skipFrame = true } - // Skip thunks and recover functions. There is no equivalent to - // these functions in the gc toolchain. + // Skip thunks and recover functions and other functions + // specific to gccgo, that do not appear in the gc toolchain. fcn := function if hasSuffix(fcn, "..r") { skipFrame = true + } else if function == "runtime.deferreturn" || function == "runtime.sighandler" { + skipFrame = true + } else if function == "runtime.sigtramp" || function == "runtime.sigtrampgo" { + skipFrame = true + // Also skip subsequent unnamed functions, + // which will be the signal handler itself. 
+ sawSigtramp = true } else { for fcn != "" && (fcn[len(fcn)-1] >= '0' && fcn[len(fcn)-1] <= '9') { fcn = fcn[:len(fcn)-1] diff --git a/libgo/go/runtime/mranges.go b/libgo/go/runtime/mranges.go new file mode 100644 index 0000000..c14e5c7 --- /dev/null +++ b/libgo/go/runtime/mranges.go @@ -0,0 +1,147 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Address range data structure. +// +// This file contains an implementation of a data structure which +// manages ordered address ranges. + +package runtime + +import ( + "runtime/internal/sys" + "unsafe" +) + +// addrRange represents a region of address space. +type addrRange struct { + // base and limit together represent the region of address space + // [base, limit). That is, base is inclusive, limit is exclusive. + base, limit uintptr +} + +// size returns the size of the range represented in bytes. +func (a addrRange) size() uintptr { + if a.limit <= a.base { + return 0 + } + return a.limit - a.base +} + +// subtract takes the addrRange toPrune and cuts out any overlap with +// from, then returns the new range. subtract assumes that a and b +// either don't overlap at all, only overlap on one side, or are equal. +// If b is strictly contained in a, thus forcing a split, it will throw. +func (a addrRange) subtract(b addrRange) addrRange { + if a.base >= b.base && a.limit <= b.limit { + return addrRange{} + } else if a.base < b.base && a.limit > b.limit { + throw("bad prune") + } else if a.limit > b.limit && a.base < b.limit { + a.base = b.limit + } else if a.base < b.base && a.limit > b.base { + a.limit = b.base + } + return a +} + +// addrRanges is a data structure holding a collection of ranges of +// address space. +// +// The ranges are coalesced eagerly to reduce the +// number ranges it holds. +// +// The slice backing store for this field is persistentalloc'd +// and thus there is no way to free it. +// +// addrRanges is not thread-safe. +type addrRanges struct { + // ranges is a slice of ranges sorted by base. + ranges []addrRange + + // sysStat is the stat to track allocations by this type + sysStat *uint64 +} + +func (a *addrRanges) init(sysStat *uint64) { + ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges)) + ranges.len = 0 + ranges.cap = 16 + ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), sys.PtrSize, sysStat)) + a.sysStat = sysStat +} + +// findSucc returns the first index in a such that base is +// less than the base of the addrRange at that index. +func (a *addrRanges) findSucc(base uintptr) int { + // TODO(mknyszek): Consider a binary search for large arrays. + // While iterating over these ranges is potentially expensive, + // the expected number of ranges is small, ideally just 1, + // since Go heaps are usually mostly contiguous. + for i := range a.ranges { + if base < a.ranges[i].base { + return i + } + } + return len(a.ranges) +} + +// add inserts a new address range to a. +// +// r must not overlap with any address range in a. +func (a *addrRanges) add(r addrRange) { + // The copies in this function are potentially expensive, but this data + // structure is meant to represent the Go heap. At worst, copying this + // would take ~160µs assuming a conservative copying rate of 25 GiB/s (the + // copy will almost never trigger a page fault) for a 1 TiB heap with 4 MiB + // arenas which is completely discontiguous. 
~160µs is still a lot, but in + // practice most platforms have 64 MiB arenas (which cuts this by a factor + // of 16) and Go heaps are usually mostly contiguous, so the chance that + // an addrRanges even grows to that size is extremely low. + + // Because we assume r is not currently represented in a, + // findSucc gives us our insertion index. + i := a.findSucc(r.base) + coalescesDown := i > 0 && a.ranges[i-1].limit == r.base + coalescesUp := i < len(a.ranges) && r.limit == a.ranges[i].base + if coalescesUp && coalescesDown { + // We have neighbors and they both border us. + // Merge a.ranges[i-1], r, and a.ranges[i] together into a.ranges[i-1]. + a.ranges[i-1].limit = a.ranges[i].limit + + // Delete a.ranges[i]. + copy(a.ranges[i:], a.ranges[i+1:]) + a.ranges = a.ranges[:len(a.ranges)-1] + } else if coalescesDown { + // We have a neighbor at a lower address only and it borders us. + // Merge the new space into a.ranges[i-1]. + a.ranges[i-1].limit = r.limit + } else if coalescesUp { + // We have a neighbor at a higher address only and it borders us. + // Merge the new space into a.ranges[i]. + a.ranges[i].base = r.base + } else { + // We may or may not have neighbors which don't border us. + // Add the new range. + if len(a.ranges)+1 > cap(a.ranges) { + // Grow the array. Note that this leaks the old array, but since + // we're doubling we have at most 2x waste. For a 1 TiB heap and + // 4 MiB arenas which are all discontiguous (both very conservative + // assumptions), this would waste at most 4 MiB of memory. + oldRanges := a.ranges + ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges)) + ranges.len = len(oldRanges) + 1 + ranges.cap = cap(oldRanges) * 2 + ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), sys.PtrSize, a.sysStat)) + + // Copy in the old array, but make space for the new range. + copy(a.ranges[:i], oldRanges[:i]) + copy(a.ranges[i+1:], oldRanges[i:]) + } else { + a.ranges = a.ranges[:len(a.ranges)+1] + copy(a.ranges[i+1:], a.ranges[i:]) + } + a.ranges[i] = r + } +} diff --git a/libgo/go/runtime/msize.go b/libgo/go/runtime/msize.go index 0accb83..11d06ce 100644 --- a/libgo/go/runtime/msize.go +++ b/libgo/go/runtime/msize.go @@ -21,5 +21,5 @@ func roundupsize(size uintptr) uintptr { if size+_PageSize < size { return size } - return round(size, _PageSize) + return alignUp(size, _PageSize) } diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go index cdab2ca..2d4cdbe 100644 --- a/libgo/go/runtime/mstats.go +++ b/libgo/go/runtime/mstats.go @@ -31,7 +31,7 @@ type mstats struct { nfree uint64 // number of frees // Statistics about malloc heap. - // Protected by mheap.lock + // Updated atomically, or with the world stopped. // // Like MemStats, heap_sys and heap_inuse do not count memory // in manually-managed spans. @@ -40,19 +40,22 @@ type mstats struct { heap_idle uint64 // bytes in idle spans heap_inuse uint64 // bytes in mSpanInUse spans heap_released uint64 // bytes released to the os - heap_objects uint64 // total number of allocated objects + + // heap_objects is not used by the runtime directly and instead + // computed on the fly by updatememstats. + heap_objects uint64 // total number of allocated objects // Statistics about allocation of low-level fixed-size structures. // Protected by FixAlloc locks. 
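Returning to the addrRanges.add hunk above: the interesting part is the coalescing decision, which checks whether the new range touches the neighbor below, the neighbor above, both, or neither. A plain-slice sketch of that logic (the runtime version instead grows a persistentalloc-backed array):

package main

import "fmt"

// addrRange is [base, limit), as in the runtime.
type addrRange struct{ base, limit uintptr }

// add inserts r into the sorted, non-overlapping ranges, merging with any
// neighbor whose endpoint touches r.
func add(ranges []addrRange, r addrRange) []addrRange {
    // Find the insertion point: the first entry whose base is above r.base.
    i := len(ranges)
    for j, v := range ranges {
        if r.base < v.base {
            i = j
            break
        }
    }
    down := i > 0 && ranges[i-1].limit == r.base
    up := i < len(ranges) && r.limit == ranges[i].base
    switch {
    case down && up: // r bridges the two neighbors into one range
        ranges[i-1].limit = ranges[i].limit
        return append(ranges[:i], ranges[i+1:]...)
    case down:
        ranges[i-1].limit = r.limit
        return ranges
    case up:
        ranges[i].base = r.base
        return ranges
    default:
        ranges = append(ranges, addrRange{})
        copy(ranges[i+1:], ranges[i:]) // shift to make room at i
        ranges[i] = r
        return ranges
    }
}

func main() {
    rs := []addrRange{{0x1000, 0x2000}, {0x3000, 0x4000}}
    rs = add(rs, addrRange{0x2000, 0x3000}) // touches both neighbors
    fmt.Println(rs)                         // [{4096 16384}]
}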
- stacks_inuse uint64 // bytes in manually-managed stack spans + stacks_inuse uint64 // bytes in manually-managed stack spans; updated atomically or during STW stacks_sys uint64 // only counts newosproc0 stack in mstats; differs from MemStats.StackSys mspan_inuse uint64 // mspan structures mspan_sys uint64 mcache_inuse uint64 // mcache structures mcache_sys uint64 buckhash_sys uint64 // profiling bucket hash table - gc_sys uint64 - other_sys uint64 + gc_sys uint64 // updated atomically or during STW + other_sys uint64 // updated atomically or during STW // Statistics about garbage collector. // Protected by mheap or stopping the world during GC. @@ -79,6 +82,8 @@ type mstats struct { last_gc_nanotime uint64 // last gc (monotonic time) tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly + last_next_gc uint64 // next_gc for the previous GC + last_heap_inuse uint64 // heap_inuse at mark termination of the previous GC // triggerRatio is the heap growth ratio that triggers marking. // diff --git a/libgo/go/runtime/nbpipe_pipe.go b/libgo/go/runtime/nbpipe_pipe.go new file mode 100644 index 0000000..822b294 --- /dev/null +++ b/libgo/go/runtime/nbpipe_pipe.go @@ -0,0 +1,19 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build aix darwin dragonfly + +package runtime + +func nonblockingPipe() (r, w int32, errno int32) { + r, w, errno = pipe() + if errno != 0 { + return -1, -1, errno + } + closeonexec(r) + setNonblock(r) + closeonexec(w) + setNonblock(w) + return r, w, errno +} diff --git a/libgo/go/runtime/nbpipe_pipe2.go b/libgo/go/runtime/nbpipe_pipe2.go new file mode 100644 index 0000000..e3639d9 --- /dev/null +++ b/libgo/go/runtime/nbpipe_pipe2.go @@ -0,0 +1,22 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build freebsd linux netbsd openbsd solaris + +package runtime + +func nonblockingPipe() (r, w int32, errno int32) { + r, w, errno = pipe2(_O_NONBLOCK | _O_CLOEXEC) + if errno == -_ENOSYS { + r, w, errno = pipe() + if errno != 0 { + return -1, -1, errno + } + closeonexec(r) + setNonblock(r) + closeonexec(w) + setNonblock(w) + } + return r, w, errno +} diff --git a/libgo/go/runtime/nbpipe_test.go b/libgo/go/runtime/nbpipe_test.go new file mode 100644 index 0000000..981143e --- /dev/null +++ b/libgo/go/runtime/nbpipe_test.go @@ -0,0 +1,102 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris + +package runtime_test + +import ( + "runtime" + "syscall" + "testing" + "unsafe" +) + +func TestNonblockingPipe(t *testing.T) { + t.Parallel() + + // NonblockingPipe is the test name for nonblockingPipe. 
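The nonblockingPipe helpers being tested here either ask the kernel for a non-blocking, close-on-exec pipe in one pipe2 call or fall back to pipe plus per-fd fixups. A user-space sketch of the same pattern, assuming Linux and the standard syscall package rather than the runtime's internal wrappers:

package main

import (
    "fmt"
    "syscall"
)

// nonblockingPipe (Linux-only sketch) prefers pipe2 with O_NONBLOCK|O_CLOEXEC
// and falls back to pipe plus fcntl-style fixups when pipe2 is unavailable.
func nonblockingPipe() (r, w int, err error) {
    var p [2]int
    if err = syscall.Pipe2(p[:], syscall.O_NONBLOCK|syscall.O_CLOEXEC); err == nil {
        return p[0], p[1], nil
    }
    if err != syscall.ENOSYS {
        return -1, -1, err
    }
    if err = syscall.Pipe(p[:]); err != nil {
        return -1, -1, err
    }
    for _, fd := range p {
        syscall.CloseOnExec(fd)
        if err = syscall.SetNonblock(fd, true); err != nil {
            return -1, -1, err
        }
    }
    return p[0], p[1], nil
}

func main() {
    r, w, err := nonblockingPipe()
    fmt.Println(r, w, err)
}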
+ r, w, errno := runtime.NonblockingPipe() + if errno != 0 { + t.Fatal(syscall.Errno(errno)) + } + defer func() { + runtime.Close(r) + runtime.Close(w) + }() + + checkIsPipe(t, r, w) + checkNonblocking(t, r, "reader") + checkCloseonexec(t, r, "reader") + checkNonblocking(t, w, "writer") + checkCloseonexec(t, w, "writer") +} + +func checkIsPipe(t *testing.T, r, w int32) { + bw := byte(42) + if n := runtime.Write(uintptr(w), unsafe.Pointer(&bw), 1); n != 1 { + t.Fatalf("Write(w, &b, 1) == %d, expected 1", n) + } + var br byte + if n := runtime.Read(r, unsafe.Pointer(&br), 1); n != 1 { + t.Fatalf("Read(r, &b, 1) == %d, expected 1", n) + } + if br != bw { + t.Errorf("pipe read %d, expected %d", br, bw) + } +} + +func checkNonblocking(t *testing.T, fd int32, name string) { + t.Helper() + flags, errno := fcntl(uintptr(fd), syscall.F_GETFL, 0) + if errno != 0 { + t.Errorf("fcntl(%s, F_GETFL) failed: %v", name, syscall.Errno(errno)) + } else if flags&syscall.O_NONBLOCK == 0 { + t.Errorf("O_NONBLOCK not set in %s flags %#x", name, flags) + } +} + +func checkCloseonexec(t *testing.T, fd int32, name string) { + t.Helper() + flags, errno := fcntl(uintptr(fd), syscall.F_GETFD, 0) + if errno != 0 { + t.Errorf("fcntl(%s, F_GETFD) failed: %v", name, syscall.Errno(errno)) + } else if flags&syscall.FD_CLOEXEC == 0 { + t.Errorf("FD_CLOEXEC not set in %s flags %#x", name, flags) + } +} + +func TestSetNonblock(t *testing.T) { + t.Parallel() + + r, w, errno := runtime.Pipe() + if errno != 0 { + t.Fatal(syscall.Errno(errno)) + } + defer func() { + runtime.Close(r) + runtime.Close(w) + }() + + checkIsPipe(t, r, w) + + runtime.SetNonblock(r) + runtime.SetNonblock(w) + checkNonblocking(t, r, "reader") + checkNonblocking(t, w, "writer") + + runtime.Closeonexec(r) + runtime.Closeonexec(w) + checkCloseonexec(t, r, "reader") + checkCloseonexec(t, w, "writer") +} + +//extern __go_fcntl_uintptr +func fcntlUintptr(fd, cmd, arg uintptr) (uintptr, uintptr) + +// Call fcntl libc function rather than calling syscall. +func fcntl(fd uintptr, cmd int, arg uintptr) (uintptr, syscall.Errno) { + res, errno := fcntlUintptr(fd, uintptr(cmd), arg) + return res, syscall.Errno(errno) +} diff --git a/libgo/go/runtime/netpoll.go b/libgo/go/runtime/netpoll.go index 1ce9808..d2fb775 100644 --- a/libgo/go/runtime/netpoll.go +++ b/libgo/go/runtime/netpoll.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build aix darwin dragonfly freebsd hurd js,wasm linux nacl netbsd openbsd solaris windows +// +build aix darwin dragonfly freebsd hurd js,wasm linux netbsd openbsd solaris windows package runtime @@ -15,12 +15,26 @@ import ( //go:linkname netpoll // Integrated network poller (platform-independent part). -// A particular implementation (epoll/kqueue) must define the following functions: -// func netpollinit() // to initialize the poller -// func netpollopen(fd uintptr, pd *pollDesc) int32 // to arm edge-triggered notifications -// and associate fd with pd. -// An implementation must call the following function to denote that the pd is ready. -// func netpollready(gpp **g, pd *pollDesc, mode int32) +// A particular implementation (epoll/kqueue/port/AIX/Windows) +// must define the following functions: +// +// func netpollinit() +// Initialize the poller. Only called once. +// +// func netpollopen(fd uintptr, pd *pollDesc) int32 +// Arm edge-triggered notifications for fd. The pd argument is to pass +// back to netpollready when fd is ready. Return an errno value. 
+// +// func netpoll(delta int64) gList +// Poll the network. If delta < 0, block indefinitely. If delta == 0, +// poll without blocking. If delta > 0, block for up to delta nanoseconds. +// Return a list of goroutines built by calling netpollready. +// +// func netpollBreak() +// Wake up the network poller, assumed to be blocked in netpoll. +// +// func netpollIsPollDescriptor(fd uintptr) bool +// Reports whether fd is a file descriptor used by the poller. // pollDesc contains 2 binary semaphores, rg and wg, to park reader and writer // goroutines respectively. The semaphore can be in the following states: @@ -82,15 +96,27 @@ type pollCache struct { } var ( - netpollInited uint32 + netpollInitLock mutex + netpollInited uint32 + pollcache pollCache netpollWaiters uint32 ) //go:linkname poll_runtime_pollServerInit internal..z2fpoll.runtime_pollServerInit func poll_runtime_pollServerInit() { - netpollinit() - atomic.Store(&netpollInited, 1) + netpollGenericInit() +} + +func netpollGenericInit() { + if atomic.Load(&netpollInited) == 0 { + lock(&netpollInitLock) + if netpollInited == 0 { + netpollinit() + atomic.Store(&netpollInited, 1) + } + unlock(&netpollInitLock) + } } func netpollinited() bool { @@ -102,14 +128,7 @@ func netpollinited() bool { // poll_runtime_isPollServerDescriptor reports whether fd is a // descriptor being used by netpoll. func poll_runtime_isPollServerDescriptor(fd uintptr) bool { - fds := netpolldescriptor() - if GOOS != "aix" && GOOS != "hurd" { - return fd == fds - } else { - // AIX have a pipe in its netpoll implementation. - // Therefore, two fd are returned by netpolldescriptor using a mask. - return fd == fds&0xFFFF || fd == (fds>>16)&0xFFFF - } + return netpollIsPollDescriptor(fd) } //go:linkname poll_runtime_pollOpen internal..z2fpoll.runtime_pollOpen @@ -240,13 +259,12 @@ func poll_runtime_pollSetDeadline(ctx uintptr, d int64, mode int) { if pd.rt.f == nil { if pd.rd > 0 { pd.rt.f = rtf - pd.rt.when = pd.rd // Copy current seq into the timer arg. // Timer func will check the seq against current descriptor seq, // if they differ the descriptor was reused or timers were reset. pd.rt.arg = pd pd.rt.seq = pd.rseq - addtimer(&pd.rt) + resettimer(&pd.rt, pd.rd) } } else if pd.rd != rd0 || combo != combo0 { pd.rseq++ // invalidate current timers @@ -260,10 +278,9 @@ func poll_runtime_pollSetDeadline(ctx uintptr, d int64, mode int) { if pd.wt.f == nil { if pd.wd > 0 && !combo { pd.wt.f = netpollWriteDeadline - pd.wt.when = pd.wd pd.wt.arg = pd pd.wt.seq = pd.wseq - addtimer(&pd.wt) + resettimer(&pd.wt, pd.wd) } } else if pd.wd != wd0 || combo != combo0 { pd.wseq++ // invalidate current timers @@ -325,8 +342,13 @@ func poll_runtime_pollUnblock(ctx uintptr) { } } -// make pd ready, newly runnable goroutines (if any) are added to toRun. -// May run during STW, so write barriers are not allowed. +// netpollready is called by the platform-specific netpoll function. +// It declares that the fd associated with pd is ready for I/O. +// The toRun argument is used to build a list of goroutines to return +// from netpoll. The mode argument is 'r', 'w', or 'r'+'w' to indicate +// whether the fd is ready for reading or writing or both. +// +// This may run while the world is stopped, so write barriers are not allowed. 
//go:nowritebarrier func netpollready(toRun *gList, pd *pollDesc, mode int32) { var rg, wg *g diff --git a/libgo/go/runtime/netpoll_aix.go b/libgo/go/runtime/netpoll_aix.go index 39e36c7..a00742e 100644 --- a/libgo/go/runtime/netpoll_aix.go +++ b/libgo/go/runtime/netpoll_aix.go @@ -14,18 +14,6 @@ import "unsafe" //extern poll func libc_poll(pfds *pollfd, npfds uintptr, timeout uintptr) int32 -//go:noescape -//extern pipe -func libc_pipe(fd *int32) int32 - -//extern __go_fcntl_uintptr -func fcntlUintptr(fd, cmd, arg uintptr) (uintptr, uintptr) - -func fcntl(fd, cmd int32, arg uintptr) int32 { - r, _ := fcntlUintptr(uintptr(fd), uintptr(cmd), arg) - return int32(r) -} - // pollfd represents the poll structure for AIX operating system. type pollfd struct { fd int32 @@ -49,22 +37,13 @@ var ( ) func netpollinit() { - var p [2]int32 - // Create the pipe we use to wakeup poll. - if err := libc_pipe(&p[0]); err < 0 { + r, w, errno := nonblockingPipe() + if errno != 0 { throw("netpollinit: failed to create pipe") } - rdwake = p[0] - wrwake = p[1] - - fl := uintptr(fcntl(rdwake, _F_GETFL, 0)) - fcntl(rdwake, _F_SETFL, fl|_O_NONBLOCK) - fcntl(rdwake, _F_SETFD, _FD_CLOEXEC) - - fl = uintptr(fcntl(wrwake, _F_GETFL, 0)) - fcntl(wrwake, _F_SETFL, fl|_O_NONBLOCK) - fcntl(wrwake, _F_SETFD, _FD_CLOEXEC) + rdwake = r + wrwake = w // Pre-allocate array of pollfd structures for poll. pfds = make([]pollfd, 1, 128) @@ -77,12 +56,8 @@ func netpollinit() { pds[0] = nil } -func netpolldescriptor() uintptr { - // Both fd must be returned - if rdwake > 0xFFFF || wrwake > 0xFFFF { - throw("netpolldescriptor: invalid fd number") - } - return uintptr(rdwake<<16 | wrwake) +func netpollIsPollDescriptor(fd uintptr) bool { + return fd == uintptr(rdwake) || fd == uintptr(wrwake) } // netpollwakeup writes on wrwake to wakeup poll before any changes. @@ -146,12 +121,32 @@ func netpollarm(pd *pollDesc, mode int) { unlock(&mtxset) } +// netpollBreak interrupts an epollwait. +func netpollBreak() { + netpollwakeup() +} + +// netpoll checks for ready network connections. +// Returns list of goroutines that become runnable. +// delay < 0: blocks indefinitely +// delay == 0: does not block, just polls +// delay > 0: block for up to that many nanoseconds //go:nowritebarrierrec -func netpoll(block bool) gList { - timeout := ^uintptr(0) - if !block { - timeout = 0 +func netpoll(delay int64) gList { + var timeout uintptr + if delay < 0 { + timeout = ^uintptr(0) + } else if delay == 0 { + // TODO: call poll with timeout == 0 return gList{} + } else if delay < 1e6 { + timeout = 1 + } else if delay < 1e15 { + timeout = uintptr(delay / 1e6) + } else { + // An arbitrary cap on how long to wait for a timer. + // 1e9 ms == ~11.5 days. + timeout = 1e9 } retry: lock(&mtxpoll) @@ -167,20 +162,29 @@ retry: throw("poll failed") } unlock(&mtxset) + // If a timed sleep was interrupted, just return to + // recalculate how long we should sleep now. + if timeout > 0 { + return gList{} + } goto retry } // Check if some descriptors need to be changed if n != 0 && pfds[0].revents&(_POLLIN|_POLLHUP|_POLLERR) != 0 { - var b [1]byte - for read(rdwake, unsafe.Pointer(&b[0]), 1) == 1 { + if delay != 0 { + // A netpollwakeup could be picked up by a + // non-blocking poll. Only clear the wakeup + // if blocking. 
+ var b [1]byte + for read(rdwake, unsafe.Pointer(&b[0]), 1) == 1 { + } } - // Do not look at the other fds in this case as the mode may have changed - // XXX only additions of flags are made, so maybe it is ok - unlock(&mtxset) - goto retry + // Still look at the other fds even if the mode may have + // changed, as netpollBreak might have been called. + n-- } var toRun gList - for i := 0; i < len(pfds) && n > 0; i++ { + for i := 1; i < len(pfds) && n > 0; i++ { pfd := &pfds[i] var mode int32 @@ -202,8 +206,5 @@ retry: } } unlock(&mtxset) - if block && toRun.empty() { - goto retry - } return toRun } diff --git a/libgo/go/runtime/netpoll_epoll.go b/libgo/go/runtime/netpoll_epoll.go index 885ac1f..7b215f3 100644 --- a/libgo/go/runtime/netpoll_epoll.go +++ b/libgo/go/runtime/netpoll_epoll.go @@ -22,33 +22,44 @@ func epollctl(epfd, op, fd int32, ev *epollevent) int32 //extern epoll_wait func epollwait(epfd int32, ev *epollevent, nev, timeout int32) int32 -//extern __go_fcntl_uintptr -func fcntlUintptr(fd, cmd, arg uintptr) (uintptr, uintptr) - -func closeonexec(fd int32) { - fcntlUintptr(uintptr(fd), _F_SETFD, _FD_CLOEXEC) -} - var ( epfd int32 = -1 // epoll descriptor + + netpollBreakRd, netpollBreakWr uintptr // for netpollBreak ) func netpollinit() { epfd = epollcreate1(_EPOLL_CLOEXEC) - if epfd >= 0 { - return - } - epfd = epollcreate(1024) - if epfd >= 0 { + if epfd < 0 { + epfd = epollcreate(1024) + if epfd < 0 { + println("runtime: epollcreate failed with", -epfd) + throw("runtime: netpollinit failed") + } closeonexec(epfd) - return } - println("netpollinit: failed to create epoll descriptor", errno()) - throw("netpollinit: failed to create descriptor") + r, w, cerrno := nonblockingPipe() + if cerrno != 0 { + println("runtime: pipe failed with", cerrno) + throw("runtime: pipe failed") + } + ev := epollevent{ + events: _EPOLLIN, + } + *(**uintptr)(unsafe.Pointer(&ev.data)) = &netpollBreakRd + if epollctl(epfd, _EPOLL_CTL_ADD, r, &ev) < 0 { + cerrno = int32(errno()) + } + if cerrno != 0 { + println("runtime: epollctl failed with", cerrno) + throw("runtime: epollctl failed") + } + netpollBreakRd = uintptr(r) + netpollBreakWr = uintptr(w) } -func netpolldescriptor() uintptr { - return uintptr(epfd) +func netpollIsPollDescriptor(fd uintptr) bool { + return fd == uintptr(epfd) || fd == netpollBreakRd || fd == netpollBreakWr } func netpollopen(fd uintptr, pd *pollDesc) int32 { @@ -73,15 +84,47 @@ func netpollarm(pd *pollDesc, mode int) { throw("runtime: unused") } -// polls for ready network connections -// returns list of goroutines that become runnable -func netpoll(block bool) gList { +// netpollBreak interrupts an epollwait. +func netpollBreak() { + for { + var b byte + n := write(netpollBreakWr, unsafe.Pointer(&b), 1) + if n == 1 { + break + } + if n == -_EINTR { + continue + } + if n == -_EAGAIN { + return + } + println("runtime: netpollBreak write failed with", -n) + throw("runtime: netpollBreak write failed") + } +} + +// netpoll checks for ready network connections. +// Returns list of goroutines that become runnable. 
+// delay < 0: blocks indefinitely +// delay == 0: does not block, just polls +// delay > 0: block for up to that many nanoseconds +func netpoll(delay int64) gList { if epfd == -1 { return gList{} } - waitms := int32(-1) - if !block { + var waitms int32 + if delay < 0 { + waitms = -1 + } else if delay == 0 { waitms = 0 + } else if delay < 1e6 { + waitms = 1 + } else if delay < 1e15 { + waitms = int32(delay / 1e6) + } else { + // An arbitrary cap on how long to wait for a timer. + // 1e9 ms == ~11.5 days. + waitms = 1e9 } var events [128]epollevent retry: @@ -92,6 +135,11 @@ retry: println("runtime: epollwait on fd", epfd, "failed with", e) throw("runtime: netpoll failed") } + // If a timed sleep was interrupted, just return to + // recalculate how long we should sleep now. + if waitms > 0 { + return gList{} + } goto retry } var toRun gList @@ -100,6 +148,22 @@ retry: if ev.events == 0 { continue } + + if *(**uintptr)(unsafe.Pointer(&ev.data)) == &netpollBreakRd { + if ev.events != _EPOLLIN { + println("runtime: netpoll: break fd ready for", ev.events) + throw("runtime: netpoll: break fd ready for something unexpected") + } + if delay != 0 { + // netpollBreak could be picked up by a + // nonblocking poll. Only read the byte + // if blocking. + var tmp [16]byte + read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp))) + } + continue + } + var mode int32 if ev.events&(_EPOLLIN|_EPOLLRDHUP|_EPOLLHUP|_EPOLLERR) != 0 { mode += 'r' @@ -116,8 +180,5 @@ retry: netpollready(&toRun, pd, mode) } } - if block && toRun.empty() { - goto retry - } return toRun } diff --git a/libgo/go/runtime/netpoll_fake.go b/libgo/go/runtime/netpoll_fake.go index 5b1a63a..b2af3b8 100644 --- a/libgo/go/runtime/netpoll_fake.go +++ b/libgo/go/runtime/netpoll_fake.go @@ -2,18 +2,18 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Fake network poller for NaCl and wasm/js. -// Should never be used, because NaCl and wasm/js network connections do not honor "SetNonblock". +// Fake network poller for wasm/js. +// Should never be used, because wasm/js network connections do not honor "SetNonblock". 
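Each poller backend now converts the nanosecond delay argument the same way: negative blocks, zero polls, sub-millisecond delays round up to one unit, and very long delays are capped. A sketch of the epoll variant's millisecond conversion:

package main

import "fmt"

// epollWaitMS mirrors the delay-to-milliseconds conversion used by the epoll
// netpoll above: -1 blocks, 0 polls, sub-millisecond delays round up to 1ms,
// and very long sleeps are capped.
func epollWaitMS(delay int64) int32 {
    switch {
    case delay < 0:
        return -1
    case delay == 0:
        return 0
    case delay < 1e6: // less than 1ms still needs a non-zero timeout
        return 1
    case delay < 1e15:
        return int32(delay / 1e6)
    default:
        return 1e9 // ~11.5 days, an arbitrary cap
    }
}

func main() {
    fmt.Println(epollWaitMS(-1), epollWaitMS(0), epollWaitMS(250e3), epollWaitMS(2e9))
    // -1 0 1 2000
}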
-// +build nacl js,wasm +// +build js,wasm package runtime func netpollinit() { } -func netpolldescriptor() uintptr { - return ^uintptr(0) +func netpollIsPollDescriptor(fd uintptr) bool { + return false } func netpollopen(fd uintptr, pd *pollDesc) int32 { @@ -27,6 +27,9 @@ func netpollclose(fd uintptr) int32 { func netpollarm(pd *pollDesc, mode int) { } -func netpoll(block bool) gList { +func netpollBreak() { +} + +func netpoll(delay int64) gList { return gList{} } diff --git a/libgo/go/runtime/netpoll_kqueue.go b/libgo/go/runtime/netpoll_kqueue.go index ce1acdf..9450461 100644 --- a/libgo/go/runtime/netpoll_kqueue.go +++ b/libgo/go/runtime/netpoll_kqueue.go @@ -17,16 +17,10 @@ func kqueue() int32 //extern kevent func kevent(kq int32, ch *keventt, nch uintptr, ev *keventt, nev uintptr, ts *timespec) int32 -//extern __go_fcntl_uintptr -func fcntlUintptr(fd, cmd, arg uintptr) (uintptr, uintptr) - -//go:nosplit -func closeonexec(fd int32) { - fcntlUintptr(uintptr(fd), _F_SETFD, _FD_CLOEXEC) -} - var ( kq int32 = -1 + + netpollBreakRd, netpollBreakWr uintptr // for netpollBreak ) func netpollinit() { @@ -36,10 +30,27 @@ func netpollinit() { throw("runtime: netpollinit failed") } closeonexec(kq) + r, w, errno := nonblockingPipe() + if errno != 0 { + println("runtime: pipe failed with", -errno) + throw("runtime: pipe failed") + } + ev := keventt{ + filter: _EVFILT_READ, + flags: _EV_ADD, + } + *(*uintptr)(unsafe.Pointer(&ev.ident)) = uintptr(r) + n := kevent(kq, &ev, 1, nil, 0, nil) + if n < 0 { + println("runtime: kevent failed with", -n) + throw("runtime: kevent failed") + } + netpollBreakRd = uintptr(r) + netpollBreakWr = uintptr(w) } -func netpolldescriptor() uintptr { - return uintptr(kq) +func netpollIsPollDescriptor(fd uintptr) bool { + return fd == uintptr(kq) || fd == netpollBreakRd || fd == netpollBreakWr } func netpollopen(fd uintptr, pd *pollDesc) int32 { @@ -72,15 +83,43 @@ func netpollarm(pd *pollDesc, mode int) { throw("runtime: unused") } -// Polls for ready network connections. +// netpollBreak interrupts an epollwait. +func netpollBreak() { + for { + var b byte + n := write(netpollBreakWr, unsafe.Pointer(&b), 1) + if n == 1 || n == -_EAGAIN { + break + } + if n == -_EINTR { + continue + } + println("runtime: netpollBreak write failed with", -n) + throw("runtime: netpollBreak write failed") + } +} + +// netpoll checks for ready network connections. // Returns list of goroutines that become runnable. -func netpoll(block bool) gList { +// delay < 0: blocks indefinitely +// delay == 0: does not block, just polls +// delay > 0: block for up to that many nanoseconds +func netpoll(delay int64) gList { if kq == -1 { return gList{} } var tp *timespec var ts timespec - if !block { + if delay < 0 { + tp = nil + } else if delay == 0 { + tp = &ts + } else { + ts.setNsec(delay) + if ts.tv_sec > 1e6 { + // Darwin returns EINVAL if the sleep time is too long. + ts.tv_sec = 1e6 + } tp = &ts } var events [64]keventt @@ -92,11 +131,32 @@ retry: println("runtime: kevent on fd", kq, "failed with", e) throw("runtime: netpoll failed") } + // If a timed sleep was interrupted, just return to + // recalculate how long we should sleep now. 
+ if delay > 0 { + return gList{} + } goto retry } var toRun gList for i := 0; i < int(n); i++ { ev := &events[i] + + if uintptr(ev.ident) == netpollBreakRd { + if ev.filter != _EVFILT_READ { + println("runtime: netpoll: break fd ready for", ev.filter) + throw("runtime: netpoll: break fd ready for something unexpected") + } + if delay != 0 { + // netpollBreak could be picked up by a + // nonblocking poll. Only read the byte + // if blocking. + var tmp [16]byte + read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp))) + } + continue + } + var mode int32 switch ev.filter { case _EVFILT_READ: @@ -126,8 +186,5 @@ retry: netpollready(&toRun, pd, mode) } } - if block && toRun.empty() { - goto retry - } return toRun } diff --git a/libgo/go/runtime/netpoll_solaris.go b/libgo/go/runtime/netpoll_solaris.go index 222af29..acb8bab 100644 --- a/libgo/go/runtime/netpoll_solaris.go +++ b/libgo/go/runtime/netpoll_solaris.go @@ -67,14 +67,6 @@ import "unsafe" // again we know for sure we are always talking about the same file // descriptor and can safely access the data we want (the event set). -//extern __go_fcntl_uintptr -func fcntlUintptr(fd, cmd, arg uintptr) (uintptr, uintptr) - -func fcntl(fd, cmd int32, arg uintptr) int32 { - r, _ := fcntlUintptr(uintptr(fd), uintptr(cmd), arg) - return int32(r) -} - //extern port_create func port_create() int32 @@ -88,12 +80,15 @@ func port_dissociate(port, source int32, object uintptr) int32 //extern port_getn func port_getn(port int32, evs *portevent, max uint32, nget *uint32, timeout *timespec) int32 +//extern port_alert +func port_alert(port int32, flags, events uint32, user uintptr) int32 + var portfd int32 = -1 func netpollinit() { portfd = port_create() if portfd >= 0 { - fcntl(portfd, _F_SETFD, _FD_CLOEXEC) + closeonexec(portfd) return } @@ -101,8 +96,8 @@ func netpollinit() { throw("runtime: netpollinit failed") } -func netpolldescriptor() uintptr { - return uintptr(portfd) +func netpollIsPollDescriptor(fd uintptr) bool { + return fd == uintptr(portfd) } func netpollopen(fd uintptr, pd *pollDesc) int32 { @@ -164,27 +159,68 @@ func netpollarm(pd *pollDesc, mode int) { unlock(&pd.lock) } -// polls for ready network connections -// returns list of goroutines that become runnable -func netpoll(block bool) gList { +// netpollBreak interrupts a port_getn wait. +func netpollBreak() { + // Use port_alert to put portfd into alert mode. + // This will wake up all threads sleeping in port_getn on portfd, + // and cause their calls to port_getn to return immediately. + // Further, until portfd is taken out of alert mode, + // all calls to port_getn will return immediately. + if port_alert(portfd, _PORT_ALERT_UPDATE, _POLLHUP, uintptr(unsafe.Pointer(&portfd))) < 0 { + if e := errno(); e != _EBUSY { + println("runtime: port_alert failed with", e) + throw("runtime: netpoll: port_alert failed") + } + } +} + +// netpoll checks for ready network connections. +// Returns list of goroutines that become runnable. +// delay < 0: blocks indefinitely +// delay == 0: does not block, just polls +// delay > 0: block for up to that many nanoseconds +func netpoll(delay int64) gList { if portfd == -1 { return gList{} } var wait *timespec - var zero timespec - if !block { - wait = &zero + var ts timespec + if delay < 0 { + wait = nil + } else if delay == 0 { + wait = &ts + } else { + ts.setNsec(delay) + if ts.tv_sec > 1e6 { + // An arbitrary cap on how long to wait for a timer. + // 1e6 s == ~11.5 days. 
+ ts.tv_sec = 1e6 + } + wait = &ts } var events [128]portevent retry: var n uint32 = 1 - if port_getn(portfd, &events[0], uint32(len(events)), &n, wait) < 0 { - if e := errno(); e != _EINTR { + r := port_getn(portfd, &events[0], uint32(len(events)), &n, wait) + e := errno() + if r < 0 && e == _ETIME && n > 0 { + // As per port_getn(3C), an ETIME failure does not preclude the + // delivery of some number of events. Treat a timeout failure + // with delivered events as a success. + r = 0 + } + if r < 0 { + if e != _EINTR && e != _ETIME { print("runtime: port_getn on fd ", portfd, " failed (errno=", e, ")\n") throw("runtime: netpoll failed") } + // If a timed sleep was interrupted and there are no events, + // just return to recalculate how long we should sleep now. + if delay > 0 { + return gList{} + } goto retry } @@ -192,6 +228,24 @@ retry: for i := 0; i < int(n); i++ { ev := &events[i] + if ev.portev_source == _PORT_SOURCE_ALERT { + if ev.portev_events != _POLLHUP || unsafe.Pointer(ev.portev_user) != unsafe.Pointer(&portfd) { + throw("runtime: netpoll: bad port_alert wakeup") + } + if delay != 0 { + // Now that a blocking call to netpoll + // has seen the alert, take portfd + // back out of alert mode. + // See the comment in netpollBreak. + if port_alert(portfd, 0, 0, 0) < 0 { + e := errno() + println("runtime: port_alert failed with", e) + throw("runtime: netpoll: port_alert failed") + } + } + continue + } + if ev.portev_events == 0 { continue } @@ -228,8 +282,5 @@ retry: } } - if block && toRun.empty() { - goto retry - } return toRun } diff --git a/libgo/go/runtime/netpoll_stub.go b/libgo/go/runtime/netpoll_stub.go index f585333..fe45cfb 100644 --- a/libgo/go/runtime/netpoll_stub.go +++ b/libgo/go/runtime/netpoll_stub.go @@ -6,16 +6,42 @@ package runtime +import "runtime/internal/atomic" + +var netpollInited uint32 var netpollWaiters uint32 +var netpollStubLock mutex +var netpollNote note +var netpollBroken uint32 + +func netpollGenericInit() { + atomic.Store(&netpollInited, 1) +} + +func netpollBreak() { + if atomic.Cas(&netpollBroken, 0, 1) { + notewakeup(&netpollNote) + } +} + // Polls for ready network connections. // Returns list of goroutines that become runnable. -func netpoll(block bool) gList { +func netpoll(delay int64) gList { // Implementation for platforms that do not support // integrated network poller. + if delay != 0 { + // This lock ensures that only one goroutine tries to use + // the note. It should normally be completely uncontended. + lock(&netpollStubLock) + noteclear(&netpollNote) + atomic.Store(&netpollBroken, 0) + notetsleep(&netpollNote, delay) + unlock(&netpollStubLock) + } return gList{} } func netpollinited() bool { - return false + return atomic.Load(&netpollInited) != 0 } diff --git a/libgo/go/runtime/netpoll_windows.go b/libgo/go/runtime/netpoll_windows.go index 07ef15c..ced52cb 100644 --- a/libgo/go/runtime/netpoll_windows.go +++ b/libgo/go/runtime/netpoll_windows.go @@ -41,8 +41,8 @@ func netpollinit() { } } -func netpolldescriptor() uintptr { - return iocphandle +func netpollIsPollDescriptor(fd uintptr) bool { + return fd == iocphandle } func netpollopen(fd uintptr, pd *pollDesc) int32 { @@ -61,9 +61,19 @@ func netpollarm(pd *pollDesc, mode int) { throw("runtime: unused") } -// Polls for completed network IO. 
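The netpoll_stub.go changes above (for platforms with no integrated poller) turn a blocking netpoll into a plain timed sleep on a runtime note, and netpollBreak wakes it at most once per sleep thanks to the compare-and-swap on netpollBroken. Below is a rough portable analogy with a buffered channel standing in for the note and an atomic flag standing in for netpollBroken; it is a sketch of the idea only, not the runtime's note/mutex machinery, and all names are made up.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// breakableSleeper: a timed sleep that a single Break call can cut short,
// with an atomic flag ensuring repeated Break calls wake the sleeper at
// most once per sleep, much like netpollBroken above.
type breakableSleeper struct {
	broken uint32
	wake   chan struct{}
}

func newBreakableSleeper() *breakableSleeper {
	return &breakableSleeper{wake: make(chan struct{}, 1)}
}

// Break is the netpollBreak analogue.
func (s *breakableSleeper) Break() {
	if atomic.CompareAndSwapUint32(&s.broken, 0, 1) {
		s.wake <- struct{}{}
	}
}

// Sleep is the "netpoll with delay != 0" analogue: block for up to d,
// or until Break is called.
func (s *breakableSleeper) Sleep(d time.Duration) {
	atomic.StoreUint32(&s.broken, 0)
	select {
	case <-s.wake:
	case <-time.After(d):
	}
}

func main() {
	s := newBreakableSleeper()
	go func() {
		time.Sleep(50 * time.Millisecond)
		s.Break()
	}()
	start := time.Now()
	s.Sleep(5 * time.Second) // returns early, woken by Break
	fmt.Println("slept for about", time.Since(start).Round(time.Millisecond))
}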
+func netpollBreak() { + if stdcall4(_PostQueuedCompletionStatus, iocphandle, 0, 0, 0) == 0 { + println("runtime: netpoll: PostQueuedCompletionStatus failed (errno=", getlasterror(), ")") + throw("runtime: netpoll: PostQueuedCompletionStatus failed") + } +} + +// netpoll checks for ready network connections. // Returns list of goroutines that become runnable. -func netpoll(block bool) gList { +// delay < 0: blocks indefinitely +// delay == 0: does not block, just polls +// delay > 0: block for up to that many nanoseconds +func netpoll(delay int64) gList { var entries [64]overlappedEntry var wait, qty, key, flags, n, i uint32 var errno int32 @@ -75,23 +85,32 @@ func netpoll(block bool) gList { if iocphandle == _INVALID_HANDLE_VALUE { return gList{} } - wait = 0 - if block { + if delay < 0 { wait = _INFINITE + } else if delay == 0 { + wait = 0 + } else if delay < 1e6 { + wait = 1 + } else if delay < 1e15 { + wait = uint32(delay / 1e6) + } else { + // An arbitrary cap on how long to wait for a timer. + // 1e9 ms == ~11.5 days. + wait = 1e9 } -retry: + if _GetQueuedCompletionStatusEx != nil { n = uint32(len(entries) / int(gomaxprocs)) if n < 8 { n = 8 } - if block { + if delay != 0 { mp.blocked = true } if stdcall6(_GetQueuedCompletionStatusEx, iocphandle, uintptr(unsafe.Pointer(&entries[0])), uintptr(n), uintptr(unsafe.Pointer(&n)), uintptr(wait), 0) == 0 { mp.blocked = false errno = int32(getlasterror()) - if !block && errno == _WAIT_TIMEOUT { + if errno == _WAIT_TIMEOUT { return gList{} } println("runtime: GetQueuedCompletionStatusEx failed (errno=", errno, ")") @@ -100,24 +119,32 @@ retry: mp.blocked = false for i = 0; i < n; i++ { op = entries[i].op - errno = 0 - qty = 0 - if stdcall5(_WSAGetOverlappedResult, op.pd.fd, uintptr(unsafe.Pointer(op)), uintptr(unsafe.Pointer(&qty)), 0, uintptr(unsafe.Pointer(&flags))) == 0 { - errno = int32(getlasterror()) + if op != nil { + errno = 0 + qty = 0 + if stdcall5(_WSAGetOverlappedResult, op.pd.fd, uintptr(unsafe.Pointer(op)), uintptr(unsafe.Pointer(&qty)), 0, uintptr(unsafe.Pointer(&flags))) == 0 { + errno = int32(getlasterror()) + } + handlecompletion(&toRun, op, errno, qty) + } else { + if delay == 0 { + // Forward the notification to the + // blocked poller. + netpollBreak() + } } - handlecompletion(&toRun, op, errno, qty) } } else { op = nil errno = 0 qty = 0 - if block { + if delay != 0 { mp.blocked = true } if stdcall5(_GetQueuedCompletionStatus, iocphandle, uintptr(unsafe.Pointer(&qty)), uintptr(unsafe.Pointer(&key)), uintptr(unsafe.Pointer(&op)), uintptr(wait)) == 0 { mp.blocked = false errno = int32(getlasterror()) - if !block && errno == _WAIT_TIMEOUT { + if errno == _WAIT_TIMEOUT { return gList{} } if op == nil { @@ -127,11 +154,16 @@ retry: // dequeued failed IO packet, so report that } mp.blocked = false + if op == nil { + if delay == 0 { + // Forward the notification to the + // blocked poller. 
+ netpollBreak() + } + return gList{} + } handlecompletion(&toRun, op, errno, qty) } - if block && toRun.empty() { - goto retry - } return toRun } diff --git a/libgo/go/runtime/os3_solaris.go b/libgo/go/runtime/os3_solaris.go index d5fbccd..001feed 100644 --- a/libgo/go/runtime/os3_solaris.go +++ b/libgo/go/runtime/os3_solaris.go @@ -25,13 +25,6 @@ func getncpu() int32 { return n } -func osinit() { - ncpu = getncpu() - if physPageSize == 0 { - physPageSize = uintptr(getPageSize()) - } -} - func sysargs(argc int32, argv **byte) { executablePath = gostringnocopy(getexecname()) } diff --git a/libgo/go/runtime/os_darwin.go b/libgo/go/runtime/os_darwin.go index 498bd43..58e0412 100644 --- a/libgo/go/runtime/os_darwin.go +++ b/libgo/go/runtime/os_darwin.go @@ -6,24 +6,6 @@ package runtime import "unsafe" -//extern pipe -func libcPipe([2]int32) int32 - -func pipe() (r, w int32, e int32) { - var p [2]int32 - r := libcPipe(noescape(unsafe.Pointer(&p))) - if r < 0 { - e = int32(errno()) - } - return p[0], p[1], e -} - -//go:nosplit -func setNonblock(fd int32) { - flags := fcntlUintptr(uintptr(fd), _F_GETFL, 0) - fcntlUintptr(uintptr(fd), _F_SETFL, flags|_O_NONBLOCK) -} - type mOS struct { initialized bool mutex pthreadmutex diff --git a/libgo/go/runtime/os_freebsd_arm64.go b/libgo/go/runtime/os_freebsd_arm64.go new file mode 100644 index 0000000..51ebf9d --- /dev/null +++ b/libgo/go/runtime/os_freebsd_arm64.go @@ -0,0 +1,155 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "internal/cpu" + +const ( + hwcap_FP = 1 << 0 + hwcap_ASIMD = 1 << 1 + hwcap_EVTSTRM = 1 << 2 + hwcap_AES = 1 << 3 + hwcap_PMULL = 1 << 4 + hwcap_SHA1 = 1 << 5 + hwcap_SHA2 = 1 << 6 + hwcap_CRC32 = 1 << 7 + hwcap_ATOMICS = 1 << 8 + hwcap_FPHP = 1 << 9 + hwcap_ASIMDHP = 1 << 10 + hwcap_CPUID = 1 << 11 + hwcap_ASIMDRDM = 1 << 12 + hwcap_JSCVT = 1 << 13 + hwcap_FCMA = 1 << 14 + hwcap_LRCPC = 1 << 15 + hwcap_DCPOP = 1 << 16 + hwcap_SHA3 = 1 << 17 + hwcap_SM3 = 1 << 18 + hwcap_SM4 = 1 << 19 + hwcap_ASIMDDP = 1 << 20 + hwcap_SHA512 = 1 << 21 + hwcap_SVE = 1 << 22 + hwcap_ASIMDFHM = 1 << 23 +) + +func getisar0() uint64 +func getisar1() uint64 +func getpfr0() uint64 + +// no hwcap support on FreeBSD aarch64, we need to retrieve the info from +// ID_AA64ISAR0_EL1, ID_AA64ISAR1_EL1 and ID_AA64PFR0_EL1 +func archauxv(tag, val uintptr) { + var isar0, isar1, pfr0 uint64 + + isar0 = getisar0() + isar1 = getisar1() + pfr0 = getpfr0() + + // ID_AA64ISAR0_EL1 + switch extractBits(isar0, 4, 7) { + case 1: + cpu.HWCap |= hwcap_AES + case 2: + cpu.HWCap |= hwcap_PMULL | hwcap_AES + } + + switch extractBits(isar0, 8, 11) { + case 1: + cpu.HWCap |= hwcap_SHA1 + } + + switch extractBits(isar0, 12, 15) { + case 1: + cpu.HWCap |= hwcap_SHA2 + case 2: + cpu.HWCap |= hwcap_SHA2 | hwcap_SHA512 + } + + switch extractBits(isar0, 16, 19) { + case 1: + cpu.HWCap |= hwcap_CRC32 + } + + switch extractBits(isar0, 20, 23) { + case 2: + cpu.HWCap |= hwcap_ATOMICS + } + + switch extractBits(isar0, 28, 31) { + case 1: + cpu.HWCap |= hwcap_ASIMDRDM + } + + switch extractBits(isar0, 32, 35) { + case 1: + cpu.HWCap |= hwcap_SHA3 + } + + switch extractBits(isar0, 36, 39) { + case 1: + cpu.HWCap |= hwcap_SM3 + } + + switch extractBits(isar0, 40, 43) { + case 1: + cpu.HWCap |= hwcap_SM4 + } + + switch extractBits(isar0, 44, 47) { + case 1: + cpu.HWCap |= hwcap_ASIMDDP + } + + // ID_AA64ISAR1_EL1 + switch extractBits(isar1, 0, 3) { + 
case 1: + cpu.HWCap |= hwcap_DCPOP + } + + switch extractBits(isar1, 12, 15) { + case 1: + cpu.HWCap |= hwcap_JSCVT + } + + switch extractBits(isar1, 16, 19) { + case 1: + cpu.HWCap |= hwcap_FCMA + } + + switch extractBits(isar1, 20, 23) { + case 1: + cpu.HWCap |= hwcap_LRCPC + } + + // ID_AA64PFR0_EL1 + switch extractBits(pfr0, 16, 19) { + case 0: + cpu.HWCap |= hwcap_FP + case 1: + cpu.HWCap |= hwcap_FP | hwcap_FPHP + } + + switch extractBits(pfr0, 20, 23) { + case 0: + cpu.HWCap |= hwcap_ASIMD + case 1: + cpu.HWCap |= hwcap_ASIMD | hwcap_ASIMDHP + } + + switch extractBits(pfr0, 32, 35) { + case 1: + cpu.HWCap |= hwcap_SVE + } +} + +func extractBits(data uint64, start, end uint) uint { + return (uint)(data>>start) & ((1 << (end - start + 1)) - 1) +} + +//go:nosplit +func cputicks() int64 { + // Currently cputicks() is used in blocking profiler and to seed fastrand(). + // nanotime() is a poor approximation of CPU ticks that is enough for the profiler. + return nanotime() +} diff --git a/libgo/go/runtime/os_gccgo.go b/libgo/go/runtime/os_gccgo.go index ef33d67..ab19022 100644 --- a/libgo/go/runtime/os_gccgo.go +++ b/libgo/go/runtime/os_gccgo.go @@ -51,3 +51,45 @@ func getRandomData(r []byte) { closefd(fd) extendRandom(r, int(n)) } + +//go:noescape +//extern pipe +func libcPipe(*[2]int32) int32 + +func pipe() (r, w int32, e int32) { + var p [2]int32 + res := libcPipe(&p) + if res < 0 { + e = int32(errno()) + } + return p[0], p[1], e +} + +//go:noescape +//extern pipe2 +func libcPipe2(*[2]int32, int32) int32 + +func pipe2(flags int32) (r, w int32, e int32) { + var p [2]int32 + res := libcPipe2(&p, flags) + if res < 0 { + e = int32(errno()) + } + return p[0], p[1], e +} + +//extern __go_fcntl_uintptr +func fcntlUintptr(fd, cmd, arg uintptr) (uintptr, uintptr) + +//go:nosplit +func closeonexec(fd int32) { + fcntlUintptr(uintptr(fd), _F_SETFD, _FD_CLOEXEC) +} + +//go:nosplit +func setNonblock(fd int32) { + flags, errno := fcntlUintptr(uintptr(fd), _F_GETFL, 0) + if errno == 0 { + fcntlUintptr(uintptr(fd), _F_SETFL, flags|_O_NONBLOCK) + } +} diff --git a/libgo/go/runtime/os_illumos.go b/libgo/go/runtime/os_illumos.go new file mode 100644 index 0000000..ddcb8c9 --- /dev/null +++ b/libgo/go/runtime/os_illumos.go @@ -0,0 +1,102 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "unsafe" +) + +// Return the minimum value seen for the zone CPU cap, or 0 if no cap is +// detected. +func getcpucap() uint64 { + // The resource control block is an opaque object whose size is only + // known to libc. In practice, given the contents, it is unlikely to + // grow beyond 8KB so we'll use a static buffer of that size here. + const rblkmaxsize = 8 * 1024 + if rctlblk_size() > rblkmaxsize { + return 0 + } + + // The "zone.cpu-cap" resource control, as described in + // resource_controls(5), "sets a limit on the amount of CPU time that + // can be used by a zone. The unit used is the percentage of a single + // CPU that can be used by all user threads in a zone, expressed as an + // integer." A C string of the name must be passed to getrctl(2). + name := []byte("zone.cpu-cap\x00") + + // To iterate over the list of values for a particular resource + // control, we need two blocks: one for the previously read value and + // one for the next value. 
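extractBits, defined a little above in the new os_freebsd_arm64.go, pulls an inclusive bit field out of an ID register value; the archauxv code uses it to decode the ID_AA64ISAR0_EL1-style feature fields. A small standalone demonstration of the same helper (the logic is copied from the patch; the register value below is made up for illustration):

package main

import "fmt"

// extractBits returns bits start..end (inclusive) of data, shifted down.
func extractBits(data uint64, start, end uint) uint {
	return (uint)(data>>start) & ((1 << (end - start + 1)) - 1)
}

func main() {
	// A made-up register value: AES field (bits 4..7) = 2, SHA1 field (bits 8..11) = 1.
	var isar0 uint64 = 0x0000000000000120
	fmt.Println("AES field: ", extractBits(isar0, 4, 7))  // 2 => AES + PMULL
	fmt.Println("SHA1 field:", extractBits(isar0, 8, 11)) // 1 => SHA1
}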
+ var rblk0 [rblkmaxsize]byte + var rblk1 [rblkmaxsize]byte + rblk := &rblk0[0] + rblkprev := &rblk1[0] + + var flag uint32 = _RCTL_FIRST + var capval uint64 = 0 + + for { + if getrctl(unsafe.Pointer(&name[0]), unsafe.Pointer(rblkprev), unsafe.Pointer(rblk), flag) != 0 { + // The end of the sequence is reported as an ENOENT + // failure, but determining the CPU cap is not critical + // here. We'll treat any failure as if it were the end + // of sequence. + break + } + + lflags := rctlblk_get_local_flags(unsafe.Pointer(rblk)) + action := rctlblk_get_local_action(unsafe.Pointer(rblk), 0) + if (lflags&_RCTL_LOCAL_MAXIMAL) == 0 && action == _RCTL_LOCAL_DENY { + // This is a finite (not maximal) value representing a + // cap (deny) action. + v := rctlblk_get_value(unsafe.Pointer(rblk)) + if capval == 0 || capval > v { + capval = v + } + } + + // Swap the blocks around so that we can fetch the next value + t := rblk + rblk = rblkprev + rblkprev = t + flag = _RCTL_NEXT + } + + return capval +} + +func getncpu() int32 { + n := int32(sysconf(__SC_NPROCESSORS_ONLN)) + if n < 1 { + return 1 + } + + if cents := int32(getcpucap()); cents > 0 { + // Convert from a percentage of CPUs to a number of CPUs, + // rounding up to make use of a fractional CPU + // e.g., 336% becomes 4 CPUs + ncap := (cents + 99) / 100 + if ncap < n { + return ncap + } + } + + return n +} + +//extern getrctl +func getrctl(controlname, oldbuf, newbuf unsafe.Pointer, flags uint32) int32 + +//extern rctlblk_get_local_action +func rctlblk_get_local_action(buf, signalp unsafe.Pointer) uint32 + +//extern rctlblk_get_local_flags +func rctlblk_get_local_flags(buf unsafe.Pointer) uint32 + +//extern rctlblk_get_value +func rctlblk_get_value(buf unsafe.Pointer) uint64 + +//extern rctlblk_size +func rclblk_size() uintptr diff --git a/libgo/go/runtime/os_js.go b/libgo/go/runtime/os_js.go index ad6db18..ff0ee3a 100644 --- a/libgo/go/runtime/os_js.go +++ b/libgo/go/runtime/os_js.go @@ -12,7 +12,7 @@ import ( func exit(code int32) -func write(fd uintptr, p unsafe.Pointer, n int32) int32 { +func write1(fd uintptr, p unsafe.Pointer, n int32) int32 { if fd > 2 { throw("runtime.write to fd > 2 is unsupported") } @@ -131,7 +131,6 @@ func os_sigpipe() { func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand(). // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. - // TODO: need more entropy to better seed fastrand. return nanotime() } @@ -143,3 +142,9 @@ func syscall_now() (sec int64, nsec int32) { // gsignalStack is unused on js. type gsignalStack struct{} + +const preemptMSupported = false + +func preemptM(mp *m) { + // No threads, so nothing to do. +} diff --git a/libgo/go/runtime/os_linux_arm.go b/libgo/go/runtime/os_linux_arm.go index 8de9d11..0a90188 100644 --- a/libgo/go/runtime/os_linux_arm.go +++ b/libgo/go/runtime/os_linux_arm.go @@ -6,16 +6,8 @@ package runtime import "internal/cpu" -var randomNumber uint32 - func archauxv(tag, val uintptr) { switch tag { - case _AT_RANDOM: - // sysargs filled in startupRandomData, but that - // pointer may not be word aligned, so we must treat - // it as a byte array. 
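The illumos getncpu above converts the zone.cpu-cap value, a percentage of a single CPU, into a whole number of CPUs by rounding up, so 336% becomes 4 CPUs. The sketch below isolates just that arithmetic (capToCPUs is a made-up name, not part of the patch):

package main

import "fmt"

// capToCPUs rounds a zone.cpu-cap percentage up to whole CPUs, matching the
// (cents + 99) / 100 computation in getncpu above.
func capToCPUs(cents int32) int32 {
	return (cents + 99) / 100
}

func main() {
	for _, c := range []int32{50, 100, 336, 800} {
		fmt.Printf("cap %d%% -> %d CPU(s)\n", c, capToCPUs(c))
	}
}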
- randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 | - uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24 case _AT_HWCAP: cpu.HWCap = uint(val) case _AT_HWCAP2: diff --git a/libgo/go/runtime/os_linux_arm64.go b/libgo/go/runtime/os_linux_arm64.go index 30d63bf..a482d47 100644 --- a/libgo/go/runtime/os_linux_arm64.go +++ b/libgo/go/runtime/os_linux_arm64.go @@ -8,17 +8,8 @@ package runtime import "internal/cpu" -var randomNumber uint32 - func archauxv(tag, val uintptr) { switch tag { - case _AT_RANDOM: - // sysargs filled in startupRandomData, but that - // pointer may not be word aligned, so we must treat - // it as a byte array. - randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 | - uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24 - case _AT_HWCAP: // arm64 doesn't have a 'cpuid' instruction equivalent and relies on // HWCAP/HWCAP2 bits for hardware capabilities. diff --git a/libgo/go/runtime/os_linux_mips64x.go b/libgo/go/runtime/os_linux_mips64x.go index b7f737f..2b59dcb 100644 --- a/libgo/go/runtime/os_linux_mips64x.go +++ b/libgo/go/runtime/os_linux_mips64x.go @@ -7,15 +7,5 @@ package runtime -var randomNumber uint32 - func archauxv(tag, val uintptr) { - switch tag { - case _AT_RANDOM: - // sysargs filled in startupRandomData, but that - // pointer may not be word aligned, so we must treat - // it as a byte array. - randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 | - uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24 - } } diff --git a/libgo/go/runtime/os_linux_mipsx.go b/libgo/go/runtime/os_linux_mipsx.go index a2696de..2bfd6f40 100644 --- a/libgo/go/runtime/os_linux_mipsx.go +++ b/libgo/go/runtime/os_linux_mipsx.go @@ -7,15 +7,5 @@ package runtime -var randomNumber uint32 - func archauxv(tag, val uintptr) { - switch tag { - case _AT_RANDOM: - // sysargs filled in startupRandomData, but that - // pointer may not be word aligned, so we must treat - // it as a byte array. - randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 | - uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24 - } } diff --git a/libgo/go/runtime/os_netbsd_arm64.go b/libgo/go/runtime/os_netbsd_arm64.go index fd81eb7..8d21b0a 100644 --- a/libgo/go/runtime/os_netbsd_arm64.go +++ b/libgo/go/runtime/os_netbsd_arm64.go @@ -19,6 +19,5 @@ func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintp func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand(). // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. - // TODO: need more entropy to better seed fastrand. return nanotime() } diff --git a/libgo/go/runtime/os_only_solaris.go b/libgo/go/runtime/os_only_solaris.go new file mode 100644 index 0000000..e2f5409 --- /dev/null +++ b/libgo/go/runtime/os_only_solaris.go @@ -0,0 +1,18 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Solaris code that doesn't also apply to illumos. 
+ +// +build !illumos + +package runtime + +func getncpu() int32 { + n := int32(sysconf(__SC_NPROCESSORS_ONLN)) + if n < 1 { + return 1 + } + + return n +} diff --git a/libgo/go/runtime/os_openbsd_arm64.go b/libgo/go/runtime/os_openbsd_arm64.go index f15a95b..d559a2a 100644 --- a/libgo/go/runtime/os_openbsd_arm64.go +++ b/libgo/go/runtime/os_openbsd_arm64.go @@ -12,7 +12,6 @@ import ( func cputicks() int64 { // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand(). // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. - // TODO: need more entropy to better seed fastrand. return nanotime() } diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go index 9667181..88c598e 100644 --- a/libgo/go/runtime/panic.go +++ b/libgo/go/runtime/panic.go @@ -50,7 +50,7 @@ import ( // pc should be the program counter of the compiler-generated code that // triggered this panic. func panicCheck1(pc uintptr, msg string) { - name, _, _, _ := funcfileline(pc-1, -1) + name, _, _, _ := funcfileline(pc-1, -1, false) if hasPrefix(name, "runtime.") { throw(msg) } @@ -548,6 +548,14 @@ func Goexit() { // for detailed comments. gp := getg() gp.goexiting = true + + // Create a panic object for Goexit, so we can recognize when it might be + // bypassed by a recover(). + var p _panic + p.goexit = true + p.link = gp._panic + gp._panic = (*_panic)(noescape(unsafe.Pointer(&p))) + for { d := gp._defer if d == nil { @@ -608,7 +616,12 @@ func preprintpanics(p *_panic) { func printpanics(p *_panic) { if p.link != nil { printpanics(p.link) - print("\t") + if !p.link.goexit { + print("\t") + } + } + if p.goexit { + return } print("panic: ") printany(p.arg) @@ -704,9 +717,12 @@ func gopanic(e interface{}) { d._panic = nil if p.recovered { - atomic.Xadd(&runningPanicDefers, -1) - gp._panic = p.link + if gp._panic != nil && gp._panic.goexit && gp._panic.aborted { + Goexit() + throw("Goexit returned") + } + atomic.Xadd(&runningPanicDefers, -1) // Aborted panics are marked but remain on the g.panic list. // Remove them from the list. @@ -717,6 +733,11 @@ func gopanic(e interface{}) { gp.sig = 0 } + if gp._panic != nil && gp._panic.goexit { + Goexit() + throw("Goexit returned") + } + // Unwind the stack by throwing an exception. 
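The panic.go changes above make runtime.Goexit push its own _panic entry (goexit=true), so gorecover skips it and gopanic re-issues the Goexit if a recovered panic had temporarily interrupted it; in other words, a deferred panic-and-recover can no longer cancel Goexit. A small program illustrating the resulting behavior (a sketch of the semantics guaranteed after this change, not of the runtime internals):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	done := make(chan struct{})
	go func() {
		defer close(done)
		defer func() {
			// This recover catches the panic raised by the next deferred
			// call, but it does not cancel the Goexit in progress: the
			// goroutine still exits and the remaining defers still run.
			fmt.Println("recovered:", recover())
		}()
		defer func() {
			panic("panic raised while Goexit is unwinding")
		}()
		runtime.Goexit()
	}()
	<-done
	fmt.Println("goroutine exited via Goexit as expected")
}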
// The compiler has arranged to create // exception handlers in each function @@ -922,7 +943,7 @@ func makefuncreturning() { func gorecover() interface{} { gp := getg() p := gp._panic - if p != nil && !p.recovered { + if p != nil && !p.goexit && !p.recovered { p.recovered = true return p.arg } diff --git a/libgo/go/runtime/pprof/label.go b/libgo/go/runtime/pprof/label.go index 20f9cdb..2d92ef7 100644 --- a/libgo/go/runtime/pprof/label.go +++ b/libgo/go/runtime/pprof/label.go @@ -60,11 +60,11 @@ func Labels(args ...string) LabelSet { if len(args)%2 != 0 { panic("uneven number of arguments to pprof.Labels") } - labels := LabelSet{} + list := make([]label, 0, len(args)/2) for i := 0; i+1 < len(args); i += 2 { - labels.list = append(labels.list, label{key: args[i], value: args[i+1]}) + list = append(list, label{key: args[i], value: args[i+1]}) } - return labels + return LabelSet{list: list} } // Label returns the value of the label with the given key on ctx, and a boolean indicating diff --git a/libgo/go/runtime/pprof/label_test.go b/libgo/go/runtime/pprof/label_test.go index 240445f..de39d85 100644 --- a/libgo/go/runtime/pprof/label_test.go +++ b/libgo/go/runtime/pprof/label_test.go @@ -24,7 +24,7 @@ func (s labelSorter) Swap(i, j int) { s[i], s[j] = s[j], s[i] } func (s labelSorter) Less(i, j int) bool { return s[i].key < s[j].key } func TestContextLabels(t *testing.T) { - // Background context starts with no lablels. + // Background context starts with no labels. ctx := context.Background() labels := labelsSorted(ctx) if len(labels) != 0 { diff --git a/libgo/go/runtime/pprof/mprof_test.go b/libgo/go/runtime/pprof/mprof_test.go index 6fe892b..c352dea 100644 --- a/libgo/go/runtime/pprof/mprof_test.go +++ b/libgo/go/runtime/pprof/mprof_test.go @@ -2,6 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +// +build !js + package pprof import ( @@ -10,6 +12,7 @@ import ( "reflect" "regexp" "runtime" + "runtime/pprof/internal/profile" "testing" "unsafe" ) @@ -27,6 +30,10 @@ func allocateTransient2M() { memSink = make([]byte, 2<<20) } +func allocateTransient2MInline() { + memSink = make([]byte, 4<<20) +} + type Obj32 struct { link *Obj32 pad [32 - unsafe.Sizeof(uintptr(0))]byte @@ -71,47 +78,102 @@ func TestMemoryProfiler(t *testing.T) { // Do the interesting allocations. 
allocateTransient1M() allocateTransient2M() + allocateTransient2MInline() allocatePersistent1K() allocateReflect() memSink = nil runtime.GC() // materialize stats - var buf bytes.Buffer - if err := Lookup("heap").WriteTo(&buf, 1); err != nil { - t.Fatalf("failed to write heap profile: %v", err) - } memoryProfilerRun++ - tests := []string{ - - fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f x]+ -# 0x[0-9,a-f]+ pprof\.allocatePersistent1K\+0x[0-9,a-f]+ .*/mprof_test\.go:40 -# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test\.go:74 + tests := []struct { + stk []string + legacy string + }{{ + stk: []string{"pprof.allocatePersistent1K", "runtime/pprof.TestMemoryProfiler"}, + legacy: fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f x]+ +# 0x[0-9,a-f]+ pprof\.allocatePersistent1K\+0x[0-9,a-f]+ .*/mprof_test\.go:47 +# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test\.go:82 `, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun), - - fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f x]+ -# 0x[0-9,a-f]+ pprof\.allocateTransient1M\+0x[0-9,a-f]+ .*/mprof_test.go:21 -# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:72 -`, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun), - + }, { + stk: []string{"pprof.allocateTransient1M", "runtime/pprof.TestMemoryProfiler"}, + legacy: fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @ 0x[0-9,a-f x]+ +# 0x[0-9,a-f]+ pprof\.allocateTransient1M\+0x[0-9,a-f]+ .*/mprof_test.go:24 +# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:79 +`, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun), + }, { + stk: []string{"pprof.allocateTransient2M", "runtime/pprof.TestMemoryProfiler"}, // This should start with "0: 0" but gccgo's imprecise // GC means that sometimes the value is not collected. - fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @ 0x[0-9,a-f x]+ -# 0x[0-9,a-f]+ pprof\.allocateTransient2M\+0x[0-9,a-f]+ .*/mprof_test.go:27 -# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:73 + legacy: fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @ 0x[0-9,a-f x]+ +# 0x[0-9,a-f]+ pprof\.allocateTransient2M\+0x[0-9,a-f]+ .*/mprof_test.go:30 +# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:80 `, memoryProfilerRun, (2<<20)*memoryProfilerRun, memoryProfilerRun, (2<<20)*memoryProfilerRun), - - // This should start with "0: 0" but gccgo's imprecise - // GC means that sometimes the value is not collected. 
- fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @( 0x[0-9,a-f]+)+ -# 0x[0-9,a-f]+ pprof\.allocateReflectTransient\+0x[0-9,a-f]+ .*/mprof_test.go:48 + }, { + stk: []string{"pprof.allocateTransient2MInline", "runtime/pprof.TestMemoryProfiler"}, + legacy: fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @ 0x[0-9,a-f x]+ +# 0x[0-9,a-f]+ pprof\.allocateTransient2MInline\+0x[0-9,a-f]+ .*/mprof_test.go:34 +# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:81 +`, memoryProfilerRun, (4<<20)*memoryProfilerRun, memoryProfilerRun, (4<<20)*memoryProfilerRun), + }, { + stk: []string{"pprof.allocateReflectTransient"}, + legacy: fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @( 0x[0-9,a-f]+)+ +# 0x[0-9,a-f]+ pprof\.allocateReflectTransient\+0x[0-9,a-f]+ .*/mprof_test.go:55 `, memoryProfilerRun, (3<<20)*memoryProfilerRun, memoryProfilerRun, (3<<20)*memoryProfilerRun), - } + }} - for _, test := range tests { - if !regexp.MustCompile(test).Match(buf.Bytes()) { - t.Fatalf("The entry did not match:\n%v\n\nProfile:\n%v\n", test, buf.String()) + t.Run("debug=1", func(t *testing.T) { + var buf bytes.Buffer + if err := Lookup("heap").WriteTo(&buf, 1); err != nil { + t.Fatalf("failed to write heap profile: %v", err) } - } + + for _, test := range tests { + if !regexp.MustCompile(test.legacy).Match(buf.Bytes()) { + t.Fatalf("The entry did not match:\n%v\n\nProfile:\n%v\n", test.legacy, buf.String()) + } + } + }) + + t.Run("proto", func(t *testing.T) { + var buf bytes.Buffer + if err := Lookup("heap").WriteTo(&buf, 0); err != nil { + t.Fatalf("failed to write heap profile: %v", err) + } + p, err := profile.Parse(&buf) + if err != nil { + t.Fatalf("failed to parse heap profile: %v", err) + } + t.Logf("Profile = %v", p) + + stks := stacks(p) + for _, test := range tests { + if !containsStack(stks, test.stk) { + t.Logf("stks:\n%v", stks) + t.Fatalf("No matching stack entry for %q\n\nProfile:\n%v\n", test.stk, p) + } + } + + if !containsInlinedCall(TestMemoryProfiler, 4<<10) { + t.Logf("Can't determine whether allocateTransient2MInline was inlined into TestMemoryProfiler.") + return + } + + // Check the inlined function location is encoded correctly. 
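The new "proto" subtest above fetches the heap profile in protobuf form (debug=0) and parses it with the runtime's internal profile package. Outside the runtime tests the same inspection can be done with the public parser; the sketch below assumes the external github.com/google/pprof/profile module is available and is purely illustrative.

package main

import (
	"bytes"
	"fmt"
	"log"
	"runtime"
	"runtime/pprof"

	"github.com/google/pprof/profile" // external pprof parser, assumed available
)

var sink []byte // package-level so the allocation is not optimized away

func main() {
	sink = make([]byte, 1<<20) // make sure there is something to sample
	runtime.GC()               // materialize allocation statistics

	var buf bytes.Buffer
	// debug=0 writes the gzipped protobuf format, the same format the
	// "proto" subtest above inspects.
	if err := pprof.Lookup("heap").WriteTo(&buf, 0); err != nil {
		log.Fatal(err)
	}
	p, err := profile.Parse(&buf)
	if err != nil {
		log.Fatal(err)
	}
	for _, s := range p.Sample {
		var names []string
		for _, loc := range s.Location {
			for _, line := range loc.Line {
				names = append(names, line.Function.Name)
			}
		}
		fmt.Println(s.Value, names)
	}
}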
+ for _, loc := range p.Location { + inlinedCaller, inlinedCallee := false, false + for _, line := range loc.Line { + if line.Function.Name == "runtime/pprof.allocateTransient2MInline" { + inlinedCallee = true + } + if inlinedCallee && line.Function.Name == "runtime/pprof.TestMemoryProfiler" { + inlinedCaller = true + } + } + if inlinedCallee != inlinedCaller { + t.Errorf("want allocateTransient2MInline after TestMemoryProfiler in one location, got separate location entries:\n%v", loc) + } + } + }) } diff --git a/libgo/go/runtime/pprof/pprof.go b/libgo/go/runtime/pprof/pprof.go index 996b3cb..183881c 100644 --- a/libgo/go/runtime/pprof/pprof.go +++ b/libgo/go/runtime/pprof/pprof.go @@ -28,7 +28,7 @@ // if err != nil { // log.Fatal("could not create CPU profile: ", err) // } -// defer f.Close() +// defer f.Close() // error handling omitted for example // if err := pprof.StartCPUProfile(f); err != nil { // log.Fatal("could not start CPU profile: ", err) // } @@ -42,7 +42,7 @@ // if err != nil { // log.Fatal("could not create memory profile: ", err) // } -// defer f.Close() +// defer f.Close() // error handling omitted for example // runtime.GC() // get up-to-date statistics // if err := pprof.WriteHeapProfile(f); err != nil { // log.Fatal("could not write memory profile: ", err) @@ -386,16 +386,9 @@ func printCountCycleProfile(w io.Writer, countName, cycleName string, scaler fun count, nanosec := scaler(r.Count, float64(r.Cycles)/cpuGHz) values[0] = count values[1] = int64(nanosec) - locs = locs[:0] - for _, addr := range r.Stack() { - // For count profiles, all stack addresses are - // return PCs, which is what locForPC expects. - l := b.locForPC(addr) - if l == 0 { // runtime.goexit - continue - } - locs = append(locs, l) - } + // For count profiles, all stack addresses are + // return PCs, which is what appendLocsForStack expects. + locs = b.appendLocsForStack(locs[:0], r.Stack()) b.pbSample(values, locs, nil) } b.build() @@ -451,16 +444,9 @@ func printCountProfile(w io.Writer, debug int, name string, p countProfile) erro var locs []uint64 for _, k := range keys { values[0] = int64(count[k]) - locs = locs[:0] - for _, addr := range p.Stack(index[k]) { - // For count profiles, all stack addresses are - // return PCs, which is what locForPC expects. - l := b.locForPC(addr) - if l == 0 { // runtime.goexit - continue - } - locs = append(locs, l) - } + // For count profiles, all stack addresses are + // return PCs, which is what appendLocsForStack expects. + locs = b.appendLocsForStack(locs[:0], p.Stack(index[k])) b.pbSample(values, locs, nil) } b.build() diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go index 49a555c..bba9ee3 100644 --- a/libgo/go/runtime/pprof/pprof_test.go +++ b/libgo/go/runtime/pprof/pprof_test.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !nacl,!js +// +build !js package pprof @@ -49,8 +49,12 @@ var ( // Must not call other functions nor access heap/globals in the loop, // otherwise under race detector the samples will be in the race runtime. 
func cpuHog1(x int) int { + return cpuHog0(x, 1e5) +} + +func cpuHog0(x, n int) int { foo := x - for i := 0; i < 1e5; i++ { + for i := 0; i < n; i++ { if foo > 0 { foo *= foo } else { @@ -100,35 +104,111 @@ func TestCPUProfileMultithreaded(t *testing.T) { }) } +// containsInlinedCall reports whether the function body for the function f is +// known to contain an inlined function call within the first maxBytes bytes. +func containsInlinedCall(f interface{}, maxBytes int) bool { + _, found := findInlinedCall(f, maxBytes) + return found +} + +// findInlinedCall returns the PC of an inlined function call within +// the function body for the function f if any. +func findInlinedCall(f interface{}, maxBytes int) (pc uint64, found bool) { + fFunc := runtime.FuncForPC(uintptr(funcPC(f))) + if fFunc == nil || fFunc.Entry() == 0 { + panic("failed to locate function entry") + } + + for offset := 0; offset < maxBytes; offset++ { + innerPC := fFunc.Entry() + uintptr(offset) + inner := runtime.FuncForPC(innerPC) + if inner == nil { + // No function known for this PC value. + // It might simply be misaligned, so keep searching. + continue + } + if inner.Entry() != fFunc.Entry() { + // Scanned past f and didn't find any inlined functions. + break + } + if inner.Name() != fFunc.Name() { + // This PC has f as its entry-point, but is not f. Therefore, it must be a + // function inlined into f. + return uint64(innerPC), true + } + } + + return 0, false +} + func TestCPUProfileInlining(t *testing.T) { - testCPUProfile(t, stackContains, []string{"pprof.inlinedCallee", "pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) { + if !containsInlinedCall(inlinedCaller, 4<<10) { + t.Skip("Can't determine whether inlinedCallee was inlined into inlinedCaller.") + } + + p := testCPUProfile(t, stackContains, []string{"pprof.inlinedCallee", "pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) { cpuHogger(inlinedCaller, &salt1, dur) }) + + // Check if inlined function locations are encoded correctly. The inlinedCalee and inlinedCaller should be in one location. + for _, loc := range p.Location { + hasInlinedCallerAfterInlinedCallee, hasInlinedCallee := false, false + for _, line := range loc.Line { + if line.Function.Name == "runtime/pprof.inlinedCallee" { + hasInlinedCallee = true + } + if hasInlinedCallee && line.Function.Name == "runtime/pprof.inlinedCaller" { + hasInlinedCallerAfterInlinedCallee = true + } + } + if hasInlinedCallee != hasInlinedCallerAfterInlinedCallee { + t.Fatalf("want inlinedCallee followed by inlinedCaller, got separate Location entries:\n%v", p) + } + } } func inlinedCaller(x int) int { - x = inlinedCallee(x) + x = inlinedCallee(x, 1e5) return x } -func inlinedCallee(x int) int { - // We could just use cpuHog1, but for loops prevent inlining - // right now. :( - foo := x - i := 0 -loop: - if foo > 0 { - foo *= foo - } else { - foo *= foo + 1 +func inlinedCallee(x, n int) int { + return cpuHog0(x, n) +} + +func TestCPUProfileRecursion(t *testing.T) { + p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.recursionCallee", "runtime/pprof.recursionCaller"}, avoidFunctions(), func(dur time.Duration) { + cpuHogger(recursionCaller, &salt1, dur) + }) + + // check the Location encoding was not confused by recursive calls. 
+ for i, loc := range p.Location { + recursionFunc := 0 + for _, line := range loc.Line { + if name := line.Function.Name; name == "runtime/pprof.recursionCaller" || name == "runtime/pprof.recursionCallee" { + recursionFunc++ + } + } + if recursionFunc > 1 { + t.Fatalf("want at most one recursionCaller or recursionCallee in one Location, got a violating Location (index: %d):\n%v", i, p) + } } - if i++; i < 1e5 { - goto loop +} + +func recursionCaller(x int) int { + y := recursionCallee(3, x) + return y +} + +func recursionCallee(n, x int) int { + if n == 0 { + return 1 } - return foo + y := inlinedCallee(x, 1e4) + return y * recursionCallee(n-1, x) } -func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) { +func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) *profile.Profile { p, err := profile.Parse(bytes.NewReader(valBytes)) if err != nil { t.Fatal(err) @@ -137,11 +217,12 @@ func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Loca count := uintptr(sample.Value[0]) f(count, sample.Location, sample.Label) } + return p } // testCPUProfile runs f under the CPU profiler, checking for some conditions specified by need, -// as interpreted by matches. -func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) { +// as interpreted by matches, and returns the parsed profile. +func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) *profile.Profile { switch runtime.GOOS { case "darwin": switch runtime.GOARCH { @@ -195,8 +276,8 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri f(duration) StopCPUProfile() - if profileOk(t, matches, need, avoid, prof, duration) { - return + if p, ok := profileOk(t, matches, need, avoid, prof, duration); ok { + return p } duration *= 2 @@ -217,6 +298,7 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri t.Skip("ignore the failure in QEMU; see golang.org/issue/9605") } t.FailNow() + return nil } func contains(slice []string, s string) bool { @@ -242,7 +324,7 @@ func stackContains(spec string, count uintptr, stk []*profile.Location, labels m type matchFunc func(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool -func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (ok bool) { +func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (_ *profile.Profile, ok bool) { ok = true // Check that profile is well formed, contains 'need', and does not contain @@ -251,7 +333,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p avoidSamples := make([]uintptr, len(avoid)) var samples uintptr var buf bytes.Buffer - parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) { + p := parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) { fmt.Fprintf(&buf, "%d:", count) fprintStack(&buf, stk) samples += count @@ -287,7 +369,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p // not enough samples due to coarse timer // resolution. Let it go. 
t.Log("too few samples on Windows (golang.org/issue/10842)") - return false + return p, false } // Check that we got a reasonable number of samples. @@ -309,7 +391,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p } if len(need) == 0 { - return ok + return p, ok } var total uintptr @@ -332,7 +414,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p ok = false } } - return ok + return p, ok } // Fork can hang if preempted with signals frequently enough (see issue 5517). @@ -1076,3 +1158,111 @@ func TestTracebackAll(t *testing.T) { runtime.Stack(buf, true) } } + +// TestTryAdd tests the cases that's hard to test with real program execution. +// For example, the current go compilers may not inline functions involved in recursion +// but that may not be true in the future compilers. This tests such cases by +// using fake call sequences and forcing the profile build utilizing +// translateCPUProfile defined in proto_test.go +func TestTryAdd(t *testing.T) { + inlinedCallerPtr := uint64(funcPC(inlinedCaller)) + 1 + inlinedCalleePtr, found := findInlinedCall(inlinedCaller, 4<<10) + if !found { + t.Skip("Can't determine whether inlinedCallee was inlined into inlinedCaller.") + } + inlinedCalleePtr += 1 // +1 to be safely inside of the function body. + + period := int64(2000 * 1000) // 1/500*1e9 nanosec. + + testCases := []struct { + name string + input []uint64 // following the input format assumed by profileBuilder.addCPUData. + wantLocs [][]string // ordered location entries with function names. + wantSamples []*profile.Sample // ordered samples, we care only about Value and the profile location IDs. + }{{ + name: "bug35538", + input: []uint64{ + 3, 0, 500, // hz = 500. Must match the period. + 7, 0, 10, inlinedCalleePtr, inlinedCallerPtr, inlinedCalleePtr, inlinedCallerPtr, + 5, 0, 20, inlinedCalleePtr, inlinedCallerPtr, + }, + wantLocs: [][]string{{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}}, + wantSamples: []*profile.Sample{ + {Value: []int64{10, 10 * period}, Location: []*profile.Location{{ID: 1}, {ID: 1}}}, + {Value: []int64{20, 20 * period}, Location: []*profile.Location{{ID: 1}}}, + }, + }, { + name: "recursive_inlined_funcs", + input: []uint64{ + 3, 0, 500, // hz = 500. Must match the period. + 5, 0, 30, inlinedCalleePtr, inlinedCalleePtr, + 4, 0, 40, inlinedCalleePtr, + }, + wantLocs: [][]string{{"runtime/pprof.inlinedCallee"}}, + wantSamples: []*profile.Sample{ + {Value: []int64{30, 30 * period}, Location: []*profile.Location{{ID: 1}, {ID: 1}}}, + {Value: []int64{40, 40 * period}, Location: []*profile.Location{{ID: 1}}}, + }, + }, { + name: "truncated_stack_trace_later", + input: []uint64{ + 3, 0, 500, // hz = 500. Must match the period. + 5, 0, 50, inlinedCalleePtr, inlinedCallerPtr, + 4, 0, 60, inlinedCalleePtr, + }, + wantLocs: [][]string{{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}}, + wantSamples: []*profile.Sample{ + {Value: []int64{50, 50 * period}, Location: []*profile.Location{{ID: 1}}}, + {Value: []int64{60, 60 * period}, Location: []*profile.Location{{ID: 1}}}, + }, + }, { + name: "truncated_stack_trace_first", + input: []uint64{ + 3, 0, 500, // hz = 500. Must match the period. + 4, 0, 70, inlinedCalleePtr, + 5, 0, 80, inlinedCalleePtr, inlinedCallerPtr, + }, + wantLocs: [][]string{ // the inline info is screwed up, but better than a crash. 
+ {"runtime/pprof.inlinedCallee"}, + {"runtime/pprof.inlinedCaller"}}, + wantSamples: []*profile.Sample{ + {Value: []int64{70, 70 * period}, Location: []*profile.Location{{ID: 1}}}, + {Value: []int64{80, 80 * period}, Location: []*profile.Location{{ID: 1}, {ID: 2}}}, + }, + }} + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + p, err := translateCPUProfile(tc.input) + if err != nil { + t.Fatalf("translating profile: %v", err) + } + t.Logf("Profile: %v\n", p) + + // One location entry with all inlined functions. + var gotLoc [][]string + for _, loc := range p.Location { + var names []string + for _, line := range loc.Line { + names = append(names, line.Function.Name) + } + gotLoc = append(gotLoc, names) + } + if got, want := fmtJSON(gotLoc), fmtJSON(tc.wantLocs); got != want { + t.Errorf("Got Location = %+v\n\twant %+v", got, want) + } + // All samples should point to one location. + var gotSamples []*profile.Sample + for _, sample := range p.Sample { + var locs []*profile.Location + for _, loc := range sample.Location { + locs = append(locs, &profile.Location{ID: loc.ID}) + } + gotSamples = append(gotSamples, &profile.Sample{Value: sample.Value, Location: locs}) + } + if got, want := fmtJSON(gotSamples), fmtJSON(tc.wantSamples); got != want { + t.Errorf("Got Samples = %+v\n\twant %+v", got, want) + } + }) + } +} diff --git a/libgo/go/runtime/pprof/proto.go b/libgo/go/runtime/pprof/proto.go index ef3eeb1..ba5db85 100644 --- a/libgo/go/runtime/pprof/proto.go +++ b/libgo/go/runtime/pprof/proto.go @@ -54,9 +54,10 @@ type profileBuilder struct { pb protobuf strings []string stringMap map[string]int - locs map[uintptr]int - funcs map[string]int // Package path-qualified function name to Function.ID + locs map[uintptr]locInfo // list of locInfo starting with the given PC. + funcs map[string]int // Package path-qualified function name to Function.ID mem []memMap + deck pcDeck } type memMap struct { @@ -220,15 +221,7 @@ func (b *profileBuilder) pbMapping(tag int, id, base, limit, offset uint64, file b.pb.endMessage(tag, start) } -// locForPC returns the location ID for addr. -// addr must a return PC or 1 + the PC of an inline marker. This returns the location of the corresponding call. -// It may emit to b.pb, so there must be no message encoding in progress. -func (b *profileBuilder) locForPC(addr uintptr) uint64 { - id := uint64(b.locs[addr]) - if id != 0 { - return id - } - +func allFrames(addr uintptr) ([]runtime.Frame, symbolizeFlag) { // Expand this one address using CallersFrames so we can cache // each expansion. In general, CallersFrames takes a whole // stack, but in this case we know there will be no skips in @@ -238,7 +231,7 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 { if frame.Function == "runtime.goexit" || frame.Function == "runtime.kickoff" { // Short-circuit if we see runtime.goexit so the loop // below doesn't allocate a useless empty location. - return 0 + return nil, 0 } symbolizeResult := lookupTried @@ -251,59 +244,22 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 { // a reasonable call PC. This mostly happens in tests. frame.PC = addr - 1 } - - // We can't write out functions while in the middle of the - // Location message, so record new functions we encounter and - // write them out after the Location. 
- type newFunc struct { - id uint64 - name, file string - } - newFuncs := make([]newFunc, 0, 8) - - id = uint64(len(b.locs)) + 1 - b.locs[addr] = int(id) - start := b.pb.startMessage() - b.pb.uint64Opt(tagLocation_ID, id) - b.pb.uint64Opt(tagLocation_Address, uint64(frame.PC)) - for frame.Function != "runtime.goexit" && frame.Function != "runtime.kickoff" { - // Write out each line in frame expansion. - funcID := uint64(b.funcs[frame.Function]) - if funcID == 0 { - funcID = uint64(len(b.funcs)) + 1 - b.funcs[frame.Function] = int(funcID) - newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File}) - } - b.pbLine(tagLocation_Line, funcID, int64(frame.Line)) - if !more { - break - } + ret := []runtime.Frame{frame} + for frame.Function != "runtime.goexit" && frame.Function != "runtime.kickoff" && more == true { frame, more = frames.Next() + ret = append(ret, frame) } - for i := range b.mem { - if b.mem[i].start <= addr && addr < b.mem[i].end || b.mem[i].fake { - b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1)) - - m := b.mem[i] - m.funcs |= symbolizeResult - b.mem[i] = m - break - } - } - b.pb.endMessage(tagProfile_Location, start) + return ret, symbolizeResult +} - // Write out functions we found during frame expansion. - for _, fn := range newFuncs { - start := b.pb.startMessage() - b.pb.uint64Opt(tagFunction_ID, fn.id) - b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name)) - b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name)) - b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file)) - b.pb.endMessage(tagProfile_Function, start) - } +type locInfo struct { + // location id assigned by the profileBuilder + id uint64 - b.flush() - return id + // sequence of PCs, including the fake PCs returned by the traceback + // to represent inlined functions + // https://github.com/golang/go/blob/d6f2f833c93a41ec1c68e49804b8387a06b131c5/src/runtime/traceback.go#L347-L368 + pcs []uintptr } // newProfileBuilder returns a new profileBuilder. @@ -318,7 +274,7 @@ func newProfileBuilder(w io.Writer) *profileBuilder { start: time.Now(), strings: []string{""}, stringMap: map[string]int{"": 0}, - locs: map[uintptr]int{}, + locs: map[uintptr]locInfo{}, funcs: map[string]int{}, } b.readMapping() @@ -402,6 +358,7 @@ func (b *profileBuilder) build() { values := []int64{0, 0} var locs []uint64 + for e := b.m.all; e != nil; e = e.nextAll { values[0] = e.count values[1] = e.count * b.period @@ -415,23 +372,8 @@ func (b *profileBuilder) build() { } } - locs = locs[:0] - for i, addr := range e.stk { - // Addresses from stack traces point to the - // next instruction after each call, except - // for the leaf, which points to where the - // signal occurred. locForPC expects return - // PCs, so increment the leaf address to look - // like a return PC. - if i == 0 { - addr++ - } - l := b.locForPC(addr) - if l == 0 { // runtime.goexit - continue - } - locs = append(locs, l) - } + locs = b.appendLocsForStack(locs[:0], e.stk) + b.pbSample(values, locs, labels) } @@ -448,6 +390,203 @@ func (b *profileBuilder) build() { b.zw.Close() } +// appendLocsForStack appends the location IDs for the given stack trace to the given +// location ID slice, locs. The addresses in the stack are return PCs or 1 + the PC of +// an inline marker as the runtime traceback function returns. +// +// It may emit to b.pb, so there must be no message encoding in progress. 
+func (b *profileBuilder) appendLocsForStack(locs []uint64, stk []uintptr) (newLocs []uint64) { + b.deck.reset() + for len(stk) > 0 { + addr := stk[0] + if l, ok := b.locs[addr]; ok { + // first record the location if there is any pending accumulated info. + if id := b.emitLocation(); id > 0 { + locs = append(locs, id) + } + + // then, record the cached location. + locs = append(locs, l.id) + + // The stk may be truncated due to the stack depth limit + // (e.g. See maxStack and maxCPUProfStack in runtime) or + // bugs in runtime. Avoid the crash in either case. + // TODO(hyangah): The correct fix may require using the exact + // pcs as the key for b.locs cache management instead of just + // relying on the very first pc. We are late in the go1.14 dev + // cycle, so this is a workaround with little code change. + if len(l.pcs) > len(stk) { + stk = nil + // TODO(hyangah): would be nice if we can enable + // debug print out on demand and report the problematic + // cached location entry and stack traces. Do we already + // have such facility to utilize (e.g. GODEBUG)? + } else { + stk = stk[len(l.pcs):] // skip the matching pcs. + } + continue + } + + frames, symbolizeResult := allFrames(addr) + if len(frames) == 0 { // runtime.goexit. + if id := b.emitLocation(); id > 0 { + locs = append(locs, id) + } + stk = stk[1:] + continue + } + + if added := b.deck.tryAdd(addr, frames, symbolizeResult); added { + stk = stk[1:] + continue + } + // add failed because this addr is not inlined with + // the existing PCs in the deck. Flush the deck and retry to + // handle this pc. + if id := b.emitLocation(); id > 0 { + locs = append(locs, id) + } + + // check cache again - previous emitLocation added a new entry + if l, ok := b.locs[addr]; ok { + locs = append(locs, l.id) + stk = stk[len(l.pcs):] // skip the matching pcs. + } else { + b.deck.tryAdd(addr, frames, symbolizeResult) // must succeed. + stk = stk[1:] + } + } + if id := b.emitLocation(); id > 0 { // emit remaining location. + locs = append(locs, id) + } + return locs +} + +// pcDeck is a helper to detect a sequence of inlined functions from +// a stack trace returned by the runtime. +// +// The stack traces returned by runtime's trackback functions are fully +// expanded (at least for Go functions) and include the fake pcs representing +// inlined functions. The profile proto expects the inlined functions to be +// encoded in one Location message. +// https://github.com/google/pprof/blob/5e965273ee43930341d897407202dd5e10e952cb/proto/profile.proto#L177-L184 +// +// Runtime does not directly expose whether a frame is for an inlined function +// and looking up debug info is not ideal, so we use a heuristic to filter +// the fake pcs and restore the inlined and entry functions. Inlined functions +// have the following properties: +// Frame's Func is nil (note: also true for non-Go functions), and +// Frame's Entry matches its entry function frame's Entry. (note: could also be true for recursive calls and non-Go functions), +// Frame's Name does not match its entry function frame's name. +// +// As reading and processing the pcs in a stack trace one by one (from leaf to the root), +// we use pcDeck to temporarily hold the observed pcs and their expanded frames +// until we observe the entry function frame. 
+type pcDeck struct { + pcs []uintptr + frames []runtime.Frame + symbolizeResult symbolizeFlag +} + +func (d *pcDeck) reset() { + d.pcs = d.pcs[:0] + d.frames = d.frames[:0] + d.symbolizeResult = 0 +} + +// tryAdd tries to add the pc and Frames expanded from it (most likely one, +// since the stack trace is already fully expanded) and the symbolizeResult +// to the deck. If it fails the caller needs to flush the deck and retry. +func (d *pcDeck) tryAdd(pc uintptr, frames []runtime.Frame, symbolizeResult symbolizeFlag) (success bool) { + if existing := len(d.pcs); existing > 0 { + // 'frames' are all expanded from one 'pc' and represent all inlined functions + // so we check only the last one. + newFrame := frames[0] + last := d.frames[existing-1] + if last.Func != nil { // the last frame can't be inlined. Flush. + return false + } + if last.Entry == 0 || newFrame.Entry == 0 { // Possibly not a Go function. Don't try to merge. + return false + } + + if last.Entry != newFrame.Entry { // newFrame is for a different function. + return false + } + if last.Function == newFrame.Function { // maybe recursion. + return false + } + } + d.pcs = append(d.pcs, pc) + d.frames = append(d.frames, frames...) + d.symbolizeResult |= symbolizeResult + return true +} + +// emitLocation emits the new location and function information recorded in the deck +// and returns the location ID encoded in the profile protobuf. +// It emits to b.pb, so there must be no message encoding in progress. +// It resets the deck. +func (b *profileBuilder) emitLocation() uint64 { + if len(b.deck.pcs) == 0 { + return 0 + } + defer b.deck.reset() + + addr := b.deck.pcs[0] + firstFrame := b.deck.frames[0] + + // We can't write out functions while in the middle of the + // Location message, so record new functions we encounter and + // write them out after the Location. + type newFunc struct { + id uint64 + name, file string + } + newFuncs := make([]newFunc, 0, 8) + + id := uint64(len(b.locs)) + 1 + b.locs[addr] = locInfo{id: id, pcs: append([]uintptr{}, b.deck.pcs...)} + + start := b.pb.startMessage() + b.pb.uint64Opt(tagLocation_ID, id) + b.pb.uint64Opt(tagLocation_Address, uint64(firstFrame.PC)) + for _, frame := range b.deck.frames { + // Write out each line in frame expansion. + funcID := uint64(b.funcs[frame.Function]) + if funcID == 0 { + funcID = uint64(len(b.funcs)) + 1 + b.funcs[frame.Function] = int(funcID) + newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File}) + } + b.pbLine(tagLocation_Line, funcID, int64(frame.Line)) + } + for i := range b.mem { + if b.mem[i].start <= addr && addr < b.mem[i].end || b.mem[i].fake { + b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1)) + + m := b.mem[i] + m.funcs |= b.deck.symbolizeResult + b.mem[i] = m + break + } + } + b.pb.endMessage(tagProfile_Location, start) + + // Write out functions we found during frame expansion. + for _, fn := range newFuncs { + start := b.pb.startMessage() + b.pb.uint64Opt(tagFunction_ID, fn.id) + b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name)) + b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name)) + b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file)) + b.pb.endMessage(tagProfile_Function, start) + } + + b.flush() + return id +} + // readMapping reads /proc/self/maps and writes mappings to b.pb. // It saves the address ranges of the mappings in b.mem for use // when emitting locations. 
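The pcDeck heuristic above decides whether consecutive frames expanded from a stack belong in one Location: an inlined frame has a nil Func, shares its Entry with the enclosing function's frame, and carries a different Function name. The sketch below walks a real stack with runtime.CallersFrames and prints exactly those three properties; whether anything is actually inlined depends on the compiler, so treat the output as illustrative only.

package main

import (
	"fmt"
	"runtime"
)

func leaf() []uintptr {
	pcs := make([]uintptr, 32)
	return pcs[:runtime.Callers(0, pcs)]
}

// caller is trivial, so the compiler may inline leaf into it (not guaranteed).
func caller() []uintptr { return leaf() }

func main() {
	frames := runtime.CallersFrames(caller())
	for {
		f, more := frames.Next()
		// These are the properties the pcDeck heuristic inspects: a nil
		// Func, an Entry shared with the enclosing frame, and a Function
		// name that differs from the enclosing frame's.
		fmt.Printf("%-40s entry=%#x funcIsNil=%v\n", f.Function, f.Entry, f.Func == nil)
		if !more {
			break
		}
	}
}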
diff --git a/libgo/go/runtime/pprof/proto_test.go b/libgo/go/runtime/pprof/proto_test.go index 7e7c7cc..67d6b5d 100644 --- a/libgo/go/runtime/pprof/proto_test.go +++ b/libgo/go/runtime/pprof/proto_test.go @@ -118,9 +118,9 @@ func TestConvertCPUProfile(t *testing.T) { b := []uint64{ 3, 0, 500, // hz = 500 - 5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1 - 5, 0, 40, uint64(addr2), uint64(addr2 + 2), // 40 samples in addr2 - 5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1 + 5, 0, 10, uint64(addr1 + 1), uint64(addr1 + 2), // 10 samples in addr1 + 5, 0, 40, uint64(addr2 + 1), uint64(addr2 + 2), // 40 samples in addr2 + 5, 0, 10, uint64(addr1 + 1), uint64(addr1 + 2), // 10 samples in addr1 } p, err := translateCPUProfile(b) if err != nil { @@ -360,6 +360,17 @@ func TestMapping(t *testing.T) { continue } } + + if traceback == "Go+C" { + // The test code was arranged to have PCs from C and + // they are not symbolized. + // Check no Location containing those unsymbolized PCs contains multiple lines. + for i, loc := range prof.Location { + if !symbolized(loc) && len(loc.Line) > 1 { + t.Errorf("Location[%d] contains unsymbolized PCs and multiple lines: %v", i, loc) + } + } + } }) } } diff --git a/libgo/go/runtime/pprof/protomem.go b/libgo/go/runtime/pprof/protomem.go index 1c88aae..fa75a28 100644 --- a/libgo/go/runtime/pprof/protomem.go +++ b/libgo/go/runtime/pprof/protomem.go @@ -27,30 +27,27 @@ func writeHeapProto(w io.Writer, p []runtime.MemProfileRecord, rate int64, defau values := []int64{0, 0, 0, 0} var locs []uint64 for _, r := range p { - locs = locs[:0] hideRuntime := true for tries := 0; tries < 2; tries++ { - for _, addr := range r.Stack() { - // For heap profiles, all stack - // addresses are return PCs, which is - // what locForPC expects. - if hideRuntime { + stk := r.Stack() + // For heap profiles, all stack + // addresses are return PCs, which is + // what appendLocsForStack expects. + if hideRuntime { + for i, addr := range stk { if f := runtime.FuncForPC(addr); f != nil && strings.HasPrefix(f.Name(), "runtime.") { continue } // Found non-runtime. Show any runtime uses above it. - hideRuntime = false + stk = stk[i:] + break } - l := b.locForPC(addr) - if l == 0 { // runtime.goexit - continue - } - locs = append(locs, l) } + locs = b.appendLocsForStack(locs[:0], stk) if len(locs) > 0 { break } - hideRuntime = false // try again, and show all frames + hideRuntime = false // try again, and show all frames next time. 
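
The leading-frame trimming above is easy to exercise in isolation. Below is a small standalone sketch of the same idea (the helper name is made up for illustration, not the patch's code): drop return PCs that resolve to runtime.* functions until the first non-runtime frame is found.

package main

import (
	"fmt"
	"runtime"
	"strings"
)

// trimRuntimePrefix mirrors the hideRuntime loop: skip leading return
// PCs that resolve to runtime.* functions and keep everything from the
// first non-runtime frame on. Illustrative only.
func trimRuntimePrefix(stk []uintptr) []uintptr {
	for i, addr := range stk {
		if f := runtime.FuncForPC(addr); f != nil && strings.HasPrefix(f.Name(), "runtime.") {
			continue
		}
		return stk[i:]
	}
	return stk // every frame was runtime.*; show them all rather than nothing
}

func main() {
	pcs := make([]uintptr, 32)
	pcs = pcs[:runtime.Callers(0, pcs)] // skip=0 keeps runtime.Callers itself
	fmt.Println("before:", len(pcs), "after:", len(trimRuntimePrefix(pcs)))
}
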
} values[0], values[1] = scaleHeapSample(r.AllocObjects, r.AllocBytes, rate) diff --git a/libgo/go/runtime/pprof/testdata/README b/libgo/go/runtime/pprof/testdata/README new file mode 100644 index 0000000..876538e --- /dev/null +++ b/libgo/go/runtime/pprof/testdata/README @@ -0,0 +1,9 @@ +These binaries were generated by: + +$ cat empty.s +.global _start +_start: +$ as --32 -o empty.o empty.s && ld --build-id -m elf_i386 -o test32 empty.o +$ as --64 -o empty.o empty.s && ld --build-id -o test64 empty.o +$ powerpc-linux-gnu-as -o empty.o empty.s && powerpc-linux-gnu-ld --build-id -o test32be empty.o +$ powerpc64-linux-gnu-as -o empty.o empty.s && powerpc64-linux-gnu-ld --build-id -o test64be empty.o diff --git a/libgo/go/runtime/pprof/testdata/mappingtest/main.go b/libgo/go/runtime/pprof/testdata/mappingtest/main.go index 476b9e8..484b7f9 100644 --- a/libgo/go/runtime/pprof/testdata/mappingtest/main.go +++ b/libgo/go/runtime/pprof/testdata/mappingtest/main.go @@ -17,8 +17,7 @@ package main int cpuHogCSalt1 = 0; int cpuHogCSalt2 = 0; -void CPUHogCFunction() { - int foo = cpuHogCSalt1; +void CPUHogCFunction0(int foo) { int i; for (i = 0; i < 100000; i++) { if (foo > 0) { @@ -30,6 +29,10 @@ void CPUHogCFunction() { } } +void CPUHogCFunction() { + CPUHogCFunction0(cpuHogCSalt1); +} + struct CgoTracebackArg { uintptr_t context; uintptr_t sigContext; @@ -39,8 +42,9 @@ struct CgoTracebackArg { void CollectCgoTraceback(void* parg) { struct CgoTracebackArg* arg = (struct CgoTracebackArg*)(parg); - arg->buf[0] = (uintptr_t)(CPUHogCFunction); - arg->buf[1] = 0; + arg->buf[0] = (uintptr_t)(CPUHogCFunction0); + arg->buf[1] = (uintptr_t)(CPUHogCFunction); + arg->buf[2] = 0; }; */ import "C" @@ -81,7 +85,6 @@ var salt1 int var salt2 int func cpuHogGoFunction() { - // Generates CPU profile samples including a Go call path. for { foo := salt1 for i := 0; i < 1e5; i++ { diff --git a/libgo/go/runtime/pprof/testdata/test32 b/libgo/go/runtime/pprof/testdata/test32 Binary files differnew file mode 100755 index 0000000..ce59472 --- /dev/null +++ b/libgo/go/runtime/pprof/testdata/test32 diff --git a/libgo/go/runtime/pprof/testdata/test32be b/libgo/go/runtime/pprof/testdata/test32be Binary files differnew file mode 100755 index 0000000..f13a732 --- /dev/null +++ b/libgo/go/runtime/pprof/testdata/test32be diff --git a/libgo/go/runtime/pprof/testdata/test64 b/libgo/go/runtime/pprof/testdata/test64 Binary files differnew file mode 100755 index 0000000..3fb42fb --- /dev/null +++ b/libgo/go/runtime/pprof/testdata/test64 diff --git a/libgo/go/runtime/pprof/testdata/test64be b/libgo/go/runtime/pprof/testdata/test64be Binary files differnew file mode 100755 index 0000000..09b4b01 --- /dev/null +++ b/libgo/go/runtime/pprof/testdata/test64be diff --git a/libgo/go/runtime/preempt.go b/libgo/go/runtime/preempt.go new file mode 100644 index 0000000..1a8f9ac --- /dev/null +++ b/libgo/go/runtime/preempt.go @@ -0,0 +1,370 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Goroutine preemption +// +// A goroutine can be preempted at any safe-point. Currently, there +// are a few categories of safe-points: +// +// 1. A blocked safe-point occurs for the duration that a goroutine is +// descheduled, blocked on synchronization, or in a system call. +// +// 2. Synchronous safe-points occur when a running goroutine checks +// for a preemption request. +// +// 3. 
Asynchronous safe-points occur at any instruction in user code +// where the goroutine can be safely paused and a conservative +// stack and register scan can find stack roots. The runtime can +// stop a goroutine at an async safe-point using a signal. +// +// At both blocked and synchronous safe-points, a goroutine's CPU +// state is minimal and the garbage collector has complete information +// about its entire stack. This makes it possible to deschedule a +// goroutine with minimal space, and to precisely scan a goroutine's +// stack. +// +// Synchronous safe-points are implemented by overloading the stack +// bound check in function prologues. To preempt a goroutine at the +// next synchronous safe-point, the runtime poisons the goroutine's +// stack bound to a value that will cause the next stack bound check +// to fail and enter the stack growth implementation, which will +// detect that it was actually a preemption and redirect to preemption +// handling. +// +// Preemption at asynchronous safe-points is implemented by suspending +// the thread using an OS mechanism (e.g., signals) and inspecting its +// state to determine if the goroutine was at an asynchronous +// safe-point. Since the thread suspension itself is generally +// asynchronous, it also checks if the running goroutine wants to be +// preempted, since this could have changed. If all conditions are +// satisfied, it adjusts the signal context to make it look like the +// signaled thread just called asyncPreempt and resumes the thread. +// asyncPreempt spills all registers and enters the scheduler. +// +// (An alternative would be to preempt in the signal handler itself. +// This would let the OS save and restore the register state and the +// runtime would only need to know how to extract potentially +// pointer-containing registers from the signal context. However, this +// would consume an M for every preempted G, and the scheduler itself +// is not designed to run from a signal handler, as it tends to +// allocate memory and start threads in the preemption path.) + +package runtime + +import ( + "runtime/internal/atomic" +) + +type suspendGState struct { + g *g + + // dead indicates the goroutine was not suspended because it + // is dead. This goroutine could be reused after the dead + // state was observed, so the caller must not assume that it + // remains dead. + dead bool + + // stopped indicates that this suspendG transitioned the G to + // _Gwaiting via g.preemptStop and thus is responsible for + // readying it when done. + stopped bool +} + +// suspendG suspends goroutine gp at a safe-point and returns the +// state of the suspended goroutine. The caller gets read access to +// the goroutine until it calls resumeG. +// +// It is safe for multiple callers to attempt to suspend the same +// goroutine at the same time. The goroutine may execute between +// subsequent successful suspend operations. The current +// implementation grants exclusive access to the goroutine, and hence +// multiple callers will serialize. However, the intent is to grant +// shared read access, so please don't depend on exclusive access. +// +// This must be called from the system stack and the user goroutine on +// the current M (if any) must be in a preemptible state. This +// prevents deadlocks where two goroutines attempt to suspend each +// other and both are in non-preemptible states. There are other ways +// to resolve this deadlock, but this seems simplest. 
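
The practical effect of asynchronous preemption is easiest to see with a loop that contains no function calls and therefore no synchronous safe-points. The following program is illustrative only and not from the patch: on a runtime with signal-based preemption it is expected to finish, while on earlier runtimes, or with GODEBUG=asyncpreemptoff=1, the stop-the-world in runtime.GC may never complete.

package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	runtime.GOMAXPROCS(1)
	go func() {
		for { // tight loop: the compiler emits no preemption checks here
		}
	}()
	time.Sleep(100 * time.Millisecond) // let the spinner occupy the only P
	runtime.GC()                       // needs a stop-the-world
	fmt.Println("GC finished: the tight loop was preempted at an async safe-point")
}
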
+// +// TODO(austin): What if we instead required this to be called from a +// user goroutine? Then we could deschedule the goroutine while +// waiting instead of blocking the thread. If two goroutines tried to +// suspend each other, one of them would win and the other wouldn't +// complete the suspend until it was resumed. We would have to be +// careful that they couldn't actually queue up suspend for each other +// and then both be suspended. This would also avoid the need for a +// kernel context switch in the synchronous case because we could just +// directly schedule the waiter. The context switch is unavoidable in +// the signal case. +// +//go:systemstack +func suspendG(gp *g) suspendGState { + if mp := getg().m; mp.curg != nil && readgstatus(mp.curg) == _Grunning { + // Since we're on the system stack of this M, the user + // G is stuck at an unsafe point. If another goroutine + // were to try to preempt m.curg, it could deadlock. + throw("suspendG from non-preemptible goroutine") + } + + // See https://golang.org/cl/21503 for justification of the yield delay. + const yieldDelay = 10 * 1000 + var nextYield int64 + + // Drive the goroutine to a preemption point. + stopped := false + var asyncM *m + var asyncGen uint32 + var nextPreemptM int64 + for i := 0; ; i++ { + switch s := readgstatus(gp); s { + default: + if s&_Gscan != 0 { + // Someone else is suspending it. Wait + // for them to finish. + // + // TODO: It would be nicer if we could + // coalesce suspends. + break + } + + dumpgstatus(gp) + throw("invalid g status") + + case _Gdead: + // Nothing to suspend. + // + // preemptStop may need to be cleared, but + // doing that here could race with goroutine + // reuse. Instead, goexit0 clears it. + return suspendGState{dead: true} + + case _Gcopystack: + // The stack is being copied. We need to wait + // until this is done. + + case _Gexitingsyscall: + // This is a transient state. Try again. + + case _Gpreempted: + // We (or someone else) suspended the G. Claim + // ownership of it by transitioning it to + // _Gwaiting. + if !casGFromPreempted(gp, _Gpreempted, _Gwaiting) { + break + } + + // We stopped the G, so we have to ready it later. + stopped = true + + s = _Gwaiting + fallthrough + + case _Grunnable, _Gsyscall, _Gwaiting: + // Claim goroutine by setting scan bit. + // This may race with execution or readying of gp. + // The scan bit keeps it from transition state. + if !castogscanstatus(gp, s, s|_Gscan) { + break + } + + // Clear the preemption request. It's safe to + // reset the stack guard because we hold the + // _Gscan bit and thus own the stack. + gp.preemptStop = false + gp.preempt = false + + // The goroutine was already at a safe-point + // and we've now locked that in. + // + // TODO: It would be much better if we didn't + // leave it in _Gscan, but instead gently + // prevented its scheduling until resumption. + // Maybe we only use this to bump a suspended + // count and the scheduler skips suspended + // goroutines? That wouldn't be enough for + // {_Gsyscall,_Gwaiting} -> _Grunning. Maybe + // for all those transitions we need to check + // suspended and deschedule? + return suspendGState{g: gp, stopped: stopped} + + case _Grunning: + // Optimization: if there is already a pending preemption request + // (from the previous loop iteration), don't bother with the atomics. + if asyncM != nil && gp.preemptStop && gp.preempt && asyncM == gp.m && atomic.Load(&asyncM.preemptGen) == asyncGen { + break + } + + // Temporarily block state transitions. 
+ if !castogscanstatus(gp, _Grunning, _Gscanrunning) { + break + } + + // Request synchronous preemption. + gp.preemptStop = true + gp.preempt = true + + // Prepare for asynchronous preemption. + asyncM2 := gp.m + asyncGen2 := atomic.Load(&asyncM2.preemptGen) + needAsync := asyncM != asyncM2 || asyncGen != asyncGen2 + asyncM = asyncM2 + asyncGen = asyncGen2 + + casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning) + + // Send asynchronous preemption. We do this + // after CASing the G back to _Grunning + // because preemptM may be synchronous and we + // don't want to catch the G just spinning on + // its status. + if preemptMSupported && debug.asyncpreemptoff == 0 && needAsync { + // Rate limit preemptM calls. This is + // particularly important on Windows + // where preemptM is actually + // synchronous and the spin loop here + // can lead to live-lock. + now := nanotime() + if now >= nextPreemptM { + nextPreemptM = now + yieldDelay/2 + preemptM(asyncM) + } + } + } + + // TODO: Don't busy wait. This loop should really only + // be a simple read/decide/CAS loop that only fails if + // there's an active race. Once the CAS succeeds, we + // should queue up the preemption (which will require + // it to be reliable in the _Grunning case, not + // best-effort) and then sleep until we're notified + // that the goroutine is suspended. + if i == 0 { + nextYield = nanotime() + yieldDelay + } + if nanotime() < nextYield { + procyield(10) + } else { + osyield() + nextYield = nanotime() + yieldDelay/2 + } + } +} + +// resumeG undoes the effects of suspendG, allowing the suspended +// goroutine to continue from its current safe-point. +func resumeG(state suspendGState) { + if state.dead { + // We didn't actually stop anything. + return + } + + gp := state.g + switch s := readgstatus(gp); s { + default: + dumpgstatus(gp) + throw("unexpected g status") + + case _Grunnable | _Gscan, + _Gwaiting | _Gscan, + _Gsyscall | _Gscan: + casfrom_Gscanstatus(gp, s, s&^_Gscan) + } + + if state.stopped { + // We stopped it, so we need to re-schedule it. + ready(gp, 0, true) + } +} + +// canPreemptM reports whether mp is in a state that is safe to preempt. +// +// It is nosplit because it has nosplit callers. +// +//go:nosplit +func canPreemptM(mp *m) bool { + return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning +} + +//go:generate go run mkpreempt.go + +// asyncPreempt saves all user registers and calls asyncPreempt2. +// +// When stack scanning encounters an asyncPreempt frame, it scans that +// frame and its parent frame conservatively. +// +// asyncPreempt is implemented in assembly. +func asyncPreempt() + +//go:nosplit +func asyncPreempt2() { + gp := getg() + gp.asyncSafePoint = true + if gp.preemptStop { + mcall(preemptPark) + } else { + mcall(gopreempt_m) + } + gp.asyncSafePoint = false +} + +// wantAsyncPreempt returns whether an asynchronous preemption is +// queued for gp. +func wantAsyncPreempt(gp *g) bool { + // Check both the G and the P. + return (gp.preempt || gp.m.p != 0 && gp.m.p.ptr().preempt) && readgstatus(gp)&^_Gscan == _Grunning +} + +// isAsyncSafePoint reports whether gp at instruction PC is an +// asynchronous safe point. This indicates that: +// +// 1. It's safe to suspend gp and conservatively scan its stack and +// registers. There are no potentially hidden pointer values and it's +// not in the middle of an atomic sequence like a write barrier. +// +// 2. gp has enough stack space to inject the asyncPreempt call. +// +// 3. 
It's generally safe to interact with the runtime, even if we're +// in a signal handler stopped here. For example, there are no runtime +// locks held, so acquiring a runtime lock won't self-deadlock. +func isAsyncSafePoint(gp *g, pc uintptr) bool { + mp := gp.m + + // Only user Gs can have safe-points. We check this first + // because it's extremely common that we'll catch mp in the + // scheduler processing this G preemption. + if mp.curg != gp { + return false + } + + // Check M state. + if mp.p == 0 || !canPreemptM(mp) { + return false + } + + // Check if PC is an unsafe-point. + f := FuncForPC(pc) + if f == nil { + // Not Go code. + return false + } + name := f.Name() + if hasPrefix(name, "runtime.") || + hasPrefix(name, "runtime..z2finternal..z2f") || + hasPrefix(name, "reflect.") { + // For now we never async preempt the runtime or + // anything closely tied to the runtime. Known issues + // include: various points in the scheduler ("don't + // preempt between here and here"), much of the defer + // implementation (untyped info on stack), bulk write + // barriers (write barrier check), + // reflect.{makeFuncStub,methodValueCall}. + // + // TODO(austin): We should improve this, or opt things + // in incrementally. + return false + } + + return true +} diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go index a025137..c0e8577 100644 --- a/libgo/go/runtime/proc.go +++ b/libgo/go/runtime/proc.go @@ -548,6 +548,7 @@ func schedinit() { usestackmaps = probestackmaps() mallocinit() + fastrandinit() // must run before mcommoninit mcommoninit(_g_.m) cpuinit() // must run before alginit alginit() // maps must not be used before this call @@ -622,8 +623,8 @@ func mcommoninit(mp *m) { sched.mnext++ checkmcount() - mp.fastrand[0] = 1597334677 * uint32(mp.id) - mp.fastrand[1] = uint32(cputicks()) + mp.fastrand[0] = uint32(int64Hash(uint64(mp.id), fastrandseed)) + mp.fastrand[1] = uint32(int64Hash(uint64(cputicks()), ^fastrandseed)) if mp.fastrand[0]|mp.fastrand[1] == 0 { mp.fastrand[1] = 1 } @@ -640,6 +641,13 @@ func mcommoninit(mp *m) { unlock(&sched.lock) } +var fastrandseed uintptr + +func fastrandinit() { + s := (*[unsafe.Sizeof(fastrandseed)]byte)(unsafe.Pointer(&fastrandseed))[:] + getRandomData(s) +} + // Mark gp ready to run. func ready(gp *g, traceskip int, next bool) { if trace.enabled { @@ -704,18 +712,6 @@ func readgstatus(gp *g) uint32 { return atomic.Load(&gp.atomicstatus) } -// Ownership of gcscanvalid: -// -// If gp is running (meaning status == _Grunning or _Grunning|_Gscan), -// then gp owns gp.gcscanvalid, and other goroutines must not modify it. -// -// Otherwise, a second goroutine can lock the scan state by setting _Gscan -// in the status bit and then modify gcscanvalid, and then unlock the scan state. -// -// Note that the first condition implies an exception to the second: -// if a second goroutine changes gp's status to _Grunning|_Gscan, -// that second goroutine still does not have the right to modify gcscanvalid. - // The Gscanstatuses are acting like locks and this releases them. 
// If it proves to be a performance hit we should be able to make these // simple atomic stores but for now we are going to throw if @@ -732,7 +728,8 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) { case _Gscanrunnable, _Gscanwaiting, _Gscanrunning, - _Gscansyscall: + _Gscansyscall, + _Gscanpreempted: if newval == oldval&^_Gscan { success = atomic.Cas(&gp.atomicstatus, oldval, newval) } @@ -774,17 +771,6 @@ func casgstatus(gp *g, oldval, newval uint32) { }) } - if oldval == _Grunning && gp.gcscanvalid { - // If oldvall == _Grunning, then the actual status must be - // _Grunning or _Grunning|_Gscan; either way, - // we own gp.gcscanvalid, so it's safe to read. - // gp.gcscanvalid must not be true when we are running. - systemstack(func() { - print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n") - throw("casgstatus") - }) - } - // See https://golang.org/cl/21503 for justification of the yield delay. const yieldDelay = 5 * 1000 var nextYield int64 @@ -795,14 +781,6 @@ func casgstatus(gp *g, oldval, newval uint32) { if oldval == _Gwaiting && gp.atomicstatus == _Grunnable { throw("casgstatus: waiting for Gwaiting but is Grunnable") } - // Help GC if needed. - // if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) { - // gp.preemptscan = false - // systemstack(func() { - // gcphasework(gp) - // }) - // } - // But meanwhile just yield. if i == 0 { nextYield = nanotime() + yieldDelay } @@ -815,174 +793,28 @@ func casgstatus(gp *g, oldval, newval uint32) { nextYield = nanotime() + yieldDelay/2 } } - if newval == _Grunning { - gp.gcscanvalid = false - } } -// scang blocks until gp's stack has been scanned. -// It might be scanned by scang or it might be scanned by the goroutine itself. -// Either way, the stack scan has completed when scang returns. -func scang(gp *g, gcw *gcWork) { - // Invariant; we (the caller, markroot for a specific goroutine) own gp.gcscandone. - // Nothing is racing with us now, but gcscandone might be set to true left over - // from an earlier round of stack scanning (we scan twice per GC). - // We use gcscandone to record whether the scan has been done during this round. - - gp.gcscandone = false - - // See https://golang.org/cl/21503 for justification of the yield delay. - const yieldDelay = 10 * 1000 - var nextYield int64 - - // Endeavor to get gcscandone set to true, - // either by doing the stack scan ourselves or by coercing gp to scan itself. - // gp.gcscandone can transition from false to true when we're not looking - // (if we asked for preemption), so any time we lock the status using - // castogscanstatus we have to double-check that the scan is still not done. -loop: - for i := 0; !gp.gcscandone; i++ { - switch s := readgstatus(gp); s { - default: - dumpgstatus(gp) - throw("stopg: invalid status") - - case _Gdead: - // No stack. - gp.gcscandone = true - break loop - - case _Gcopystack: - // Stack being switched. Go around again. - - case _Gsyscall: - if usestackmaps { - // Claim goroutine by setting scan bit. - // Racing with execution or readying of gp. - // The scan bit keeps them from running - // the goroutine until we're done. - if castogscanstatus(gp, s, s|_Gscan) { - if gp.scanningself { - // Don't try to scan the stack - // if the goroutine is going to do - // it itself. - // FIXME: can this happen? - restartg(gp) - break - } - if !gp.gcscandone { - // Send a signal to let the goroutine scan - // itself. This races with enter/exitsyscall. 
- // If the goroutine is not stopped at a safepoint, - // it will not scan the stack and we'll try again. - mp := gp.m - noteclear(&mp.scannote) - gp.scangcw = uintptr(unsafe.Pointer(gcw)) - tgkill(getpid(), _pid_t(mp.procid), _SIGURG) - - // Wait for gp to scan its own stack. - notesleep(&mp.scannote) - - if !gp.gcscandone { - // The signal delivered at a bad time. - // Try again. - restartg(gp) - break - } - } - restartg(gp) - break loop - } - break - } - fallthrough - - case _Grunnable, _Gwaiting: - // Claim goroutine by setting scan bit. - // Racing with execution or readying of gp. - // The scan bit keeps them from running - // the goroutine until we're done. - if castogscanstatus(gp, s, s|_Gscan) { - if gp.scanningself { - // Don't try to scan the stack - // if the goroutine is going to do - // it itself. - restartg(gp) - break - } - if !gp.gcscandone { - scanstack(gp, gcw) - gp.gcscandone = true - } - restartg(gp) - break loop - } - - case _Gexitingsyscall: - // This is a transient state during which we should not scan its stack. - // Try again. - - case _Gscanwaiting: - // newstack is doing a scan for us right now. Wait. - - case _Gscanrunning: - // checkPreempt is scanning. Wait. - - case _Grunning: - // Goroutine running. Try to preempt execution so it can scan itself. - // The preemption handler (in newstack) does the actual scan. - - // Optimization: if there is already a pending preemption request - // (from the previous loop iteration), don't bother with the atomics. - if gp.preemptscan && gp.preempt { - break - } - - // Ask for preemption and self scan. - if castogscanstatus(gp, _Grunning, _Gscanrunning) { - if !gp.gcscandone { - gp.preemptscan = true - gp.preempt = true - } - casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning) - } - } - - if i == 0 { - nextYield = nanotime() + yieldDelay - } - if nanotime() < nextYield { - procyield(10) - } else { - osyield() - nextYield = nanotime() + yieldDelay/2 - } +// casGToPreemptScan transitions gp from _Grunning to _Gscan|_Gpreempted. +// +// TODO(austin): This is the only status operation that both changes +// the status and locks the _Gscan bit. Rethink this. +func casGToPreemptScan(gp *g, old, new uint32) { + if old != _Grunning || new != _Gscan|_Gpreempted { + throw("bad g transition") } - - gp.preemptscan = false // cancel scan request if no longer needed -} - -// The GC requests that this routine be moved from a scanmumble state to a mumble state. -func restartg(gp *g) { - if gp.scang != 0 || gp.scangcw != 0 { - print("g ", gp.goid, "is being scanned scang=", gp.scang, " scangcw=", gp.scangcw, "\n") - throw("restartg: being scanned") + for !atomic.Cas(&gp.atomicstatus, _Grunning, _Gscan|_Gpreempted) { } +} - s := readgstatus(gp) - switch s { - default: - dumpgstatus(gp) - throw("restartg: unexpected status") - - case _Gdead: - // ok - - case _Gscanrunnable, - _Gscanwaiting, - _Gscansyscall: - casfrom_Gscanstatus(gp, s, s&^_Gscan) +// casGFromPreempted attempts to transition gp from _Gpreempted to +// _Gwaiting. If successful, the caller is responsible for +// re-scheduling gp. +func casGFromPreempted(gp *g, old, new uint32) bool { + if old != _Gpreempted || new != _Gwaiting { + throw("bad g transition") } + return atomic.Cas(&gp.atomicstatus, _Gpreempted, _Gwaiting) } // stopTheWorld stops all P's from executing goroutines, interrupting @@ -1001,8 +833,23 @@ func restartg(gp *g) { // goroutines. 
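
The _Gpreempted to _Gwaiting claim above is an ordinary compare-and-swap ownership handoff: whichever CAS succeeds becomes responsible for readying the goroutine. A minimal user-level sketch of the same pattern with sync/atomic (the state names are made up for illustration and are not runtime APIs):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

const (
	statePreempted uint32 = iota // analogous to _Gpreempted
	stateClaimed                 // analogous to _Gwaiting after a successful claim
)

func main() {
	var status = statePreempted
	var winners int32
	var wg sync.WaitGroup

	// Several "suspenders" race to claim the same object; exactly one
	// CAS succeeds and takes responsibility for the follow-up work.
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			if atomic.CompareAndSwapUint32(&status, statePreempted, stateClaimed) {
				atomic.AddInt32(&winners, 1)
			}
		}()
	}
	wg.Wait()
	fmt.Println("successful claims:", winners) // always 1
}
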
func stopTheWorld(reason string) { semacquire(&worldsema) - getg().m.preemptoff = reason - systemstack(stopTheWorldWithSema) + gp := getg() + gp.m.preemptoff = reason + systemstack(func() { + // Mark the goroutine which called stopTheWorld preemptible so its + // stack may be scanned. + // This lets a mark worker scan us while we try to stop the world + // since otherwise we could get in a mutual preemption deadlock. + // We must not modify anything on the G stack because a stack shrink + // may occur. A stack shrink is otherwise OK though because in order + // to return from this function (and to leave the system stack) we + // must have preempted all goroutines, including any attempting + // to scan our stack, in which case, any stack shrinking will + // have already completed by the time we exit. + casgstatus(gp, _Grunning, _Gwaiting) + stopTheWorldWithSema() + casgstatus(gp, _Gwaiting, _Grunning) + }) } // startTheWorld undoes the effects of stopTheWorld. @@ -1014,10 +861,31 @@ func startTheWorld() { getg().m.preemptoff = "" } -// Holding worldsema grants an M the right to try to stop the world -// and prevents gomaxprocs from changing concurrently. +// stopTheWorldGC has the same effect as stopTheWorld, but blocks +// until the GC is not running. It also blocks a GC from starting +// until startTheWorldGC is called. +func stopTheWorldGC(reason string) { + semacquire(&gcsema) + stopTheWorld(reason) +} + +// startTheWorldGC undoes the effects of stopTheWorldGC. +func startTheWorldGC() { + startTheWorld() + semrelease(&gcsema) +} + +// Holding worldsema grants an M the right to try to stop the world. var worldsema uint32 = 1 +// Holding gcsema grants the M the right to block a GC, and blocks +// until the current GC is done. In particular, it prevents gomaxprocs +// from changing concurrently. +// +// TODO(mknyszek): Once gomaxprocs and the execution tracer can handle +// being changed/enabled during a GC, remove this. +var gcsema uint32 = 1 + // stopTheWorldWithSema is the core implementation of stopTheWorld. // The caller is responsible for acquiring worldsema and disabling // preemption first and then should stopTheWorldWithSema on the system @@ -1119,7 +987,7 @@ func stopTheWorldWithSema() { func startTheWorldWithSema(emitTraceEvent bool) int64 { mp := acquirem() // disable preemption because it can be holding p in a local var if netpollinited() { - list := netpoll(false) // non-blocking + list := netpoll(0) // non-blocking injectglist(&list) } lock(&sched.lock) @@ -1299,6 +1167,11 @@ func mexit(osStack bool) { // Free the gsignal stack. if m.gsignal != nil { stackfree(m.gsignal) + // On some platforms, when calling into VDSO (e.g. nanotime) + // we store our g on the gsignal stack, if there is one. + // Now the stack is freed, unlink it from the m, so we + // won't write to it when calling VDSO code. + m.gsignal = nil } // Remove m from allm. @@ -1637,8 +1510,6 @@ func oneNewExtraM() { // the goroutine stack ends. mp, g0SP, g0SPSize := allocm(nil, nil, true) gp := malg(true, false, nil, nil) - gp.gcscanvalid = true - gp.gcscandone = true // malg returns status as _Gidle. Change to _Gdead before // adding to allg where GC can see it. We use _Gdead to hide // this from tracebacks and stack scans since it isn't a @@ -1704,6 +1575,7 @@ func dropm() { // Return mp.curg to dead state. casgstatus(mp.curg, _Gsyscall, _Gdead) + mp.curg.preemptStop = false atomic.Xadd(&sched.ngsys, +1) // Block signals before unminit. 
@@ -2034,6 +1906,9 @@ func handoffp(_p_ *p) { startm(_p_, false) return } + if when := nobarrierWakeTime(_p_); when != 0 { + wakeNetPoller(when) + } pidleput(_p_) unlock(&sched.lock) } @@ -2135,14 +2010,16 @@ func gcstopm() { func execute(gp *g, inheritTime bool) { _g_ := getg() + // Assign gp.m before entering _Grunning so running Gs have an + // M. + _g_.m.curg = gp + gp.m = _g_.m casgstatus(gp, _Grunnable, _Grunning) gp.waitsince = 0 gp.preempt = false if !inheritTime { _g_.m.p.ptr().schedtick++ } - _g_.m.curg = gp - gp.m = _g_.m // Check whether the profiler needs to be turned on or off. hz := sched.profilehz @@ -2163,7 +2040,7 @@ func execute(gp *g, inheritTime bool) { } // Finds a runnable goroutine to execute. -// Tries to steal from other P's, get g from global queue, poll network. +// Tries to steal from other P's, get g from local or global queue, poll network. func findrunnable() (gp *g, inheritTime bool) { _g_ := getg() @@ -2180,6 +2057,9 @@ top: if _p_.runSafePointFn != 0 { runSafePointFn() } + + now, pollUntil, _ := checkTimers(_p_, 0) + if fingwait && fingwake { if gp := wakefing(); gp != nil { ready(gp, 0, true) @@ -2212,7 +2092,7 @@ top: // not set lastpoll yet), this thread will do blocking netpoll below // anyway. if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 { - if list := netpoll(false); !list.empty() { // non-blocking + if list := netpoll(0); !list.empty() { // non-blocking gp := list.pop() injectglist(&list) casgstatus(gp, _Gwaiting, _Grunnable) @@ -2225,12 +2105,7 @@ top: // Steal work from other P's. procs := uint32(gomaxprocs) - if atomic.Load(&sched.npidle) == procs-1 { - // Either GOMAXPROCS=1 or everybody, except for us, is idle already. - // New work can appear from returning syscall/cgocall, network or timers. - // Neither of that submits to local run queues, so no point in stealing. - goto stop - } + ranTimer := false // If number of spinning M's >= number of busy P's, block. // This is necessary to prevent excessive CPU consumption // when GOMAXPROCS>>1 but the program parallelism is low. @@ -2247,11 +2122,48 @@ top: goto top } stealRunNextG := i > 2 // first look for ready queues with more than 1 g - if gp := runqsteal(_p_, allp[enum.position()], stealRunNextG); gp != nil { + p2 := allp[enum.position()] + if _p_ == p2 { + continue + } + if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil { return gp, false } + + // Consider stealing timers from p2. + // This call to checkTimers is the only place where + // we hold a lock on a different P's timers. + // Lock contention can be a problem here, so avoid + // grabbing the lock if p2 is running and not marked + // for preemption. If p2 is running and not being + // preempted we assume it will handle its own timers. + if i > 2 && shouldStealTimers(p2) { + tnow, w, ran := checkTimers(p2, now) + now = tnow + if w != 0 && (pollUntil == 0 || w < pollUntil) { + pollUntil = w + } + if ran { + // Running the timers may have + // made an arbitrary number of G's + // ready and added them to this P's + // local run queue. That invalidates + // the assumption of runqsteal + // that is always has room to add + // stolen G's. So check now if there + // is a local G to run. + if gp, inheritTime := runqget(_p_); gp != nil { + return gp, inheritTime + } + ranTimer = true + } + } } } + if ranTimer { + // Running a timer may have made some goroutine ready. 
+ goto top + } stop: @@ -2268,10 +2180,16 @@ stop: return gp, false } + delta := int64(-1) + if pollUntil != 0 { + // checkTimers ensures that polluntil > now. + delta = pollUntil - now + } + // wasm only: // If a callback returned and no other goroutine is awake, // then pause execution until a callback was triggered. - if beforeIdle() { + if beforeIdle(delta) { // At least one goroutine got woken. goto top } @@ -2359,21 +2277,35 @@ stop: } // poll network - if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Xchg64(&sched.lastpoll, 0) != 0 { + if netpollinited() && (atomic.Load(&netpollWaiters) > 0 || pollUntil != 0) && atomic.Xchg64(&sched.lastpoll, 0) != 0 { + atomic.Store64(&sched.pollUntil, uint64(pollUntil)) if _g_.m.p != 0 { throw("findrunnable: netpoll with p") } if _g_.m.spinning { throw("findrunnable: netpoll with spinning") } - list := netpoll(true) // block until new work is available + if faketime != 0 { + // When using fake time, just poll. + delta = 0 + } + list := netpoll(delta) // block until new work is available + atomic.Store64(&sched.pollUntil, 0) atomic.Store64(&sched.lastpoll, uint64(nanotime())) - if !list.empty() { - lock(&sched.lock) - _p_ = pidleget() - unlock(&sched.lock) - if _p_ != nil { - acquirep(_p_) + if faketime != 0 && list.empty() { + // Using fake time and nothing is ready; stop M. + // When all M's stop, checkdead will call timejump. + stopm() + goto top + } + lock(&sched.lock) + _p_ = pidleget() + unlock(&sched.lock) + if _p_ == nil { + injectglist(&list) + } else { + acquirep(_p_) + if !list.empty() { gp := list.pop() injectglist(&list) casgstatus(gp, _Gwaiting, _Grunnable) @@ -2382,7 +2314,16 @@ stop: } return gp, false } - injectglist(&list) + if wasSpinning { + _g_.m.spinning = true + atomic.Xadd(&sched.nmspinning, 1) + } + goto top + } + } else if pollUntil != 0 && netpollinited() { + pollerPollUntil := int64(atomic.Load64(&sched.pollUntil)) + if pollerPollUntil == 0 || pollerPollUntil > pollUntil { + netpollBreak() } } stopm() @@ -2402,7 +2343,7 @@ func pollWork() bool { return true } if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll != 0 { - if list := netpoll(false); !list.empty() { + if list := netpoll(0); !list.empty() { injectglist(&list) return true } @@ -2410,6 +2351,22 @@ func pollWork() bool { return false } +// wakeNetPoller wakes up the thread sleeping in the network poller, +// if there is one, and if it isn't going to wake up anyhow before +// the when argument. +func wakeNetPoller(when int64) { + if atomic.Load64(&sched.lastpoll) == 0 { + // In findrunnable we ensure that when polling the pollUntil + // field is either zero or the time to which the current + // poll is expected to run. This can have a spurious wakeup + // but should never miss a wakeup. + pollerPollUntil := int64(atomic.Load64(&sched.pollUntil)) + if pollerPollUntil == 0 || pollerPollUntil > when { + netpollBreak() + } + } +} + func resetspinning() { _g_ := getg() if !_g_.m.spinning { @@ -2474,14 +2431,26 @@ func schedule() { } top: + pp := _g_.m.p.ptr() + pp.preempt = false + if sched.gcwaiting != 0 { gcstopm() goto top } - if _g_.m.p.ptr().runSafePointFn != 0 { + if pp.runSafePointFn != 0 { runSafePointFn() } + // Sanity check: if we are spinning, the run queue should be empty. + // Check this before calling checkTimers, as that might call + // goready to put a ready goroutine on the local run queue. 
+ if _g_.m.spinning && (pp.runnext != 0 || pp.runqhead != pp.runqtail) { + throw("schedule: spinning with local work") + } + + checkTimers(pp, 0) + var gp *g var inheritTime bool @@ -2513,9 +2482,8 @@ top: } if gp == nil { gp, inheritTime = runqget(_g_.m.p.ptr()) - if gp != nil && _g_.m.spinning { - throw("schedule: spinning with local work") - } + // We can see gp != nil here even if the M is spinning, + // if checkTimers added a local goroutine via goready. // Because gccgo does not implement preemption as a stack check, // we need to check for preemption here for fairness. @@ -2591,6 +2559,62 @@ func dropg() { setGNoWB(&_g_.m.curg, nil) } +// checkTimers runs any timers for the P that are ready. +// If now is not 0 it is the current time. +// It returns the current time or 0 if it is not known, +// and the time when the next timer should run or 0 if there is no next timer, +// and reports whether it ran any timers. +// If the time when the next timer should run is not 0, +// it is always larger than the returned time. +// We pass now in and out to avoid extra calls of nanotime. +//go:yeswritebarrierrec +func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) { + lock(&pp.timersLock) + + adjusttimers(pp) + + rnow = now + if len(pp.timers) > 0 { + if rnow == 0 { + rnow = nanotime() + } + for len(pp.timers) > 0 { + // Note that runtimer may temporarily unlock + // pp.timersLock. + if tw := runtimer(pp, rnow); tw != 0 { + if tw > 0 { + pollUntil = tw + } + break + } + ran = true + } + } + + unlock(&pp.timersLock) + + return rnow, pollUntil, ran +} + +// shouldStealTimers reports whether we should try stealing the timers from p2. +// We don't steal timers from a running P that is not marked for preemption, +// on the assumption that it will run its own timers. This reduces +// contention on the timers lock. +func shouldStealTimers(p2 *p) bool { + if p2.status != _Prunning { + return true + } + mp := p2.m.ptr() + if mp == nil || mp.locks > 0 { + return false + } + gp := mp.curg + if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt { + return false + } + return true +} + func parkunlock_c(gp *g, lock unsafe.Pointer) bool { unlock((*mutex)(lock)) return true @@ -2604,8 +2628,8 @@ func park_m(gp *g) { traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip) } - dropg() casgstatus(gp, _Grunning, _Gwaiting) + dropg() if fn := _g_.m.waitunlockf; fn != nil { ok := fn(gp, _g_.m.waitlock) @@ -2628,8 +2652,8 @@ func goschedImpl(gp *g) { dumpgstatus(gp) throw("bad g status") } - dropg() casgstatus(gp, _Grunning, _Grunnable) + dropg() lock(&sched.lock) globrunqput(gp) unlock(&sched.lock) @@ -2648,7 +2672,7 @@ func gosched_m(gp *g) { // goschedguarded is a forbidden-states-avoided version of gosched_m func goschedguarded_m(gp *g) { - if gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" || gp.m.p.ptr().status != _Prunning { + if !canPreemptM(gp.m) { gogo(gp) // never return } @@ -2665,6 +2689,47 @@ func gopreempt_m(gp *g) { goschedImpl(gp) } +// preemptPark parks gp and puts it in _Gpreempted. +// +//go:systemstack +func preemptPark(gp *g) { + if trace.enabled { + traceGoPark(traceEvGoBlock, 0) + } + status := readgstatus(gp) + if status&^_Gscan != _Grunning { + dumpgstatus(gp) + throw("bad g status") + } + gp.waitreason = waitReasonPreempted + // Transition from _Grunning to _Gscan|_Gpreempted. 
We can't + // be in _Grunning when we dropg because then we'd be running + // without an M, but the moment we're in _Gpreempted, + // something could claim this G before we've fully cleaned it + // up. Hence, we set the scan bit to lock down further + // transitions until we can dropg. + casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted) + dropg() + casfrom_Gscanstatus(gp, _Gscan|_Gpreempted, _Gpreempted) + schedule() +} + +// goyield is like Gosched, but it: +// - does not emit a GoSched trace event +// - puts the current G on the runq of the current P instead of the globrunq +func goyield() { + checkTimeouts() + mcall(goyield_m) +} + +func goyield_m(gp *g) { + pp := gp.m.p.ptr() + casgstatus(gp, _Grunning, _Grunnable) + dropg() + runqput(pp, gp, false) + schedule() +} + // Finishes execution of the current goroutine. func goexit1() { if trace.enabled { @@ -2687,6 +2752,7 @@ func goexit0(gp *g) { gp.lockedm = 0 _g_.m.lockedg = 0 gp.entry = nil + gp.preemptStop = false gp.paniconfault = false gp._defer = nil // should be true already but just in case. gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data. @@ -2705,9 +2771,6 @@ func goexit0(gp *g) { gp.gcAssistBytes = 0 } - // Note that gp's stack scan is now "valid" because it has no - // stack. - gp.gcscanvalid = true dropg() if GOARCH == "wasm" { // no threads yet on wasm @@ -3215,7 +3278,6 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g { if isSystemGoroutine(newg, false) { atomic.Xadd(&sched.ngsys, +1) } - newg.gcscanvalid = false casgstatus(newg, _Gdead, _Grunnable) if _p_.goidcache == _p_.goidcacheend { @@ -3736,6 +3798,20 @@ func (pp *p) destroy() { globrunqputhead(pp.runnext.ptr()) pp.runnext = 0 } + if len(pp.timers) > 0 { + plocal := getg().m.p.ptr() + // The world is stopped, but we acquire timersLock to + // protect against sysmon calling timeSleepUntil. + // This is the only case where we hold the timersLock of + // more than one P, so there are no deadlock concerns. + lock(&plocal.timersLock) + lock(&pp.timersLock) + moveTimers(plocal, pp.timers) + pp.timers = nil + pp.adjustTimers = 0 + unlock(&pp.timersLock) + unlock(&plocal.timersLock) + } // If there's a background worker, make it runnable and put // it on the global queue so it can clean itself up. if gp := pp.gcBgMarkWorker.ptr(); gp != nil { @@ -3761,14 +3837,18 @@ func (pp *p) destroy() { pp.deferpoolbuf[i] = nil } pp.deferpool = pp.deferpoolbuf[:0] + systemstack(func() { + for i := 0; i < pp.mspancache.len; i++ { + // Safe to call since the world is stopped. + mheap_.spanalloc.free(unsafe.Pointer(pp.mspancache.buf[i])) + } + pp.mspancache.len = 0 + pp.pcache.flush(&mheap_.pages) + }) freemcache(pp.mcache) pp.mcache = nil gfpurge(pp) traceProcFree(pp) - if raceenabled { - raceprocdestroy(pp.raceprocctx) - pp.raceprocctx = 0 - } pp.gcAssistTime = 0 pp.status = _Pdead } @@ -4016,7 +4096,8 @@ func checkdead() { } s := readgstatus(gp) switch s &^ _Gscan { - case _Gwaiting: + case _Gwaiting, + _Gpreempted: grunning++ case _Grunnable, _Grunning, @@ -4028,17 +4109,18 @@ func checkdead() { } unlock(&allglock) if grunning == 0 { // possible if main goroutine calls runtime·Goexit() + unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang throw("no goroutines (main called runtime.Goexit) - deadlock!") } // Maybe jump time forward for playground. 
- gp := timejump() - if gp != nil { - casgstatus(gp, _Gwaiting, _Grunnable) - globrunqput(gp) - _p_ := pidleget() - if _p_ == nil { - throw("checkdead: no p for timer") + _p_ := timejump() + if _p_ != nil { + for pp := &sched.pidle; *pp != 0; pp = &(*pp).ptr().link { + if (*pp).ptr() == _p_ { + *pp = _p_.link + break + } } mp := mget() if mp == nil { @@ -4051,7 +4133,15 @@ func checkdead() { return } + // There are no goroutines running, so we can look at the P's. + for _, _p_ := range allp { + if len(_p_.timers) > 0 { + return + } + } + getg().m.throwing = -1 // do not dump full stacks + unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang throw("all goroutines are asleep - deadlock!") } @@ -4084,32 +4174,34 @@ func sysmon() { delay = 10 * 1000 } usleep(delay) + now := nanotime() + next := timeSleepUntil() if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) { lock(&sched.lock) if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) { - atomic.Store(&sched.sysmonwait, 1) - unlock(&sched.lock) - // Make wake-up period small enough - // for the sampling to be correct. - maxsleep := forcegcperiod / 2 - shouldRelax := true - if osRelaxMinNS > 0 { - next := timeSleepUntil() - now := nanotime() - if next-now < osRelaxMinNS { - shouldRelax = false + if next > now { + atomic.Store(&sched.sysmonwait, 1) + unlock(&sched.lock) + // Make wake-up period small enough + // for the sampling to be correct. + sleep := forcegcperiod / 2 + if next-now < sleep { + sleep = next - now } + shouldRelax := sleep >= osRelaxMinNS + if shouldRelax { + osRelax(true) + } + notetsleep(&sched.sysmonnote, sleep) + if shouldRelax { + osRelax(false) + } + now = nanotime() + next = timeSleepUntil() + lock(&sched.lock) + atomic.Store(&sched.sysmonwait, 0) + noteclear(&sched.sysmonnote) } - if shouldRelax { - osRelax(true) - } - notetsleep(&sched.sysmonnote, maxsleep) - if shouldRelax { - osRelax(false) - } - lock(&sched.lock) - atomic.Store(&sched.sysmonwait, 0) - noteclear(&sched.sysmonnote) idle = 0 delay = 20 } @@ -4121,10 +4213,9 @@ func sysmon() { } // poll network if not polled for more than 10ms lastpoll := int64(atomic.Load64(&sched.lastpoll)) - now := nanotime() if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now { atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now)) - list := netpoll(false) // non-blocking - returns list of goroutines + list := netpoll(0) // non-blocking - returns list of goroutines if !list.empty() { // Need to decrement number of idle locked M's // (pretending that one more is running) before injectglist. @@ -4138,6 +4229,12 @@ func sysmon() { incidlelocked(1) } } + if next < now { + // There are timers that should have already run, + // perhaps because there is an unpreemptible P. + // Try to start an M to run them. + startm(nil, false) + } // retake P's blocked in syscalls // and preempt long running G's if retake(now) != 0 { @@ -4296,6 +4393,12 @@ func preemptone(_p_ *p) bool { // and a few other places, which is at least better than doing // nothing at all. + // Request an async preemption of this P. 
+ if preemptMSupported && debug.asyncpreemptoff == 0 { + _p_.preempt = true + preemptM(mp) + } + return true } @@ -4324,7 +4427,7 @@ func schedtrace(detailed bool) { if mp != nil { id = mp.id } - print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, "\n") + print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, " timerslen=", len(_p_.timers), "\n") } else { // In non-detailed mode format lengths of per-P run queues as: // [len1 len2 len3 len4] diff --git a/libgo/go/runtime/proc_test.go b/libgo/go/runtime/proc_test.go index fee03be..a693937 100644 --- a/libgo/go/runtime/proc_test.go +++ b/libgo/go/runtime/proc_test.go @@ -362,6 +362,17 @@ func TestPreemptionGC(t *testing.T) { atomic.StoreUint32(&stop, 1) } +func TestAsyncPreempt(t *testing.T) { + if !runtime.PreemptMSupported { + t.Skip("asynchronous preemption not supported on this platform") + } + output := runTestProg(t, "testprog", "AsyncPreempt") + want := "OK\n" + if output != want { + t.Fatalf("want %s, got %s\n", want, output) + } +} + func TestGCFairness(t *testing.T) { output := runTestProg(t, "testprog", "GCFairness") want := "OK\n" @@ -988,3 +999,42 @@ func TestPreemptionAfterSyscall(t *testing.T) { func TestGetgThreadSwitch(t *testing.T) { runtime.RunGetgThreadSwitchTest() } + +// TestNetpollBreak tests that netpollBreak can break a netpoll. +// This test is not particularly safe since the call to netpoll +// will pick up any stray files that are ready, but it should work +// OK as long it is not run in parallel. +func TestNetpollBreak(t *testing.T) { + if runtime.GOMAXPROCS(0) == 1 { + t.Skip("skipping: GOMAXPROCS=1") + } + + // Make sure that netpoll is initialized. + runtime.NetpollGenericInit() + + start := time.Now() + c := make(chan bool, 2) + go func() { + c <- true + runtime.Netpoll(10 * time.Second.Nanoseconds()) + c <- true + }() + <-c + // Loop because the break might get eaten by the scheduler. + // Break twice to break both the netpoll we started and the + // scheduler netpoll. 
+loop: + for { + runtime.Usleep(100) + runtime.NetpollBreak() + runtime.NetpollBreak() + select { + case <-c: + break loop + default: + } + } + if dur := time.Since(start); dur > 5*time.Second { + t.Errorf("netpollBreak did not interrupt netpoll: slept for: %v", dur) + } +} diff --git a/libgo/go/runtime/race0.go b/libgo/go/runtime/race0.go index f1d3706..6f26afa 100644 --- a/libgo/go/runtime/race0.go +++ b/libgo/go/runtime/race0.go @@ -29,6 +29,7 @@ func racereadrangepc(addr unsafe.Pointer, sz, callerpc, pc uintptr) { th func racewriterangepc(addr unsafe.Pointer, sz, callerpc, pc uintptr) { throw("race") } func raceacquire(addr unsafe.Pointer) { throw("race") } func raceacquireg(gp *g, addr unsafe.Pointer) { throw("race") } +func raceacquirectx(racectx uintptr, addr unsafe.Pointer) { throw("race") } func racerelease(addr unsafe.Pointer) { throw("race") } func racereleaseg(gp *g, addr unsafe.Pointer) { throw("race") } func racereleasemerge(addr unsafe.Pointer) { throw("race") } @@ -38,3 +39,4 @@ func racemalloc(p unsafe.Pointer, sz uintptr) { th func racefree(p unsafe.Pointer, sz uintptr) { throw("race") } func racegostart(pc uintptr) uintptr { throw("race"); return 0 } func racegoend() { throw("race") } +func racectxend(racectx uintptr) { throw("race") } diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go index c64549c..60aa90f 100644 --- a/libgo/go/runtime/runtime1.go +++ b/libgo/go/runtime/runtime1.go @@ -326,6 +326,7 @@ var debug struct { scheddetail int32 schedtrace int32 tracebackancestors int32 + asyncpreemptoff int32 } var dbgvars = []dbgVar{ @@ -345,6 +346,7 @@ var dbgvars = []dbgVar{ {"scheddetail", &debug.scheddetail}, {"schedtrace", &debug.schedtrace}, {"tracebackancestors", &debug.tracebackancestors}, + {"asyncpreemptoff", &debug.asyncpreemptoff}, } func parsedebugvars() { diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go index 77648c2..d50f82a 100644 --- a/libgo/go/runtime/runtime2.go +++ b/libgo/go/runtime/runtime2.go @@ -40,7 +40,7 @@ const ( // _Grunning means this goroutine may execute user code. The // stack is owned by this goroutine. It is not on a run queue. - // It is assigned an M and a P. + // It is assigned an M and a P (g.m and g.m.p are valid). _Grunning // 2 // _Gsyscall means this goroutine is executing a system call. @@ -78,11 +78,18 @@ const ( // stack is owned by the goroutine that put it in _Gcopystack. _Gcopystack // 8 + // _Gpreempted means this goroutine stopped itself for a + // suspendG preemption. It is like _Gwaiting, but nothing is + // yet responsible for ready()ing it. Some suspendG must CAS + // the status to _Gwaiting to take responsibility for + // ready()ing this G. + _Gpreempted // 9 + // _Gexitingsyscall means this goroutine is exiting from a // system call. This is like _Gsyscall, but the GC should not // scan its stack. Currently this is only used in exitsyscall0 // as a transient state when it drops the G. - _Gexitingsyscall // 9 + _Gexitingsyscall // 10 // _Gscan combined with one of the above states other than // _Grunning indicates that GC is scanning the stack. The @@ -95,11 +102,12 @@ const ( // // atomicstatus&~Gscan gives the state the goroutine will // return to when the scan completes. 
- _Gscan = 0x1000 - _Gscanrunnable = _Gscan + _Grunnable // 0x1001 - _Gscanrunning = _Gscan + _Grunning // 0x1002 - _Gscansyscall = _Gscan + _Gsyscall // 0x1003 - _Gscanwaiting = _Gscan + _Gwaiting // 0x1004 + _Gscan = 0x1000 + _Gscanrunnable = _Gscan + _Grunnable // 0x1001 + _Gscanrunning = _Gscan + _Grunning // 0x1002 + _Gscansyscall = _Gscan + _Gsyscall // 0x1003 + _Gscanwaiting = _Gscan + _Gwaiting // 0x1004 + _Gscanpreempted = _Gscan + _Gpreempted // 0x1009 ) const ( @@ -407,21 +415,36 @@ type g struct { param unsafe.Pointer // passed parameter on wakeup atomicstatus uint32 // Not for gccgo: stackLock uint32 // sigprof/scang lock; TODO: fold in to atomicstatus - goid int64 - schedlink guintptr - waitsince int64 // approx time when the g become blocked - waitreason waitReason // if status==Gwaiting - preempt bool // preemption signal, duplicates stackguard0 = stackpreempt - paniconfault bool // panic (instead of crash) on unexpected fault address - preemptscan bool // preempted g does scan for gc - gcscandone bool // g has scanned stack; protected by _Gscan bit in status - gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan; TODO: remove? - throwsplit bool // must not split stack - raceignore int8 // ignore race detection events - sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine - sysexitticks int64 // cputicks when syscall has returned (for tracing) - traceseq uint64 // trace event sequencer - tracelastp puintptr // last P emitted an event for this goroutine + goid int64 + schedlink guintptr + waitsince int64 // approx time when the g become blocked + waitreason waitReason // if status==Gwaiting + preempt bool // preemption signal, duplicates stackguard0 = stackpreempt + preemptStop bool // transition to _Gpreempted on preemption; otherwise, just deschedule + // Not for gccgo: preemptShrink bool // shrink stack at synchronous safe point + // asyncSafePoint is set if g is stopped at an asynchronous + // safe point. This means there are frames on the stack + // without precise pointer information. + asyncSafePoint bool + + paniconfault bool // panic (instead of crash) on unexpected fault address + preemptscan bool // preempted g does scan for gc + gcscandone bool // g has scanned stack; protected by _Gscan bit in status + throwsplit bool // must not split stack + + gcScannedSyscallStack bool // gccgo specific; see scanSyscallStack + + // activeStackChans indicates that there are unlocked channels + // pointing into this goroutine's stack. If true, stack + // copying needs to acquire channel locks to protect these + // areas of the stack. + activeStackChans bool + + raceignore int8 // ignore race detection events + sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine + sysexitticks int64 // cputicks when syscall has returned (for tracing) + traceseq uint64 // trace event sequencer + tracelastp puintptr // last P emitted an event for this goroutine lockedm muintptr sig uint32 writebuf []byte @@ -555,8 +578,7 @@ type m struct { waittraceskip int startingtrace bool syscalltick uint32 - // Not for gccgo: thread uintptr // thread handle - freelink *m // on sched.freem + freelink *m // on sched.freem // these are here because they are too large to be on the stack // of low-level NOSPLIT functions. 
@@ -566,6 +588,11 @@ type m struct { // Not for gccgo: libcallg guintptr // Not for gccgo: syscall libcall // stores syscall parameters on windows + // preemptGen counts the number of completed preemption + // signals. This is used to detect when a preemption is + // requested, but fails. Accessed atomically. + preemptGen uint32 + dlogPerM mOS @@ -590,6 +617,7 @@ type p struct { sysmontick sysmontick // last tick observed by sysmon m muintptr // back-link to associated m (nil if idle) mcache *mcache + pcache pageCache raceprocctx uintptr // gccgo has only one size of defer. @@ -624,6 +652,17 @@ type p struct { sudogcache []*sudog sudogbuf [128]*sudog + // Cache of mspan objects from the heap. + mspancache struct { + // We need an explicit length here because this field is used + // in allocation codepaths where write barriers are not allowed, + // and eliminating the write barrier/keeping it eliminated from + // slice updates is tricky, moreso than just managing the length + // ourselves. + len int + buf [128]*mspan + } + tracebuf traceBufPtr // traceSweep indicates the sweep events should be traced. @@ -660,13 +699,36 @@ type p struct { runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point + // Lock for timers. We normally access the timers while running + // on this P, but the scheduler can also do it from a different P. + timersLock mutex + + // Actions to take at some time. This is used to implement the + // standard library's time package. + // Must hold timersLock to access. + timers []*timer + + // Number of timerModifiedEarlier timers on P's heap. + // This should only be modified while holding timersLock, + // or while the timer status is in a transient state + // such as timerModifying. + adjustTimers uint32 + + // Race context used while executing timer functions. + // Not for gccgo: timerRaceCtx uintptr + + // preempt is set to indicate that this P should be enter the + // scheduler ASAP (regardless of what G is running on it). + preempt bool + pad cpu.CacheLinePad } type schedt struct { // accessed atomically. keep at top to ensure alignment on 32-bit systems. - goidgen uint64 - lastpoll uint64 + goidgen uint64 + lastpoll uint64 // time of last network poll, 0 if currently polling + pollUntil uint64 // time to which current poll is sleeping lock mutex @@ -841,6 +903,11 @@ type _defer struct { // panics // This is the gccgo version. +// +// This is marked go:notinheap because _panic values must only ever +// live on the stack. +// +//go:notinheap type _panic struct { // The next entry in the stack. link *_panic @@ -858,6 +925,9 @@ type _panic struct { // Whether this panic was already seen by a deferred function // which called panic again. aborted bool + + // Whether this panic was created for goexit. + goexit bool } // ancestorInfo records details of where a goroutine was started. 
@@ -906,6 +976,7 @@ const ( waitReasonTraceReaderBlocked // "trace reader (blocked)" waitReasonWaitForGCCycle // "wait for GC cycle" waitReasonGCWorkerIdle // "GC worker (idle)" + waitReasonPreempted // "preempted" ) var waitReasonStrings = [...]string{ @@ -934,6 +1005,7 @@ var waitReasonStrings = [...]string{ waitReasonTraceReaderBlocked: "trace reader (blocked)", waitReasonWaitForGCCycle: "wait for GC cycle", waitReasonGCWorkerIdle: "GC worker (idle)", + waitReasonPreempted: "preempted", } func (w waitReason) String() string { diff --git a/libgo/go/runtime/runtime_mmap_test.go b/libgo/go/runtime/runtime_mmap_test.go index c7703f4..21918c5 100644 --- a/libgo/go/runtime/runtime_mmap_test.go +++ b/libgo/go/runtime/runtime_mmap_test.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build aix darwin dragonfly freebsd hurd linux nacl netbsd openbsd solaris +// +build aix darwin dragonfly freebsd hurd linux netbsd openbsd solaris package runtime_test diff --git a/libgo/go/runtime/runtime_test.go b/libgo/go/runtime/runtime_test.go index e607868..ca35205 100644 --- a/libgo/go/runtime/runtime_test.go +++ b/libgo/go/runtime/runtime_test.go @@ -122,6 +122,21 @@ func BenchmarkDeferMany(b *testing.B) { } } +func BenchmarkPanicRecover(b *testing.B) { + for i := 0; i < b.N; i++ { + defer3() + } +} + +func defer3() { + defer func(x, y, z int) { + if recover() == nil { + panic("failed recover") + } + }(1, 2, 3) + panic("hi") +} + // golang.org/issue/7063 func TestStopCPUProfilingWithProfilerOff(t *testing.T) { SetCPUProfileRate(0) @@ -177,13 +192,14 @@ func TestSetPanicOnFault(t *testing.T) { } } +// testSetPanicOnFault tests one potentially faulting address. +// It deliberately constructs and uses an invalid pointer, +// so mark it as nocheckptr. +//go:nocheckptr func testSetPanicOnFault(t *testing.T, addr uintptr, nfault *int) { if strings.Contains(Version(), "llvm") { t.Skip("LLVM doesn't support non-call exception") } - if GOOS == "nacl" { - t.Skip("nacl doesn't seem to fault on high addresses") - } if GOOS == "js" { t.Skip("js does not support catching faults") } @@ -283,32 +299,6 @@ func TestTrailingZero(t *testing.T) { } */ -func TestBadOpen(t *testing.T) { - if GOOS == "windows" || GOOS == "nacl" || GOOS == "js" { - t.Skip("skipping OS that doesn't have open/read/write/close") - } - // make sure we get the correct error code if open fails. Same for - // read/write/close on the resulting -1 fd. See issue 10052. - nonfile := []byte("/notreallyafile") - fd := Open(&nonfile[0], 0, 0) - if fd != -1 { - t.Errorf("open(\"%s\")=%d, want -1", string(nonfile), fd) - } - var buf [32]byte - r := Read(-1, unsafe.Pointer(&buf[0]), int32(len(buf))) - if r != -1 { - t.Errorf("read()=%d, want -1", r) - } - w := Write(^uintptr(0), unsafe.Pointer(&buf[0]), int32(len(buf))) - if w != -1 { - t.Errorf("write()=%d, want -1", w) - } - c := Close(-1) - if c != -1 { - t.Errorf("close()=%d, want -1", c) - } -} - func TestAppendGrowth(t *testing.T) { var x []int64 check := func(want int) { diff --git a/libgo/go/runtime/select.go b/libgo/go/runtime/select.go index 41e5e88..c9e3dd7 100644 --- a/libgo/go/runtime/select.go +++ b/libgo/go/runtime/select.go @@ -19,7 +19,7 @@ const debugSelect = false // scase.kind values. // Known to compiler. -// Changes here must also be made in src/cmd/compile/internal/gc/select.go's walkselect. +// Changes here must also be made in src/cmd/compile/internal/gc/select.go's walkselectcases. 
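The waitReasonPreempted entry added above follows the runtime's usual enum-plus-string-table shape. A tiny standalone sketch of that pattern, trimmed to a single real entry (waitReasonZero is a placeholder, not a runtime constant):

package main

import "fmt"

type waitReason uint8

const (
	waitReasonZero      waitReason = iota // placeholder for the earlier reasons
	waitReasonPreempted                   // "preempted"
)

var waitReasonStrings = [...]string{
	waitReasonZero:      "",
	waitReasonPreempted: "preempted",
}

// String makes waitReason a fmt.Stringer, with a bounds check for safety.
func (w waitReason) String() string {
	if int(w) < len(waitReasonStrings) {
		return waitReasonStrings[w]
	}
	return "unknown wait reason"
}

func main() {
	fmt.Println(waitReasonPreempted) // prints "preempted"
}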
const ( caseNil = iota caseRecv @@ -70,6 +70,9 @@ func selunlock(scases []scase, lockorder []uint16) { } func selparkcommit(gp *g, _ unsafe.Pointer) bool { + // There are unlocked sudogs that point into gp's stack. Stack + // copying must lock the channels of those sudogs. + gp.activeStackChans = true // This must not access gp's stack (see gopark). In // particular, it must not access the *hselect. That's okay, // because by the time this is called, gp.waiting has all @@ -308,6 +311,7 @@ loop: // wait for someone to wake us up gp.param = nil gopark(selparkcommit, nil, waitReasonSelect, traceEvGoBlockSelect, 1) + gp.activeStackChans = false sellock(scases, lockorder) @@ -458,8 +462,6 @@ sclose: } func (c *hchan) sortkey() uintptr { - // TODO(khr): if we have a moving garbage collector, we'll need to - // change this function. return uintptr(unsafe.Pointer(c)) } diff --git a/libgo/go/runtime/sema.go b/libgo/go/runtime/sema.go index c002e29..fb16796 100644 --- a/libgo/go/runtime/sema.go +++ b/libgo/go/runtime/sema.go @@ -180,7 +180,7 @@ func semrelease1(addr *uint32, handoff bool, skipframes int) { atomic.Xadd(&root.nwait, -1) } unlock(&root.lock) - if s != nil { // May be slow, so unlock first + if s != nil { // May be slow or even yield, so unlock first acquiretime := s.acquiretime if acquiretime != 0 { mutexevent(t0-acquiretime, 3+skipframes) @@ -192,6 +192,25 @@ func semrelease1(addr *uint32, handoff bool, skipframes int) { s.ticket = 1 } readyWithTime(s, 5+skipframes) + if s.ticket == 1 && getg().m.locks == 0 { + // Direct G handoff + // readyWithTime has added the waiter G as runnext in the + // current P; we now call the scheduler so that we start running + // the waiter G immediately. + // Note that waiter inherits our time slice: this is desirable + // to avoid having a highly contended semaphore hog the P + // indefinitely. goyield is like Gosched, but it does not emit a + // GoSched trace event and, more importantly, puts the current G + // on the local runq instead of the global one. + // We only do this in the starving regime (handoff=true), as in + // the non-starving case it is possible for a different waiter + // to acquire the semaphore while we are yielding/scheduling, + // and this would be wasteful. We wait instead to enter starving + // regime, and then we start to do direct handoffs of ticket and + // P. + // See issue 33747 for discussion. + goyield() + } } } @@ -373,19 +392,11 @@ Found: func (root *semaRoot) rotateLeft(x *sudog) { // p -> (x a (y b c)) p := x.parent - a, y := x.prev, x.next - b, c := y.prev, y.next + y := x.next + b := y.prev y.prev = x x.parent = y - y.next = c - if c != nil { - c.parent = y - } - x.prev = a - if a != nil { - a.parent = x - } x.next = b if b != nil { b.parent = x @@ -409,23 +420,15 @@ func (root *semaRoot) rotateLeft(x *sudog) { func (root *semaRoot) rotateRight(y *sudog) { // p -> (y (x a b) c) p := y.parent - x, c := y.prev, y.next - a, b := x.prev, x.next + x := y.prev + b := x.next - x.prev = a - if a != nil { - a.parent = x - } x.next = y y.parent = x y.prev = b if b != nil { b.parent = y } - y.next = c - if c != nil { - c.parent = y - } x.parent = p if p == nil { diff --git a/libgo/go/runtime/sema_test.go b/libgo/go/runtime/sema_test.go new file mode 100644 index 0000000..8bd5d4c --- /dev/null +++ b/libgo/go/runtime/sema_test.go @@ -0,0 +1,97 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + . "runtime" + "sync/atomic" + "testing" +) + +// TestSemaHandoff checks that when semrelease+handoff is +// requested, the G that releases the semaphore yields its +// P directly to the first waiter in line. +// See issue 33747 for discussion. +func TestSemaHandoff(t *testing.T) { + const iter = 10000 + ok := 0 + for i := 0; i < iter; i++ { + if testSemaHandoff() { + ok++ + } + } + // As long as two thirds of handoffs are direct, we + // consider the test successful. The scheduler is + // nondeterministic, so this test checks that we get the + // desired outcome in a significant majority of cases. + // The actual ratio of direct handoffs is much higher + // (>90%) but we use a lower threshold to minimize the + // chances that unrelated changes in the runtime will + // cause the test to fail or become flaky. + if ok < iter*2/3 { + t.Fatal("direct handoff < 2/3:", ok, iter) + } +} + +func TestSemaHandoff1(t *testing.T) { + if GOMAXPROCS(-1) <= 1 { + t.Skip("GOMAXPROCS <= 1") + } + defer GOMAXPROCS(GOMAXPROCS(-1)) + GOMAXPROCS(1) + TestSemaHandoff(t) +} + +func TestSemaHandoff2(t *testing.T) { + if GOMAXPROCS(-1) <= 2 { + t.Skip("GOMAXPROCS <= 2") + } + defer GOMAXPROCS(GOMAXPROCS(-1)) + GOMAXPROCS(2) + TestSemaHandoff(t) +} + +func testSemaHandoff() bool { + var sema, res uint32 + done := make(chan struct{}) + + // We're testing that the current goroutine is able to yield its time slice + // to another goroutine. Stop the current goroutine from migrating to + // another CPU where it can win the race (and appear to have not yielded) by + // keeping the CPUs slightly busy. + for i := 0; i < GOMAXPROCS(-1); i++ { + go func() { + for { + select { + case <-done: + return + default: + } + Gosched() + } + }() + } + + go func() { + Semacquire(&sema) + atomic.CompareAndSwapUint32(&res, 0, 1) + + Semrelease1(&sema, true, 0) + close(done) + }() + for SemNwait(&sema) == 0 { + Gosched() // wait for goroutine to block in Semacquire + } + + // The crux of the test: we release the semaphore with handoff + // and immediately perform a CAS both here and in the waiter; we + // want the CAS in the waiter to execute first. + Semrelease1(&sema, true, 0) + atomic.CompareAndSwapUint32(&res, 0, 2) + + <-done // wait for goroutines to finish to avoid data races + + return res == 1 // did the waiter run first? +} diff --git a/libgo/go/runtime/semasleep_test.go b/libgo/go/runtime/semasleep_test.go index f5b4a50..9b371b0 100644 --- a/libgo/go/runtime/semasleep_test.go +++ b/libgo/go/runtime/semasleep_test.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//+build !nacl,!plan9,!windows,!js +// +build !plan9,!windows,!js package runtime_test diff --git a/libgo/go/runtime/signal_sighandler.go b/libgo/go/runtime/signal_sighandler.go deleted file mode 100644 index 3583c7b..0000000 --- a/libgo/go/runtime/signal_sighandler.go +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build aix darwin dragonfly freebsd hurd linux nacl netbsd openbsd solaris - -package runtime - -import ( - "unsafe" -) - -// crashing is the number of m's we have waited for when implementing -// GOTRACEBACK=crash when a signal is received. -var crashing int32 - -// testSigtrap is used by the runtime tests. If non-nil, it is called -// on SIGTRAP. 
If it returns true, the normal behavior on SIGTRAP is -// suppressed. -var testSigtrap func(info *_siginfo_t, ctxt *sigctxt, gp *g) bool - -// sighandler is invoked when a signal occurs. The global g will be -// set to a gsignal goroutine and we will be running on the alternate -// signal stack. The parameter g will be the value of the global g -// when the signal occurred. The sig, info, and ctxt parameters are -// from the system signal handler: they are the parameters passed when -// the SA is passed to the sigaction system call. -// -// The garbage collector may have stopped the world, so write barriers -// are not allowed. -// -//go:nowritebarrierrec -func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { - _g_ := getg() - c := &sigctxt{info, ctxt} - - sigfault, sigpc := getSiginfo(info, ctxt) - - if sig == _SIGURG && usestackmaps { - // We may be signaled to do a stack scan. - // The signal delivery races with enter/exitsyscall. - // We may be on g0 stack now. gp.m.curg is the g we - // want to scan. - // If we're not on g stack, give up. The sender will - // try again later. - // If we're not stopped at a safepoint (doscanstack will - // return false), also give up. - if s := readgstatus(gp.m.curg); s == _Gscansyscall { - if gp == gp.m.curg { - if doscanstack(gp, (*gcWork)(unsafe.Pointer(gp.scangcw))) { - gp.gcscanvalid = true - gp.gcscandone = true - } - } - gp.m.curg.scangcw = 0 - notewakeup(&gp.m.scannote) - return - } - } - - if sig == _SIGPROF { - sigprof(sigpc, gp, _g_.m) - return - } - - if sig == _SIGTRAP && testSigtrap != nil && testSigtrap(info, (*sigctxt)(noescape(unsafe.Pointer(c))), gp) { - return - } - - flags := int32(_SigThrow) - if sig < uint32(len(sigtable)) { - flags = sigtable[sig].flags - } - if flags&_SigPanic != 0 && gp.throwsplit { - // We can't safely sigpanic because it may grow the - // stack. Abort in the signal handler instead. - flags = (flags &^ _SigPanic) | _SigThrow - } - if isAbortPC(sigpc) { - // On many architectures, the abort function just - // causes a memory fault. Don't turn that into a panic. - flags = _SigThrow - } - if c.sigcode() != _SI_USER && flags&_SigPanic != 0 { - // Emulate gc by passing arguments out of band, - // although we don't really have to. - gp.sig = sig - gp.sigcode0 = uintptr(c.sigcode()) - gp.sigcode1 = sigfault - gp.sigpc = sigpc - - setg(gp) - - // All signals were blocked due to the sigaction mask; - // unblock them. 
- var set sigset - sigfillset(&set) - sigprocmask(_SIG_UNBLOCK, &set, nil) - - sigpanic() - throw("sigpanic returned") - } - - if c.sigcode() == _SI_USER || flags&_SigNotify != 0 { - if sigsend(sig) { - return - } - } - - if c.sigcode() == _SI_USER && signal_ignored(sig) { - return - } - - if flags&_SigKill != 0 { - dieFromSignal(sig) - } - - if flags&_SigThrow == 0 { - return - } - - _g_.m.throwing = 1 - _g_.m.caughtsig.set(gp) - - if crashing == 0 { - startpanic_m() - } - - if sig < uint32(len(sigtable)) { - print(sigtable[sig].name, "\n") - } else { - print("Signal ", sig, "\n") - } - - print("PC=", hex(sigpc), " m=", _g_.m.id, " sigcode=", c.sigcode(), "\n") - if _g_.m.lockedg != 0 && _g_.m.ncgo > 0 && gp == _g_.m.g0 { - print("signal arrived during cgo execution\n") - gp = _g_.m.lockedg.ptr() - } - print("\n") - - level, _, docrash := gotraceback() - if level > 0 { - goroutineheader(gp) - traceback(0) - if crashing == 0 { - tracebackothers(gp) - print("\n") - } - dumpregs(info, ctxt) - } - - if docrash { - crashing++ - if crashing < mcount()-int32(extraMCount) { - // There are other m's that need to dump their stacks. - // Relay SIGQUIT to the next m by sending it to the current process. - // All m's that have already received SIGQUIT have signal masks blocking - // receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet. - // When the last m receives the SIGQUIT, it will fall through to the call to - // crash below. Just in case the relaying gets botched, each m involved in - // the relay sleeps for 5 seconds and then does the crash/exit itself. - // In expected operation, the last m has received the SIGQUIT and run - // crash/exit and the process is gone, all long before any of the - // 5-second sleeps have finished. - print("\n-----\n\n") - raiseproc(_SIGQUIT) - usleep(5 * 1000 * 1000) - } - crash() - } - - printDebugLog() - - exit(2) -} diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go index 365f5dd..29f9443 100644 --- a/libgo/go/runtime/signal_unix.go +++ b/libgo/go/runtime/signal_unix.go @@ -42,6 +42,38 @@ const ( _SIG_IGN uintptr = 1 ) +// sigPreempt is the signal used for non-cooperative preemption. +// +// There's no good way to choose this signal, but there are some +// heuristics: +// +// 1. It should be a signal that's passed-through by debuggers by +// default. On Linux, this is SIGALRM, SIGURG, SIGCHLD, SIGIO, +// SIGVTALRM, SIGPROF, and SIGWINCH, plus some glibc-internal signals. +// +// 2. It shouldn't be used internally by libc in mixed Go/C binaries +// because libc may assume it's the only thing that can handle these +// signals. For example SIGCANCEL or SIGSETXID. +// +// 3. It should be a signal that can happen spuriously without +// consequences. For example, SIGALRM is a bad choice because the +// signal handler can't tell if it was caused by the real process +// alarm or not (arguably this means the signal is broken, but I +// digress). SIGUSR1 and SIGUSR2 are also bad because those are often +// used in meaningful ways by applications. +// +// 4. We need to deal with platforms without real-time signals (like +// macOS), so those are out. +// +// We use SIGURG because it meets all of these criteria, is extremely +// unlikely to be used by an application for its "real" meaning (both +// because out-of-band data is basically unused and because SIGURG +// doesn't report which socket has the condition, making it pretty +// useless), and even if it is, the application has to be ready for +// spurious SIGURG. 
SIGIO wouldn't be a bad choice either, but is more +// likely to be used for real. +const sigPreempt = _SIGURG + // Stores the signal handlers registered before Go installed its own. // These signal handlers will be invoked in cases where Go doesn't want to // handle a particular signal (e.g., signal occurred on a non-Go thread). @@ -251,10 +283,26 @@ func setProcessCPUProfiler(hz int32) { } } else { // If the Go signal handler should be disabled by default, - // disable it if it is enabled. + // switch back to the signal handler that was installed + // when we enabled profiling. We don't try to handle the case + // of a program that changes the SIGPROF handler while Go + // profiling is enabled. + // + // If no signal handler was installed before, then start + // ignoring SIGPROF signals. We do this, rather than change + // to SIG_DFL, because there may be a pending SIGPROF + // signal that has not yet been delivered to some other thread. + // If we change to SIG_DFL here, the program will crash + // when that SIGPROF is delivered. We assume that programs + // that use profiling don't want to crash on a stray SIGPROF. + // See issue 19320. if !sigInstallGoHandler(_SIGPROF) { if atomic.Cas(&handlingSig[_SIGPROF], 1, 0) { - setsig(_SIGPROF, atomic.Loaduintptr(&fwdSig[_SIGPROF])) + h := atomic.Loaduintptr(&fwdSig[_SIGPROF]) + if h == _SIG_DFL { + h = _SIG_IGN + } + setsig(_SIGPROF, h) } } } @@ -283,6 +331,51 @@ func sigpipe() { dieFromSignal(_SIGPIPE) } +// doSigPreempt handles a preemption signal on gp. +func doSigPreempt(gp *g, ctxt *sigctxt, sigpc uintptr) { + // Check if this G wants to be preempted and is safe to + // preempt. + if wantAsyncPreempt(gp) && isAsyncSafePoint(gp, sigpc) { + // Inject a call to asyncPreempt. + // ctxt.pushCall(funcPC(asyncPreempt)) + throw("pushCall not implemented") + } + + // Acknowledge the preemption. + atomic.Xadd(&gp.m.preemptGen, 1) +} + +// gccgo-specific definition. +const pushCallSupported = false + +const preemptMSupported = pushCallSupported + +// preemptM sends a preemption request to mp. This request may be +// handled asynchronously and may be coalesced with other requests to +// the M. When the request is received, if the running G or P are +// marked for preemption and the goroutine is at an asynchronous +// safe-point, it will preempt the goroutine. It always atomically +// increments mp.preemptGen after handling a preemption request. +func preemptM(mp *m) { + if !pushCallSupported { + // This architecture doesn't support ctxt.pushCall + // yet, so doSigPreempt won't work. + return + } + if GOOS == "darwin" && (GOARCH == "arm" || GOARCH == "arm64") && !iscgo { + // On darwin, we use libc calls, and cgo is required on ARM and ARM64 + // so we have TLS set up to save/restore G during C calls. If cgo is + // absent, we cannot save/restore G in TLS, and if a signal is + // received during C execution we cannot get the G. Therefore don't + // send signals. + // This can only happen in the go_bootstrap program (otherwise cgo is + // required). + return + } + // signalM(mp, sigPreempt) + throw("signalM not implemented") +} + // sigtrampgo is called from the signal handler function, sigtramp, // written in assembly code. // This is called by the signal handler, and the world may be stopped. @@ -315,6 +408,183 @@ func sigtrampgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) { setg(g) } +// crashing is the number of m's we have waited for when implementing +// GOTRACEBACK=crash when a signal is received. 
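Because the runtime now claims SIGURG for preemption, and as the comment above says the application has to be ready for spurious SIGURG, user code that subscribes to SIGURG should treat extra deliveries as normal. A small user-level illustration, assuming a Unix platform; this is not runtime code, and under gccgo the preemption signal may never actually be sent since pushCall is not implemented here:

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

func main() {
	c := make(chan os.Signal, 1)
	signal.Notify(c, syscall.SIGURG)

	// A goroutine that spins without function calls is the kind of code the
	// scheduler may need to interrupt with the preemption signal.
	go func() {
		for i := 0; ; i++ {
			_ = i * i
		}
	}()

	select {
	case s := <-c:
		// This may be a "real" SIGURG or one sent by the runtime for
		// preemption; a program must treat either as harmless.
		fmt.Println("got", s)
	case <-time.After(2 * time.Second):
		fmt.Println("no SIGURG observed")
	}
}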
+var crashing int32 + +// testSigtrap and testSigusr1 are used by the runtime tests. If +// non-nil, it is called on SIGTRAP/SIGUSR1. If it returns true, the +// normal behavior on this signal is suppressed. +var testSigtrap func(info *_siginfo_t, ctxt *sigctxt, gp *g) bool +var testSigusr1 func(gp *g) bool + +// sighandler is invoked when a signal occurs. The global g will be +// set to a gsignal goroutine and we will be running on the alternate +// signal stack. The parameter g will be the value of the global g +// when the signal occurred. The sig, info, and ctxt parameters are +// from the system signal handler: they are the parameters passed when +// the SA is passed to the sigaction system call. +// +// The garbage collector may have stopped the world, so write barriers +// are not allowed. +// +//go:nowritebarrierrec +func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { + _g_ := getg() + c := &sigctxt{info, ctxt} + + sigfault, sigpc := getSiginfo(info, ctxt) + + if sig == _SIGURG && usestackmaps { + // We may be signaled to do a stack scan. + // The signal delivery races with enter/exitsyscall. + // We may be on g0 stack now. gp.m.curg is the g we + // want to scan. + // If we're not on g stack, give up. The sender will + // try again later. + // If we're not stopped at a safepoint (doscanstack will + // return false), also give up. + if s := readgstatus(gp.m.curg); s == _Gscansyscall { + if gp == gp.m.curg { + if doscanstack(gp, (*gcWork)(unsafe.Pointer(gp.scangcw))) { + gp.gcScannedSyscallStack = true + } + } + gp.m.curg.scangcw = 0 + notewakeup(&gp.m.scannote) + return + } + } + + if sig == _SIGPROF { + sigprof(sigpc, gp, _g_.m) + return + } + + if sig == _SIGTRAP && testSigtrap != nil && testSigtrap(info, (*sigctxt)(noescape(unsafe.Pointer(c))), gp) { + return + } + + if sig == _SIGUSR1 && testSigusr1 != nil && testSigusr1(gp) { + return + } + + if sig == sigPreempt { + // Might be a preemption signal. + doSigPreempt(gp, c, sigpc) + // Even if this was definitely a preemption signal, it + // may have been coalesced with another signal, so we + // still let it through to the application. + } + + flags := int32(_SigThrow) + if sig < uint32(len(sigtable)) { + flags = sigtable[sig].flags + } + if flags&_SigPanic != 0 && gp.throwsplit { + // We can't safely sigpanic because it may grow the + // stack. Abort in the signal handler instead. + flags = (flags &^ _SigPanic) | _SigThrow + } + if isAbortPC(sigpc) { + // On many architectures, the abort function just + // causes a memory fault. Don't turn that into a panic. + flags = _SigThrow + } + if c.sigcode() != _SI_USER && flags&_SigPanic != 0 { + // Emulate gc by passing arguments out of band, + // although we don't really have to. + gp.sig = sig + gp.sigcode0 = uintptr(c.sigcode()) + gp.sigcode1 = sigfault + gp.sigpc = sigpc + + setg(gp) + + // All signals were blocked due to the sigaction mask; + // unblock them. 
+ var set sigset + sigfillset(&set) + sigprocmask(_SIG_UNBLOCK, &set, nil) + + sigpanic() + throw("sigpanic returned") + } + + if c.sigcode() == _SI_USER || flags&_SigNotify != 0 { + if sigsend(sig) { + return + } + } + + if c.sigcode() == _SI_USER && signal_ignored(sig) { + return + } + + if flags&_SigKill != 0 { + dieFromSignal(sig) + } + + if flags&_SigThrow == 0 { + return + } + + _g_.m.throwing = 1 + _g_.m.caughtsig.set(gp) + + if crashing == 0 { + startpanic_m() + } + + if sig < uint32(len(sigtable)) { + print(sigtable[sig].name, "\n") + } else { + print("Signal ", sig, "\n") + } + + print("PC=", hex(sigpc), " m=", _g_.m.id, " sigcode=", c.sigcode(), "\n") + if _g_.m.lockedg != 0 && _g_.m.ncgo > 0 && gp == _g_.m.g0 { + print("signal arrived during cgo execution\n") + gp = _g_.m.lockedg.ptr() + } + print("\n") + + level, _, docrash := gotraceback() + if level > 0 { + goroutineheader(gp) + traceback(0) + if crashing == 0 { + tracebackothers(gp) + print("\n") + } + dumpregs(info, ctxt) + } + + if docrash { + crashing++ + if crashing < mcount()-int32(extraMCount) { + // There are other m's that need to dump their stacks. + // Relay SIGQUIT to the next m by sending it to the current process. + // All m's that have already received SIGQUIT have signal masks blocking + // receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet. + // When the last m receives the SIGQUIT, it will fall through to the call to + // crash below. Just in case the relaying gets botched, each m involved in + // the relay sleeps for 5 seconds and then does the crash/exit itself. + // In expected operation, the last m has received the SIGQUIT and run + // crash/exit and the process is gone, all long before any of the + // 5-second sleeps have finished. + print("\n-----\n\n") + raiseproc(_SIGQUIT) + usleep(5 * 1000 * 1000) + } + crash() + } + + printDebugLog() + + exit(2) +} + // sigpanic turns a synchronous signal into a run-time panic. // If the signal handler sees a synchronous panic, it arranges the // stack to look like the function where the signal occurred called @@ -551,11 +821,22 @@ func signalDuringFork(sig uint32) { throw("signal received during fork") } +var badginsignalMsg = "fatal: bad g in signal handler\n" + // This runs on a foreign stack, without an m or a g. No stack split. //go:nosplit //go:norace //go:nowritebarrierrec func badsignal(sig uintptr, c *sigctxt) { + if !iscgo && !cgoHasExtraM { + // There is no extra M. needm will not be able to grab + // an M. Instead of hanging, just crash. + // Cannot call split-stack function as there is no G. + s := stringStructOf(&badginsignalMsg) + write(2, s.str, int32(s.len)) + exit(2) + *(*uintptr)(unsafe.Pointer(uintptr(123))) = 2 + } needm(0) if !sigsend(uint32(sig)) { // A foreign thread received the signal sig, and the @@ -596,6 +877,13 @@ func sigfwdgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) bool { return true } + // This function and its caller sigtrampgo assumes SIGPIPE is delivered on the + // originating thread. This property does not hold on macOS (golang.org/issue/33384), + // so we have no choice but to ignore SIGPIPE. + if GOOS == "darwin" && sig == _SIGPIPE { + return true + } + // If there is no handler to forward to, no need to forward. if fwdFn == _SIG_DFL { return false @@ -610,8 +898,9 @@ func sigfwdgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) bool { return false } // Determine if the signal occurred inside Go code. 
We test that: - // (1) we were in a goroutine (i.e., m.curg != nil), and - // (2) we weren't in CGO. + // (1) we weren't in VDSO page, + // (2) we were in a goroutine (i.e., m.curg != nil), and + // (3) we weren't in CGO. g := getg() if g != nil && g.m != nil && g.m.curg != nil && !g.m.incgo { return false @@ -687,13 +976,15 @@ func minitSignals() { // stack to the gsignal stack. If the alternate signal stack is set // for the thread (the case when a non-Go thread sets the alternate // signal stack and then calls a Go function) then set the gsignal -// stack to the alternate signal stack. Record which choice was made -// in newSigstack, so that it can be undone in unminit. +// stack to the alternate signal stack. We also set the alternate +// signal stack to the gsignal stack if cgo is not used (regardless +// of whether it is already set). Record which choice was made in +// newSigstack, so that it can be undone in unminit. func minitSignalStack() { _g_ := getg() var st _stack_t sigaltstack(nil, &st) - if st.ss_flags&_SS_DISABLE != 0 { + if st.ss_flags&_SS_DISABLE != 0 || !iscgo { signalstack(_g_.m.gsignalstack, _g_.m.gsignalstacksize) _g_.m.newSigstack = true } else { diff --git a/libgo/go/runtime/signal_windows_test.go b/libgo/go/runtime/signal_windows_test.go new file mode 100644 index 0000000..9748403 --- /dev/null +++ b/libgo/go/runtime/signal_windows_test.go @@ -0,0 +1,61 @@ +// +build windows + +package runtime_test + +import ( + "internal/testenv" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "testing" +) + +func TestVectoredHandlerDontCrashOnLibrary(t *testing.T) { + if *flagQuick { + t.Skip("-quick") + } + if runtime.GOARCH != "amd64" { + t.Skip("this test can only run on windows/amd64") + } + testenv.MustHaveGoBuild(t) + testenv.MustHaveExecPath(t, "gcc") + testprog.Lock() + defer testprog.Unlock() + dir, err := ioutil.TempDir("", "go-build") + if err != nil { + t.Fatalf("failed to create temp directory: %v", err) + } + defer os.RemoveAll(dir) + + // build go dll + dll := filepath.Join(dir, "testwinlib.dll") + cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "--buildmode", "c-shared", "testdata/testwinlib/main.go") + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() + if err != nil { + t.Fatalf("failed to build go library: %s\n%s", err, out) + } + + // build c program + exe := filepath.Join(dir, "test.exe") + cmd = exec.Command("gcc", "-L"+dir, "-I"+dir, "-ltestwinlib", "-o", exe, "testdata/testwinlib/main.c") + out, err = testenv.CleanCmdEnv(cmd).CombinedOutput() + if err != nil { + t.Fatalf("failed to build c exe: %s\n%s", err, out) + } + + // run test program + cmd = exec.Command(exe) + out, err = testenv.CleanCmdEnv(cmd).CombinedOutput() + if err != nil { + t.Fatalf("failure while running executable: %s\n%s", err, out) + } + expectedOutput := "exceptionCount: 1\ncontinueCount: 1\n" + // cleaning output + cleanedOut := strings.ReplaceAll(string(out), "\r\n", "\n") + if cleanedOut != expectedOutput { + t.Errorf("expected output %q, got %q", expectedOutput, cleanedOut) + } +} diff --git a/libgo/go/runtime/sizeof_test.go b/libgo/go/runtime/sizeof_test.go index ecda82a..d829c58 100644 --- a/libgo/go/runtime/sizeof_test.go +++ b/libgo/go/runtime/sizeof_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-// +build !nacl - package runtime_test import ( diff --git a/libgo/go/runtime/slice.go b/libgo/go/runtime/slice.go index 49d5a86..b61c2b1 100644 --- a/libgo/go/runtime/slice.go +++ b/libgo/go/runtime/slice.go @@ -26,7 +26,7 @@ type slice struct { cap int } -// An notInHeapSlice is a slice backed by go:notinheap memory. +// A notInHeapSlice is a slice backed by go:notinheap memory. type notInHeapSlice struct { array *notInHeap len int diff --git a/libgo/go/runtime/stack_test.go b/libgo/go/runtime/stack_test.go index 6ed65e8..169dde2 100644 --- a/libgo/go/runtime/stack_test.go +++ b/libgo/go/runtime/stack_test.go @@ -116,6 +116,13 @@ func TestStackGrowth(t *testing.T) { wg.Add(1) go func() { defer wg.Done() + + if Compiler == "gccgo" && !*Pusestackmaps { + // This test is flaky for gccgo's + // conservative stack scanning. + return + } + done := make(chan bool) var startTime time.Time var started, progress uint32 @@ -599,9 +606,6 @@ func (s structWithMethod) callers() []uintptr { return pc[:Callers(0, pc)] } -// The noinline prevents this function from being inlined -// into a wrapper. TODO: remove this when issue 28640 is fixed. -//go:noinline func (s structWithMethod) stack() string { buf := make([]byte, 4<<10) return string(buf[:Stack(buf, false)]) diff --git a/libgo/go/runtime/string.go b/libgo/go/runtime/string.go index 741b6b4..df4cae7 100644 --- a/libgo/go/runtime/string.go +++ b/libgo/go/runtime/string.go @@ -508,3 +508,37 @@ func __go_byte_array_to_string(p unsafe.Pointer, l int) string { func __go_string_to_byte_array(s string) []byte { return stringtoslicebyte(nil, s) } + +// parseRelease parses a dot-separated version number. It follows the +// semver syntax, but allows the minor and patch versions to be +// elided. +func parseRelease(rel string) (major, minor, patch int, ok bool) { + // Strip anything after a dash or plus. + for i := 0; i < len(rel); i++ { + if rel[i] == '-' || rel[i] == '+' { + rel = rel[:i] + break + } + } + + next := func() (int, bool) { + for i := 0; i < len(rel); i++ { + if rel[i] == '.' 
{ + ver, ok := atoi(rel[:i]) + rel = rel[i+1:] + return ver, ok + } + } + ver, ok := atoi(rel) + rel = "" + return ver, ok + } + if major, ok = next(); !ok || rel == "" { + return + } + if minor, ok = next(); !ok || rel == "" { + return + } + patch, ok = next() + return +} diff --git a/libgo/go/runtime/string_test.go b/libgo/go/runtime/string_test.go index ec83bb4..e388f70 100644 --- a/libgo/go/runtime/string_test.go +++ b/libgo/go/runtime/string_test.go @@ -462,3 +462,34 @@ func TestAtoi32(t *testing.T) { } } } + +type parseReleaseTest struct { + in string + major, minor, patch int +} + +var parseReleaseTests = []parseReleaseTest{ + {"", -1, -1, -1}, + {"x", -1, -1, -1}, + {"5", 5, 0, 0}, + {"5.12", 5, 12, 0}, + {"5.12-x", 5, 12, 0}, + {"5.12.1", 5, 12, 1}, + {"5.12.1-x", 5, 12, 1}, + {"5.12.1.0", 5, 12, 1}, + {"5.20496382327982653440", -1, -1, -1}, +} + +func TestParseRelease(t *testing.T) { + for _, test := range parseReleaseTests { + major, minor, patch, ok := runtime.ParseRelease(test.in) + if !ok { + major, minor, patch = -1, -1, -1 + } + if test.major != major || test.minor != minor || test.patch != patch { + t.Errorf("parseRelease(%q) = (%v, %v, %v) want (%v, %v, %v)", + test.in, major, minor, patch, + test.major, test.minor, test.patch) + } + } +} diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go index a2e1530..ae6134d 100644 --- a/libgo/go/runtime/stubs.go +++ b/libgo/go/runtime/stubs.go @@ -240,11 +240,16 @@ func asmcgocall(fn, arg unsafe.Pointer) int32 { return 0 } -// round n up to a multiple of a. a must be a power of 2. -func round(n, a uintptr) uintptr { +// alignUp rounds n up to a multiple of a. a must be a power of 2. +func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) } +// alignDown rounds n down to a multiple of a. a must be a power of 2. +func alignDown(n, a uintptr) uintptr { + return n &^ (a - 1) +} + // checkASM returns whether assembly runtime checks have passed. func checkASM() bool { return true diff --git a/libgo/go/runtime/stubs2.go b/libgo/go/runtime/stubs2.go index dcbbffa..454afee 100644 --- a/libgo/go/runtime/stubs2.go +++ b/libgo/go/runtime/stubs2.go @@ -4,23 +4,27 @@ // +build !plan9 // +build !windows -// +build !nacl // +build !js package runtime import "unsafe" +// read calls the read system call. +// It returns a non-negative number of bytes written or a negative errno value. //go:noescape func read(fd int32, p unsafe.Pointer, n int32) int32 + func closefd(fd int32) int32 //extern exit func exit(code int32) func usleep(usec uint32) +// write calls the write system call. +// It returns a non-negative number of bytes written or a negative errno value. //go:noescape -func write(fd uintptr, p unsafe.Pointer, n int32) int32 +func write1(fd uintptr, p unsafe.Pointer, n int32) int32 //go:noescape func open(name *byte, mode, perm int32) int32 diff --git a/libgo/go/runtime/stubs3.go b/libgo/go/runtime/stubs3.go index d339787..35feb10 100644 --- a/libgo/go/runtime/stubs3.go +++ b/libgo/go/runtime/stubs3.go @@ -4,4 +4,4 @@ package runtime -func nanotime() int64 +func nanotime1() int64 diff --git a/libgo/go/runtime/symtab.go b/libgo/go/runtime/symtab.go index a2ecf38..5dd3894 100644 --- a/libgo/go/runtime/symtab.go +++ b/libgo/go/runtime/symtab.go @@ -79,7 +79,7 @@ func (ci *Frames) Next() (frame Frame, more bool) { // Subtract 1 from PC to undo the 1 we added in callback in // go-callers.c. 
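The renamed rounding helpers above are the standard power-of-two mask trick; a quick standalone check of the arithmetic, copying the two one-liners from stubs.go:

package main

import "fmt"

// alignUp rounds n up to a multiple of a. a must be a power of 2.
func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

// alignDown rounds n down to a multiple of a. a must be a power of 2.
func alignDown(n, a uintptr) uintptr { return n &^ (a - 1) }

func main() {
	// a-1 is a mask of the low bits; &^ clears them, so the result is the
	// nearest multiple of a at or below the (possibly bumped) input.
	fmt.Println(alignUp(13, 8), alignDown(13, 8))     // 16 8
	fmt.Println(alignUp(16, 8), alignDown(16, 8))     // 16 16
	fmt.Println(alignUp(1, 4096), alignDown(1, 4096)) // 4096 0
}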
- function, file, line, _ := funcfileline(pc-1, int32(i)) + function, file, line, _ := funcfileline(pc-1, int32(i), more) if function == "" && file == "" { return Frame{}, more } @@ -127,7 +127,7 @@ type Func struct { // the a *Func describing the innermost function, but with an entry // of the outermost function. func FuncForPC(pc uintptr) *Func { - name, _, _, _ := funcfileline(pc, -1) + name, _, _, _ := funcfileline(pc, -1, false) if name == "" { return nil } @@ -156,7 +156,7 @@ func (f *Func) Entry() uintptr { // The result will not be accurate if pc is not a program // counter within f. func (f *Func) FileLine(pc uintptr) (file string, line int) { - _, file, line, _ = funcfileline(pc, -1) + _, file, line, _ = funcfileline(pc, -1, false) return file, line } @@ -230,5 +230,5 @@ func demangleSymbol(s string) string { } // implemented in go-caller.c -func funcfileline(uintptr, int32) (string, string, int, int) +func funcfileline(uintptr, int32, bool) (string, string, int, int) func funcentry(uintptr) uintptr diff --git a/libgo/go/runtime/testdata/testfaketime/faketime.go b/libgo/go/runtime/testdata/testfaketime/faketime.go new file mode 100644 index 0000000..1fb15eb --- /dev/null +++ b/libgo/go/runtime/testdata/testfaketime/faketime.go @@ -0,0 +1,28 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test faketime support. This is its own test program because we have +// to build it with custom build tags and hence want to minimize +// dependencies. + +package main + +import ( + "os" + "time" +) + +func main() { + println("line 1") + // Stream switch, increments time + os.Stdout.WriteString("line 2\n") + os.Stdout.WriteString("line 3\n") + // Stream switch, increments time + os.Stderr.WriteString("line 4\n") + // Time jump + time.Sleep(1 * time.Second) + os.Stdout.WriteString("line 5\n") + // Print the current time. + os.Stdout.WriteString(time.Now().UTC().Format(time.RFC3339)) +} diff --git a/libgo/go/runtime/testdata/testprog/deadlock.go b/libgo/go/runtime/testdata/testprog/deadlock.go index 5f0d120..105d6a5 100644 --- a/libgo/go/runtime/testdata/testprog/deadlock.go +++ b/libgo/go/runtime/testdata/testprog/deadlock.go @@ -22,6 +22,9 @@ func init() { register("StackOverflow", StackOverflow) register("ThreadExhaustion", ThreadExhaustion) register("RecursivePanic", RecursivePanic) + register("RecursivePanic2", RecursivePanic2) + register("RecursivePanic3", RecursivePanic3) + register("RecursivePanic4", RecursivePanic4) register("GoexitExit", GoexitExit) register("GoNil", GoNil) register("MainGoroutineID", MainGoroutineID) @@ -29,6 +32,8 @@ func init() { register("GoexitInPanic", GoexitInPanic) register("PanicAfterGoexit", PanicAfterGoexit) register("RecoveredPanicAfterGoexit", RecoveredPanicAfterGoexit) + register("RecoverBeforePanicAfterGoexit", RecoverBeforePanicAfterGoexit) + register("RecoverBeforePanicAfterGoexit2", RecoverBeforePanicAfterGoexit2) register("PanicTraceback", PanicTraceback) register("GoschedInPanic", GoschedInPanic) register("SyscallInPanic", SyscallInPanic) @@ -111,6 +116,50 @@ func RecursivePanic() { panic("again") } +// Same as RecursivePanic, but do the first recover and the second panic in +// separate defers, and make sure they are executed in the correct order. 
+func RecursivePanic2() { + func() { + defer func() { + fmt.Println(recover()) + }() + var x [8192]byte + func(x [8192]byte) { + defer func() { + panic("second panic") + }() + defer func() { + fmt.Println(recover()) + }() + panic("first panic") + }(x) + }() + panic("third panic") +} + +// Make sure that the first panic finished as a panic, even though the second +// panic was recovered +func RecursivePanic3() { + defer func() { + defer func() { + recover() + }() + panic("second panic") + }() + panic("first panic") +} + +// Test case where a single defer recovers one panic but starts another panic. If +// the second panic is never recovered, then the recovered first panic will still +// appear on the panic stack (labeled '[recovered]') and the runtime stack. +func RecursivePanic4() { + defer func() { + recover() + panic("second panic") + }() + panic("first panic") +} + func GoexitExit() { println("t1") go func() { @@ -202,6 +251,50 @@ func RecoveredPanicAfterGoexit() { runtime.Goexit() } +func RecoverBeforePanicAfterGoexit() { + // 1. defer a function that recovers + // 2. defer a function that panics + // 3. call goexit + // Goexit runs the #2 defer. Its panic + // is caught by the #1 defer. For Goexit, we explicitly + // resume execution in the Goexit loop, instead of resuming + // execution in the caller (which would make the Goexit disappear!) + defer func() { + r := recover() + if r == nil { + panic("bad recover") + } + }() + defer func() { + panic("hello") + }() + runtime.Goexit() +} + +func RecoverBeforePanicAfterGoexit2() { + for i := 0; i < 2; i++ { + defer func() { + }() + } + // 1. defer a function that recovers + // 2. defer a function that panics + // 3. call goexit + // Goexit runs the #2 defer. Its panic + // is caught by the #1 defer. For Goexit, we explicitly + // resume execution in the Goexit loop, instead of resuming + // execution in the caller (which would make the Goexit disappear!) + defer func() { + r := recover() + if r == nil { + panic("bad recover") + } + }() + defer func() { + panic("hello") + }() + runtime.Goexit() +} + func PanicTraceback() { pt1() } diff --git a/libgo/go/runtime/testdata/testprog/gc.go b/libgo/go/runtime/testdata/testprog/gc.go index 3fd1cd8..cc16413 100644 --- a/libgo/go/runtime/testdata/testprog/gc.go +++ b/libgo/go/runtime/testdata/testprog/gc.go @@ -27,7 +27,6 @@ func GCSys() { runtime.GC() runtime.ReadMemStats(memstats) sys := memstats.Sys - fmt.Printf("original sys: %#x\n", sys) runtime.MemProfileRate = 0 // disable profiler @@ -39,8 +38,6 @@ func GCSys() { // Should only be using a few MB. // We allocated 100 MB or (if not short) 1 GB. runtime.ReadMemStats(memstats) - fmt.Printf("final sys: %#x\n", memstats.Sys) - fmt.Printf("%#v\n", *memstats) if sys > memstats.Sys { sys = 0 } else { @@ -150,9 +147,20 @@ func GCPhys() { size = 4 << 20 split = 64 << 10 objects = 2 + + // The page cache could hide 64 8-KiB pages from the scavenger today. + maxPageCache = (8 << 10) * 64 ) // Set GOGC so that this test operates under consistent assumptions. debug.SetGCPercent(100) + // Reduce GOMAXPROCS down to 4 if it's greater. We need to bound the amount + // of memory held in the page cache because the scavenger can't reach it. + // The page cache will hold at most maxPageCache of memory per-P, so this + // bounds the amount of memory hidden from the scavenger to 4*maxPageCache. 
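For a sense of scale, the page-cache bound described above can be worked out from the constants in GCPhys, and the threshold computed a little further below then lands near the 11% (one P) and 30% (four Ps) figures mentioned there. A quick standalone calculation reusing those constant names:

package main

import "fmt"

func main() {
	const (
		maxPageCache = (8 << 10) * 64 // 64 pages of 8 KiB hidden per P, as above
		size         = 4 << 20
		split        = 64 << 10
		objects      = 2
	)
	fmt.Println(maxPageCache, 4*maxPageCache) // 524288 (512 KiB per P), 2097152 (2 MiB for 4 Ps)

	for _, procs := range []int{1, 4} {
		threshold := 0.05 + float64(procs)*maxPageCache/float64((size-split)*objects)
		fmt.Printf("GOMAXPROCS=%d: overuse threshold ~ %.0f%%\n", procs, threshold*100)
	}
}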
+ procs := runtime.GOMAXPROCS(-1) + if procs > 4 { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4)) + } // Save objects which we want to survive, and condemn objects which we don't. // Note that we condemn objects in this way and release them all at once in // order to avoid having the GC start freeing up these objects while the loop @@ -200,10 +208,22 @@ func GCPhys() { // Since the runtime should scavenge the entirety of the remaining holes, // theoretically there should be no more free and unscavenged memory. However due // to other allocations that happen during this test we may still see some physical - // memory over-use. 10% here is an arbitrary but very conservative threshold which - // should easily account for any other allocations this test may have done. + // memory over-use. overuse := (float64(heapBacked) - float64(stats.HeapAlloc)) / float64(stats.HeapAlloc) - if overuse <= 0.10 { + // Compute the threshold. + // + // In theory, this threshold should just be zero, but that's not possible in practice. + // Firstly, the runtime's page cache can hide up to maxPageCache of free memory from the + // scavenger per P. To account for this, we increase the threshold by the ratio between the + // total amount the runtime could hide from the scavenger to the amount of memory we expect + // to be able to scavenge here, which is (size-split)*objects. This computation is the crux + // GOMAXPROCS above; if GOMAXPROCS is too high the threshold just becomes 100%+ since the + // amount of memory being allocated is fixed. Then we add 5% to account for noise, such as + // other allocations this test may have performed that we don't explicitly account for The + // baseline threshold here is around 11% for GOMAXPROCS=1, capping out at around 30% for + // GOMAXPROCS=4. + threshold := 0.05 + float64(procs)*maxPageCache/float64((size-split)*objects) + if overuse <= threshold { fmt.Println("OK") return } @@ -213,8 +233,8 @@ func GCPhys() { // In the context of this test, this indicates a large amount of // fragmentation with physical pages that are otherwise unused but not // returned to the OS. - fmt.Printf("exceeded physical memory overuse threshold of 10%%: %3.2f%%\n"+ - "(alloc: %d, goal: %d, sys: %d, rel: %d, objs: %d)\n", overuse*100, + fmt.Printf("exceeded physical memory overuse threshold of %3.2f%%: %3.2f%%\n"+ + "(alloc: %d, goal: %d, sys: %d, rel: %d, objs: %d)\n", threshold*100, overuse*100, stats.HeapAlloc, stats.NextGC, stats.HeapSys, stats.HeapReleased, len(saved)) runtime.KeepAlive(saved) } diff --git a/libgo/go/runtime/testdata/testprog/preempt.go b/libgo/go/runtime/testdata/testprog/preempt.go new file mode 100644 index 0000000..1c74d0e --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/preempt.go @@ -0,0 +1,71 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "runtime" + "runtime/debug" + "sync/atomic" +) + +func init() { + register("AsyncPreempt", AsyncPreempt) +} + +func AsyncPreempt() { + // Run with just 1 GOMAXPROCS so the runtime is required to + // use scheduler preemption. + runtime.GOMAXPROCS(1) + // Disable GC so we have complete control of what we're testing. + debug.SetGCPercent(-1) + + // Start a goroutine with no sync safe-points. + var ready, ready2 uint32 + go func() { + for { + atomic.StoreUint32(&ready, 1) + dummy() + dummy() + } + }() + // Also start one with a frameless function. 
+ // This is an especially interesting case for + // LR machines. + go func() { + atomic.AddUint32(&ready2, 1) + frameless() + }() + // Also test empty infinite loop. + go func() { + atomic.AddUint32(&ready2, 1) + for { + } + }() + + // Wait for the goroutine to stop passing through sync + // safe-points. + for atomic.LoadUint32(&ready) == 0 || atomic.LoadUint32(&ready2) < 2 { + runtime.Gosched() + } + + // Run a GC, which will have to stop the goroutine for STW and + // for stack scanning. If this doesn't work, the test will + // deadlock and timeout. + runtime.GC() + + println("OK") +} + +//go:noinline +func frameless() { + for i := int64(0); i < 1<<62; i++ { + out += i * i * i * i * i * 12345 + } +} + +var out int64 + +//go:noinline +func dummy() {} diff --git a/libgo/go/runtime/testdata/testprog/signal.go b/libgo/go/runtime/testdata/testprog/signal.go index 2ccbada..417e105 100644 --- a/libgo/go/runtime/testdata/testprog/signal.go +++ b/libgo/go/runtime/testdata/testprog/signal.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !windows,!plan9,!nacl +// +build !windows,!plan9 package main diff --git a/libgo/go/runtime/testdata/testprog/vdso.go b/libgo/go/runtime/testdata/testprog/vdso.go new file mode 100644 index 0000000..ef92f48 --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/vdso.go @@ -0,0 +1,55 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Invoke signal hander in the VDSO context (see issue 32912). + +package main + +import ( + "fmt" + "io/ioutil" + "os" + "runtime/pprof" + "time" +) + +func init() { + register("SignalInVDSO", signalInVDSO) +} + +func signalInVDSO() { + f, err := ioutil.TempFile("", "timeprofnow") + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(2) + } + + if err := pprof.StartCPUProfile(f); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(2) + } + + t0 := time.Now() + t1 := t0 + // We should get a profiling signal 100 times a second, + // so running for 1 second should be sufficient. + for t1.Sub(t0) < time.Second { + t1 = time.Now() + } + + pprof.StopCPUProfile() + + name := f.Name() + if err := f.Close(); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(2) + } + + if err := os.Remove(name); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(2) + } + + fmt.Println("success") +} diff --git a/libgo/go/runtime/testdata/testprogcgo/bigstack_windows.c b/libgo/go/runtime/testdata/testprogcgo/bigstack_windows.c new file mode 100644 index 0000000..cd85ac8 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/bigstack_windows.c @@ -0,0 +1,46 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This test source is used by both TestBigStackCallbackCgo (linked +// directly into the Go binary) and TestBigStackCallbackSyscall +// (compiled into a DLL). + +#include <windows.h> +#include <stdio.h> + +#ifndef STACK_SIZE_PARAM_IS_A_RESERVATION +#define STACK_SIZE_PARAM_IS_A_RESERVATION 0x00010000 +#endif + +typedef void callback(char*); + +// Allocate a stack that's much larger than the default. +static const int STACK_SIZE = 16<<20; + +static callback *bigStackCallback; + +static void useStack(int bytes) { + // Windows doesn't like huge frames, so we grow the stack 64k at a time. 
+ char x[64<<10]; + if (bytes < sizeof x) { + bigStackCallback(x); + } else { + useStack(bytes - sizeof x); + } +} + +static DWORD WINAPI threadEntry(LPVOID lpParam) { + useStack(STACK_SIZE - (128<<10)); + return 0; +} + +void bigStack(callback *cb) { + bigStackCallback = cb; + HANDLE hThread = CreateThread(NULL, STACK_SIZE, threadEntry, NULL, STACK_SIZE_PARAM_IS_A_RESERVATION, NULL); + if (hThread == NULL) { + fprintf(stderr, "CreateThread failed\n"); + exit(1); + } + WaitForSingleObject(hThread, INFINITE); +} diff --git a/libgo/go/runtime/testdata/testprogcgo/numgoroutine.go b/libgo/go/runtime/testdata/testprogcgo/numgoroutine.go index 68f1738..b7134a4 100644 --- a/libgo/go/runtime/testdata/testprogcgo/numgoroutine.go +++ b/libgo/go/runtime/testdata/testprogcgo/numgoroutine.go @@ -41,13 +41,6 @@ func NumGoroutine() { // Test that there are just the expected number of goroutines // running. Specifically, test that the spare M's goroutine // doesn't show up. - // - // On non-Windows platforms there's a signal handling thread - // started by os/signal.init in addition to the main - // goroutine. - if runtime.GOOS != "windows" { - baseGoroutines = 1 - } if _, ok := checkNumGoroutine("first", 1+baseGoroutines); !ok { return } diff --git a/libgo/go/runtime/testdata/testprognet/signal.go b/libgo/go/runtime/testdata/testprognet/signal.go index a1559fe..4d2de79 100644 --- a/libgo/go/runtime/testdata/testprognet/signal.go +++ b/libgo/go/runtime/testdata/testprognet/signal.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !windows,!plan9,!nacl +// +build !windows,!plan9 // This is in testprognet instead of testprog because testprog // must not import anything (like net, but also like os/signal) diff --git a/libgo/go/runtime/testdata/testwinlib/main.c b/libgo/go/runtime/testdata/testwinlib/main.c new file mode 100644 index 0000000..e84a32f --- /dev/null +++ b/libgo/go/runtime/testdata/testwinlib/main.c @@ -0,0 +1,57 @@ +#include <stdio.h> +#include <windows.h> +#include "testwinlib.h" + +int exceptionCount; +int continueCount; +LONG WINAPI customExceptionHandlder(struct _EXCEPTION_POINTERS *ExceptionInfo) +{ + if (ExceptionInfo->ExceptionRecord->ExceptionCode == EXCEPTION_BREAKPOINT) + { + exceptionCount++; + // prepare context to resume execution + CONTEXT *c = ExceptionInfo->ContextRecord; + c->Rip = *(ULONG_PTR *)c->Rsp; + c->Rsp += 8; + return EXCEPTION_CONTINUE_EXECUTION; + } + return EXCEPTION_CONTINUE_SEARCH; +} +LONG WINAPI customContinueHandlder(struct _EXCEPTION_POINTERS *ExceptionInfo) +{ + if (ExceptionInfo->ExceptionRecord->ExceptionCode == EXCEPTION_BREAKPOINT) + { + continueCount++; + return EXCEPTION_CONTINUE_EXECUTION; + } + return EXCEPTION_CONTINUE_SEARCH; +} + +void throwFromC() +{ + DebugBreak(); +} +int main() +{ + // simulate a "lazily" attached debugger, by calling some go code before attaching the exception/continue handler + Dummy(); + exceptionCount = 0; + continueCount = 0; + void *exceptionHandlerHandle = AddVectoredExceptionHandler(0, customExceptionHandlder); + if (NULL == exceptionHandlerHandle) + { + printf("cannot add vectored exception handler\n"); + return 2; + } + void *continueHandlerHandle = AddVectoredContinueHandler(0, customContinueHandlder); + if (NULL == continueHandlerHandle) + { + printf("cannot add vectored continue handler\n"); + return 2; + } + CallMeBack(throwFromC); + RemoveVectoredContinueHandler(continueHandlerHandle); + 
RemoveVectoredExceptionHandler(exceptionHandlerHandle); + printf("exceptionCount: %d\ncontinueCount: %d\n", exceptionCount, continueCount); + return 0; +}
\ No newline at end of file diff --git a/libgo/go/runtime/testdata/testwinlib/main.go b/libgo/go/runtime/testdata/testwinlib/main.go new file mode 100644 index 0000000..400eaa1 --- /dev/null +++ b/libgo/go/runtime/testdata/testwinlib/main.go @@ -0,0 +1,28 @@ +// +build windows,cgo + +package main + +// #include <windows.h> +// typedef void(*callmeBackFunc)(); +// static void bridgeCallback(callmeBackFunc callback) { +// callback(); +//} +import "C" + +// CallMeBack call backs C code. +//export CallMeBack +func CallMeBack(callback C.callmeBackFunc) { + C.bridgeCallback(callback) +} + +// Dummy is called by the C code before registering the exception/continue handlers simulating a debugger. +// This makes sure that the Go runtime's lastcontinuehandler is reached before the C continue handler and thus, +// validate that it does not crash the program before another handler could take an action. +// The idea here is to reproduce what happens when you attach a debugger to a running program. +// It also simulate the behavior of the .Net debugger, which register its exception/continue handlers lazily. +//export Dummy +func Dummy() int { + return 42 +} + +func main() {} diff --git a/libgo/go/runtime/time.go b/libgo/go/runtime/time.go index 71e7556..ded68ed 100644 --- a/libgo/go/runtime/time.go +++ b/libgo/go/runtime/time.go @@ -7,17 +7,17 @@ package runtime import ( - "internal/cpu" + "runtime/internal/atomic" "unsafe" ) // Package time knows the layout of this structure. // If this struct changes, adjust ../time/sleep.go:/runtimeTimer. -// For GOOS=nacl, package syscall knows the layout of this structure. -// If this struct changes, adjust ../syscall/net_nacl.go:/runtimeTimer. type timer struct { - tb *timersBucket // the bucket the timer lives in - i int // heap index + // If this timer is on a heap, which P's heap it is on. + // puintptr rather than *p to match uintptr in the versions + // of this struct defined in other packages. + pp puintptr // Timer wakes up at when, and then at when+period, ... (period > 0 only) // each time calling f(arg, now) in the timer goroutine, so f must be @@ -27,52 +27,146 @@ type timer struct { f func(interface{}, uintptr) arg interface{} seq uintptr + + // What to set the when field to in timerModifiedXX status. + nextwhen int64 + + // The status field holds one of the values below. + status uint32 } -// timersLen is the length of timers array. +// Code outside this file has to be careful in using a timer value. // -// Ideally, this would be set to GOMAXPROCS, but that would require -// dynamic reallocation +// The pp, status, and nextwhen fields may only be used by code in this file. // -// The current value is a compromise between memory usage and performance -// that should cover the majority of GOMAXPROCS values used in the wild. -const timersLen = 64 - -// timers contains "per-P" timer heaps. +// Code that creates a new timer value can set the when, period, f, +// arg, and seq fields. +// A new timer value may be passed to addtimer (called by time.startTimer). +// After doing that no fields may be touched. // -// Timers are queued into timersBucket associated with the current P, -// so each P may work with its own timers independently of other P instances. +// An active timer (one that has been passed to addtimer) may be +// passed to deltimer (time.stopTimer), after which it is no longer an +// active timer. It is an inactive timer. +// In an inactive timer the period, f, arg, and seq fields may be modified, +// but not the when field. 
+// It's OK to just drop an inactive timer and let the GC collect it. +// It's not OK to pass an inactive timer to addtimer. +// Only newly allocated timer values may be passed to addtimer. // -// Each timersBucket may be associated with multiple P -// if GOMAXPROCS > timersLen. -var timers [timersLen]struct { - timersBucket - - // The padding should eliminate false sharing - // between timersBucket values. - pad [cpu.CacheLinePadSize - unsafe.Sizeof(timersBucket{})%cpu.CacheLinePadSize]byte -} +// An active timer may be passed to modtimer. No fields may be touched. +// It remains an active timer. +// +// An inactive timer may be passed to resettimer to turn into an +// active timer with an updated when field. +// It's OK to pass a newly allocated timer value to resettimer. +// +// Timer operations are addtimer, deltimer, modtimer, resettimer, +// cleantimers, adjusttimers, and runtimer. +// +// We don't permit calling addtimer/deltimer/modtimer/resettimer simultaneously, +// but adjusttimers and runtimer can be called at the same time as any of those. +// +// Active timers live in heaps attached to P, in the timers field. +// Inactive timers live there too temporarily, until they are removed. +// +// addtimer: +// timerNoStatus -> timerWaiting +// anything else -> panic: invalid value +// deltimer: +// timerWaiting -> timerDeleted +// timerModifiedXX -> timerDeleted +// timerNoStatus -> do nothing +// timerDeleted -> do nothing +// timerRemoving -> do nothing +// timerRemoved -> do nothing +// timerRunning -> wait until status changes +// timerMoving -> wait until status changes +// timerModifying -> panic: concurrent deltimer/modtimer calls +// modtimer: +// timerWaiting -> timerModifying -> timerModifiedXX +// timerModifiedXX -> timerModifying -> timerModifiedYY +// timerNoStatus -> timerWaiting +// timerRemoved -> timerWaiting +// timerRunning -> wait until status changes +// timerMoving -> wait until status changes +// timerRemoving -> wait until status changes +// timerDeleted -> panic: concurrent modtimer/deltimer calls +// timerModifying -> panic: concurrent modtimer calls +// resettimer: +// timerNoStatus -> timerWaiting +// timerRemoved -> timerWaiting +// timerDeleted -> timerModifying -> timerModifiedXX +// timerRemoving -> wait until status changes +// timerRunning -> wait until status changes +// timerWaiting -> panic: resettimer called on active timer +// timerMoving -> panic: resettimer called on active timer +// timerModifiedXX -> panic: resettimer called on active timer +// timerModifying -> panic: resettimer called on active timer +// cleantimers (looks in P's timer heap): +// timerDeleted -> timerRemoving -> timerRemoved +// timerModifiedXX -> timerMoving -> timerWaiting +// adjusttimers (looks in P's timer heap): +// timerDeleted -> timerRemoving -> timerRemoved +// timerModifiedXX -> timerMoving -> timerWaiting +// runtimer (looks in P's timer heap): +// timerNoStatus -> panic: uninitialized timer +// timerWaiting -> timerWaiting or +// timerWaiting -> timerRunning -> timerNoStatus or +// timerWaiting -> timerRunning -> timerWaiting +// timerModifying -> wait until status changes +// timerModifiedXX -> timerMoving -> timerWaiting +// timerDeleted -> timerRemoving -> timerRemoved +// timerRunning -> panic: concurrent runtimer calls +// timerRemoved -> panic: inconsistent timer heap +// timerRemoving -> panic: inconsistent timer heap +// timerMoving -> panic: inconsistent timer heap -func (t *timer) assignBucket() *timersBucket { - id := uint8(getg().m.p.ptr().id) % 
timersLen - t.tb = &timers[id].timersBucket - return t.tb -} +// Values for the timer status field. +const ( + // Timer has no status set yet. + timerNoStatus = iota -//go:notinheap -type timersBucket struct { - lock mutex - gp *g - created bool - sleeping bool - rescheduling bool - sleepUntil int64 - waitnote note - t []*timer -} + // Waiting for timer to fire. + // The timer is in some P's heap. + timerWaiting + + // Running the timer function. + // A timer will only have this status briefly. + timerRunning + + // The timer is deleted and should be removed. + // It should not be run, but it is still in some P's heap. + timerDeleted -// nacl fake time support - time in nanoseconds since 1970 -var faketime int64 + // The timer is being removed. + // The timer will only have this status briefly. + timerRemoving + + // The timer has been stopped. + // It is not in any P's heap. + timerRemoved + + // The timer is being modified. + // The timer will only have this status briefly. + timerModifying + + // The timer has been modified to an earlier time. + // The new when value is in the nextwhen field. + // The timer is in some P's heap, possibly in the wrong place. + timerModifiedEarlier + + // The timer has been modified to the same or a later time. + // The new when value is in the nextwhen field. + // The timer is in some P's heap, possibly in the wrong place. + timerModifiedLater + + // The timer has been modified and is being moved. + // The timer will only have this status briefly. + timerMoving +) + +// maxWhen is the maximum value for timer's when field. +const maxWhen = 1<<63 - 1 // Package time APIs. // Godoc uses the comments in package time, not these. @@ -92,17 +186,20 @@ func timeSleep(ns int64) { t = new(timer) gp.timer = t } - *t = timer{} - t.when = nanotime() + ns t.f = goroutineReady t.arg = gp - tb := t.assignBucket() - lock(&tb.lock) - if !tb.addtimerLocked(t) { - unlock(&tb.lock) - badTimer() - } - goparkunlock(&tb.lock, waitReasonSleep, traceEvGoSleep, 2) + t.nextwhen = nanotime() + ns + gopark(resetForSleep, unsafe.Pointer(t), waitReasonSleep, traceEvGoSleep, 1) +} + +// resetForSleep is called after the goroutine is parked for timeSleep. +// We can't call resettimer in timeSleep itself because if this is a short +// sleep and there are many goroutines then the P can wind up running the +// timer function, goroutineReady, before the goroutine has been parked. +func resetForSleep(gp *g, ut unsafe.Pointer) bool { + t := (*timer)(ut) + resettimer(t, t.nextwhen) + return true } // startTimer adds t to the timer heap. @@ -114,13 +211,22 @@ func startTimer(t *timer) { addtimer(t) } -// stopTimer removes t from the timer heap if it is there. -// It returns true if t was removed, false if t wasn't even there. +// stopTimer stops a timer. +// It reports whether t was stopped before being run. //go:linkname stopTimer time.stopTimer func stopTimer(t *timer) bool { return deltimer(t) } +// resetTimer resets an inactive timer, adding it to the heap. +//go:linkname resetTimer time.resetTimer +func resetTimer(t *timer, when int64) { + if raceenabled { + racerelease(unsafe.Pointer(t)) + } + resettimer(t, when) +} + // Go runtime. // Ready the goroutine arg. @@ -128,252 +234,714 @@ func goroutineReady(arg interface{}, seq uintptr) { goready(arg.(*g), 0) } +// addtimer adds a timer to the current P. +// This should only be called with a newly created timer. +// That avoids the risk of changing the when field of a timer in some P's heap, +// which could cause the heap to become unsorted. 
func addtimer(t *timer) { - tb := t.assignBucket() - lock(&tb.lock) - ok := tb.addtimerLocked(t) - unlock(&tb.lock) - if !ok { - badTimer() - } -} - -// Add a timer to the heap and start or kick timerproc if the new timer is -// earlier than any of the others. -// Timers are locked. -// Returns whether all is well: false if the data structure is corrupt -// due to user-level races. -func (tb *timersBucket) addtimerLocked(t *timer) bool { - // when must never be negative; otherwise timerproc will overflow + // when must never be negative; otherwise runtimer will overflow // during its delta calculation and never expire other runtime timers. if t.when < 0 { - t.when = 1<<63 - 1 + t.when = maxWhen } - t.i = len(tb.t) - tb.t = append(tb.t, t) - if !siftupTimer(tb.t, t.i) { - return false + if t.status != timerNoStatus { + badTimer() } - if t.i == 0 { - // siftup moved to top: new earliest deadline. - if tb.sleeping && tb.sleepUntil > t.when { - tb.sleeping = false - notewakeup(&tb.waitnote) - } - if tb.rescheduling { - tb.rescheduling = false - goready(tb.gp, 0) - } - if !tb.created { - tb.created = true - expectSystemGoroutine() - go timerproc(tb) - } + t.status = timerWaiting + + addInitializedTimer(t) +} + +// addInitializedTimer adds an initialized timer to the current P. +func addInitializedTimer(t *timer) { + when := t.when + + pp := getg().m.p.ptr() + lock(&pp.timersLock) + ok := cleantimers(pp) && doaddtimer(pp, t) + unlock(&pp.timersLock) + if !ok { + badTimer() } - return true + + wakeNetPoller(when) } -// Delete timer t from the heap. -// Do not need to update the timerproc: if it wakes up early, no big deal. -func deltimer(t *timer) bool { - if t.tb == nil { - // t.tb can be nil if the user created a timer - // directly, without invoking startTimer e.g - // time.Ticker{C: c} - // In this case, return early without any deletion. - // See Issue 21874. - return false +// doaddtimer adds t to the current P's heap. +// It reports whether it saw no problems due to races. +// The caller must have locked the timers for pp. +func doaddtimer(pp *p, t *timer) bool { + // Timers rely on the network poller, so make sure the poller + // has started. + if netpollInited == 0 { + netpollGenericInit() } - tb := t.tb + if t.pp != 0 { + throw("doaddtimer: P already set in timer") + } + t.pp.set(pp) + i := len(pp.timers) + pp.timers = append(pp.timers, t) + return siftupTimer(pp.timers, i) +} - lock(&tb.lock) - removed, ok := tb.deltimerLocked(t) - unlock(&tb.lock) - if !ok { - badTimer() +// deltimer deletes the timer t. It may be on some other P, so we can't +// actually remove it from the timers heap. We can only mark it as deleted. +// It will be removed in due course by the P whose heap it is on. +// Reports whether the timer was removed before it was run. +func deltimer(t *timer) bool { + for { + switch s := atomic.Load(&t.status); s { + case timerWaiting, timerModifiedLater: + if atomic.Cas(&t.status, s, timerDeleted) { + // Timer was not yet run. + return true + } + case timerModifiedEarlier: + tpp := t.pp.ptr() + if atomic.Cas(&t.status, s, timerModifying) { + atomic.Xadd(&tpp.adjustTimers, -1) + if !atomic.Cas(&t.status, timerModifying, timerDeleted) { + badTimer() + } + // Timer was not yet run. + return true + } + case timerDeleted, timerRemoving, timerRemoved: + // Timer was already run. + return false + case timerRunning, timerMoving: + // The timer is being run or moved, by a different P. + // Wait for it to complete. 
+ osyield() + case timerNoStatus: + // Removing timer that was never added or + // has already been run. Also see issue 21874. + return false + case timerModifying: + // Simultaneous calls to deltimer and modtimer. + badTimer() + default: + badTimer() + } } - return removed } -func (tb *timersBucket) deltimerLocked(t *timer) (removed, ok bool) { - // t may not be registered anymore and may have - // a bogus i (typically 0, if generated by Go). - // Verify it before proceeding. - i := t.i - last := len(tb.t) - 1 - if i < 0 || i > last || tb.t[i] != t { - return false, true +// dodeltimer removes timer i from the current P's heap. +// We are locked on the P when this is called. +// It reports whether it saw no problems due to races. +// The caller must have locked the timers for pp. +func dodeltimer(pp *p, i int) bool { + if t := pp.timers[i]; t.pp.ptr() != pp { + throw("dodeltimer: wrong P") + } else { + t.pp = 0 } + last := len(pp.timers) - 1 if i != last { - tb.t[i] = tb.t[last] - tb.t[i].i = i + pp.timers[i] = pp.timers[last] } - tb.t[last] = nil - tb.t = tb.t[:last] - ok = true + pp.timers[last] = nil + pp.timers = pp.timers[:last] + ok := true if i != last { - if !siftupTimer(tb.t, i) { + // Moving to i may have moved the last timer to a new parent, + // so sift up to preserve the heap guarantee. + if !siftupTimer(pp.timers, i) { ok = false } - if !siftdownTimer(tb.t, i) { + if !siftdownTimer(pp.timers, i) { ok = false } } - return true, ok + return ok } +// dodeltimer0 removes timer 0 from the current P's heap. +// We are locked on the P when this is called. +// It reports whether it saw no problems due to races. +// The caller must have locked the timers for pp. +func dodeltimer0(pp *p) bool { + if t := pp.timers[0]; t.pp.ptr() != pp { + throw("dodeltimer0: wrong P") + } else { + t.pp = 0 + } + last := len(pp.timers) - 1 + if last > 0 { + pp.timers[0] = pp.timers[last] + } + pp.timers[last] = nil + pp.timers = pp.timers[:last] + ok := true + if last > 0 { + ok = siftdownTimer(pp.timers, 0) + } + return ok +} + +// modtimer modifies an existing timer. +// This is called by the netpoll code. func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) { - tb := t.tb + if when < 0 { + when = maxWhen + } - lock(&tb.lock) - _, ok := tb.deltimerLocked(t) - if ok { - t.when = when - t.period = period - t.f = f - t.arg = arg - t.seq = seq - ok = tb.addtimerLocked(t) + status := uint32(timerNoStatus) + wasRemoved := false +loop: + for { + switch status = atomic.Load(&t.status); status { + case timerWaiting, timerModifiedEarlier, timerModifiedLater: + if atomic.Cas(&t.status, status, timerModifying) { + break loop + } + case timerNoStatus, timerRemoved: + // Timer was already run and t is no longer in a heap. + // Act like addtimer. + if atomic.Cas(&t.status, status, timerWaiting) { + wasRemoved = true + break loop + } + case timerRunning, timerRemoving, timerMoving: + // The timer is being run or moved, by a different P. + // Wait for it to complete. + osyield() + case timerDeleted: + // Simultaneous calls to modtimer and deltimer. + badTimer() + case timerModifying: + // Multiple simultaneous calls to modtimer. + badTimer() + default: + badTimer() + } } - unlock(&tb.lock) - if !ok { - badTimer() + + t.period = period + t.f = f + t.arg = arg + t.seq = seq + + if wasRemoved { + t.when = when + addInitializedTimer(t) + } else { + // The timer is in some other P's heap, so we can't change + // the when field. 
If we did, the other P's heap would + // be out of order. So we put the new when value in the + // nextwhen field, and let the other P set the when field + // when it is prepared to resort the heap. + t.nextwhen = when + + newStatus := uint32(timerModifiedLater) + if when < t.when { + newStatus = timerModifiedEarlier + } + + // Update the adjustTimers field. Subtract one if we + // are removing a timerModifiedEarlier, add one if we + // are adding a timerModifiedEarlier. + tpp := t.pp.ptr() + adjust := int32(0) + if status == timerModifiedEarlier { + adjust-- + } + if newStatus == timerModifiedEarlier { + adjust++ + } + if adjust != 0 { + atomic.Xadd(&tpp.adjustTimers, adjust) + } + + // Set the new status of the timer. + if !atomic.Cas(&t.status, timerModifying, newStatus) { + badTimer() + } + + // If the new status is earlier, wake up the poller. + if newStatus == timerModifiedEarlier { + wakeNetPoller(when) + } } } -// Timerproc runs the time-driven events. -// It sleeps until the next event in the tb heap. -// If addtimer inserts a new earlier event, it wakes timerproc early. -func timerproc(tb *timersBucket) { - setSystemGoroutine() +// resettimer resets an existing inactive timer to turn it into an active timer, +// with a new time for when the timer should fire. +// This should be called instead of addtimer if the timer value has been, +// or may have been, used previously. +func resettimer(t *timer, when int64) { + if when < 0 { + when = maxWhen + } - tb.gp = getg() for { - lock(&tb.lock) - tb.sleeping = false - now := nanotime() - delta := int64(-1) - for { - if len(tb.t) == 0 { - delta = -1 - break + switch s := atomic.Load(&t.status); s { + case timerNoStatus, timerRemoved: + if atomic.Cas(&t.status, s, timerWaiting) { + t.when = when + addInitializedTimer(t) + return } - t := tb.t[0] - delta = t.when - now - if delta > 0 { - break - } - ok := true - if t.period > 0 { - // leave in heap but adjust next time to fire - t.when += t.period * (1 + -delta/t.period) - if !siftdownTimer(tb.t, 0) { - ok = false + case timerDeleted: + if atomic.Cas(&t.status, s, timerModifying) { + t.nextwhen = when + newStatus := uint32(timerModifiedLater) + if when < t.when { + newStatus = timerModifiedEarlier + atomic.Xadd(&t.pp.ptr().adjustTimers, 1) } - } else { - // remove from heap - last := len(tb.t) - 1 - if last > 0 { - tb.t[0] = tb.t[last] - tb.t[0].i = 0 + if !atomic.Cas(&t.status, timerModifying, newStatus) { + badTimer() } - tb.t[last] = nil - tb.t = tb.t[:last] - if last > 0 { - if !siftdownTimer(tb.t, 0) { - ok = false - } + if newStatus == timerModifiedEarlier { + wakeNetPoller(when) } - t.i = -1 // mark as removed + return + } + case timerRemoving: + // Wait for the removal to complete. + osyield() + case timerRunning: + // Even though the timer should not be active, + // we can see timerRunning if the timer function + // permits some other goroutine to call resettimer. + // Wait until the run is complete. + osyield() + case timerWaiting, timerModifying, timerModifiedEarlier, timerModifiedLater, timerMoving: + // Called resettimer on active timer. + badTimer() + default: + badTimer() + } + } +} + +// cleantimers cleans up the head of the timer queue. This speeds up +// programs that create and delete timers; leaving them in the heap +// slows down addtimer. Reports whether no timer problems were found. +// The caller must have locked the timers for pp. 
+func cleantimers(pp *p) bool { + for { + if len(pp.timers) == 0 { + return true + } + t := pp.timers[0] + if t.pp.ptr() != pp { + throw("cleantimers: bad p") + } + switch s := atomic.Load(&t.status); s { + case timerDeleted: + if !atomic.Cas(&t.status, s, timerRemoving) { + continue + } + if !dodeltimer0(pp) { + return false } - f := t.f - arg := t.arg - seq := t.seq - unlock(&tb.lock) - if !ok { + if !atomic.Cas(&t.status, timerRemoving, timerRemoved) { + return false + } + case timerModifiedEarlier, timerModifiedLater: + if !atomic.Cas(&t.status, s, timerMoving) { + continue + } + // Now we can change the when field. + t.when = t.nextwhen + // Move t to the right position. + if !dodeltimer0(pp) { + return false + } + if !doaddtimer(pp, t) { + return false + } + if s == timerModifiedEarlier { + atomic.Xadd(&pp.adjustTimers, -1) + } + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + return false + } + default: + // Head of timers does not need adjustment. + return true + } + } +} + +// moveTimers moves a slice of timers to pp. The slice has been taken +// from a different P. +// This is currently called when the world is stopped, but the caller +// is expected to have locked the timers for pp. +func moveTimers(pp *p, timers []*timer) { + for _, t := range timers { + loop: + for { + switch s := atomic.Load(&t.status); s { + case timerWaiting: + t.pp = 0 + if !doaddtimer(pp, t) { + badTimer() + } + break loop + case timerModifiedEarlier, timerModifiedLater: + if !atomic.Cas(&t.status, s, timerMoving) { + continue + } + t.when = t.nextwhen + t.pp = 0 + if !doaddtimer(pp, t) { + badTimer() + } + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + badTimer() + } + break loop + case timerDeleted: + if !atomic.Cas(&t.status, s, timerRemoved) { + continue + } + t.pp = 0 + // We no longer need this timer in the heap. + break loop + case timerModifying: + // Loop until the modification is complete. + osyield() + case timerNoStatus, timerRemoved: + // We should not see these status values in a timers heap. badTimer() + case timerRunning, timerRemoving, timerMoving: + // Some other P thinks it owns this timer, + // which should not happen. + badTimer() + default: + badTimer() + } + } + } +} + +// adjusttimers looks through the timers in the current P's heap for +// any timers that have been modified to run earlier, and puts them in +// the correct place in the heap. While looking for those timers, +// it also moves timers that have been modified to run later, +// and removes deleted timers. The caller must have locked the timers for pp. +func adjusttimers(pp *p) { + if len(pp.timers) == 0 { + return + } + if atomic.Load(&pp.adjustTimers) == 0 { + return + } + var moved []*timer + for i := 0; i < len(pp.timers); i++ { + t := pp.timers[i] + if t.pp.ptr() != pp { + throw("adjusttimers: bad p") + } + switch s := atomic.Load(&t.status); s { + case timerDeleted: + if atomic.Cas(&t.status, s, timerRemoving) { + if !dodeltimer(pp, i) { + badTimer() + } + if !atomic.Cas(&t.status, timerRemoving, timerRemoved) { + badTimer() + } + // Look at this heap position again. + i-- } - if raceenabled { - raceacquire(unsafe.Pointer(t)) + case timerModifiedEarlier, timerModifiedLater: + if atomic.Cas(&t.status, s, timerMoving) { + // Now we can change the when field. + t.when = t.nextwhen + // Take t off the heap, and hold onto it. + // We don't add it back yet because the + // heap manipulation could cause our + // loop to skip some other timer. 
+ if !dodeltimer(pp, i) { + badTimer() + } + moved = append(moved, t) + if s == timerModifiedEarlier { + if n := atomic.Xadd(&pp.adjustTimers, -1); int32(n) <= 0 { + addAdjustedTimers(pp, moved) + return + } + } } - f(arg, seq) - lock(&tb.lock) + case timerNoStatus, timerRunning, timerRemoving, timerRemoved, timerMoving: + badTimer() + case timerWaiting: + // OK, nothing to do. + case timerModifying: + // Check again after modification is complete. + osyield() + i-- + default: + badTimer() } - if delta < 0 || faketime > 0 { - // No timers left - put goroutine to sleep. - tb.rescheduling = true - goparkunlock(&tb.lock, waitReasonTimerGoroutineIdle, traceEvGoBlock, 1) - continue + } + + if len(moved) > 0 { + addAdjustedTimers(pp, moved) + } +} + +// addAdjustedTimers adds any timers we adjusted in adjusttimers +// back to the timer heap. +func addAdjustedTimers(pp *p, moved []*timer) { + for _, t := range moved { + if !doaddtimer(pp, t) { + badTimer() + } + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + badTimer() } - // At least one timer pending. Sleep until then. - tb.sleeping = true - tb.sleepUntil = now + delta - noteclear(&tb.waitnote) - unlock(&tb.lock) - notetsleepg(&tb.waitnote, delta) } } -func timejump() *g { - if faketime == 0 { - return nil +// nobarrierWakeTime looks at P's timers and returns the time when we +// should wake up the netpoller. It returns 0 if there are no timers. +// This function is invoked when dropping a P, and must run without +// any write barriers. Therefore, if there are any timers that needs +// to be moved earlier, it conservatively returns the current time. +// The netpoller M will wake up and adjust timers before sleeping again. +//go:nowritebarrierrec +func nobarrierWakeTime(pp *p) int64 { + lock(&pp.timersLock) + ret := int64(0) + if len(pp.timers) > 0 { + if atomic.Load(&pp.adjustTimers) > 0 { + ret = nanotime() + } else { + ret = pp.timers[0].when + } } + unlock(&pp.timersLock) + return ret +} + +// runtimer examines the first timer in timers. If it is ready based on now, +// it runs the timer and removes or updates it. +// Returns 0 if it ran a timer, -1 if there are no more timers, or the time +// when the first timer should run. +// The caller must have locked the timers for pp. +// If a timer is run, this will temporarily unlock the timers. +//go:systemstack +func runtimer(pp *p, now int64) int64 { + for { + t := pp.timers[0] + if t.pp.ptr() != pp { + throw("runtimer: bad p") + } + switch s := atomic.Load(&t.status); s { + case timerWaiting: + if t.when > now { + // Not ready to run. + return t.when + } + + if !atomic.Cas(&t.status, s, timerRunning) { + continue + } + // Note that runOneTimer may temporarily unlock + // pp.timersLock. + runOneTimer(pp, t, now) + return 0 - for i := range timers { - lock(&timers[i].lock) + case timerDeleted: + if !atomic.Cas(&t.status, s, timerRemoving) { + continue + } + if !dodeltimer0(pp) { + badTimer() + } + if !atomic.Cas(&t.status, timerRemoving, timerRemoved) { + badTimer() + } + if len(pp.timers) == 0 { + return -1 + } + + case timerModifiedEarlier, timerModifiedLater: + if !atomic.Cas(&t.status, s, timerMoving) { + continue + } + t.when = t.nextwhen + if !dodeltimer0(pp) { + badTimer() + } + if !doaddtimer(pp, t) { + badTimer() + } + if s == timerModifiedEarlier { + atomic.Xadd(&pp.adjustTimers, -1) + } + if !atomic.Cas(&t.status, timerMoving, timerWaiting) { + badTimer() + } + + case timerModifying: + // Wait for modification to complete. 
+ osyield() + + case timerNoStatus, timerRemoved: + // Should not see a new or inactive timer on the heap. + badTimer() + case timerRunning, timerRemoving, timerMoving: + // These should only be set when timers are locked, + // and we didn't do it. + badTimer() + default: + badTimer() + } } - gp := timejumpLocked() - for i := range timers { - unlock(&timers[i].lock) +} + +// runOneTimer runs a single timer. +// The caller must have locked the timers for pp. +// This will temporarily unlock the timers while running the timer function. +//go:systemstack +func runOneTimer(pp *p, t *timer, now int64) { + f := t.f + arg := t.arg + seq := t.seq + + if t.period > 0 { + // Leave in heap but adjust next time to fire. + delta := t.when - now + t.when += t.period * (1 + -delta/t.period) + if !siftdownTimer(pp.timers, 0) { + badTimer() + } + if !atomic.Cas(&t.status, timerRunning, timerWaiting) { + badTimer() + } + } else { + // Remove from heap. + if !dodeltimer0(pp) { + badTimer() + } + if !atomic.Cas(&t.status, timerRunning, timerNoStatus) { + badTimer() + } } - return gp + unlock(&pp.timersLock) + + f(arg, seq) + + lock(&pp.timersLock) } -func timejumpLocked() *g { +func timejump() *p { + if faketime == 0 { + return nil + } + + // Nothing is running, so we can look at all the P's. // Determine a timer bucket with minimum when. - var minT *timer - for i := range timers { - tb := &timers[i] - if !tb.created || len(tb.t) == 0 { + var ( + minT *timer + minWhen int64 + minP *p + ) + for _, pp := range allp { + if pp.status != _Pidle && pp.status != _Pdead { + throw("non-idle P in timejump") + } + if len(pp.timers) == 0 { continue } - t := tb.t[0] - if minT == nil || t.when < minT.when { - minT = t + c := pp.adjustTimers + for _, t := range pp.timers { + switch s := atomic.Load(&t.status); s { + case timerWaiting: + if minT == nil || t.when < minWhen { + minT = t + minWhen = t.when + minP = pp + } + case timerModifiedEarlier, timerModifiedLater: + if minT == nil || t.nextwhen < minWhen { + minT = t + minWhen = t.nextwhen + minP = pp + } + if s == timerModifiedEarlier { + c-- + } + case timerRunning, timerModifying, timerMoving: + badTimer() + } + // The timers are sorted, so we only have to check + // the first timer for each P, unless there are + // some timerModifiedEarlier timers. The number + // of timerModifiedEarlier timers is in the adjustTimers + // field, used to initialize c, above. + if c == 0 { + break + } } } - if minT == nil || minT.when <= faketime { - return nil - } - faketime = minT.when - tb := minT.tb - if !tb.rescheduling { + if minT == nil || minWhen <= faketime { return nil } - tb.rescheduling = false - return tb.gp + + faketime = minWhen + return minP } +// timeSleepUntil returns the time when the next timer should fire. +// This is only called by sysmon. func timeSleepUntil() int64 { - next := int64(1<<63 - 1) + next := int64(maxWhen) - // Determine minimum sleepUntil across all the timer buckets. - // - // The function can not return a precise answer, - // as another timer may pop in as soon as timers have been unlocked. - // So lock the timers one by one instead of all at once. - for i := range timers { - tb := &timers[i] + // Prevent allp slice changes. This is like retake. + lock(&allpLock) + for _, pp := range allp { + if pp == nil { + // This can happen if procresize has grown + // allp but not yet created new Ps. 
+ continue + } - lock(&tb.lock) - if tb.sleeping && tb.sleepUntil < next { - next = tb.sleepUntil + lock(&pp.timersLock) + c := atomic.Load(&pp.adjustTimers) + for _, t := range pp.timers { + switch s := atomic.Load(&t.status); s { + case timerWaiting: + if t.when < next { + next = t.when + } + case timerModifiedEarlier, timerModifiedLater: + if t.nextwhen < next { + next = t.nextwhen + } + if s == timerModifiedEarlier { + c-- + } + } + // The timers are sorted, so we only have to check + // the first timer for each P, unless there are + // some timerModifiedEarlier timers. The number + // of timerModifiedEarlier timers is in the adjustTimers + // field, used to initialize c, above. + // + // We don't worry about cases like timerModifying. + // New timers can show up at any time, + // so this function is necessarily imprecise. + // Do a signed check here since we aren't + // synchronizing the read of pp.adjustTimers + // with the check of a timer status. + if int32(c) <= 0 { + break + } } - unlock(&tb.lock) + unlock(&pp.timersLock) } + unlock(&allpLock) return next } @@ -385,9 +953,6 @@ func timeSleepUntil() int64 { // it will cause the program to crash with a mysterious // "panic holding locks" message. Instead, we panic while not // holding a lock. -// The races can occur despite the bucket locks because assignBucket -// itself is called without locks, so racy calls can cause a timer to -// change buckets while executing these functions. func siftupTimer(t []*timer, i int) bool { if i >= len(t) { @@ -401,12 +966,10 @@ func siftupTimer(t []*timer, i int) bool { break } t[i] = t[p] - t[i].i = i i = p } if tmp != t[i] { t[i] = tmp - t[i].i = i } return true } @@ -444,12 +1007,10 @@ func siftdownTimer(t []*timer, i int) bool { break } t[i] = t[c] - t[i].i = i i = c } if tmp != t[i] { t[i] = tmp - t[i].i = i } return true } diff --git a/libgo/go/runtime/time_fake.go b/libgo/go/runtime/time_fake.go new file mode 100644 index 0000000..c64d299 --- /dev/null +++ b/libgo/go/runtime/time_fake.go @@ -0,0 +1,100 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build faketime +// +build !windows + +// Faketime isn't currently supported on Windows. This would require: +// +// 1. Shadowing time_now, which is implemented in assembly on Windows. +// Since that's exported directly to the time package from runtime +// assembly, this would involve moving it from sys_windows_*.s into +// its own assembly files build-tagged with !faketime and using the +// implementation of time_now from timestub.go in faketime mode. +// +// 2. Modifying syscall.Write to call syscall.faketimeWrite, +// translating the Stdout and Stderr handles into FDs 1 and 2. +// (See CL 192739 PS 3.) + +package runtime + +import "unsafe" + +// faketime is the simulated time in nanoseconds since 1970 for the +// playground. +var faketime int64 = 1257894000000000000 + +var faketimeState struct { + lock mutex + + // lastfaketime is the last faketime value written to fd 1 or 2. + lastfaketime int64 + + // lastfd is the fd to which lastfaketime was written. + // + // Subsequent writes to the same fd may use the same + // timestamp, but the timestamp must increase if the fd + // changes. 
+ lastfd uintptr +} + +//go:nosplit +func nanotime() int64 { + return faketime +} + +func walltime() (sec int64, nsec int32) { + return faketime / 1000000000, int32(faketime % 1000000000) +} + +func write(fd uintptr, p unsafe.Pointer, n int32) int32 { + if !(fd == 1 || fd == 2) { + // Do an ordinary write. + return write1(fd, p, n) + } + + // Write with the playback header. + + // First, lock to avoid interleaving writes. + lock(&faketimeState.lock) + + // If the current fd doesn't match the fd of the previous write, + // ensure that the timestamp is strictly greater. That way, we can + // recover the original order even if we read the fds separately. + t := faketimeState.lastfaketime + if fd != faketimeState.lastfd { + t++ + faketimeState.lastfd = fd + } + if faketime > t { + t = faketime + } + faketimeState.lastfaketime = t + + // Playback header: 0 0 P B <8-byte time> <4-byte data length> (big endian) + var buf [4 + 8 + 4]byte + buf[2] = 'P' + buf[3] = 'B' + tu := uint64(t) + buf[4] = byte(tu >> (7 * 8)) + buf[5] = byte(tu >> (6 * 8)) + buf[6] = byte(tu >> (5 * 8)) + buf[7] = byte(tu >> (4 * 8)) + buf[8] = byte(tu >> (3 * 8)) + buf[9] = byte(tu >> (2 * 8)) + buf[10] = byte(tu >> (1 * 8)) + buf[11] = byte(tu >> (0 * 8)) + nu := uint32(n) + buf[12] = byte(nu >> (3 * 8)) + buf[13] = byte(nu >> (2 * 8)) + buf[14] = byte(nu >> (1 * 8)) + buf[15] = byte(nu >> (0 * 8)) + write1(fd, unsafe.Pointer(&buf[0]), int32(len(buf))) + + // Write actual data. + res := write1(fd, p, n) + + unlock(&faketimeState.lock) + return res +} diff --git a/libgo/go/runtime/time_nofake.go b/libgo/go/runtime/time_nofake.go new file mode 100644 index 0000000..1912a94 --- /dev/null +++ b/libgo/go/runtime/time_nofake.go @@ -0,0 +1,31 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !faketime + +package runtime + +import "unsafe" + +// faketime is the simulated time in nanoseconds since 1970 for the +// playground. +// +// Zero means not to use faketime. +var faketime int64 + +//go:nosplit +func nanotime() int64 { + return nanotime1() +} + +func walltime() (sec int64, nsec int32) { + return walltime1() +} + +// write must be nosplit on Windows (see write1) +// +//go:nosplit +func write(fd uintptr, p unsafe.Pointer, n int32) int32 { + return write1(fd, p, n) +} diff --git a/libgo/go/runtime/time_test.go b/libgo/go/runtime/time_test.go new file mode 100644 index 0000000..9b1922d --- /dev/null +++ b/libgo/go/runtime/time_test.go @@ -0,0 +1,96 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "bytes" + "encoding/binary" + "errors" + "internal/testenv" + "os/exec" + "reflect" + "runtime" + "testing" +) + +func TestFakeTime(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("faketime not supported on windows") + } + if runtime.Compiler == "gccgo" { + t.Skip("faketime not supported for gccgo") + } + + t.Parallel() + + exe, err := buildTestProg(t, "testfaketime", "-tags=faketime") + if err != nil { + t.Fatal(err) + } + + var stdout, stderr bytes.Buffer + cmd := exec.Command(exe) + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err = testenv.CleanCmdEnv(cmd).Run() + if err != nil { + t.Fatalf("exit status: %v\n%s", err, stderr.String()) + } + + t.Logf("raw stdout: %q", stdout.String()) + t.Logf("raw stderr: %q", stdout.String()) + + f1, err1 := parseFakeTime(stdout.Bytes()) + if err1 != nil { + t.Fatal(err1) + } + f2, err2 := parseFakeTime(stderr.Bytes()) + if err2 != nil { + t.Fatal(err2) + } + + const time0 = 1257894000000000000 + got := [][]fakeTimeFrame{f1, f2} + var want = [][]fakeTimeFrame{{ + {time0 + 1, "line 2\n"}, + {time0 + 1, "line 3\n"}, + {time0 + 1e9, "line 5\n"}, + {time0 + 1e9, "2009-11-10T23:00:01Z"}, + }, { + {time0, "line 1\n"}, + {time0 + 2, "line 4\n"}, + }} + if !reflect.DeepEqual(want, got) { + t.Fatalf("want %v, got %v", want, got) + } +} + +type fakeTimeFrame struct { + time uint64 + data string +} + +func parseFakeTime(x []byte) ([]fakeTimeFrame, error) { + var frames []fakeTimeFrame + for len(x) != 0 { + if len(x) < 4+8+4 { + return nil, errors.New("truncated header") + } + const magic = "\x00\x00PB" + if string(x[:len(magic)]) != magic { + return nil, errors.New("bad magic") + } + x = x[len(magic):] + time := binary.BigEndian.Uint64(x) + x = x[8:] + dlen := binary.BigEndian.Uint32(x) + x = x[4:] + data := string(x[:dlen]) + x = x[dlen:] + frames = append(frames, fakeTimeFrame{time, data}) + } + return frames, nil +} diff --git a/libgo/go/runtime/timestub2.go b/libgo/go/runtime/timestub2.go index 7a28603..38446fb 100644 --- a/libgo/go/runtime/timestub2.go +++ b/libgo/go/runtime/timestub2.go @@ -5,6 +5,7 @@ // +build !darwin // +build !windows // +build !freebsd + package runtime -func walltime() (sec int64, nsec int32) +func walltime1() (sec int64, nsec int32) diff --git a/libgo/go/runtime/trace.go b/libgo/go/runtime/trace.go index 6db5b62..81ff0ca 100644 --- a/libgo/go/runtime/trace.go +++ b/libgo/go/runtime/trace.go @@ -54,7 +54,7 @@ const ( traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id] traceEvHeapAlloc = 33 // memstats.heap_live change [timestamp, heap_alloc] traceEvNextGC = 34 // memstats.next_gc change [timestamp, next_gc] - traceEvTimerGoroutine = 35 // denotes timer goroutine [timer goroutine id] + traceEvTimerGoroutine = 35 // not currently used; previously denoted timer goroutine [timer goroutine id] traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp] traceEvString = 37 // string dictionary entry [ID, length, string] traceEvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id] @@ -84,7 +84,7 @@ const ( // and ppc64le. // Tracing won't work reliably for architectures where cputicks is emulated // by nanotime, so the value doesn't matter for those architectures. - traceTickDiv = 16 + 48*(sys.Goarch386|sys.GoarchAmd64|sys.GoarchAmd64p32) + traceTickDiv = 16 + 48*(sys.Goarch386|sys.GoarchAmd64) // Maximum number of PCs in a single stack trace. 
// Since events contain only stack id rather than whole stack trace, // we can allow quite large values here. @@ -181,9 +181,12 @@ func traceBufPtrOf(b *traceBuf) traceBufPtr { // Most clients should use the runtime/trace package or the testing package's // -test.trace flag instead of calling StartTrace directly. func StartTrace() error { - // Stop the world, so that we can take a consistent snapshot + // Stop the world so that we can take a consistent snapshot // of all goroutines at the beginning of the trace. - stopTheWorld("start tracing") + // Do not stop the world during GC so we ensure we always see + // a consistent view of GC-related events (e.g. a start is always + // paired with an end). + stopTheWorldGC("start tracing") // We are in stop-the-world, but syscalls can finish and write to trace concurrently. // Exitsyscall could check trace.enabled long before and then suddenly wake up @@ -194,7 +197,7 @@ func StartTrace() error { if trace.enabled || trace.shutdown { unlock(&trace.bufLock) - startTheWorld() + startTheWorldGC() return errorString("tracing is already enabled") } @@ -265,7 +268,7 @@ func StartTrace() error { unlock(&trace.bufLock) - startTheWorld() + startTheWorldGC() return nil } @@ -274,14 +277,14 @@ func StartTrace() error { func StopTrace() { // Stop the world so that we can collect the trace buffers from all p's below, // and also to avoid races with traceEvent. - stopTheWorld("stop tracing") + stopTheWorldGC("stop tracing") // See the comment in StartTrace. lock(&trace.bufLock) if !trace.enabled { unlock(&trace.bufLock) - startTheWorld() + startTheWorldGC() return } @@ -318,7 +321,7 @@ func StopTrace() { trace.shutdown = true unlock(&trace.bufLock) - startTheWorld() + startTheWorldGC() // The world is started but we've set trace.shutdown, so new tracing can't start. // Wait for the trace reader to flush pending buffers and stop. @@ -414,13 +417,6 @@ func ReadTrace() []byte { var data []byte data = append(data, traceEvFrequency|0<<traceArgCountShift) data = traceAppend(data, uint64(freq)) - for i := range timers { - tb := &timers[i] - if tb.gp != nil { - data = append(data, traceEvTimerGoroutine|0<<traceArgCountShift) - data = traceAppend(data, uint64(tb.gp.goid)) - } - } // This will emit a bunch of full buffers, we will pick them up // on the next iteration. trace.stackTab.dump() @@ -922,7 +918,7 @@ func (p *traceAllocBlockPtr) set(x *traceAllocBlock) { *p = traceAllocBlockPtr(u // alloc allocates n-byte block. func (a *traceAlloc) alloc(n uintptr) unsafe.Pointer { - n = round(n, sys.PtrSize) + n = alignUp(n, sys.PtrSize) if a.head == 0 || a.off+n > uintptr(len(a.head.ptr().data)) { if n > uintptr(len(a.head.ptr().data)) { throw("trace: alloc too large") diff --git a/libgo/go/runtime/trace/trace_stack_test.go b/libgo/go/runtime/trace/trace_stack_test.go index 62c06e6..e3608c6 100644 --- a/libgo/go/runtime/trace/trace_stack_test.go +++ b/libgo/go/runtime/trace/trace_stack_test.go @@ -233,6 +233,7 @@ func TestTraceSymbolize(t *testing.T) { }}, {trace.EvGomaxprocs, []frame{ {"runtime.startTheWorld", 0}, // this is when the current gomaxprocs is logged. 
+ {"runtime.startTheWorldGC", 0}, {"runtime.GOMAXPROCS", 0}, {"runtime/trace_test.TestTraceSymbolize", 0}, {"testing.tRunner", 0}, diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go index 4134d28..1ba91af 100644 --- a/libgo/go/runtime/traceback_gccgo.go +++ b/libgo/go/runtime/traceback_gccgo.go @@ -20,7 +20,7 @@ func printcreatedby(gp *g) { if entry != 0 && tracepc > entry { tracepc -= sys.PCQuantum } - function, file, line, _ := funcfileline(tracepc, -1) + function, file, line, _ := funcfileline(tracepc, -1, false) if function != "" && showframe(function, gp, false) && gp.goid != 1 { printcreatedby1(function, file, line, entry, pc) } @@ -93,7 +93,7 @@ func traceback(skip int32) { func printAncestorTraceback(ancestor ancestorInfo) { print("[originating from goroutine ", ancestor.goid, "]:\n") for fidx, pc := range ancestor.pcs { - function, file, line, _ := funcfileline(pc, -1) + function, file, line, _ := funcfileline(pc, -1, false) if showfuncinfo(function, fidx == 0) { printAncestorTracebackFuncInfo(function, file, line, pc) } @@ -102,7 +102,7 @@ func printAncestorTraceback(ancestor ancestorInfo) { print("...additional frames elided...\n") } // Show what created goroutine, except main goroutine (goid 1). - function, file, line, _ := funcfileline(ancestor.gopc, -1) + function, file, line, _ := funcfileline(ancestor.gopc, -1, false) if function != "" && showfuncinfo(function, false) && ancestor.goid != 1 { printcreatedby1(function, file, line, funcentry(ancestor.gopc), ancestor.gopc) } diff --git a/libgo/go/runtime/treap_test.go b/libgo/go/runtime/treap_test.go deleted file mode 100644 index 110f51c..0000000 --- a/libgo/go/runtime/treap_test.go +++ /dev/null @@ -1,270 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package runtime_test - -import ( - "fmt" - "runtime" - "testing" -) - -var spanDesc = map[uintptr]struct { - pages uintptr - scav bool -}{ - 0xc0000000: {2, false}, - 0xc0006000: {1, false}, - 0xc0010000: {8, false}, - 0xc0022000: {7, false}, - 0xc0034000: {4, true}, - 0xc0040000: {5, false}, - 0xc0050000: {5, true}, - 0xc0060000: {5000, false}, -} - -// Wrap the Treap one more time because go:notinheap doesn't -// actually follow a structure across package boundaries. -// -//go:notinheap -type treap struct { - runtime.Treap -} - -func maskMatchName(mask, match runtime.TreapIterType) string { - return fmt.Sprintf("%0*b-%0*b", runtime.TreapIterBits, uint8(mask), runtime.TreapIterBits, uint8(match)) -} - -func TestTreapFilter(t *testing.T) { - var iterTypes = [...]struct { - mask, match runtime.TreapIterType - filter runtime.TreapIterFilter // expected filter - }{ - {0, 0, 0xf}, - {runtime.TreapIterScav, 0, 0x5}, - {runtime.TreapIterScav, runtime.TreapIterScav, 0xa}, - {runtime.TreapIterScav | runtime.TreapIterHuge, runtime.TreapIterHuge, 0x4}, - {runtime.TreapIterScav | runtime.TreapIterHuge, 0, 0x1}, - {0, runtime.TreapIterScav, 0x0}, - } - for _, it := range iterTypes { - t.Run(maskMatchName(it.mask, it.match), func(t *testing.T) { - if f := runtime.TreapFilter(it.mask, it.match); f != it.filter { - t.Fatalf("got %#x, want %#x", f, it.filter) - } - }) - } -} - -// This test ensures that the treap implementation in the runtime -// maintains all stated invariants after different sequences of -// insert, removeSpan, find, and erase. 
Invariants specific to the -// treap data structure are checked implicitly: after each mutating -// operation, treap-related invariants are checked for the entire -// treap. -func TestTreap(t *testing.T) { - // Set up a bunch of spans allocated into mheap_. - // Also, derive a set of typeCounts of each type of span - // according to runtime.TreapIterType so we can verify against - // them later. - spans := make([]runtime.Span, 0, len(spanDesc)) - typeCounts := [1 << runtime.TreapIterBits][1 << runtime.TreapIterBits]int{} - for base, de := range spanDesc { - s := runtime.AllocSpan(base, de.pages, de.scav) - defer s.Free() - spans = append(spans, s) - - for i := runtime.TreapIterType(0); i < 1<<runtime.TreapIterBits; i++ { - for j := runtime.TreapIterType(0); j < 1<<runtime.TreapIterBits; j++ { - if s.MatchesIter(i, j) { - typeCounts[i][j]++ - } - } - } - } - t.Run("TypeCountsSanity", func(t *testing.T) { - // Just sanity check type counts for a few values. - check := func(mask, match runtime.TreapIterType, count int) { - tc := typeCounts[mask][match] - if tc != count { - name := maskMatchName(mask, match) - t.Fatalf("failed a sanity check for mask/match %s counts: got %d, wanted %d", name, tc, count) - } - } - check(0, 0, len(spanDesc)) - check(runtime.TreapIterScav, 0, 6) - check(runtime.TreapIterScav, runtime.TreapIterScav, 2) - }) - t.Run("Insert", func(t *testing.T) { - tr := treap{} - // Test just a very basic insert/remove for sanity. - tr.Insert(spans[0]) - tr.RemoveSpan(spans[0]) - }) - t.Run("FindTrivial", func(t *testing.T) { - tr := treap{} - // Test just a very basic find operation for sanity. - tr.Insert(spans[0]) - i := tr.Find(1) - if i.Span() != spans[0] { - t.Fatal("found unknown span in treap") - } - tr.RemoveSpan(spans[0]) - }) - t.Run("FindFirstFit", func(t *testing.T) { - // Run this 10 times, recreating the treap each time. - // Because of the non-deterministic structure of a treap, - // we'll be able to test different structures this way. - for i := 0; i < 10; i++ { - tr := runtime.Treap{} - for _, s := range spans { - tr.Insert(s) - } - i := tr.Find(5) - if i.Span().Base() != 0xc0010000 { - t.Fatalf("expected span at lowest address which could fit 5 pages, instead found span at %x", i.Span().Base()) - } - for _, s := range spans { - tr.RemoveSpan(s) - } - } - }) - t.Run("Iterate", func(t *testing.T) { - for mask := runtime.TreapIterType(0); mask < 1<<runtime.TreapIterBits; mask++ { - for match := runtime.TreapIterType(0); match < 1<<runtime.TreapIterBits; match++ { - iterName := maskMatchName(mask, match) - t.Run(iterName, func(t *testing.T) { - t.Run("StartToEnd", func(t *testing.T) { - // Ensure progressing an iterator actually goes over the whole treap - // from the start and that it iterates over the elements in order. - // Furthermore, ensure that it only iterates over the relevant parts - // of the treap. - // Finally, ensures that Start returns a valid iterator. 
- tr := treap{} - for _, s := range spans { - tr.Insert(s) - } - nspans := 0 - lastBase := uintptr(0) - for i := tr.Start(mask, match); i.Valid(); i = i.Next() { - nspans++ - if lastBase > i.Span().Base() { - t.Fatalf("not iterating in correct order: encountered base %x before %x", lastBase, i.Span().Base()) - } - lastBase = i.Span().Base() - if !i.Span().MatchesIter(mask, match) { - t.Fatalf("found non-matching span while iteration over mask/match %s: base %x", iterName, i.Span().Base()) - } - } - if nspans != typeCounts[mask][match] { - t.Fatal("failed to iterate forwards over full treap") - } - for _, s := range spans { - tr.RemoveSpan(s) - } - }) - t.Run("EndToStart", func(t *testing.T) { - // See StartToEnd tests. - tr := treap{} - for _, s := range spans { - tr.Insert(s) - } - nspans := 0 - lastBase := ^uintptr(0) - for i := tr.End(mask, match); i.Valid(); i = i.Prev() { - nspans++ - if lastBase < i.Span().Base() { - t.Fatalf("not iterating in correct order: encountered base %x before %x", lastBase, i.Span().Base()) - } - lastBase = i.Span().Base() - if !i.Span().MatchesIter(mask, match) { - t.Fatalf("found non-matching span while iteration over mask/match %s: base %x", iterName, i.Span().Base()) - } - } - if nspans != typeCounts[mask][match] { - t.Fatal("failed to iterate backwards over full treap") - } - for _, s := range spans { - tr.RemoveSpan(s) - } - }) - }) - } - } - t.Run("Prev", func(t *testing.T) { - // Test the iterator invariant that i.prev().next() == i. - tr := treap{} - for _, s := range spans { - tr.Insert(s) - } - i := tr.Start(0, 0).Next().Next() - p := i.Prev() - if !p.Valid() { - t.Fatal("i.prev() is invalid") - } - if p.Next().Span() != i.Span() { - t.Fatal("i.prev().next() != i") - } - for _, s := range spans { - tr.RemoveSpan(s) - } - }) - t.Run("Next", func(t *testing.T) { - // Test the iterator invariant that i.next().prev() == i. - tr := treap{} - for _, s := range spans { - tr.Insert(s) - } - i := tr.Start(0, 0).Next().Next() - n := i.Next() - if !n.Valid() { - t.Fatal("i.next() is invalid") - } - if n.Prev().Span() != i.Span() { - t.Fatal("i.next().prev() != i") - } - for _, s := range spans { - tr.RemoveSpan(s) - } - }) - }) - t.Run("EraseOne", func(t *testing.T) { - // Test that erasing one iterator correctly retains - // all relationships between elements. - tr := treap{} - for _, s := range spans { - tr.Insert(s) - } - i := tr.Start(0, 0).Next().Next().Next() - s := i.Span() - n := i.Next() - p := i.Prev() - tr.Erase(i) - if n.Prev().Span() != p.Span() { - t.Fatal("p, n := i.Prev(), i.Next(); n.prev() != p after i was erased") - } - if p.Next().Span() != n.Span() { - t.Fatal("p, n := i.Prev(), i.Next(); p.next() != n after i was erased") - } - tr.Insert(s) - for _, s := range spans { - tr.RemoveSpan(s) - } - }) - t.Run("EraseAll", func(t *testing.T) { - // Test that erasing iterators actually removes nodes from the treap. - tr := treap{} - for _, s := range spans { - tr.Insert(s) - } - for i := tr.Start(0, 0); i.Valid(); { - n := i.Next() - tr.Erase(i) - i = n - } - if size := tr.Size(); size != 0 { - t.Fatalf("should have emptied out treap, %d spans left", size) - } - }) -} |
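A note on the new testdata/testwinlib/main.go: it exports Go functions to C with //export and routes a C function pointer through a small static C bridge (bridgeCallback), since cgo cannot call a C function pointer directly from Go. The sketch below shows the same bridging pattern as a self-contained, non-Windows program; the names invoke, callGoTwice, and goCallback are invented for illustration and are not part of this commit.

package main

/*
// A C function pointer type and a static bridge that calls through it;
// this mirrors bridgeCallback in testwinlib/main.go, because Go code
// cannot invoke a C function pointer directly.
typedef void (*cb)(int);
static void invoke(cb f, int x) { f(x); }

// Declared here, implemented in Go via the //export directive below.
extern void goCallback(int x);

static void callGoTwice(void) {
	invoke(goCallback, 1);
	invoke(goCallback, 2);
}
*/
import "C"

import "fmt"

//export goCallback
func goCallback(x C.int) {
	fmt.Println("Go callback got", int(x))
}

func main() {
	// C calls through the function pointer, which lands back in Go.
	C.callGoTwice()
}

testwinlib itself is exercised the other way around, with a C harness on Windows calling the exported CallMeBack and Dummy, but the bridge idiom is the same.
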
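The comment block added to time.go documents a per-timer status word and the legal transitions between states (timerNoStatus, timerWaiting, timerRunning, timerDeleted, timerModifying, and so on), replacing the old timersBucket/timerproc machinery. As a reading aid, here is a toy sketch of the same compare-and-swap discipline in the shape of deltimer: mark a waiting timer deleted and spin while another owner briefly holds it. The toy* identifiers and the use of runtime.Gosched in place of osyield are assumptions for the sketch, not runtime code.

// Toy illustration of the timer status protocol described in the time.go
// comments above. This is not the runtime's code.
package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

// A subset of the status values used by the real timers.
const (
	toyNoStatus uint32 = iota
	toyWaiting
	toyRunning
	toyDeleted
	toyModifying
)

type toyTimer struct {
	status uint32 // manipulated only with atomic loads and CAS, like timer.status
}

// toyStop mimics the shape of deltimer: it may only mark the timer deleted;
// whoever owns the heap the timer sits on is expected to remove it later.
func toyStop(t *toyTimer) bool {
	for {
		switch s := atomic.LoadUint32(&t.status); s {
		case toyWaiting:
			if atomic.CompareAndSwapUint32(&t.status, s, toyDeleted) {
				return true // stopped before it ran
			}
		case toyNoStatus, toyDeleted:
			return false // never started, or already stopped
		case toyRunning, toyModifying:
			// Another goroutine owns the timer briefly; yield and retry,
			// the way the runtime loops on osyield().
			runtime.Gosched()
		default:
			panic("toyStop: unexpected status")
		}
	}
}

func main() {
	t := &toyTimer{status: toyWaiting}
	fmt.Println(toyStop(t)) // true: Waiting -> Deleted
	fmt.Println(toyStop(t)) // false: already Deleted
}
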
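The reworded stopTimer comment ("reports whether t was stopped before being run") and the new resetTimer linkname are the runtime half of the familiar time.Timer Stop/Reset contract. A small user-level example of honoring that contract with the standard time package, ordinary application code rather than runtime internals:

package main

import (
	"fmt"
	"time"
)

func main() {
	t := time.NewTimer(50 * time.Millisecond)

	// Stop reports whether the timer was stopped before it fired,
	// which corresponds to deltimer's result. If it already fired,
	// drain the channel before Reset, as the time package docs advise.
	if !t.Stop() {
		<-t.C
	}

	// Reset rearms the now-inactive timer; the new time.resetTimer
	// linkname above exists to serve this path on the runtime side.
	t.Reset(10 * time.Millisecond)
	fmt.Println("fired at", <-t.C)
}
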
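The new time_fake.go frames every write to file descriptors 1 and 2 with a 16-byte playback header: the magic bytes 0x00 0x00 'P' 'B', an 8-byte big-endian nanosecond timestamp, and a 4-byte big-endian payload length, which parseFakeTime in the new time_test.go then decodes. The runtime builds the header with manual byte shifts because it cannot import encoding/binary; outside the runtime the same framing can be written and read more compactly. A sketch under that assumption, with appendFrame and readFrame as invented names:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// appendFrame appends one faketime playback frame:
// 0x00 0x00 'P' 'B', 8-byte big-endian time, 4-byte big-endian length, data.
func appendFrame(dst []byte, t uint64, data []byte) []byte {
	var hdr [16]byte
	copy(hdr[:4], "\x00\x00PB")
	binary.BigEndian.PutUint64(hdr[4:12], t)
	binary.BigEndian.PutUint32(hdr[12:16], uint32(len(data)))
	return append(append(dst, hdr[:]...), data...)
}

// readFrame is the inverse, mirroring one step of parseFakeTime in time_test.go.
func readFrame(src []byte) (t uint64, data, rest []byte, err error) {
	if len(src) < 16 || !bytes.Equal(src[:4], []byte("\x00\x00PB")) {
		return 0, nil, nil, fmt.Errorf("bad playback header")
	}
	t = binary.BigEndian.Uint64(src[4:12])
	n := binary.BigEndian.Uint32(src[12:16])
	if uint32(len(src)-16) < n {
		return 0, nil, nil, fmt.Errorf("truncated frame")
	}
	return t, src[16 : 16+n], src[16+n:], nil
}

func main() {
	const time0 = 1257894000000000000 // playground faketime epoch used by the test
	buf := appendFrame(nil, time0+1, []byte("line 1\n"))
	t, data, _, err := readFrame(buf)
	fmt.Println(t, string(data), err)
}

Round-tripping a frame this way produces exactly the layout that TestFakeTime expects to find in the testfaketime program's output.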