Diffstat (limited to 'libgo/go/runtime')
-rw-r--r--  libgo/go/runtime/auxv_none.go | 1
-rw-r--r--  libgo/go/runtime/cgo_gccgo.go | 2
-rw-r--r--  libgo/go/runtime/cgocall.go | 2
-rw-r--r--  libgo/go/runtime/chan.go | 54
-rw-r--r--  libgo/go/runtime/closure_test.go | 1
-rw-r--r--  libgo/go/runtime/cpuflags_arm64.go | 17
-rw-r--r--  libgo/go/runtime/crash_cgo_test.go | 23
-rw-r--r--  libgo/go/runtime/crash_test.go | 28
-rw-r--r--  libgo/go/runtime/crash_unix_test.go | 20
-rw-r--r--  libgo/go/runtime/debug.go | 5
-rw-r--r--  libgo/go/runtime/debug/garbage.go | 7
-rw-r--r--  libgo/go/runtime/debug/heapdump_test.go | 5
-rw-r--r--  libgo/go/runtime/debug/panic_test.go | 53
-rw-r--r--  libgo/go/runtime/debug_test.go | 4
-rw-r--r--  libgo/go/runtime/error.go | 20
-rw-r--r--  libgo/go/runtime/export_pipe2_test.go | 15
-rw-r--r--  libgo/go/runtime/export_pipe_test.go | 9
-rw-r--r--  libgo/go/runtime/export_test.go | 281
-rw-r--r--  libgo/go/runtime/export_unix_test.go | 1
-rw-r--r--  libgo/go/runtime/extern.go | 21
-rw-r--r--  libgo/go/runtime/gc_test.go | 23
-rw-r--r--  libgo/go/runtime/gcinfo_test.go | 19
-rw-r--r--  libgo/go/runtime/heapdump.go | 98
-rw-r--r--  libgo/go/runtime/histogram.go | 148
-rw-r--r--  libgo/go/runtime/histogram_test.go | 58
-rw-r--r--  libgo/go/runtime/internal/atomic/atomic.c | 82
-rw-r--r--  libgo/go/runtime/internal/atomic/atomic_test.go | 128
-rw-r--r--  libgo/go/runtime/internal/atomic/bench_test.go | 91
-rw-r--r--  libgo/go/runtime/internal/atomic/gccgo.go | 18
-rw-r--r--  libgo/go/runtime/internal/atomic/unaligned.go | 14
-rw-r--r--  libgo/go/runtime/lockrank.go | 48
-rw-r--r--  libgo/go/runtime/lockrank_off.go | 34
-rw-r--r--  libgo/go/runtime/lockrank_on.go | 195
-rw-r--r--  libgo/go/runtime/malloc.go | 184
-rw-r--r--  libgo/go/runtime/malloc_test.go | 57
-rw-r--r--  libgo/go/runtime/map.go | 19
-rw-r--r--  libgo/go/runtime/map_benchmark_test.go | 1
-rw-r--r--  libgo/go/runtime/map_fast32.go | 21
-rw-r--r--  libgo/go/runtime/map_fast64.go | 21
-rw-r--r--  libgo/go/runtime/map_faststr.go | 9
-rw-r--r--  libgo/go/runtime/map_test.go | 22
-rw-r--r--  libgo/go/runtime/mbarrier.go | 23
-rw-r--r--  libgo/go/runtime/mbitmap.go | 317
-rw-r--r--  libgo/go/runtime/mcache.go | 155
-rw-r--r--  libgo/go/runtime/mcentral.go | 284
-rw-r--r--  libgo/go/runtime/mcheckmark.go | 109
-rw-r--r--  libgo/go/runtime/mem_gccgo.go | 12
-rw-r--r--  libgo/go/runtime/memmove_linux_amd64_test.go | 3
-rw-r--r--  libgo/go/runtime/memmove_test.go | 69
-rw-r--r--  libgo/go/runtime/metrics.go | 485
-rw-r--r--  libgo/go/runtime/metrics/description.go | 180
-rw-r--r--  libgo/go/runtime/metrics/description_test.go | 115
-rw-r--r--  libgo/go/runtime/metrics/doc.go | 144
-rw-r--r--  libgo/go/runtime/metrics/histogram.go | 30
-rw-r--r--  libgo/go/runtime/metrics/sample.go | 47
-rw-r--r--  libgo/go/runtime/metrics/value.go | 69
-rw-r--r--  libgo/go/runtime/metrics_test.go | 224
-rw-r--r--  libgo/go/runtime/mfinal.go | 20
-rw-r--r--  libgo/go/runtime/mfixalloc.go | 4
-rw-r--r--  libgo/go/runtime/mgc.go | 492
-rw-r--r--  libgo/go/runtime/mgcmark.go | 218
-rw-r--r--  libgo/go/runtime/mgcscavenge.go | 136
-rw-r--r--  libgo/go/runtime/mgcscavenge_test.go | 53
-rw-r--r--  libgo/go/runtime/mgcsweep.go | 251
-rw-r--r--  libgo/go/runtime/mgcsweepbuf.go | 176
-rw-r--r--  libgo/go/runtime/mgcwork.go | 77
-rw-r--r--  libgo/go/runtime/mheap.go | 291
-rw-r--r--  libgo/go/runtime/mkfastlog2table.go | 4
-rw-r--r--  libgo/go/runtime/mkpreempt.go | 24
-rw-r--r--  libgo/go/runtime/mksizeclasses.go | 9
-rw-r--r--  libgo/go/runtime/mpagealloc.go | 224
-rw-r--r--  libgo/go/runtime/mpagealloc_32bit.go | 18
-rw-r--r--  libgo/go/runtime/mpagealloc_64bit.go | 34
-rw-r--r--  libgo/go/runtime/mpagealloc_test.go | 98
-rw-r--r--  libgo/go/runtime/mpagecache.go | 54
-rw-r--r--  libgo/go/runtime/mpallocbits.go | 188
-rw-r--r--  libgo/go/runtime/mpallocbits_test.go | 69
-rw-r--r--  libgo/go/runtime/mranges.go | 52
-rw-r--r--  libgo/go/runtime/mranges_test.go | 275
-rw-r--r--  libgo/go/runtime/mspanset.go | 4
-rw-r--r--  libgo/go/runtime/mstats.go | 566
-rw-r--r--  libgo/go/runtime/mwbbuf.go | 32
-rw-r--r--  libgo/go/runtime/os_aix.go | 20
-rw-r--r--  libgo/go/runtime/os_freebsd_arm64.go | 143
-rw-r--r--  libgo/go/runtime/os_linux.go | 4
-rw-r--r--  libgo/go/runtime/os_linux_arm64.go | 14
-rw-r--r--  libgo/go/runtime/os_linux_s390x.go | 25
-rw-r--r--  libgo/go/runtime/os_netbsd.go | 21
-rw-r--r--  libgo/go/runtime/os_openbsd_arm64.go | 23
-rw-r--r--  libgo/go/runtime/panic.go | 24
-rw-r--r--  libgo/go/runtime/pprof/mprof_test.go | 2
-rw-r--r--  libgo/go/runtime/pprof/pprof_rusage.go | 2
-rw-r--r--  libgo/go/runtime/pprof/pprof_test.go | 13
-rw-r--r--  libgo/go/runtime/pprof/proto.go | 4
-rw-r--r--  libgo/go/runtime/pprof/proto_test.go | 3
-rw-r--r--  libgo/go/runtime/preempt.go | 5
-rw-r--r--  libgo/go/runtime/print.go | 3
-rw-r--r--  libgo/go/runtime/proc.go | 777
-rw-r--r--  libgo/go/runtime/race0.go | 2
-rw-r--r--  libgo/go/runtime/runtime-lldb_test.go | 9
-rw-r--r--  libgo/go/runtime/runtime1.go | 29
-rw-r--r--  libgo/go/runtime/runtime2.go | 92
-rw-r--r--  libgo/go/runtime/select.go | 5
-rw-r--r--  libgo/go/runtime/signal_unix.go | 68
-rw-r--r--  libgo/go/runtime/signal_windows_test.go | 5
-rw-r--r--  libgo/go/runtime/sigqueue.go | 6
-rw-r--r--  libgo/go/runtime/sizeclasses.go | 141
-rw-r--r--  libgo/go/runtime/slice.go | 48
-rw-r--r--  libgo/go/runtime/slice_test.go | 1
-rw-r--r--  libgo/go/runtime/stack_test.go | 41
-rw-r--r--  libgo/go/runtime/string.go | 50
-rw-r--r--  libgo/go/runtime/string_test.go | 31
-rw-r--r--  libgo/go/runtime/stubs.go | 6
-rw-r--r--  libgo/go/runtime/testdata/testprog/memprof.go | 3
-rw-r--r--  libgo/go/runtime/testdata/testprog/syscalls_linux.go | 3
-rw-r--r--  libgo/go/runtime/testdata/testprog/timeprof.go | 3
-rw-r--r--  libgo/go/runtime/testdata/testprog/vdso.go | 3
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/eintr.go | 3
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/exec.go | 3
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/pprof.go | 3
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/threadpprof.go | 3
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/traceback.go | 79
-rw-r--r--  libgo/go/runtime/time.go | 147
-rw-r--r--  libgo/go/runtime/time_test.go | 6
-rw-r--r--  libgo/go/runtime/trace.go | 7
-rw-r--r--  libgo/go/runtime/trace/annotation.go | 4
-rw-r--r--  libgo/go/runtime/trace/annotation_test.go | 4
-rw-r--r--  libgo/go/runtime/trace/trace_stack_test.go | 3
-rw-r--r--  libgo/go/runtime/trace/trace_test.go | 3
-rw-r--r--  libgo/go/runtime/traceback_gccgo.go | 5
130 files changed, 6133 insertions, 3260 deletions
diff --git a/libgo/go/runtime/auxv_none.go b/libgo/go/runtime/auxv_none.go
index 3a560a17..3ca617b 100644
--- a/libgo/go/runtime/auxv_none.go
+++ b/libgo/go/runtime/auxv_none.go
@@ -7,7 +7,6 @@
// +build !dragonfly
// +build !freebsd
// +build !netbsd
-// +build !openbsd !arm64
// +build !solaris
package runtime
diff --git a/libgo/go/runtime/cgo_gccgo.go b/libgo/go/runtime/cgo_gccgo.go
index e4d27e8..520838a 100644
--- a/libgo/go/runtime/cgo_gccgo.go
+++ b/libgo/go/runtime/cgo_gccgo.go
@@ -76,7 +76,7 @@ func CgocallDone() {
func CgocallBack() {
gp := getg()
if gp == nil || gp.m == nil {
- needm(0)
+ needm()
gp = getg()
mp := gp.m
mp.dropextram = true
diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go
index efd0e24..79fe65c 100644
--- a/libgo/go/runtime/cgocall.go
+++ b/libgo/go/runtime/cgocall.go
@@ -227,7 +227,7 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) {
hbits := heapBitsForAddr(base)
n := span.elemsize
for i = uintptr(0); i < n; i += sys.PtrSize {
- if i != 1*sys.PtrSize && !hbits.morePointers() {
+ if !hbits.morePointers() {
// No more possible pointers.
break
}
diff --git a/libgo/go/runtime/chan.go b/libgo/go/runtime/chan.go
index 1b20aff..7878a8f 100644
--- a/libgo/go/runtime/chan.go
+++ b/libgo/go/runtime/chan.go
@@ -232,8 +232,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
// Space is available in the channel buffer. Enqueue the element to send.
qp := chanbuf(c, c.sendx)
if raceenabled {
- raceacquire(qp)
- racerelease(qp)
+ racenotify(c, c.sendx, nil)
}
typedmemmove(c.elemtype, qp, ep)
c.sendx++
@@ -315,11 +314,8 @@ func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
// Pretend we go through the buffer, even though
// we copy directly. Note that we need to increment
// the head/tail locations only when raceenabled.
- qp := chanbuf(c, c.recvx)
- raceacquire(qp)
- racerelease(qp)
- raceacquireg(sg.g, qp)
- racereleaseg(sg.g, qp)
+ racenotify(c, c.recvx, nil)
+ racenotify(c, c.recvx, sg)
c.recvx++
if c.recvx == c.dataqsiz {
c.recvx = 0
@@ -557,8 +553,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
// Receive directly from queue
qp := chanbuf(c, c.recvx)
if raceenabled {
- raceacquire(qp)
- racerelease(qp)
+ racenotify(c, c.recvx, nil)
}
if ep != nil {
typedmemmove(c.elemtype, ep, qp)
@@ -647,10 +642,8 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
// queue is full, those are both the same slot.
qp := chanbuf(c, c.recvx)
if raceenabled {
- raceacquire(qp)
- racerelease(qp)
- raceacquireg(sg.g, qp)
- racereleaseg(sg.g, qp)
+ racenotify(c, c.recvx, nil)
+ racenotify(c, c.recvx, sg)
}
// copy data from queue to receiver
if ep != nil {
@@ -861,3 +854,38 @@ func racesync(c *hchan, sg *sudog) {
racereleaseg(sg.g, chanbuf(c, 0))
raceacquire(chanbuf(c, 0))
}
+
+// Notify the race detector of a send or receive involving buffer entry idx
+// and a channel c or its communicating partner sg.
+// This function handles the special case of c.elemsize==0.
+func racenotify(c *hchan, idx uint, sg *sudog) {
+ // We could have passed the unsafe.Pointer corresponding to entry idx
+ // instead of idx itself. However, in a future version of this function,
+ // we can use idx to better handle the case of elemsize==0.
+ // A future improvement to the detector is to call TSan with c and idx:
+ // this way, Go will continue to not allocate buffer entries for channels
+ // of elemsize==0, yet the race detector can be made to handle multiple
+ // sync objects under the hood (one sync object per idx).
+ qp := chanbuf(c, idx)
+ // When elemsize==0, we don't allocate a full buffer for the channel.
+ // Instead of individual buffer entries, the race detector uses the
+ // c.buf as the only buffer entry. This simplification prevents us from
+ // following the memory model's happens-before rules (rules that are
+ // implemented in racereleaseacquire). Instead, we accumulate happens-before
+ // information in the synchronization object associated with c.buf.
+ if c.elemsize == 0 {
+ if sg == nil {
+ raceacquire(qp)
+ racerelease(qp)
+ } else {
+ raceacquireg(sg.g, qp)
+ racereleaseg(sg.g, qp)
+ }
+ } else {
+ if sg == nil {
+ racereleaseacquire(qp)
+ } else {
+ racereleaseacquireg(sg.g, qp)
+ }
+ }
+}
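
For context, a minimal illustration (not part of this patch) of the buffered-channel ordering that racenotify reports to the race detector per buffer slot: a send of an element happens before the receive of that element completes, so the plain write below is properly synchronized and should not be flagged.

package main

import "fmt"

func main() {
	// Each slot of a buffered channel acts as its own
	// synchronization object for the race detector.
	ch := make(chan int, 2)
	data := 0

	go func() {
		data = 42 // plain write, ordered by the send below
		ch <- 1   // the send happens before the corresponding receive
	}()

	<-ch
	fmt.Println(data) // safe: ordered by the send/receive pair
}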
diff --git a/libgo/go/runtime/closure_test.go b/libgo/go/runtime/closure_test.go
index ea65fbd..741c932 100644
--- a/libgo/go/runtime/closure_test.go
+++ b/libgo/go/runtime/closure_test.go
@@ -1,6 +1,7 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+
package runtime_test
import "testing"
diff --git a/libgo/go/runtime/cpuflags_arm64.go b/libgo/go/runtime/cpuflags_arm64.go
new file mode 100644
index 0000000..7576bef4a
--- /dev/null
+++ b/libgo/go/runtime/cpuflags_arm64.go
@@ -0,0 +1,17 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "internal/cpu"
+)
+
+var arm64UseAlignedLoads bool
+
+func init() {
+ if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsZeus {
+ arm64UseAlignedLoads = true
+ }
+}
diff --git a/libgo/go/runtime/crash_cgo_test.go b/libgo/go/runtime/crash_cgo_test.go
index 64a7c08..b0198ff 100644
--- a/libgo/go/runtime/crash_cgo_test.go
+++ b/libgo/go/runtime/crash_cgo_test.go
@@ -154,7 +154,7 @@ func TestCgoExecSignalMask(t *testing.T) {
case "windows", "plan9":
t.Skipf("skipping signal mask test on %s", runtime.GOOS)
}
- got := runTestProg(t, "testprogcgo", "CgoExecSignalMask")
+ got := runTestProg(t, "testprogcgo", "CgoExecSignalMask", "GOTRACEBACK=system")
want := "OK\n"
if got != want {
t.Errorf("expected %q, got %v", want, got)
@@ -257,6 +257,27 @@ func TestCgoCrashTraceback(t *testing.T) {
}
}
+func TestCgoCrashTracebackGo(t *testing.T) {
+ t.Parallel()
+ switch platform := runtime.GOOS + "/" + runtime.GOARCH; platform {
+ case "darwin/amd64":
+ case "linux/amd64":
+ case "linux/ppc64le":
+ default:
+ t.Skipf("not yet supported on %s", platform)
+ }
+ if runtime.Compiler == "gccgo" {
+ t.Skip("gccgo does not have SetCgoTraceback")
+ }
+ got := runTestProg(t, "testprogcgo", "CrashTracebackGo")
+ for i := 1; i <= 3; i++ {
+ want := fmt.Sprintf("main.h%d", i)
+ if !strings.Contains(got, want) {
+ t.Errorf("missing %s", want)
+ }
+ }
+}
+
func TestCgoTracebackContext(t *testing.T) {
t.Parallel()
if runtime.Compiler == "gccgo" {
diff --git a/libgo/go/runtime/crash_test.go b/libgo/go/runtime/crash_test.go
index aa97cf7..ab6f381 100644
--- a/libgo/go/runtime/crash_test.go
+++ b/libgo/go/runtime/crash_test.go
@@ -9,7 +9,6 @@ import (
"flag"
"fmt"
"internal/testenv"
- "io/ioutil"
"os"
"os/exec"
"path/filepath"
@@ -117,7 +116,7 @@ func buildTestProg(t *testing.T, binary string, flags ...string) (string, error)
testprog.Lock()
defer testprog.Unlock()
if testprog.dir == "" {
- dir, err := ioutil.TempDir("", "go-build")
+ dir, err := os.MkdirTemp("", "go-build")
if err != nil {
t.Fatalf("failed to create temp directory: %v", err)
}
@@ -181,6 +180,9 @@ func TestCrashHandler(t *testing.T) {
}
func testDeadlock(t *testing.T, name string) {
+ // External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
output := runTestProg(t, "testprog", name)
want := "fatal error: all goroutines are asleep - deadlock!\n"
if !strings.HasPrefix(output, want) {
@@ -205,6 +207,9 @@ func TestLockedDeadlock2(t *testing.T) {
}
func TestGoexitDeadlock(t *testing.T) {
+ // External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
output := runTestProg(t, "testprog", "GoexitDeadlock")
want := "no goroutines (main called runtime.Goexit) - deadlock!"
if !strings.Contains(output, want) {
@@ -293,6 +298,9 @@ func TestRecursivePanic4(t *testing.T) {
}
func TestGoexitCrash(t *testing.T) {
+ // External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
output := runTestProg(t, "testprog", "GoexitExit")
want := "no goroutines (main called runtime.Goexit) - deadlock!"
if !strings.Contains(output, want) {
@@ -352,6 +360,9 @@ func TestBreakpoint(t *testing.T) {
}
func TestGoexitInPanic(t *testing.T) {
+ // External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
// see issue 8774: this code used to trigger an infinite recursion
output := runTestProg(t, "testprog", "GoexitInPanic")
want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
@@ -416,6 +427,9 @@ func TestPanicAfterGoexit(t *testing.T) {
}
func TestRecoveredPanicAfterGoexit(t *testing.T) {
+ // External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
output := runTestProg(t, "testprog", "RecoveredPanicAfterGoexit")
want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
if !strings.HasPrefix(output, want) {
@@ -424,6 +438,9 @@ func TestRecoveredPanicAfterGoexit(t *testing.T) {
}
func TestRecoverBeforePanicAfterGoexit(t *testing.T) {
+ // External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
t.Parallel()
output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit")
want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
@@ -433,6 +450,9 @@ func TestRecoverBeforePanicAfterGoexit(t *testing.T) {
}
func TestRecoverBeforePanicAfterGoexit2(t *testing.T) {
+ // External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
t.Parallel()
output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit2")
want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
@@ -701,7 +721,9 @@ func TestTimePprof(t *testing.T) {
if runtime.GOOS == "aix" {
t.Skip("pprof not yet available on AIX (see golang.org/issue/28555)")
}
- fn := runTestProg(t, "testprog", "TimeProf")
+ // Pass GOTRACEBACK for issue #41120 to try to get more
+ // information on timeout.
+ fn := runTestProg(t, "testprog", "TimeProf", "GOTRACEBACK=crash")
fn = strings.TrimSpace(fn)
defer os.Remove(fn)
diff --git a/libgo/go/runtime/crash_unix_test.go b/libgo/go/runtime/crash_unix_test.go
index 80184d9..69aecd5 100644
--- a/libgo/go/runtime/crash_unix_test.go
+++ b/libgo/go/runtime/crash_unix_test.go
@@ -10,7 +10,6 @@ import (
"bytes"
"internal/testenv"
"io"
- "io/ioutil"
"os"
"os/exec"
"path/filepath"
@@ -70,6 +69,10 @@ func TestCrashDumpsAllThreads(t *testing.T) {
t.Skipf("skipping; not supported on %v", runtime.GOOS)
}
+ if runtime.GOOS == "openbsd" && runtime.GOARCH == "mips64" {
+ t.Skipf("skipping; test fails on %s/%s - see issue #42464", runtime.GOOS, runtime.GOARCH)
+ }
+
if runtime.Sigisblocked(int(syscall.SIGQUIT)) {
t.Skip("skipping; SIGQUIT is blocked, see golang.org/issue/19196")
}
@@ -81,13 +84,13 @@ func TestCrashDumpsAllThreads(t *testing.T) {
t.Parallel()
- dir, err := ioutil.TempDir("", "go-build")
+ dir, err := os.MkdirTemp("", "go-build")
if err != nil {
t.Fatalf("failed to create temp directory: %v", err)
}
defer os.RemoveAll(dir)
- if err := ioutil.WriteFile(filepath.Join(dir, "main.go"), []byte(crashDumpsAllThreadsSource), 0666); err != nil {
+ if err := os.WriteFile(filepath.Join(dir, "main.go"), []byte(crashDumpsAllThreadsSource), 0666); err != nil {
t.Fatalf("failed to create Go file: %v", err)
}
@@ -231,20 +234,27 @@ func TestPanicSystemstack(t *testing.T) {
}
defer pr.Close()
- // Wait for "x\nx\n" to indicate readiness.
+ // Wait for "x\nx\n" to indicate almost-readiness.
buf := make([]byte, 4)
_, err = io.ReadFull(pr, buf)
if err != nil || string(buf) != "x\nx\n" {
t.Fatal("subprocess failed; output:\n", string(buf))
}
+ // The child blockers print "x\n" and then block on a lock. Receiving
+ // those bytes only indicates that the child is _about to block_. Since
+ // we don't have a way to know when it is fully blocked, sleep a bit to
+ // make us less likely to lose the race and signal before the child
+ // blocks.
+ time.Sleep(100 * time.Millisecond)
+
// Send SIGQUIT.
if err := cmd.Process.Signal(syscall.SIGQUIT); err != nil {
t.Fatal("signaling subprocess: ", err)
}
// Get traceback.
- tb, err := ioutil.ReadAll(pr)
+ tb, err := io.ReadAll(pr)
if err != nil {
t.Fatal("reading traceback from pipe: ", err)
}
diff --git a/libgo/go/runtime/debug.go b/libgo/go/runtime/debug.go
index ff76580..5fb02cb 100644
--- a/libgo/go/runtime/debug.go
+++ b/libgo/go/runtime/debug.go
@@ -10,9 +10,8 @@ import (
)
// GOMAXPROCS sets the maximum number of CPUs that can be executing
-// simultaneously and returns the previous setting. If n < 1, it does not
-// change the current setting.
-// The number of logical CPUs on the local machine can be queried with NumCPU.
+// simultaneously and returns the previous setting. It defaults to
+// the value of runtime.NumCPU. If n < 1, it does not change the current setting.
// This call will go away when the scheduler improves.
func GOMAXPROCS(n int) int {
if GOARCH == "wasm" && n > 1 {
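
As a usage note (not part of the patch), passing a value below 1 only queries the current setting, which is what the revised doc comment above relies on:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// n < 1 leaves the setting unchanged, so GOMAXPROCS(0) is a pure query.
	cur := runtime.GOMAXPROCS(0)
	fmt.Println("GOMAXPROCS:", cur, "NumCPU:", runtime.NumCPU())
}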
diff --git a/libgo/go/runtime/debug/garbage.go b/libgo/go/runtime/debug/garbage.go
index 785e9d4..00f92c3 100644
--- a/libgo/go/runtime/debug/garbage.go
+++ b/libgo/go/runtime/debug/garbage.go
@@ -106,6 +106,8 @@ func FreeOSMemory() {
// the program crashes.
// SetMaxStack returns the previous setting.
// The initial setting is 1 GB on 64-bit systems, 250 MB on 32-bit systems.
+// There may be a system-imposed maximum stack limit regardless
+// of the value provided to SetMaxStack.
//
// SetMaxStack is useful mainly for limiting the damage done by
// goroutines that enter an infinite recursion. It only limits future
@@ -139,6 +141,11 @@ func SetMaxThreads(threads int) int {
// manipulation of memory may cause faults at non-nil addresses in less
// dramatic situations; SetPanicOnFault allows such programs to request
// that the runtime trigger only a panic, not a crash.
+// The runtime.Error that the runtime panics with may have an additional method:
+// Addr() uintptr
+// If that method exists, it returns the memory address which triggered the fault.
+// The results of Addr are best-effort and the veracity of the result
+// may depend on the platform.
// SetPanicOnFault applies only to the current goroutine.
// It returns the previous setting.
func SetPanicOnFault(enabled bool) bool {
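
A sketch (not from the patch) of how a program might combine SetPanicOnFault with the optional Addr method documented above; the type assertion uses a local interface because the concrete runtime error type is unexported. The nil dereference is only a stand-in: it recovers but carries no address, whereas faults at non-nil addresses (such as the read-only mapping in the new panic_test.go) do.

package main

import (
	"fmt"
	"runtime/debug"
)

// runWithFaultPanic runs f with SetPanicOnFault enabled and reports the
// faulting address when the recovered error provides one.
func runWithFaultPanic(f func()) {
	old := debug.SetPanicOnFault(true)
	defer debug.SetPanicOnFault(old)
	defer func() {
		if r := recover(); r != nil {
			// Addr is optional and best-effort, so probe for it.
			if a, ok := r.(interface{ Addr() uintptr }); ok {
				fmt.Printf("fault at %#x: %v\n", a.Addr(), r)
			} else {
				fmt.Println("recovered:", r)
			}
		}
	}()
	f()
}

func main() {
	var p *int
	runWithFaultPanic(func() { _ = *p })
}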
diff --git a/libgo/go/runtime/debug/heapdump_test.go b/libgo/go/runtime/debug/heapdump_test.go
index de1ec27..768934d 100644
--- a/libgo/go/runtime/debug/heapdump_test.go
+++ b/libgo/go/runtime/debug/heapdump_test.go
@@ -5,7 +5,6 @@
package debug_test
import (
- "io/ioutil"
"os"
"runtime"
. "runtime/debug"
@@ -16,7 +15,7 @@ func TestWriteHeapDumpNonempty(t *testing.T) {
if runtime.GOOS == "js" {
t.Skipf("WriteHeapDump is not available on %s.", runtime.GOOS)
}
- f, err := ioutil.TempFile("", "heapdumptest")
+ f, err := os.CreateTemp("", "heapdumptest")
if err != nil {
t.Fatalf("TempFile failed: %v", err)
}
@@ -45,7 +44,7 @@ func TestWriteHeapDumpFinalizers(t *testing.T) {
if runtime.GOOS == "js" {
t.Skipf("WriteHeapDump is not available on %s.", runtime.GOOS)
}
- f, err := ioutil.TempFile("", "heapdumptest")
+ f, err := os.CreateTemp("", "heapdumptest")
if err != nil {
t.Fatalf("TempFile failed: %v", err)
}
diff --git a/libgo/go/runtime/debug/panic_test.go b/libgo/go/runtime/debug/panic_test.go
new file mode 100644
index 0000000..b67a3de
--- /dev/null
+++ b/libgo/go/runtime/debug/panic_test.go
@@ -0,0 +1,53 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix darwin dragonfly freebsd linux netbsd openbsd
+
+// TODO: test on Windows?
+
+package debug_test
+
+import (
+ "runtime"
+ "runtime/debug"
+ "syscall"
+ "testing"
+ "unsafe"
+)
+
+func TestPanicOnFault(t *testing.T) {
+ if runtime.GOARCH == "s390x" {
+ t.Skip("s390x fault addresses are missing the low order bits")
+ }
+ if runtime.GOOS == "ios" {
+ t.Skip("iOS doesn't provide fault addresses")
+ }
+ m, err := syscall.Mmap(-1, 0, 0x1000, syscall.PROT_READ /* Note: no PROT_WRITE */, syscall.MAP_SHARED|syscall.MAP_ANON)
+ if err != nil {
+ t.Fatalf("can't map anonymous memory: %s", err)
+ }
+ defer syscall.Munmap(m)
+ old := debug.SetPanicOnFault(true)
+ defer debug.SetPanicOnFault(old)
+ const lowBits = 0x3e7
+ defer func() {
+ r := recover()
+ if r == nil {
+ t.Fatalf("write did not fault")
+ }
+ type addressable interface {
+ Addr() uintptr
+ }
+ a, ok := r.(addressable)
+ if !ok {
+ t.Fatalf("fault does not contain address")
+ }
+ want := uintptr(unsafe.Pointer(&m[lowBits]))
+ got := a.Addr()
+ if got != want {
+ t.Fatalf("fault address %x, want %x", got, want)
+ }
+ }()
+ m[lowBits] = 1 // will fault
+}
diff --git a/libgo/go/runtime/debug_test.go b/libgo/go/runtime/debug_test.go
index 967477dd..46261bc 100644
--- a/libgo/go/runtime/debug_test.go
+++ b/libgo/go/runtime/debug_test.go
@@ -18,7 +18,7 @@ package runtime_test
import (
"fmt"
- "io/ioutil"
+ "os"
"regexp"
"runtime"
"runtime/debug"
@@ -96,7 +96,7 @@ func debugCallTKill(tid int) error {
// Linux-specific.
func skipUnderDebugger(t *testing.T) {
pid := syscall.Getpid()
- status, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/status", pid))
+ status, err := os.ReadFile(fmt.Sprintf("/proc/%d/status", pid))
if err != nil {
t.Logf("couldn't get proc tracer: %s", err)
return
diff --git a/libgo/go/runtime/error.go b/libgo/go/runtime/error.go
index d91e2db..8065534 100644
--- a/libgo/go/runtime/error.go
+++ b/libgo/go/runtime/error.go
@@ -118,6 +118,26 @@ func NewErrorCString(s uintptr, ret *interface{}) {
*ret = errorCString{s}
}
+type errorAddressString struct {
+ msg string // error message
+ addr uintptr // memory address where the error occurred
+}
+
+func (e errorAddressString) RuntimeError() {}
+
+func (e errorAddressString) Error() string {
+ return "runtime error: " + e.msg
+}
+
+// Addr returns the memory address where a fault occurred.
+// The address provided is best-effort.
+// The veracity of the result may depend on the platform.
+// Errors providing this method will only be returned as
+// a result of using runtime/debug.SetPanicOnFault.
+func (e errorAddressString) Addr() uintptr {
+ return e.addr
+}
+
// plainError represents a runtime error described a string without
// the prefix "runtime error: " after invoking errorString.Error().
// See Issue #14965.
diff --git a/libgo/go/runtime/export_pipe2_test.go b/libgo/go/runtime/export_pipe2_test.go
new file mode 100644
index 0000000..9d580d331
--- /dev/null
+++ b/libgo/go/runtime/export_pipe2_test.go
@@ -0,0 +1,15 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build freebsd linux netbsd openbsd solaris
+
+package runtime
+
+func Pipe() (r, w int32, errno int32) {
+ r, w, errno = pipe2(0)
+ if errno == _ENOSYS {
+ return pipe()
+ }
+ return r, w, errno
+}
diff --git a/libgo/go/runtime/export_pipe_test.go b/libgo/go/runtime/export_pipe_test.go
new file mode 100644
index 0000000..8f66770
--- /dev/null
+++ b/libgo/go/runtime/export_pipe_test.go
@@ -0,0 +1,9 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix darwin dragonfly
+
+package runtime
+
+var Pipe = pipe
diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go
index 369230a..8dd3050 100644
--- a/libgo/go/runtime/export_test.go
+++ b/libgo/go/runtime/export_test.go
@@ -43,8 +43,6 @@ var PhysHugePageSize = physHugePageSize
var NetpollGenericInit = netpollGenericInit
-var ParseRelease = parseRelease
-
var Memmove = memmove
var MemclrNoHeapPointers = memclrNoHeapPointers
@@ -295,6 +293,32 @@ func (p *ProfBuf) Close() {
(*profBuf)(p).close()
}
+func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int) {
+ stopTheWorld("ReadMetricsSlow")
+
+ // Initialize the metrics beforehand because this could
+ // allocate and skew the stats.
+ semacquire(&metricsSema)
+ initMetrics()
+ semrelease(&metricsSema)
+
+ systemstack(func() {
+ // Read memstats first. It's going to flush
+ // the mcaches which readMetrics does not do, so
+ // going the other way around may result in
+ // inconsistent statistics.
+ readmemstats_m(memStats)
+ })
+
+ // Read metrics off the system stack.
+ //
+ // The only part of readMetrics that could allocate
+ // and skew the stats is initMetrics.
+ readMetrics(samplesp, len, cap)
+
+ startTheWorld()
+}
+
// ReadMemStatsSlow returns both the runtime-computed MemStats and
// MemStats accumulated by scanning the heap.
func ReadMemStatsSlow() (base, slow MemStats) {
@@ -334,20 +358,22 @@ func ReadMemStatsSlow() (base, slow MemStats) {
}
}
- // Add in frees. readmemstats_m flushed the cached stats, so
- // these are up-to-date.
+ // Add in frees by just reading the stats for those directly.
+ var m heapStatsDelta
+ memstats.heapStats.unsafeRead(&m)
+
+ // Collect per-sizeclass free stats.
var smallFree uint64
- slow.Frees = mheap_.nlargefree
- for i := range mheap_.nsmallfree {
- slow.Frees += mheap_.nsmallfree[i]
- bySize[i].Frees = mheap_.nsmallfree[i]
- bySize[i].Mallocs += mheap_.nsmallfree[i]
- smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
- }
- slow.Frees += memstats.tinyallocs
+ for i := 0; i < _NumSizeClasses; i++ {
+ slow.Frees += uint64(m.smallFreeCount[i])
+ bySize[i].Frees += uint64(m.smallFreeCount[i])
+ bySize[i].Mallocs += uint64(m.smallFreeCount[i])
+ smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i])
+ }
+ slow.Frees += memstats.tinyallocs + uint64(m.largeFreeCount)
slow.Mallocs += slow.Frees
- slow.TotalAlloc = slow.Alloc + mheap_.largefree + smallFree
+ slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree
for i := range slow.BySize {
slow.BySize[i].Mallocs = bySize[i].Mallocs
@@ -711,7 +737,16 @@ func (c *PageCache) Alloc(npages uintptr) (uintptr, uintptr) {
return (*pageCache)(c).alloc(npages)
}
func (c *PageCache) Flush(s *PageAlloc) {
- (*pageCache)(c).flush((*pageAlloc)(s))
+ cp := (*pageCache)(c)
+ sp := (*pageAlloc)(s)
+
+ systemstack(func() {
+ // None of the tests need any higher-level locking, so we just
+ // take the lock internally.
+ lock(sp.mheapLock)
+ cp.flush(sp)
+ unlock(sp.mheapLock)
+ })
}
// Expose chunk index type.
@@ -722,13 +757,41 @@ type ChunkIdx chunkIdx
type PageAlloc pageAlloc
func (p *PageAlloc) Alloc(npages uintptr) (uintptr, uintptr) {
- return (*pageAlloc)(p).alloc(npages)
+ pp := (*pageAlloc)(p)
+
+ var addr, scav uintptr
+ systemstack(func() {
+ // None of the tests need any higher-level locking, so we just
+ // take the lock internally.
+ lock(pp.mheapLock)
+ addr, scav = pp.alloc(npages)
+ unlock(pp.mheapLock)
+ })
+ return addr, scav
}
func (p *PageAlloc) AllocToCache() PageCache {
- return PageCache((*pageAlloc)(p).allocToCache())
+ pp := (*pageAlloc)(p)
+
+ var c PageCache
+ systemstack(func() {
+ // None of the tests need any higher-level locking, so we just
+ // take the lock internally.
+ lock(pp.mheapLock)
+ c = PageCache(pp.allocToCache())
+ unlock(pp.mheapLock)
+ })
+ return c
}
func (p *PageAlloc) Free(base, npages uintptr) {
- (*pageAlloc)(p).free(base, npages)
+ pp := (*pageAlloc)(p)
+
+ systemstack(func() {
+ // None of the tests need any higher-level locking, so we just
+ // take the lock internally.
+ lock(pp.mheapLock)
+ pp.free(base, npages)
+ unlock(pp.mheapLock)
+ })
}
func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end)
@@ -736,6 +799,8 @@ func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) {
pp := (*pageAlloc)(p)
systemstack(func() {
+ // None of the tests need any higher-level locking, so we just
+ // take the lock internally.
lock(pp.mheapLock)
r = pp.scavenge(nbytes, mayUnlock)
unlock(pp.mheapLock)
@@ -745,10 +810,7 @@ func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) {
func (p *PageAlloc) InUse() []AddrRange {
ranges := make([]AddrRange, 0, len(p.inUse.ranges))
for _, r := range p.inUse.ranges {
- ranges = append(ranges, AddrRange{
- Base: r.base.addr(),
- Limit: r.limit.addr(),
- })
+ ranges = append(ranges, AddrRange{r})
}
return ranges
}
@@ -759,10 +821,111 @@ func (p *PageAlloc) PallocData(i ChunkIdx) *PallocData {
return (*PallocData)((*pageAlloc)(p).tryChunkOf(ci))
}
-// AddrRange represents a range over addresses.
-// Specifically, it represents the range [Base, Limit).
+// AddrRange is a wrapper around addrRange for testing.
type AddrRange struct {
- Base, Limit uintptr
+ addrRange
+}
+
+// MakeAddrRange creates a new address range.
+func MakeAddrRange(base, limit uintptr) AddrRange {
+ return AddrRange{makeAddrRange(base, limit)}
+}
+
+// Base returns the virtual base address of the address range.
+func (a AddrRange) Base() uintptr {
+ return a.addrRange.base.addr()
+}
+
+// Limit returns the virtual address of the limit of the address range.
+func (a AddrRange) Limit() uintptr {
+ return a.addrRange.limit.addr()
+}
+
+// Equals returns true if the two address ranges are exactly equal.
+func (a AddrRange) Equals(b AddrRange) bool {
+ return a == b
+}
+
+// Size returns the size in bytes of the address range.
+func (a AddrRange) Size() uintptr {
+ return a.addrRange.size()
+}
+
+// AddrRanges is a wrapper around addrRanges for testing.
+type AddrRanges struct {
+ addrRanges
+ mutable bool
+}
+
+// NewAddrRanges creates a new empty addrRanges.
+//
+// Note that this initializes addrRanges just like in the
+// runtime, so its memory is persistentalloc'd. Call this
+// function sparingly since the memory it allocates is
+// leaked.
+//
+// This AddrRanges is mutable, so we can test methods like
+// Add.
+func NewAddrRanges() AddrRanges {
+ r := addrRanges{}
+ r.init(new(sysMemStat))
+ return AddrRanges{r, true}
+}
+
+// MakeAddrRanges creates a new addrRanges populated with
+// the ranges in a.
+//
+// The returned AddrRanges is immutable, so methods like
+// Add will fail.
+func MakeAddrRanges(a ...AddrRange) AddrRanges {
+ // Methods that manipulate the backing store of addrRanges.ranges should
+ // not be used on the result from this function (e.g. add) since they may
+ // trigger reallocation. That would normally be fine, except the new
+ // backing store won't come from the heap, but from persistentalloc, so
+ // we'll leak some memory implicitly.
+ ranges := make([]addrRange, 0, len(a))
+ total := uintptr(0)
+ for _, r := range a {
+ ranges = append(ranges, r.addrRange)
+ total += r.Size()
+ }
+ return AddrRanges{addrRanges{
+ ranges: ranges,
+ totalBytes: total,
+ sysStat: new(sysMemStat),
+ }, false}
+}
+
+// Ranges returns a copy of the ranges described by the
+// addrRanges.
+func (a *AddrRanges) Ranges() []AddrRange {
+ result := make([]AddrRange, 0, len(a.addrRanges.ranges))
+ for _, r := range a.addrRanges.ranges {
+ result = append(result, AddrRange{r})
+ }
+ return result
+}
+
+// FindSucc returns the successor to base. See addrRanges.findSucc
+// for more details.
+func (a *AddrRanges) FindSucc(base uintptr) int {
+ return a.findSucc(base)
+}
+
+// Add adds a new AddrRange to the AddrRanges.
+//
+// The AddrRange must be mutable (i.e. created by NewAddrRanges),
+// otherwise this method will throw.
+func (a *AddrRanges) Add(r AddrRange) {
+ if !a.mutable {
+ throw("attempt to mutate immutable AddrRanges")
+ }
+ a.add(r.addrRange)
+}
+
+// TotalBytes returns the totalBytes field of the addrRanges.
+func (a *AddrRanges) TotalBytes() uintptr {
+ return a.addrRanges.totalBytes
}
// BitRange represents a range over a bitmap.
@@ -796,7 +959,11 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
addr := chunkBase(chunkIdx(i))
// Mark the chunk's existence in the pageAlloc.
- p.grow(addr, pallocChunkBytes)
+ systemstack(func() {
+ lock(p.mheapLock)
+ p.grow(addr, pallocChunkBytes)
+ unlock(p.mheapLock)
+ })
// Initialize the bitmap and update pageAlloc metadata.
chunk := p.chunkOf(chunkIndex(addr))
@@ -827,13 +994,19 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
}
// Update heap metadata for the allocRange calls above.
- p.update(addr, pallocChunkPages, false, false)
+ systemstack(func() {
+ lock(p.mheapLock)
+ p.update(addr, pallocChunkPages, false, false)
+ unlock(p.mheapLock)
+ })
}
+
systemstack(func() {
lock(p.mheapLock)
p.scavengeStartGen()
unlock(p.mheapLock)
})
+
return (*PageAlloc)(p)
}
@@ -979,11 +1152,61 @@ func MapHashCheck(m interface{}, k interface{}) (uintptr, uintptr) {
return x, y
}
-func MSpanCountAlloc(bits []byte) int {
- s := (*mspan)(mheap_.spanalloc.alloc())
+// mspan wrapper for testing.
+//go:notinheap
+type MSpan mspan
+
+// Allocate an mspan for testing.
+func AllocMSpan() *MSpan {
+ var s *mspan
+ systemstack(func() {
+ lock(&mheap_.lock)
+ s = (*mspan)(mheap_.spanalloc.alloc())
+ unlock(&mheap_.lock)
+ })
+ return (*MSpan)(s)
+}
+
+// Free an allocated mspan.
+func FreeMSpan(s *MSpan) {
+ systemstack(func() {
+ lock(&mheap_.lock)
+ mheap_.spanalloc.free(unsafe.Pointer(s))
+ unlock(&mheap_.lock)
+ })
+}
+
+func MSpanCountAlloc(ms *MSpan, bits []byte) int {
+ s := (*mspan)(ms)
s.nelems = uintptr(len(bits) * 8)
s.gcmarkBits = (*gcBits)(unsafe.Pointer(&bits[0]))
- return s.countAlloc()
+ result := s.countAlloc()
+ s.gcmarkBits = nil
+ return result
+}
+
+const (
+ TimeHistSubBucketBits = timeHistSubBucketBits
+ TimeHistNumSubBuckets = timeHistNumSubBuckets
+ TimeHistNumSuperBuckets = timeHistNumSuperBuckets
+)
+
+type TimeHistogram timeHistogram
+
+// Count returns the count for the given bucket, subBucket indices.
+// Returns true if the bucket was valid, otherwise returns the count
+// for the overflow bucket and false.
+func (th *TimeHistogram) Count(bucket, subBucket uint) (uint64, bool) {
+ t := (*timeHistogram)(th)
+ i := bucket*TimeHistNumSubBuckets + subBucket
+ if i >= uint(len(t.counts)) {
+ return t.overflow, false
+ }
+ return t.counts[i], true
+}
+
+func (th *TimeHistogram) Record(duration int64) {
+ (*timeHistogram)(th).record(duration)
}
var Pusestackmaps = &usestackmaps
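
A hedged sketch of how a runtime test could exercise the AddrRange/AddrRanges wrappers added above. It only compiles inside the runtime's own test suite (export_test.go is test-only), and the test name here is made up:

package runtime_test

import (
	"runtime"
	"testing"
)

func TestAddrRangesSketch(t *testing.T) {
	a := runtime.MakeAddrRange(0x1000, 0x2000)
	if a.Size() != 0x1000 {
		t.Fatalf("Size = %#x, want %#x", a.Size(), 0x1000)
	}

	// MakeAddrRanges builds an immutable set; use NewAddrRanges when
	// the test needs to call Add.
	rs := runtime.MakeAddrRanges(a, runtime.MakeAddrRange(0x3000, 0x4000))
	if got := rs.TotalBytes(); got != 0x2000 {
		t.Fatalf("TotalBytes = %#x, want %#x", got, 0x2000)
	}
	if got := len(rs.Ranges()); got != 2 {
		t.Fatalf("len(Ranges()) = %d, want 2", got)
	}
}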
diff --git a/libgo/go/runtime/export_unix_test.go b/libgo/go/runtime/export_unix_test.go
index 9d0f0d8..801786d 100644
--- a/libgo/go/runtime/export_unix_test.go
+++ b/libgo/go/runtime/export_unix_test.go
@@ -9,7 +9,6 @@ package runtime
import "unsafe"
var NonblockingPipe = nonblockingPipe
-var Pipe = pipe
var SetNonblock = setNonblock
var Closeonexec = closeonexec
diff --git a/libgo/go/runtime/extern.go b/libgo/go/runtime/extern.go
index a85fbb0..8d10c85 100644
--- a/libgo/go/runtime/extern.go
+++ b/libgo/go/runtime/extern.go
@@ -78,10 +78,23 @@ It is a comma-separated list of name=val pairs setting these named variables:
If the line ends with "(forced)", this GC was forced by a
runtime.GC() call.
- madvdontneed: setting madvdontneed=1 will use MADV_DONTNEED
- instead of MADV_FREE on Linux when returning memory to the
- kernel. This is less efficient, but causes RSS numbers to drop
- more quickly.
+ inittrace: setting inittrace=1 causes the runtime to emit a single line to standard
+ error for each package with init work, summarizing the execution time and memory
+ allocation. No information is printed for inits executed as part of plugin loading
+ and for packages without both user defined and compiler generated init work.
+ The format of this line is subject to change. Currently, it is:
+ init # @#ms, # ms clock, # bytes, # allocs
+ where the fields are as follows:
+ init # the package name
+ @# ms time in milliseconds when the init started since program start
+ # clock wall-clock time for package initialization work
+ # bytes memory allocated on the heap
+ # allocs number of heap allocations
+
+ madvdontneed: setting madvdontneed=0 will use MADV_FREE
+ instead of MADV_DONTNEED on Linux when returning memory to the
+ kernel. This is more efficient, but means RSS numbers will
+ drop only when the OS is under memory pressure.
memprofilerate: setting memprofilerate=X will update the value of runtime.MemProfileRate.
When set to 0 memory profiling is disabled. Refer to the description of
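
To illustrate the inittrace output described above (the timings and byte counts in the comment are made up, not real measurements):

// Run with:  GODEBUG=inittrace=1 go run main.go
// One line per package with init work is written to standard error,
// for example:
//   init main @0.11 ms, 0.05 ms clock, 524288 bytes, 2 allocs
package main

import "fmt"

// Package-level initialization allocates on the heap, which shows up
// in the "# bytes" and "# allocs" fields of the inittrace line.
var table = buildTable()

func buildTable() []int {
	t := make([]int, 1<<16)
	for i := range t {
		t[i] = i
	}
	return t
}

func init() {
	// User-defined init work is timed as well.
	_ = fmt.Sprintf("%d entries", len(table))
}

func main() {}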
diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go
index f545e4b..cde2e14 100644
--- a/libgo/go/runtime/gc_test.go
+++ b/libgo/go/runtime/gc_test.go
@@ -524,7 +524,7 @@ func BenchmarkReadMemStats(b *testing.B) {
hugeSink = nil
}
-func BenchmarkReadMemStatsLatency(b *testing.B) {
+func applyGCLoad(b *testing.B) func() {
// We'll apply load to the runtime with maxProcs-1 goroutines
// and use one more to actually benchmark. It doesn't make sense
// to try to run this test with only 1 P (that's what
@@ -569,6 +569,14 @@ func BenchmarkReadMemStatsLatency(b *testing.B) {
runtime.KeepAlive(hold)
}()
}
+ return func() {
+ close(done)
+ wg.Wait()
+ }
+}
+
+func BenchmarkReadMemStatsLatency(b *testing.B) {
+ stop := applyGCLoad(b)
// Spend this much time measuring latencies.
latencies := make([]time.Duration, 0, 1024)
@@ -585,12 +593,11 @@ func BenchmarkReadMemStatsLatency(b *testing.B) {
runtime.ReadMemStats(&ms)
latencies = append(latencies, time.Now().Sub(start))
}
- close(done)
- // Make sure to stop the timer before we wait! The goroutines above
- // are very heavy-weight and not easy to stop, so we could end up
+ // Make sure to stop the timer before we wait! The load created above
+ // is very heavy-weight and not easy to stop, so we could end up
// confusing the benchmarking framework for small b.N.
b.StopTimer()
- wg.Wait()
+ stop()
// Disable the default */op metrics.
// ns/op doesn't mean anything because it's an average, but we
@@ -769,6 +776,10 @@ func BenchmarkScanStackNoLocals(b *testing.B) {
}
func BenchmarkMSpanCountAlloc(b *testing.B) {
+ // Allocate one dummy mspan for the whole benchmark.
+ s := runtime.AllocMSpan()
+ defer runtime.FreeMSpan(s)
+
// n is the number of bytes to benchmark against.
// n must always be a multiple of 8, since gcBits is
// always rounded up 8 bytes.
@@ -780,7 +791,7 @@ func BenchmarkMSpanCountAlloc(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
- runtime.MSpanCountAlloc(bits)
+ runtime.MSpanCountAlloc(s, bits)
}
})
}
diff --git a/libgo/go/runtime/gcinfo_test.go b/libgo/go/runtime/gcinfo_test.go
index c26f411..d329e32 100644
--- a/libgo/go/runtime/gcinfo_test.go
+++ b/libgo/go/runtime/gcinfo_test.go
@@ -63,7 +63,7 @@ func TestGCInfo(t *testing.T) {
}
for i := 0; i < 10; i++ {
- verifyGCInfo(t, "heap Ptr", escape(new(Ptr)), trimDead(padDead(infoPtr)))
+ verifyGCInfo(t, "heap Ptr", escape(new(Ptr)), trimDead(infoPtr))
verifyGCInfo(t, "heap PtrSlice", escape(&make([]*byte, 10)[0]), trimDead(infoPtr10))
verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), trimDead(infoScalarPtr))
verifyGCInfo(t, "heap ScalarPtrSlice", escape(&make([]ScalarPtr, 4)[0]), trimDead(infoScalarPtr4))
@@ -83,25 +83,10 @@ func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) {
}
}
-func padDead(mask []byte) []byte {
- // Because the dead bit isn't encoded in the second word,
- // and because on 32-bit systems a one-word allocation
- // uses a two-word block, the pointer info for a one-word
- // object needs to be expanded to include an extra scalar
- // on 32-bit systems to match the heap bitmap.
- if runtime.PtrSize == 4 && len(mask) == 1 {
- return []byte{mask[0], 0}
- }
- return mask
-}
-
func trimDead(mask []byte) []byte {
- for len(mask) > 2 && mask[len(mask)-1] == typeScalar {
+ for len(mask) > 0 && mask[len(mask)-1] == typeScalar {
mask = mask[:len(mask)-1]
}
- if len(mask) == 2 && mask[0] == typeScalar && mask[1] == typeScalar {
- mask = mask[:0]
- }
return mask
}
diff --git a/libgo/go/runtime/heapdump.go b/libgo/go/runtime/heapdump.go
index 816d93c..f3f7382 100644
--- a/libgo/go/runtime/heapdump.go
+++ b/libgo/go/runtime/heapdump.go
@@ -20,8 +20,19 @@ import (
func runtime_debug_WriteHeapDump(fd uintptr) {
stopTheWorld("write heap dump")
+ // Keep m on this G's stack instead of the system stack.
+ // Both readmemstats_m and writeheapdump_m have pretty large
+ // peak stack depths and we risk blowing the system stack.
+ // This is safe because the world is stopped, so we don't
+ // need to worry about anyone shrinking and therefore moving
+ // our stack.
+ var m MemStats
systemstack(func() {
- writeheapdump_m(fd)
+ // Call readmemstats_m here instead of deeper in
+ // writeheapdump_m because we might blow the system stack
+ // otherwise.
+ readmemstats_m(&m)
+ writeheapdump_m(fd, &m)
})
startTheWorld()
@@ -311,7 +322,10 @@ func finq_callback(fn *funcval, obj unsafe.Pointer, ft *functype, ot *ptrtype) {
}
func dumproots() {
- // MSpan.types
+ // To protect mheap_.allspans.
+ assertWorldStopped()
+
+ // mspan.types
for _, s := range mheap_.allspans {
if s.state.get() == mSpanInUse {
// Finalizers
@@ -335,6 +349,9 @@ func dumproots() {
var freemark [_PageSize / 8]bool
func dumpobjs() {
+ // To protect mheap_.allspans.
+ assertWorldStopped()
+
for _, s := range mheap_.allspans {
if s.state.get() != mSpanInUse {
continue
@@ -405,36 +422,42 @@ func dumpms() {
}
}
-func dumpmemstats() {
+//go:systemstack
+func dumpmemstats(m *MemStats) {
+ assertWorldStopped()
+
+ // These ints should be identical to the exported
+ // MemStats structure and should be ordered the same
+ // way too.
dumpint(tagMemStats)
- dumpint(memstats.alloc)
- dumpint(memstats.total_alloc)
- dumpint(memstats.sys)
- dumpint(memstats.nlookup)
- dumpint(memstats.nmalloc)
- dumpint(memstats.nfree)
- dumpint(memstats.heap_alloc)
- dumpint(memstats.heap_sys)
- dumpint(memstats.heap_idle)
- dumpint(memstats.heap_inuse)
- dumpint(memstats.heap_released)
- dumpint(memstats.heap_objects)
- dumpint(memstats.stacks_inuse)
- dumpint(memstats.stacks_sys)
- dumpint(memstats.mspan_inuse)
- dumpint(memstats.mspan_sys)
- dumpint(memstats.mcache_inuse)
- dumpint(memstats.mcache_sys)
- dumpint(memstats.buckhash_sys)
- dumpint(memstats.gc_sys)
- dumpint(memstats.other_sys)
- dumpint(memstats.next_gc)
- dumpint(memstats.last_gc_unix)
- dumpint(memstats.pause_total_ns)
+ dumpint(m.Alloc)
+ dumpint(m.TotalAlloc)
+ dumpint(m.Sys)
+ dumpint(m.Lookups)
+ dumpint(m.Mallocs)
+ dumpint(m.Frees)
+ dumpint(m.HeapAlloc)
+ dumpint(m.HeapSys)
+ dumpint(m.HeapIdle)
+ dumpint(m.HeapInuse)
+ dumpint(m.HeapReleased)
+ dumpint(m.HeapObjects)
+ dumpint(m.StackInuse)
+ dumpint(m.StackSys)
+ dumpint(m.MSpanInuse)
+ dumpint(m.MSpanSys)
+ dumpint(m.MCacheInuse)
+ dumpint(m.MCacheSys)
+ dumpint(m.BuckHashSys)
+ dumpint(m.GCSys)
+ dumpint(m.OtherSys)
+ dumpint(m.NextGC)
+ dumpint(m.LastGC)
+ dumpint(m.PauseTotalNs)
for i := 0; i < 256; i++ {
- dumpint(memstats.pause_ns[i])
+ dumpint(m.PauseNs[i])
}
- dumpint(uint64(memstats.numgc))
+ dumpint(uint64(m.NumGC))
}
func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, frees uintptr) {
@@ -481,6 +504,9 @@ func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs,
}
func dumpmemprof() {
+ // To protect mheap_.allspans.
+ assertWorldStopped()
+
iterate_memprof(dumpmemprof_callback)
for _, s := range mheap_.allspans {
if s.state.get() != mSpanInUse {
@@ -501,7 +527,9 @@ func dumpmemprof() {
var dumphdr = []byte("go1.7 heap dump\n")
-func mdump() {
+func mdump(m *MemStats) {
+ assertWorldStopped()
+
// make sure we're done sweeping
for _, s := range mheap_.allspans {
if s.state.get() == mSpanInUse {
@@ -515,13 +543,15 @@ func mdump() {
dumpgs()
dumpms()
dumproots()
- dumpmemstats()
+ dumpmemstats(m)
dumpmemprof()
dumpint(tagEOF)
flush()
}
-func writeheapdump_m(fd uintptr) {
+func writeheapdump_m(fd uintptr, m *MemStats) {
+ assertWorldStopped()
+
_g_ := getg()
casgstatus(_g_.m.curg, _Grunning, _Gwaiting)
_g_.waitreason = waitReasonDumpingHeap
@@ -535,7 +565,7 @@ func writeheapdump_m(fd uintptr) {
dumpfd = fd
// Call dump routine.
- mdump()
+ mdump(m)
// Reset dump file.
dumpfd = 0
@@ -574,7 +604,7 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector {
i := uintptr(0)
hbits := heapBitsForAddr(p)
for ; i < nptr; i++ {
- if i != 1 && !hbits.morePointers() {
+ if !hbits.morePointers() {
break // end of object
}
if hbits.isPointer() {
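
For reference (not part of the patch), a typical call site for WriteHeapDump, mirroring the existing heapdump tests; the dump is the "go1.7 heap dump" stream produced by mdump above:

package main

import (
	"log"
	"os"
	"runtime/debug"
)

func main() {
	f, err := os.CreateTemp("", "heapdump")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// WriteHeapDump stops the world, reads MemStats on the goroutine's
	// own stack, and writes the dump to the given file descriptor.
	debug.WriteHeapDump(f.Fd())
	log.Printf("heap dump written to %s", f.Name())
}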
diff --git a/libgo/go/runtime/histogram.go b/libgo/go/runtime/histogram.go
new file mode 100644
index 0000000..4020969
--- /dev/null
+++ b/libgo/go/runtime/histogram.go
@@ -0,0 +1,148 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "runtime/internal/atomic"
+ "runtime/internal/sys"
+)
+
+const (
+ // For the time histogram type, we use an HDR histogram.
+ // Values are placed in super-buckets based solely on the most
+ // significant set bit. Thus, super-buckets are power-of-2 sized.
+ // Values are then placed into sub-buckets based on the value of
+ // the next timeHistSubBucketBits most significant bits. Thus,
+ // sub-buckets are linear within a super-bucket.
+ //
+ // Therefore, the number of sub-buckets (timeHistNumSubBuckets)
+ // defines the error. This error may be computed as
+ // 1/timeHistNumSubBuckets*100%. For example, for 16 sub-buckets
+ // per super-bucket the error is approximately 6%.
+ //
+ // The number of super-buckets (timeHistNumSuperBuckets), on the
+ // other hand, defines the range. To reserve room for sub-buckets,
+ // bit timeHistSubBucketBits is the first bit considered for
+ // super-buckets, so super-bucket indices are adjusted accordingly.
+ //
+ // As an example, consider 45 super-buckets with 16 sub-buckets.
+ //
+ // 00110
+ // ^----
+ // │ ^
+ // │ └---- Lowest 4 bits -> sub-bucket 6
+ // └------- Bit 4 unset -> super-bucket 0
+ //
+ // 10110
+ // ^----
+ // │ ^
+ // │ └---- Next 4 bits -> sub-bucket 6
+ // └------- Bit 4 set -> super-bucket 1
+ // 100010
+ // ^----^
+ // │ ^ └-- Lower bits ignored
+ // │ └---- Next 4 bits -> sub-bucket 1
+ // └------- Bit 5 set -> super-bucket 2
+ //
+ // Following this pattern, bucket 45 will have the bit 48 set. We don't
+ // have any buckets for higher values, so the highest sub-bucket will
+ // contain values of 2^48-1 nanoseconds or approx. 3 days. This range is
+ // more than enough to handle durations produced by the runtime.
+ timeHistSubBucketBits = 4
+ timeHistNumSubBuckets = 1 << timeHistSubBucketBits
+ timeHistNumSuperBuckets = 45
+ timeHistTotalBuckets = timeHistNumSuperBuckets*timeHistNumSubBuckets + 1
+)
+
+// timeHistogram represents a distribution of durations in
+// nanoseconds.
+//
+// The accuracy and range of the histogram is defined by the
+// timeHistSubBucketBits and timeHistNumSuperBuckets constants.
+//
+// It is an HDR histogram with exponentially-distributed
+// buckets and linearly distributed sub-buckets.
+//
+// Counts in the histogram are updated atomically, so it is safe
+// for concurrent use. It is also safe to read all the values
+// atomically.
+type timeHistogram struct {
+ counts [timeHistNumSuperBuckets * timeHistNumSubBuckets]uint64
+ overflow uint64
+}
+
+// record adds the given duration to the distribution.
+//
+// Although the duration is an int64 to facilitate ease-of-use
+// with e.g. nanotime, the duration must be non-negative.
+func (h *timeHistogram) record(duration int64) {
+ if duration < 0 {
+ throw("timeHistogram encountered negative duration")
+ }
+ // The index of the exponential bucket is just the index
+ // of the highest set bit adjusted for how many bits we
+ // use for the subbucket. Note that it's timeHistSubBucketBits-1
+ // because we use the 0th bucket to hold values < timeHistNumSubBuckets.
+ var superBucket, subBucket uint
+ if duration >= timeHistNumSubBuckets {
+ // At this point, we know the duration value will always be
+ // at least timeHistSubBucketBits long.
+ superBucket = uint(sys.Len64(uint64(duration))) - timeHistSubBucketBits
+ if superBucket*timeHistNumSubBuckets >= uint(len(h.counts)) {
+ // The bucket index we got is larger than what we support, so
+ // add into the special overflow bucket.
+ atomic.Xadd64(&h.overflow, 1)
+ return
+ }
+ // The linear subbucket index is just the timeHistSubBucketBits
+ // bits after the top bit. To extract that value, shift down
+ // the duration such that we leave the top bit and the next bits
+ // intact, then extract the index.
+ subBucket = uint((duration >> (superBucket - 1)) % timeHistNumSubBuckets)
+ } else {
+ subBucket = uint(duration)
+ }
+ atomic.Xadd64(&h.counts[superBucket*timeHistNumSubBuckets+subBucket], 1)
+}
+
+// timeHistogramMetricsBuckets generates a slice of boundaries for
+// the timeHistogram. These boundaries are represented in seconds,
+// not nanoseconds like the timeHistogram represents durations.
+func timeHistogramMetricsBuckets() []float64 {
+ b := make([]float64, timeHistTotalBuckets-1)
+ for i := 0; i < timeHistNumSuperBuckets; i++ {
+ superBucketMin := uint64(0)
+ // The (inclusive) minimum for the first bucket is 0.
+ if i > 0 {
+ // The minimum for the second bucket will be
+ // 1 << timeHistSubBucketBits, indicating that all
+ // sub-buckets are represented by the next timeHistSubBucketBits
+ // bits.
+ // Thereafter, we shift up by 1 each time, so we can represent
+ // this pattern as (i-1)+timeHistSubBucketBits.
+ superBucketMin = uint64(1) << uint(i-1+timeHistSubBucketBits)
+ }
+ // subBucketShift is the amount that we need to shift the sub-bucket
+ // index to combine it with the bucketMin.
+ subBucketShift := uint(0)
+ if i > 1 {
+ // The first two buckets are exact with respect to integers,
+ // so we'll never have to shift the sub-bucket index. Thereafter,
+ // we shift up by 1 with each subsequent bucket.
+ subBucketShift = uint(i - 2)
+ }
+ for j := 0; j < timeHistNumSubBuckets; j++ {
+ // j is the sub-bucket index. By shifting the index into position to
+ // combine with the bucket minimum, we obtain the minimum value for that
+ // sub-bucket.
+ subBucketMin := superBucketMin + (uint64(j) << subBucketShift)
+
+ // Convert the subBucketMin which is in nanoseconds to a float64 seconds value.
+ // These values will all be exactly representable by a float64.
+ b[i*timeHistNumSubBuckets+j] = float64(subBucketMin) / 1e9
+ }
+ }
+ return b
+}
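
To make the bucketing concrete, a standalone sketch (not the runtime code) of the same super-bucket/sub-bucket computation that record performs, using math/bits.Len64 in place of the runtime's sys.Len64; the sample values reproduce the worked examples in the comment above:

package main

import (
	"fmt"
	"math/bits"
)

const (
	timeHistSubBucketBits   = 4
	timeHistNumSubBuckets   = 1 << timeHistSubBucketBits
	timeHistNumSuperBuckets = 45
)

// bucketsOf mirrors timeHistogram.record for non-negative durations:
// the super-bucket comes from the position of the highest set bit, the
// sub-bucket from the next timeHistSubBucketBits bits below it.
func bucketsOf(duration int64) (super, sub uint, overflow bool) {
	if duration < timeHistNumSubBuckets {
		return 0, uint(duration), false
	}
	super = uint(bits.Len64(uint64(duration))) - timeHistSubBucketBits
	if super >= timeHistNumSuperBuckets {
		return 0, 0, true
	}
	sub = uint((duration >> (super - 1)) % timeHistNumSubBuckets)
	return super, sub, false
}

func main() {
	// 6 (00110), 22 (10110) and 34 (100010) match the comment's examples.
	for _, d := range []int64{6, 22, 34, 1e9} {
		s, b, of := bucketsOf(d)
		fmt.Printf("%10d ns -> super-bucket %2d, sub-bucket %2d, overflow %v\n", d, s, b, of)
	}
}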
diff --git a/libgo/go/runtime/histogram_test.go b/libgo/go/runtime/histogram_test.go
new file mode 100644
index 0000000..5f5b28f
--- /dev/null
+++ b/libgo/go/runtime/histogram_test.go
@@ -0,0 +1,58 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ . "runtime"
+ "testing"
+)
+
+var dummyTimeHistogram TimeHistogram
+
+func TestTimeHistogram(t *testing.T) {
+ // We need to use a global dummy because this
+ // could get stack-allocated with a non-8-byte alignment.
+ // The result of this bad alignment is a segfault on
+ // 32-bit platforms when calling Record.
+ h := &dummyTimeHistogram
+
+ // Record exactly one sample in each bucket.
+ for i := 0; i < TimeHistNumSuperBuckets; i++ {
+ var base int64
+ if i > 0 {
+ base = int64(1) << (i + TimeHistSubBucketBits - 1)
+ }
+ for j := 0; j < TimeHistNumSubBuckets; j++ {
+ v := int64(j)
+ if i > 0 {
+ v <<= i - 1
+ }
+ h.Record(base + v)
+ }
+ }
+ // Hit the overflow bucket.
+ h.Record(int64(^uint64(0) >> 1))
+
+ // Check to make sure there's exactly one count in each
+ // bucket.
+ for i := uint(0); i < TimeHistNumSuperBuckets; i++ {
+ for j := uint(0); j < TimeHistNumSubBuckets; j++ {
+ c, ok := h.Count(i, j)
+ if !ok {
+ t.Errorf("hit overflow bucket unexpectedly: (%d, %d)", i, j)
+ } else if c != 1 {
+ t.Errorf("bucket (%d, %d) has count that is not 1: %d", i, j, c)
+ }
+ }
+ }
+ c, ok := h.Count(TimeHistNumSuperBuckets, 0)
+ if ok {
+ t.Errorf("expected to hit overflow bucket: (%d, %d)", TimeHistNumSuperBuckets, 0)
+ }
+ if c != 1 {
+ t.Errorf("overflow bucket has count that is not 1: %d", c)
+ }
+ dummyTimeHistogram = TimeHistogram{}
+}
diff --git a/libgo/go/runtime/internal/atomic/atomic.c b/libgo/go/runtime/internal/atomic/atomic.c
index 9fed1a8..569e56e 100644
--- a/libgo/go/runtime/internal/atomic/atomic.c
+++ b/libgo/go/runtime/internal/atomic/atomic.c
@@ -6,6 +6,10 @@
#include "runtime.h"
+extern void panicUnaligned(void)
+ __asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.panicUnaligned")
+ __attribute__ ((noreturn));
+
uint32_t Load (uint32_t *ptr)
__asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.Load")
__attribute__ ((no_split_stack));
@@ -44,7 +48,7 @@ uint64_t
Load64 (uint64_t *ptr)
{
if (((uintptr_t) ptr & 7) != 0)
- panicmem ();
+ panicUnaligned ();
return __atomic_load_n (ptr, __ATOMIC_SEQ_CST);
}
@@ -58,6 +62,28 @@ LoadAcq (uint32_t *ptr)
return __atomic_load_n (ptr, __ATOMIC_ACQUIRE);
}
+uint64_t LoadAcq64 (uint64_t *ptr)
+ __asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.LoadAcq64")
+ __attribute__ ((no_split_stack));
+
+uint64_t
+LoadAcq64 (uint64_t *ptr)
+{
+ if (((uintptr_t) ptr & 7) != 0)
+ panicUnaligned ();
+ return __atomic_load_n (ptr, __ATOMIC_ACQUIRE);
+}
+
+uintptr_t LoadAcquintptr (uintptr_t *ptr)
+ __asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.LoadAcquintptr")
+ __attribute__ ((no_split_stack));
+
+uintptr_t
+LoadAcquintptr (uintptr_t *ptr)
+{
+ return __atomic_load_n (ptr, __ATOMIC_ACQUIRE);
+}
+
uintptr_t Loaduintptr (uintptr_t *ptr)
__asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.Loaduintptr")
__attribute__ ((no_split_stack));
@@ -86,7 +112,7 @@ int64_t
Loadint64 (int64_t *ptr)
{
if (((uintptr_t) ptr & 7) != 0)
- panicmem ();
+ panicUnaligned ();
return __atomic_load_n (ptr, __ATOMIC_SEQ_CST);
}
@@ -108,7 +134,7 @@ uint64_t
Xadd64 (uint64_t *ptr, int64_t delta)
{
if (((uintptr_t) ptr & 7) != 0)
- panicmem ();
+ panicUnaligned ();
return __atomic_add_fetch (ptr, (uint64_t) delta, __ATOMIC_SEQ_CST);
}
@@ -130,7 +156,7 @@ int64_t
Xaddint64 (int64_t *ptr, int64_t delta)
{
if (((uintptr_t) ptr & 7) != 0)
- panicmem ();
+ panicUnaligned ();
return __atomic_add_fetch (ptr, delta, __ATOMIC_SEQ_CST);
}
@@ -152,7 +178,7 @@ uint64_t
Xchg64 (uint64_t *ptr, uint64_t new)
{
if (((uintptr_t) ptr & 7) != 0)
- panicmem ();
+ panicUnaligned ();
return __atomic_exchange_n (ptr, new, __ATOMIC_SEQ_CST);
}
@@ -186,6 +212,26 @@ Or8 (uint8_t *ptr, uint8_t val)
__atomic_or_fetch (ptr, val, __ATOMIC_SEQ_CST);
}
+void And (uint32_t *ptr, uint32_t val)
+ __asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.And")
+ __attribute__ ((no_split_stack));
+
+void
+And (uint32_t *ptr, uint32_t val)
+{
+ __atomic_and_fetch (ptr, val, __ATOMIC_SEQ_CST);
+}
+
+void Or (uint32_t *ptr, uint32_t val)
+ __asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.Or")
+ __attribute__ ((no_split_stack));
+
+void
+Or (uint32_t *ptr, uint32_t val)
+{
+ __atomic_or_fetch (ptr, val, __ATOMIC_SEQ_CST);
+}
+
_Bool Cas (uint32_t *ptr, uint32_t old, uint32_t new)
__asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.Cas")
__attribute__ ((no_split_stack));
@@ -204,7 +250,7 @@ _Bool
Cas64 (uint64_t *ptr, uint64_t old, uint64_t new)
{
if (((uintptr_t) ptr & 7) != 0)
- panicmem ();
+ panicUnaligned ();
return __atomic_compare_exchange_n (ptr, &old, new, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
}
@@ -266,7 +312,7 @@ void
Store64 (uint64_t *ptr, uint64_t val)
{
if (((uintptr_t) ptr & 7) != 0)
- panicmem ();
+ panicUnaligned ();
__atomic_store_n (ptr, val, __ATOMIC_SEQ_CST);
}
@@ -280,6 +326,28 @@ StoreRel (uint32_t *ptr, uint32_t val)
__atomic_store_n (ptr, val, __ATOMIC_RELEASE);
}
+void StoreRel64 (uint64_t *ptr, uint64_t val)
+ __asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.StoreRel64")
+ __attribute__ ((no_split_stack));
+
+void
+StoreRel64 (uint64_t *ptr, uint64_t val)
+{
+ if (((uintptr_t) ptr & 7) != 0)
+ panicUnaligned ();
+ __atomic_store_n (ptr, val, __ATOMIC_RELEASE);
+}
+
+void StoreReluintptr (uintptr_t *ptr, uintptr_t val)
+ __asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.StoreReluintptr")
+ __attribute__ ((no_split_stack));
+
+void
+StoreReluintptr (uintptr_t *ptr, uintptr_t val)
+{
+ __atomic_store_n (ptr, val, __ATOMIC_RELEASE);
+}
+
void Storeuintptr (uintptr_t *ptr, uintptr_t val)
__asm__ (GOSYM_PREFIX "runtime_1internal_1atomic.Storeuintptr")
__attribute__ ((no_split_stack));
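
The atomic.c changes route misaligned 64-bit atomic operations to panicUnaligned instead of the generic panicmem, so a bad pointer now reports an alignment problem rather than a memory fault. A minimal Go-side sketch of the same guard (the 8-byte alignment test on the pointer value) is shown below; it is an illustration, not the runtime's code.

package main

import (
	"fmt"
	"unsafe"
)

// checkAligned64 mirrors the guard in the C shims above: the target of a
// 64-bit atomic operation must be 8-byte aligned or the operation panics.
func checkAligned64(p *uint64) {
	if uintptr(unsafe.Pointer(p))&7 != 0 {
		panic("unaligned 64-bit atomic operation")
	}
}

func main() {
	var a [2]uint64 // 8-byte aligned on 64-bit platforms
	checkAligned64(&a[0])
	fmt.Println("aligned pointer accepted")

	defer func() { fmt.Println("recovered:", recover()) }()
	mis := (*uint64)(unsafe.Add(unsafe.Pointer(&a[0]), 1)) // one byte in: misaligned
	checkAligned64(mis)                                    // panics with the message above
}
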
diff --git a/libgo/go/runtime/internal/atomic/atomic_test.go b/libgo/go/runtime/internal/atomic/atomic_test.go
index b0a8fa0..c9c2eba 100644
--- a/libgo/go/runtime/internal/atomic/atomic_test.go
+++ b/libgo/go/runtime/internal/atomic/atomic_test.go
@@ -73,8 +73,15 @@ func TestXadduintptrOnUint64(t *testing.T) {
func shouldPanic(t *testing.T, name string, f func()) {
defer func() {
- if recover() == nil {
+ // Check that all GC maps are sane.
+ runtime.GC()
+
+ err := recover()
+ want := "unaligned 64-bit atomic operation"
+ if err == nil {
t.Errorf("%s did not panic", name)
+ } else if s, _ := err.(string); s != want {
+ t.Errorf("%s: wanted panic %q, got %q", name, want, err)
}
}()
f()
@@ -143,6 +150,45 @@ func TestAnd8(t *testing.T) {
}
}
+func TestAnd(t *testing.T) {
+ // Basic sanity check.
+ x := uint32(0xffffffff)
+ for i := uint32(0); i < 32; i++ {
+ atomic.And(&x, ^(1 << i))
+ if r := uint32(0xffffffff) << (i + 1); x != r {
+ t.Fatalf("clearing bit %#x: want %#x, got %#x", uint32(1<<i), r, x)
+ }
+ }
+
+ // Set every bit in array to 1.
+ a := make([]uint32, 1<<12)
+ for i := range a {
+ a[i] = 0xffffffff
+ }
+
+ // Clear array bit-by-bit in different goroutines.
+ done := make(chan bool)
+ for i := 0; i < 32; i++ {
+ m := ^uint32(1 << i)
+ go func() {
+ for i := range a {
+ atomic.And(&a[i], m)
+ }
+ done <- true
+ }()
+ }
+ for i := 0; i < 32; i++ {
+ <-done
+ }
+
+ // Check that the array has been totally cleared.
+ for i, v := range a {
+ if v != 0 {
+ t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint32(0), v)
+ }
+ }
+}
+
func TestOr8(t *testing.T) {
// Basic sanity check.
x := uint8(0)
@@ -179,7 +225,43 @@ func TestOr8(t *testing.T) {
}
}
-func TestBitwiseContended(t *testing.T) {
+func TestOr(t *testing.T) {
+ // Basic sanity check.
+ x := uint32(0)
+ for i := uint32(0); i < 32; i++ {
+ atomic.Or(&x, 1<<i)
+ if r := (uint32(1) << (i + 1)) - 1; x != r {
+ t.Fatalf("setting bit %#x: want %#x, got %#x", uint32(1)<<i, r, x)
+ }
+ }
+
+ // Start with every bit in array set to 0.
+ a := make([]uint32, 1<<12)
+
+ // Set every bit in array bit-by-bit in different goroutines.
+ done := make(chan bool)
+ for i := 0; i < 32; i++ {
+ m := uint32(1 << i)
+ go func() {
+ for i := range a {
+ atomic.Or(&a[i], m)
+ }
+ done <- true
+ }()
+ }
+ for i := 0; i < 32; i++ {
+ <-done
+ }
+
+ // Check that the array has been totally set.
+ for i, v := range a {
+ if v != 0xffffffff {
+ t.Fatalf("a[%v] not fully set: want %#x, got %#x", i, uint32(0xffffffff), v)
+ }
+ }
+}
+
+func TestBitwiseContended8(t *testing.T) {
// Start with every bit in array set to 0.
a := make([]uint8, 16)
@@ -221,6 +303,48 @@ func TestBitwiseContended(t *testing.T) {
}
}
+func TestBitwiseContended(t *testing.T) {
+ // Start with every bit in array set to 0.
+ a := make([]uint32, 16)
+
+ // Iterations to try.
+ N := 1 << 16
+ if testing.Short() {
+ N = 1 << 10
+ }
+
+ // Set and then clear every bit in the array bit-by-bit in different goroutines.
+ done := make(chan bool)
+ for i := 0; i < 32; i++ {
+ m := uint32(1 << i)
+ go func() {
+ for n := 0; n < N; n++ {
+ for i := range a {
+ atomic.Or(&a[i], m)
+ if atomic.Load(&a[i])&m != m {
+ t.Errorf("a[%v] bit %#x not set", i, m)
+ }
+ atomic.And(&a[i], ^m)
+ if atomic.Load(&a[i])&m != 0 {
+ t.Errorf("a[%v] bit %#x not clear", i, m)
+ }
+ }
+ }
+ done <- true
+ }()
+ }
+ for i := 0; i < 32; i++ {
+ <-done
+ }
+
+ // Check that the array has been totally cleared.
+ for i, v := range a {
+ if v != 0 {
+ t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint32(0), v)
+ }
+ }
+}
+
func TestStorepNoWB(t *testing.T) {
var p [2]*int
for i := range p {
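
TestAnd, TestOr and the contended variant above verify that the new 32-bit And and Or primitives behave as atomic read-modify-write operations under concurrency. Outside the runtime the same effect can be had with a compare-and-swap loop over sync/atomic, which is precisely the retry loop a dedicated And/Or primitive avoids; a hedged sketch:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// andUint32 atomically clears the bits of *p that are zero in mask.
func andUint32(p *uint32, mask uint32) {
	for {
		old := atomic.LoadUint32(p)
		if atomic.CompareAndSwapUint32(p, old, old&mask) {
			return
		}
	}
}

// orUint32 atomically sets the bits of mask in *p.
func orUint32(p *uint32, mask uint32) {
	for {
		old := atomic.LoadUint32(p)
		if atomic.CompareAndSwapUint32(p, old, old|mask) {
			return
		}
	}
}

func main() {
	var x uint32
	var wg sync.WaitGroup
	for i := 0; i < 32; i++ {
		wg.Add(1)
		go func(bit uint32) {
			defer wg.Done()
			orUint32(&x, 1<<bit)      // set one bit
			andUint32(&x, ^uint32(0)) // no-op mask, just exercises the path
		}(uint32(i))
	}
	wg.Wait()
	fmt.Printf("x = %#x\n", x) // all 32 bits set: 0xffffffff
}
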
diff --git a/libgo/go/runtime/internal/atomic/bench_test.go b/libgo/go/runtime/internal/atomic/bench_test.go
index de71b0f..2476c06 100644
--- a/libgo/go/runtime/internal/atomic/bench_test.go
+++ b/libgo/go/runtime/internal/atomic/bench_test.go
@@ -51,6 +51,14 @@ func BenchmarkAnd8(b *testing.B) {
}
}
+func BenchmarkAnd(b *testing.B) {
+ var x [128]uint32 // give x its own cache line
+ sink = &x
+ for i := 0; i < b.N; i++ {
+ atomic.And(&x[63], uint32(i))
+ }
+}
+
func BenchmarkAnd8Parallel(b *testing.B) {
var x [512]uint8 // give byte its own cache line
sink = &x
@@ -63,6 +71,18 @@ func BenchmarkAnd8Parallel(b *testing.B) {
})
}
+func BenchmarkAndParallel(b *testing.B) {
+ var x [128]uint32 // give x its own cache line
+ sink = &x
+ b.RunParallel(func(pb *testing.PB) {
+ i := uint32(0)
+ for pb.Next() {
+ atomic.And(&x[63], i)
+ i++
+ }
+ })
+}
+
func BenchmarkOr8(b *testing.B) {
var x [512]uint8 // give byte its own cache line
sink = &x
@@ -71,6 +91,14 @@ func BenchmarkOr8(b *testing.B) {
}
}
+func BenchmarkOr(b *testing.B) {
+ var x [128]uint32 // give x its own cache line
+ sink = &x
+ for i := 0; i < b.N; i++ {
+ atomic.Or(&x[63], uint32(i))
+ }
+}
+
func BenchmarkOr8Parallel(b *testing.B) {
var x [512]uint8 // give byte its own cache line
sink = &x
@@ -83,6 +111,18 @@ func BenchmarkOr8Parallel(b *testing.B) {
})
}
+func BenchmarkOrParallel(b *testing.B) {
+ var x [128]uint32 // give x its own cache line
+ sink = &x
+ b.RunParallel(func(pb *testing.PB) {
+ i := uint32(0)
+ for pb.Next() {
+ atomic.Or(&x[63], i)
+ i++
+ }
+ })
+}
+
func BenchmarkXadd(b *testing.B) {
var x uint32
ptr := &x
@@ -102,3 +142,54 @@ func BenchmarkXadd64(b *testing.B) {
}
})
}
+
+func BenchmarkCas(b *testing.B) {
+ var x uint32
+ x = 1
+ ptr := &x
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ atomic.Cas(ptr, 1, 0)
+ atomic.Cas(ptr, 0, 1)
+ }
+ })
+}
+
+func BenchmarkCas64(b *testing.B) {
+ var x uint64
+ x = 1
+ ptr := &x
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ atomic.Cas64(ptr, 1, 0)
+ atomic.Cas64(ptr, 0, 1)
+ }
+ })
+}
+func BenchmarkXchg(b *testing.B) {
+ var x uint32
+ x = 1
+ ptr := &x
+ b.RunParallel(func(pb *testing.PB) {
+ var y uint32
+ y = 1
+ for pb.Next() {
+ y = atomic.Xchg(ptr, y)
+ y += 1
+ }
+ })
+}
+
+func BenchmarkXchg64(b *testing.B) {
+ var x uint64
+ x = 1
+ ptr := &x
+ b.RunParallel(func(pb *testing.PB) {
+ var y uint64
+ y = 1
+ for pb.Next() {
+ y = atomic.Xchg64(ptr, y)
+ y += 1
+ }
+ })
+}
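
The new benchmarks all follow the same shape: the hot word sits inside an array whose address is stored in a package-level sink, so the compiler cannot treat the memory as dead, and the parallel variants drive the operation through b.RunParallel. A stand-alone sketch of that pattern, saved as a _test.go file and using sync/atomic since the runtime-internal atomic package is not importable from ordinary code:

package bench_test

import (
	"sync/atomic"
	"testing"
)

// sink keeps the benchmarked memory reachable so the stores are not optimized away.
var sink interface{}

func BenchmarkAddParallel(b *testing.B) {
	var x [128]uint32 // padding gives the hot word its own cache line
	sink = &x
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			atomic.AddUint32(&x[63], 1)
		}
	})
}
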
diff --git a/libgo/go/runtime/internal/atomic/gccgo.go b/libgo/go/runtime/internal/atomic/gccgo.go
index 4df8346..c423803 100644
--- a/libgo/go/runtime/internal/atomic/gccgo.go
+++ b/libgo/go/runtime/internal/atomic/gccgo.go
@@ -24,6 +24,12 @@ func Load64(ptr *uint64) uint64
func LoadAcq(ptr *uint32) uint32
//go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
+//go:noescape
func Xadd(ptr *uint32, delta int32) uint32
//go:noescape
@@ -47,6 +53,12 @@ func And8(ptr *uint8, val uint8)
//go:noescape
func Or8(ptr *uint8, val uint8)
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
// NOTE: Do not add atomicxor8 (XOR is not idempotent).
//go:noescape
@@ -67,6 +79,12 @@ func Store64(ptr *uint64, val uint64)
//go:noescape
func StoreRel(ptr *uint32, val uint32)
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
+
// StorepNoWB performs *ptr = val atomically and without a write
// barrier.
//
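
The new LoadAcq64/LoadAcquintptr and StoreRel64/StoreReluintptr declarations give the runtime 64-bit and pointer-sized loads and stores with acquire and release ordering, which is what a publish/consume handoff needs: the writer fills in data and then releases a flag, the reader acquires the flag before touching the data. The exported sync/atomic package only offers sequentially consistent operations, which are strictly stronger, but the shape of the code is the same; a hedged sketch:

package main

import (
	"fmt"
	"sync/atomic"
)

var (
	payload uint64
	ready   uint32
)

func publisher() {
	payload = 42                  // 1. write the data
	atomic.StoreUint32(&ready, 1) // 2. release-style publish of the flag
}

func consumer() uint64 {
	for atomic.LoadUint32(&ready) == 0 { // acquire-style wait on the flag
	}
	return payload // safe: the flag store ordered the payload write before it
}

func main() {
	go publisher()
	fmt.Println(consumer())
}
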
diff --git a/libgo/go/runtime/internal/atomic/unaligned.go b/libgo/go/runtime/internal/atomic/unaligned.go
new file mode 100644
index 0000000..fbfe125
--- /dev/null
+++ b/libgo/go/runtime/internal/atomic/unaligned.go
@@ -0,0 +1,14 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+import _ "unsafe" // for go:linkname
+
+// Let the C code call this function.
+//go:linkname panicUnaligned
+
+func panicUnaligned() {
+ panic("unaligned 64-bit atomic operation")
+}
diff --git a/libgo/go/runtime/lockrank.go b/libgo/go/runtime/lockrank.go
index 0001935..b3c01ba 100644
--- a/libgo/go/runtime/lockrank.go
+++ b/libgo/go/runtime/lockrank.go
@@ -41,12 +41,12 @@ const (
lockRankCpuprof
lockRankSweep
+ lockRankPollDesc
lockRankSched
lockRankDeadlock
lockRankPanic
lockRankAllg
lockRankAllp
- lockRankPollDesc
lockRankTimers // Multiple timers locked simultaneously in destroy()
lockRankItab
@@ -67,8 +67,6 @@ const (
lockRankRwmutexW
lockRankRwmutexR
- lockRankMcentral // For !go115NewMCentralImpl
- lockRankSpine // For !go115NewMCentralImpl
lockRankSpanSetSpine
lockRankGscan
lockRankStackpool
@@ -122,12 +120,12 @@ var lockNames = []string{
lockRankCpuprof: "cpuprof",
lockRankSweep: "sweep",
+ lockRankPollDesc: "pollDesc",
lockRankSched: "sched",
lockRankDeadlock: "deadlock",
lockRankPanic: "panic",
lockRankAllg: "allg",
lockRankAllp: "allp",
- lockRankPollDesc: "pollDesc",
lockRankTimers: "timers",
lockRankItab: "itab",
@@ -149,8 +147,6 @@ var lockNames = []string{
lockRankRwmutexW: "rwmutexW",
lockRankRwmutexR: "rwmutexR",
- lockRankMcentral: "mcentral",
- lockRankSpine: "spine",
lockRankSpanSetSpine: "spanSetSpine",
lockRankGscan: "gscan",
lockRankStackpool: "stackpool",
@@ -186,14 +182,14 @@ func (rank lockRank) String() string {
return lockNames[rank]
}
-// lockPartialOrder is a partial order among the various lock types, listing the immediate
-// ordering that has actually been observed in the runtime. Each entry (which
-// corresponds to a particular lock rank) specifies the list of locks that can be
-// already be held immediately "above" it.
+// lockPartialOrder is a partial order among the various lock types, listing the
+// immediate ordering that has actually been observed in the runtime. Each entry
+// (which corresponds to a particular lock rank) specifies the list of locks
+// that can already be held immediately "above" it.
//
-// So, for example, the lockRankSched entry shows that all the locks preceding it in
-// rank can actually be held. The fin lock shows that only the sched, timers, or
-// hchan lock can be held immediately above it when it is acquired.
+// So, for example, the lockRankSched entry shows that all the locks preceding
+// it in rank can actually be held. The allp lock shows that only the sysmon or
+// sched lock can be held immediately above it when it is acquired.
var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankDummy: {},
lockRankSysmon: {},
@@ -203,12 +199,12 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankAssistQueue: {},
lockRankCpuprof: {},
lockRankSweep: {},
- lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep},
+ lockRankPollDesc: {},
+ lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc},
lockRankDeadlock: {lockRankDeadlock},
lockRankPanic: {lockRankDeadlock},
lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic},
lockRankAllp: {lockRankSysmon, lockRankSched},
- lockRankPollDesc: {},
lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers},
lockRankItab: {},
lockRankReflectOffs: {lockRankItab},
@@ -217,7 +213,7 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankNotifyList: {},
lockRankTraceBuf: {lockRankSysmon, lockRankScavenge},
lockRankTraceStrings: {lockRankTraceBuf},
- lockRankMspanSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
+ lockRankMspanSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
lockRankProf: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
lockRankGcBitsArenas: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
lockRankRoot: {},
@@ -226,20 +222,18 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankNetpollInit: {lockRankTimers},
lockRankRwmutexW: {},
- lockRankRwmutexR: {lockRankRwmutexW},
-
- lockRankMcentral: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
- lockRankSpine: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
- lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
- lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankNotifyList, lockRankProf, lockRankGcBitsArenas, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankMcentral, lockRankSpine, lockRankSpanSetSpine},
- lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankMcentral, lockRankSpine, lockRankSpanSetSpine, lockRankGscan},
- lockRankStackLarge: {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral, lockRankSpanSetSpine, lockRankGscan},
+ lockRankRwmutexR: {lockRankSysmon, lockRankRwmutexW},
+
+ lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+ lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankNotifyList, lockRankProf, lockRankGcBitsArenas, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
+ lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
+ lockRankStackLarge: {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan},
lockRankDefer: {},
lockRankSudog: {lockRankNotifyList, lockRankHchan},
- lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
- lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine},
+ lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
+ lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankFin, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine},
lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
- lockRankGlobalAlloc: {lockRankProf, lockRankSpine, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
+ lockRankGlobalAlloc: {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
lockRankGFree: {lockRankSched},
lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf},
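
lockPartialOrder records, for each rank, which ranks may already be held when a lock of that rank is acquired; the edits above move pollDesc ahead of sched and drop the mcentral and spine rows. Checking an acquisition against such a table amounts to a membership test over the held ranks. A much simplified, hedged sketch of the idea (toy ranks and table, not the runtime's):

package main

import "fmt"

type rank int

const (
	rankSysmon rank = iota
	rankPollDesc
	rankSched
	rankAllp
)

// partialOrder[r] lists the ranks that may already be held when acquiring r.
var partialOrder = map[rank][]rank{
	rankSysmon:   {},
	rankPollDesc: {},
	rankSched:    {rankSysmon, rankPollDesc},
	rankAllp:     {rankSysmon, rankSched},
}

// allowed reports whether acquiring r is consistent with the held ranks.
func allowed(held []rank, r rank) bool {
	for _, h := range held {
		ok := false
		for _, p := range partialOrder[r] {
			if h == p {
				ok = true
				break
			}
		}
		if !ok {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(allowed([]rank{rankSysmon, rankSched}, rankAllp)) // true
	fmt.Println(allowed([]rank{rankAllp}, rankSched))             // false: would invert the order
}
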
diff --git a/libgo/go/runtime/lockrank_off.go b/libgo/go/runtime/lockrank_off.go
index 32378a9..7dcd8f5 100644
--- a/libgo/go/runtime/lockrank_off.go
+++ b/libgo/go/runtime/lockrank_off.go
@@ -18,29 +18,47 @@ func getLockRank(l *mutex) lockRank {
return 0
}
-// The following functions may be called in nosplit context.
-// Nosplit is not strictly required for lockWithRank, unlockWithRank
-// and lockWithRankMayAcquire, but these nosplit annotations must
-// be kept consistent with the equivalent functions in lockrank_on.go.
-
-//go:nosplit
func lockWithRank(l *mutex, rank lockRank) {
lock2(l)
}
+// This function may be called in nosplit context and thus must be nosplit.
//go:nosplit
func acquireLockRank(rank lockRank) {
}
-//go:nosplit
func unlockWithRank(l *mutex) {
unlock2(l)
}
+// This function may be called in nosplit context and thus must be nosplit.
//go:nosplit
func releaseLockRank(rank lockRank) {
}
-//go:nosplit
func lockWithRankMayAcquire(l *mutex, rank lockRank) {
}
+
+//go:nosplit
+func assertLockHeld(l *mutex) {
+}
+
+//go:nosplit
+func assertRankHeld(r lockRank) {
+}
+
+//go:nosplit
+func worldStopped() {
+}
+
+//go:nosplit
+func worldStarted() {
+}
+
+//go:nosplit
+func assertWorldStopped() {
+}
+
+//go:nosplit
+func assertWorldStoppedOrLockHeld(l *mutex) {
+}
diff --git a/libgo/go/runtime/lockrank_on.go b/libgo/go/runtime/lockrank_on.go
index fbc5ff5..88ac95a 100644
--- a/libgo/go/runtime/lockrank_on.go
+++ b/libgo/go/runtime/lockrank_on.go
@@ -7,9 +7,14 @@
package runtime
import (
+ "runtime/internal/atomic"
"unsafe"
)
+// worldIsStopped is accessed atomically to track world-stops. 1 == world
+// stopped.
+var worldIsStopped uint32
+
// lockRankStruct is embedded in mutex
type lockRankStruct struct {
// static lock ranking of the lock
@@ -40,15 +45,19 @@ func getLockRank(l *mutex) lockRank {
return l.rank
}
-// The following functions are the entry-points to record lock
-// operations.
-// All of these are nosplit and switch to the system stack immediately
-// to avoid stack growths. Since a stack growth could itself have lock
-// operations, this prevents re-entrant calls.
-
// lockWithRank is like lock(l), but allows the caller to specify a lock rank
// when acquiring a non-static lock.
-//go:nosplit
+//
+// Note that we need to be careful about stack splits:
+//
+// This function is not nosplit, thus it may split at function entry. This may
+// introduce a new edge in the lock order, but it is no different from any
+// other (nosplit) call before this call (including the call to lock() itself).
+//
+// However, we switch to the systemstack to record the lock held to ensure that
+// we record an accurate lock ordering. e.g., without systemstack, a stack
+// split on entry to lock2() would record stack split locks as taken after l,
+// even though l is not actually locked yet.
func lockWithRank(l *mutex, rank lockRank) {
if l == &debuglock || l == &paniclk {
// debuglock is only used for println/printlock(). Don't do lock
@@ -86,11 +95,26 @@ func lockWithRank(l *mutex, rank lockRank) {
})
}
+// nosplit to ensure it can be called in as many contexts as possible.
+//go:nosplit
+func printHeldLocks(gp *g) {
+ if gp.m.locksHeldLen == 0 {
+ println("<none>")
+ return
+ }
+
+ for j, held := range gp.m.locksHeld[:gp.m.locksHeldLen] {
+ println(j, ":", held.rank.String(), held.rank, unsafe.Pointer(gp.m.locksHeld[j].lockAddr))
+ }
+}
+
// acquireLockRank acquires a rank which is not associated with a mutex lock
+//
+// This function may be called in nosplit context and thus must be nosplit.
//go:nosplit
func acquireLockRank(rank lockRank) {
gp := getg()
- // Log the new class.
+ // Log the new class. See comment on lockWithRank.
systemstack(func() {
i := gp.m.locksHeldLen
if i >= len(gp.m.locksHeld) {
@@ -109,6 +133,8 @@ func acquireLockRank(rank lockRank) {
// checkRanks checks if goroutine g, which has mostly recently acquired a lock
// with rank 'prevRank', can now acquire a lock with rank 'rank'.
+//
+//go:systemstack
func checkRanks(gp *g, prevRank, rank lockRank) {
rankOK := false
if rank < prevRank {
@@ -135,14 +161,12 @@ func checkRanks(gp *g, prevRank, rank lockRank) {
if !rankOK {
printlock()
println(gp.m.procid, " ======")
- for j, held := range gp.m.locksHeld[:gp.m.locksHeldLen] {
- println(j, ":", held.rank.String(), held.rank, unsafe.Pointer(gp.m.locksHeld[j].lockAddr))
- }
+ printHeldLocks(gp)
throw("lock ordering problem")
}
}
-//go:nosplit
+// See comment on lockWithRank regarding stack splitting.
func unlockWithRank(l *mutex) {
if l == &debuglock || l == &paniclk {
// See comment at beginning of lockWithRank.
@@ -169,6 +193,8 @@ func unlockWithRank(l *mutex) {
}
// releaseLockRank releases a rank which is not associated with a mutex lock
+//
+// This function may be called in nosplit context and thus must be nosplit.
//go:nosplit
func releaseLockRank(rank lockRank) {
gp := getg()
@@ -189,7 +215,7 @@ func releaseLockRank(rank lockRank) {
})
}
-//go:nosplit
+// See comment on lockWithRank regarding stack splitting.
func lockWithRankMayAcquire(l *mutex, rank lockRank) {
gp := getg()
if gp.m.locksHeldLen == 0 {
@@ -212,3 +238,146 @@ func lockWithRankMayAcquire(l *mutex, rank lockRank) {
gp.m.locksHeldLen--
})
}
+
+// nosplit to ensure it can be called in as many contexts as possible.
+//go:nosplit
+func checkLockHeld(gp *g, l *mutex) bool {
+ for i := gp.m.locksHeldLen - 1; i >= 0; i-- {
+ if gp.m.locksHeld[i].lockAddr == uintptr(unsafe.Pointer(l)) {
+ return true
+ }
+ }
+ return false
+}
+
+// assertLockHeld throws if l is not held by the caller.
+//
+// nosplit to ensure it can be called in as many contexts as possible.
+//go:nosplit
+func assertLockHeld(l *mutex) {
+ gp := getg()
+
+ held := checkLockHeld(gp, l)
+ if held {
+ return
+ }
+
+ // Crash from system stack to avoid splits that may cause
+ // additional issues.
+ systemstack(func() {
+ printlock()
+ print("caller requires lock ", l, " (rank ", l.rank.String(), "), holding:\n")
+ printHeldLocks(gp)
+ throw("not holding required lock!")
+ })
+}
+
+// assertRankHeld throws if a mutex with rank r is not held by the caller.
+//
+// This is less precise than assertLockHeld, but can be used in places where a
+// pointer to the exact mutex is not available.
+//
+// nosplit to ensure it can be called in as many contexts as possible.
+//go:nosplit
+func assertRankHeld(r lockRank) {
+ gp := getg()
+
+ for i := gp.m.locksHeldLen - 1; i >= 0; i-- {
+ if gp.m.locksHeld[i].rank == r {
+ return
+ }
+ }
+
+ // Crash from system stack to avoid splits that may cause
+ // additional issues.
+ systemstack(func() {
+ printlock()
+ print("caller requires lock with rank ", r.String(), ", holding:\n")
+ printHeldLocks(gp)
+ throw("not holding required lock!")
+ })
+}
+
+// worldStopped notes that the world is stopped.
+//
+// Caller must hold worldsema.
+//
+// nosplit to ensure it can be called in as many contexts as possible.
+//go:nosplit
+func worldStopped() {
+ if stopped := atomic.Xadd(&worldIsStopped, 1); stopped != 1 {
+ systemstack(func() {
+ print("world stop count=", stopped, "\n")
+ throw("recursive world stop")
+ })
+ }
+}
+
+// worldStarted notes that the world is starting.
+//
+// Caller must hold worldsema.
+//
+// nosplit to ensure it can be called in as many contexts as possible.
+//go:nosplit
+func worldStarted() {
+ if stopped := atomic.Xadd(&worldIsStopped, -1); stopped != 0 {
+ systemstack(func() {
+ print("world stop count=", stopped, "\n")
+ throw("released non-stopped world stop")
+ })
+ }
+}
+
+// nosplit to ensure it can be called in as many contexts as possible.
+//go:nosplit
+func checkWorldStopped() bool {
+ stopped := atomic.Load(&worldIsStopped)
+ if stopped > 1 {
+ systemstack(func() {
+ print("inconsistent world stop count=", stopped, "\n")
+ throw("inconsistent world stop count")
+ })
+ }
+
+ return stopped == 1
+}
+
+// assertWorldStopped throws if the world is not stopped. It does not check
+// which M stopped the world.
+//
+// nosplit to ensure it can be called in as many contexts as possible.
+//go:nosplit
+func assertWorldStopped() {
+ if checkWorldStopped() {
+ return
+ }
+
+ throw("world not stopped")
+}
+
+// assertWorldStoppedOrLockHeld throws if the world is not stopped and the
+// passed lock is not held.
+//
+// nosplit to ensure it can be called in as many contexts as possible.
+//go:nosplit
+func assertWorldStoppedOrLockHeld(l *mutex) {
+ if checkWorldStopped() {
+ return
+ }
+
+ gp := getg()
+ held := checkLockHeld(gp, l)
+ if held {
+ return
+ }
+
+ // Crash from system stack to avoid splits that may cause
+ // additional issues.
+ systemstack(func() {
+ printlock()
+ print("caller requires world stop or lock ", l, " (rank ", l.rank.String(), "), holding:\n")
+ println("<no world stop>")
+ printHeldLocks(gp)
+ throw("no world stop or required lock!")
+ })
+}
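
worldStopped and worldStarted above maintain a single atomic counter: 0 means the world is running, 1 means it is stopped, and any other value indicates a bug (a recursive stop, or a start without a matching stop); the assert helpers then accept either a stopped world or a specific held lock. A hedged stand-alone sketch of just the counter bookkeeping, using sync/atomic in place of the runtime-internal package:

package main

import (
	"fmt"
	"sync/atomic"
)

var worldIsStopped uint32 // 1 == stopped

func worldStopped() {
	if n := atomic.AddUint32(&worldIsStopped, 1); n != 1 {
		panic(fmt.Sprintf("recursive world stop, count=%d", n))
	}
}

func worldStarted() {
	if n := atomic.AddUint32(&worldIsStopped, ^uint32(0)); n != 0 { // add -1
		panic(fmt.Sprintf("released non-stopped world stop, count=%d", n))
	}
}

func assertWorldStopped() {
	if atomic.LoadUint32(&worldIsStopped) != 1 {
		panic("world not stopped")
	}
}

func main() {
	worldStopped()
	assertWorldStopped()
	worldStarted()
	fmt.Println("balanced stop/start ok")
}
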
diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go
index feb043a..d3d88a2 100644
--- a/libgo/go/runtime/malloc.go
+++ b/libgo/go/runtime/malloc.go
@@ -208,7 +208,7 @@ const (
// mips32 only has access to the low 2GB of virtual memory, so
// we further limit it to 31 bits.
//
- // On darwin/arm64, although 64-bit pointers are presumably
+ // On ios/arm64, although 64-bit pointers are presumably
// available, pointers are truncated to 33 bits. Furthermore,
// only the top 4 GiB of the address space are actually available
// to the application, but we allow the whole 33 bits anyway for
@@ -217,7 +217,7 @@ const (
// arenaBaseOffset to offset into the top 4 GiB.
//
// WebAssembly currently has a limit of 4GB linear memory.
- heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosDarwin*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosDarwin*sys.GoarchArm64
+ heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosIos*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosIos*sys.GoarchArm64
// maxAlloc is the maximum size of an allocation. On 64-bit,
// it's theoretically possible to allocate 1<<heapAddrBits bytes. On
@@ -525,14 +525,22 @@ func mallocinit() {
// However, on arm64, we ignore all this advice above and slam the
// allocation at 0x40 << 32 because when using 4k pages with 3-level
// translation buffers, the user address space is limited to 39 bits
- // On darwin/arm64, the address space is even smaller.
+ // On ios/arm64, the address space is even smaller.
//
// On AIX, mmaps starts at 0x0A00000000000000 for 64-bit.
// processes.
for i := 0x7f; i >= 0; i-- {
var p uintptr
switch {
- case GOARCH == "arm64" && GOOS == "darwin":
+ case raceenabled:
+ // The TSAN runtime requires the heap
+ // to be in the range [0x00c000000000,
+ // 0x00e000000000).
+ p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32)
+ if p >= uintptrMask&0x00e000000000 {
+ continue
+ }
+ case GOARCH == "arm64" && GOOS == "ios":
p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
case GOARCH == "arm64":
p = uintptr(i)<<40 | uintptrMask&(0x0040<<32)
@@ -543,14 +551,6 @@ func mallocinit() {
continue
}
p = uintptr(i)<<40 | uintptrMask&(0xa0<<52)
- case raceenabled:
- // The TSAN runtime requires the heap
- // to be in the range [0x00c000000000,
- // 0x00e000000000).
- p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32)
- if p >= uintptrMask&0x00e000000000 {
- continue
- }
default:
p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
}
@@ -638,6 +638,8 @@ func mallocinit() {
//
// h must be locked.
func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
+ assertLockHeld(&h.lock)
+
n = alignUp(n, heapArenaBytes)
// First, try the arena pre-reservation.
@@ -754,9 +756,9 @@ mapped:
throw("arena already initialized")
}
var r *heapArena
- r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys))
+ r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys))
if r == nil {
- r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys))
+ r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys))
if r == nil {
throw("out of memory allocating heap arena metadata")
}
@@ -768,7 +770,7 @@ mapped:
if size == 0 {
size = physPageSize
}
- newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gc_sys))
+ newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gcMiscSys))
if newArray == nil {
throw("out of memory allocating allArenas")
}
@@ -920,27 +922,34 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
return unsafe.Pointer(&zerobase)
}
- if debug.sbrk != 0 {
- align := uintptr(16)
- if typ != nil {
- // TODO(austin): This should be just
- // align = uintptr(typ.align)
- // but that's only 4 on 32-bit platforms,
- // even if there's a uint64 field in typ (see #599).
- // This causes 64-bit atomic accesses to panic.
- // Hence, we use stricter alignment that matches
- // the normal allocator better.
- if size&7 == 0 {
- align = 8
- } else if size&3 == 0 {
- align = 4
- } else if size&1 == 0 {
- align = 2
- } else {
- align = 1
+ if debug.malloc {
+ if debug.sbrk != 0 {
+ align := uintptr(16)
+ if typ != nil {
+ // TODO(austin): This should be just
+ // align = uintptr(typ.align)
+ // but that's only 4 on 32-bit platforms,
+ // even if there's a uint64 field in typ (see #599).
+ // This causes 64-bit atomic accesses to panic.
+ // Hence, we use stricter alignment that matches
+ // the normal allocator better.
+ if size&7 == 0 {
+ align = 8
+ } else if size&3 == 0 {
+ align = 4
+ } else if size&1 == 0 {
+ align = 2
+ } else {
+ align = 1
+ }
}
+ return persistentalloc(size, align, &memstats.other_sys)
+ }
+
+ if inittrace.active && inittrace.id == getg().goid {
+ // Init functions are executed sequentially in a single goroutine.
+ inittrace.allocs += 1
}
- return persistentalloc(size, align, &memstats.other_sys)
}
// When using gccgo, when a cgo or SWIG function has an
@@ -987,18 +996,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
shouldhelpgc := false
dataSize := size
- var c *mcache
- if mp.p != 0 {
- c = mp.p.ptr().mcache
- } else {
- // We will be called without a P while bootstrapping,
- // in which case we use mcache0, which is set in mallocinit.
- // mcache0 is cleared when bootstrapping is complete,
- // by procresize.
- c = mcache0
- if c == nil {
- throw("malloc called with no P")
- }
+ c := getMCache()
+ if c == nil {
+ throw("mallocgc called without a P or outside bootstrapping")
}
var span *mspan
var x unsafe.Pointer
@@ -1038,6 +1038,14 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
// Align tiny pointer for required (conservative) alignment.
if size&7 == 0 {
off = alignUp(off, 8)
+ } else if sys.PtrSize == 4 && size == 12 {
+ // Conservatively align 12-byte objects to 8 bytes on 32-bit
+ // systems so that objects whose first field is a 64-bit
+ // value is aligned to 8 bytes and does not cause a fault on
+ // atomic access. See issue 37262.
+ // TODO(mknyszek): Remove this workaround if/when issue 36606
+ // is resolved.
+ off = alignUp(off, 8)
} else if size&3 == 0 {
off = alignUp(off, 4)
} else if size&1 == 0 {
@@ -1047,7 +1055,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
// The object fits into existing tiny block.
x = unsafe.Pointer(c.tiny + off)
c.tinyoffset = off + size
- c.local_tinyallocs++
+ c.tinyAllocs++
mp.mallocing = 0
releasem(mp)
if incallback {
@@ -1092,9 +1100,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
} else {
shouldhelpgc = true
- systemstack(func() {
- span = largeAlloc(size, needzero, noscan)
- })
+ span = c.allocLarge(size, needzero, noscan)
span.freeindex = 1
span.allocCount = 1
x = unsafe.Pointer(span.base())
@@ -1114,7 +1120,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
} else {
scanSize = typ.ptrdata
}
- c.local_scan += scanSize
+ c.scanAlloc += scanSize
}
// Ensure that the stores above that initialize x to
@@ -1144,13 +1150,20 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
mp.mallocing = 0
releasem(mp)
- if debug.allocfreetrace != 0 {
- tracealloc(x, size, typ)
+ if debug.malloc {
+ if debug.allocfreetrace != 0 {
+ tracealloc(x, size, typ)
+ }
+
+ if inittrace.active && inittrace.id == getg().goid {
+ // Init functions are executed sequentially in a single goroutine.
+ inittrace.bytes += uint64(size)
+ }
}
if rate := MemProfileRate; rate > 0 {
- if rate != 1 && size < c.next_sample {
- c.next_sample -= size
+ if rate != 1 && size < c.nextSample {
+ c.nextSample -= size
} else {
mp := acquirem()
profilealloc(mp, x, size)
@@ -1182,37 +1195,6 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
return x
}
-func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
- // print("largeAlloc size=", size, "\n")
-
- if size+_PageSize < size {
- throw("out of memory")
- }
- npages := size >> _PageShift
- if size&_PageMask != 0 {
- npages++
- }
-
- // Deduct credit for this span allocation and sweep if
- // necessary. mHeap_Alloc will also sweep npages, so this only
- // pays the debt down to npage pages.
- deductSweepCredit(npages*_PageSize, npages)
-
- spc := makeSpanClass(0, noscan)
- s := mheap_.alloc(npages, spc, needzero)
- if s == nil {
- throw("out of memory")
- }
- if go115NewMCentralImpl {
- // Put the large span in the mcentral swept list so that it's
- // visible to the background sweeper.
- mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
- }
- s.limit = s.base() + size
- heapBitsForAddr(s.base()).initSpan(s)
- return s
-}
-
// implementation of new builtin
// compiler (both frontend and SSA backend) knows the signature
// of this function
@@ -1248,16 +1230,11 @@ func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
}
func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
- var c *mcache
- if mp.p != 0 {
- c = mp.p.ptr().mcache
- } else {
- c = mcache0
- if c == nil {
- throw("profilealloc called with no P")
- }
+ c := getMCache()
+ if c == nil {
+ throw("profilealloc called without a P or outside bootstrapping")
}
- c.next_sample = nextSample()
+ c.nextSample = nextSample()
mProf_Malloc(x, size)
}
@@ -1269,6 +1246,13 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
// distribution (exp(MemProfileRate)), so the best return value is a random
// number taken from an exponential distribution whose mean is MemProfileRate.
func nextSample() uintptr {
+ if MemProfileRate == 1 {
+ // Callers assign our return value to
+ // mcache.next_sample, but next_sample is not used
+ // when the rate is 1. So avoid the math below and
+ // just return something.
+ return 0
+ }
if GOOS == "plan9" {
// Plan 9 doesn't support floating point in note handler.
if g := getg(); g == g.m.gsignal {
@@ -1349,7 +1333,7 @@ var persistentChunks *notInHeap
// The returned memory will be zeroed.
//
// Consider marking persistentalloc'd types go:notinheap.
-func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+func persistentalloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
var p *notInHeap
systemstack(func() {
p = persistentalloc1(size, align, sysStat)
@@ -1360,7 +1344,7 @@ func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
// Must run on system stack because stack growth can (re)invoke it.
// See issue 9174.
//go:systemstack
-func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap {
+func persistentalloc1(size, align uintptr, sysStat *sysMemStat) *notInHeap {
const (
maxBlock = 64 << 10 // VM reservation granularity is 64K on windows
)
@@ -1419,8 +1403,8 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap {
}
if sysStat != &memstats.other_sys {
- mSysStatInc(sysStat, size)
- mSysStatDec(&memstats.other_sys, size)
+ sysStat.add(int64(size))
+ memstats.other_sys.add(-int64(size))
}
return p
}
@@ -1461,7 +1445,7 @@ func (l *linearAlloc) init(base, size uintptr) {
l.end = base + size
}
-func (l *linearAlloc) alloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
p := alignUp(l.next, align)
if p+size > l.end {
return nil
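
The mallocgc hunk above adds one case to the tiny allocator's alignment rules: on 32-bit systems a 12-byte object is rounded up to an 8-byte boundary so that a leading uint64 field can be accessed atomically (issue 37262). The rounding decision itself is small; a hedged sketch of just that logic, with the pointer size fixed to 4 for the example:

package main

import "fmt"

const ptrSize = 4 // pretend 32-bit for the example

func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

// tinyAlign returns the aligned offset the tiny allocator would use for an
// object of the given size, including the new 12-byte special case.
func tinyAlign(off, size uintptr) uintptr {
	switch {
	case size&7 == 0:
		return alignUp(off, 8)
	case ptrSize == 4 && size == 12:
		// 12-byte objects may start with a uint64; keep them 8-aligned
		// on 32-bit systems so atomic access does not fault.
		return alignUp(off, 8)
	case size&3 == 0:
		return alignUp(off, 4)
	case size&1 == 0:
		return alignUp(off, 2)
	default:
		return off
	}
}

func main() {
	fmt.Println(tinyAlign(4, 12)) // 8, not 4: the new special case
	fmt.Println(tinyAlign(4, 8))  // 8
	fmt.Println(tinyAlign(5, 3))  // 5: odd sizes need no alignment
}
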
diff --git a/libgo/go/runtime/malloc_test.go b/libgo/go/runtime/malloc_test.go
index 45555ee..24b8740 100644
--- a/libgo/go/runtime/malloc_test.go
+++ b/libgo/go/runtime/malloc_test.go
@@ -12,8 +12,10 @@ import (
"os"
"os/exec"
"reflect"
+ "runtime"
. "runtime"
"strings"
+ "sync/atomic"
"testing"
"time"
"unsafe"
@@ -170,6 +172,61 @@ func TestTinyAlloc(t *testing.T) {
}
}
+var (
+ tinyByteSink *byte
+ tinyUint32Sink *uint32
+ tinyObj12Sink *obj12
+)
+
+type obj12 struct {
+ a uint64
+ b uint32
+}
+
+func TestTinyAllocIssue37262(t *testing.T) {
+ // Try to cause an alignment access fault
+ // by atomically accessing the first 64-bit
+ // value of a tiny-allocated object.
+ // See issue 37262 for details.
+
+ // GC twice, once to reach a stable heap state
+ // and again to make sure we finish the sweep phase.
+ runtime.GC()
+ runtime.GC()
+
+ // Make 1-byte allocations until we get a fresh tiny slot.
+ aligned := false
+ for i := 0; i < 16; i++ {
+ tinyByteSink = new(byte)
+ if uintptr(unsafe.Pointer(tinyByteSink))&0xf == 0xf {
+ aligned = true
+ break
+ }
+ }
+ if !aligned {
+ t.Fatal("unable to get a fresh tiny slot")
+ }
+
+ // Create a 4-byte object so that the current
+ // tiny slot is partially filled.
+ tinyUint32Sink = new(uint32)
+
+ // Create a 12-byte object, which fits into the
+ // tiny slot. If it actually gets placed there,
+ // then the field "a" will be improperly aligned
+ // for atomic access on 32-bit architectures.
+ // This won't be true if issue 36606 gets resolved.
+ tinyObj12Sink = new(obj12)
+
+ // Try to atomically access tinyObj12Sink.a.
+ atomic.StoreUint64(&tinyObj12Sink.a, 10)
+
+ // Clear the sinks.
+ tinyByteSink = nil
+ tinyUint32Sink = nil
+ tinyObj12Sink = nil
+}
+
func TestPageCacheLeak(t *testing.T) {
defer GOMAXPROCS(GOMAXPROCS(1))
leaked := PageCachePagesLeaked()
diff --git a/libgo/go/runtime/map.go b/libgo/go/runtime/map.go
index 1155fee..5b9d7102 100644
--- a/libgo/go/runtime/map.go
+++ b/libgo/go/runtime/map.go
@@ -177,8 +177,8 @@ type bmap struct {
// If you modify hiter, also change cmd/compile/internal/gc/reflect.go to indicate
// the layout of this structure.
type hiter struct {
- key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/internal/gc/range.go).
- elem unsafe.Pointer // Must be in second position (see cmd/internal/gc/range.go).
+ key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/compile/internal/gc/range.go).
+ elem unsafe.Pointer // Must be in second position (see cmd/compile/internal/gc/range.go).
t *maptype
h *hmap
buckets unsafe.Pointer // bucket ptr at hash_iter initialization time
@@ -634,7 +634,7 @@ again:
if h.growing() {
growWork(t, h, bucket)
}
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
top := tophash(hash)
var inserti *uint8
@@ -685,7 +685,7 @@ bucketloop:
}
if inserti == nil {
- // all current buckets are full, allocate a new one.
+ // The current bucket and all the overflow buckets connected to it are full, allocate a new one.
newb := h.newoverflow(t, b)
inserti = &newb.tophash[0]
insertk = add(unsafe.Pointer(newb), dataOffset)
@@ -815,6 +815,11 @@ search:
}
notLast:
h.count--
+ // Reset the hash seed to make it more difficult for attackers to
+ // repeatedly trigger hash collisions. See issue 25237.
+ if h.count == 0 {
+ h.hash0 = fastrand()
+ }
break search
}
}
@@ -1051,6 +1056,10 @@ func mapclear(t *maptype, h *hmap) {
h.noverflow = 0
h.count = 0
+ // Reset the hash seed to make it more difficult for attackers to
+ // repeatedly trigger hash collisions. See issue 25237.
+ h.hash0 = fastrand()
+
// Keep the mapextra allocation but clear any extra information.
if h.extra != nil {
*h.extra = mapextra{}
@@ -1429,5 +1438,5 @@ func reflectlite_maplen(h *hmap) int {
return h.count
}
-const maxZero = 1024 // must match value in cmd/compile/internal/gc/walk.go:zeroValSize
+const maxZero = 1024 // must match value in reflect/value.go:maxZero and cmd/compile/internal/gc/walk.go:zeroValSize
var zeroVal [maxZero]byte
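
The map changes in this patch reset the hash seed (h.hash0 = fastrand()) whenever a map becomes empty, here in map.go and again in the fast variants below, so that an attacker who has learned something about the seed from collision behaviour cannot keep exploiting it once the map is reused (issue 25237). A toy illustration of the idea; the trivial hash and the seed source are assumptions for the example, not the runtime's:

package main

import (
	"fmt"
	"math/rand"
)

type toySet struct {
	seed  uint64
	items map[uint64]struct{}
}

func (s *toySet) hash(k uint64) uint64 { return k ^ s.seed } // stand-in hash

func (s *toySet) insert(k uint64) { s.items[s.hash(k)] = struct{}{} }

func (s *toySet) delete(k uint64) {
	delete(s.items, s.hash(k))
	if len(s.items) == 0 {
		// Like the runtime: once empty, pick a fresh seed so past
		// observations about collisions stop being useful.
		s.seed = rand.Uint64()
	}
}

func main() {
	s := &toySet{seed: rand.Uint64(), items: make(map[uint64]struct{})}
	s.insert(1)
	old := s.seed
	s.delete(1)
	fmt.Println("seed changed after emptying:", s.seed != old)
}
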
diff --git a/libgo/go/runtime/map_benchmark_test.go b/libgo/go/runtime/map_benchmark_test.go
index 893cb6c..d0becc9 100644
--- a/libgo/go/runtime/map_benchmark_test.go
+++ b/libgo/go/runtime/map_benchmark_test.go
@@ -1,6 +1,7 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+
package runtime_test
import (
diff --git a/libgo/go/runtime/map_fast32.go b/libgo/go/runtime/map_fast32.go
index fdc7f0e..74f1d61a 100644
--- a/libgo/go/runtime/map_fast32.go
+++ b/libgo/go/runtime/map_fast32.go
@@ -122,7 +122,7 @@ again:
if h.growing() {
growWork_fast32(t, h, bucket)
}
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
var insertb *bmap
var inserti uintptr
@@ -166,7 +166,7 @@ bucketloop:
}
if insertb == nil {
- // all current buckets are full, allocate a new one.
+ // The current bucket and all the overflow buckets connected to it are full, allocate a new one.
insertb = h.newoverflow(t, b)
inserti = 0 // not necessary, but avoids needlessly spilling inserti
}
@@ -212,7 +212,7 @@ again:
if h.growing() {
growWork_fast32(t, h, bucket)
}
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
var insertb *bmap
var inserti uintptr
@@ -256,7 +256,7 @@ bucketloop:
}
if insertb == nil {
- // all current buckets are full, allocate a new one.
+ // The current bucket and all the overflow buckets connected to it are full, allocate a new one.
insertb = h.newoverflow(t, b)
inserti = 0 // not necessary, but avoids needlessly spilling inserti
}
@@ -307,8 +307,12 @@ search:
continue
}
// Only clear key if there are pointers in it.
- if t.key.ptrdata != 0 {
- memclrHasPointers(k, t.key.size)
+ // This can only happen if pointers are 32 bit
+ // wide, as 64 bit pointers do not fit into a 32 bit key.
+ if sys.PtrSize == 4 && t.key.ptrdata != 0 {
+ // The key must be a pointer as we checked pointers are
+ // 32 bits wide and the key is 32 bits wide also.
+ *(*unsafe.Pointer)(k) = nil
}
e := add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.elemsize))
if t.elem.ptrdata != 0 {
@@ -348,6 +352,11 @@ search:
}
notLast:
h.count--
+ // Reset the hash seed to make it more difficult for attackers to
+ // repeatedly trigger hash collisions. See issue 25237.
+ if h.count == 0 {
+ h.hash0 = fastrand()
+ }
break search
}
}
diff --git a/libgo/go/runtime/map_fast64.go b/libgo/go/runtime/map_fast64.go
index 26c60ae..14bdcee 100644
--- a/libgo/go/runtime/map_fast64.go
+++ b/libgo/go/runtime/map_fast64.go
@@ -122,7 +122,7 @@ again:
if h.growing() {
growWork_fast64(t, h, bucket)
}
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
var insertb *bmap
var inserti uintptr
@@ -166,7 +166,7 @@ bucketloop:
}
if insertb == nil {
- // all current buckets are full, allocate a new one.
+ // The current bucket and all the overflow buckets connected to it are full, allocate a new one.
insertb = h.newoverflow(t, b)
inserti = 0 // not necessary, but avoids needlessly spilling inserti
}
@@ -212,7 +212,7 @@ again:
if h.growing() {
growWork_fast64(t, h, bucket)
}
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
var insertb *bmap
var inserti uintptr
@@ -256,7 +256,7 @@ bucketloop:
}
if insertb == nil {
- // all current buckets are full, allocate a new one.
+ // The current bucket and all the overflow buckets connected to it are full, allocate a new one.
insertb = h.newoverflow(t, b)
inserti = 0 // not necessary, but avoids needlessly spilling inserti
}
@@ -308,7 +308,13 @@ search:
}
// Only clear key if there are pointers in it.
if t.key.ptrdata != 0 {
- memclrHasPointers(k, t.key.size)
+ if sys.PtrSize == 8 {
+ *(*unsafe.Pointer)(k) = nil
+ } else {
+ // There are three ways to squeeze one or more 32 bit pointers into 64 bits.
+ // Just call memclrHasPointers instead of trying to handle all cases here.
+ memclrHasPointers(k, 8)
+ }
}
e := add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.elemsize))
if t.elem.ptrdata != 0 {
@@ -348,6 +354,11 @@ search:
}
notLast:
h.count--
+ // Reset the hash seed to make it more difficult for attackers to
+ // repeatedly trigger hash collisions. See issue 25237.
+ if h.count == 0 {
+ h.hash0 = fastrand()
+ }
break search
}
}
diff --git a/libgo/go/runtime/map_faststr.go b/libgo/go/runtime/map_faststr.go
index 1775214..2205947 100644
--- a/libgo/go/runtime/map_faststr.go
+++ b/libgo/go/runtime/map_faststr.go
@@ -232,7 +232,7 @@ again:
if h.growing() {
growWork_faststr(t, h, bucket)
}
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
top := tophash(hash)
var insertb *bmap
@@ -281,7 +281,7 @@ bucketloop:
}
if insertb == nil {
- // all current buckets are full, allocate a new one.
+ // The current bucket and all the overflow buckets connected to it are full, allocate a new one.
insertb = h.newoverflow(t, b)
inserti = 0 // not necessary, but avoids needlessly spilling inserti
}
@@ -376,6 +376,11 @@ search:
}
notLast:
h.count--
+ // Reset the hash seed to make it more difficult for attackers to
+ // repeatedly trigger hash collisions. See issue 25237.
+ if h.count == 0 {
+ h.hash0 = fastrand()
+ }
break search
}
}
diff --git a/libgo/go/runtime/map_test.go b/libgo/go/runtime/map_test.go
index b13d269..b9a3457 100644
--- a/libgo/go/runtime/map_test.go
+++ b/libgo/go/runtime/map_test.go
@@ -1009,6 +1009,27 @@ func benchmarkMapDeleteStr(b *testing.B, n int) {
}
}
+func benchmarkMapDeletePointer(b *testing.B, n int) {
+ i2p := make([]*int, n)
+ for i := 0; i < n; i++ {
+ i2p[i] = new(int)
+ }
+ a := make(map[*int]int, n)
+ b.ResetTimer()
+ k := 0
+ for i := 0; i < b.N; i++ {
+ if len(a) == 0 {
+ b.StopTimer()
+ for j := 0; j < n; j++ {
+ a[i2p[j]] = j
+ }
+ k = i
+ b.StartTimer()
+ }
+ delete(a, i2p[i-k])
+ }
+}
+
func runWith(f func(*testing.B, int), v ...int) func(*testing.B) {
return func(b *testing.B) {
for _, n := range v {
@@ -1039,6 +1060,7 @@ func BenchmarkMapDelete(b *testing.B) {
b.Run("Int32", runWith(benchmarkMapDeleteInt32, 100, 1000, 10000))
b.Run("Int64", runWith(benchmarkMapDeleteInt64, 100, 1000, 10000))
b.Run("Str", runWith(benchmarkMapDeleteStr, 100, 1000, 10000))
+ b.Run("Pointer", runWith(benchmarkMapDeletePointer, 100, 1000, 10000))
}
func TestDeferDeleteSlow(t *testing.T) {
diff --git a/libgo/go/runtime/mbarrier.go b/libgo/go/runtime/mbarrier.go
index 3bd8b34..3399033 100644
--- a/libgo/go/runtime/mbarrier.go
+++ b/libgo/go/runtime/mbarrier.go
@@ -270,28 +270,7 @@ func typedslicecopy(typ *_type, dstPtr unsafe.Pointer, dstLen int, srcPtr unsafe
//go:linkname reflect_typedslicecopy reflect.typedslicecopy
func reflect_typedslicecopy(elemType *_type, dst, src slice) int {
if elemType.ptrdata == 0 {
- n := dst.len
- if n > src.len {
- n = src.len
- }
- if n == 0 {
- return 0
- }
-
- size := uintptr(n) * elemType.size
- if raceenabled {
- callerpc := getcallerpc()
- pc := funcPC(reflect_typedslicecopy)
- racewriterangepc(dst.array, size, callerpc, pc)
- racereadrangepc(src.array, size, callerpc, pc)
- }
- if msanenabled {
- msanwrite(dst.array, size)
- msanread(src.array, size)
- }
-
- memmove(dst.array, src.array, size)
- return n
+ return slicecopy(dst.array, dst.len, src.array, src.len, elemType.size)
}
return typedslicecopy(elemType, dst.array, dst.len, src.array, src.len)
}
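
After the change above, reflect_typedslicecopy's pointer-free path simply defers to slicecopy, that is, a bounds-clamped copy of min(dst.len, src.len) elements done as one memmove. At the language level the built-in copy already has those semantics; a small sketch of the equivalent behaviour for a pointer-free element type:

package main

import "fmt"

// copyNoPointers copies min(len(dst), len(src)) elements and returns the count,
// matching the semantics the runtime fast path provides for pointer-free types.
func copyNoPointers(dst, src []int32) int {
	n := len(dst)
	if len(src) < n {
		n = len(src)
	}
	copy(dst[:n], src[:n]) // one memmove of n*sizeof(int32) bytes under the hood
	return n
}

func main() {
	dst := make([]int32, 3)
	src := []int32{1, 2, 3, 4, 5}
	fmt.Println(copyNoPointers(dst, src), dst) // 3 [1 2 3]
}
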
diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go
index 7acd5d1..0eb19d6 100644
--- a/libgo/go/runtime/mbitmap.go
+++ b/libgo/go/runtime/mbitmap.go
@@ -6,10 +6,11 @@
//
// Stack, data, and bss bitmaps
//
-// Stack frames and global variables in the data and bss sections are described
-// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
-// to be visited during GC. The bits in each byte are consumed starting with
-// the low bit: 1<<0, 1<<1, and so on.
+// Stack frames and global variables in the data and bss sections are
+// described by bitmaps with 1 bit per pointer-sized word. A "1" bit
+// means the word is a live pointer to be visited by the GC (referred to
+// as "pointer"). A "0" bit means the word should be ignored by GC
+// (referred to as "scalar", though it could be a dead pointer value).
//
// Heap bitmap
//
@@ -20,57 +21,27 @@
// through start+3*ptrSize, ha.bitmap[1] holds the entries for
// start+4*ptrSize through start+7*ptrSize, and so on.
//
-// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
-// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
-// The meaning of the high bit depends on the position of the word being described
-// in its allocated object. In all words *except* the second word, the
-// high bit indicates that the object is still being described. In
-// these words, if a bit pair with a high bit 0 is encountered, the
-// low bit can also be assumed to be 0, and the object description is
-// over. This 00 is called the ``dead'' encoding: it signals that the
-// rest of the words in the object are uninteresting to the garbage
-// collector.
-//
-// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
+// In each 2-bit entry, the lower bit is a pointer/scalar bit, just
+// like in the stack/data bitmaps described above. The upper bit
+// indicates scan/dead: a "1" value ("scan") indicates that there may
+// be pointers in later words of the allocation, and a "0" value
+// ("dead") indicates there are no more pointers in the allocation. If
+// the upper bit is 0, the lower bit must also be 0, and this
+// indicates scanning can ignore the rest of the allocation.
//
// The 2-bit entries are split when written into the byte, so that the top half
-// of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
-// bits.
-// This form allows a copy from the 1-bit to the 4-bit form to keep the
-// pointer bits contiguous, instead of having to space them out.
+// of the byte contains 4 high (scan) bits and the bottom half contains 4 low
+// (pointer) bits. This form allows a copy from the 1-bit to the 4-bit form to
+// keep the pointer bits contiguous, instead of having to space them out.
//
-// The code makes use of the fact that the zero value for a heap bitmap
-// has no live pointer bit set and is (depending on position), not used,
-// not checkmarked, and is the dead encoding.
-// These properties must be preserved when modifying the encoding.
+// The code makes use of the fact that the zero value for a heap
+// bitmap means scalar/dead. This property must be preserved when
+// modifying the encoding.
//
// The bitmap for noscan spans is not maintained. Code must ensure
// that an object is scannable before consulting its bitmap by
// checking either the noscan bit in the span or by consulting its
// type's information.
-//
-// Checkmarks
-//
-// In a concurrent garbage collector, one worries about failing to mark
-// a live object due to mutations without write barriers or bugs in the
-// collector implementation. As a sanity check, the GC has a 'checkmark'
-// mode that retraverses the object graph with the world stopped, to make
-// sure that everything that should be marked is marked.
-// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
-// for the second word of the object holds the checkmark bit.
-// When not in checkmark mode, this bit is set to 1.
-//
-// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
-// means every allocated object has two words, so there is room for the
-// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
-// just one word, so the second bit pair is not available for encoding the
-// checkmark. However, because non-pointer allocations are combined
-// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
-// must be a pointer, so the type bit in the first word is not actually needed.
-// It is still used in general, except in checkmark the type bit is repurposed
-// as the checkmark bit and then reinitialized (to 1) as the type bit when
-// finished.
-//
package runtime
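
The rewritten comment above pins down the heap bitmap layout: each heap word gets a 2-bit entry whose low bit means pointer/scalar and whose high bit means scan/dead, and within a bitmap byte the four pointer bits occupy the low nibble while the four scan bits occupy the high nibble. Decoding one word's pair from a byte is then two shifts and masks; a hedged sketch:

package main

import "fmt"

// decode returns (pointer, scan) for word i (0..3) of one heap-bitmap byte,
// following the split layout: low nibble = pointer bits, high nibble = scan bits.
func decode(b byte, i uint) (pointer, scan bool) {
	pointer = (b>>i)&1 == 1
	scan = (b>>(i+4))&1 == 1
	return
}

func main() {
	// Example byte: words 0 and 2 are pointers; words 0..2 still need scanning.
	b := byte(0x75) // scan bits 0111, pointer bits 0101
	for i := uint(0); i < 4; i++ {
		p, s := decode(b, i)
		fmt.Printf("word %d: pointer=%v scan=%v\n", i, p, s)
	}
}
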
@@ -565,33 +536,6 @@ func (h heapBits) isPointer() bool {
return h.bits()&bitPointer != 0
}
-// isCheckmarked reports whether the heap bits have the checkmarked bit set.
-// It must be told how large the object at h is, because the encoding of the
-// checkmark bit varies by size.
-// h must describe the initial word of the object.
-func (h heapBits) isCheckmarked(size uintptr) bool {
- if size == sys.PtrSize {
- return (*h.bitp>>h.shift)&bitPointer != 0
- }
- // All multiword objects are 2-word aligned,
- // so we know that the initial word's 2-bit pair
- // and the second word's 2-bit pair are in the
- // same heap bitmap byte, *h.bitp.
- return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
-}
-
-// setCheckmarked sets the checkmarked bit.
-// It must be told how large the object at h is, because the encoding of the
-// checkmark bit varies by size.
-// h must describe the initial word of the object.
-func (h heapBits) setCheckmarked(size uintptr) {
- if size == sys.PtrSize {
- atomic.Or8(h.bitp, bitPointer<<h.shift)
- return
- }
- atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
-}
-
// bulkBarrierPreWrite executes a write barrier
// for every pointer slot in the memory range [src, src+size),
// using pointer/scalar information from [dst, dst+size).
@@ -815,7 +759,6 @@ func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
// TODO(rsc): Perhaps introduce a different heapBitsSpan type.
// initSpan initializes the heap bitmap for a span.
-// It clears all checkmark bits.
// If this is a span of pointer-sized objects, it initializes all
// words to pointer/scan.
// Otherwise, it initializes all words to scalar/dead.
@@ -846,45 +789,6 @@ func (h heapBits) initSpan(s *mspan) {
}
}
-// initCheckmarkSpan initializes a span for being checkmarked.
-// It clears the checkmark bits, which are set to 1 in normal operation.
-func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
- // The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
- if sys.PtrSize == 8 && size == sys.PtrSize {
- // Checkmark bit is type bit, bottom bit of every 2-bit entry.
- // Only possible on 64-bit system, since minimum size is 8.
- // Must clear type bit (checkmark bit) of every word.
- // The type bit is the lower of every two-bit pair.
- for i := uintptr(0); i < n; i += wordsPerBitmapByte {
- *h.bitp &^= bitPointerAll
- h = h.forward(wordsPerBitmapByte)
- }
- return
- }
- for i := uintptr(0); i < n; i++ {
- *h.bitp &^= bitScan << (heapBitsShift + h.shift)
- h = h.forward(size / sys.PtrSize)
- }
-}
-
-// clearCheckmarkSpan undoes all the checkmarking in a span.
-// The actual checkmark bits are ignored, so the only work to do
-// is to fix the pointer bits. (Pointer bits are ignored by scanobject
-// but consulted by typedmemmove.)
-func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
- // The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
- if sys.PtrSize == 8 && size == sys.PtrSize {
- // Checkmark bit is type bit, bottom bit of every 2-bit entry.
- // Only possible on 64-bit system, since minimum size is 8.
- // Must clear type bit (checkmark bit) of every word.
- // The type bit is the lower of every two-bit pair.
- for i := uintptr(0); i < n; i += wordsPerBitmapByte {
- *h.bitp |= bitPointerAll
- h = h.forward(wordsPerBitmapByte)
- }
- }
-}
-
// countAlloc returns the number of objects allocated in span s by
// scanning the allocation bitmap.
func (s *mspan) countAlloc() int {
@@ -931,6 +835,12 @@ func (s *mspan) countAlloc() int {
func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
const doubleCheck = false // slow but helpful; enable to test modifications to this code
+ const (
+ mask1 = bitPointer | bitScan // 00010001
+ mask2 = bitPointer | bitScan | mask1<<heapBitsShift // 00110011
+ mask3 = bitPointer | bitScan | mask2<<heapBitsShift // 01110111
+ )
+
// dataSize is always size rounded up to the next malloc size class,
// except in the case of allocating a defer block, in which case
// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
@@ -959,11 +869,12 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
h := heapBitsForAddr(x)
ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
- // Heap bitmap bits for 2-word object are only 4 bits,
- // so also shared with objects next to it.
- // This is called out as a special case primarily for 32-bit systems,
- // so that on 32-bit systems the code below can assume all objects
- // are 4-word aligned (because they're all 16-byte aligned).
+ // 2-word objects only have 4 bitmap bits and 3-word objects only have 6 bitmap bits.
+ // Therefore, these objects share a heap bitmap byte with the objects next to them.
+ // These are called out as a special case primarily so the code below can assume all
+ // objects are at least 4 words long and that their bitmaps start either at the beginning
+ // of a bitmap byte, or half-way in (h.shift of 0 and 2 respectively).
+
if size == 2*sys.PtrSize {
if typ.size == sys.PtrSize {
// We're allocating a block big enough to hold two pointers.
@@ -977,11 +888,11 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
if sys.PtrSize == 4 && dataSize == sys.PtrSize {
// 1 pointer object. On 32-bit machines clear the bit for the
// unused second word.
- *h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
+ *h.bitp &^= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift
*h.bitp |= (bitPointer | bitScan) << h.shift
} else {
- // 2-element slice of pointer.
- *h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
+ // 2-element array of pointer.
+ *h.bitp |= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift
}
return
}
@@ -994,14 +905,77 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
}
}
b := uint32(*ptrmask)
- hb := (b & 3) | bitScan
- // bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
- // 110011 is shifted h.shift and complemented.
- // This clears out the bits that are about to be
- // ored into *h.hbitp in the next instructions.
+ hb := b & 3
+ hb |= bitScanAll & ((bitScan << (typ.ptrdata / sys.PtrSize)) - 1)
+ // Clear the bits for this object so we can set the
+ // appropriate ones.
*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
*h.bitp |= uint8(hb << h.shift)
return
+ } else if size == 3*sys.PtrSize {
+ b := uint8(*ptrmask)
+ if doubleCheck {
+ if b == 0 {
+ println("runtime: invalid type ", typ.string())
+ throw("heapBitsSetType: called with non-pointer type")
+ }
+ if sys.PtrSize != 8 {
+ throw("heapBitsSetType: unexpected 3 pointer wide size class on 32 bit")
+ }
+ if typ.kind&kindGCProg != 0 {
+ throw("heapBitsSetType: unexpected GC prog for 3 pointer wide size class")
+ }
+ if typ.size == 2*sys.PtrSize {
+ print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, "\n")
+ throw("heapBitsSetType: inconsistent object sizes")
+ }
+ }
+ if typ.size == sys.PtrSize {
+ // The type contains a pointer; otherwise heapBitsSetType wouldn't have been called.
+ // Since the type is only 1 pointer wide and contains a pointer, its gcdata must be exactly 1.
+ if doubleCheck && *typ.gcdata != 1 {
+ print("runtime: heapBitsSetType size=", size, " typ.size=", typ.size, "but *typ.gcdata", *typ.gcdata, "\n")
+ throw("heapBitsSetType: unexpected gcdata for 1 pointer wide type size in 3 pointer wide size class")
+ }
+ // 3 element array of pointers. Unrolling ptrmask 3 times into b yields 00000111.
+ b = 7
+ }
+
+ hb := b & 7
+ // Set bitScan bits for all pointers.
+ hb |= hb << wordsPerBitmapByte
+ // First bitScan bit is always set since the type contains pointers.
+ hb |= bitScan
+ // Second bitScan bit needs to also be set if the third bitScan bit is set.
+ hb |= hb & (bitScan << (2 * heapBitsShift)) >> 1
+
+ // For h.shift > 1 heap bits cross a byte boundary and need to be written part
+ // to h.bitp and part to the next h.bitp.
+ switch h.shift {
+ case 0:
+ *h.bitp &^= mask3 << 0
+ *h.bitp |= hb << 0
+ case 1:
+ *h.bitp &^= mask3 << 1
+ *h.bitp |= hb << 1
+ case 2:
+ *h.bitp &^= mask2 << 2
+ *h.bitp |= (hb & mask2) << 2
+ // Two words written to the first byte.
+ // Advance two words to get to the next byte.
+ h = h.next().next()
+ *h.bitp &^= mask1
+ *h.bitp |= (hb >> 2) & mask1
+ case 3:
+ *h.bitp &^= mask1 << 3
+ *h.bitp |= (hb & mask1) << 3
+ // One word written to the first byte.
+ // Advance one word to get to the next byte.
+ h = h.next()
+ *h.bitp &^= mask2
+ *h.bitp |= (hb >> 1) & mask2
+ }
+ return
}
// Copy from 1-bit ptrmask into 2-bit bitmap.
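
The 3-word case above writes six bitmap bits that can straddle a byte boundary, using mask1/mask2/mask3 to select how much lands in each byte. Below is a self-contained sketch of just that splitting step over a two-byte window; the constants match the comments above, while writeThreeEntries is an invented name used only for illustration.

    package main

    import "fmt"

    const (
        bitPointer = 1 << 0
        bitScan    = 1 << 4

        mask1 = bitPointer | bitScan // 00010001: one 2-bit entry
        mask2 = mask1 | mask1<<1     // 00110011: two entries
        mask3 = mask2 | mask1<<2     // 01110111: three entries
    )

    // writeThreeEntries writes a packed 3-entry group hb (pointer bits in
    // bits 0-2, scan bits in bits 4-6) into a two-byte window at entry
    // offset shift (0-3), splitting across the byte boundary the same way
    // the switch above does.
    func writeThreeEntries(win *[2]byte, hb byte, shift uint) {
        switch shift {
        case 0, 1:
            win[0] &^= mask3 << shift
            win[0] |= hb << shift
        case 2:
            win[0] &^= mask2 << 2
            win[0] |= (hb & mask2) << 2
            win[1] &^= mask1
            win[1] |= (hb >> 2) & mask1
        case 3:
            win[0] &^= mask1 << 3
            win[0] |= (hb & mask1) << 3
            win[1] &^= mask2
            win[1] |= (hb >> 1) & mask2
        }
    }

    func main() {
        var win [2]byte
        hb := byte(7 | 7<<4) // three pointer words, all marked scan
        writeThreeEntries(&win, hb, 3)
        fmt.Printf("%08b %08b\n", win[0], win[1]) // 10001000 00110011
    }
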
@@ -1175,11 +1149,6 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
throw("heapBitsSetType: called with non-pointer type")
return
}
- if nw < 2 {
- // Must write at least 2 words, because the "no scan"
- // encoding doesn't take effect until the third word.
- nw = 2
- }
// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==2).
// The leading byte is special because it contains the bits for word 1,
@@ -1192,21 +1161,22 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
case h.shift == 0:
// Ptrmask and heap bitmap are aligned.
- // Handle first byte of bitmap specially.
+ //
+ // This is a fast path for small objects.
//
// The first byte we write out covers the first four
// words of the object. The scan/dead bit on the first
// word must be set to scan since there are pointers
- // somewhere in the object. The scan/dead bit on the
- // second word is the checkmark, so we don't set it.
+ // somewhere in the object.
// In all following words, we set the scan/dead
- // appropriately to indicate that the object contains
+ // appropriately to indicate that the object continues
// to the next 2-bit entry in the bitmap.
//
- // TODO: It doesn't matter if we set the checkmark, so
- // maybe this case isn't needed any more.
+ // We set four bits at a time here, but if the object
+ // is fewer than four words, phase 3 will clear
+ // unnecessary bits.
hb = b & bitPointerAll
- hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
+ hb |= bitScanAll
if w += 4; w >= nw {
goto Phase3
}
@@ -1215,26 +1185,35 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
b >>= 4
nb -= 4
- case sys.PtrSize == 8 && h.shift == 2:
+ case h.shift == 2:
// Ptrmask and heap bitmap are misaligned.
+ //
+ // On 32 bit architectures only the 6-word object that corresponds
+ // to the 24 byte size class can start with h.shift of 2 here, since
+ // all other non-16-byte-aligned size classes have been handled by
+ // special code paths at the beginning of heapBitsSetType on 32 bit.
+ //
+ // Many size classes are only 16 byte aligned. On 64 bit architectures
+ // this results in a heap bitmap position starting with a h.shift of 2.
+ //
// The bits for the first two words are in a byte shared
// with another object, so we must be careful with the bits
// already there.
- // We took care of 1-word and 2-word objects above,
+ //
+ // We took care of 1-word, 2-word, and 3-word objects above,
// so this is at least a 6-word object.
hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
- // This is not noscan, so set the scan bit in the
- // first word.
hb |= bitScan << (2 * heapBitsShift)
+ if nw > 1 {
+ hb |= bitScan << (3 * heapBitsShift)
+ }
b >>= 2
nb -= 2
- // Note: no bitScan for second word because that's
- // the checkmark.
- *hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
+ *hbitp &^= uint8((bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << (2 * heapBitsShift))
*hbitp |= uint8(hb)
hbitp = add1(hbitp)
if w += 2; w >= nw {
- // We know that there is more data, because we handled 2-word objects above.
+ // We know that there is more data, because we handled 2-word and 3-word objects above.
// This must be at least a 6-word object. If we're out of pointer words,
// mark no scan in next bitmap byte and finish.
hb = 0
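
The possible h.shift values here follow from packing four word entries into each bitmap byte. A tiny standalone illustration, assuming 8-byte words and treating addresses as arena-relative offsets (entryIndex is an invented helper, not the runtime's heapBitsForAddr):

    package main

    import "fmt"

    const ptrSize = 8 // 64-bit word size assumed

    // entryIndex returns which of the four 2-bit entries in a heap bitmap
    // byte describes the word at the given arena-relative offset.
    func entryIndex(off uintptr) uintptr { return (off / ptrSize) % 4 }

    func main() {
        // An object aligned to 16 bytes but not 32 bytes starts at an odd
        // multiple of 16, so its first word lands on entry 2 (h.shift == 2).
        fmt.Println(entryIndex(16)) // 2
        fmt.Println(entryIndex(32)) // 0
        fmt.Println(entryIndex(48)) // 2
    }
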
@@ -1369,12 +1348,12 @@ Phase4:
// Handle the first byte specially if it's shared. See
// Phase 1 for why this is the only special case we need.
if doubleCheck {
- if !(h.shift == 0 || (sys.PtrSize == 8 && h.shift == 2)) {
+ if !(h.shift == 0 || h.shift == 2) {
print("x=", x, " size=", size, " cnw=", h.shift, "\n")
throw("bad start shift")
}
}
- if sys.PtrSize == 8 && h.shift == 2 {
+ if h.shift == 2 {
*h.bitp = *h.bitp&^((bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift)<<(2*heapBitsShift)) | *src
h = h.next().next()
cnw -= 2
@@ -1423,17 +1402,20 @@ Phase4:
// Double check the whole bitmap.
if doubleCheck {
// x+size may not point to the heap, so back up one
- // word and then call next().
- end := heapBitsForAddr(x + size - sys.PtrSize).next()
- endAI := arenaIdx(end.arena)
- if !outOfPlace && (end.bitp == nil || (end.shift == 0 && end.bitp == &mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0])) {
- // The unrolling code above walks hbitp just
- // past the bitmap without moving to the next
- // arena. Synthesize this for end.bitp.
- end.arena--
- endAI = arenaIdx(end.arena)
- end.bitp = addb(&mheap_.arenas[endAI.l1()][endAI.l2()].bitmap[0], heapArenaBitmapBytes)
- end.last = nil
+ // word and then advance it the way we do above.
+ end := heapBitsForAddr(x + size - sys.PtrSize)
+ if outOfPlace {
+ // In out-of-place copying, we just advance
+ // using next.
+ end = end.next()
+ } else {
+ // Don't use next because that may advance to
+ // the next arena and the in-place logic
+ // doesn't do that.
+ end.shift += heapBitsShift
+ if end.shift == 4*heapBitsShift {
+ end.bitp, end.shift = add1(end.bitp), 0
+ }
}
if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
@@ -1457,19 +1439,16 @@ Phase4:
var have, want uint8
have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
if i >= totalptr {
- want = 0 // deadmarker
if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
+ // heapBitsSetTypeGCProg always fills
+ // in full nibbles of bitScan.
want = bitScan
}
} else {
if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
want |= bitPointer
}
- if i != 1 {
- want |= bitScan
- } else {
- have &^= bitScan
- }
+ want |= bitScan
}
if have != want {
println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
@@ -1909,12 +1888,12 @@ func materializeGCProg(ptrdata uintptr, prog *byte) *mspan {
bitmapBytes := divRoundUp(ptrdata, 8*sys.PtrSize)
// Compute the number of pages needed for bitmapBytes.
pages := divRoundUp(bitmapBytes, pageSize)
- s := mheap_.allocManual(pages, &memstats.gc_sys)
+ s := mheap_.allocManual(pages, spanAllocPtrScalarBits)
runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1)
return s
}
func dematerializeGCProg(s *mspan) {
- mheap_.freeManual(s, &memstats.gc_sys)
+ mheap_.freeManual(s, spanAllocPtrScalarBits)
}
func dumpGCProg(p *byte) {
@@ -2009,7 +1988,7 @@ func getgcmask(ep interface{}) (mask []byte) {
if hbits.isPointer() {
mask[i/sys.PtrSize] = 1
}
- if i != 1*sys.PtrSize && !hbits.morePointers() {
+ if !hbits.morePointers() {
mask = mask[:i/sys.PtrSize]
break
}
diff --git a/libgo/go/runtime/mcache.go b/libgo/go/runtime/mcache.go
index ba52624..32622e6 100644
--- a/libgo/go/runtime/mcache.go
+++ b/libgo/go/runtime/mcache.go
@@ -10,6 +10,7 @@ import (
)
// Per-thread (in Go, per-P) cache for small objects.
+// This includes a small object cache and local allocation stats.
// No locking needed because it is per-thread (per-P).
//
// mcaches are allocated from non-GC'd memory, so any heap pointers
@@ -19,8 +20,8 @@ import (
type mcache struct {
// The following members are accessed on every malloc,
// so they are grouped here for better caching.
- next_sample uintptr // trigger heap sample after allocating this many bytes
- local_scan uintptr // bytes of scannable heap allocated
+ nextSample uintptr // trigger heap sample after allocating this many bytes
+ scanAlloc uintptr // bytes of scannable heap allocated
// Allocator cache for tiny objects w/o pointers.
// See "Tiny allocator" comment in malloc.go.
@@ -31,19 +32,17 @@ type mcache struct {
// tiny is a heap pointer. Since mcache is in non-GC'd memory,
// we handle it by clearing it in releaseAll during mark
// termination.
- tiny uintptr
- tinyoffset uintptr
- local_tinyallocs uintptr // number of tiny allocs not counted in other stats
+ //
+ // tinyAllocs is the number of tiny allocations performed
+ // by the P that owns this mcache.
+ tiny uintptr
+ tinyoffset uintptr
+ tinyAllocs uintptr
// The rest is not accessed on every malloc.
alloc [numSpanClasses]*mspan // spans to allocate from, indexed by spanClass
- // Local allocator stats, flushed during GC.
- local_largefree uintptr // bytes freed for large objects (>maxsmallsize)
- local_nlargefree uintptr // number of frees for large objects (>maxsmallsize)
- local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
-
// flushGen indicates the sweepgen during which this mcache
// was last flushed. If flushGen != mheap_.sweepgen, the spans
 // in this mcache are stale and need to be flushed so they
@@ -86,10 +85,16 @@ func allocmcache() *mcache {
for i := range c.alloc {
c.alloc[i] = &emptymspan
}
- c.next_sample = nextSample()
+ c.nextSample = nextSample()
return c
}
+// freemcache releases resources associated with this
+// mcache and puts the object onto a free list.
+//
+// In some cases there is no way to simply release
+// resources, such as statistics, so donate them to
+// a different mcache (the recipient).
func freemcache(c *mcache) {
systemstack(func() {
c.releaseAll()
@@ -100,12 +105,31 @@ func freemcache(c *mcache) {
// gcworkbuffree(c.gcworkbuf)
lock(&mheap_.lock)
- purgecachedstats(c)
mheap_.cachealloc.free(unsafe.Pointer(c))
unlock(&mheap_.lock)
})
}
+// getMCache is a convenience function which tries to obtain an mcache.
+//
+// Returns nil if we're not bootstrapping and we don't have a P. The caller's
+// P must not change, so we must be in a non-preemptible state.
+func getMCache() *mcache {
+ // Grab the mcache, since that's where stats live.
+ pp := getg().m.p.ptr()
+ var c *mcache
+ if pp == nil {
+ // We will be called without a P while bootstrapping,
+ // in which case we use mcache0, which is set in mallocinit.
+ // mcache0 is cleared when bootstrapping is complete,
+ // by procresize.
+ c = mcache0
+ } else {
+ c = pp.mcache
+ }
+ return c
+}
+
// refill acquires a new span of span class spc for c. This span will
// have at least one free object. The current span in c must be full.
//
@@ -123,11 +147,7 @@ func (c *mcache) refill(spc spanClass) {
if s.sweepgen != mheap_.sweepgen+3 {
throw("bad sweepgen in refill")
}
- if go115NewMCentralImpl {
- mheap_.central[spc].mcentral.uncacheSpan(s)
- } else {
- atomic.Store(&s.sweepgen, mheap_.sweepgen)
- }
+ mheap_.central[spc].mcentral.uncacheSpan(s)
}
// Get a new cached span from the central lists.
@@ -144,13 +164,107 @@ func (c *mcache) refill(spc spanClass) {
// sweeping in the next sweep phase.
s.sweepgen = mheap_.sweepgen + 3
+ // Assume all objects from this span will be allocated in the
+ // mcache. If it gets uncached, we'll adjust this.
+ stats := memstats.heapStats.acquire()
+ atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount))
+ memstats.heapStats.release()
+
+ // Update heap_live with the same assumption.
+ usedBytes := uintptr(s.allocCount) * s.elemsize
+ atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
+
+ // Flush tinyAllocs.
+ if spc == tinySpanClass {
+ atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+ c.tinyAllocs = 0
+ }
+
+ // While we're here, flush scanAlloc, since we have to call
+ // revise anyway.
+ atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+ c.scanAlloc = 0
+
+ if trace.enabled {
+ // heap_live changed.
+ traceHeapAlloc()
+ }
+ if gcBlackenEnabled != 0 {
+ // heap_live and heap_scan changed.
+ gcController.revise()
+ }
+
c.alloc[spc] = s
}
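
The conservative accounting above is easiest to see with a little arithmetic. The sketch below uses made-up span geometry and the runtime's default 8 KiB page size; the function names are illustrative only.

    package main

    import "fmt"

    const pageSize = 8192 // the runtime's default page size

    // refillDelta is the amount added to heap_live when a span is handed
    // to an mcache: the whole span is charged as if every free slot will
    // be allocated.
    func refillDelta(npages, elemsize, allocCount uintptr) int64 {
        return int64(npages*pageSize) - int64(allocCount*elemsize)
    }

    // uncacheUndo is what releaseAll/uncacheSpan subtract later for the
    // slots that were never allocated while the span was cached.
    func uncacheUndo(elemsize, nelems, allocCount uintptr) int64 {
        return -int64((nelems - allocCount) * elemsize)
    }

    func main() {
        // One 8 KiB page of 64-byte objects (128 slots), 10 already in use
        // when cached, and 30 more allocated before the span is returned.
        fmt.Println(refillDelta(1, 64, 10))   // 7552 charged up front
        fmt.Println(uncacheUndo(64, 128, 40)) // -5632 credited back
    }

The up-front charge and the later credit net out to exactly the bytes of objects actually allocated from the span while it was cached (here 30*64 = 1920 bytes).
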
+// allocLarge allocates a span for a large object.
+func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
+ if size+_PageSize < size {
+ throw("out of memory")
+ }
+ npages := size >> _PageShift
+ if size&_PageMask != 0 {
+ npages++
+ }
+
+ // Deduct credit for this span allocation and sweep if
+ // necessary. mHeap_Alloc will also sweep npages, so this only
+ // pays the debt down to npage pages.
+ deductSweepCredit(npages*_PageSize, npages)
+
+ spc := makeSpanClass(0, noscan)
+ s := mheap_.alloc(npages, spc, needzero)
+ if s == nil {
+ throw("out of memory")
+ }
+ stats := memstats.heapStats.acquire()
+ atomic.Xadduintptr(&stats.largeAlloc, npages*pageSize)
+ atomic.Xadduintptr(&stats.largeAllocCount, 1)
+ memstats.heapStats.release()
+
+ // Update heap_live and revise pacing if needed.
+ atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
+ if trace.enabled {
+ // Trace that a heap alloc occurred because heap_live changed.
+ traceHeapAlloc()
+ }
+ if gcBlackenEnabled != 0 {
+ gcController.revise()
+ }
+
+ // Put the large span in the mcentral swept list so that it's
+ // visible to the background sweeper.
+ mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
+ s.limit = s.base() + size
+ heapBitsForAddr(s.base()).initSpan(s)
+ return s
+}
+
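
The page count computed at the top of allocLarge is a round-up division with an overflow guard. A standalone sketch, with local stand-ins for _PageShift/_PageSize/_PageMask and the runtime's default 8 KiB page size assumed:

    package main

    import "fmt"

    const (
        pageShift = 13
        pageSize  = 1 << pageShift // 8192
        pageMask  = pageSize - 1
    )

    // pagesFor mirrors the npages computation in allocLarge: round the
    // object size up to whole pages, after checking that the rounding
    // itself cannot overflow.
    func pagesFor(size uintptr) uintptr {
        if size+pageSize < size {
            panic("out of memory") // size so large that rounding up overflows
        }
        npages := size >> pageShift
        if size&pageMask != 0 {
            npages++
        }
        return npages
    }

    func main() {
        fmt.Println(pagesFor(1))         // 1
        fmt.Println(pagesFor(8192))      // 1
        fmt.Println(pagesFor(8193))      // 2
        fmt.Println(pagesFor(100 << 10)) // 13
    }
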
func (c *mcache) releaseAll() {
+ // Take this opportunity to flush scanAlloc.
+ atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+ c.scanAlloc = 0
+
+ sg := mheap_.sweepgen
for i := range c.alloc {
s := c.alloc[i]
if s != &emptymspan {
+ // Adjust smallAllocCount in case the span wasn't fully allocated.
+ n := uintptr(s.nelems) - uintptr(s.allocCount)
+ stats := memstats.heapStats.acquire()
+ atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n)
+ memstats.heapStats.release()
+ if s.sweepgen != sg+1 {
+ // refill conservatively counted unallocated slots in heap_live.
+ // Undo this.
+ //
+ // If this span was cached before sweep, then
+ // heap_live was totally recomputed since
+ // caching this span, so we don't do this for
+ // stale spans.
+ atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+ }
+ // Release the span to the mcentral.
mheap_.central[i].mcentral.uncacheSpan(s)
c.alloc[i] = &emptymspan
}
@@ -158,6 +272,13 @@ func (c *mcache) releaseAll() {
// Clear tinyalloc pool.
c.tiny = 0
c.tinyoffset = 0
+ atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+ c.tinyAllocs = 0
+
+ // Updated heap_scan and possibly heap_live.
+ if gcBlackenEnabled != 0 {
+ gcController.revise()
+ }
}
// prepareForSweep flushes c if the system has entered a new sweep phase
diff --git a/libgo/go/runtime/mcentral.go b/libgo/go/runtime/mcentral.go
index ed49d86..cd20dec 100644
--- a/libgo/go/runtime/mcentral.go
+++ b/libgo/go/runtime/mcentral.go
@@ -18,13 +18,8 @@ import "runtime/internal/atomic"
//
//go:notinheap
type mcentral struct {
- lock mutex
spanclass spanClass
- // For !go115NewMCentralImpl.
- nonempty mSpanList // list of spans with a free object, ie a nonempty free list
- empty mSpanList // list of spans with no free objects (or cached in an mcache)
-
// partial and full contain two mspan sets: one of swept in-use
// spans, and one of unswept in-use spans. These two trade
// roles on each GC cycle. The unswept set is drained either by
@@ -45,26 +40,15 @@ type mcentral struct {
// encounter swept spans, and these should be ignored.
partial [2]spanSet // list of spans with a free object
full [2]spanSet // list of spans with no free objects
-
- // nmalloc is the cumulative count of objects allocated from
- // this mcentral, assuming all spans in mcaches are
- // fully-allocated. Written atomically, read under STW.
- nmalloc uint64
}
// Initialize a single central free list.
func (c *mcentral) init(spc spanClass) {
c.spanclass = spc
- if go115NewMCentralImpl {
- lockInit(&c.partial[0].spineLock, lockRankSpanSetSpine)
- lockInit(&c.partial[1].spineLock, lockRankSpanSetSpine)
- lockInit(&c.full[0].spineLock, lockRankSpanSetSpine)
- lockInit(&c.full[1].spineLock, lockRankSpanSetSpine)
- } else {
- c.nonempty.init()
- c.empty.init()
- lockInit(&c.lock, lockRankMcentral)
- }
+ lockInit(&c.partial[0].spineLock, lockRankSpanSetSpine)
+ lockInit(&c.partial[1].spineLock, lockRankSpanSetSpine)
+ lockInit(&c.full[0].spineLock, lockRankSpanSetSpine)
+ lockInit(&c.full[1].spineLock, lockRankSpanSetSpine)
}
// partialUnswept returns the spanSet which holds partially-filled
@@ -93,9 +77,6 @@ func (c *mcentral) fullSwept(sweepgen uint32) *spanSet {
// Allocate a span to use in an mcache.
func (c *mcentral) cacheSpan() *mspan {
- if !go115NewMCentralImpl {
- return c.oldCacheSpan()
- }
// Deduct credit for this span allocation and sweep if necessary.
spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
deductSweepCredit(spanBytes, 0)
@@ -188,131 +169,6 @@ havespan:
if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
throw("span has no free objects")
}
- // Assume all objects from this span will be allocated in the
- // mcache. If it gets uncached, we'll adjust this.
- atomic.Xadd64(&c.nmalloc, int64(n))
- usedBytes := uintptr(s.allocCount) * s.elemsize
- atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
- if trace.enabled {
- // heap_live changed.
- traceHeapAlloc()
- }
- if gcBlackenEnabled != 0 {
- // heap_live changed.
- gcController.revise()
- }
- freeByteBase := s.freeindex &^ (64 - 1)
- whichByte := freeByteBase / 8
- // Init alloc bits cache.
- s.refillAllocCache(whichByte)
-
- // Adjust the allocCache so that s.freeindex corresponds to the low bit in
- // s.allocCache.
- s.allocCache >>= s.freeindex % 64
-
- return s
-}
-
-// Allocate a span to use in an mcache.
-//
-// For !go115NewMCentralImpl.
-func (c *mcentral) oldCacheSpan() *mspan {
- // Deduct credit for this span allocation and sweep if necessary.
- spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
- deductSweepCredit(spanBytes, 0)
-
- lock(&c.lock)
- traceDone := false
- if trace.enabled {
- traceGCSweepStart()
- }
- sg := mheap_.sweepgen
-retry:
- var s *mspan
- for s = c.nonempty.first; s != nil; s = s.next {
- if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
- c.nonempty.remove(s)
- c.empty.insertBack(s)
- unlock(&c.lock)
- s.sweep(true)
- goto havespan
- }
- if s.sweepgen == sg-1 {
- // the span is being swept by background sweeper, skip
- continue
- }
- // we have a nonempty span that does not require sweeping, allocate from it
- c.nonempty.remove(s)
- c.empty.insertBack(s)
- unlock(&c.lock)
- goto havespan
- }
-
- for s = c.empty.first; s != nil; s = s.next {
- if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
- // we have an empty span that requires sweeping,
- // sweep it and see if we can free some space in it
- c.empty.remove(s)
- // swept spans are at the end of the list
- c.empty.insertBack(s)
- unlock(&c.lock)
- s.sweep(true)
- freeIndex := s.nextFreeIndex()
- if freeIndex != s.nelems {
- s.freeindex = freeIndex
- goto havespan
- }
- lock(&c.lock)
- // the span is still empty after sweep
- // it is already in the empty list, so just retry
- goto retry
- }
- if s.sweepgen == sg-1 {
- // the span is being swept by background sweeper, skip
- continue
- }
- // already swept empty span,
- // all subsequent ones must also be either swept or in process of sweeping
- break
- }
- if trace.enabled {
- traceGCSweepDone()
- traceDone = true
- }
- unlock(&c.lock)
-
- // Replenish central list if empty.
- s = c.grow()
- if s == nil {
- return nil
- }
- lock(&c.lock)
- c.empty.insertBack(s)
- unlock(&c.lock)
-
- // At this point s is a non-empty span, queued at the end of the empty list,
- // c is unlocked.
-havespan:
- if trace.enabled && !traceDone {
- traceGCSweepDone()
- }
- n := int(s.nelems) - int(s.allocCount)
- if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
- throw("span has no free objects")
- }
- // Assume all objects from this span will be allocated in the
- // mcache. If it gets uncached, we'll adjust this.
- atomic.Xadd64(&c.nmalloc, int64(n))
- usedBytes := uintptr(s.allocCount) * s.elemsize
- atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
- if trace.enabled {
- // heap_live changed.
- traceHeapAlloc()
- }
- if gcBlackenEnabled != 0 {
- // heap_live changed.
- gcController.revise()
- }
freeByteBase := s.freeindex &^ (64 - 1)
whichByte := freeByteBase / 8
// Init alloc bits cache.
@@ -330,10 +186,6 @@ havespan:
// s must have a span class corresponding to this
// mcentral and it must not be empty.
func (c *mcentral) uncacheSpan(s *mspan) {
- if !go115NewMCentralImpl {
- c.oldUncacheSpan(s)
- return
- }
if s.allocCount == 0 {
throw("uncaching span but s.allocCount == 0")
}
@@ -354,27 +206,6 @@ func (c *mcentral) uncacheSpan(s *mspan) {
// Indicate that s is no longer cached.
atomic.Store(&s.sweepgen, sg)
}
- n := int(s.nelems) - int(s.allocCount)
-
- // Fix up statistics.
- if n > 0 {
- // cacheSpan updated alloc assuming all objects on s
- // were going to be allocated. Adjust for any that
- // weren't. We must do this before potentially
- // sweeping the span.
- atomic.Xadd64(&c.nmalloc, -int64(n))
-
- if !stale {
- // (*mcentral).cacheSpan conservatively counted
- // unallocated slots in heap_live. Undo this.
- //
- // If this span was cached before sweep, then
- // heap_live was totally recomputed since
- // caching this span, so we don't do this for
- // stale spans.
- atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
- }
- }
// Put the span in the appropriate place.
if stale {
@@ -382,7 +213,7 @@ func (c *mcentral) uncacheSpan(s *mspan) {
// the right list.
s.sweep(false)
} else {
- if n > 0 {
+ if int(s.nelems)-int(s.allocCount) > 0 {
// Put it back on the partial swept list.
c.partialSwept(sg).push(s)
} else {
@@ -393,111 +224,6 @@ func (c *mcentral) uncacheSpan(s *mspan) {
}
}
-// Return span from an mcache.
-//
-// For !go115NewMCentralImpl.
-func (c *mcentral) oldUncacheSpan(s *mspan) {
- if s.allocCount == 0 {
- throw("uncaching span but s.allocCount == 0")
- }
-
- sg := mheap_.sweepgen
- stale := s.sweepgen == sg+1
- if stale {
- // Span was cached before sweep began. It's our
- // responsibility to sweep it.
- //
- // Set sweepgen to indicate it's not cached but needs
- // sweeping and can't be allocated from. sweep will
- // set s.sweepgen to indicate s is swept.
- atomic.Store(&s.sweepgen, sg-1)
- } else {
- // Indicate that s is no longer cached.
- atomic.Store(&s.sweepgen, sg)
- }
-
- n := int(s.nelems) - int(s.allocCount)
- if n > 0 {
- // cacheSpan updated alloc assuming all objects on s
- // were going to be allocated. Adjust for any that
- // weren't. We must do this before potentially
- // sweeping the span.
- atomic.Xadd64(&c.nmalloc, -int64(n))
-
- lock(&c.lock)
- c.empty.remove(s)
- c.nonempty.insert(s)
- if !stale {
- // mCentral_CacheSpan conservatively counted
- // unallocated slots in heap_live. Undo this.
- //
- // If this span was cached before sweep, then
- // heap_live was totally recomputed since
- // caching this span, so we don't do this for
- // stale spans.
- atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
- }
- unlock(&c.lock)
- }
-
- if stale {
- // Now that s is in the right mcentral list, we can
- // sweep it.
- s.sweep(false)
- }
-}
-
-// freeSpan updates c and s after sweeping s.
-// It sets s's sweepgen to the latest generation,
-// and, based on the number of free objects in s,
-// moves s to the appropriate list of c or returns it
-// to the heap.
-// freeSpan reports whether s was returned to the heap.
-// If preserve=true, it does not move s (the caller
-// must take care of it).
-//
-// For !go115NewMCentralImpl.
-func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool {
- if sg := mheap_.sweepgen; s.sweepgen == sg+1 || s.sweepgen == sg+3 {
- throw("freeSpan given cached span")
- }
- s.needzero = 1
-
- if preserve {
- // preserve is set only when called from (un)cacheSpan above,
- // the span must be in the empty list.
- if !s.inList() {
- throw("can't preserve unlinked span")
- }
- atomic.Store(&s.sweepgen, mheap_.sweepgen)
- return false
- }
-
- lock(&c.lock)
-
- // Move to nonempty if necessary.
- if wasempty {
- c.empty.remove(s)
- c.nonempty.insert(s)
- }
-
- // delay updating sweepgen until here. This is the signal that
- // the span may be used in an mcache, so it must come after the
- // linked list operations above (actually, just after the
- // lock of c above.)
- atomic.Store(&s.sweepgen, mheap_.sweepgen)
-
- if s.allocCount != 0 {
- unlock(&c.lock)
- return false
- }
-
- c.nonempty.remove(s)
- unlock(&c.lock)
- mheap_.freeSpan(s)
- return true
-}
-
// grow allocates a new empty span from the heap and initializes it for c's size class.
func (c *mcentral) grow() *mspan {
npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()])
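
The partial and full span sets trade roles each GC cycle simply because they are indexed by the sweep generation, which advances by two per cycle. A hedged standalone sketch of that indexing (the helper names here are illustrative; in the runtime the accessors are partialSwept, partialUnswept, and their full counterparts):

    package main

    import "fmt"

    // sweptIndex and unsweptIndex pick which of the two span sets holds
    // swept vs. unswept spans for a given sweep generation. Because the
    // generation advances by 2 every cycle, the two indices swap roles
    // each cycle.
    func sweptIndex(sweepgen uint32) int   { return int(sweepgen / 2 % 2) }
    func unsweptIndex(sweepgen uint32) int { return 1 - int(sweepgen/2%2) }

    func main() {
        for sg := uint32(0); sg <= 6; sg += 2 {
            fmt.Printf("sweepgen=%d swept=%d unswept=%d\n", sg, sweptIndex(sg), unsweptIndex(sg))
        }
    }
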
diff --git a/libgo/go/runtime/mcheckmark.go b/libgo/go/runtime/mcheckmark.go
new file mode 100644
index 0000000..e4acb79
--- /dev/null
+++ b/libgo/go/runtime/mcheckmark.go
@@ -0,0 +1,109 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GC checkmarks
+//
+// In a concurrent garbage collector, one worries about failing to mark
+// a live object due to mutations without write barriers or bugs in the
+// collector implementation. As a sanity check, the GC has a 'checkmark'
+// mode that retraverses the object graph with the world stopped, to make
+// sure that everything that should be marked is marked.
+
+package runtime
+
+import (
+ "runtime/internal/atomic"
+ "runtime/internal/sys"
+ "unsafe"
+)
+
+// A checkmarksMap stores the GC marks in "checkmarks" mode. It is a
+// per-arena bitmap with a bit for every word in the arena. The mark
+// is stored on the bit corresponding to the first word of the marked
+// allocation.
+//
+//go:notinheap
+type checkmarksMap [heapArenaBytes / sys.PtrSize / 8]uint8
+
+// If useCheckmark is true, marking of an object uses the checkmark
+// bits instead of the standard mark bits.
+var useCheckmark = false
+
+// startCheckmarks prepares for the checkmarks phase.
+//
+// The world must be stopped.
+func startCheckmarks() {
+ assertWorldStopped()
+
+ // Clear all checkmarks.
+ for _, ai := range mheap_.allArenas {
+ arena := mheap_.arenas[ai.l1()][ai.l2()]
+ bitmap := arena.checkmarks
+
+ if bitmap == nil {
+ // Allocate bitmap on first use.
+ bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gcMiscSys))
+ if bitmap == nil {
+ throw("out of memory allocating checkmarks bitmap")
+ }
+ arena.checkmarks = bitmap
+ } else {
+ // Otherwise clear the existing bitmap.
+ for i := range bitmap {
+ bitmap[i] = 0
+ }
+ }
+ }
+ // Enable checkmarking.
+ useCheckmark = true
+}
+
+// endCheckmarks ends the checkmarks phase.
+func endCheckmarks() {
+ if gcMarkWorkAvailable(nil) {
+ throw("GC work not flushed")
+ }
+ useCheckmark = false
+}
+
+// setCheckmark throws if marking object is a checkmarks violation,
+// and otherwise sets obj's checkmark. It returns true if obj was
+// already checkmarked.
+func setCheckmark(obj, base, off uintptr, mbits markBits, forStack bool) bool {
+ if !mbits.isMarked() {
+ // Stack scanning is conservative, so we can see a
+ // reference to an object not previously found.
+ // Assume the object was correctly not marked and
+ // ignore the pointer.
+ if forStack {
+ return false
+ }
+ printlock()
+ print("runtime: checkmarks found unexpected unmarked object obj=", hex(obj), "\n")
+ print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
+
+ // Dump the source (base) object
+ gcDumpObject("base", base, off)
+
+ // Dump the object
+ gcDumpObject("obj", obj, ^uintptr(0))
+
+ getg().m.traceback = 2
+ throw("checkmark found unmarked object")
+ }
+
+ ai := arenaIndex(obj)
+ arena := mheap_.arenas[ai.l1()][ai.l2()]
+ arenaWord := (obj / heapArenaBytes / 8) % uintptr(len(arena.checkmarks))
+ mask := byte(1 << ((obj / heapArenaBytes) % 8))
+ bytep := &arena.checkmarks[arenaWord]
+
+ if atomic.Load8(bytep)&mask != 0 {
+ // Already checkmarked.
+ return true
+ }
+
+ atomic.Or8(bytep, mask)
+ return false
+}
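
For intuition about checkmarksMap's one-bit-per-word layout, here is a simplified standalone sketch of setting such a bit atomically. The arena size, the use of uint32 words (so sync/atomic applies directly), and the indexing are illustrative choices rather than the runtime's exact code, which works on bytes with its internal atomics.

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    const (
        ptrSize    = 8       // 64-bit words assumed
        arenaBytes = 4 << 20 // illustrative arena size, not the runtime's
    )

    // One mark bit per word in the arena.
    var marks [arenaBytes / ptrSize / 32]uint32

    // setMark sets the mark bit for the word at byte offset off within the
    // arena and reports whether it was already set.
    func setMark(off uintptr) bool {
        word := off / ptrSize // which word of the arena
        idx := word / 32      // which uint32 of the bitmap
        mask := uint32(1) << (word % 32)
        for {
            old := atomic.LoadUint32(&marks[idx])
            if old&mask != 0 {
                return true // already marked
            }
            if atomic.CompareAndSwapUint32(&marks[idx], old, old|mask) {
                return false
            }
        }
    }

    func main() {
        fmt.Println(setMark(128)) // false: newly marked
        fmt.Println(setMark(128)) // true: already marked
    }
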
diff --git a/libgo/go/runtime/mem_gccgo.go b/libgo/go/runtime/mem_gccgo.go
index ba38eba..fa3389d 100644
--- a/libgo/go/runtime/mem_gccgo.go
+++ b/libgo/go/runtime/mem_gccgo.go
@@ -48,7 +48,7 @@ func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uintptr) (u
// Don't split the stack as this method may be invoked without a valid G, which
// prevents us from allocating more stack.
//go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0)
if err != 0 {
if err == _EACCES {
@@ -61,7 +61,7 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
}
return nil
}
- mSysStatInc(sysStat, n)
+ sysStat.add(int64(n))
return p
}
@@ -166,8 +166,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(-int64(n))
munmap(v, n)
}
@@ -183,8 +183,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
return p
}
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(int64(n))
if GOOS == "aix" {
// AIX does not allow mapping a range that is already mapped.
diff --git a/libgo/go/runtime/memmove_linux_amd64_test.go b/libgo/go/runtime/memmove_linux_amd64_test.go
index d0e8b42a..b3ccd90 100644
--- a/libgo/go/runtime/memmove_linux_amd64_test.go
+++ b/libgo/go/runtime/memmove_linux_amd64_test.go
@@ -5,7 +5,6 @@
package runtime_test
import (
- "io/ioutil"
"os"
"reflect"
"syscall"
@@ -18,7 +17,7 @@ import (
func TestMemmoveOverflow(t *testing.T) {
t.Parallel()
// Create a temporary file.
- tmp, err := ioutil.TempFile("", "go-memmovetest")
+ tmp, err := os.CreateTemp("", "go-memmovetest")
if err != nil {
t.Fatal(err)
}
diff --git a/libgo/go/runtime/memmove_test.go b/libgo/go/runtime/memmove_test.go
index 396c130..7c9d2ad 100644
--- a/libgo/go/runtime/memmove_test.go
+++ b/libgo/go/runtime/memmove_test.go
@@ -286,6 +286,9 @@ var bufSizes = []int{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
32, 64, 128, 256, 512, 1024, 2048, 4096,
}
+var bufSizesOverlap = []int{
+ 32, 64, 128, 256, 512, 1024, 2048, 4096,
+}
func BenchmarkMemmove(b *testing.B) {
benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
@@ -297,6 +300,15 @@ func BenchmarkMemmove(b *testing.B) {
})
}
+func BenchmarkMemmoveOverlap(b *testing.B) {
+ benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+ x := make([]byte, n+16)
+ for i := 0; i < b.N; i++ {
+ copy(x[16:n+16], x[:n])
+ }
+ })
+}
+
func BenchmarkMemmoveUnalignedDst(b *testing.B) {
benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
x := make([]byte, n+1)
@@ -307,6 +319,15 @@ func BenchmarkMemmoveUnalignedDst(b *testing.B) {
})
}
+func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) {
+ benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+ x := make([]byte, n+16)
+ for i := 0; i < b.N; i++ {
+ copy(x[16:n+16], x[1:n+1])
+ }
+ })
+}
+
func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
x := make([]byte, n)
@@ -317,6 +338,15 @@ func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
})
}
+func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) {
+ benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) {
+ x := make([]byte, n+1)
+ for i := 0; i < b.N; i++ {
+ copy(x[1:n+1], x[:n])
+ }
+ })
+}
+
func TestMemclr(t *testing.T) {
size := 512
if testing.Short() {
@@ -538,21 +568,30 @@ func BenchmarkCopyFat1024(b *testing.B) {
}
}
+// BenchmarkIssue18740 ensures that memmove uses 4 and 8 byte load/store to move 4 and 8 bytes.
+// It used to do 2 2-byte load/stores, which leads to a pipeline stall
+// when we try to read the result with one 4-byte load.
func BenchmarkIssue18740(b *testing.B) {
- // This tests that memmove uses one 4-byte load/store to move 4 bytes.
- // It used to do 2 2-byte load/stores, which leads to a pipeline stall
- // when we try to read the result with one 4-byte load.
- var buf [4]byte
- for j := 0; j < b.N; j++ {
- s := uint32(0)
- for i := 0; i < 4096; i += 4 {
- copy(buf[:], g[i:])
- s += binary.LittleEndian.Uint32(buf[:])
- }
- sink = uint64(s)
+ benchmarks := []struct {
+ name string
+ nbyte int
+ f func([]byte) uint64
+ }{
+ {"2byte", 2, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint16(buf)) }},
+ {"4byte", 4, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint32(buf)) }},
+ {"8byte", 8, func(buf []byte) uint64 { return binary.LittleEndian.Uint64(buf) }},
+ }
+
+ var g [4096]byte
+ for _, bm := range benchmarks {
+ buf := make([]byte, bm.nbyte)
+ b.Run(bm.name, func(b *testing.B) {
+ for j := 0; j < b.N; j++ {
+ for i := 0; i < 4096; i += bm.nbyte {
+ copy(buf[:], g[i:])
+ sink += bm.f(buf[:])
+ }
+ }
+ })
}
}
-
-// TODO: 2 byte and 8 byte benchmarks also.
-
-var g [4096]byte
diff --git a/libgo/go/runtime/metrics.go b/libgo/go/runtime/metrics.go
new file mode 100644
index 0000000..5f09a88
--- /dev/null
+++ b/libgo/go/runtime/metrics.go
@@ -0,0 +1,485 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+// Metrics implementation exported to runtime/metrics.
+
+import (
+ "runtime/internal/atomic"
+ "unsafe"
+)
+
+var (
+ // metrics is a map of runtime/metrics keys to
+ // data used by the runtime to sample each metric's
+ // value.
+ metricsSema uint32 = 1
+ metricsInit bool
+ metrics map[string]metricData
+
+ sizeClassBuckets []float64
+ timeHistBuckets []float64
+)
+
+type metricData struct {
+ // deps is the set of runtime statistics that this metric
+ // depends on. Before compute is called, the statAggregate
+ // which will be passed must ensure() these dependencies.
+ deps statDepSet
+
+ // compute is a function that populates a metricValue
+ // given a populated statAggregate structure.
+ compute func(in *statAggregate, out *metricValue)
+}
+
+// initMetrics initializes the metrics map if it hasn't been yet.
+//
+// metricsSema must be held.
+func initMetrics() {
+ if metricsInit {
+ return
+ }
+ sizeClassBuckets = make([]float64, _NumSizeClasses)
+ for i := range sizeClassBuckets {
+ sizeClassBuckets[i] = float64(class_to_size[i])
+ }
+ timeHistBuckets = timeHistogramMetricsBuckets()
+ metrics = map[string]metricData{
+ "/gc/cycles/automatic:gc-cycles": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.gcCyclesDone - in.sysStats.gcCyclesForced
+ },
+ },
+ "/gc/cycles/forced:gc-cycles": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.gcCyclesForced
+ },
+ },
+ "/gc/cycles/total:gc-cycles": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.gcCyclesDone
+ },
+ },
+ "/gc/heap/allocs-by-size:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ hist := out.float64HistOrInit(sizeClassBuckets)
+ hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeAllocCount)
+ for i := range hist.buckets {
+ hist.counts[i] = uint64(in.heapStats.smallAllocCount[i])
+ }
+ },
+ },
+ "/gc/heap/frees-by-size:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ hist := out.float64HistOrInit(sizeClassBuckets)
+ hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeFreeCount)
+ for i := range hist.buckets {
+ hist.counts[i] = uint64(in.heapStats.smallFreeCount[i])
+ }
+ },
+ },
+ "/gc/heap/goal:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.heapGoal
+ },
+ },
+ "/gc/heap/objects:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.numObjects
+ },
+ },
+ "/gc/pauses:seconds": {
+ compute: func(_ *statAggregate, out *metricValue) {
+ hist := out.float64HistOrInit(timeHistBuckets)
+ hist.counts[len(hist.counts)-1] = atomic.Load64(&memstats.gcPauseDist.overflow)
+ for i := range hist.buckets {
+ hist.counts[i] = atomic.Load64(&memstats.gcPauseDist.counts[i])
+ }
+ },
+ },
+ "/memory/classes/heap/free:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.committed - in.heapStats.inHeap -
+ in.heapStats.inStacks - in.heapStats.inWorkBufs -
+ in.heapStats.inPtrScalarBits)
+ },
+ },
+ "/memory/classes/heap/objects:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.inObjects
+ },
+ },
+ "/memory/classes/heap/released:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.released)
+ },
+ },
+ "/memory/classes/heap/stacks:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.inStacks)
+ },
+ },
+ "/memory/classes/heap/unused:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.inHeap) - in.heapStats.inObjects
+ },
+ },
+ "/memory/classes/metadata/mcache/free:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.mCacheSys - in.sysStats.mCacheInUse
+ },
+ },
+ "/memory/classes/metadata/mcache/inuse:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.mCacheInUse
+ },
+ },
+ "/memory/classes/metadata/mspan/free:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.mSpanSys - in.sysStats.mSpanInUse
+ },
+ },
+ "/memory/classes/metadata/mspan/inuse:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.mSpanInUse
+ },
+ },
+ "/memory/classes/metadata/other:bytes": {
+ deps: makeStatDepSet(heapStatsDep, sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.inWorkBufs+in.heapStats.inPtrScalarBits) + in.sysStats.gcMiscSys
+ },
+ },
+ "/memory/classes/os-stacks:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.stacksSys
+ },
+ },
+ "/memory/classes/other:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.otherSys
+ },
+ },
+ "/memory/classes/profiling/buckets:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.buckHashSys
+ },
+ },
+ "/memory/classes/total:bytes": {
+ deps: makeStatDepSet(heapStatsDep, sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.committed+in.heapStats.released) +
+ in.sysStats.stacksSys + in.sysStats.mSpanSys +
+ in.sysStats.mCacheSys + in.sysStats.buckHashSys +
+ in.sysStats.gcMiscSys + in.sysStats.otherSys
+ },
+ },
+ "/sched/goroutines:goroutines": {
+ compute: func(_ *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(gcount())
+ },
+ },
+ }
+ metricsInit = true
+}
+
+// statDep is a dependency on a group of statistics
+// that a metric might have.
+type statDep uint
+
+const (
+ heapStatsDep statDep = iota // corresponds to heapStatsAggregate
+ sysStatsDep // corresponds to sysStatsAggregate
+ numStatsDeps
+)
+
+// statDepSet represents a set of statDeps.
+//
+// Under the hood, it's a bitmap.
+type statDepSet [1]uint64
+
+// makeStatDepSet creates a new statDepSet from a list of statDeps.
+func makeStatDepSet(deps ...statDep) statDepSet {
+ var s statDepSet
+ for _, d := range deps {
+ s[d/64] |= 1 << (d % 64)
+ }
+ return s
+}
+
+// difference returns the set difference of s from b as a new set.
+func (s statDepSet) difference(b statDepSet) statDepSet {
+ var c statDepSet
+ for i := range s {
+ c[i] = s[i] &^ b[i]
+ }
+ return c
+}
+
+// union returns the union of the two sets as a new set.
+func (s statDepSet) union(b statDepSet) statDepSet {
+ var c statDepSet
+ for i := range s {
+ c[i] = s[i] | b[i]
+ }
+ return c
+}
+
+// empty returns true if there are no dependencies in the set.
+func (s *statDepSet) empty() bool {
+ for _, c := range s {
+ if c != 0 {
+ return false
+ }
+ }
+ return true
+}
+
+// has returns true if the set contains a given statDep.
+func (s *statDepSet) has(d statDep) bool {
+ return s[d/64]&(1<<(d%64)) != 0
+}
+
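
statDepSet is a one-word bitmap set. The standalone sketch below mirrors how ensure-style code finds the dependencies that still need computing; the names echo the constants above but the code is independent of the runtime.

    package main

    import "fmt"

    type dep uint

    const (
        heapStatsDep dep = iota
        sysStatsDep
    )

    type depSet [1]uint64

    func makeDepSet(deps ...dep) depSet {
        var s depSet
        for _, d := range deps {
            s[d/64] |= 1 << (d % 64)
        }
        return s
    }

    // difference returns the deps in s that are not in b.
    func (s depSet) difference(b depSet) depSet {
        var c depSet
        for i := range s {
            c[i] = s[i] &^ b[i]
        }
        return c
    }

    func (s depSet) has(d dep) bool { return s[d/64]&(1<<(d%64)) != 0 }

    func main() {
        want := makeDepSet(heapStatsDep, sysStatsDep)
        done := makeDepSet(heapStatsDep)
        missing := want.difference(done)
        fmt.Println(missing.has(heapStatsDep), missing.has(sysStatsDep)) // false true
    }
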
+// heapStatsAggregate represents memory stats obtained from the
+// runtime. This set of stats is grouped together because they
+// depend on each other in some way to make sense of the runtime's
+// current heap memory use. They're also sharded across Ps, so it
+// makes sense to grab them all at once.
+type heapStatsAggregate struct {
+ heapStatsDelta
+
+ // Derived from values in heapStatsDelta.
+
+ // inObjects is the number of bytes of memory occupied by objects.
+ inObjects uint64
+
+ // numObjects is the number of live objects in the heap.
+ numObjects uint64
+}
+
+// compute populates the heapStatsAggregate with values from the runtime.
+func (a *heapStatsAggregate) compute() {
+ memstats.heapStats.read(&a.heapStatsDelta)
+
+ // Calculate derived stats.
+ a.inObjects = uint64(a.largeAlloc - a.largeFree)
+ a.numObjects = uint64(a.largeAllocCount - a.largeFreeCount)
+ for i := range a.smallAllocCount {
+ n := uint64(a.smallAllocCount[i] - a.smallFreeCount[i])
+ a.inObjects += n * uint64(class_to_size[i])
+ a.numObjects += n
+ }
+}
+
+// sysStatsAggregate represents system memory stats obtained
+// from the runtime. This set of stats is grouped together because
+// they're all relatively cheap to acquire and generally independent
+// of one another and other runtime memory stats. The fact that they
+// may be acquired at different times, especially with respect to
+// heapStatsAggregate, means there could be some skew, but because
+// these stats are independent, there's no real consistency issue here.
+type sysStatsAggregate struct {
+ stacksSys uint64
+ mSpanSys uint64
+ mSpanInUse uint64
+ mCacheSys uint64
+ mCacheInUse uint64
+ buckHashSys uint64
+ gcMiscSys uint64
+ otherSys uint64
+ heapGoal uint64
+ gcCyclesDone uint64
+ gcCyclesForced uint64
+}
+
+// compute populates the sysStatsAggregate with values from the runtime.
+func (a *sysStatsAggregate) compute() {
+ a.stacksSys = memstats.stacks_sys.load()
+ a.buckHashSys = memstats.buckhash_sys.load()
+ a.gcMiscSys = memstats.gcMiscSys.load()
+ a.otherSys = memstats.other_sys.load()
+ a.heapGoal = atomic.Load64(&memstats.next_gc)
+ a.gcCyclesDone = uint64(memstats.numgc)
+ a.gcCyclesForced = uint64(memstats.numforcedgc)
+
+ systemstack(func() {
+ lock(&mheap_.lock)
+ a.mSpanSys = memstats.mspan_sys.load()
+ a.mSpanInUse = uint64(mheap_.spanalloc.inuse)
+ a.mCacheSys = memstats.mcache_sys.load()
+ a.mCacheInUse = uint64(mheap_.cachealloc.inuse)
+ unlock(&mheap_.lock)
+ })
+}
+
+// statAggregate is the main driver of the metrics implementation.
+//
+// It contains multiple aggregates of runtime statistics, as well
+// as a set of these aggregates that it has populated. The aggregates
+// are populated lazily by its ensure method.
+type statAggregate struct {
+ ensured statDepSet
+ heapStats heapStatsAggregate
+ sysStats sysStatsAggregate
+}
+
+// ensure populates statistics aggregates determined by deps if they
+// haven't yet been populated.
+func (a *statAggregate) ensure(deps *statDepSet) {
+ missing := deps.difference(a.ensured)
+ if missing.empty() {
+ return
+ }
+ for i := statDep(0); i < numStatsDeps; i++ {
+ if !missing.has(i) {
+ continue
+ }
+ switch i {
+ case heapStatsDep:
+ a.heapStats.compute()
+ case sysStatsDep:
+ a.sysStats.compute()
+ }
+ }
+ a.ensured = a.ensured.union(missing)
+}
+
+// metricKind is a runtime copy of runtime/metrics.ValueKind and
+// must be kept structurally identical to that type.
+type metricKind int
+
+const (
+ // These values must be kept identical to their corresponding Kind* values
+ // in the runtime/metrics package.
+ metricKindBad metricKind = iota
+ metricKindUint64
+ metricKindFloat64
+ metricKindFloat64Histogram
+)
+
+// metricSample is a runtime copy of runtime/metrics.Sample and
+// must be kept structurally identical to that type.
+type metricSample struct {
+ name string
+ value metricValue
+}
+
+// metricValue is a runtime copy of runtime/metrics.Value and
+// must be kept structurally identical to that type.
+type metricValue struct {
+ kind metricKind
+ scalar uint64 // contains scalar values for scalar Kinds.
+ pointer unsafe.Pointer // contains non-scalar values.
+}
+
+// float64HistOrInit tries to pull out an existing float64Histogram
+// from the value, but if none exists, then it allocates one with
+// the given buckets.
+func (v *metricValue) float64HistOrInit(buckets []float64) *metricFloat64Histogram {
+ var hist *metricFloat64Histogram
+ if v.kind == metricKindFloat64Histogram && v.pointer != nil {
+ hist = (*metricFloat64Histogram)(v.pointer)
+ } else {
+ v.kind = metricKindFloat64Histogram
+ hist = new(metricFloat64Histogram)
+ v.pointer = unsafe.Pointer(hist)
+ }
+ hist.buckets = buckets
+ if len(hist.counts) != len(hist.buckets)+1 {
+ hist.counts = make([]uint64, len(buckets)+1)
+ }
+ return hist
+}
+
+// metricFloat64Histogram is a runtime copy of runtime/metrics.Float64Histogram
+// and must be kept structurally identical to that type.
+type metricFloat64Histogram struct {
+ counts []uint64
+ buckets []float64
+}
+
+// agg is used by readMetrics, and is protected by metricsSema.
+//
+// Managed as a global variable because its pointer will be
+// an argument to a dynamically-defined function, and we'd
+// like to avoid it escaping to the heap.
+var agg statAggregate
+
+// readMetrics is the implementation of runtime/metrics.Read.
+//
+//go:linkname readMetrics runtime_1metrics.runtime__readMetrics
+func readMetrics(samplesp unsafe.Pointer, len int, cap int) {
+ // Construct a slice from the args.
+ sl := slice{samplesp, len, cap}
+ samples := *(*[]metricSample)(unsafe.Pointer(&sl))
+
+ // Acquire the metricsSema but with handoff. This operation
+ // is expensive enough that queueing up goroutines and handing
+ // off between them will be noticeably better-behaved.
+ semacquire1(&metricsSema, true, 0, 0)
+
+ // Ensure the map is initialized.
+ initMetrics()
+
+ // Clear agg defensively.
+ agg = statAggregate{}
+
+ // Sample.
+ for i := range samples {
+ sample := &samples[i]
+ data, ok := metrics[sample.name]
+ if !ok {
+ sample.value.kind = metricKindBad
+ continue
+ }
+ // Ensure we have all the stats we need.
+ // agg is populated lazily.
+ agg.ensure(&data.deps)
+
+ // Compute the value based on the stats we have.
+ data.compute(&agg, &sample.value)
+ }
+
+ semrelease(&metricsSema)
+}
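
readMetrics above is the backend of the public runtime/metrics.Read entry point. For context, a typical consumer written against only the public runtime/metrics API (available since Go 1.16) looks roughly like this:

    package main

    import (
        "fmt"
        "runtime/metrics"
    )

    func main() {
        // Ask for every supported metric; a real program would usually
        // pick a fixed subset of names instead.
        descs := metrics.All()
        samples := make([]metrics.Sample, len(descs))
        for i := range samples {
            samples[i].Name = descs[i].Name
        }

        // Read populates each sample's Value; under the hood this reaches
        // the runtime's readMetrics shown above.
        metrics.Read(samples)

        for _, s := range samples {
            switch s.Value.Kind() {
            case metrics.KindUint64:
                fmt.Printf("%s: %d\n", s.Name, s.Value.Uint64())
            case metrics.KindFloat64:
                fmt.Printf("%s: %f\n", s.Name, s.Value.Float64())
            case metrics.KindFloat64Histogram:
                h := s.Value.Float64Histogram()
                fmt.Printf("%s: %d buckets\n", s.Name, len(h.Buckets))
            default:
                fmt.Printf("%s: unsupported kind\n", s.Name)
            }
        }
    }
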
diff --git a/libgo/go/runtime/metrics/description.go b/libgo/go/runtime/metrics/description.go
new file mode 100644
index 0000000..32af5d1
--- /dev/null
+++ b/libgo/go/runtime/metrics/description.go
@@ -0,0 +1,180 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics
+
+// Description describes a runtime metric.
+type Description struct {
+ // Name is the full name of the metric which includes the unit.
+ //
+ // The format of the metric may be described by the following regular expression.
+ //
+ // ^(?P<name>/[^:]+):(?P<unit>[^:*/]+(?:[*/][^:*/]+)*)$
+ //
+ // The format splits the name into two components, separated by a colon: a path which always
+ // starts with a /, and a machine-parseable unit. The name may contain any valid Unicode
+ // codepoint in between / characters, but by convention will try to stick to lowercase
+ // characters and hyphens. An example of such a path might be "/memory/heap/free".
+ //
+ // The unit is by convention a series of lowercase English unit names (singular or plural)
+ // without prefixes delimited by '*' or '/'. The unit names may contain any valid Unicode
+ // codepoint that is not a delimiter.
+ // Examples of units might be "seconds", "bytes", "bytes/second", "cpu-seconds",
+ // "byte*cpu-seconds", and "bytes/second/second".
+ //
+ // A complete name might look like "/memory/heap/free:bytes".
+ Name string
+
+ // Description is an English language sentence describing the metric.
+ Description string
+
+ // Kind is the kind of value for this metric.
+ //
+ // The purpose of this field is to allow users to filter out metrics whose values are
+ // types which their application may not understand.
+ Kind ValueKind
+
+ // Cumulative is whether or not the metric is cumulative. If a cumulative metric is just
+ // a single number, then it increases monotonically. If the metric is a distribution,
+ // then each bucket count increases monotonically.
+ //
+ // This flag thus indicates whether or not it's useful to compute a rate from this value.
+ Cumulative bool
+
+ // StopTheWorld is whether or not the metric requires a stop-the-world
+ // event in order to collect it.
+ StopTheWorld bool
+}
+
+// The English language descriptions below must be kept in sync with the
+// descriptions of each metric in doc.go.
+var allDesc = []Description{
+ {
+ Name: "/gc/cycles/automatic:gc-cycles",
+ Description: "Count of completed GC cycles generated by the Go runtime.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/cycles/forced:gc-cycles",
+ Description: "Count of completed GC cycles forced by the application.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/cycles/total:gc-cycles",
+ Description: "Count of all completed GC cycles.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/heap/allocs-by-size:objects",
+ Description: "Distribution of all objects allocated by approximate size.",
+ Kind: KindFloat64Histogram,
+ },
+ {
+ Name: "/gc/heap/frees-by-size:objects",
+ Description: "Distribution of all objects freed by approximate size.",
+ Kind: KindFloat64Histogram,
+ },
+ {
+ Name: "/gc/heap/goal:bytes",
+ Description: "Heap size target for the end of the GC cycle.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/gc/heap/objects:objects",
+ Description: "Number of objects, live or unswept, occupying heap memory.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/gc/pauses:seconds",
+ Description: "Distribution individual GC-related stop-the-world pause latencies.",
+ Kind: KindFloat64Histogram,
+ },
+ {
+ Name: "/memory/classes/heap/free:bytes",
+ Description: "Memory that is completely free and eligible to be returned to the underlying system, " +
+ "but has not been. This metric is the runtime's estimate of free address space that is backed by " +
+ "physical memory.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/heap/objects:bytes",
+ Description: "Memory occupied by live objects and dead objects that have not yet been marked free by the garbage collector.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/heap/released:bytes",
+ Description: "Memory that is completely free and has been returned to the underlying system. This " +
+ "metric is the runtime's estimate of free address space that is still mapped into the process, " +
+ "but is not backed by physical memory.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/heap/stacks:bytes",
+ Description: "Memory allocated from the heap that is reserved for stack space, whether or not it is currently in-use.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/heap/unused:bytes",
+ Description: "Memory that is reserved for heap objects but is not currently used to hold heap objects.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/mcache/free:bytes",
+ Description: "Memory that is reserved for runtime mcache structures, but not in-use.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/mcache/inuse:bytes",
+ Description: "Memory that is occupied by runtime mcache structures that are currently being used.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/mspan/free:bytes",
+ Description: "Memory that is reserved for runtime mspan structures, but not in-use.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/mspan/inuse:bytes",
+ Description: "Memory that is occupied by runtime mspan structures that are currently being used.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/other:bytes",
+ Description: "Memory that is reserved for or used to hold runtime metadata.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/os-stacks:bytes",
+ Description: "Stack memory allocated by the underlying operating system.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/other:bytes",
+ Description: "Memory used by execution trace buffers, structures for debugging the runtime, finalizer and profiler specials, and more.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/profiling/buckets:bytes",
+ Description: "Memory that is used by the stack trace hash map used for profiling.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/total:bytes",
+ Description: "All memory mapped by the Go runtime into the current process as read-write. Note that this does not include memory mapped by code called via cgo or via the syscall package. Sum of all metrics in /memory/classes.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/sched/goroutines:goroutines",
+ Description: "Count of live goroutines.",
+ Kind: KindUint64,
+ },
+}
+
+// All returns a slice containing metric descriptions for all supported metrics.
+func All() []Description {
+ return allDesc
+}
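
A typical consumer enumerates All() once at startup and filters by Kind or Cumulative. The sketch below assumes a toolchain that ships runtime/metrics and validates names against the regular expression documented on Description.Name:

	package main

	import (
		"fmt"
		"regexp"
		"runtime/metrics"
	)

	func main() {
		nameRE := regexp.MustCompile(`^(?P<name>/[^:]+):(?P<unit>[^:*/]+(?:[*/][^:*/]+)*)$`)
		for _, d := range metrics.All() {
			if !nameRE.MatchString(d.Name) {
				continue // should not happen; names follow the documented format
			}
			if d.Kind == metrics.KindUint64 && d.Cumulative {
				fmt.Println("counter:", d.Name)
			}
		}
	}
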
diff --git a/libgo/go/runtime/metrics/description_test.go b/libgo/go/runtime/metrics/description_test.go
new file mode 100644
index 0000000..fd1fd46
--- /dev/null
+++ b/libgo/go/runtime/metrics/description_test.go
@@ -0,0 +1,115 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics_test
+
+import (
+ "bufio"
+ "os"
+ "regexp"
+ "runtime/metrics"
+ "strings"
+ "testing"
+)
+
+func TestDescriptionNameFormat(t *testing.T) {
+ r := regexp.MustCompile("^(?P<name>/[^:]+):(?P<unit>[^:*/]+(?:[*/][^:*/]+)*)$")
+ descriptions := metrics.All()
+ for _, desc := range descriptions {
+ if !r.MatchString(desc.Name) {
+ t.Errorf("metrics %q does not match regexp %s", desc.Name, r)
+ }
+ }
+}
+
+func extractMetricDocs(t *testing.T) map[string]string {
+ f, err := os.Open("doc.go")
+ if err != nil {
+ t.Fatalf("failed to open doc.go in runtime/metrics package: %v", err)
+ }
+ const (
+ stateSearch = iota // look for list of metrics
+ stateNextMetric // look for next metric
+ stateNextDescription // build description
+ )
+ state := stateSearch
+ s := bufio.NewScanner(f)
+ result := make(map[string]string)
+ var metric string
+ var prevMetric string
+ var desc strings.Builder
+ for s.Scan() {
+ line := strings.TrimSpace(s.Text())
+ switch state {
+ case stateSearch:
+ if line == "Below is the full list of supported metrics, ordered lexicographically." {
+ state = stateNextMetric
+ }
+ case stateNextMetric:
+ // Ignore empty lines until we find a non-empty
+ // one. This will be our metric name.
+ if len(line) != 0 {
+ prevMetric = metric
+ metric = line
+ if prevMetric > metric {
+ t.Errorf("metrics %s and %s are out of lexicographical order", prevMetric, metric)
+ }
+ state = stateNextDescription
+ }
+ case stateNextDescription:
+ if len(line) == 0 || line == `*/` {
+ // An empty line means we're done.
+ // Write down the description and look
+ // for a new metric.
+ result[metric] = desc.String()
+ desc.Reset()
+ state = stateNextMetric
+ } else {
+ // As long as we're seeing data, assume that's
+ // part of the description and append it.
+ if desc.Len() != 0 {
+ // Turn previous newlines into spaces.
+ desc.WriteString(" ")
+ }
+ desc.WriteString(line)
+ }
+ }
+ if line == `*/` {
+ break
+ }
+ }
+ if state == stateSearch {
+ t.Fatalf("failed to find supported metrics docs in %s", f.Name())
+ }
+ return result
+}
+
+func TestDescriptionDocs(t *testing.T) {
+ docs := extractMetricDocs(t)
+ descriptions := metrics.All()
+ for _, d := range descriptions {
+ want := d.Description
+ got, ok := docs[d.Name]
+ if !ok {
+ t.Errorf("no docs found for metric %s", d.Name)
+ continue
+ }
+ if got != want {
+ t.Errorf("mismatched description and docs for metric %s", d.Name)
+ t.Errorf("want: %q, got %q", want, got)
+ continue
+ }
+ }
+ if len(docs) > len(descriptions) {
+ docsLoop:
+ for name := range docs {
+ for _, d := range descriptions {
+ if name == d.Name {
+ continue docsLoop
+ }
+ }
+ t.Errorf("stale documentation for non-existent metric: %s", name)
+ }
+ }
+}
diff --git a/libgo/go/runtime/metrics/doc.go b/libgo/go/runtime/metrics/doc.go
new file mode 100644
index 0000000..a68184e
--- /dev/null
+++ b/libgo/go/runtime/metrics/doc.go
@@ -0,0 +1,144 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package metrics provides a stable interface to access implementation-defined
+metrics exported by the Go runtime. This package is similar to existing functions
+like runtime.ReadMemStats and debug.ReadGCStats, but significantly more general.
+
+The set of metrics defined by this package may evolve as the runtime itself
+evolves, and also enables variation across Go implementations, whose relevant
+metric sets may not intersect.
+
+Interface
+
+Metrics are designated by a string key, rather than, for example, a field name in
+a struct. The full list of supported metrics is always available in the slice of
+Descriptions returned by All. Each Description also includes useful information
+about the metric, such as how to display it (e.g. gauge vs. counter) and how difficult
+or disruptive it is to obtain it (e.g. do you need to stop the world?).
+
+Thus, users of this API are encouraged to sample supported metrics defined by the
+slice returned by All to remain compatible across Go versions. Of course, situations
+arise where reading specific metrics is critical. For these cases, users are
+encouraged to use build tags, and although metrics may be deprecated and removed,
+users should consider this to be an exceptional and rare event, coinciding with a
+very large change in a particular Go implementation.
+
+Each metric key also has a "kind" that describes the format of the metric's value.
+In the interest of not breaking users of this package, the "kind" for a given metric
+is guaranteed not to change. If it must change, then a new metric will be introduced
+with a new key and a new "kind."
+
+Metric key format
+
+As mentioned earlier, metric keys are strings. Their format is simple and well-defined,
+designed to be both human and machine readable. It is split into two components,
+separated by a colon: a rooted path and a unit. The choice to include the unit in
+the key is motivated by compatibility: if a metric's unit changes, its semantics likely
+did also, and a new key should be introduced.
+
+For more details on the precise definition of the metric key's path and unit formats, see
+the documentation of the Name field of the Description struct.
+
+A note about floats
+
+This package supports metrics whose values have a floating-point representation. In
+order to improve ease-of-use, this package promises to never produce the following
+classes of floating-point values: NaN, infinity.
+
+Supported metrics
+
+Below is the full list of supported metrics, ordered lexicographically.
+
+ /gc/cycles/automatic:gc-cycles
+ Count of completed GC cycles generated by the Go runtime.
+
+ /gc/cycles/forced:gc-cycles
+ Count of completed GC cycles forced by the application.
+
+ /gc/cycles/total:gc-cycles
+ Count of all completed GC cycles.
+
+ /gc/heap/allocs-by-size:objects
+ Distribution of all objects allocated by approximate size.
+
+ /gc/heap/frees-by-size:objects
+ Distribution of all objects freed by approximate size.
+
+ /gc/heap/goal:bytes
+ Heap size target for the end of the GC cycle.
+
+ /gc/heap/objects:objects
+ Number of objects, live or unswept, occupying heap memory.
+
+ /gc/pauses:seconds
+ Distribution of individual GC-related stop-the-world pause latencies.
+
+ /memory/classes/heap/free:bytes
+ Memory that is completely free and eligible to be returned to
+ the underlying system, but has not been. This metric is the
+ runtime's estimate of free address space that is backed by
+ physical memory.
+
+ /memory/classes/heap/objects:bytes
+ Memory occupied by live objects and dead objects that have
+ not yet been marked free by the garbage collector.
+
+ /memory/classes/heap/released:bytes
+ Memory that is completely free and has been returned to
+ the underlying system. This metric is the runtime's estimate of
+ free address space that is still mapped into the process, but
+ is not backed by physical memory.
+
+ /memory/classes/heap/stacks:bytes
+ Memory allocated from the heap that is reserved for stack
+ space, whether or not it is currently in-use.
+
+ /memory/classes/heap/unused:bytes
+ Memory that is reserved for heap objects but is not currently
+ used to hold heap objects.
+
+ /memory/classes/metadata/mcache/free:bytes
+ Memory that is reserved for runtime mcache structures, but
+ not in-use.
+
+ /memory/classes/metadata/mcache/inuse:bytes
+ Memory that is occupied by runtime mcache structures that
+ are currently being used.
+
+ /memory/classes/metadata/mspan/free:bytes
+ Memory that is reserved for runtime mspan structures, but
+ not in-use.
+
+ /memory/classes/metadata/mspan/inuse:bytes
+ Memory that is occupied by runtime mspan structures that are
+ currently being used.
+
+ /memory/classes/metadata/other:bytes
+ Memory that is reserved for or used to hold runtime
+ metadata.
+
+ /memory/classes/os-stacks:bytes
+ Stack memory allocated by the underlying operating system.
+
+ /memory/classes/other:bytes
+ Memory used by execution trace buffers, structures for
+ debugging the runtime, finalizer and profiler specials, and
+ more.
+
+ /memory/classes/profiling/buckets:bytes
+ Memory that is used by the stack trace hash map used for
+ profiling.
+
+ /memory/classes/total:bytes
+ All memory mapped by the Go runtime into the current process
+ as read-write. Note that this does not include memory mapped
+ by code called via cgo or via the syscall package.
+ Sum of all metrics in /memory/classes.
+
+ /sched/goroutines:goroutines
+ Count of live goroutines.
+*/
+package metrics
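
A minimal usage sketch of the documented key format and the KindBad behavior, reading a single metric by its full key (assuming runtime/metrics is available in the toolchain):

	package main

	import (
		"fmt"
		"runtime/metrics"
	)

	func main() {
		const key = "/memory/classes/heap/objects:bytes"
		sample := []metrics.Sample{{Name: key}}
		metrics.Read(sample)
		if sample[0].Value.Kind() == metrics.KindBad {
			fmt.Println("metric not supported by this Go implementation:", key)
			return
		}
		fmt.Printf("%s = %d bytes\n", key, sample[0].Value.Uint64())
	}
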
diff --git a/libgo/go/runtime/metrics/histogram.go b/libgo/go/runtime/metrics/histogram.go
new file mode 100644
index 0000000..e1364e1
--- /dev/null
+++ b/libgo/go/runtime/metrics/histogram.go
@@ -0,0 +1,30 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics
+
+// Float64Histogram represents a distribution of float64 values.
+type Float64Histogram struct {
+ // Counts contains the weights for each histogram bucket. The length of
+ // Counts is equal to the length of Buckets (in the metric description)
+ // plus one to account for the implicit minimum bucket.
+ //
+ // Given N = len(Buckets) boundaries, len(Counts) is N+1 and the
+ // mathematical relationship between Counts and Buckets is:
+ // count[0] is the weight of the range (-inf, bucket[0])
+ // count[n] is the weight of the range [bucket[n-1], bucket[n]), for 0 < n < N
+ // count[N] is the weight of the range [bucket[N-1], +inf)
+ Counts []uint64
+
+ // Buckets contains the boundaries between histogram buckets, in increasing order.
+ //
+ // Because this slice contains boundaries, there are len(Buckets)+1 counts:
+ // a count for all values less than the first boundary, a count covering each
+ // [slice[i], slice[i+1]) interval, and a count for all values greater than or
+ // equal to the last boundary.
+ //
+ // For a given metric name, the value of Buckets is guaranteed not to change
+ // between calls until program exit.
+ Buckets []float64
+}
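
Because the boundaries include two open-ended ranges, summary statistics have to be estimated from bucket midpoints. The sketch below computes an approximate mean under the layout documented above, folding open-ended or infinite buckets onto the nearest finite boundary (an approximation, not an exact statistic):

	package main

	import (
		"fmt"
		"math"
		"runtime/metrics"
	)

	// approxMean estimates the mean of a Float64Histogram from bucket midpoints.
	// Weight in open-ended buckets is attributed to the nearest boundary, and
	// weight with no finite representative point is skipped.
	func approxMean(h *metrics.Float64Histogram) float64 {
		var total, weighted float64
		for i, c := range h.Counts {
			if c == 0 {
				continue
			}
			var x float64
			switch {
			case i == 0:
				x = h.Buckets[0]
			case i >= len(h.Buckets):
				x = h.Buckets[len(h.Buckets)-1]
			default:
				x = (h.Buckets[i-1] + h.Buckets[i]) / 2
			}
			if math.IsInf(x, 0) {
				continue
			}
			total += float64(c)
			weighted += float64(c) * x
		}
		if total == 0 {
			return 0
		}
		return weighted / total
	}

	func main() {
		s := []metrics.Sample{{Name: "/gc/pauses:seconds"}}
		metrics.Read(s)
		fmt.Printf("approx mean GC pause: %gs\n", approxMean(s[0].Value.Float64Histogram()))
	}
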
diff --git a/libgo/go/runtime/metrics/sample.go b/libgo/go/runtime/metrics/sample.go
new file mode 100644
index 0000000..35534dd
--- /dev/null
+++ b/libgo/go/runtime/metrics/sample.go
@@ -0,0 +1,47 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics
+
+import (
+ _ "runtime" // depends on the runtime via a linkname'd function
+ "unsafe"
+)
+
+// Sample captures a single metric sample.
+type Sample struct {
+ // Name is the name of the metric sampled.
+ //
+ // It must correspond to a name in one of the metric descriptions
+ // returned by Descriptions.
+ Name string
+
+ // Value is the value of the metric sample.
+ Value Value
+}
+
+// Implemented in the runtime.
+func runtime_readMetrics(unsafe.Pointer, int, int)
+
+// Read populates each Value field in the given slice of metric samples.
+//
+// Desired metrics should be present in the slice with the appropriate name.
+// The user of this API is encouraged to re-use the same slice between calls for
+// efficiency, but is not required to do so.
+//
+// Note that re-use has some caveats. Notably, Values should not be read or
+// manipulated while a Read with that value is outstanding; that is a data race.
+// This property includes pointer-typed Values (e.g. Float64Histogram) whose
+// underlying storage will be reused by Read when possible. To safely use such
+// values in a concurrent setting, all data must be deep-copied.
+//
+// It is safe to execute multiple Read calls concurrently, but their arguments
+// must share no underlying memory. When in doubt, create a new []Sample from
+// scratch, which is always safe, though may be inefficient.
+//
+// Sample values with names not appearing in All will have their Value populated
+// as KindBad to indicate that the name is unknown.
+func Read(m []Sample) {
+ runtime_readMetrics(unsafe.Pointer(&m[0]), len(m), cap(m))
+}
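
Because Read may reuse pointer-typed values such as *Float64Histogram, a caller that hands results to another goroutine should deep-copy them first. A sketch of that discipline:

	package main

	import (
		"fmt"
		"runtime/metrics"
	)

	// cloneHist deep-copies a Float64Histogram so it remains valid after the
	// next call to Read, which may overwrite the underlying storage.
	func cloneHist(h *metrics.Float64Histogram) *metrics.Float64Histogram {
		c := &metrics.Float64Histogram{
			Counts:  make([]uint64, len(h.Counts)),
			Buckets: make([]float64, len(h.Buckets)),
		}
		copy(c.Counts, h.Counts)
		copy(c.Buckets, h.Buckets)
		return c
	}

	func main() {
		samples := []metrics.Sample{{Name: "/gc/heap/allocs-by-size:objects"}}
		results := make(chan *metrics.Float64Histogram, 1)

		metrics.Read(samples)
		results <- cloneHist(samples[0].Value.Float64Histogram()) // safe to reuse samples now
		metrics.Read(samples)                                     // may overwrite the old histogram's storage

		fmt.Println(len((<-results).Counts), "counts captured")
	}
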
diff --git a/libgo/go/runtime/metrics/value.go b/libgo/go/runtime/metrics/value.go
new file mode 100644
index 0000000..0b056b4
--- /dev/null
+++ b/libgo/go/runtime/metrics/value.go
@@ -0,0 +1,69 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics
+
+import (
+ "math"
+ "unsafe"
+)
+
+// ValueKind is a tag for a metric Value which indicates its type.
+type ValueKind int
+
+const (
+ // KindBad indicates that the Value has no type and should not be used.
+ KindBad ValueKind = iota
+
+ // KindUint64 indicates that the type of the Value is a uint64.
+ KindUint64
+
+ // KindFloat64 indicates that the type of the Value is a float64.
+ KindFloat64
+
+ // KindFloat64Histogram indicates that the type of the Value is a *Float64Histogram.
+ KindFloat64Histogram
+)
+
+// Value represents a metric value returned by the runtime.
+type Value struct {
+ kind ValueKind
+ scalar uint64 // contains scalar values for scalar Kinds.
+ pointer unsafe.Pointer // contains non-scalar values.
+}
+
+// Kind returns the tag representing the kind of value this is.
+func (v Value) Kind() ValueKind {
+ return v.kind
+}
+
+// Uint64 returns the internal uint64 value for the metric.
+//
+// If v.Kind() != KindUint64, this method panics.
+func (v Value) Uint64() uint64 {
+ if v.kind != KindUint64 {
+ panic("called Uint64 on non-uint64 metric value")
+ }
+ return v.scalar
+}
+
+// Float64 returns the internal float64 value for the metric.
+//
+// If v.Kind() != KindFloat64, this method panics.
+func (v Value) Float64() float64 {
+ if v.kind != KindFloat64 {
+ panic("called Float64 on non-float64 metric value")
+ }
+ return math.Float64frombits(v.scalar)
+}
+
+// Float64Histogram returns the internal *Float64Histogram value for the metric.
+//
+// If v.Kind() != KindFloat64Histogram, this method panics.
+func (v Value) Float64Histogram() *Float64Histogram {
+ if v.kind != KindFloat64Histogram {
+ panic("called Float64 on non-float64 metric value")
+ }
+ return (*Float64Histogram)(v.pointer)
+}
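
Since each typed accessor panics on a kind mismatch, callers are expected to switch on Kind() before extracting a value. A short sketch:

	package main

	import (
		"fmt"
		"runtime/metrics"
	)

	func printValue(s metrics.Sample) {
		switch s.Value.Kind() {
		case metrics.KindUint64:
			fmt.Printf("%s: %d\n", s.Name, s.Value.Uint64())
		case metrics.KindFloat64:
			fmt.Printf("%s: %f\n", s.Name, s.Value.Float64())
		case metrics.KindFloat64Histogram:
			h := s.Value.Float64Histogram()
			fmt.Printf("%s: histogram with %d counts\n", s.Name, len(h.Counts))
		case metrics.KindBad:
			fmt.Printf("%s: unsupported metric\n", s.Name)
		}
	}

	func main() {
		samples := []metrics.Sample{
			{Name: "/sched/goroutines:goroutines"},
			{Name: "/gc/pauses:seconds"},
		}
		metrics.Read(samples)
		for _, s := range samples {
			printValue(s)
		}
	}
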
diff --git a/libgo/go/runtime/metrics_test.go b/libgo/go/runtime/metrics_test.go
new file mode 100644
index 0000000..167edd5
--- /dev/null
+++ b/libgo/go/runtime/metrics_test.go
@@ -0,0 +1,224 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ "runtime"
+ "runtime/metrics"
+ "sort"
+ "strings"
+ "testing"
+ "time"
+ "unsafe"
+)
+
+func prepareAllMetricsSamples() (map[string]metrics.Description, []metrics.Sample) {
+ all := metrics.All()
+ samples := make([]metrics.Sample, len(all))
+ descs := make(map[string]metrics.Description)
+ for i := range all {
+ samples[i].Name = all[i].Name
+ descs[all[i].Name] = all[i]
+ }
+ return descs, samples
+}
+
+func TestReadMetrics(t *testing.T) {
+ // Tests whether readMetrics produces values aligning
+ // with ReadMemStats while the world is stopped.
+ var mstats runtime.MemStats
+ _, samples := prepareAllMetricsSamples()
+ runtime.ReadMetricsSlow(&mstats, unsafe.Pointer(&samples[0]), len(samples), cap(samples))
+
+ checkUint64 := func(t *testing.T, m string, got, want uint64) {
+ t.Helper()
+ if got != want {
+ t.Errorf("metric %q: got %d, want %d", m, got, want)
+ }
+ }
+
+ // Check to make sure the values we read line up with other values we read.
+ for i := range samples {
+ switch name := samples[i].Name; name {
+ case "/memory/classes/heap/free:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapIdle-mstats.HeapReleased)
+ case "/memory/classes/heap/released:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapReleased)
+ case "/memory/classes/heap/objects:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapAlloc)
+ case "/memory/classes/heap/unused:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapInuse-mstats.HeapAlloc)
+ case "/memory/classes/heap/stacks:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackInuse)
+ case "/memory/classes/metadata/mcache/free:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheSys-mstats.MCacheInuse)
+ case "/memory/classes/metadata/mcache/inuse:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheInuse)
+ case "/memory/classes/metadata/mspan/free:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanSys-mstats.MSpanInuse)
+ case "/memory/classes/metadata/mspan/inuse:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanInuse)
+ case "/memory/classes/metadata/other:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.GCSys)
+ case "/memory/classes/os-stacks:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackSys-mstats.StackInuse)
+ case "/memory/classes/other:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.OtherSys)
+ case "/memory/classes/profiling/buckets:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.BuckHashSys)
+ case "/memory/classes/total:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.Sys)
+ case "/gc/heap/objects:objects":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects)
+ case "/gc/heap/goal:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.NextGC)
+ case "/gc/cycles/automatic:gc-cycles":
+ checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC-mstats.NumForcedGC))
+ case "/gc/cycles/forced:gc-cycles":
+ checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumForcedGC))
+ case "/gc/cycles/total:gc-cycles":
+ checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC))
+ }
+ }
+}
+
+func TestReadMetricsConsistency(t *testing.T) {
+ // Tests whether readMetrics produces consistent, sensible values.
+ // The values are read concurrently with the runtime doing other
+ // things (e.g. allocating) so what we read can't reasonably be compared
+ // to runtime values.
+
+ // Run a few GC cycles to get some of the stats to be non-zero.
+ runtime.GC()
+ runtime.GC()
+ runtime.GC()
+
+ // Read all the supported metrics through the metrics package.
+ descs, samples := prepareAllMetricsSamples()
+ metrics.Read(samples)
+
+ // Check to make sure the values we read make sense.
+ var totalVirtual struct {
+ got, want uint64
+ }
+ var objects struct {
+ alloc, free *metrics.Float64Histogram
+ total uint64
+ }
+ var gc struct {
+ numGC uint64
+ pauses uint64
+ }
+ for i := range samples {
+ kind := samples[i].Value.Kind()
+ if want := descs[samples[i].Name].Kind; kind != want {
+ t.Errorf("supported metric %q has unexpected kind: got %d, want %d", samples[i].Name, kind, want)
+ continue
+ }
+ if samples[i].Name != "/memory/classes/total:bytes" && strings.HasPrefix(samples[i].Name, "/memory/classes") {
+ v := samples[i].Value.Uint64()
+ totalVirtual.want += v
+
+ // None of these stats should ever get this big.
+ // If they do, there's probably overflow involved,
+ // usually due to bad accounting.
+ if int64(v) < 0 {
+ t.Errorf("%q has high/negative value: %d", samples[i].Name, v)
+ }
+ }
+ switch samples[i].Name {
+ case "/memory/classes/total:bytes":
+ totalVirtual.got = samples[i].Value.Uint64()
+ case "/gc/heap/objects:objects":
+ objects.total = samples[i].Value.Uint64()
+ case "/gc/heap/allocs-by-size:objects":
+ objects.alloc = samples[i].Value.Float64Histogram()
+ case "/gc/heap/frees-by-size:objects":
+ objects.free = samples[i].Value.Float64Histogram()
+ case "/gc/cycles:gc-cycles":
+ gc.numGC = samples[i].Value.Uint64()
+ case "/gc/pauses:seconds":
+ h := samples[i].Value.Float64Histogram()
+ gc.pauses = 0
+ for i := range h.Counts {
+ gc.pauses += h.Counts[i]
+ }
+ case "/sched/goroutines:goroutines":
+ if samples[i].Value.Uint64() < 1 {
+ t.Error("number of goroutines is less than one")
+ }
+ }
+ }
+ if totalVirtual.got != totalVirtual.want {
+ t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want)
+ }
+ if len(objects.alloc.Buckets) != len(objects.free.Buckets) {
+ t.Error("allocs-by-size and frees-by-size buckets don't match in length")
+ } else if len(objects.alloc.Counts) != len(objects.free.Counts) {
+ t.Error("allocs-by-size and frees-by-size counts don't match in length")
+ } else {
+ for i := range objects.alloc.Buckets {
+ ba := objects.alloc.Buckets[i]
+ bf := objects.free.Buckets[i]
+ if ba != bf {
+ t.Errorf("bucket %d is different for alloc and free hists: %f != %f", i, ba, bf)
+ }
+ }
+ if !t.Failed() {
+ got, want := uint64(0), objects.total
+ for i := range objects.alloc.Counts {
+ if objects.alloc.Counts[i] < objects.free.Counts[i] {
+ t.Errorf("found more allocs than frees in object dist bucket %d", i)
+ continue
+ }
+ got += objects.alloc.Counts[i] - objects.free.Counts[i]
+ }
+ if got != want {
+ t.Errorf("object distribution counts don't match count of live objects: got %d, want %d", got, want)
+ }
+ }
+ }
+ // The current GC has at least 2 pauses per GC.
+ // Check to see if that value makes sense.
+ if gc.pauses < gc.numGC*2 {
+ t.Errorf("fewer pauses than expected: got %d, want at least %d", gc.pauses, gc.numGC*2)
+ }
+}
+
+func BenchmarkReadMetricsLatency(b *testing.B) {
+ stop := applyGCLoad(b)
+
+ // Spend this much time measuring latencies.
+ latencies := make([]time.Duration, 0, 1024)
+ _, samples := prepareAllMetricsSamples()
+
+ // Hit metrics.Read continuously and measure.
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ start := time.Now()
+ metrics.Read(samples)
+ latencies = append(latencies, time.Now().Sub(start))
+ }
+ // Make sure to stop the timer before we wait! The load created above
+ // is very heavy-weight and not easy to stop, so we could end up
+ // confusing the benchmarking framework for small b.N.
+ b.StopTimer()
+ stop()
+
+ // Disable the default */op metrics.
+ // ns/op doesn't mean anything because it's an average, and the
+ // heavy GC load applied above skews it significantly.
+ b.ReportMetric(0, "ns/op")
+ b.ReportMetric(0, "B/op")
+ b.ReportMetric(0, "allocs/op")
+
+ // Sort latencies then report percentiles.
+ sort.Slice(latencies, func(i, j int) bool {
+ return latencies[i] < latencies[j]
+ })
+ b.ReportMetric(float64(latencies[len(latencies)*50/100]), "p50-ns")
+ b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns")
+ b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns")
+}
diff --git a/libgo/go/runtime/mfinal.go b/libgo/go/runtime/mfinal.go
index 2ca6280..9059889 100644
--- a/libgo/go/runtime/mfinal.go
+++ b/libgo/go/runtime/mfinal.go
@@ -59,7 +59,7 @@ func queuefinalizer(p unsafe.Pointer, fn *funcval, ft *functype, ot *ptrtype) {
lock(&finlock)
if finq == nil || finq.cnt == uint32(len(finq.fin)) {
if finc == nil {
- finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys))
+ finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gcMiscSys))
finc.alllink = allfin
allfin = finc
if finptrmask[0] == 0 {
@@ -260,15 +260,15 @@ func runfinq() {
// pass the object to a call of the KeepAlive function to mark the
// last point in the function where the object must be reachable.
//
-// For example, if p points to a struct that contains a file descriptor d,
-// and p has a finalizer that closes that file descriptor, and if the last
-// use of p in a function is a call to syscall.Write(p.d, buf, size), then
-// p may be unreachable as soon as the program enters syscall.Write. The
-// finalizer may run at that moment, closing p.d, causing syscall.Write
-// to fail because it is writing to a closed file descriptor (or, worse,
-// to an entirely different file descriptor opened by a different goroutine).
-// To avoid this problem, call runtime.KeepAlive(p) after the call to
-// syscall.Write.
+// For example, if p points to a struct, such as os.File, that contains
+// a file descriptor d, and p has a finalizer that closes that file
+// descriptor, and if the last use of p in a function is a call to
+// syscall.Write(p.d, buf, size), then p may be unreachable as soon as
+// the program enters syscall.Write. The finalizer may run at that moment,
+// closing p.d, causing syscall.Write to fail because it is writing to
+// a closed file descriptor (or, worse, to an entirely different
+// file descriptor opened by a different goroutine). To avoid this problem,
+// call runtime.KeepAlive(p) after the call to syscall.Write.
//
// A single goroutine runs all finalizers for a program, sequentially.
// If a finalizer must run for a long time, it should do so by starting
diff --git a/libgo/go/runtime/mfixalloc.go b/libgo/go/runtime/mfixalloc.go
index f9dd6ca..293c16b 100644
--- a/libgo/go/runtime/mfixalloc.go
+++ b/libgo/go/runtime/mfixalloc.go
@@ -32,7 +32,7 @@ type fixalloc struct {
chunk uintptr // use uintptr instead of unsafe.Pointer to avoid write barriers
nchunk uint32
inuse uintptr // in-use bytes now
- stat *uint64
+ stat *sysMemStat
zero bool // zero allocations
}
@@ -49,7 +49,7 @@ type mlink struct {
// Initialize f to allocate objects of the given size,
// using the allocator to obtain chunks of memory.
-func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *uint64) {
+func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *sysMemStat) {
f.size = size
f.first = first
f.arg = arg
diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go
index 72479c2..a9f2c1a 100644
--- a/libgo/go/runtime/mgc.go
+++ b/libgo/go/runtime/mgc.go
@@ -292,10 +292,14 @@ func setGCPhase(x uint32) {
type gcMarkWorkerMode int
const (
+ // gcMarkWorkerNotWorker indicates that the next scheduled G is not
+ // starting work and the mode should be ignored.
+ gcMarkWorkerNotWorker gcMarkWorkerMode = iota
+
// gcMarkWorkerDedicatedMode indicates that the P of a mark
// worker is dedicated to running that mark worker. The mark
// worker should run without preemption.
- gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota
+ gcMarkWorkerDedicatedMode
// gcMarkWorkerFractionalMode indicates that a P is currently
// running the "fractional" mark worker. The fractional worker
@@ -315,6 +319,7 @@ const (
// gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes
// to use in execution traces.
var gcMarkWorkerModeStrings = [...]string{
+ "Not worker",
"GC (dedicated)",
"GC (fractional)",
"GC (idle)",
@@ -390,10 +395,24 @@ type gcControllerState struct {
// bytes that should be performed by mutator assists. This is
// computed at the beginning of each cycle and updated every
// time heap_scan is updated.
- assistWorkPerByte float64
+ //
+ // Stored as a uint64, but it's actually a float64. Use
+ // float64frombits to get the value.
+ //
+ // Read and written atomically.
+ assistWorkPerByte uint64
// assistBytesPerWork is 1/assistWorkPerByte.
- assistBytesPerWork float64
+ //
+ // Stored as a uint64, but it's actually a float64. Use
+ // float64frombits to get the value.
+ //
+ // Read and written atomically.
+ //
+ // Note that because this is read and written independently
+ // from assistWorkPerByte, users may notice a skew between
+ // the two values, and such a state should be safe.
+ assistBytesPerWork uint64
// fractionalUtilizationGoal is the fraction of wall clock
// time that should be spent in the fractional mark worker on
@@ -411,7 +430,8 @@ type gcControllerState struct {
}
// startCycle resets the GC controller's state and computes estimates
-// for a new GC cycle. The caller must hold worldsema.
+// for a new GC cycle. The caller must hold worldsema and the world
+// must be stopped.
func (c *gcControllerState) startCycle() {
c.scanWork = 0
c.bgScanCredit = 0
@@ -471,7 +491,8 @@ func (c *gcControllerState) startCycle() {
c.revise()
if debug.gcpacertrace > 0 {
- print("pacer: assist ratio=", c.assistWorkPerByte,
+ assistRatio := float64frombits(atomic.Load64(&c.assistWorkPerByte))
+ print("pacer: assist ratio=", assistRatio,
" (scan ", memstats.heap_scan>>20, " MB in ",
work.initialHeapLive>>20, "->",
memstats.next_gc>>20, " MB)",
@@ -481,9 +502,22 @@ func (c *gcControllerState) startCycle() {
}
// revise updates the assist ratio during the GC cycle to account for
-// improved estimates. This should be called either under STW or
-// whenever memstats.heap_scan, memstats.heap_live, or
-// memstats.next_gc is updated (with mheap_.lock held).
+// improved estimates. This should be called whenever memstats.heap_scan,
+// memstats.heap_live, or memstats.next_gc is updated. It is safe to
+// call concurrently, but it may race with other calls to revise.
+//
+// The result of this race is that the two assist ratio values may not line
+// up or may be stale. In practice this is OK because the assist ratio
+// moves slowly throughout a GC cycle, and the assist ratio is a best-effort
+// heuristic anyway. Furthermore, no part of the heuristic depends on
+// the two assist ratio values being exact reciprocals of one another, since
+// the two values are used to convert values from different sources.
+//
+// The worst case result of this raciness is that we may miss a larger shift
+// in the ratio (say, if we decide to pace more aggressively against the
+// hard heap goal) but even this "hard goal" is best-effort (see #40460).
+// The dedicated GC should ensure we don't exceed the hard goal by too much
+// in the rare case we do exceed it.
//
// It should only be called when gcBlackenEnabled != 0 (because this
// is when assists are enabled and the necessary statistics are
@@ -496,10 +530,12 @@ func (c *gcControllerState) revise() {
gcpercent = 100000
}
live := atomic.Load64(&memstats.heap_live)
+ scan := atomic.Load64(&memstats.heap_scan)
+ work := atomic.Loadint64(&c.scanWork)
// Assume we're under the soft goal. Pace GC to complete at
// next_gc assuming the heap is in steady-state.
- heapGoal := int64(memstats.next_gc)
+ heapGoal := int64(atomic.Load64(&memstats.next_gc))
// Compute the expected scan work remaining.
//
@@ -510,17 +546,17 @@ func (c *gcControllerState) revise() {
//
// (This is a float calculation to avoid overflowing on
// 100*heap_scan.)
- scanWorkExpected := int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent))
+ scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcpercent))
- if live > memstats.next_gc || c.scanWork > scanWorkExpected {
+ if int64(live) > heapGoal || work > scanWorkExpected {
// We're past the soft goal, or we've already done more scan
// work than we expected. Pace GC so that in the worst case it
// will complete by the hard goal.
const maxOvershoot = 1.1
- heapGoal = int64(float64(memstats.next_gc) * maxOvershoot)
+ heapGoal = int64(float64(heapGoal) * maxOvershoot)
// Compute the upper bound on the scan work remaining.
- scanWorkExpected = int64(memstats.heap_scan)
+ scanWorkExpected = int64(scan)
}
// Compute the remaining scan work estimate.
@@ -530,7 +566,7 @@ func (c *gcControllerState) revise() {
// (scanWork), so allocation will change this difference
// slowly in the soft regime and not at all in the hard
// regime.
- scanWorkRemaining := scanWorkExpected - c.scanWork
+ scanWorkRemaining := scanWorkExpected - work
if scanWorkRemaining < 1000 {
// We set a somewhat arbitrary lower bound on
// remaining scan work since if we aim a little high,
@@ -554,8 +590,15 @@ func (c *gcControllerState) revise() {
// Compute the mutator assist ratio so by the time the mutator
// allocates the remaining heap bytes up to next_gc, it will
// have done (or stolen) the remaining amount of scan work.
- c.assistWorkPerByte = float64(scanWorkRemaining) / float64(heapRemaining)
- c.assistBytesPerWork = float64(heapRemaining) / float64(scanWorkRemaining)
+ // Note that the assist ratio values are updated atomically
+ // but not together. This means there may be some degree of
+ // skew between the two values. This is generally OK as the
+ // values shift relatively slowly over the course of a GC
+ // cycle.
+ assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining)
+ assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining)
+ atomic.Store64(&c.assistWorkPerByte, float64bits(assistWorkPerByte))
+ atomic.Store64(&c.assistBytesPerWork, float64bits(assistBytesPerWork))
}
// endCycle computes the trigger ratio for the next cycle.
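
The revised pacer stores assistWorkPerByte and assistBytesPerWork as uint64 bit patterns so they can be read and written atomically, using the runtime's internal atomic package and float64bits/float64frombits. The same technique in ordinary Go with sync/atomic and math (the atomicFloat64 type here is illustrative, not a runtime API):

	package main

	import (
		"fmt"
		"math"
		"sync/atomic"
	)

	// atomicFloat64 stores a float64 as its IEEE-754 bit pattern in a uint64
	// so it can be updated and read without a lock.
	type atomicFloat64 struct {
		bits uint64
	}

	// Store atomically replaces the value.
	func (a *atomicFloat64) Store(v float64) {
		atomic.StoreUint64(&a.bits, math.Float64bits(v))
	}

	// Load atomically reads the current value.
	func (a *atomicFloat64) Load() float64 {
		return math.Float64frombits(atomic.LoadUint64(&a.bits))
	}

	func main() {
		var assistRatio atomicFloat64
		assistRatio.Store(0.25)
		fmt.Println(assistRatio.Load()) // 0.25
	}
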
@@ -672,18 +715,12 @@ func (c *gcControllerState) enlistWorker() {
}
}
-// findRunnableGCWorker returns the background mark worker for _p_ if it
+// findRunnableGCWorker returns a background mark worker for _p_ if it
// should be run. This must only be called when gcBlackenEnabled != 0.
func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
if gcBlackenEnabled == 0 {
throw("gcControllerState.findRunnable: blackening not enabled")
}
- if _p_.gcBgMarkWorker == 0 {
- // The mark worker associated with this P is blocked
- // performing a mark transition. We can't run it
- // because it may be on some other run or wait queue.
- return nil
- }
if !gcMarkWorkAvailable(_p_) {
// No work to be done right now. This can happen at
@@ -693,15 +730,35 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
return nil
}
+ // Grab a worker before we commit to running below.
+ node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
+ if node == nil {
+ // There is at least one worker per P, so normally there are
+ // enough workers to run on all Ps, if necessary. However, once
+ // a worker enters gcMarkDone it may park without rejoining the
+ // pool, thus freeing a P with no corresponding worker.
+ // gcMarkDone never depends on another worker doing work, so it
+ // is safe to simply do nothing here.
+ //
+ // If gcMarkDone bails out without completing the mark phase,
+ // it will always do so with queued global work. Thus, that P
+ // will be immediately eligible to re-run the worker G it was
+ // just using, ensuring work can complete.
+ return nil
+ }
+
decIfPositive := func(ptr *int64) bool {
- if *ptr > 0 {
- if atomic.Xaddint64(ptr, -1) >= 0 {
+ for {
+ v := atomic.Loadint64(ptr)
+ if v <= 0 {
+ return false
+ }
+
+ // TODO: having atomic.Casint64 would be more pleasant.
+ if atomic.Cas64((*uint64)(unsafe.Pointer(ptr)), uint64(v), uint64(v-1)) {
return true
}
- // We lost a race
- atomic.Xaddint64(ptr, +1)
}
- return false
}
if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
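
decIfPositive now uses a load/compare-and-swap loop rather than an optimistic add that had to be undone when it lost a race. The equivalent pattern in ordinary Go with sync/atomic (the runtime uses its internal atomic package):

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	// decIfPositive decrements *p by one only if it is currently positive,
	// retrying if another goroutine changes *p in between. It reports
	// whether a decrement happened.
	func decIfPositive(p *int64) bool {
		for {
			v := atomic.LoadInt64(p)
			if v <= 0 {
				return false
			}
			if atomic.CompareAndSwapInt64(p, v, v-1) {
				return true
			}
			// Lost a race; reload and try again.
		}
	}

	func main() {
		n := int64(1)
		fmt.Println(decIfPositive(&n)) // true, n is now 0
		fmt.Println(decIfPositive(&n)) // false
	}
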
@@ -710,6 +767,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
_p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
} else if c.fractionalUtilizationGoal == 0 {
// No need for fractional workers.
+ gcBgMarkWorkerPool.push(&node.node)
return nil
} else {
// Is this P behind on the fractional utilization
@@ -719,14 +777,15 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
delta := nanotime() - gcController.markStartTime
if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
// Nope. No need to run a fractional worker.
+ gcBgMarkWorkerPool.push(&node.node)
return nil
}
// Run a fractional worker.
_p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
}
- // Run the background mark worker
- gp := _p_.gcBgMarkWorker.ptr()
+ // Run the background mark worker.
+ gp := node.gp.ptr()
casgstatus(gp, _Gwaiting, _Grunnable)
if trace.enabled {
traceGoUnpark(gp, 0)
@@ -764,6 +823,8 @@ func pollFractionalWorkerExit() bool {
//
// mheap_.lock must be held or the world must be stopped.
func gcSetTriggerRatio(triggerRatio float64) {
+ assertWorldStoppedOrLockHeld(&mheap_.lock)
+
// Compute the next GC goal, which is when the allocated heap
// has grown by GOGC/100 over the heap marked by the last
// cycle.
@@ -846,7 +907,7 @@ func gcSetTriggerRatio(triggerRatio float64) {
// Commit to the trigger and goal.
memstats.gc_trigger = trigger
- memstats.next_gc = goal
+ atomic.Store64(&memstats.next_gc, goal)
if trace.enabled {
traceNextGC()
}
@@ -903,7 +964,9 @@ func gcSetTriggerRatio(triggerRatio float64) {
//
// mheap_.lock must be held or the world must be stopped.
func gcEffectiveGrowthRatio() float64 {
- egogc := float64(memstats.next_gc-memstats.heap_marked) / float64(memstats.heap_marked)
+ assertWorldStoppedOrLockHeld(&mheap_.lock)
+
+ egogc := float64(atomic.Load64(&memstats.next_gc)-memstats.heap_marked) / float64(memstats.heap_marked)
if egogc < 0 {
// Shouldn't happen, but just in case.
egogc = 0
@@ -985,7 +1048,6 @@ var work struct {
nproc uint32
tstart int64
nwait uint32
- ndone uint32
// Number of roots of various root types. Set by gcMarkRootPrepare.
nFlushCacheRoots int
@@ -1383,6 +1445,7 @@ func gcStart(trigger gcTrigger) {
now = startTheWorldWithSema(trace.enabled)
work.pauseNS += now - work.pauseStart
work.tMark = now
+ memstats.gcPauseDist.record(now - work.pauseStart)
})
// Release the world sema before Gosched() in STW mode
@@ -1409,19 +1472,6 @@ func gcStart(trigger gcTrigger) {
// This is protected by markDoneSema.
var gcMarkDoneFlushed uint32
-// debugCachedWork enables extra checks for debugging premature mark
-// termination.
-//
-// For debugging issue #27993.
-const debugCachedWork = false
-
-// gcWorkPauseGen is for debugging the mark completion algorithm.
-// gcWork put operations spin while gcWork.pauseGen == gcWorkPauseGen.
-// Only used if debugCachedWork is true.
-//
-// For debugging issue #27993.
-var gcWorkPauseGen uint32 = 1
-
// gcMarkDone transitions the GC from mark to mark termination if all
// reachable objects have been marked (that is, there are no grey
// objects and can be no more in the future). Otherwise, it flushes
@@ -1477,15 +1527,7 @@ top:
// Flush the write barrier buffer, since this may add
// work to the gcWork.
wbBufFlush1(_p_)
- // For debugging, shrink the write barrier
- // buffer so it flushes immediately.
- // wbBuf.reset will keep it at this size as
- // long as throwOnGCWork is set.
- if debugCachedWork {
- b := &_p_.wbBuf
- b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
- b.debugGen = gcWorkPauseGen
- }
+
// Flush the gcWork, since this may create global work
// and set the flushedWork flag.
//
@@ -1496,29 +1538,12 @@ top:
if _p_.gcw.flushedWork {
atomic.Xadd(&gcMarkDoneFlushed, 1)
_p_.gcw.flushedWork = false
- } else if debugCachedWork {
- // For debugging, freeze the gcWork
- // until we know whether we've reached
- // completion or not. If we think
- // we've reached completion, but
- // there's a paused gcWork, then
- // that's a bug.
- _p_.gcw.pauseGen = gcWorkPauseGen
- // Capture the G's stack.
- for i := range _p_.gcw.pauseStack {
- _p_.gcw.pauseStack[i].pc = 0
- }
- callers(1, _p_.gcw.pauseStack[:])
}
})
casgstatus(gp, _Gwaiting, _Grunning)
})
if gcMarkDoneFlushed != 0 {
- if debugCachedWork {
- // Release paused gcWorks.
- atomic.Xadd(&gcWorkPauseGen, 1)
- }
// More grey objects were discovered since the
// previous termination check, so there may be more
// work to do. Keep going. It's possible the
@@ -1528,13 +1553,6 @@ top:
goto top
}
- if debugCachedWork {
- throwOnGCWork = true
- // Release paused gcWorks. If there are any, they
- // should now observe throwOnGCWork and panic.
- atomic.Xadd(&gcWorkPauseGen, 1)
- }
-
// There was no global work, no local work, and no Ps
// communicated work since we took markDoneSema. Therefore
// there are no grey objects and no more objects can be
@@ -1551,59 +1569,34 @@ top:
// below. The important thing is that the wb remains active until
// all marking is complete. This includes writes made by the GC.
- if debugCachedWork {
- // For debugging, double check that no work was added after we
- // went around above and disable write barrier buffering.
+ // There is sometimes work left over when we enter mark termination due
+ // to write barriers performed after the completion barrier above.
+ // Detect this and resume concurrent mark. This is obviously
+ // unfortunate.
+ //
+ // See issue #27993 for details.
+ //
+ // Switch to the system stack to call wbBufFlush1, though in this case
+ // it doesn't matter because we're non-preemptible anyway.
+ restart := false
+ systemstack(func() {
for _, p := range allp {
- gcw := &p.gcw
- if !gcw.empty() {
- printlock()
- print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
- if gcw.wbuf1 == nil {
- print(" wbuf1=<nil>")
- } else {
- print(" wbuf1.n=", gcw.wbuf1.nobj)
- }
- if gcw.wbuf2 == nil {
- print(" wbuf2=<nil>")
- } else {
- print(" wbuf2.n=", gcw.wbuf2.nobj)
- }
- print("\n")
- if gcw.pauseGen == gcw.putGen {
- println("runtime: checkPut already failed at this generation")
- }
- throw("throwOnGCWork")
+ wbBufFlush1(p)
+ if !p.gcw.empty() {
+ restart = true
+ break
}
}
- } else {
- // For unknown reasons (see issue #27993), there is
- // sometimes work left over when we enter mark
- // termination. Detect this and resume concurrent
- // mark. This is obviously unfortunate.
- //
- // Switch to the system stack to call wbBufFlush1,
- // though in this case it doesn't matter because we're
- // non-preemptible anyway.
- restart := false
+ })
+ if restart {
+ getg().m.preemptoff = ""
systemstack(func() {
- for _, p := range allp {
- wbBufFlush1(p)
- if !p.gcw.empty() {
- restart = true
- break
- }
- }
+ now := startTheWorldWithSema(true)
+ work.pauseNS += now - work.pauseStart
+ memstats.gcPauseDist.record(now - work.pauseStart)
})
- if restart {
- getg().m.preemptoff = ""
- systemstack(func() {
- now := startTheWorldWithSema(true)
- work.pauseNS += now - work.pauseStart
- })
- semrelease(&worldsema)
- goto top
- }
+ semrelease(&worldsema)
+ goto top
}
// Disable assists and background workers. We must do
@@ -1632,10 +1625,10 @@ top:
gcMarkTermination(nextTriggerRatio)
}
+// World must be stopped and mark assists and background workers must be
+// disabled.
func gcMarkTermination(nextTriggerRatio float64) {
- // World is stopped.
- // Start marktermination which includes enabling the write barrier.
- atomic.Store(&gcBlackenEnabled, 0)
+ // Start marktermination (write barrier remains enabled for now).
setGCPhase(_GCmarktermination)
work.heap1 = memstats.heap_live
@@ -1672,13 +1665,13 @@ func gcMarkTermination(nextTriggerRatio float64) {
// mark using checkmark bits, to check that we
// didn't forget to mark anything during the
// concurrent mark process.
+ startCheckmarks()
gcResetMarkState()
- initCheckmarks()
gcw := &getg().m.p.ptr().gcw
gcDrain(gcw, 0)
wbBufFlush1(getg().m.p.ptr())
gcw.dispose()
- clearCheckmarks()
+ endCheckmarks()
}
// marking is complete so we can turn the write barrier off
@@ -1713,6 +1706,7 @@ func gcMarkTermination(nextTriggerRatio float64) {
unixNow := sec*1e9 + int64(nsec)
work.pauseNS += now - work.pauseStart
work.tEnd = now
+ memstats.gcPauseDist.record(now - work.pauseStart)
atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
@@ -1826,20 +1820,26 @@ func gcMarkTermination(nextTriggerRatio float64) {
}
}
-// gcBgMarkStartWorkers prepares background mark worker goroutines.
-// These goroutines will not run until the mark phase, but they must
-// be started while the work is not stopped and from a regular G
-// stack. The caller must hold worldsema.
+// gcBgMarkStartWorkers prepares background mark worker goroutines. These
+// goroutines will not run until the mark phase, but they must be started while
+// the work is not stopped and from a regular G stack. The caller must hold
+// worldsema.
func gcBgMarkStartWorkers() {
- // Background marking is performed by per-P G's. Ensure that
- // each P has a background GC G.
- for _, p := range allp {
- if p.gcBgMarkWorker == 0 {
- expectSystemGoroutine()
- go gcBgMarkWorker(p)
- notetsleepg(&work.bgMarkReady, -1)
- noteclear(&work.bgMarkReady)
- }
+ // Background marking is performed by per-P G's. Ensure that each P has
+ // a background GC G.
+ //
+ // Worker Gs don't exit if gomaxprocs is reduced. If it is raised
+ // again, we can reuse the old workers; no need to create new workers.
+ for gcBgMarkWorkerCount < gomaxprocs {
+ expectSystemGoroutine()
+ go gcBgMarkWorker()
+
+ notetsleepg(&work.bgMarkReady, -1)
+ noteclear(&work.bgMarkReady)
+ // The worker is now guaranteed to be added to the pool before
+ // its P's next findRunnableGCWorker.
+
+ gcBgMarkWorkerCount++
}
}
@@ -1859,84 +1859,106 @@ func gcBgMarkPrepare() {
work.nwait = ^uint32(0)
}
-func gcBgMarkWorker(_p_ *p) {
+// gcBgMarkWorker is an entry in the gcBgMarkWorkerPool. It points to a single
+// gcBgMarkWorker goroutine.
+type gcBgMarkWorkerNode struct {
+ // Unused workers are managed in a lock-free stack. This field must be first.
+ node lfnode
+
+ // The g of this worker.
+ gp guintptr
+
+ // Release this m on park. This is used to communicate with the unlock
+ // function, which cannot access the G's stack. It is unused outside of
+ // gcBgMarkWorker().
+ m muintptr
+}
+
+func gcBgMarkWorker() {
setSystemGoroutine()
gp := getg()
- type parkInfo struct {
- m muintptr // Release this m on park.
- attach puintptr // If non-nil, attach to this p on park.
- }
- // We pass park to a gopark unlock function, so it can't be on
+ // We pass node to a gopark unlock function, so it can't be on
// the stack (see gopark). Prevent deadlock from recursively
// starting GC by disabling preemption.
gp.m.preemptoff = "GC worker init"
- park := new(parkInfo)
+ node := new(gcBgMarkWorkerNode)
gp.m.preemptoff = ""
- park.m.set(acquirem())
- park.attach.set(_p_)
- // Inform gcBgMarkStartWorkers that this worker is ready.
- // After this point, the background mark worker is scheduled
- // cooperatively by gcController.findRunnable. Hence, it must
- // never be preempted, as this would put it into _Grunnable
- // and put it on a run queue. Instead, when the preempt flag
- // is set, this puts itself into _Gwaiting to be woken up by
- // gcController.findRunnable at the appropriate time.
+ node.gp.set(gp)
+
+ node.m.set(acquirem())
notewakeup(&work.bgMarkReady)
+ // After this point, the background mark worker is generally scheduled
+ // cooperatively by gcController.findRunnableGCWorker. While performing
+ // work on the P, preemption is disabled because we are working on
+ // P-local work buffers. When the preempt flag is set, this puts itself
+ // into _Gwaiting to be woken up by gcController.findRunnableGCWorker
+ // at the appropriate time.
+ //
+ // When preemption is enabled (e.g., while in gcMarkDone), this worker
+ // may be preempted and schedule as a _Grunnable G from a runq. That is
+ // fine; it will eventually gopark again for further scheduling via
+ // findRunnableGCWorker.
+ //
+ // Since we disable preemption before notifying bgMarkReady, we
+ // guarantee that this G will be in the worker pool for the next
+ // findRunnableGCWorker. This isn't strictly necessary, but it reduces
+ // latency between _GCmark starting and the workers starting.
for {
- // Go to sleep until woken by gcController.findRunnable.
- // We can't releasem yet since even the call to gopark
- // may be preempted.
- gopark(func(g *g, parkp unsafe.Pointer) bool {
- park := (*parkInfo)(parkp)
-
- // The worker G is no longer running, so it's
- // now safe to allow preemption.
- releasem(park.m.ptr())
-
- // If the worker isn't attached to its P,
- // attach now. During initialization and after
- // a phase change, the worker may have been
- // running on a different P. As soon as we
- // attach, the owner P may schedule the
- // worker, so this must be done after the G is
- // stopped.
- if park.attach != 0 {
- p := park.attach.ptr()
- park.attach.set(nil)
- // cas the worker because we may be
- // racing with a new worker starting
- // on this P.
- if !p.gcBgMarkWorker.cas(0, guintptr(unsafe.Pointer(g))) {
- // The P got a new worker.
- // Exit this worker.
- return false
- }
+ // Go to sleep until woken by
+ // gcController.findRunnableGCWorker.
+ gopark(func(g *g, nodep unsafe.Pointer) bool {
+ node := (*gcBgMarkWorkerNode)(nodep)
+
+ if mp := node.m.ptr(); mp != nil {
+ // The worker G is no longer running; release
+ // the M.
+ //
+ // N.B. it is _safe_ to release the M as soon
+ // as we are no longer performing P-local mark
+ // work.
+ //
+ // However, since we cooperatively stop work
+ // when gp.preempt is set, if we releasem in
+ // the loop then the following call to gopark
+ // would immediately preempt the G. This is
+ // also safe, but inefficient: the G must
+ // schedule again only to enter gopark and park
+ // again. Thus, we defer the release until
+ // after parking the G.
+ releasem(mp)
}
+
+ // Release this G to the pool.
+ gcBgMarkWorkerPool.push(&node.node)
+ // Note that at this point, the G may immediately be
+ // rescheduled and may be running.
return true
- }, unsafe.Pointer(park), waitReasonGCWorkerIdle, traceEvGoBlock, 0)
+ }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceEvGoBlock, 0)
- // Loop until the P dies and disassociates this
- // worker (the P may later be reused, in which case
- // it will get a new worker) or we failed to associate.
- if _p_.gcBgMarkWorker.ptr() != gp {
- break
- }
+ // Preemption must not occur here, or another G might see
+ // p.gcMarkWorkerMode.
// Disable preemption so we can use the gcw. If the
// scheduler wants to preempt us, we'll stop draining,
// dispose the gcw, and then preempt.
- park.m.set(acquirem())
+ node.m.set(acquirem())
+ pp := gp.m.p.ptr() // P can't change with preemption disabled.
if gcBlackenEnabled == 0 {
+ println("worker mode", pp.gcMarkWorkerMode)
throw("gcBgMarkWorker: blackening not enabled")
}
+ if pp.gcMarkWorkerMode == gcMarkWorkerNotWorker {
+ throw("gcBgMarkWorker: mode not set")
+ }
+
startTime := nanotime()
- _p_.gcMarkWorkerStartTime = startTime
+ pp.gcMarkWorkerStartTime = startTime
decnwait := atomic.Xadd(&work.nwait, -1)
if decnwait == work.nproc {
@@ -1953,11 +1975,11 @@ func gcBgMarkWorker(_p_ *p) {
// disabled for mark workers, so it is safe to
// read from the G stack.
casgstatus(gp, _Grunning, _Gwaiting)
- switch _p_.gcMarkWorkerMode {
+ switch pp.gcMarkWorkerMode {
default:
throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
case gcMarkWorkerDedicatedMode:
- gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
+ gcDrain(&pp.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
if gp.preempt {
// We were preempted. This is
// a useful signal to kick
@@ -1966,7 +1988,7 @@ func gcBgMarkWorker(_p_ *p) {
// somewhere else.
lock(&sched.lock)
for {
- gp, _ := runqget(_p_)
+ gp, _ := runqget(pp)
if gp == nil {
break
}
@@ -1976,24 +1998,24 @@ func gcBgMarkWorker(_p_ *p) {
}
// Go back to draining, this time
// without preemption.
- gcDrain(&_p_.gcw, gcDrainFlushBgCredit)
+ gcDrain(&pp.gcw, gcDrainFlushBgCredit)
case gcMarkWorkerFractionalMode:
- gcDrain(&_p_.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+ gcDrain(&pp.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
case gcMarkWorkerIdleMode:
- gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+ gcDrain(&pp.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
}
casgstatus(gp, _Gwaiting, _Grunning)
})
// Account for time.
duration := nanotime() - startTime
- switch _p_.gcMarkWorkerMode {
+ switch pp.gcMarkWorkerMode {
case gcMarkWorkerDedicatedMode:
atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
case gcMarkWorkerFractionalMode:
atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
- atomic.Xaddint64(&_p_.gcFractionalMarkTime, duration)
+ atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
case gcMarkWorkerIdleMode:
atomic.Xaddint64(&gcController.idleMarkTime, duration)
}
@@ -2002,31 +2024,27 @@ func gcBgMarkWorker(_p_ *p) {
// of work?
incnwait := atomic.Xadd(&work.nwait, +1)
if incnwait > work.nproc {
- println("runtime: p.gcMarkWorkerMode=", _p_.gcMarkWorkerMode,
+ println("runtime: p.gcMarkWorkerMode=", pp.gcMarkWorkerMode,
"work.nwait=", incnwait, "work.nproc=", work.nproc)
throw("work.nwait > work.nproc")
}
+ // We'll releasem after this point and thus this P may run
+ // something else. We must clear the worker mode to avoid
+ // attributing the mode to a different (non-worker) G in
+ // traceGoStart.
+ pp.gcMarkWorkerMode = gcMarkWorkerNotWorker
+
// If this worker reached a background mark completion
// point, signal the main GC goroutine.
if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
- // Make this G preemptible and disassociate it
- // as the worker for this P so
- // findRunnableGCWorker doesn't try to
- // schedule it.
- _p_.gcBgMarkWorker.set(nil)
- releasem(park.m.ptr())
+ // We don't need the P-local buffers here, allow
+			// preemption because we may schedule like a regular
+ // goroutine in gcMarkDone (block on locks, etc).
+ releasem(node.m.ptr())
+ node.m.set(nil)
gcMarkDone()
-
- // Disable preemption and prepare to reattach
- // to the P.
- //
- // We may be running on a different P at this
- // point, so we can't reattach until this G is
- // parked.
- park.m.set(acquirem())
- park.attach.set(_p_)
}
}
}
@@ -2087,7 +2105,7 @@ func gcMark(start_time int64) {
// ensured all reachable objects were marked, all of
// these must be pointers to black objects. Hence we
// can just discard the write barrier buffer.
- if debug.gccheckmark > 0 || throwOnGCWork {
+ if debug.gccheckmark > 0 {
// For debugging, flush the buffer and make
// sure it really was all marked.
wbBufFlush1(p)
@@ -2119,13 +2137,21 @@ func gcMark(start_time int64) {
gcw.dispose()
}
- throwOnGCWork = false
-
- cachestats()
-
// Update the marked heap stat.
memstats.heap_marked = work.bytesMarked
+ // Flush scanAlloc from each mcache since we're about to modify
+ // heap_scan directly. If we were to flush this later, then scanAlloc
+ // might have incorrect information.
+ for _, p := range allp {
+ c := p.mcache
+ if c == nil {
+ continue
+ }
+ memstats.heap_scan += uint64(c.scanAlloc)
+ c.scanAlloc = 0
+ }
+
// Update other GC heap size stats. This must happen after
// cachestats (which flushes local statistics to these) and
// flushallmcaches (which modifies heap_live).
@@ -2144,6 +2170,8 @@ func gcMark(start_time int64) {
//
//go:systemstack
func gcSweep(mode gcMode) {
+ assertWorldStopped()
+
if gcphase != _GCoff {
throw("gcSweep being done but phase is not GCoff")
}
@@ -2151,21 +2179,13 @@ func gcSweep(mode gcMode) {
lock(&mheap_.lock)
mheap_.sweepgen += 2
mheap_.sweepdone = 0
- if !go115NewMCentralImpl && mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 {
- // We should have drained this list during the last
- // sweep phase. We certainly need to start this phase
- // with an empty swept list.
- throw("non-empty swept list")
- }
mheap_.pagesSwept = 0
mheap_.sweepArenas = mheap_.allArenas
mheap_.reclaimIndex = 0
mheap_.reclaimCredit = 0
unlock(&mheap_.lock)
- if go115NewMCentralImpl {
- sweep.centralIndex.clear()
- }
+ sweep.centralIndex.clear()
if !_ConcurrentSweep || mode == gcForceBlockMode {
// Special case synchronous sweep.
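
The mgc.go hunks above replace the old per-P park/attach protocol with a global pool of parked worker goroutines: each worker pushes its gcBgMarkWorkerNode onto gcBgMarkWorkerPool just before parking, and findRunnableGCWorker pops a node when it wants a mark worker on some P. The following is a minimal standalone sketch of that hand-off shape, with a mutex-guarded slice standing in for the runtime's lock-free list; every name below is illustrative, not a runtime API.

package main

import (
	"fmt"
	"sync"
)

// node represents one parked worker; wake stands in for goready.
type node struct {
	id   int
	wake chan struct{}
}

// workerPool is a mutex-guarded stack standing in for the runtime's
// lock-free gcBgMarkWorkerPool.
type workerPool struct {
	mu    sync.Mutex
	nodes []*node
}

func (p *workerPool) push(n *node) {
	p.mu.Lock()
	p.nodes = append(p.nodes, n)
	p.mu.Unlock()
}

func (p *workerPool) pop() *node {
	p.mu.Lock()
	defer p.mu.Unlock()
	if len(p.nodes) == 0 {
		return nil
	}
	n := p.nodes[len(p.nodes)-1]
	p.nodes = p.nodes[:len(p.nodes)-1]
	return n
}

func main() {
	pool := &workerPool{}
	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		n := &node{id: i, wake: make(chan struct{})}
		wg.Add(1)
		go func(n *node) {
			defer wg.Done()
			// "Park": publish ourselves to the pool, then block until woken.
			// As the diff notes, once pushed we may be picked up immediately.
			pool.push(n)
			<-n.wake
			fmt.Println("worker", n.id, "draining mark work")
		}(n)
	}
	// "Scheduler": pop parked workers and wake them as needed.
	for woken := 0; woken < 3; {
		if n := pool.pop(); n != nil {
			close(n.wake)
			woken++
		}
	}
	wg.Wait()
}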
diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go
index 4e4d131..ed2a8c1 100644
--- a/libgo/go/runtime/mgcmark.go
+++ b/libgo/go/runtime/mgcmark.go
@@ -47,10 +47,6 @@ const (
// Must be a multiple of the pageInUse bitmap element size and
// must also evenly divide pagesPerArena.
pagesPerSpanRoot = 512
-
- // go115NewMarkrootSpans is a feature flag that indicates whether
- // to use the new bitmap-based markrootSpans implementation.
- go115NewMarkrootSpans = true
)
// gcMarkRootPrepare queues root scanning jobs (stacks, globals, and
@@ -58,6 +54,8 @@ const (
//
// The world must be stopped.
func gcMarkRootPrepare() {
+ assertWorldStopped()
+
work.nFlushCacheRoots = 0
work.nDataRoots = 0
@@ -73,24 +71,16 @@ func gcMarkRootPrepare() {
//
// We depend on addfinalizer to mark objects that get
// finalizers after root marking.
- if go115NewMarkrootSpans {
- // We're going to scan the whole heap (that was available at the time the
- // mark phase started, i.e. markArenas) for in-use spans which have specials.
- //
- // Break up the work into arenas, and further into chunks.
- //
- // Snapshot allArenas as markArenas. This snapshot is safe because allArenas
- // is append-only.
- mheap_.markArenas = mheap_.allArenas[:len(mheap_.allArenas):len(mheap_.allArenas)]
- work.nSpanRoots = len(mheap_.markArenas) * (pagesPerArena / pagesPerSpanRoot)
- } else {
- // We're only interested in scanning the in-use spans,
- // which will all be swept at this point. More spans
- // may be added to this list during concurrent GC, but
- // we only care about spans that were allocated before
- // this mark phase.
- work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks()
- }
+ //
+ // We're going to scan the whole heap (that was available at the time the
+ // mark phase started, i.e. markArenas) for in-use spans which have specials.
+ //
+ // Break up the work into arenas, and further into chunks.
+ //
+ // Snapshot allArenas as markArenas. This snapshot is safe because allArenas
+ // is append-only.
+ mheap_.markArenas = mheap_.allArenas[:len(mheap_.allArenas):len(mheap_.allArenas)]
+ work.nSpanRoots = len(mheap_.markArenas) * (pagesPerArena / pagesPerSpanRoot)
// Scan stacks.
//
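
For scale, a rough worked example of the shard count computed above (assuming the usual 64 MiB heap arenas and 8 KiB runtime pages on 64-bit Linux; the actual constants vary by platform): pagesPerArena = 64 MiB / 8 KiB = 8192, so with pagesPerSpanRoot = 512 each arena contributes 8192 / 512 = 16 span-root shards, and work.nSpanRoots is 16 * len(mheap_.markArenas).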
@@ -252,10 +242,6 @@ func markrootBlock(roots *gcRootList, gcw *gcWork) {
//
//go:nowritebarrier
func markrootSpans(gcw *gcWork, shard int) {
- if !go115NewMarkrootSpans {
- oldMarkrootSpans(gcw, shard)
- return
- }
// Objects with finalizers have two GC-related invariants:
//
// 1) Everything reachable from the object must be marked.
@@ -332,90 +318,6 @@ func markrootSpans(gcw *gcWork, shard int) {
}
}
-// oldMarkrootSpans marks roots for one shard of work.spans.
-//
-// For go115NewMarkrootSpans = false.
-//
-//go:nowritebarrier
-func oldMarkrootSpans(gcw *gcWork, shard int) {
- // Objects with finalizers have two GC-related invariants:
- //
- // 1) Everything reachable from the object must be marked.
- // This ensures that when we pass the object to its finalizer,
- // everything the finalizer can reach will be retained.
- //
- // 2) Finalizer specials (which are not in the garbage
- // collected heap) are roots. In practice, this means the fn
- // field must be scanned.
- //
- // TODO(austin): There are several ideas for making this more
- // efficient in issue #11485.
-
- sg := mheap_.sweepgen
- spans := mheap_.sweepSpans[mheap_.sweepgen/2%2].block(shard)
- // Note that work.spans may not include spans that were
- // allocated between entering the scan phase and now. We may
- // also race with spans being added into sweepSpans when they're
- // just created, and as a result we may see nil pointers in the
- // spans slice. This is okay because any objects with finalizers
- // in those spans must have been allocated and given finalizers
- // after we entered the scan phase, so addfinalizer will have
- // ensured the above invariants for them.
- for i := 0; i < len(spans); i++ {
- // sweepBuf.block requires that we read pointers from the block atomically.
- // It also requires that we ignore nil pointers.
- s := (*mspan)(atomic.Loadp(unsafe.Pointer(&spans[i])))
-
- // This is racing with spans being initialized, so
- // check the state carefully.
- if s == nil || s.state.get() != mSpanInUse {
- continue
- }
- // Check that this span was swept (it may be cached or uncached).
- if !useCheckmark && !(s.sweepgen == sg || s.sweepgen == sg+3) {
- // sweepgen was updated (+2) during non-checkmark GC pass
- print("sweep ", s.sweepgen, " ", sg, "\n")
- throw("gc: unswept span")
- }
-
- // Speculatively check if there are any specials
- // without acquiring the span lock. This may race with
- // adding the first special to a span, but in that
- // case addfinalizer will observe that the GC is
- // active (which is globally synchronized) and ensure
- // the above invariants. We may also ensure the
- // invariants, but it's okay to scan an object twice.
- if s.specials == nil {
- continue
- }
-
- // Lock the specials to prevent a special from being
- // removed from the list while we're traversing it.
- lock(&s.speciallock)
-
- for sp := s.specials; sp != nil; sp = sp.next {
- if sp.kind != _KindSpecialFinalizer {
- continue
- }
- // don't mark finalized object, but scan it so we
- // retain everything it points to.
- spf := (*specialfinalizer)(unsafe.Pointer(sp))
- // A finalizer can be set for an inner byte of an object, find object beginning.
- p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize
-
- // Mark everything that can be reached from
- // the object (but *not* the object itself or
- // we'll never collect it).
- scanobject(p, gcw)
-
- // The special itself is a root.
- scanblock(uintptr(unsafe.Pointer(&spf.fn)), sys.PtrSize, &oneptrmask[0], gcw)
- }
-
- unlock(&s.speciallock)
- }
-}
-
// gcAssistAlloc performs GC work to make gp's assist debt positive.
// gp must be the calling user goroutine.
//
@@ -436,11 +338,13 @@ retry:
// balance positive. When the required amount of work is low,
// we over-assist to build up credit for future allocations
// and amortize the cost of assisting.
+ assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte))
+ assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork))
debtBytes := -gp.gcAssistBytes
- scanWork := int64(gcController.assistWorkPerByte * float64(debtBytes))
+ scanWork := int64(assistWorkPerByte * float64(debtBytes))
if scanWork < gcOverAssistWork {
scanWork = gcOverAssistWork
- debtBytes = int64(gcController.assistBytesPerWork * float64(scanWork))
+ debtBytes = int64(assistBytesPerWork * float64(scanWork))
}
// Steal as much credit as we can from the background GC's
@@ -454,7 +358,7 @@ retry:
if bgScanCredit > 0 {
if bgScanCredit < scanWork {
stolen = bgScanCredit
- gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(stolen))
+ gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(stolen))
} else {
stolen = scanWork
gp.gcAssistBytes += debtBytes
@@ -579,7 +483,8 @@ func gcAssistAlloc1(gp *g, scanWork int64) {
// this scan work counts for. The "1+" is a poor man's
// round-up, to ensure this adds credit even if
// assistBytesPerWork is very low.
- gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone))
+ assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork))
+ gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(workDone))
// If this is the last worker and we ran out of work,
// signal a completion point.
@@ -673,7 +578,8 @@ func gcFlushBgCredit(scanWork int64) {
return
}
- scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork)
+ assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork))
+ scanBytes := int64(float64(scanWork) * assistBytesPerWork)
lock(&work.assistQueue.lock)
for !work.assistQueue.q.empty() && scanBytes > 0 {
@@ -706,7 +612,8 @@ func gcFlushBgCredit(scanWork int64) {
if scanBytes > 0 {
// Convert from scan bytes back to work.
- scanWork = int64(float64(scanBytes) * gcController.assistWorkPerByte)
+ assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte))
+ scanWork = int64(float64(scanBytes) * assistWorkPerByte)
atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
}
unlock(&work.assistQueue.lock)
@@ -1154,11 +1061,7 @@ func scanobject(b uintptr, gcw *gcWork) {
}
// Load bits once. See CL 22712 and issue 16973 for discussion.
bits := hbits.bits()
- // During checkmarking, 1-word objects store the checkmark
- // in the type bit for the one word. The only one-word objects
- // are pointers, or else they'd be merged with other non-pointer
- // data into larger allocations.
- if i != 1*sys.PtrSize && bits&bitScan == 0 {
+ if bits&bitScan == 0 {
break // no more pointers in this object
}
if bits&bitPointer == 0 {
@@ -1280,35 +1183,10 @@ func greyobject(obj, base, off uintptr, span *mspan, gcw *gcWork, objIndex uintp
mbits := span.markBitsForIndex(objIndex)
if useCheckmark {
- if !mbits.isMarked() {
- // Stack scanning is conservative, so we can
- // see a reference to an object not previously
- // found. Assume the object was correctly not
- // marked and ignore the pointer.
- if forStack {
- return
- }
- printlock()
- print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n")
- print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
-
- // Dump the source (base) object
- gcDumpObject("base", base, off)
-
- // Dump the object
- gcDumpObject("obj", obj, ^uintptr(0))
-
- getg().m.traceback = 2
- throw("checkmark found unmarked object")
- }
- hbits := heapBitsForAddr(obj)
- if hbits.isCheckmarked(span.elemsize) {
+ if setCheckmark(obj, base, off, mbits, forStack) {
+ // Already marked.
return
}
- hbits.setCheckmarked(span.elemsize)
- if !hbits.isCheckmarked(span.elemsize) {
- throw("setCheckmarked and isCheckmarked disagree")
- }
} else {
// Stack scanning is conservative, so we can see a
// pointer to a free object. Assume the object was
@@ -1434,6 +1312,8 @@ func gcmarknewobject(span *mspan, obj, size, scanSize uintptr) {
//
// The world must be stopped.
func gcMarkTinyAllocs() {
+ assertWorldStopped()
+
for _, p := range allp {
c := p.mcache
if c == nil || c.tiny == 0 {
@@ -1444,45 +1324,3 @@ func gcMarkTinyAllocs() {
greyobject(c.tiny, 0, 0, span, gcw, objIndex, false)
}
}
-
-// Checkmarking
-
-// To help debug the concurrent GC we remark with the world
-// stopped ensuring that any object encountered has their normal
-// mark bit set. To do this we use an orthogonal bit
-// pattern to indicate the object is marked. The following pattern
-// uses the upper two bits in the object's boundary nibble.
-// 01: scalar not marked
-// 10: pointer not marked
-// 11: pointer marked
-// 00: scalar marked
-// Xoring with 01 will flip the pattern from marked to unmarked and vica versa.
-// The higher bit is 1 for pointers and 0 for scalars, whether the object
-// is marked or not.
-// The first nibble no longer holds the typeDead pattern indicating that the
-// there are no more pointers in the object. This information is held
-// in the second nibble.
-
-// If useCheckmark is true, marking of an object uses the
-// checkmark bits (encoding above) instead of the standard
-// mark bits.
-var useCheckmark = false
-
-//go:nowritebarrier
-func initCheckmarks() {
- useCheckmark = true
- for _, s := range mheap_.allspans {
- if s.state.get() == mSpanInUse {
- heapBitsForAddr(s.base()).initCheckmarkSpan(s.layout())
- }
- }
-}
-
-func clearCheckmarks() {
- useCheckmark = false
- for _, s := range mheap_.allspans {
- if s.state.get() == mSpanInUse {
- heapBitsForAddr(s.base()).clearCheckmarkSpan(s.layout())
- }
- }
-}
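
Several hunks above change assist-ratio reads to go through float64frombits(atomic.Load64(...)): the ratio is now stored as its raw uint64 bit pattern so it can be read and updated atomically without a lock. Outside the runtime the same pattern looks roughly like the sketch below, using math and sync/atomic; the gcController field name is the only thing taken from the diff, everything else is illustrative.

package main

import (
	"fmt"
	"math"
	"sync/atomic"
)

// assistWorkPerByte holds a float64 as its IEEE-754 bit pattern so it can
// be published and read atomically without a lock.
var assistWorkPerByte uint64

func storeRatio(v float64) {
	atomic.StoreUint64(&assistWorkPerByte, math.Float64bits(v))
}

func loadRatio() float64 {
	return math.Float64frombits(atomic.LoadUint64(&assistWorkPerByte))
}

func main() {
	storeRatio(0.25)
	debtBytes := int64(4096)
	// Same shape as the assist computation: scanWork = ratio * debt.
	scanWork := int64(loadRatio() * float64(debtBytes))
	fmt.Println("scanWork:", scanWork) // 1024
}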
diff --git a/libgo/go/runtime/mgcscavenge.go b/libgo/go/runtime/mgcscavenge.go
index 326e6ee..8b44ac8 100644
--- a/libgo/go/runtime/mgcscavenge.go
+++ b/libgo/go/runtime/mgcscavenge.go
@@ -90,7 +90,7 @@ const (
//
// This ratio is used as part of multiplicative factor to help the scavenger account
// for the additional costs of using scavenged memory in its pacing.
- scavengeCostRatio = 0.7 * sys.GoosDarwin
+ scavengeCostRatio = 0.7 * (sys.GoosDarwin + sys.GoosIos)
// scavengeReservationShards determines the amount of memory the scavenger
// should reserve for scavenging at a time. Specifically, the amount of
@@ -100,7 +100,7 @@ const (
// heapRetained returns an estimate of the current heap RSS.
func heapRetained() uint64 {
- return atomic.Load64(&memstats.heap_sys) - atomic.Load64(&memstats.heap_released)
+ return memstats.heap_sys.load() - atomic.Load64(&memstats.heap_released)
}
// gcPaceScavenger updates the scavenger's pacing, particularly
@@ -123,7 +123,7 @@ func gcPaceScavenger() {
return
}
// Compute our scavenging goal.
- goalRatio := float64(memstats.next_gc) / float64(memstats.last_next_gc)
+ goalRatio := float64(atomic.Load64(&memstats.next_gc)) / float64(memstats.last_next_gc)
retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio)
// Add retainExtraPercent overhead to retainedGoal. This calculation
// looks strange but the purpose is to arrive at an integer division
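
As a concrete illustration of the pacing above (numbers invented for the example): if last_heap_inuse was 100 MiB, last_next_gc was 200 MiB, and the newly loaded next_gc is 180 MiB, then goalRatio = 180 / 200 = 0.9 and retainedGoal = 100 MiB * 0.9 = 90 MiB, before the retainExtraPercent slack is added on top.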
@@ -392,13 +392,15 @@ func bgscavenge(c chan int) {
//
// Returns the amount of memory scavenged in bytes.
//
-// s.mheapLock must be held, but may be temporarily released if
+// p.mheapLock must be held, but may be temporarily released if
// mayUnlock == true.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
+func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
+ assertLockHeld(p.mheapLock)
+
var (
addrs addrRange
gen uint32
@@ -406,17 +408,17 @@ func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
released := uintptr(0)
for released < nbytes {
if addrs.size() == 0 {
- if addrs, gen = s.scavengeReserve(); addrs.size() == 0 {
+ if addrs, gen = p.scavengeReserve(); addrs.size() == 0 {
break
}
}
- r, a := s.scavengeOne(addrs, nbytes-released, mayUnlock)
+ r, a := p.scavengeOne(addrs, nbytes-released, mayUnlock)
released += r
addrs = a
}
// Only unreserve the space which hasn't been scavenged or searched
// to ensure we always make progress.
- s.scavengeUnreserve(addrs, gen)
+ p.scavengeUnreserve(addrs, gen)
return released
}
@@ -442,46 +444,48 @@ func printScavTrace(gen uint32, released uintptr, forced bool) {
// scavengeStartGen starts a new scavenge generation, resetting
// the scavenger's search space to the full in-use address space.
//
-// s.mheapLock must be held.
+// p.mheapLock must be held.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavengeStartGen() {
+func (p *pageAlloc) scavengeStartGen() {
+ assertLockHeld(p.mheapLock)
+
if debug.scavtrace > 0 {
- printScavTrace(s.scav.gen, s.scav.released, false)
+ printScavTrace(p.scav.gen, p.scav.released, false)
}
- s.inUse.cloneInto(&s.scav.inUse)
+ p.inUse.cloneInto(&p.scav.inUse)
// Pick the new starting address for the scavenger cycle.
var startAddr offAddr
- if s.scav.scavLWM.lessThan(s.scav.freeHWM) {
+ if p.scav.scavLWM.lessThan(p.scav.freeHWM) {
// The "free" high watermark exceeds the "scavenged" low watermark,
// so there are free scavengable pages in parts of the address space
// that the scavenger already searched, the high watermark being the
// highest one. Pick that as our new starting point to ensure we
// see those pages.
- startAddr = s.scav.freeHWM
+ startAddr = p.scav.freeHWM
} else {
// The "free" high watermark does not exceed the "scavenged" low
// watermark. This means the allocator didn't free any memory in
// the range we scavenged last cycle, so we might as well continue
// scavenging from where we were.
- startAddr = s.scav.scavLWM
+ startAddr = p.scav.scavLWM
}
- s.scav.inUse.removeGreaterEqual(startAddr.addr())
+ p.scav.inUse.removeGreaterEqual(startAddr.addr())
- // reservationBytes may be zero if s.inUse.totalBytes is small, or if
+ // reservationBytes may be zero if p.inUse.totalBytes is small, or if
// scavengeReservationShards is large. This case is fine as the scavenger
// will simply be turned off, but it does mean that scavengeReservationShards,
// in concert with pallocChunkBytes, dictates the minimum heap size at which
// the scavenger triggers. In practice this minimum is generally less than an
// arena in size, so virtually every heap has the scavenger on.
- s.scav.reservationBytes = alignUp(s.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards
- s.scav.gen++
- s.scav.released = 0
- s.scav.freeHWM = minOffAddr
- s.scav.scavLWM = maxOffAddr
+ p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards
+ p.scav.gen++
+ p.scav.released = 0
+ p.scav.freeHWM = minOffAddr
+ p.scav.scavLWM = maxOffAddr
}
// scavengeReserve reserves a contiguous range of the address space
@@ -491,19 +495,21 @@ func (s *pageAlloc) scavengeStartGen() {
//
// Returns the reserved range and the scavenge generation number for it.
//
-// s.mheapLock must be held.
+// p.mheapLock must be held.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavengeReserve() (addrRange, uint32) {
+func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
+ assertLockHeld(p.mheapLock)
+
// Start by reserving the minimum.
- r := s.scav.inUse.removeLast(s.scav.reservationBytes)
+ r := p.scav.inUse.removeLast(p.scav.reservationBytes)
// Return early if the size is zero; we don't want to use
// the bogus address below.
if r.size() == 0 {
- return r, s.scav.gen
+ return r, p.scav.gen
}
// The scavenger requires that base be aligned to a
@@ -513,27 +519,29 @@ func (s *pageAlloc) scavengeReserve() (addrRange, uint32) {
newBase := alignDown(r.base.addr(), pallocChunkBytes)
// Remove from inUse however much extra we just pulled out.
- s.scav.inUse.removeGreaterEqual(newBase)
+ p.scav.inUse.removeGreaterEqual(newBase)
r.base = offAddr{newBase}
- return r, s.scav.gen
+ return r, p.scav.gen
}
// scavengeUnreserve returns an unscavenged portion of a range that was
// previously reserved with scavengeReserve.
//
-// s.mheapLock must be held.
+// p.mheapLock must be held.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
- if r.size() == 0 || gen != s.scav.gen {
+func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
+ assertLockHeld(p.mheapLock)
+
+ if r.size() == 0 || gen != p.scav.gen {
return
}
if r.base.addr()%pallocChunkBytes != 0 {
throw("unreserving unaligned region")
}
- s.scav.inUse.add(r)
+ p.scav.inUse.add(r)
}
// scavengeOne walks over address range work until it finds
@@ -547,13 +555,15 @@ func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
//
// work's base address must be aligned to pallocChunkBytes.
//
-// s.mheapLock must be held, but may be temporarily released if
+// p.mheapLock must be held, but may be temporarily released if
// mayUnlock == true.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) {
+func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) {
+ assertLockHeld(p.mheapLock)
+
// Defensively check if we've received an empty address range.
// If so, just return.
if work.size() == 0 {
@@ -588,12 +598,12 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
// Helpers for locking and unlocking only if mayUnlock == true.
lockHeap := func() {
if mayUnlock {
- lock(s.mheapLock)
+ lock(p.mheapLock)
}
}
unlockHeap := func() {
if mayUnlock {
- unlock(s.mheapLock)
+ unlock(p.mheapLock)
}
}
@@ -604,14 +614,16 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
// by subtracting 1.
maxAddr := work.limit.addr() - 1
maxChunk := chunkIndex(maxAddr)
- if s.summary[len(s.summary)-1][maxChunk].max() >= uint(minPages) {
+ if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) {
// We only bother looking for a candidate if there are at least
// minPages free pages at all.
- base, npages := s.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)
+ base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)
// If we found something, scavenge it and return!
if npages != 0 {
- work.limit = offAddr{s.scavengeRangeLocked(maxChunk, base, npages)}
+ work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)}
+
+ assertLockHeld(p.mheapLock) // Must be locked on return.
return uintptr(npages) * pageSize, work
}
}
@@ -633,7 +645,7 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
// that's fine. We're being optimistic anyway.
// Check quickly if there are enough free pages at all.
- if s.summary[len(s.summary)-1][i].max() < uint(minPages) {
+ if p.summary[len(p.summary)-1][i].max() < uint(minPages) {
continue
}
@@ -643,7 +655,7 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
// avoid races with heap growth. It may or may not be possible to also
// see a nil pointer in this case if we do race with heap growth, but
// just defensively ignore the nils. This operation is optimistic anyway.
- l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&s.chunks[i.l1()])))
+ l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&p.chunks[i.l1()])))
if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) {
return i, true
}
@@ -672,16 +684,20 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
}
// Find, verify, and scavenge if we can.
- chunk := s.chunkOf(candidateChunkIdx)
+ chunk := p.chunkOf(candidateChunkIdx)
base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)
if npages > 0 {
- work.limit = offAddr{s.scavengeRangeLocked(candidateChunkIdx, base, npages)}
+ work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)}
+
+ assertLockHeld(p.mheapLock) // Must be locked on return.
return uintptr(npages) * pageSize, work
}
// We were fooled, so let's continue from where we left off.
work.limit = offAddr{chunkBase(candidateChunkIdx)}
}
+
+ assertLockHeld(p.mheapLock) // Must be locked on return.
return 0, work
}
@@ -692,28 +708,38 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
//
// Returns the base address of the scavenged region.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
- s.chunkOf(ci).scavenged.setRange(base, npages)
+// p.mheapLock must be held.
+func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
+ assertLockHeld(p.mheapLock)
+
+ p.chunkOf(ci).scavenged.setRange(base, npages)
// Compute the full address for the start of the range.
addr := chunkBase(ci) + uintptr(base)*pageSize
// Update the scavenge low watermark.
- if oAddr := (offAddr{addr}); oAddr.lessThan(s.scav.scavLWM) {
- s.scav.scavLWM = oAddr
+ if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) {
+ p.scav.scavLWM = oAddr
}
// Only perform the actual scavenging if we're not in a test.
// It's dangerous to do so otherwise.
- if s.test {
+ if p.test {
return addr
}
sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)
// Update global accounting only when not in test, otherwise
// the runtime's accounting will be wrong.
- mSysStatInc(&memstats.heap_released, uintptr(npages)*pageSize)
+ nbytes := int64(npages) * pageSize
+ atomic.Xadd64(&memstats.heap_released, nbytes)
+
+ // Update consistent accounting too.
+ stats := memstats.heapStats.acquire()
+ atomic.Xaddint64(&stats.committed, -nbytes)
+ atomic.Xaddint64(&stats.released, nbytes)
+ memstats.heapStats.release()
+
return addr
}
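
Most of the methods in this file now begin with assertLockHeld(p.mheapLock). For readers unfamiliar with the idiom, a lock-held assertion can be sketched outside the runtime roughly as below; this is purely illustrative (the runtime's version is built on its lock-rank machinery, and the simplified check here only records that some goroutine holds the lock, not which one).

package main

import (
	"sync"
	"sync/atomic"
)

// checkedMutex is a mutex that records whether it is currently held so
// callers can assert the precondition "lock must be held".
type checkedMutex struct {
	mu   sync.Mutex
	held int32
}

func (m *checkedMutex) lock() {
	m.mu.Lock()
	atomic.StoreInt32(&m.held, 1)
}

func (m *checkedMutex) unlock() {
	atomic.StoreInt32(&m.held, 0)
	m.mu.Unlock()
}

// assertLockHeld panics if the lock is not held, mirroring the shape of
// the runtime's assertLockHeld (which throws instead of panicking).
func assertLockHeld(m *checkedMutex) {
	if atomic.LoadInt32(&m.held) == 0 {
		panic("lock not held")
	}
}

func scavengeStartGen(m *checkedMutex) {
	assertLockHeld(m) // document and enforce the locking precondition
	// ... work that requires the heap lock ...
}

func main() {
	var m checkedMutex
	m.lock()
	scavengeStartGen(&m)
	m.unlock()
}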
diff --git a/libgo/go/runtime/mgcscavenge_test.go b/libgo/go/runtime/mgcscavenge_test.go
index 7f619b1..2503430 100644
--- a/libgo/go/runtime/mgcscavenge_test.go
+++ b/libgo/go/runtime/mgcscavenge_test.go
@@ -235,26 +235,43 @@ func TestPallocDataFindScavengeCandidate(t *testing.T) {
if PhysHugePageSize > uintptr(PageSize) {
// Check hugepage preserving behavior.
bits := uint(PhysHugePageSize / uintptr(PageSize))
- tests["PreserveHugePageBottom"] = test{
- alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}},
- min: 1,
- max: 3, // Make it so that max would have us try to break the huge page.
- want: BitRange{0, bits + 2},
- }
- if 3*bits < PallocChunkPages {
- // We need at least 3 huge pages in a chunk for this test to make sense.
- tests["PreserveHugePageMiddle"] = test{
- alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}},
+ if bits < PallocChunkPages {
+ tests["PreserveHugePageBottom"] = test{
+ alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}},
min: 1,
- max: 12, // Make it so that max would have us try to break the huge page.
- want: BitRange{bits, bits + 10},
+ max: 3, // Make it so that max would have us try to break the huge page.
+ want: BitRange{0, bits + 2},
+ }
+ if 3*bits < PallocChunkPages {
+ // We need at least 3 huge pages in a chunk for this test to make sense.
+ tests["PreserveHugePageMiddle"] = test{
+ alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}},
+ min: 1,
+ max: 12, // Make it so that max would have us try to break the huge page.
+ want: BitRange{bits, bits + 10},
+ }
+ }
+ tests["PreserveHugePageTop"] = test{
+ alloc: []BitRange{{0, PallocChunkPages - bits}},
+ min: 1,
+ max: 1, // Even one page would break a huge page in this case.
+ want: BitRange{PallocChunkPages - bits, bits},
+ }
+ } else if bits == PallocChunkPages {
+ tests["PreserveHugePageAll"] = test{
+ min: 1,
+ max: 1, // Even one page would break a huge page in this case.
+ want: BitRange{0, PallocChunkPages},
+ }
+ } else {
+ // The huge page size is greater than pallocChunkPages, so it should
+			// be effectively disabled. There's no way we can possibly scavenge
+ // a huge page out of this bitmap chunk.
+ tests["PreserveHugePageNone"] = test{
+ min: 1,
+ max: 1,
+ want: BitRange{PallocChunkPages - 1, 1},
}
- }
- tests["PreserveHugePageTop"] = test{
- alloc: []BitRange{{0, PallocChunkPages - bits}},
- min: 1,
- max: 1, // Even one page would break a huge page in this case.
- want: BitRange{PallocChunkPages - bits, bits},
}
}
for name, v := range tests {
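
The restructuring above splits the huge-page tests into three regimes based on bits = PhysHugePageSize / PageSize relative to PallocChunkPages. A rough worked example (assuming 2 MiB transparent huge pages and the runtime's 8 KiB pages; check the constants for your platform): bits = 2 MiB / 8 KiB = 256, less than the 512-page chunk, so the Bottom/Middle/Top cases run; a hypothetical 4 MiB huge page would give bits = 512 == PallocChunkPages and hit PreserveHugePageAll; anything larger falls through to PreserveHugePageNone.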
diff --git a/libgo/go/runtime/mgcsweep.go b/libgo/go/runtime/mgcsweep.go
index 6877649..b2a5a86 100644
--- a/libgo/go/runtime/mgcsweep.go
+++ b/libgo/go/runtime/mgcsweep.go
@@ -123,6 +123,8 @@ func (h *mheap) nextSpanForSweep() *mspan {
//
//go:nowritebarrier
func finishsweep_m() {
+ assertWorldStopped()
+
// Sweeping must be complete before marking commences, so
// sweep any unswept spans. If this is a concurrent GC, there
// shouldn't be any spans left to sweep, so this should finish
@@ -132,17 +134,15 @@ func finishsweep_m() {
sweep.npausesweep++
}
- if go115NewMCentralImpl {
- // Reset all the unswept buffers, which should be empty.
- // Do this in sweep termination as opposed to mark termination
- // so that we can catch unswept spans and reclaim blocks as
- // soon as possible.
- sg := mheap_.sweepgen
- for i := range mheap_.central {
- c := &mheap_.central[i].mcentral
- c.partialUnswept(sg).reset()
- c.fullUnswept(sg).reset()
- }
+ // Reset all the unswept buffers, which should be empty.
+ // Do this in sweep termination as opposed to mark termination
+ // so that we can catch unswept spans and reclaim blocks as
+ // soon as possible.
+ sg := mheap_.sweepgen
+ for i := range mheap_.central {
+ c := &mheap_.central[i].mcentral
+ c.partialUnswept(sg).reset()
+ c.fullUnswept(sg).reset()
}
// Sweeping is done, so if the scavenger isn't already awake,
@@ -204,11 +204,7 @@ func sweepone() uintptr {
var s *mspan
sg := mheap_.sweepgen
for {
- if go115NewMCentralImpl {
- s = mheap_.nextSpanForSweep()
- } else {
- s = mheap_.sweepSpans[1-sg/2%2].pop()
- }
+ s = mheap_.nextSpanForSweep()
if s == nil {
atomic.Store(&mheap_.sweepdone, 1)
break
@@ -324,9 +320,6 @@ func (s *mspan) ensureSwept() {
// If preserve=true, don't return it to heap nor relink in mcentral lists;
// caller takes care of it.
func (s *mspan) sweep(preserve bool) bool {
- if !go115NewMCentralImpl {
- return s.oldSweep(preserve)
- }
// It's critical that we enter this function with preemption disabled,
// GC must not start while we are in the middle of this function.
_g_ := getg()
@@ -348,8 +341,6 @@ func (s *mspan) sweep(preserve bool) bool {
spc := s.spanclass
size := s.elemsize
- c := _g_.m.p.ptr().mcache
-
// The allocBits indicate which unmarked objects don't need to be
// processed since they were free at the end of the last GC cycle
// and were not allocated since then.
@@ -514,7 +505,9 @@ func (s *mspan) sweep(preserve bool) bool {
// wasn't totally filled, but then swept, still has all of its
// free slots zeroed.
s.needzero = 1
- c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
+ stats := memstats.heapStats.acquire()
+ atomic.Xadduintptr(&stats.smallFreeCount[spc.sizeclass()], uintptr(nfreed))
+ memstats.heapStats.release()
}
if !preserve {
// The caller may not have removed this span from whatever
@@ -559,8 +552,10 @@ func (s *mspan) sweep(preserve bool) bool {
} else {
mheap_.freeSpan(s)
}
- c.local_nlargefree++
- c.local_largefree += size
+ stats := memstats.heapStats.acquire()
+ atomic.Xadduintptr(&stats.largeFreeCount, 1)
+ atomic.Xadduintptr(&stats.largeFree, size)
+ memstats.heapStats.release()
return true
}
@@ -570,214 +565,6 @@ func (s *mspan) sweep(preserve bool) bool {
return false
}
-// Sweep frees or collects finalizers for blocks not marked in the mark phase.
-// It clears the mark bits in preparation for the next GC round.
-// Returns true if the span was returned to heap.
-// If preserve=true, don't return it to heap nor relink in mcentral lists;
-// caller takes care of it.
-//
-// For !go115NewMCentralImpl.
-func (s *mspan) oldSweep(preserve bool) bool {
- // It's critical that we enter this function with preemption disabled,
- // GC must not start while we are in the middle of this function.
- _g_ := getg()
- if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
- throw("mspan.sweep: m is not locked")
- }
- sweepgen := mheap_.sweepgen
- if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
- print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
- throw("mspan.sweep: bad span state")
- }
-
- if trace.enabled {
- traceGCSweepSpan(s.npages * _PageSize)
- }
-
- atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
-
- spc := s.spanclass
- size := s.elemsize
- res := false
-
- c := _g_.m.p.ptr().mcache
- freeToHeap := false
-
- // The allocBits indicate which unmarked objects don't need to be
- // processed since they were free at the end of the last GC cycle
- // and were not allocated since then.
- // If the allocBits index is >= s.freeindex and the bit
- // is not marked then the object remains unallocated
- // since the last GC.
- // This situation is analogous to being on a freelist.
-
- // Unlink & free special records for any objects we're about to free.
- // Two complications here:
- // 1. An object can have both finalizer and profile special records.
- // In such case we need to queue finalizer for execution,
- // mark the object as live and preserve the profile special.
- // 2. A tiny object can have several finalizers setup for different offsets.
- // If such object is not marked, we need to queue all finalizers at once.
- // Both 1 and 2 are possible at the same time.
- hadSpecials := s.specials != nil
- specialp := &s.specials
- special := *specialp
- for special != nil {
- // A finalizer can be set for an inner byte of an object, find object beginning.
- objIndex := uintptr(special.offset) / size
- p := s.base() + objIndex*size
- mbits := s.markBitsForIndex(objIndex)
- if !mbits.isMarked() {
- // This object is not marked and has at least one special record.
- // Pass 1: see if it has at least one finalizer.
- hasFin := false
- endOffset := p - s.base() + size
- for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
- if tmp.kind == _KindSpecialFinalizer {
- // Stop freeing of object if it has a finalizer.
- mbits.setMarkedNonAtomic()
- hasFin = true
- break
- }
- }
- // Pass 2: queue all finalizers _or_ handle profile record.
- for special != nil && uintptr(special.offset) < endOffset {
- // Find the exact byte for which the special was setup
- // (as opposed to object beginning).
- p := s.base() + uintptr(special.offset)
- if special.kind == _KindSpecialFinalizer || !hasFin {
- // Splice out special record.
- y := special
- special = special.next
- *specialp = special
- freespecial(y, unsafe.Pointer(p), size)
- } else {
- // This is profile record, but the object has finalizers (so kept alive).
- // Keep special record.
- specialp = &special.next
- special = *specialp
- }
- }
- } else {
- // object is still live: keep special record
- specialp = &special.next
- special = *specialp
- }
- }
- if go115NewMarkrootSpans && hadSpecials && s.specials == nil {
- spanHasNoSpecials(s)
- }
-
- if debug.allocfreetrace != 0 || debug.clobberfree != 0 || raceenabled || msanenabled {
- // Find all newly freed objects. This doesn't have to
- // efficient; allocfreetrace has massive overhead.
- mbits := s.markBitsForBase()
- abits := s.allocBitsForIndex(0)
- for i := uintptr(0); i < s.nelems; i++ {
- if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) {
- x := s.base() + i*s.elemsize
- if debug.allocfreetrace != 0 {
- tracefree(unsafe.Pointer(x), size)
- }
- if debug.clobberfree != 0 {
- clobberfree(unsafe.Pointer(x), size)
- }
- if raceenabled {
- racefree(unsafe.Pointer(x), size)
- }
- if msanenabled {
- msanfree(unsafe.Pointer(x), size)
- }
- }
- mbits.advance()
- abits.advance()
- }
- }
-
- // Count the number of free objects in this span.
- nalloc := uint16(s.countAlloc())
- if spc.sizeclass() == 0 && nalloc == 0 {
- s.needzero = 1
- freeToHeap = true
- }
- nfreed := s.allocCount - nalloc
- if nalloc > s.allocCount {
- print("runtime: nelems=", s.nelems, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
- throw("sweep increased allocation count")
- }
-
- s.allocCount = nalloc
- wasempty := s.nextFreeIndex() == s.nelems
- s.freeindex = 0 // reset allocation index to start of span.
- if trace.enabled {
- getg().m.p.ptr().traceReclaimed += uintptr(nfreed) * s.elemsize
- }
-
- // gcmarkBits becomes the allocBits.
- // get a fresh cleared gcmarkBits in preparation for next GC
- s.allocBits = s.gcmarkBits
- s.gcmarkBits = newMarkBits(s.nelems)
-
- // Initialize alloc bits cache.
- s.refillAllocCache(0)
-
- // We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
- // because of the potential for a concurrent free/SetFinalizer.
- // But we need to set it before we make the span available for allocation
- // (return it to heap or mcentral), because allocation code assumes that a
- // span is already swept if available for allocation.
- if freeToHeap || nfreed == 0 {
- // The span must be in our exclusive ownership until we update sweepgen,
- // check for potential races.
- if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
- print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
- throw("mspan.sweep: bad span state after sweep")
- }
- // Serialization point.
- // At this point the mark bits are cleared and allocation ready
- // to go so release the span.
- atomic.Store(&s.sweepgen, sweepgen)
- }
-
- if nfreed > 0 && spc.sizeclass() != 0 {
- c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
- res = mheap_.central[spc].mcentral.freeSpan(s, preserve, wasempty)
- // mcentral.freeSpan updates sweepgen
- } else if freeToHeap {
- // Free large span to heap
-
- // NOTE(rsc,dvyukov): The original implementation of efence
- // in CL 22060046 used sysFree instead of sysFault, so that
- // the operating system would eventually give the memory
- // back to us again, so that an efence program could run
- // longer without running out of memory. Unfortunately,
- // calling sysFree here without any kind of adjustment of the
- // heap data structures means that when the memory does
- // come back to us, we have the wrong metadata for it, either in
- // the mspan structures or in the garbage collection bitmap.
- // Using sysFault here means that the program will run out of
- // memory fairly quickly in efence mode, but at least it won't
- // have mysterious crashes due to confused memory reuse.
- // It should be possible to switch back to sysFree if we also
- // implement and then call some kind of mheap.deleteSpan.
- if debug.efence > 0 {
- s.limit = 0 // prevent mlookup from finding this span
- sysFault(unsafe.Pointer(s.base()), size)
- } else {
- mheap_.freeSpan(s)
- }
- c.local_nlargefree++
- c.local_largefree += size
- res = true
- }
- if !res {
- // The span has been swept and is still in-use, so put
- // it on the swept in-use list.
- mheap_.sweepSpans[sweepgen/2%2].push(s)
- }
- return res
-}
-
// reportZombies reports any marked but free objects in s and throws.
//
// This generally means one of the following:
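
The sweep accounting above moves from per-mcache counters (local_nsmallfree, local_nlargefree, local_largefree) to a shared statistics structure bracketed by memstats.heapStats.acquire()/release() with atomic adds in between. The general shape of such sharded counters, sketched outside the runtime (illustrative only; the runtime's heapStats uses per-P shards and a generation scheme rather than the fixed array below):

package main

import (
	"fmt"
	"sync/atomic"
)

// heapStatsShard is one shard of free counters. Writers pick a shard,
// bump counters with atomics, and readers sum across all shards.
type heapStatsShard struct {
	largeFreeCount uint64
	largeFree      uint64
	smallFreeCount [67]uint64 // roughly one slot per size class (length illustrative)
}

const numShards = 4

var shards [numShards]heapStatsShard

// recordSmallFree accumulates a small-object free into shard p.
func recordSmallFree(p int, sizeClass int, n uint64) {
	atomic.AddUint64(&shards[p].smallFreeCount[sizeClass], n)
}

// totalSmallFrees sums one size class across all shards.
func totalSmallFrees(sizeClass int) uint64 {
	var total uint64
	for i := range shards {
		total += atomic.LoadUint64(&shards[i].smallFreeCount[sizeClass])
	}
	return total
}

func main() {
	recordSmallFree(0, 5, 3)
	recordSmallFree(2, 5, 4)
	fmt.Println("size class 5 frees:", totalSmallFrees(5)) // 7
}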
diff --git a/libgo/go/runtime/mgcsweepbuf.go b/libgo/go/runtime/mgcsweepbuf.go
deleted file mode 100644
index 1f722c3..0000000
--- a/libgo/go/runtime/mgcsweepbuf.go
+++ /dev/null
@@ -1,176 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import (
- "internal/cpu"
- "runtime/internal/atomic"
- "runtime/internal/sys"
- "unsafe"
-)
-
-// A gcSweepBuf is a set of *mspans.
-//
-// gcSweepBuf is safe for concurrent push operations *or* concurrent
-// pop operations, but not both simultaneously.
-type gcSweepBuf struct {
- // A gcSweepBuf is a two-level data structure consisting of a
- // growable spine that points to fixed-sized blocks. The spine
- // can be accessed without locks, but adding a block or
- // growing it requires taking the spine lock.
- //
- // Because each mspan covers at least 8K of heap and takes at
- // most 8 bytes in the gcSweepBuf, the growth of the spine is
- // quite limited.
- //
- // The spine and all blocks are allocated off-heap, which
- // allows this to be used in the memory manager and avoids the
- // need for write barriers on all of these. We never release
- // this memory because there could be concurrent lock-free
- // access and we're likely to reuse it anyway. (In principle,
- // we could do this during STW.)
-
- spineLock mutex
- spine unsafe.Pointer // *[N]*gcSweepBlock, accessed atomically
- spineLen uintptr // Spine array length, accessed atomically
- spineCap uintptr // Spine array cap, accessed under lock
-
- // index is the first unused slot in the logical concatenation
- // of all blocks. It is accessed atomically.
- index uint32
-}
-
-const (
- gcSweepBlockEntries = 512 // 4KB on 64-bit
- gcSweepBufInitSpineCap = 256 // Enough for 1GB heap on 64-bit
-)
-
-type gcSweepBlock struct {
- spans [gcSweepBlockEntries]*mspan
-}
-
-// push adds span s to buffer b. push is safe to call concurrently
-// with other push operations, but NOT to call concurrently with pop.
-func (b *gcSweepBuf) push(s *mspan) {
- // Obtain our slot.
- cursor := uintptr(atomic.Xadd(&b.index, +1) - 1)
- top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
-
- // Do we need to add a block?
- spineLen := atomic.Loaduintptr(&b.spineLen)
- var block *gcSweepBlock
-retry:
- if top < spineLen {
- spine := atomic.Loadp(unsafe.Pointer(&b.spine))
- blockp := add(spine, sys.PtrSize*top)
- block = (*gcSweepBlock)(atomic.Loadp(blockp))
- } else {
- // Add a new block to the spine, potentially growing
- // the spine.
- lock(&b.spineLock)
- // spineLen cannot change until we release the lock,
- // but may have changed while we were waiting.
- spineLen = atomic.Loaduintptr(&b.spineLen)
- if top < spineLen {
- unlock(&b.spineLock)
- goto retry
- }
-
- if spineLen == b.spineCap {
- // Grow the spine.
- newCap := b.spineCap * 2
- if newCap == 0 {
- newCap = gcSweepBufInitSpineCap
- }
- newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys)
- if b.spineCap != 0 {
- // Blocks are allocated off-heap, so
- // no write barriers.
- memmove(newSpine, b.spine, b.spineCap*sys.PtrSize)
- }
- // Spine is allocated off-heap, so no write barrier.
- atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine)
- b.spineCap = newCap
- // We can't immediately free the old spine
- // since a concurrent push with a lower index
- // could still be reading from it. We let it
- // leak because even a 1TB heap would waste
- // less than 2MB of memory on old spines. If
- // this is a problem, we could free old spines
- // during STW.
- }
-
- // Allocate a new block and add it to the spine.
- block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), cpu.CacheLineSize, &memstats.gc_sys))
- blockp := add(b.spine, sys.PtrSize*top)
- // Blocks are allocated off-heap, so no write barrier.
- atomic.StorepNoWB(blockp, unsafe.Pointer(block))
- atomic.Storeuintptr(&b.spineLen, spineLen+1)
- unlock(&b.spineLock)
- }
-
- // We have a block. Insert the span atomically, since there may be
- // concurrent readers via the block API.
- atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), unsafe.Pointer(s))
-}
-
-// pop removes and returns a span from buffer b, or nil if b is empty.
-// pop is safe to call concurrently with other pop operations, but NOT
-// to call concurrently with push.
-func (b *gcSweepBuf) pop() *mspan {
- cursor := atomic.Xadd(&b.index, -1)
- if int32(cursor) < 0 {
- atomic.Xadd(&b.index, +1)
- return nil
- }
-
- // There are no concurrent spine or block modifications during
- // pop, so we can omit the atomics.
- top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
- blockp := (**gcSweepBlock)(add(b.spine, sys.PtrSize*uintptr(top)))
- block := *blockp
- s := block.spans[bottom]
- // Clear the pointer for block(i).
- block.spans[bottom] = nil
- return s
-}
-
-// numBlocks returns the number of blocks in buffer b. numBlocks is
-// safe to call concurrently with any other operation. Spans that have
-// been pushed prior to the call to numBlocks are guaranteed to appear
-// in some block in the range [0, numBlocks()), assuming there are no
-// intervening pops. Spans that are pushed after the call may also
-// appear in these blocks.
-func (b *gcSweepBuf) numBlocks() int {
- return int(divRoundUp(uintptr(atomic.Load(&b.index)), gcSweepBlockEntries))
-}
-
-// block returns the spans in the i'th block of buffer b. block is
-// safe to call concurrently with push. The block may contain nil
-// pointers that must be ignored, and each entry in the block must be
-// loaded atomically.
-func (b *gcSweepBuf) block(i int) []*mspan {
- // Perform bounds check before loading spine address since
- // push ensures the allocated length is at least spineLen.
- if i < 0 || uintptr(i) >= atomic.Loaduintptr(&b.spineLen) {
- throw("block index out of range")
- }
-
- // Get block i.
- spine := atomic.Loadp(unsafe.Pointer(&b.spine))
- blockp := add(spine, sys.PtrSize*uintptr(i))
- block := (*gcSweepBlock)(atomic.Loadp(blockp))
-
- // Slice the block if necessary.
- cursor := uintptr(atomic.Load(&b.index))
- top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
- var spans []*mspan
- if uintptr(i) < top {
- spans = block.spans[:]
- } else {
- spans = block.spans[:bottom]
- }
- return spans
-}
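
The deleted gcSweepBuf above is a two-level structure: a flat cursor is split into a spine index (which block) and an offset within that block. The index math used by push and pop, extracted into a standalone sketch:

package main

import "fmt"

const entriesPerBlock = 512 // matches the deleted gcSweepBlockEntries

// split converts a flat cursor into (spine index, offset within block),
// exactly as the deleted push/pop did with gcSweepBlockEntries.
func split(cursor uintptr) (top, bottom uintptr) {
	return cursor / entriesPerBlock, cursor % entriesPerBlock
}

func main() {
	for _, c := range []uintptr{0, 511, 512, 1300} {
		top, bottom := split(c)
		fmt.Printf("cursor %4d -> block %d, slot %d\n", c, top, bottom)
	}
}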
diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go
index d23d640..b3a0686 100644
--- a/libgo/go/runtime/mgcwork.go
+++ b/libgo/go/runtime/mgcwork.go
@@ -22,13 +22,6 @@ const (
workbufAlloc = 32 << 10
)
-// throwOnGCWork causes any operations that add pointers to a gcWork
-// buffer to throw.
-//
-// TODO(austin): This is a temporary debugging measure for issue
-// #27993. To be removed before release.
-var throwOnGCWork bool
-
func init() {
if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 {
throw("bad workbufAlloc")
@@ -93,17 +86,6 @@ type gcWork struct {
// termination check. Specifically, this indicates that this
// gcWork may have communicated work to another gcWork.
flushedWork bool
-
- // pauseGen causes put operations to spin while pauseGen ==
- // gcWorkPauseGen if debugCachedWork is true.
- pauseGen uint32
-
- // putGen is the pauseGen of the last putGen.
- putGen uint32
-
- // pauseStack is the stack at which this P was paused if
- // debugCachedWork is true.
- pauseStack [16]location
}
// Most of the methods of gcWork are go:nowritebarrierrec because the
@@ -122,59 +104,10 @@ func (w *gcWork) init() {
w.wbuf2 = wbuf2
}
-func (w *gcWork) checkPut(ptr uintptr, ptrs []uintptr) {
- if debugCachedWork {
- alreadyFailed := w.putGen == w.pauseGen
- w.putGen = w.pauseGen
- if !canPreemptM(getg().m) {
- // If we were to spin, the runtime may
- // deadlock. Since we can't be preempted, the
- // spin could prevent gcMarkDone from
- // finishing the ragged barrier, which is what
- // releases us from the spin.
- return
- }
- for atomic.Load(&gcWorkPauseGen) == w.pauseGen {
- }
- if throwOnGCWork {
- printlock()
- if alreadyFailed {
- println("runtime: checkPut already failed at this generation")
- }
- println("runtime: late gcWork put")
- if ptr != 0 {
- gcDumpObject("ptr", ptr, ^uintptr(0))
- }
- for _, ptr := range ptrs {
- gcDumpObject("ptrs", ptr, ^uintptr(0))
- }
- println("runtime: paused at")
- for _, loc := range w.pauseStack {
- if loc.pc == 0 {
- break
- }
- if loc.function != "" {
- // Obviously this doesn't
- // relate to ancestor
- // tracebacks, but this
- // function prints what we
- // want.
- printAncestorTracebackFuncInfo(loc.function, loc.filename, loc.lineno, loc.pc)
- } else {
- println("\tunknown PC ", hex(loc.pc), "\n")
- }
- }
- throw("throwOnGCWork")
- }
- }
-}
-
// put enqueues a pointer for the garbage collector to trace.
// obj must point to the beginning of a heap object or an oblet.
//go:nowritebarrierrec
func (w *gcWork) put(obj uintptr) {
- w.checkPut(obj, nil)
-
flushed := false
wbuf := w.wbuf1
// Record that this may acquire the wbufSpans or heap lock to
@@ -213,8 +146,6 @@ func (w *gcWork) put(obj uintptr) {
// otherwise it returns false and the caller needs to call put.
//go:nowritebarrierrec
func (w *gcWork) putFast(obj uintptr) bool {
- w.checkPut(obj, nil)
-
wbuf := w.wbuf1
if wbuf == nil {
return false
@@ -236,8 +167,6 @@ func (w *gcWork) putBatch(obj []uintptr) {
return
}
- w.checkPut(0, obj)
-
flushed := false
wbuf := w.wbuf1
if wbuf == nil {
@@ -359,12 +288,10 @@ func (w *gcWork) balance() {
return
}
if wbuf := w.wbuf2; wbuf.nobj != 0 {
- w.checkPut(0, wbuf.obj[:wbuf.nobj])
putfull(wbuf)
w.flushedWork = true
w.wbuf2 = getempty()
} else if wbuf := w.wbuf1; wbuf.nobj > 4 {
- w.checkPut(0, wbuf.obj[:wbuf.nobj])
w.wbuf1 = handoff(wbuf)
w.flushedWork = true // handoff did putfull
} else {
@@ -444,7 +371,7 @@ func getempty() *workbuf {
}
if s == nil {
systemstack(func() {
- s = mheap_.allocManual(workbufAlloc/pageSize, &memstats.gc_sys)
+ s = mheap_.allocManual(workbufAlloc/pageSize, spanAllocWorkBuf)
})
if s == nil {
throw("out of memory")
@@ -546,7 +473,7 @@ func freeSomeWbufs(preemptible bool) bool {
break
}
work.wbufSpans.free.remove(span)
- mheap_.freeManual(span, &memstats.gc_sys)
+ mheap_.freeManual(span, spanAllocWorkBuf)
}
})
more := !work.wbufSpans.free.isEmpty()
diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go
index 755efd1..77f3987 100644
--- a/libgo/go/runtime/mheap.go
+++ b/libgo/go/runtime/mheap.go
@@ -42,17 +42,13 @@ const (
// roughly 100µs.
//
// Must be a multiple of the pageInUse bitmap element size and
- // must also evenly divid pagesPerArena.
+ // must also evenly divide pagesPerArena.
pagesPerReclaimerChunk = 512
- // go115NewMCentralImpl is a feature flag for the new mcentral implementation.
- //
- // This flag depends on go115NewMarkrootSpans because the new mcentral
- // implementation requires that markroot spans no longer rely on mgcsweepbufs.
- // The definition of this flag helps ensure that if there's a problem with
- // the new markroot spans implementation and it gets turned off, that the new
- // mcentral implementation also gets turned off so the runtime isn't broken.
- go115NewMCentralImpl = true && go115NewMarkrootSpans
+ // physPageAlignedStacks indicates whether stack allocations must be
+ // physical page aligned. This is a requirement for MAP_STACK on
+ // OpenBSD.
+ physPageAlignedStacks = GOOS == "openbsd"
)
// Main malloc heap.
@@ -85,19 +81,6 @@ type mheap struct {
// access (since that may free the backing store).
allspans []*mspan // all spans out there
- // sweepSpans contains two mspan stacks: one of swept in-use
- // spans, and one of unswept in-use spans. These two trade
- // roles on each GC cycle. Since the sweepgen increases by 2
- // on each cycle, this means the swept spans are in
- // sweepSpans[sweepgen/2%2] and the unswept spans are in
- // sweepSpans[1-sweepgen/2%2]. Sweeping pops spans from the
- // unswept stack and pushes spans that are still in-use on the
- // swept stack. Likewise, allocating an in-use span pushes it
- // on the swept stack.
- //
- // For !go115NewMCentralImpl.
- sweepSpans [2]gcSweepBuf
-
_ uint32 // align uint64 fields on 32-bit for atomics
// Proportional sweep
@@ -150,13 +133,6 @@ type mheap struct {
// This is accessed atomically.
reclaimCredit uintptr
- // Malloc stats.
- largealloc uint64 // bytes allocated for large objects
- nlargealloc uint64 // number of large object allocations
- largefree uint64 // bytes freed for large objects (>maxsmallsize)
- nlargefree uint64 // number of frees for large objects (>maxsmallsize)
- nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
-
// arenas is the heap arena map. It points to the metadata for
// the heap for every arena frame of the entire usable virtual
// address space.
@@ -220,7 +196,7 @@ type mheap struct {
base, end uintptr
}
- // _ uint32 // ensure 64-bit alignment of central
+ _ uint32 // ensure 64-bit alignment of central
// central free lists for small size classes.
// the padding makes sure that the mcentrals are
@@ -300,6 +276,10 @@ type heapArena struct {
// during marking.
pageSpecials [pagesPerArena / 8]uint8
+ // checkmarks stores the debug.gccheckmark state. It is only
+ // used if debug.gccheckmark > 0.
+ checkmarks *checkmarksMap
+
// zeroedBase marks the first byte of the first page in this
// arena which hasn't been used yet and is therefore already
// zero. zeroedBase is relative to the arena base.
@@ -508,10 +488,15 @@ func (s *mspan) layout() (size, n, total uintptr) {
// indirect call from the fixalloc initializer, the compiler can't see
// this.
//
+// The heap lock must be held.
+//
//go:nowritebarrierrec
func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
h := (*mheap)(vh)
s := (*mspan)(p)
+
+ assertLockHeld(&h.lock)
+
if len(h.allspans) >= cap(h.allspans) {
n := 64 * 1024 / sys.PtrSize
if n < cap(h.allspans)*3/2 {
@@ -715,8 +700,6 @@ func pageIndexOf(p uintptr) (arena *heapArena, pageIdx uintptr, pageMask uint8)
// Initialize the heap.
func (h *mheap) init() {
lockInit(&h.lock, lockRankMheap)
- lockInit(&h.sweepSpans[0].spineLock, lockRankSpine)
- lockInit(&h.sweepSpans[1].spineLock, lockRankSpine)
lockInit(&h.speciallock, lockRankMheapSpecial)
h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
@@ -740,7 +723,7 @@ func (h *mheap) init() {
h.central[i].mcentral.init(spanClass(i))
}
- h.pages.init(&h.lock, &memstats.gc_sys)
+ h.pages.init(&h.lock, &memstats.gcMiscSys)
}
// reclaim sweeps and reclaims at least npage pages into the heap.
@@ -748,7 +731,7 @@ func (h *mheap) init() {
//
// reclaim implements the page-reclaimer half of the sweeper.
//
-// h must NOT be locked.
+// h.lock must NOT be held.
func (h *mheap) reclaim(npage uintptr) {
// TODO(austin): Half of the time spent freeing spans is in
// locking/unlocking the heap (even with low contention). We
@@ -831,6 +814,8 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
// In particular, if a span were freed and merged concurrently
// with this probing heapArena.spans, it would be possible to
// observe arbitrary, stale span pointers.
+ assertLockHeld(&h.lock)
+
n0 := n
var nFreed uintptr
sg := h.sweepgen
@@ -885,9 +870,27 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
traceGCSweepSpan((n0 - nFreed) * pageSize)
lock(&h.lock)
}
+
+ assertLockHeld(&h.lock) // Must be locked on return.
return nFreed
}
+// spanAllocType represents the type of allocation to make, or
+// the type of allocation to be freed.
+type spanAllocType uint8
+
+const (
+ spanAllocHeap spanAllocType = iota // heap span
+ spanAllocStack // stack span
+ spanAllocPtrScalarBits // unrolled GC prog bitmap span
+ spanAllocWorkBuf // work buf span
+)
+
+// manual returns true if the span allocation is manually managed.
+func (s spanAllocType) manual() bool {
+ return s != spanAllocHeap
+}
+
// alloc allocates a new span of npage pages from the GC'd heap.
//
// spanclass indicates the span's size class and scannability.
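As a standalone illustration of the spanAllocType plumbing introduced above (all names here are invented for the sketch, not the runtime's): a small enum replaces the old manual bool and per-call *uint64 stat pointer, and the manual() predicate lets an entry point such as allocManual reject misuse.

package main

import "fmt"

type spanKind uint8

const (
	kindHeap          spanKind = iota // GC-managed heap span
	kindStack                         // manually-managed stack span
	kindPtrScalarBits                 // manually-managed GC prog bitmap span
	kindWorkBuf                       // manually-managed work buffer span
)

// manual reports whether the kind is manually managed, i.e. anything
// other than a GC heap span, the same rule as spanAllocType.manual.
func (k spanKind) manual() bool { return k != kindHeap }

// allocManualLike mirrors the guard added to allocManual: callers must
// pass a manually-managed kind, never kindHeap.
func allocManualLike(npages uintptr, k spanKind) {
	if !k.manual() {
		panic("manual span allocation called with non-manually-managed type")
	}
	fmt.Printf("allocating %d pages for kind %d\n", npages, k)
}

func main() {
	allocManualLike(4, kindStack) // ok
	// allocManualLike(4, kindHeap) would panic, as in the real guard.
}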
@@ -904,7 +907,7 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan
if h.sweepdone == 0 {
h.reclaim(npages)
}
- s = h.allocSpan(npages, false, spanclass, &memstats.heap_inuse)
+ s = h.allocSpan(npages, spanAllocHeap, spanclass)
})
if s != nil {
@@ -929,9 +932,15 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan
// allocManual must be called on the system stack because it may
// acquire the heap lock via allocSpan. See mheap for details.
//
+// If new code is written to call allocManual, do NOT use an
+// existing spanAllocType value and instead declare a new one.
+//
//go:systemstack
-func (h *mheap) allocManual(npages uintptr, stat *uint64) *mspan {
- return h.allocSpan(npages, true, 0, stat)
+func (h *mheap) allocManual(npages uintptr, typ spanAllocType) *mspan {
+ if !typ.manual() {
+ throw("manual span allocation called with non-manually-managed type")
+ }
+ return h.allocSpan(npages, typ, 0)
}
// setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize))
@@ -1016,7 +1025,7 @@ func (h *mheap) allocNeedsZero(base, npage uintptr) (needZero bool) {
// tryAllocMSpan attempts to allocate an mspan object from
// the P-local cache, but may fail.
//
-// h need not be locked.
+// h.lock need not be held.
//
// This caller must ensure that its P won't change underneath
// it during this function. Currently to ensure that we enforce
@@ -1040,7 +1049,7 @@ func (h *mheap) tryAllocMSpan() *mspan {
// allocMSpanLocked allocates an mspan object.
//
-// h must be locked.
+// h.lock must be held.
//
// allocMSpanLocked must be called on the system stack because
// its caller holds the heap lock. See mheap for details.
@@ -1049,6 +1058,8 @@ func (h *mheap) tryAllocMSpan() *mspan {
//
//go:systemstack
func (h *mheap) allocMSpanLocked() *mspan {
+ assertLockHeld(&h.lock)
+
pp := getg().m.p.ptr()
if pp == nil {
// We don't have a p so just do the normal thing.
@@ -1070,7 +1081,7 @@ func (h *mheap) allocMSpanLocked() *mspan {
// freeMSpanLocked free an mspan object.
//
-// h must be locked.
+// h.lock must be held.
//
// freeMSpanLocked must be called on the system stack because
// its caller holds the heap lock. See mheap for details.
@@ -1079,6 +1090,8 @@ func (h *mheap) allocMSpanLocked() *mspan {
//
//go:systemstack
func (h *mheap) freeMSpanLocked(s *mspan) {
+ assertLockHeld(&h.lock)
+
pp := getg().m.p.ptr()
// First try to free the mspan directly to the cache.
if pp != nil && pp.mspancache.len < len(pp.mspancache.buf) {
@@ -1093,7 +1106,7 @@ func (h *mheap) freeMSpanLocked(s *mspan) {
// allocSpan allocates an mspan which owns npages worth of memory.
//
-// If manual == false, allocSpan allocates a heap span of class spanclass
+// If typ.manual() == false, allocSpan allocates a heap span of class spanclass
// and updates heap accounting. If manual == true, allocSpan allocates a
// manually-managed span (spanclass is ignored), and the caller is
// responsible for any accounting related to its use of the span. Either
@@ -1102,20 +1115,27 @@ func (h *mheap) freeMSpanLocked(s *mspan) {
//
// The returned span is fully initialized.
//
-// h must not be locked.
+// h.lock must not be held.
//
// allocSpan must be called on the system stack both because it acquires
// the heap lock and because it must block GC transitions.
//
//go:systemstack
-func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysStat *uint64) (s *mspan) {
+func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass) (s *mspan) {
// Function-global state.
gp := getg()
base, scav := uintptr(0), uintptr(0)
+ // On some platforms we need to provide physical page aligned stack
+ // allocations. Where the page size is less than the physical page
+ // size, we already manage to do this by default.
+ needPhysPageAlign := physPageAlignedStacks && typ == spanAllocStack && pageSize < physPageSize
+
// If the allocation is small enough, try the page cache!
+ // The page cache does not support aligned allocations, so we cannot use
+ // it if we need to provide a physical page aligned stack allocation.
pp := gp.m.p.ptr()
- if pp != nil && npages < pageCachePages/4 {
+ if !needPhysPageAlign && pp != nil && npages < pageCachePages/4 {
c := &pp.pcache
// If the cache is empty, refill it.
@@ -1129,23 +1149,11 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
base, scav = c.alloc(npages)
if base != 0 {
s = h.tryAllocMSpan()
-
- if s != nil && gcBlackenEnabled == 0 && (manual || spanclass.sizeclass() != 0) {
+ if s != nil {
goto HaveSpan
}
- // We're either running duing GC, failed to acquire a mspan,
- // or the allocation is for a large object. This means we
- // have to lock the heap and do a bunch of extra work,
- // so go down the HaveBaseLocked path.
- //
- // We must do this during GC to avoid skew with heap_scan
- // since we flush mcache stats whenever we lock.
- //
- // TODO(mknyszek): It would be nice to not have to
- // lock the heap if it's a large allocation, but
- // it's fine for now. The critical section here is
- // short and large object allocations are relatively
- // infrequent.
+ // We have a base but no mspan, so we need
+ // to lock the heap.
}
}
@@ -1153,6 +1161,11 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
// whole job done without the heap lock.
lock(&h.lock)
+ if needPhysPageAlign {
+ // Overallocate by a physical page to allow for later alignment.
+ npages += physPageSize / pageSize
+ }
+
if base == 0 {
// Try to acquire a base address.
base, scav = h.pages.alloc(npages)
@@ -1172,39 +1185,23 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
// one now that we have the heap lock.
s = h.allocMSpanLocked()
}
- if !manual {
- // This is a heap span, so we should do some additional accounting
- // which may only be done with the heap locked.
- // Transfer stats from mcache to global.
- var c *mcache
- if gp.m.p != 0 {
- c = gp.m.p.ptr().mcache
- } else {
- // This case occurs while bootstrapping.
- // See the similar code in mallocgc.
- c = mcache0
- if c == nil {
- throw("mheap.allocSpan called with no P")
- }
- }
- memstats.heap_scan += uint64(c.local_scan)
- c.local_scan = 0
- memstats.tinyallocs += uint64(c.local_tinyallocs)
- c.local_tinyallocs = 0
-
- // Do some additional accounting if it's a large allocation.
- if spanclass.sizeclass() == 0 {
- mheap_.largealloc += uint64(npages * pageSize)
- mheap_.nlargealloc++
- atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
- }
+ if needPhysPageAlign {
+ allocBase, allocPages := base, npages
+ base = alignUp(allocBase, physPageSize)
+ npages -= physPageSize / pageSize
- // Either heap_live or heap_scan could have been updated.
- if gcBlackenEnabled != 0 {
- gcController.revise()
+ // Return memory around the aligned allocation.
+ spaceBefore := base - allocBase
+ if spaceBefore > 0 {
+ h.pages.free(allocBase, spaceBefore/pageSize)
+ }
+ spaceAfter := (allocPages-npages)*pageSize - spaceBefore
+ if spaceAfter > 0 {
+ h.pages.free(base+npages*pageSize, spaceAfter/pageSize)
}
}
+
unlock(&h.lock)
HaveSpan:
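The needPhysPageAlign path above over-allocates by physPageSize/pageSize pages, aligns the base up to a physical page boundary, and hands the slack on both sides back to the page allocator. A minimal sketch of that arithmetic, assuming an 8 KiB runtime page and a 16 KiB physical page purely for illustration (the path only runs when pageSize is smaller than physPageSize):

package main

import "fmt"

const (
	pageSize     = 8 << 10  // assumed runtime page size for this sketch
	physPageSize = 16 << 10 // assumed physical page size for this sketch
)

func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

// alignSpan mimics the needPhysPageAlign branch in allocSpan: the caller
// over-allocated by physPageSize/pageSize pages, so there is always room
// to slide the base up to a physical page boundary and give the slack back.
// freeBefore and freeAfter are byte counts; the allocator frees them as
// freeBefore/pageSize and freeAfter/pageSize pages respectively.
func alignSpan(allocBase, allocPages uintptr) (base, npages, freeBefore, freeAfter uintptr) {
	base = alignUp(allocBase, physPageSize)
	npages = allocPages - physPageSize/pageSize

	freeBefore = base - allocBase                         // slack below the aligned base
	freeAfter = (allocPages-npages)*pageSize - freeBefore // remaining slack above the span
	return
}

func main() {
	// Ask for 3 pages; the allocator over-allocates to 3+2 = 5 pages.
	allocBase := uintptr(0x1000_2000) // 8 KiB aligned, not 16 KiB aligned
	base, npages, before, after := alignSpan(allocBase, 5)
	fmt.Printf("base=%#x npages=%d freeBefore=%d freeAfter=%d\n", base, npages, before, after)
}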
@@ -1215,12 +1212,10 @@ HaveSpan:
s.needzero = 1
}
nbytes := npages * pageSize
- if manual {
+ if typ.manual() {
s.manualFreeList = 0
s.nelems = 0
s.limit = s.base() + s.npages*pageSize
- // Manually managed memory doesn't count toward heap_sys.
- mSysStatDec(&memstats.heap_sys, s.npages*pageSize)
s.state.set(mSpanManual)
} else {
// We must set span properties before the span is published anywhere
@@ -1274,11 +1269,31 @@ HaveSpan:
// sysUsed all the pages that are actually available
// in the span since some of them might be scavenged.
sysUsed(unsafe.Pointer(base), nbytes)
- mSysStatDec(&memstats.heap_released, scav)
+ atomic.Xadd64(&memstats.heap_released, -int64(scav))
}
// Update stats.
- mSysStatInc(sysStat, nbytes)
- mSysStatDec(&memstats.heap_idle, nbytes)
+ if typ == spanAllocHeap {
+ atomic.Xadd64(&memstats.heap_inuse, int64(nbytes))
+ }
+ if typ.manual() {
+ // Manually managed memory doesn't count toward heap_sys.
+ memstats.heap_sys.add(-int64(nbytes))
+ }
+ // Update consistent stats.
+ stats := memstats.heapStats.acquire()
+ atomic.Xaddint64(&stats.committed, int64(scav))
+ atomic.Xaddint64(&stats.released, -int64(scav))
+ switch typ {
+ case spanAllocHeap:
+ atomic.Xaddint64(&stats.inHeap, int64(nbytes))
+ case spanAllocStack:
+ atomic.Xaddint64(&stats.inStacks, int64(nbytes))
+ case spanAllocPtrScalarBits:
+ atomic.Xaddint64(&stats.inPtrScalarBits, int64(nbytes))
+ case spanAllocWorkBuf:
+ atomic.Xaddint64(&stats.inWorkBufs, int64(nbytes))
+ }
+ memstats.heapStats.release()
// Publish the span in various locations.
@@ -1289,17 +1304,7 @@ HaveSpan:
// before that happens) or pageInUse is updated.
h.setSpans(s.base(), npages, s)
- if !manual {
- if !go115NewMCentralImpl {
- // Add to swept in-use list.
- //
- // This publishes the span to root marking.
- //
- // h.sweepgen is guaranteed to only change during STW,
- // and preemption is disabled in the page allocator.
- h.sweepSpans[h.sweepgen/2%2].push(s)
- }
-
+ if !typ.manual() {
// Mark in-use span in arena page bitmap.
//
// This publishes the span to the page sweeper, so
@@ -1310,11 +1315,6 @@ HaveSpan:
// Update related page sweeper stats.
atomic.Xadd64(&h.pagesInUse, int64(npages))
-
- if trace.enabled {
- // Trace that a heap alloc occurred.
- traceHeapAlloc()
- }
}
// Make sure the newly allocated span will be observed
@@ -1327,8 +1327,10 @@ HaveSpan:
// Try to add at least npage pages of memory to the heap,
// returning whether it worked.
//
-// h must be locked.
+// h.lock must be held.
func (h *mheap) grow(npage uintptr) bool {
+ assertLockHeld(&h.lock)
+
// We must grow the heap in whole palloc chunks.
ask := alignUp(npage, pallocChunkPages) * pageSize
@@ -1370,8 +1372,10 @@ func (h *mheap) grow(npage uintptr) bool {
// The allocation is always aligned to the heap arena
// size which is always > physPageSize, so it's safe to
// just add directly to heap_released.
- mSysStatInc(&memstats.heap_released, asize)
- mSysStatInc(&memstats.heap_idle, asize)
+ atomic.Xadd64(&memstats.heap_released, int64(asize))
+ stats := memstats.heapStats.acquire()
+ atomic.Xaddint64(&stats.released, int64(asize))
+ memstats.heapStats.release()
// Recalculate nBase.
// We know this won't overflow, because sysAlloc returned
@@ -1403,29 +1407,20 @@ func (h *mheap) grow(npage uintptr) bool {
// Free the span back into the heap.
func (h *mheap) freeSpan(s *mspan) {
systemstack(func() {
- c := getg().m.p.ptr().mcache
lock(&h.lock)
- memstats.heap_scan += uint64(c.local_scan)
- c.local_scan = 0
- memstats.tinyallocs += uint64(c.local_tinyallocs)
- c.local_tinyallocs = 0
if msanenabled {
// Tell msan that this entire span is no longer in use.
base := unsafe.Pointer(s.base())
bytes := s.npages << _PageShift
msanfree(base, bytes)
}
- if gcBlackenEnabled != 0 {
- // heap_scan changed.
- gcController.revise()
- }
- h.freeSpanLocked(s, true, true)
+ h.freeSpanLocked(s, spanAllocHeap)
unlock(&h.lock)
})
}
// freeManual frees a manually-managed span returned by allocManual.
-// stat must be the same as the stat passed to the allocManual that
+// typ must be the same as the spanAllocType passed to the allocManual that
// allocated s.
//
// This must only be called when gcphase == _GCoff. See mSpanState for
@@ -1435,16 +1430,16 @@ func (h *mheap) freeSpan(s *mspan) {
// the heap lock. See mheap for details.
//
//go:systemstack
-func (h *mheap) freeManual(s *mspan, stat *uint64) {
+func (h *mheap) freeManual(s *mspan, typ spanAllocType) {
s.needzero = 1
lock(&h.lock)
- mSysStatDec(stat, s.npages*pageSize)
- mSysStatInc(&memstats.heap_sys, s.npages*pageSize)
- h.freeSpanLocked(s, false, true)
+ h.freeSpanLocked(s, typ)
unlock(&h.lock)
}
-func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) {
+func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
+ assertLockHeld(&h.lock)
+
switch s.state.get() {
case mSpanManual:
if s.allocCount != 0 {
@@ -1464,12 +1459,30 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) {
throw("mheap.freeSpanLocked - invalid span state")
}
- if acctinuse {
- mSysStatDec(&memstats.heap_inuse, s.npages*pageSize)
- }
- if acctidle {
- mSysStatInc(&memstats.heap_idle, s.npages*pageSize)
- }
+ // Update stats.
+ //
+ // Mirrors the code in allocSpan.
+ nbytes := s.npages * pageSize
+ if typ == spanAllocHeap {
+ atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes))
+ }
+ if typ.manual() {
+ // Manually managed memory doesn't count toward heap_sys, so add it back.
+ memstats.heap_sys.add(int64(nbytes))
+ }
+ // Update consistent stats.
+ stats := memstats.heapStats.acquire()
+ switch typ {
+ case spanAllocHeap:
+ atomic.Xaddint64(&stats.inHeap, -int64(nbytes))
+ case spanAllocStack:
+ atomic.Xaddint64(&stats.inStacks, -int64(nbytes))
+ case spanAllocPtrScalarBits:
+ atomic.Xaddint64(&stats.inPtrScalarBits, -int64(nbytes))
+ case spanAllocWorkBuf:
+ atomic.Xaddint64(&stats.inWorkBufs, -int64(nbytes))
+ }
+ memstats.heapStats.release()
// Mark the space as free.
h.pages.free(s.base(), s.npages)
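The stat updates in freeSpanLocked deliberately mirror allocSpan: whatever an allocation adds, the matching free subtracts, including the rule that manually-managed memory does not count toward heap_sys. A toy model of that symmetry (field and type names are assumptions of the sketch, not runtime fields):

package main

import "fmt"

type kind int

const (
	heapSpan kind = iota
	stackSpan
)

type stats struct {
	heapInuse, heapSys int64
	inHeap, inStacks   int64
}

// apply adds (sign=+1) or removes (sign=-1) a span's bytes, the same
// bookkeeping performed in both allocSpan and freeSpanLocked.
func (s *stats) apply(k kind, nbytes, sign int64) {
	d := sign * nbytes
	if k == heapSpan {
		s.heapInuse += d
		s.inHeap += d
	} else {
		// Manually managed memory doesn't count toward heap_sys.
		s.heapSys -= d
		s.inStacks += d
	}
}

func main() {
	var s stats
	s.heapSys = 1 << 20 // pretend 1 MiB is mapped for the heap

	s.apply(stackSpan, 32<<10, +1) // allocate a 32 KiB stack span
	s.apply(stackSpan, 32<<10, -1) // free it again

	// Because free mirrors alloc, the net effect on every stat is zero.
	fmt.Printf("%+v\n", s)
}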
@@ -1701,9 +1714,7 @@ func addspecial(p unsafe.Pointer, s *special) bool {
s.offset = uint16(offset)
s.next = *t
*t = s
- if go115NewMarkrootSpans {
- spanHasSpecials(span)
- }
+ spanHasSpecials(span)
unlock(&span.speciallock)
releasem(mp)
@@ -1744,7 +1755,7 @@ func removespecial(p unsafe.Pointer, kind uint8) *special {
}
t = &s.next
}
- if go115NewMarkrootSpans && span.specials == nil {
+ if span.specials == nil {
spanHasNoSpecials(span)
}
unlock(&span.speciallock)
@@ -2012,7 +2023,7 @@ func newArenaMayUnlock() *gcBitsArena {
var result *gcBitsArena
if gcBitsArenas.free == nil {
unlock(&gcBitsArenas.lock)
- result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
+ result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gcMiscSys))
if result == nil {
throw("runtime: cannot allocate memory")
}
diff --git a/libgo/go/runtime/mkfastlog2table.go b/libgo/go/runtime/mkfastlog2table.go
index 305c84a..d650292 100644
--- a/libgo/go/runtime/mkfastlog2table.go
+++ b/libgo/go/runtime/mkfastlog2table.go
@@ -12,9 +12,9 @@ package main
import (
"bytes"
"fmt"
- "io/ioutil"
"log"
"math"
+ "os"
)
func main() {
@@ -36,7 +36,7 @@ func main() {
}
fmt.Fprintln(&buf, "}")
- if err := ioutil.WriteFile("fastlog2table.go", buf.Bytes(), 0644); err != nil {
+ if err := os.WriteFile("fastlog2table.go", buf.Bytes(), 0644); err != nil {
log.Fatalln(err)
}
}
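The io/ioutil to os change above follows the Go 1.16 deprecation of package ioutil: os.WriteFile has the same signature and semantics as ioutil.WriteFile, so generators like this one switch with a one-line edit. A minimal example of the replacement call (the file name is arbitrary):

package main

import (
	"log"
	"os"
)

func main() {
	// os.WriteFile(name, data, perm) creates or truncates the file and
	// writes data with the given permissions, exactly like the old
	// ioutil.WriteFile it replaces.
	if err := os.WriteFile("example.txt", []byte("generated\n"), 0644); err != nil {
		log.Fatalln(err)
	}
}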
diff --git a/libgo/go/runtime/mkpreempt.go b/libgo/go/runtime/mkpreempt.go
index 268941d..8644973 100644
--- a/libgo/go/runtime/mkpreempt.go
+++ b/libgo/go/runtime/mkpreempt.go
@@ -127,12 +127,13 @@ func header(arch string) {
}
fmt.Fprintf(out, "#include \"go_asm.h\"\n")
fmt.Fprintf(out, "#include \"textflag.h\"\n\n")
- fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
+ fmt.Fprintf(out, "// Note: asyncPreempt doesn't use the internal ABI, but we must be able to inject calls to it from the signal handler, so Go code has to see the PC of this function literally.\n")
+ fmt.Fprintf(out, "TEXT ·asyncPreempt<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-0\n")
}
func p(f string, args ...interface{}) {
fmted := fmt.Sprintf(f, args...)
- fmt.Fprintf(out, "\t%s\n", strings.Replace(fmted, "\n", "\n\t", -1))
+ fmt.Fprintf(out, "\t%s\n", strings.ReplaceAll(fmted, "\n", "\n\t"))
}
func label(l string) {
@@ -190,7 +191,6 @@ func (l *layout) restore() {
func gen386() {
p("PUSHFL")
-
// Save general purpose registers.
var l = layout{sp: "SP"}
for _, reg := range regNames386 {
@@ -200,12 +200,6 @@ func gen386() {
l.add("MOVL", reg, 4)
}
- // Save the 387 state.
- l.addSpecial(
- "FSAVE %d(SP)\nFLDCW runtime·controlWord64(SB)",
- "FRSTOR %d(SP)",
- 108)
-
// Save SSE state only if supported.
lSSE := layout{stack: l.stack, sp: "SP"}
for i := 0; i < 8; i++ {
@@ -356,10 +350,10 @@ func genARM64() {
p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux)
p("SUB $8, RSP, R29") // set up new frame pointer
p("#endif")
- // On darwin, save the LR again after decrementing SP. We run the
- // signal handler on the G stack (as it doesn't support SA_ONSTACK),
+ // On iOS, save the LR again after decrementing SP. We run the
+ // signal handler on the G stack (as it doesn't support sigaltstack),
// so any writes below SP may be clobbered.
- p("#ifdef GOOS_darwin")
+ p("#ifdef GOOS_ios")
p("MOVD R30, (RSP)")
p("#endif")
@@ -508,12 +502,12 @@ func genRISCV() {
}
func genRISCV64() {
- // X0 (zero), X1 (LR), X2 (SP), X4 (g), X31 (TMP) are special.
+ // X0 (zero), X1 (LR), X2 (SP), X4 (TP), X27 (g), X31 (TMP) are special.
var l = layout{sp: "X2", stack: 8}
- // Add integer registers (X3, X5-X30).
+ // Add integer registers (X3, X5-X26, X28-X30).
for i := 3; i < 31; i++ {
- if i == 4 {
+ if i == 4 || i == 27 {
continue
}
reg := fmt.Sprintf("X%d", i)
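With X27 now reserved for the g register alongside X4 (TP), the riscv64 save loop skips both. A small standalone sketch of what that loop enumerates; the real generator also emits the MOV instructions and frame layout:

package main

import "fmt"

func main() {
	// X0 (zero), X1 (LR), X2 (SP), X4 (TP), X27 (g) and X31 (TMP) are
	// special on riscv64, so the preemption frame saves X3 and X5 through
	// X30 with X4 and X27 skipped, the same filter as the generator loop.
	var saved []string
	for i := 3; i < 31; i++ {
		if i == 4 || i == 27 {
			continue
		}
		saved = append(saved, fmt.Sprintf("X%d", i))
	}
	fmt.Println(len(saved), "integer registers:", saved)
}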
diff --git a/libgo/go/runtime/mksizeclasses.go b/libgo/go/runtime/mksizeclasses.go
index cacbb64..b92d1fe 100644
--- a/libgo/go/runtime/mksizeclasses.go
+++ b/libgo/go/runtime/mksizeclasses.go
@@ -35,7 +35,6 @@ import (
"fmt"
"go/format"
"io"
- "io/ioutil"
"log"
"os"
)
@@ -65,7 +64,7 @@ func main() {
if *stdout {
_, err = os.Stdout.Write(out)
} else {
- err = ioutil.WriteFile("sizeclasses.go", out, 0666)
+ err = os.WriteFile("sizeclasses.go", out, 0666)
}
if err != nil {
log.Fatal(err)
@@ -110,8 +109,8 @@ func makeClasses() []class {
align = 256
} else if size >= 128 {
align = size / 8
- } else if size >= 16 {
- align = 16 // required for x86 SSE instructions, if we want to use them
+ } else if size >= 32 {
+ align = 16 // heap bitmaps assume 16 byte alignment for allocations >= 32 bytes.
}
}
if !powerOfTwo(align) {
@@ -157,7 +156,7 @@ func makeClasses() []class {
}
}
- if len(classes) != 67 {
+ if len(classes) != 68 {
panic("number of size classes has changed")
}
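The visible branches of the alignment rule now read: size classes of at least 128 bytes are aligned to size/8, and classes of at least 32 bytes to 16 bytes, because the heap bitmap assumes 16-byte alignment for allocations of 32 bytes or more. A sketch of just those two branches, with an assumed 8-byte fallback for smaller sizes (the full generator has further cases for larger classes):

package main

import "fmt"

// alignFor reproduces the visible part of the size-class alignment rule.
// The branches for larger classes are omitted, and the 8-byte fallback for
// tiny sizes is an assumption made only for this sketch.
func alignFor(size int) int {
	switch {
	case size >= 128:
		return size / 8
	case size >= 32:
		return 16 // heap bitmaps assume 16-byte alignment for allocations >= 32 bytes
	default:
		return 8
	}
}

func powerOfTwo(n int) bool { return n != 0 && n&(n-1) == 0 }

func main() {
	for _, size := range []int{16, 24, 32, 48, 128, 256} {
		a := alignFor(size)
		fmt.Printf("size %4d -> align %3d (power of two: %v)\n", size, a, powerOfTwo(a))
	}
}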
diff --git a/libgo/go/runtime/mpagealloc.go b/libgo/go/runtime/mpagealloc.go
index c90a637..dac1f39 100644
--- a/libgo/go/runtime/mpagealloc.go
+++ b/libgo/go/runtime/mpagealloc.go
@@ -293,13 +293,13 @@ type pageAlloc struct {
// sysStat is the runtime memstat to update when new system
// memory is committed by the pageAlloc for allocation metadata.
- sysStat *uint64
+ sysStat *sysMemStat
// Whether or not this struct is being used in tests.
test bool
}
-func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) {
+func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) {
if levelLogPages[0] > logMaxPackedValue {
// We can't represent 1<<levelLogPages[0] pages, the maximum number
// of pages we need to represent at the root level, in a summary, which
@@ -308,29 +308,29 @@ func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) {
print("runtime: summary max pages = ", maxPackedValue, "\n")
throw("root level max pages doesn't fit in summary")
}
- s.sysStat = sysStat
+ p.sysStat = sysStat
- // Initialize s.inUse.
- s.inUse.init(sysStat)
+ // Initialize p.inUse.
+ p.inUse.init(sysStat)
// System-dependent initialization.
- s.sysInit()
+ p.sysInit()
// Start with the searchAddr in a state indicating there's no free memory.
- s.searchAddr = maxSearchAddr
+ p.searchAddr = maxSearchAddr
// Set the mheapLock.
- s.mheapLock = mheapLock
+ p.mheapLock = mheapLock
// Initialize scavenge tracking state.
- s.scav.scavLWM = maxSearchAddr
+ p.scav.scavLWM = maxSearchAddr
}
// tryChunkOf returns the bitmap data for the given chunk.
//
// Returns nil if the chunk data has not been mapped.
-func (s *pageAlloc) tryChunkOf(ci chunkIdx) *pallocData {
- l2 := s.chunks[ci.l1()]
+func (p *pageAlloc) tryChunkOf(ci chunkIdx) *pallocData {
+ l2 := p.chunks[ci.l1()]
if l2 == nil {
return nil
}
@@ -340,15 +340,17 @@ func (s *pageAlloc) tryChunkOf(ci chunkIdx) *pallocData {
// chunkOf returns the chunk at the given chunk index.
//
// The chunk index must be valid or this method may throw.
-func (s *pageAlloc) chunkOf(ci chunkIdx) *pallocData {
- return &s.chunks[ci.l1()][ci.l2()]
+func (p *pageAlloc) chunkOf(ci chunkIdx) *pallocData {
+ return &p.chunks[ci.l1()][ci.l2()]
}
// grow sets up the metadata for the address range [base, base+size).
-// It may allocate metadata, in which case *s.sysStat will be updated.
+// It may allocate metadata, in which case *p.sysStat will be updated.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) grow(base, size uintptr) {
+// p.mheapLock must be held.
+func (p *pageAlloc) grow(base, size uintptr) {
+ assertLockHeld(p.mheapLock)
+
// Round up to chunks, since we can't deal with increments smaller
// than chunks. Also, sysGrow expects aligned values.
limit := alignUp(base+size, pallocChunkBytes)
@@ -356,29 +358,29 @@ func (s *pageAlloc) grow(base, size uintptr) {
// Grow the summary levels in a system-dependent manner.
// We just update a bunch of additional metadata here.
- s.sysGrow(base, limit)
+ p.sysGrow(base, limit)
- // Update s.start and s.end.
+ // Update p.start and p.end.
// If no growth happened yet, start == 0. This is generally
// safe since the zero page is unmapped.
- firstGrowth := s.start == 0
+ firstGrowth := p.start == 0
start, end := chunkIndex(base), chunkIndex(limit)
- if firstGrowth || start < s.start {
- s.start = start
+ if firstGrowth || start < p.start {
+ p.start = start
}
- if end > s.end {
- s.end = end
+ if end > p.end {
+ p.end = end
}
// Note that [base, limit) will never overlap with any existing
// range inUse because grow only ever adds never-used memory
// regions to the page allocator.
- s.inUse.add(makeAddrRange(base, limit))
+ p.inUse.add(makeAddrRange(base, limit))
// A grow operation is a lot like a free operation, so if our
- // chunk ends up below s.searchAddr, update s.searchAddr to the
+ // chunk ends up below p.searchAddr, update p.searchAddr to the
// new address, just like in free.
- if b := (offAddr{base}); b.lessThan(s.searchAddr) {
- s.searchAddr = b
+ if b := (offAddr{base}); b.lessThan(p.searchAddr) {
+ p.searchAddr = b
}
// Add entries into chunks, which is sparse, if needed. Then,
@@ -387,21 +389,21 @@ func (s *pageAlloc) grow(base, size uintptr) {
// Newly-grown memory is always considered scavenged.
// Set all the bits in the scavenged bitmaps high.
for c := chunkIndex(base); c < chunkIndex(limit); c++ {
- if s.chunks[c.l1()] == nil {
+ if p.chunks[c.l1()] == nil {
// Create the necessary l2 entry.
//
// Store it atomically to avoid races with readers which
// don't acquire the heap lock.
- r := sysAlloc(unsafe.Sizeof(*s.chunks[0]), s.sysStat)
- atomic.StorepNoWB(unsafe.Pointer(&s.chunks[c.l1()]), r)
+ r := sysAlloc(unsafe.Sizeof(*p.chunks[0]), p.sysStat)
+ atomic.StorepNoWB(unsafe.Pointer(&p.chunks[c.l1()]), r)
}
- s.chunkOf(c).scavenged.setRange(0, pallocChunkPages)
+ p.chunkOf(c).scavenged.setRange(0, pallocChunkPages)
}
// Update summaries accordingly. The grow acts like a free, so
// we need to ensure this newly-free memory is visible in the
// summaries.
- s.update(base, size/pageSize, true, false)
+ p.update(base, size/pageSize, true, false)
}
// update updates heap metadata. It must be called each time the bitmap
@@ -411,8 +413,10 @@ func (s *pageAlloc) grow(base, size uintptr) {
// a contiguous allocation or free between addr and addr+npages. alloc indicates
// whether the operation performed was an allocation or a free.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
+// p.mheapLock must be held.
+func (p *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
+ assertLockHeld(p.mheapLock)
+
// base, limit, start, and end are inclusive.
limit := base + npages*pageSize - 1
sc, ec := chunkIndex(base), chunkIndex(limit)
@@ -421,23 +425,23 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
if sc == ec {
// Fast path: the allocation doesn't span more than one chunk,
// so update this one and if the summary didn't change, return.
- x := s.summary[len(s.summary)-1][sc]
- y := s.chunkOf(sc).summarize()
+ x := p.summary[len(p.summary)-1][sc]
+ y := p.chunkOf(sc).summarize()
if x == y {
return
}
- s.summary[len(s.summary)-1][sc] = y
+ p.summary[len(p.summary)-1][sc] = y
} else if contig {
// Slow contiguous path: the allocation spans more than one chunk
// and at least one summary is guaranteed to change.
- summary := s.summary[len(s.summary)-1]
+ summary := p.summary[len(p.summary)-1]
// Update the summary for chunk sc.
- summary[sc] = s.chunkOf(sc).summarize()
+ summary[sc] = p.chunkOf(sc).summarize()
// Update the summaries for chunks in between, which are
// either totally allocated or freed.
- whole := s.summary[len(s.summary)-1][sc+1 : ec]
+ whole := p.summary[len(p.summary)-1][sc+1 : ec]
if alloc {
// Should optimize into a memclr.
for i := range whole {
@@ -450,22 +454,22 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
}
// Update the summary for chunk ec.
- summary[ec] = s.chunkOf(ec).summarize()
+ summary[ec] = p.chunkOf(ec).summarize()
} else {
// Slow general path: the allocation spans more than one chunk
// and at least one summary is guaranteed to change.
//
// We can't assume a contiguous allocation happened, so walk over
// every chunk in the range and manually recompute the summary.
- summary := s.summary[len(s.summary)-1]
+ summary := p.summary[len(p.summary)-1]
for c := sc; c <= ec; c++ {
- summary[c] = s.chunkOf(c).summarize()
+ summary[c] = p.chunkOf(c).summarize()
}
}
// Walk up the radix tree and update the summaries appropriately.
changed := true
- for l := len(s.summary) - 2; l >= 0 && changed; l-- {
+ for l := len(p.summary) - 2; l >= 0 && changed; l-- {
// Update summaries at level l from summaries at level l+1.
changed = false
@@ -479,12 +483,12 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
// Iterate over each block, updating the corresponding summary in the less-granular level.
for i := lo; i < hi; i++ {
- children := s.summary[l+1][i<<logEntriesPerBlock : (i+1)<<logEntriesPerBlock]
+ children := p.summary[l+1][i<<logEntriesPerBlock : (i+1)<<logEntriesPerBlock]
sum := mergeSummaries(children, logMaxPages)
- old := s.summary[l][i]
+ old := p.summary[l][i]
if old != sum {
changed = true
- s.summary[l][i] = sum
+ p.summary[l][i] = sum
}
}
}
@@ -497,8 +501,10 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
// Returns the amount of scavenged memory in bytes present in the
// allocated range.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) allocRange(base, npages uintptr) uintptr {
+// p.mheapLock must be held.
+func (p *pageAlloc) allocRange(base, npages uintptr) uintptr {
+ assertLockHeld(p.mheapLock)
+
limit := base + npages*pageSize - 1
sc, ec := chunkIndex(base), chunkIndex(limit)
si, ei := chunkPageIndex(base), chunkPageIndex(limit)
@@ -506,24 +512,24 @@ func (s *pageAlloc) allocRange(base, npages uintptr) uintptr {
scav := uint(0)
if sc == ec {
// The range doesn't cross any chunk boundaries.
- chunk := s.chunkOf(sc)
+ chunk := p.chunkOf(sc)
scav += chunk.scavenged.popcntRange(si, ei+1-si)
chunk.allocRange(si, ei+1-si)
} else {
// The range crosses at least one chunk boundary.
- chunk := s.chunkOf(sc)
+ chunk := p.chunkOf(sc)
scav += chunk.scavenged.popcntRange(si, pallocChunkPages-si)
chunk.allocRange(si, pallocChunkPages-si)
for c := sc + 1; c < ec; c++ {
- chunk := s.chunkOf(c)
+ chunk := p.chunkOf(c)
scav += chunk.scavenged.popcntRange(0, pallocChunkPages)
chunk.allocAll()
}
- chunk = s.chunkOf(ec)
+ chunk = p.chunkOf(ec)
scav += chunk.scavenged.popcntRange(0, ei+1)
chunk.allocRange(0, ei+1)
}
- s.update(base, npages, true, true)
+ p.update(base, npages, true, true)
return uintptr(scav) * pageSize
}
@@ -532,13 +538,15 @@ func (s *pageAlloc) allocRange(base, npages uintptr) uintptr {
// returned. If addr is higher than any mapped region, then
// it returns maxOffAddr.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) findMappedAddr(addr offAddr) offAddr {
+// p.mheapLock must be held.
+func (p *pageAlloc) findMappedAddr(addr offAddr) offAddr {
+ assertLockHeld(p.mheapLock)
+
// If we're not in a test, validate first by checking mheap_.arenas.
// This is a fast path which is only safe to use outside of testing.
ai := arenaIndex(addr.addr())
- if s.test || mheap_.arenas[ai.l1()] == nil || mheap_.arenas[ai.l1()][ai.l2()] == nil {
- vAddr, ok := s.inUse.findAddrGreaterEqual(addr.addr())
+ if p.test || mheap_.arenas[ai.l1()] == nil || mheap_.arenas[ai.l1()][ai.l2()] == nil {
+ vAddr, ok := p.inUse.findAddrGreaterEqual(addr.addr())
if ok {
return offAddr{vAddr}
} else {
@@ -554,20 +562,22 @@ func (s *pageAlloc) findMappedAddr(addr offAddr) offAddr {
// find searches for the first (address-ordered) contiguous free region of
// npages in size and returns a base address for that region.
//
-// It uses s.searchAddr to prune its search and assumes that no palloc chunks
-// below chunkIndex(s.searchAddr) contain any free memory at all.
+// It uses p.searchAddr to prune its search and assumes that no palloc chunks
+// below chunkIndex(p.searchAddr) contain any free memory at all.
//
-// find also computes and returns a candidate s.searchAddr, which may or
-// may not prune more of the address space than s.searchAddr already does.
-// This candidate is always a valid s.searchAddr.
+// find also computes and returns a candidate p.searchAddr, which may or
+// may not prune more of the address space than p.searchAddr already does.
+// This candidate is always a valid p.searchAddr.
//
// find represents the slow path and the full radix tree search.
//
// Returns a base address of 0 on failure, in which case the candidate
// searchAddr returned is invalid and must be ignored.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) find(npages uintptr) (uintptr, offAddr) {
+// p.mheapLock must be held.
+func (p *pageAlloc) find(npages uintptr) (uintptr, offAddr) {
+ assertLockHeld(p.mheapLock)
+
// Search algorithm.
//
// This algorithm walks each level l of the radix tree from the root level
@@ -647,7 +657,7 @@ func (s *pageAlloc) find(npages uintptr) (uintptr, offAddr) {
lastSumIdx := -1
nextLevel:
- for l := 0; l < len(s.summary); l++ {
+ for l := 0; l < len(p.summary); l++ {
// For the root level, entriesPerBlock is the whole level.
entriesPerBlock := 1 << levelBits[l]
logMaxPages := levelLogPages[l]
@@ -657,14 +667,14 @@ nextLevel:
i <<= levelBits[l]
// Slice out the block of entries we care about.
- entries := s.summary[l][i : i+entriesPerBlock]
+ entries := p.summary[l][i : i+entriesPerBlock]
// Determine j0, the first index we should start iterating from.
// The searchAddr may help us eliminate iterations if we followed the
// searchAddr on the previous level or we're on the root level, in which
// case the searchAddr should be the same as i after levelShift.
j0 := 0
- if searchIdx := offAddrToLevelIndex(l, s.searchAddr); searchIdx&^(entriesPerBlock-1) == i {
+ if searchIdx := offAddrToLevelIndex(l, p.searchAddr); searchIdx&^(entriesPerBlock-1) == i {
j0 = searchIdx & (entriesPerBlock - 1)
}
@@ -729,7 +739,7 @@ nextLevel:
// We found a sufficiently large run of free pages straddling
// some boundary, so compute the address and return it.
addr := levelIndexToOffAddr(l, i).add(uintptr(base) * pageSize).addr()
- return addr, s.findMappedAddr(firstFree.base)
+ return addr, p.findMappedAddr(firstFree.base)
}
if l == 0 {
// We're at level zero, so that means we've exhausted our search.
@@ -741,7 +751,7 @@ nextLevel:
// lied to us. In either case, dump some useful state and throw.
print("runtime: summary[", l-1, "][", lastSumIdx, "] = ", lastSum.start(), ", ", lastSum.max(), ", ", lastSum.end(), "\n")
print("runtime: level = ", l, ", npages = ", npages, ", j0 = ", j0, "\n")
- print("runtime: s.searchAddr = ", hex(s.searchAddr.addr()), ", i = ", i, "\n")
+ print("runtime: p.searchAddr = ", hex(p.searchAddr.addr()), ", i = ", i, "\n")
print("runtime: levelShift[level] = ", levelShift[l], ", levelBits[level] = ", levelBits[l], "\n")
for j := 0; j < len(entries); j++ {
sum := entries[j]
@@ -758,12 +768,12 @@ nextLevel:
// After iterating over all levels, i must contain a chunk index which
// is what the final level represents.
ci := chunkIdx(i)
- j, searchIdx := s.chunkOf(ci).find(npages, 0)
+ j, searchIdx := p.chunkOf(ci).find(npages, 0)
if j == ^uint(0) {
// We couldn't find any space in this chunk despite the summaries telling
// us it should be there. There's likely a bug, so dump some state and throw.
- sum := s.summary[len(s.summary)-1][i]
- print("runtime: summary[", len(s.summary)-1, "][", i, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n")
+ sum := p.summary[len(p.summary)-1][i]
+ print("runtime: summary[", len(p.summary)-1, "][", i, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n")
print("runtime: npages = ", npages, "\n")
throw("bad summary data")
}
@@ -775,7 +785,7 @@ nextLevel:
// found an even narrower free window.
searchAddr := chunkBase(ci) + uintptr(searchIdx)*pageSize
foundFree(offAddr{searchAddr}, chunkBase(ci+1)-searchAddr)
- return addr, s.findMappedAddr(firstFree.base)
+ return addr, p.findMappedAddr(firstFree.base)
}
// alloc allocates npages worth of memory from the page heap, returning the base
@@ -785,25 +795,31 @@ nextLevel:
// Returns a 0 base address on failure, in which case other returned values
// should be ignored.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
+// p.mheapLock must be held.
+//
+// Must run on the system stack because p.mheapLock must be held.
+//
+//go:systemstack
+func (p *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
+ assertLockHeld(p.mheapLock)
+
// If the searchAddr refers to a region which has a higher address than
// any known chunk, then we know we're out of memory.
- if chunkIndex(s.searchAddr.addr()) >= s.end {
+ if chunkIndex(p.searchAddr.addr()) >= p.end {
return 0, 0
}
// If npages has a chance of fitting in the chunk where the searchAddr is,
// search it directly.
searchAddr := minOffAddr
- if pallocChunkPages-chunkPageIndex(s.searchAddr.addr()) >= uint(npages) {
+ if pallocChunkPages-chunkPageIndex(p.searchAddr.addr()) >= uint(npages) {
// npages is guaranteed to be no greater than pallocChunkPages here.
- i := chunkIndex(s.searchAddr.addr())
- if max := s.summary[len(s.summary)-1][i].max(); max >= uint(npages) {
- j, searchIdx := s.chunkOf(i).find(npages, chunkPageIndex(s.searchAddr.addr()))
+ i := chunkIndex(p.searchAddr.addr())
+ if max := p.summary[len(p.summary)-1][i].max(); max >= uint(npages) {
+ j, searchIdx := p.chunkOf(i).find(npages, chunkPageIndex(p.searchAddr.addr()))
if j == ^uint(0) {
print("runtime: max = ", max, ", npages = ", npages, "\n")
- print("runtime: searchIdx = ", chunkPageIndex(s.searchAddr.addr()), ", s.searchAddr = ", hex(s.searchAddr.addr()), "\n")
+ print("runtime: searchIdx = ", chunkPageIndex(p.searchAddr.addr()), ", p.searchAddr = ", hex(p.searchAddr.addr()), "\n")
throw("bad summary data")
}
addr = chunkBase(i) + uintptr(j)*pageSize
@@ -813,7 +829,7 @@ func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
}
// We failed to use a searchAddr for one reason or another, so try
// the slow path.
- addr, searchAddr = s.find(npages)
+ addr, searchAddr = p.find(npages)
if addr == 0 {
if npages == 1 {
// We failed to find a single free page, the smallest unit
@@ -821,41 +837,47 @@ func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
// exhausted. Otherwise, the heap still might have free
// space in it, just not enough contiguous space to
// accommodate npages.
- s.searchAddr = maxSearchAddr
+ p.searchAddr = maxSearchAddr
}
return 0, 0
}
Found:
// Go ahead and actually mark the bits now that we have an address.
- scav = s.allocRange(addr, npages)
+ scav = p.allocRange(addr, npages)
// If we found a higher searchAddr, we know that all the
// heap memory before that searchAddr in an offset address space is
- // allocated, so bump s.searchAddr up to the new one.
- if s.searchAddr.lessThan(searchAddr) {
- s.searchAddr = searchAddr
+ // allocated, so bump p.searchAddr up to the new one.
+ if p.searchAddr.lessThan(searchAddr) {
+ p.searchAddr = searchAddr
}
return addr, scav
}
// free returns npages worth of memory starting at base back to the page heap.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) free(base, npages uintptr) {
- // If we're freeing pages below the s.searchAddr, update searchAddr.
- if b := (offAddr{base}); b.lessThan(s.searchAddr) {
- s.searchAddr = b
+// p.mheapLock must be held.
+//
+// Must run on the system stack because p.mheapLock must be held.
+//
+//go:systemstack
+func (p *pageAlloc) free(base, npages uintptr) {
+ assertLockHeld(p.mheapLock)
+
+ // If we're freeing pages below the p.searchAddr, update searchAddr.
+ if b := (offAddr{base}); b.lessThan(p.searchAddr) {
+ p.searchAddr = b
}
// Update the free high watermark for the scavenger.
limit := base + npages*pageSize - 1
- if offLimit := (offAddr{limit}); s.scav.freeHWM.lessThan(offLimit) {
- s.scav.freeHWM = offLimit
+ if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) {
+ p.scav.freeHWM = offLimit
}
if npages == 1 {
// Fast path: we're clearing a single bit, and we know exactly
// where it is, so mark it directly.
i := chunkIndex(base)
- s.chunkOf(i).free1(chunkPageIndex(base))
+ p.chunkOf(i).free1(chunkPageIndex(base))
} else {
// Slow path: we're clearing more bits so we may need to iterate.
sc, ec := chunkIndex(base), chunkIndex(limit)
@@ -863,17 +885,17 @@ func (s *pageAlloc) free(base, npages uintptr) {
if sc == ec {
// The range doesn't cross any chunk boundaries.
- s.chunkOf(sc).free(si, ei+1-si)
+ p.chunkOf(sc).free(si, ei+1-si)
} else {
// The range crosses at least one chunk boundary.
- s.chunkOf(sc).free(si, pallocChunkPages-si)
+ p.chunkOf(sc).free(si, pallocChunkPages-si)
for c := sc + 1; c < ec; c++ {
- s.chunkOf(c).freeAll()
+ p.chunkOf(c).freeAll()
}
- s.chunkOf(ec).free(0, ei+1)
+ p.chunkOf(ec).free(0, ei+1)
}
}
- s.update(base, npages, true, false)
+ p.update(base, npages, true, false)
}
const (
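Beyond the s to p receiver rename and the added lock assertions, the hunks above preserve the searchAddr invariant: no memory below searchAddr is free, allocation only ever raises it, and freeing below it lowers it. A toy bitmap allocator with the same hint discipline (all names invented for this sketch):

package main

import "fmt"

// hintAlloc is a toy free-slot bitmap with a searchAddr-style hint:
// every index below hint is known to be allocated.
type hintAlloc struct {
	free []bool
	hint int
}

func newHintAlloc(n int) *hintAlloc {
	h := &hintAlloc{free: make([]bool, n)}
	for i := range h.free {
		h.free[i] = true
	}
	return h
}

// alloc finds the first free index at or after the hint and bumps the
// hint, mirroring how pageAlloc.alloc only ever moves searchAddr up.
func (h *hintAlloc) alloc() (int, bool) {
	for i := h.hint; i < len(h.free); i++ {
		if h.free[i] {
			h.free[i] = false
			h.hint = i + 1
			return i, true
		}
	}
	return 0, false
}

// release frees an index and, like pageAlloc.free, lowers the hint when
// the freed index is below it so the invariant keeps holding.
func (h *hintAlloc) release(i int) {
	h.free[i] = true
	if i < h.hint {
		h.hint = i
	}
}

func main() {
	h := newHintAlloc(8)
	a, _ := h.alloc()
	b, _ := h.alloc()
	fmt.Println("allocated", a, b, "hint", h.hint)
	h.release(a)
	fmt.Println("released", a, "hint now", h.hint)
}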
diff --git a/libgo/go/runtime/mpagealloc_32bit.go b/libgo/go/runtime/mpagealloc_32bit.go
index 249b5fe..e721c38 100644
--- a/libgo/go/runtime/mpagealloc_32bit.go
+++ b/libgo/go/runtime/mpagealloc_32bit.go
@@ -2,14 +2,14 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build 386 arm mips mipsle wasm darwin,arm64 amd64p32 armbe m68k mips64p32 mips64p32le nios2 ppc riscv s390 sh shbe sparc
+// +build 386 arm mips mipsle wasm ios,arm64 amd64p32 armbe m68k mips64p32 mips64p32le nios2 ppc riscv s390 sh shbe sparc
// wasm is treated as a 32-bit architecture for the purposes of the page
// allocator, even though it has 64-bit pointers. This is because any wasm
// pointer always has its top 32 bits as zero, so the effective heap address
// space is only 2^32 bytes in size (see heapAddrBits).
-// darwin/arm64 is treated as a 32-bit architecture for the purposes of the
+// ios/arm64 is treated as a 32-bit architecture for the purposes of the
// page allocator, even though it has 64-bit pointers and a 33-bit address
// space (see heapAddrBits). The 33 bit address space cannot be rounded up
// to 64 bits because there are too many summary levels to fit in just 33
@@ -60,7 +60,7 @@ var levelLogPages = [summaryLevels]uint{
}
// See mpagealloc_64bit.go for details.
-func (s *pageAlloc) sysInit() {
+func (p *pageAlloc) sysInit() {
// Calculate how much memory all our entries will take up.
//
// This should be around 12 KiB or less.
@@ -76,7 +76,7 @@ func (s *pageAlloc) sysInit() {
throw("failed to reserve page summary memory")
}
// There isn't much. Just map it and mark it as used immediately.
- sysMap(reservation, totalSize, s.sysStat)
+ sysMap(reservation, totalSize, p.sysStat)
sysUsed(reservation, totalSize)
// Iterate over the reservation and cut it up into slices.
@@ -88,29 +88,29 @@ func (s *pageAlloc) sysInit() {
// Put this reservation into a slice.
sl := notInHeapSlice{(*notInHeap)(reservation), 0, entries}
- s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
+ p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
reservation = add(reservation, uintptr(entries)*pallocSumBytes)
}
}
// See mpagealloc_64bit.go for details.
-func (s *pageAlloc) sysGrow(base, limit uintptr) {
+func (p *pageAlloc) sysGrow(base, limit uintptr) {
if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 {
print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n")
throw("sysGrow bounds not aligned to pallocChunkBytes")
}
// Walk up the tree and update the summary slices.
- for l := len(s.summary) - 1; l >= 0; l-- {
+ for l := len(p.summary) - 1; l >= 0; l-- {
// Figure out what part of the summary array this new address space needs.
// Note that we need to align the ranges to the block width (1<<levelBits[l])
// at this level because the full block is needed to compute the summary for
// the next level.
lo, hi := addrsToSummaryRange(l, base, limit)
_, hi = blockAlignSummaryRange(l, lo, hi)
- if hi > len(s.summary[l]) {
- s.summary[l] = s.summary[l][:hi]
+ if hi > len(p.summary[l]) {
+ p.summary[l] = p.summary[l][:hi]
}
}
}
diff --git a/libgo/go/runtime/mpagealloc_64bit.go b/libgo/go/runtime/mpagealloc_64bit.go
index ac59946..4d4257a 100644
--- a/libgo/go/runtime/mpagealloc_64bit.go
+++ b/libgo/go/runtime/mpagealloc_64bit.go
@@ -2,9 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 !darwin,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x arm64be alpha sparc64 ia64
+// +build amd64 !ios,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x arm64be alpha sparc64 ia64
-// See mpagealloc_32bit.go for why darwin/arm64 is excluded here.
+// See mpagealloc_32bit.go for why ios/arm64 is excluded here.
package runtime
@@ -67,7 +67,7 @@ var levelLogPages = [summaryLevels]uint{
// sysInit performs architecture-dependent initialization of fields
// in pageAlloc. pageAlloc should be uninitialized except for sysStat
// if any runtime statistic should be updated.
-func (s *pageAlloc) sysInit() {
+func (p *pageAlloc) sysInit() {
// Reserve memory for each level. This will get mapped in
// as R/W by setArenas.
for l, shift := range levelShift {
@@ -82,21 +82,21 @@ func (s *pageAlloc) sysInit() {
// Put this reservation into a slice.
sl := notInHeapSlice{(*notInHeap)(r), 0, entries}
- s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
+ p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
}
}
// sysGrow performs architecture-dependent operations on heap
// growth for the page allocator, such as mapping in new memory
// for summaries. It also updates the length of the slices in
-// s.summary.
+// p.summary.
//
// base is the base of the newly-added heap memory and limit is
// the first address past the end of the newly-added heap memory.
// Both must be aligned to pallocChunkBytes.
//
-// The caller must update s.start and s.end after calling sysGrow.
-func (s *pageAlloc) sysGrow(base, limit uintptr) {
+// The caller must update p.start and p.end after calling sysGrow.
+func (p *pageAlloc) sysGrow(base, limit uintptr) {
if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 {
print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n")
throw("sysGrow bounds not aligned to pallocChunkBytes")
@@ -111,12 +111,12 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
}
// summaryRangeToSumAddrRange converts a range of indices in any
- // level of s.summary into page-aligned addresses which cover that
+ // level of p.summary into page-aligned addresses which cover that
// range of indices.
summaryRangeToSumAddrRange := func(level, sumIdxBase, sumIdxLimit int) addrRange {
baseOffset := alignDown(uintptr(sumIdxBase)*pallocSumBytes, physPageSize)
limitOffset := alignUp(uintptr(sumIdxLimit)*pallocSumBytes, physPageSize)
- base := unsafe.Pointer(&s.summary[level][0])
+ base := unsafe.Pointer(&p.summary[level][0])
return addrRange{
offAddr{uintptr(add(base, baseOffset))},
offAddr{uintptr(add(base, limitOffset))},
@@ -140,10 +140,10 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
//
// This will be used to look at what memory in the summary array is already
// mapped before and after this new range.
- inUseIndex := s.inUse.findSucc(base)
+ inUseIndex := p.inUse.findSucc(base)
// Walk up the radix tree and map summaries in as needed.
- for l := range s.summary {
+ for l := range p.summary {
// Figure out what part of the summary array this new address space needs.
needIdxBase, needIdxLimit := addrRangeToSummaryRange(l, makeAddrRange(base, limit))
@@ -151,8 +151,8 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
// we get tight bounds checks on at least the top bound.
//
// We must do this regardless of whether we map new memory.
- if needIdxLimit > len(s.summary[l]) {
- s.summary[l] = s.summary[l][:needIdxLimit]
+ if needIdxLimit > len(p.summary[l]) {
+ p.summary[l] = p.summary[l][:needIdxLimit]
}
// Compute the needed address range in the summary array for level l.
@@ -163,10 +163,10 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
// for mapping. prune's invariants are guaranteed by the fact that this
// function will never be asked to remap the same memory twice.
if inUseIndex > 0 {
- need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex-1]))
+ need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex-1]))
}
- if inUseIndex < len(s.inUse.ranges) {
- need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex]))
+ if inUseIndex < len(p.inUse.ranges) {
+ need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex]))
}
// It's possible that after our pruning above, there's nothing new to map.
if need.size() == 0 {
@@ -174,7 +174,7 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
}
// Map and commit need.
- sysMap(unsafe.Pointer(need.base.addr()), need.size(), s.sysStat)
+ sysMap(unsafe.Pointer(need.base.addr()), need.size(), p.sysStat)
sysUsed(unsafe.Pointer(need.base.addr()), need.size())
}
}
diff --git a/libgo/go/runtime/mpagealloc_test.go b/libgo/go/runtime/mpagealloc_test.go
index 65ba71d..5d979fa 100644
--- a/libgo/go/runtime/mpagealloc_test.go
+++ b/libgo/go/runtime/mpagealloc_test.go
@@ -54,7 +54,7 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)),
},
},
"Contiguous2": {
@@ -63,7 +63,7 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 1,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)),
},
},
"Contiguous5": {
@@ -75,7 +75,7 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 4,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)),
},
},
"Discontiguous": {
@@ -85,9 +85,9 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 4,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
- {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)},
- {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)),
},
},
"Mixed": {
@@ -98,8 +98,8 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 4,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)},
- {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)),
},
},
"WildlyDiscontiguous": {
@@ -110,9 +110,9 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 0x21,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)},
- {PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)},
- {PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)),
},
},
"ManyDiscontiguous": {
@@ -129,39 +129,39 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 64,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
- {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)},
- {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)},
- {PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)},
- {PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)},
- {PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)},
- {PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)},
- {PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)},
- {PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)},
- {PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)},
- {PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)},
- {PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)},
- {PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)},
- {PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)},
- {PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)},
- {PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)},
- {PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)},
- {PageBase(BaseChunkIdx+34, 0), PageBase(BaseChunkIdx+35, 0)},
- {PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)},
- {PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)},
- {PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)},
- {PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)},
- {PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)},
- {PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)},
- {PageBase(BaseChunkIdx+48, 0), PageBase(BaseChunkIdx+49, 0)},
- {PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)},
- {PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)},
- {PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)},
- {PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)},
- {PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)},
- {PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)},
- {PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)},
- {PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+34, 0), PageBase(BaseChunkIdx+35, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+48, 0), PageBase(BaseChunkIdx+49, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)),
},
},
}
@@ -172,8 +172,8 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 0x100000, // constant translates to O(TiB)
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
- {PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)),
},
}
}
@@ -197,7 +197,7 @@ func TestPageAllocGrow(t *testing.T) {
t.Fail()
} else {
for i := range want {
- if want[i] != got[i] {
+ if !want[i].Equals(got[i]) {
t.Fail()
break
}
@@ -207,11 +207,11 @@ func TestPageAllocGrow(t *testing.T) {
t.Logf("found inUse mismatch")
t.Logf("got:")
for i, r := range got {
- t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit)
+ t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base(), r.Limit())
}
t.Logf("want:")
for i, r := range want {
- t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit)
+ t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base(), r.Limit())
}
}
})
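The test updates above track AddrRange becoming an opaque wrapper: struct literals give way to a MakeAddrRange constructor, and comparisons and field reads go through Equals, Base and Limit. The shape of that change, reduced to a toy type that is not the runtime's export_test API:

package main

import "fmt"

// addrRange is a toy stand-in for the now-opaque AddrRange test wrapper.
type addrRange struct {
	base, limit uintptr
}

// makeAddrRange is the constructor callers use instead of a struct literal.
func makeAddrRange(base, limit uintptr) addrRange { return addrRange{base, limit} }

func (r addrRange) Base() uintptr           { return r.base }
func (r addrRange) Limit() uintptr          { return r.limit }
func (r addrRange) Equals(o addrRange) bool { return r == o }

func main() {
	got := makeAddrRange(0x1000, 0x2000)
	want := makeAddrRange(0x1000, 0x2000)
	// Tests now compare with Equals and print via Base()/Limit()
	// rather than reaching into struct fields.
	fmt.Printf("[%#x, %#x) equal: %v\n", got.Base(), got.Limit(), want.Equals(got))
}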
diff --git a/libgo/go/runtime/mpagecache.go b/libgo/go/runtime/mpagecache.go
index 683a997..4b5c66d 100644
--- a/libgo/go/runtime/mpagecache.go
+++ b/libgo/go/runtime/mpagecache.go
@@ -71,8 +71,14 @@ func (c *pageCache) allocN(npages uintptr) (uintptr, uintptr) {
// into s. Then, it clears the cache, such that empty returns
// true.
//
-// s.mheapLock must be held or the world must be stopped.
-func (c *pageCache) flush(s *pageAlloc) {
+// p.mheapLock must be held.
+//
+// Must run on the system stack because p.mheapLock must be held.
+//
+//go:systemstack
+func (c *pageCache) flush(p *pageAlloc) {
+ assertLockHeld(p.mheapLock)
+
if c.empty() {
return
}
@@ -83,18 +89,18 @@ func (c *pageCache) flush(s *pageAlloc) {
// slower, safer thing by iterating over each bit individually.
for i := uint(0); i < 64; i++ {
if c.cache&(1<<i) != 0 {
- s.chunkOf(ci).free1(pi + i)
+ p.chunkOf(ci).free1(pi + i)
}
if c.scav&(1<<i) != 0 {
- s.chunkOf(ci).scavenged.setRange(pi+i, 1)
+ p.chunkOf(ci).scavenged.setRange(pi+i, 1)
}
}
// Since this is a lot like a free, we need to make sure
// we update the searchAddr just like free does.
- if b := (offAddr{c.base}); b.lessThan(s.searchAddr) {
- s.searchAddr = b
+ if b := (offAddr{c.base}); b.lessThan(p.searchAddr) {
+ p.searchAddr = b
}
- s.update(c.base, pageCachePages, false, false)
+ p.update(c.base, pageCachePages, false, false)
*c = pageCache{}
}
@@ -102,19 +108,25 @@ func (c *pageCache) flush(s *pageAlloc) {
// may not be contiguous, and returns a pageCache structure which owns the
// chunk.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) allocToCache() pageCache {
+// p.mheapLock must be held.
+//
+// Must run on the system stack because p.mheapLock must be held.
+//
+//go:systemstack
+func (p *pageAlloc) allocToCache() pageCache {
+ assertLockHeld(p.mheapLock)
+
// If the searchAddr refers to a region which has a higher address than
// any known chunk, then we know we're out of memory.
- if chunkIndex(s.searchAddr.addr()) >= s.end {
+ if chunkIndex(p.searchAddr.addr()) >= p.end {
return pageCache{}
}
c := pageCache{}
- ci := chunkIndex(s.searchAddr.addr()) // chunk index
- if s.summary[len(s.summary)-1][ci] != 0 {
+ ci := chunkIndex(p.searchAddr.addr()) // chunk index
+ if p.summary[len(p.summary)-1][ci] != 0 {
		// Fast path: there are free pages at or near the searchAddr address.
- chunk := s.chunkOf(ci)
- j, _ := chunk.find(1, chunkPageIndex(s.searchAddr.addr()))
+ chunk := p.chunkOf(ci)
+ j, _ := chunk.find(1, chunkPageIndex(p.searchAddr.addr()))
if j == ^uint(0) {
throw("bad summary data")
}
@@ -126,15 +138,15 @@ func (s *pageAlloc) allocToCache() pageCache {
} else {
// Slow path: the searchAddr address had nothing there, so go find
// the first free page the slow way.
- addr, _ := s.find(1)
+ addr, _ := p.find(1)
if addr == 0 {
// We failed to find adequate free space, so mark the searchAddr as OoM
// and return an empty pageCache.
- s.searchAddr = maxSearchAddr
+ p.searchAddr = maxSearchAddr
return pageCache{}
}
ci := chunkIndex(addr)
- chunk := s.chunkOf(ci)
+ chunk := p.chunkOf(ci)
c = pageCache{
base: alignDown(addr, 64*pageSize),
cache: ^chunk.pages64(chunkPageIndex(addr)),
@@ -143,19 +155,19 @@ func (s *pageAlloc) allocToCache() pageCache {
}
// Set the bits as allocated and clear the scavenged bits.
- s.allocRange(c.base, pageCachePages)
+ p.allocRange(c.base, pageCachePages)
// Update as an allocation, but note that it's not contiguous.
- s.update(c.base, pageCachePages, false, true)
+ p.update(c.base, pageCachePages, false, true)
// Set the search address to the last page represented by the cache.
// Since all of the pages in this block are going to the cache, and we
// searched for the first free page, we can confidently start at the
// next page.
//
- // However, s.searchAddr is not allowed to point into unmapped heap memory
+ // However, p.searchAddr is not allowed to point into unmapped heap memory
// unless it is maxSearchAddr, so make it the last page as opposed to
// the page after.
- s.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)}
+ p.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)}
return c
}
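
The allocToCache fast path above hands the caller a pageCache: a 64-page-aligned window of the heap described by nothing more than a base address and two 64-bit masks, so later single-page allocations on that P need no lock at all. The following is a minimal sketch of that idea in ordinary Go; miniPageCache, allocOne, and the pageSize constant are illustrative names for this sketch, not the runtime's API.

package main

import (
	"fmt"
	"math/bits"
)

const pageSize = 8192 // assumed page size for the sketch

// miniPageCache models the runtime's pageCache: a base address plus a
// 64-bit bitmap in which a 1 bit marks a page that is free and owned
// by the cache.
type miniPageCache struct {
	base  uintptr
	cache uint64
}

// allocOne hands out the lowest free page and clears its bit, the same
// shape as a single-page allocation from the cache.
func (c *miniPageCache) allocOne() (uintptr, bool) {
	if c.cache == 0 {
		return 0, false
	}
	i := bits.TrailingZeros64(c.cache)
	c.cache &^= 1 << i
	return c.base + uintptr(i)*pageSize, true
}

func main() {
	c := miniPageCache{base: 0x100000, cache: 0b1011} // pages 0, 1, and 3 free
	for {
		addr, ok := c.allocOne()
		if !ok {
			break
		}
		fmt.Printf("allocated page at %#x\n", addr)
	}
}
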
diff --git a/libgo/go/runtime/mpallocbits.go b/libgo/go/runtime/mpallocbits.go
index a801134..ff11230 100644
--- a/libgo/go/runtime/mpallocbits.go
+++ b/libgo/go/runtime/mpallocbits.go
@@ -120,84 +120,105 @@ func (b *pageBits) popcntRange(i, n uint) (s uint) {
// sake of documentation, 0s are free pages and 1s are allocated pages.
type pallocBits pageBits
-// consec8tab is a table containing the number of consecutive
-// zero bits for any uint8 value.
-//
-// The table is generated by calling consec8(i) for each
-// possible uint8 value, which is defined as:
-//
-// // consec8 counts the maximum number of consecutive 0 bits
-// // in a uint8.
-// func consec8(n uint8) int {
-// n = ^n
-// i := 0
-// for n != 0 {
-// n &= (n << 1)
-// i++
-// }
-// return i
-// }
-var consec8tab = [256]uint{
- 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
- 4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
- 6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
- 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
- 7, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
- 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
- 6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
- 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
- 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 0,
-}
-
// summarize returns a packed summary of the bitmap in pallocBits.
func (b *pallocBits) summarize() pallocSum {
- // TODO(mknyszek): There may be something more clever to be done
- // here to make the summarize operation more efficient. For example,
- // we can compute start and end with 64-bit wide operations easily,
- // but max is a bit more complex. Perhaps there exists some way to
- // leverage the 64-bit start and end to our advantage?
- var start, max, end uint
+ var start, max, cur uint
+ const notSetYet = ^uint(0) // sentinel for start value
+ start = notSetYet
for i := 0; i < len(b); i++ {
- a := b[i]
- for j := 0; j < 64; j += 8 {
- k := uint8(a >> j)
-
- // Compute start.
- si := uint(sys.TrailingZeros8(k))
- if start == uint(i*64+j) {
- start += si
- }
+ x := b[i]
+ if x == 0 {
+ cur += 64
+ continue
+ }
+ t := uint(sys.TrailingZeros64(x))
+ l := uint(sys.LeadingZeros64(x))
- // Compute max.
- if end+si > max {
- max = end + si
- }
- if mi := consec8tab[k]; mi > max {
- max = mi
+ // Finish any region spanning the uint64s
+ cur += t
+ if start == notSetYet {
+ start = cur
+ }
+ if cur > max {
+ max = cur
+ }
+ // Final region that might span to next uint64
+ cur = l
+ }
+ if start == notSetYet {
+ // Made it all the way through without finding a single 1 bit.
+ const n = uint(64 * len(b))
+ return packPallocSum(n, n, n)
+ }
+ if cur > max {
+ max = cur
+ }
+ if max >= 64-2 {
+ // There is no way an internal run of zeros could beat max.
+ return packPallocSum(start, max, cur)
+ }
+ // Now look inside each uint64 for runs of zeros.
+ // All uint64s must be nonzero, or we would have aborted above.
+outer:
+ for i := 0; i < len(b); i++ {
+ x := b[i]
+
+ // Look inside this uint64. We have a pattern like
+ // 000000 1xxxxx1 000000
+ // We need to look inside the 1xxxxx1 for any contiguous
+ // region of zeros.
+
+ // We already know the trailing zeros are no larger than max. Remove them.
+ x >>= sys.TrailingZeros64(x) & 63
+ if x&(x+1) == 0 { // no more zeros (except at the top).
+ continue
+ }
+
+ // Strategy: shrink all runs of zeros by max. If any runs of zero
+	// remain, then we've identified a larger maximum zero run.
+ p := max // number of zeros we still need to shrink by.
+ k := uint(1) // current minimum length of runs of ones in x.
+ for {
+ // Shrink all runs of zeros by p places (except the top zeros).
+ for p > 0 {
+ if p <= k {
+ // Shift p ones down into the top of each run of zeros.
+ x |= x >> (p & 63)
+ if x&(x+1) == 0 { // no more zeros (except at the top).
+ continue outer
+ }
+ break
+ }
+ // Shift k ones down into the top of each run of zeros.
+ x |= x >> (k & 63)
+ if x&(x+1) == 0 { // no more zeros (except at the top).
+ continue outer
+ }
+ p -= k
+ // We've just doubled the minimum length of 1-runs.
+ // This allows us to shift farther in the next iteration.
+ k *= 2
}
- // Compute end.
- if k == 0 {
- end += 8
- } else {
- end = uint(sys.LeadingZeros8(k))
+ // The length of the lowest-order zero run is an increment to our maximum.
+ j := uint(sys.TrailingZeros64(^x)) // count contiguous trailing ones
+ x >>= j & 63 // remove trailing ones
+ j = uint(sys.TrailingZeros64(x)) // count contiguous trailing zeros
+ x >>= j & 63 // remove zeros
+ max += j // we have a new maximum!
+ if x&(x+1) == 0 { // no more zeros (except at the top).
+ continue outer
}
+ p = j // remove j more zeros from each zero run.
}
}
- return packPallocSum(start, max, end)
+ return packPallocSum(start, max, cur)
}
// find searches for npages contiguous free pages in pallocBits and returns
// the index where that run starts, as well as the index of the first free page
// it found in the search. searchIdx represents the first known free page and
-// where to begin the search from.
+// where to begin the next search from.
//
// If find fails to find any free space, it returns an index of ^uint(0) and
// the new searchIdx should be ignored.
@@ -218,9 +239,10 @@ func (b *pallocBits) find(npages uintptr, searchIdx uint) (uint, uint) {
//
// See find for an explanation of the searchIdx parameter.
func (b *pallocBits) find1(searchIdx uint) uint {
+ _ = b[0] // lift nil check out of loop
for i := searchIdx / 64; i < uint(len(b)); i++ {
x := b[i]
- if x == ^uint64(0) {
+ if ^x == 0 {
continue
}
return i*64 + uint(sys.TrailingZeros64(^x))
@@ -242,18 +264,18 @@ func (b *pallocBits) findSmallN(npages uintptr, searchIdx uint) (uint, uint) {
end, newSearchIdx := uint(0), ^uint(0)
for i := searchIdx / 64; i < uint(len(b)); i++ {
bi := b[i]
- if bi == ^uint64(0) {
+ if ^bi == 0 {
end = 0
continue
}
// First see if we can pack our allocation in the trailing
// zeros plus the end of the last 64 bits.
- start := uint(sys.TrailingZeros64(bi))
if newSearchIdx == ^uint(0) {
// The new searchIdx is going to be at these 64 bits after any
			// 1s we find, so count trailing 1s.
newSearchIdx = i*64 + uint(sys.TrailingZeros64(^bi))
}
+ start := uint(sys.TrailingZeros64(bi))
if end+start >= uint(npages) {
return i*64 - end, newSearchIdx
}
@@ -348,15 +370,33 @@ func (b *pallocBits) pages64(i uint) uint64 {
// findBitRange64 returns the bit index of the first set of
// n consecutive 1 bits. If no consecutive set of 1 bits of
// size n may be found in c, then it returns an integer >= 64.
+// n must be > 0.
func findBitRange64(c uint64, n uint) uint {
- i := uint(0)
- cont := uint(sys.TrailingZeros64(^c))
- for cont < n && i < 64 {
- i += cont
- i += uint(sys.TrailingZeros64(c >> i))
- cont = uint(sys.TrailingZeros64(^(c >> i)))
+ // This implementation is based on shrinking the length of
+ // runs of contiguous 1 bits. We remove the top n-1 1 bits
+ // from each run of 1s, then look for the first remaining 1 bit.
+ p := n - 1 // number of 1s we want to remove.
+ k := uint(1) // current minimum width of runs of 0 in c.
+ for p > 0 {
+ if p <= k {
+ // Shift p 0s down into the top of each run of 1s.
+ c &= c >> (p & 63)
+ break
+ }
+ // Shift k 0s down into the top of each run of 1s.
+ c &= c >> (k & 63)
+ if c == 0 {
+ return 64
+ }
+ p -= k
+ // We've just doubled the minimum length of 0-runs.
+ // This allows us to shift farther in the next iteration.
+ k *= 2
}
- return i
+ // Find first remaining 1.
+ // Since we shrunk from the top down, the first 1 is in
+ // its correct original position.
+ return uint(sys.TrailingZeros64(c))
}
// pallocData encapsulates pallocBits and a bitmap for
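
Both the rewritten summarize and findBitRange64 above rest on the same trick: combining a word with a shifted copy of itself (AND to shorten runs of 1s, OR to shorten runs of 0s) shrinks every run by the shift amount, and doubling the shift each round shrinks runs by n-1 in O(log n) steps instead of walking the word bit by bit. The sketch below applies the AND form next to a naive loop for comparison; findRun and findRunSlow are hypothetical names for this sketch, not runtime functions.

package main

import (
	"fmt"
	"math/bits"
)

// findRun returns the index of the first run of n consecutive 1 bits
// in c, or a value >= 64 if there is none (n must be > 0).
func findRun(c uint64, n uint) uint {
	p := n - 1   // number of 1 bits left to strip from the top of each run
	k := uint(1) // safe shift amount: every gap of 0s is at least k wide by now
	for p > 0 {
		if p <= k {
			c &= c >> (p & 63) // strip the last p bits; runs shorter than n vanish
			break
		}
		c &= c >> (k & 63) // strip k bits from every run of 1s
		if c == 0 {
			return 64
		}
		p -= k
		k *= 2 // gaps have at least doubled, so the next shift can too
	}
	// The lowest surviving 1 bit marks the start of a run of length >= n.
	return uint(bits.TrailingZeros64(c))
}

// findRunSlow is a straightforward reference: scan the bits one at a time.
func findRunSlow(c uint64, n uint) uint {
	run := uint(0)
	for i := uint(0); i < 64; i++ {
		if c&(1<<i) != 0 {
			run++
			if run == n {
				return i + 1 - n
			}
		} else {
			run = 0
		}
	}
	return 64
}

func main() {
	for _, c := range []uint64{0, ^uint64(0), 0xffff03ff0107ffff} {
		for _, n := range []uint{1, 8, 16} {
			fmt.Printf("c=%#016x n=%-2d fast=%-2d slow=%-2d\n", c, n, findRun(c, n), findRunSlow(c, n))
		}
	}
}
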
diff --git a/libgo/go/runtime/mpallocbits_test.go b/libgo/go/runtime/mpallocbits_test.go
index 71a29f3..5095e242 100644
--- a/libgo/go/runtime/mpallocbits_test.go
+++ b/libgo/go/runtime/mpallocbits_test.go
@@ -101,7 +101,7 @@ func invertPallocBits(b *PallocBits) {
// Ensures two packed summaries are identical, and reports a detailed description
// of the difference if they're not.
-func checkPallocSum(t *testing.T, got, want PallocSum) {
+func checkPallocSum(t testing.TB, got, want PallocSum) {
if got.Start() != want.Start() {
t.Errorf("inconsistent start: got %d, want %d", got.Start(), want.Start())
}
@@ -297,17 +297,29 @@ func TestPallocBitsSummarize(t *testing.T) {
// Benchmarks how quickly we can summarize a PallocBits.
func BenchmarkPallocBitsSummarize(b *testing.B) {
- buf0 := new(PallocBits)
- buf1 := new(PallocBits)
- for i := 0; i < len(buf1); i++ {
- buf1[i] = ^uint64(0)
- }
- bufa := new(PallocBits)
- for i := 0; i < len(bufa); i++ {
- bufa[i] = 0xaa
- }
- for _, buf := range []*PallocBits{buf0, buf1, bufa} {
- b.Run(fmt.Sprintf("Unpacked%02X", buf[0]), func(b *testing.B) {
+ patterns := []uint64{
+ 0,
+ ^uint64(0),
+ 0xaa,
+ 0xaaaaaaaaaaaaaaaa,
+ 0x80000000aaaaaaaa,
+ 0xaaaaaaaa00000001,
+ 0xbbbbbbbbbbbbbbbb,
+ 0x80000000bbbbbbbb,
+ 0xbbbbbbbb00000001,
+ 0xcccccccccccccccc,
+ 0x4444444444444444,
+ 0x4040404040404040,
+ 0x4000400040004000,
+ 0x1000404044ccaaff,
+ }
+ for _, p := range patterns {
+ buf := new(PallocBits)
+ for i := 0; i < len(buf); i++ {
+ buf[i] = p
+ }
+ b.Run(fmt.Sprintf("Unpacked%02X", p), func(b *testing.B) {
+ checkPallocSum(b, buf.Summarize(), SummarizeSlow(buf))
for i := 0; i < b.N; i++ {
buf.Summarize()
}
@@ -492,10 +504,9 @@ func TestFindBitRange64(t *testing.T) {
t.Errorf("case (%016x, %d): got %d, want %d", x, n, i, result)
}
}
- for i := uint(0); i <= 64; i++ {
+ for i := uint(1); i <= 64; i++ {
check(^uint64(0), i, 0)
}
- check(0, 0, 0)
for i := uint(1); i <= 64; i++ {
check(0, i, ^uint(0))
}
@@ -508,3 +519,33 @@ func TestFindBitRange64(t *testing.T) {
check(0xffff03ff0107ffff, 16, 0)
check(0x0fff03ff01079fff, 16, ^uint(0))
}
+
+func BenchmarkFindBitRange64(b *testing.B) {
+ patterns := []uint64{
+ 0,
+ ^uint64(0),
+ 0xaa,
+ 0xaaaaaaaaaaaaaaaa,
+ 0x80000000aaaaaaaa,
+ 0xaaaaaaaa00000001,
+ 0xbbbbbbbbbbbbbbbb,
+ 0x80000000bbbbbbbb,
+ 0xbbbbbbbb00000001,
+ 0xcccccccccccccccc,
+ 0x4444444444444444,
+ 0x4040404040404040,
+ 0x4000400040004000,
+ }
+ sizes := []uint{
+ 2, 8, 32,
+ }
+ for _, pattern := range patterns {
+ for _, size := range sizes {
+ b.Run(fmt.Sprintf("Pattern%02XSize%d", pattern, size), func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ FindBitRange64(pattern, size)
+ }
+ })
+ }
+ }
+}
diff --git a/libgo/go/runtime/mranges.go b/libgo/go/runtime/mranges.go
index 2c0eb2c..84a2c06 100644
--- a/libgo/go/runtime/mranges.go
+++ b/libgo/go/runtime/mranges.go
@@ -160,10 +160,10 @@ type addrRanges struct {
totalBytes uintptr
// sysStat is the stat to track allocations by this type
- sysStat *uint64
+ sysStat *sysMemStat
}
-func (a *addrRanges) init(sysStat *uint64) {
+func (a *addrRanges) init(sysStat *sysMemStat) {
ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges))
ranges.len = 0
ranges.cap = 16
@@ -172,20 +172,46 @@ func (a *addrRanges) init(sysStat *uint64) {
a.totalBytes = 0
}
-// findSucc returns the first index in a such that base is
+// findSucc returns the first index in a such that addr is
// less than the base of the addrRange at that index.
func (a *addrRanges) findSucc(addr uintptr) int {
- // TODO(mknyszek): Consider a binary search for large arrays.
- // While iterating over these ranges is potentially expensive,
- // the expected number of ranges is small, ideally just 1,
- // since Go heaps are usually mostly contiguous.
base := offAddr{addr}
- for i := range a.ranges {
+
+ // Narrow down the search space via a binary search
+ // for large addrRanges until we have at most iterMax
+ // candidates left.
+ const iterMax = 8
+ bot, top := 0, len(a.ranges)
+ for top-bot > iterMax {
+ i := ((top - bot) / 2) + bot
+ if a.ranges[i].contains(base.addr()) {
+ // a.ranges[i] contains base, so
+ // its successor is the next index.
+ return i + 1
+ }
+ if base.lessThan(a.ranges[i].base) {
+ // In this case i might actually be
+ // the successor, but we can't be sure
+ // until we check the ones before it.
+ top = i
+ } else {
+ // In this case we know base is
+ // greater than or equal to a.ranges[i].limit-1,
+ // so i is definitely not the successor.
+ // We already checked i, so pick the next
+ // one.
+ bot = i + 1
+ }
+ }
+	// There are top-bot candidates left, so iterate over them
+	// and return the index of the first range whose base address
+	// is strictly greater than the address we are searching for.
+ for i := bot; i < top; i++ {
if base.lessThan(a.ranges[i].base) {
return i
}
}
- return len(a.ranges)
+ return top
}
// findAddrGreaterEqual returns the smallest address represented by a
@@ -218,7 +244,7 @@ func (a *addrRanges) contains(addr uintptr) bool {
// add inserts a new address range to a.
//
-// r must not overlap with any address range in a.
+// r must not overlap with any address range in a and r.size() must be > 0.
func (a *addrRanges) add(r addrRange) {
// The copies in this function are potentially expensive, but this data
// structure is meant to represent the Go heap. At worst, copying this
@@ -229,6 +255,12 @@ func (a *addrRanges) add(r addrRange) {
// of 16) and Go heaps are usually mostly contiguous, so the chance that
// an addrRanges even grows to that size is extremely low.
+ // An empty range has no effect on the set of addresses represented
+ // by a, but passing a zero-sized range is almost always a bug.
+ if r.size() == 0 {
+ print("runtime: range = {", hex(r.base.addr()), ", ", hex(r.limit.addr()), "}\n")
+ throw("attempted to add zero-sized address range")
+ }
// Because we assume r is not currently represented in a,
// findSucc gives us our insertion index.
i := a.findSucc(r.base.addr())
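
The new findSucc narrows the search with a binary search until at most iterMax candidates remain, then finishes with the old linear scan, which keeps the common mostly-contiguous-heap case cheap while bounding the worst case at O(log n). Below is a self-contained sketch of the same shape over a plain sorted slice, cross-checked against sort.Search; addrRange and findSucc here are local illustrations, not the runtime's types.

package main

import (
	"fmt"
	"sort"
)

type addrRange struct{ base, limit uintptr } // half-open [base, limit)

// findSucc returns the index of the first range whose base is strictly
// greater than addr: binary search down to a handful of candidates,
// then a short linear scan.
func findSucc(ranges []addrRange, addr uintptr) int {
	const iterMax = 8
	bot, top := 0, len(ranges)
	for top-bot > iterMax {
		i := (top-bot)/2 + bot
		if addr >= ranges[i].base && addr < ranges[i].limit {
			return i + 1 // addr lives inside ranges[i]; its successor is the next index
		}
		if addr < ranges[i].base {
			top = i // i could still be the successor
		} else {
			bot = i + 1 // addr is at or past ranges[i].limit, so i is not the successor
		}
	}
	for i := bot; i < top; i++ {
		if addr < ranges[i].base {
			return i
		}
	}
	return top
}

func main() {
	var ranges []addrRange
	for i := uintptr(0); i < 100; i++ {
		ranges = append(ranges, addrRange{5 + i*5, 5 + i*5 + 3})
	}
	for _, addr := range []uintptr{2, 14, 249, 100000} {
		got := findSucc(ranges, addr)
		want := sort.Search(len(ranges), func(i int) bool { return addr < ranges[i].base })
		fmt.Printf("addr=%d succ=%d (matches sort.Search: %v)\n", addr, got, got == want)
	}
}
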
diff --git a/libgo/go/runtime/mranges_test.go b/libgo/go/runtime/mranges_test.go
new file mode 100644
index 0000000..ed439c5
--- /dev/null
+++ b/libgo/go/runtime/mranges_test.go
@@ -0,0 +1,275 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ . "runtime"
+ "testing"
+)
+
+func validateAddrRanges(t *testing.T, a *AddrRanges, want ...AddrRange) {
+ ranges := a.Ranges()
+ if len(ranges) != len(want) {
+ t.Errorf("want %v, got %v", want, ranges)
+ t.Fatal("different lengths")
+ }
+ gotTotalBytes := uintptr(0)
+ wantTotalBytes := uintptr(0)
+ for i := range ranges {
+ gotTotalBytes += ranges[i].Size()
+ wantTotalBytes += want[i].Size()
+ if ranges[i].Base() >= ranges[i].Limit() {
+ t.Error("empty range found")
+ }
+ // Ensure this is equivalent to what we want.
+ if !ranges[i].Equals(want[i]) {
+ t.Errorf("range %d: got [0x%x, 0x%x), want [0x%x, 0x%x)", i,
+ ranges[i].Base(), ranges[i].Limit(),
+ want[i].Base(), want[i].Limit(),
+ )
+ }
+ if i != 0 {
+ // Ensure the ranges are sorted.
+ if ranges[i-1].Base() >= ranges[i].Base() {
+ t.Errorf("ranges %d and %d are out of sorted order", i-1, i)
+ }
+ // Check for a failure to coalesce.
+ if ranges[i-1].Limit() == ranges[i].Base() {
+ t.Errorf("ranges %d and %d should have coalesced", i-1, i)
+ }
+ // Check if any ranges overlap. Because the ranges are sorted
+ // by base, it's sufficient to just check neighbors.
+ if ranges[i-1].Limit() > ranges[i].Base() {
+ t.Errorf("ranges %d and %d overlap", i-1, i)
+ }
+ }
+ }
+ if wantTotalBytes != gotTotalBytes {
+ t.Errorf("expected %d total bytes, got %d", wantTotalBytes, gotTotalBytes)
+ }
+ if b := a.TotalBytes(); b != gotTotalBytes {
+ t.Errorf("inconsistent total bytes: want %d, got %d", gotTotalBytes, b)
+ }
+ if t.Failed() {
+ t.Errorf("addrRanges: %v", ranges)
+ t.Fatal("detected bad addrRanges")
+ }
+}
+
+func TestAddrRangesAdd(t *testing.T) {
+ a := NewAddrRanges()
+
+ // First range.
+ a.Add(MakeAddrRange(512, 1024))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 1024),
+ )
+
+ // Coalesce up.
+ a.Add(MakeAddrRange(1024, 2048))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 2048),
+ )
+
+ // Add new independent range.
+ a.Add(MakeAddrRange(4096, 8192))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 2048),
+ MakeAddrRange(4096, 8192),
+ )
+
+ // Coalesce down.
+ a.Add(MakeAddrRange(3776, 4096))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 2048),
+ MakeAddrRange(3776, 8192),
+ )
+
+ // Coalesce up and down.
+ a.Add(MakeAddrRange(2048, 3776))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 8192),
+ )
+
+ // Push a bunch of independent ranges to the end to try and force growth.
+ expectedRanges := []AddrRange{MakeAddrRange(512, 8192)}
+ for i := uintptr(0); i < 64; i++ {
+ dRange := MakeAddrRange(8192+(i+1)*2048, 8192+(i+1)*2048+10)
+ a.Add(dRange)
+ expectedRanges = append(expectedRanges, dRange)
+ validateAddrRanges(t, &a, expectedRanges...)
+ }
+
+ // Push a bunch of independent ranges to the beginning to try and force growth.
+ var bottomRanges []AddrRange
+ for i := uintptr(0); i < 63; i++ {
+ dRange := MakeAddrRange(8+i*8, 8+i*8+4)
+ a.Add(dRange)
+ bottomRanges = append(bottomRanges, dRange)
+ validateAddrRanges(t, &a, append(bottomRanges, expectedRanges...)...)
+ }
+}
+
+func TestAddrRangesFindSucc(t *testing.T) {
+ var large []AddrRange
+ for i := 0; i < 100; i++ {
+ large = append(large, MakeAddrRange(5+uintptr(i)*5, 5+uintptr(i)*5+3))
+ }
+
+ type testt struct {
+ name string
+ base uintptr
+ expect int
+ ranges []AddrRange
+ }
+ tests := []testt{
+ {
+ name: "Empty",
+ base: 12,
+ expect: 0,
+ ranges: []AddrRange{},
+ },
+ {
+ name: "OneBefore",
+ base: 12,
+ expect: 0,
+ ranges: []AddrRange{
+ MakeAddrRange(14, 16),
+ },
+ },
+ {
+ name: "OneWithin",
+ base: 14,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(14, 16),
+ },
+ },
+ {
+ name: "OneAfterLimit",
+ base: 16,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(14, 16),
+ },
+ },
+ {
+ name: "OneAfter",
+ base: 17,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(14, 16),
+ },
+ },
+ {
+ name: "ThreeBefore",
+ base: 3,
+ expect: 0,
+ ranges: []AddrRange{
+ MakeAddrRange(6, 10),
+ MakeAddrRange(12, 16),
+ MakeAddrRange(19, 22),
+ },
+ },
+ {
+ name: "ThreeAfter",
+ base: 24,
+ expect: 3,
+ ranges: []AddrRange{
+ MakeAddrRange(6, 10),
+ MakeAddrRange(12, 16),
+ MakeAddrRange(19, 22),
+ },
+ },
+ {
+ name: "ThreeBetween",
+ base: 11,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(6, 10),
+ MakeAddrRange(12, 16),
+ MakeAddrRange(19, 22),
+ },
+ },
+ {
+ name: "ThreeWithin",
+ base: 9,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(6, 10),
+ MakeAddrRange(12, 16),
+ MakeAddrRange(19, 22),
+ },
+ },
+ {
+ name: "Zero",
+ base: 0,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(0, 10),
+ },
+ },
+ {
+ name: "Max",
+ base: ^uintptr(0),
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(^uintptr(0)-5, ^uintptr(0)),
+ },
+ },
+ {
+ name: "LargeBefore",
+ base: 2,
+ expect: 0,
+ ranges: large,
+ },
+ {
+ name: "LargeAfter",
+ base: 5 + uintptr(len(large))*5 + 30,
+ expect: len(large),
+ ranges: large,
+ },
+ {
+ name: "LargeBetweenLow",
+ base: 14,
+ expect: 2,
+ ranges: large,
+ },
+ {
+ name: "LargeBetweenHigh",
+ base: 249,
+ expect: 49,
+ ranges: large,
+ },
+ {
+ name: "LargeWithinLow",
+ base: 25,
+ expect: 5,
+ ranges: large,
+ },
+ {
+ name: "LargeWithinHigh",
+ base: 396,
+ expect: 79,
+ ranges: large,
+ },
+ {
+ name: "LargeWithinMiddle",
+ base: 250,
+ expect: 50,
+ ranges: large,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ a := MakeAddrRanges(test.ranges...)
+ i := a.FindSucc(test.base)
+ if i != test.expect {
+ t.Fatalf("expected %d, got %d", test.expect, i)
+ }
+ })
+ }
+}
diff --git a/libgo/go/runtime/mspanset.go b/libgo/go/runtime/mspanset.go
index c872c11..04ec800 100644
--- a/libgo/go/runtime/mspanset.go
+++ b/libgo/go/runtime/mspanset.go
@@ -102,7 +102,7 @@ retry:
if newCap == 0 {
newCap = spanSetInitSpineCap
}
- newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys)
+ newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gcMiscSys)
if b.spineCap != 0 {
// Blocks are allocated off-heap, so
// no write barriers.
@@ -287,7 +287,7 @@ func (p *spanSetBlockAlloc) alloc() *spanSetBlock {
if s := (*spanSetBlock)(p.stack.pop()); s != nil {
return s
}
- return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gc_sys))
+ return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gcMiscSys))
}
// free returns a spanSetBlock back to the pool.
diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go
index 85a0861..488e5d1 100644
--- a/libgo/go/runtime/mstats.go
+++ b/libgo/go/runtime/mstats.go
@@ -8,13 +8,10 @@ package runtime
import (
"runtime/internal/atomic"
- "runtime/internal/sys"
"unsafe"
)
// Statistics.
-// If you edit this structure, also edit type MemStats below.
-// Their layouts must match exactly.
//
// For detailed descriptions see the documentation for MemStats.
// Fields that differ from MemStats are further documented here.
@@ -35,31 +32,43 @@ type mstats struct {
//
// Like MemStats, heap_sys and heap_inuse do not count memory
// in manually-managed spans.
- heap_alloc uint64 // bytes allocated and not yet freed (same as alloc above)
- heap_sys uint64 // virtual address space obtained from system for GC'd heap
- heap_idle uint64 // bytes in idle spans
- heap_inuse uint64 // bytes in mSpanInUse spans
- heap_released uint64 // bytes released to the os
+ heap_sys sysMemStat // virtual address space obtained from system for GC'd heap
+ heap_inuse uint64 // bytes in mSpanInUse spans
+ heap_released uint64 // bytes released to the os
// heap_objects is not used by the runtime directly and instead
// computed on the fly by updatememstats.
heap_objects uint64 // total number of allocated objects
+ // Statistics about stacks.
+ stacks_inuse uint64 // bytes in manually-managed stack spans; computed by updatememstats
+ stacks_sys sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
+
// Statistics about allocation of low-level fixed-size structures.
// Protected by FixAlloc locks.
- stacks_inuse uint64 // bytes in manually-managed stack spans; updated atomically or during STW
- stacks_sys uint64 // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
mspan_inuse uint64 // mspan structures
- mspan_sys uint64
+ mspan_sys sysMemStat
mcache_inuse uint64 // mcache structures
- mcache_sys uint64
- buckhash_sys uint64 // profiling bucket hash table
- gc_sys uint64 // updated atomically or during STW
- other_sys uint64 // updated atomically or during STW
+ mcache_sys sysMemStat
+ buckhash_sys sysMemStat // profiling bucket hash table
+
+ // Statistics about GC overhead.
+ gcWorkBufInUse uint64 // computed by updatememstats
+ gcProgPtrScalarBitsInUse uint64 // computed by updatememstats
+ gcMiscSys sysMemStat // updated atomically or during STW
+
+ // Miscellaneous statistics.
+ other_sys sysMemStat // updated atomically or during STW
+
+ // Statistics about the garbage collector.
+
+ // next_gc is the goal heap_live for when next GC ends.
+ // Set to ^uint64(0) if disabled.
+ //
+ // Read and written atomically, unless the world is stopped.
+ next_gc uint64
- // Statistics about garbage collector.
// Protected by mheap or stopping the world during GC.
- next_gc uint64 // goal heap_live for when next GC ends; ^0 if disabled
last_gc_unix uint64 // last gc (in unix time)
pause_total_ns uint64
pause_ns [256]uint64 // circular buffer of recent gc pause lengths
@@ -78,7 +87,9 @@ type mstats struct {
nfree uint64
}
- // Statistics below here are not exported to MemStats directly.
+	// Add a uint32 when the number of size classes is even so that the
+	// fields below stay 64-bit aligned for atomic operations on 32-bit platforms.
+ _ [1 - _NumSizeClasses%2]uint32
last_gc_nanotime uint64 // last gc (monotonic time)
tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
@@ -106,11 +117,10 @@ type mstats struct {
// heap_live is the number of bytes considered live by the GC.
// That is: retained by the most recent GC plus allocated
- // since then. heap_live <= heap_alloc, since heap_alloc
- // includes unmarked objects that have not yet been swept (and
- // hence goes up as we allocate and down as we sweep) while
- // heap_live excludes these objects (and hence only goes up
- // between GCs).
+ // since then. heap_live <= alloc, since alloc includes unmarked
+ // objects that have not yet been swept (and hence goes up as we
+ // allocate and down as we sweep) while heap_live excludes these
+ // objects (and hence only goes up between GCs).
//
// This is updated atomically without locking. To reduce
// contention, this is updated only when obtaining a span from
@@ -135,6 +145,8 @@ type mstats struct {
// no-scan objects and no-scan tails of objects.
//
// Whenever this is updated, call gcController.revise().
+ //
+ // Read and written atomically or with the world stopped.
heap_scan uint64
// heap_marked is the number of bytes marked by the previous
@@ -142,6 +154,17 @@ type mstats struct {
// unlike heap_live, heap_marked does not change until the
// next mark termination.
heap_marked uint64
+
+	// heapStats is a set of memory statistics whose updates are kept
+	// consistent with one another; see consistentHeapStats.
+ heapStats consistentHeapStats
+
+ // _ uint32 // ensure gcPauseDist is aligned
+
+ // gcPauseDist represents the distribution of all GC-related
+ // application pauses in the runtime.
+ //
+ // Each individual pause is counted separately, unlike pause_ns.
+ gcPauseDist timeHistogram
}
var memstats mstats
@@ -419,24 +442,25 @@ type MemStats struct {
}
}
-// Size of the trailing by_size array differs between mstats and MemStats,
-// and all data after by_size is local to runtime, not exported.
-// NumSizeClasses was changed, but we cannot change MemStats because of backward compatibility.
-// sizeof_C_MStats is the size of the prefix of mstats that
-// corresponds to MemStats. It should match Sizeof(MemStats{}).
-var sizeof_C_MStats = unsafe.Offsetof(memstats.by_size) + 61*unsafe.Sizeof(memstats.by_size[0])
-
func init() {
- var memStats MemStats
- if sizeof_C_MStats != unsafe.Sizeof(memStats) {
- println(sizeof_C_MStats, unsafe.Sizeof(memStats))
- throw("MStats vs MemStatsType size mismatch")
- }
-
- if unsafe.Offsetof(memstats.heap_live)%8 != 0 {
- println(unsafe.Offsetof(memstats.heap_live))
+ if offset := unsafe.Offsetof(memstats.heap_live); offset%8 != 0 {
+ println(offset)
throw("memstats.heap_live not aligned to 8 bytes")
}
+ if offset := unsafe.Offsetof(memstats.heapStats); offset%8 != 0 {
+ println(offset)
+ throw("memstats.heapStats not aligned to 8 bytes")
+ }
+ if offset := unsafe.Offsetof(memstats.gcPauseDist); offset%8 != 0 {
+ println(offset)
+ throw("memstats.gcPauseDist not aligned to 8 bytes")
+ }
+ // Ensure the size of heapStatsDelta causes adjacent fields/slots (e.g.
+ // [3]heapStatsDelta) to be 8-byte aligned.
+ if size := unsafe.Sizeof(heapStatsDelta{}); size%8 != 0 {
+ println(size)
+ throw("heapStatsDelta not a multiple of 8 bytes in size")
+ }
}
// ReadMemStats populates m with memory allocator statistics.
@@ -458,14 +482,74 @@ func ReadMemStats(m *MemStats) {
func readmemstats_m(stats *MemStats) {
updatememstats()
- // The size of the trailing by_size array differs between
- // mstats and MemStats. NumSizeClasses was changed, but we
- // cannot change MemStats because of backward compatibility.
- memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats)
-
+ stats.Alloc = memstats.alloc
+ stats.TotalAlloc = memstats.total_alloc
+ stats.Sys = memstats.sys
+ stats.Mallocs = memstats.nmalloc
+ stats.Frees = memstats.nfree
+ stats.HeapAlloc = memstats.alloc
+ stats.HeapSys = memstats.heap_sys.load()
+ // By definition, HeapIdle is memory that was mapped
+ // for the heap but is not currently used to hold heap
+ // objects. It also specifically is memory that can be
+ // used for other purposes, like stacks, but this memory
+ // is subtracted out of HeapSys before it makes that
+ // transition. Put another way:
+ //
+ // heap_sys = bytes allocated from the OS for the heap - bytes ultimately used for non-heap purposes
+ // heap_idle = bytes allocated from the OS for the heap - bytes ultimately used for any purpose
+ //
+ // or
+ //
+ // heap_sys = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse
+ // heap_idle = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse - heap_inuse
+ //
+ // => heap_idle = heap_sys - heap_inuse
+ stats.HeapIdle = memstats.heap_sys.load() - memstats.heap_inuse
+ stats.HeapInuse = memstats.heap_inuse
+ stats.HeapReleased = memstats.heap_released
+ stats.HeapObjects = memstats.heap_objects
+ stats.StackInuse = memstats.stacks_inuse
// memstats.stacks_sys is only memory mapped directly for OS stacks.
// Add in heap-allocated stack memory for user consumption.
- stats.StackSys += stats.StackInuse
+ stats.StackSys = memstats.stacks_inuse + memstats.stacks_sys.load()
+ stats.MSpanInuse = memstats.mspan_inuse
+ stats.MSpanSys = memstats.mspan_sys.load()
+ stats.MCacheInuse = memstats.mcache_inuse
+ stats.MCacheSys = memstats.mcache_sys.load()
+ stats.BuckHashSys = memstats.buckhash_sys.load()
+ // MemStats defines GCSys as an aggregate of all memory related
+ // to the memory management system, but we track this memory
+ // at a more granular level in the runtime.
+ stats.GCSys = memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse
+ stats.OtherSys = memstats.other_sys.load()
+ stats.NextGC = memstats.next_gc
+ stats.LastGC = memstats.last_gc_unix
+ stats.PauseTotalNs = memstats.pause_total_ns
+ stats.PauseNs = memstats.pause_ns
+ stats.PauseEnd = memstats.pause_end
+ stats.NumGC = memstats.numgc
+ stats.NumForcedGC = memstats.numforcedgc
+ stats.GCCPUFraction = memstats.gc_cpu_fraction
+ stats.EnableGC = true
+
+ // Handle BySize. Copy N values, where N is
+ // the minimum of the lengths of the two arrays.
+ // Unfortunately copy() won't work here because
+ // the arrays have different structs.
+ //
+ // TODO(mknyszek): Consider renaming the fields
+ // of by_size's elements to align so we can use
+ // the copy built-in.
+ bySizeLen := len(stats.BySize)
+ if l := len(memstats.by_size); l < bySizeLen {
+ bySizeLen = l
+ }
+ for i := 0; i < bySizeLen; i++ {
+ stats.BySize[i].Size = memstats.by_size[i].size
+ stats.BySize[i].Mallocs = memstats.by_size[i].nmalloc
+ stats.BySize[i].Frees = memstats.by_size[i].nfree
+ }
}
//go:linkname readGCStats runtime_1debug.readGCStats
@@ -511,8 +595,14 @@ func readGCStats_m(pauses *[]uint64) {
*pauses = p[:n+n+3]
}
+// Updates the memstats structure.
+//
+// The world must be stopped.
+//
//go:nowritebarrier
func updatememstats() {
+ assertWorldStopped()
+
// Flush mcaches to mcentral before doing anything else.
//
// Flushing to the mcentral may in general cause stats to
@@ -521,11 +611,9 @@ func updatememstats() {
memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
- memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
- memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys
-
- // We also count stacks_inuse as sys memory.
- memstats.sys += memstats.stacks_inuse
+ memstats.sys = memstats.heap_sys.load() + memstats.stacks_sys.load() + memstats.mspan_sys.load() +
+ memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gcMiscSys.load() +
+ memstats.other_sys.load()
// Calculate memory allocator stats.
// During program execution we only count number of frees and amount of freed memory.
@@ -542,62 +630,75 @@ func updatememstats() {
memstats.by_size[i].nmalloc = 0
memstats.by_size[i].nfree = 0
}
+	// Collect consistent stats, which are the source of truth in some cases.
+ var consStats heapStatsDelta
+ memstats.heapStats.unsafeRead(&consStats)
+
+ // Collect large allocation stats.
+ totalAlloc := uint64(consStats.largeAlloc)
+ memstats.nmalloc += uint64(consStats.largeAllocCount)
+ totalFree := uint64(consStats.largeFree)
+ memstats.nfree += uint64(consStats.largeFreeCount)
- // Aggregate local stats.
- cachestats()
-
- // Collect allocation stats. This is safe and consistent
- // because the world is stopped.
- var smallFree, totalAlloc, totalFree uint64
- // Collect per-spanclass stats.
- for spc := range mheap_.central {
- // The mcaches are now empty, so mcentral stats are
- // up-to-date.
- c := &mheap_.central[spc].mcentral
- memstats.nmalloc += c.nmalloc
- i := spanClass(spc).sizeclass()
- memstats.by_size[i].nmalloc += c.nmalloc
- totalAlloc += c.nmalloc * uint64(class_to_size[i])
- }
// Collect per-sizeclass stats.
for i := 0; i < _NumSizeClasses; i++ {
- if i == 0 {
- memstats.nmalloc += mheap_.nlargealloc
- totalAlloc += mheap_.largealloc
- totalFree += mheap_.largefree
- memstats.nfree += mheap_.nlargefree
- continue
- }
-
- // The mcache stats have been flushed to mheap_.
- memstats.nfree += mheap_.nsmallfree[i]
- memstats.by_size[i].nfree = mheap_.nsmallfree[i]
- smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
+ // Malloc stats.
+ a := uint64(consStats.smallAllocCount[i])
+ totalAlloc += a * uint64(class_to_size[i])
+ memstats.nmalloc += a
+ memstats.by_size[i].nmalloc = a
+
+ // Free stats.
+ f := uint64(consStats.smallFreeCount[i])
+ totalFree += f * uint64(class_to_size[i])
+ memstats.nfree += f
+ memstats.by_size[i].nfree = f
}
- totalFree += smallFree
+ // Account for tiny allocations.
memstats.nfree += memstats.tinyallocs
memstats.nmalloc += memstats.tinyallocs
// Calculate derived stats.
memstats.total_alloc = totalAlloc
memstats.alloc = totalAlloc - totalFree
- memstats.heap_alloc = memstats.alloc
memstats.heap_objects = memstats.nmalloc - memstats.nfree
-}
-// cachestats flushes all mcache stats.
-//
-// The world must be stopped.
-//
-//go:nowritebarrier
-func cachestats() {
- for _, p := range allp {
- c := p.mcache
- if c == nil {
- continue
- }
- purgecachedstats(c)
+ memstats.stacks_inuse = uint64(consStats.inStacks)
+ memstats.gcWorkBufInUse = uint64(consStats.inWorkBufs)
+ memstats.gcProgPtrScalarBitsInUse = uint64(consStats.inPtrScalarBits)
+
+ // We also count stacks_inuse, gcWorkBufInUse, and gcProgPtrScalarBitsInUse as sys memory.
+ memstats.sys += memstats.stacks_inuse + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse
+
+ // The world is stopped, so the consistent stats (after aggregation)
+ // should be identical to some combination of memstats. In particular:
+ //
+ // * heap_inuse == inHeap
+ // * heap_released == released
+ // * heap_sys - heap_released == committed - inStacks - inWorkBufs - inPtrScalarBits
+ //
+ // Check if that's actually true.
+ //
+ // TODO(mknyszek): Maybe don't throw here. It would be bad if a
+ // bug in otherwise benign accounting caused the whole application
+ // to crash.
+ if memstats.heap_inuse != uint64(consStats.inHeap) {
+ print("runtime: heap_inuse=", memstats.heap_inuse, "\n")
+ print("runtime: consistent value=", consStats.inHeap, "\n")
+ throw("heap_inuse and consistent stats are not equal")
+ }
+ if memstats.heap_released != uint64(consStats.released) {
+ print("runtime: heap_released=", memstats.heap_released, "\n")
+ print("runtime: consistent value=", consStats.released, "\n")
+ throw("heap_released and consistent stats are not equal")
+ }
+ globalRetained := memstats.heap_sys.load() - memstats.heap_released
+ consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits)
+ if globalRetained != consRetained {
+ print("runtime: global value=", globalRetained, "\n")
+ print("runtime: consistent value=", consRetained, "\n")
+ throw("measures of the retained heap are not equal")
}
}
@@ -607,6 +708,8 @@ func cachestats() {
//
//go:nowritebarrier
func flushmcache(i int) {
+ assertWorldStopped()
+
p := allp[i]
c := p.mcache
if c == nil {
@@ -621,69 +724,256 @@ func flushmcache(i int) {
//
//go:nowritebarrier
func flushallmcaches() {
+ assertWorldStopped()
+
for i := 0; i < int(gomaxprocs); i++ {
flushmcache(i)
}
}
+// sysMemStat represents a global system statistic that is managed atomically.
+//
+// This type must structurally be a uint64 so that mstats aligns with MemStats.
+type sysMemStat uint64
+
+// load atomically reads the value of the stat.
+//
+// Must be nosplit as it is called in runtime initialization, e.g. newosproc0.
+//go:nosplit
+func (s *sysMemStat) load() uint64 {
+ return atomic.Load64((*uint64)(s))
+}
+
+// add atomically adds the sysMemStat by n.
+//
+// Must be nosplit as it is called in runtime initialization, e.g. newosproc0.
//go:nosplit
-func purgecachedstats(c *mcache) {
- // Protected by either heap or GC lock.
- h := &mheap_
- memstats.heap_scan += uint64(c.local_scan)
- c.local_scan = 0
- memstats.tinyallocs += uint64(c.local_tinyallocs)
- c.local_tinyallocs = 0
- h.largefree += uint64(c.local_largefree)
- c.local_largefree = 0
- h.nlargefree += uint64(c.local_nlargefree)
- c.local_nlargefree = 0
- for i := 0; i < len(c.local_nsmallfree); i++ {
- h.nsmallfree[i] += uint64(c.local_nsmallfree[i])
- c.local_nsmallfree[i] = 0
+func (s *sysMemStat) add(n int64) {
+ if s == nil {
+ return
+ }
+ val := atomic.Xadd64((*uint64)(s), n)
+ if (n > 0 && int64(val) < n) || (n < 0 && int64(val)+n < n) {
+ print("runtime: val=", val, " n=", n, "\n")
+ throw("sysMemStat overflow")
+ }
+}
+
+// heapStatsDelta contains deltas of various runtime memory statistics
+// that need to be updated together in order for them to be kept
+// consistent with one another.
+type heapStatsDelta struct {
+ // Memory stats.
+ committed int64 // byte delta of memory committed
+	released        int64 // byte delta of memory released back to the OS
+ inHeap int64 // byte delta of memory placed in the heap
+ inStacks int64 // byte delta of memory reserved for stacks
+ inWorkBufs int64 // byte delta of memory reserved for work bufs
+ inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits
+
+ // Allocator stats.
+ largeAlloc uintptr // bytes allocated for large objects
+ largeAllocCount uintptr // number of large object allocations
+ smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
+ largeFree uintptr // bytes freed for large objects (>maxSmallSize)
+ largeFreeCount uintptr // number of frees for large objects (>maxSmallSize)
+ smallFreeCount [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize)
+
+ // Add a uint32 to ensure this struct is a multiple of 8 bytes in size.
+ // Only necessary on 32-bit platforms.
+ // _ [(sys.PtrSize / 4) % 2]uint32
+}
+
+// merge adds in the deltas from b into a.
+func (a *heapStatsDelta) merge(b *heapStatsDelta) {
+ a.committed += b.committed
+ a.released += b.released
+ a.inHeap += b.inHeap
+ a.inStacks += b.inStacks
+ a.inWorkBufs += b.inWorkBufs
+ a.inPtrScalarBits += b.inPtrScalarBits
+
+ a.largeAlloc += b.largeAlloc
+ a.largeAllocCount += b.largeAllocCount
+ for i := range b.smallAllocCount {
+ a.smallAllocCount[i] += b.smallAllocCount[i]
}
+ a.largeFree += b.largeFree
+ a.largeFreeCount += b.largeFreeCount
+ for i := range b.smallFreeCount {
+ a.smallFreeCount[i] += b.smallFreeCount[i]
+ }
+}
+
+// consistentHeapStats represents a set of various memory statistics
+// whose updates must be viewed completely to get a consistent
+// state of the world.
+//
+// To write updates to memory stats use the acquire and release
+// methods. To obtain a consistent global snapshot of these statistics,
+// use read.
+type consistentHeapStats struct {
+ // stats is a ring buffer of heapStatsDelta values.
+ // Writers always atomically update the delta at index gen.
+ //
+ // Readers operate by rotating gen (0 -> 1 -> 2 -> 0 -> ...)
+ // and synchronizing with writers by observing each P's
+ // statsSeq field. If the reader observes a P not writing,
+ // it can be sure that it will pick up the new gen value the
+ // next time it writes.
+ //
+ // The reader then takes responsibility by clearing space
+ // in the ring buffer for the next reader to rotate gen to
+ // that space (i.e. it merges in values from index (gen-2) mod 3
+ // to index (gen-1) mod 3, then clears the former).
+ //
+ // Note that this means only one reader can be reading at a time.
+ // There is no way for readers to synchronize.
+ //
+ // This process is why we need a ring buffer of size 3 instead
+ // of 2: one is for the writers, one contains the most recent
+ // data, and the last one is clear so writers can begin writing
+ // to it the moment gen is updated.
+ stats [3]heapStatsDelta
+
+ // gen represents the current index into which writers
+ // are writing, and can take on the value of 0, 1, or 2.
+ // This value is updated atomically.
+ gen uint32
+
+ // noPLock is intended to provide mutual exclusion for updating
+ // stats when no P is available. It does not block other writers
+ // with a P, only other writers without a P and the reader. Because
+ // stats are usually updated when a P is available, contention on
+ // this lock should be minimal.
+ noPLock mutex
}
-// Atomically increases a given *system* memory stat. We are counting on this
-// stat never overflowing a uintptr, so this function must only be used for
-// system memory stats.
+// acquire returns a heapStatsDelta to be updated. In effect,
+// it acquires the shard for writing. release must be called
+// as soon as the relevant deltas are updated.
//
-// The current implementation for little endian architectures is based on
-// xadduintptr(), which is less than ideal: xadd64() should really be used.
-// Using xadduintptr() is a stop-gap solution until arm supports xadd64() that
-// doesn't use locks. (Locks are a problem as they require a valid G, which
-// restricts their useability.)
+// The returned heapStatsDelta must be updated atomically.
//
-// A side-effect of using xadduintptr() is that we need to check for
-// overflow errors.
-//go:nosplit
-func mSysStatInc(sysStat *uint64, n uintptr) {
- if sysStat == nil {
- return
+// The caller's P must not change between acquire and
+// release. This also means that the caller should not
+// acquire a P or release its P in between.
+func (m *consistentHeapStats) acquire() *heapStatsDelta {
+ if pp := getg().m.p.ptr(); pp != nil {
+ seq := atomic.Xadd(&pp.statsSeq, 1)
+ if seq%2 == 0 {
+ // Should have been incremented to odd.
+ print("runtime: seq=", seq, "\n")
+ throw("bad sequence number")
+ }
+ } else {
+ lock(&m.noPLock)
}
- if sys.BigEndian {
- atomic.Xadd64(sysStat, int64(n))
- return
+ gen := atomic.Load(&m.gen) % 3
+ return &m.stats[gen]
+}
+
+// release indicates that the writer is done modifying
+// the delta. The value returned by the corresponding
+// acquire must no longer be accessed or modified after
+// release is called.
+//
+// The caller's P must not change between acquire and
+// release. This also means that the caller should not
+// acquire a P or release its P in between.
+func (m *consistentHeapStats) release() {
+ if pp := getg().m.p.ptr(); pp != nil {
+ seq := atomic.Xadd(&pp.statsSeq, 1)
+ if seq%2 != 0 {
+ // Should have been incremented to even.
+ print("runtime: seq=", seq, "\n")
+ throw("bad sequence number")
+ }
+ } else {
+ unlock(&m.noPLock)
}
- if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), n); val < n {
- print("runtime: stat overflow: val ", val, ", n ", n, "\n")
- exit(2)
+}
+
+// unsafeRead aggregates the delta for this shard into out.
+//
+// Unsafe because it does so without any synchronization. The
+// world must be stopped.
+func (m *consistentHeapStats) unsafeRead(out *heapStatsDelta) {
+ assertWorldStopped()
+
+ for i := range m.stats {
+ out.merge(&m.stats[i])
}
}
-// Atomically decreases a given *system* memory stat. Same comments as
-// mSysStatInc apply.
-//go:nosplit
-func mSysStatDec(sysStat *uint64, n uintptr) {
- if sysStat == nil {
- return
+// unsafeClear clears the shard.
+//
+// Unsafe because the world must be stopped and values should
+// be donated elsewhere before clearing.
+func (m *consistentHeapStats) unsafeClear() {
+ assertWorldStopped()
+
+ for i := range m.stats {
+ m.stats[i] = heapStatsDelta{}
}
- if sys.BigEndian {
- atomic.Xadd64(sysStat, -int64(n))
- return
+}
+
+// read takes a globally consistent snapshot of m
+// and puts the aggregated value in out. Even though out is a
+// heapStatsDelta, the resulting values should be complete and
+// valid statistic values.
+//
+// Not safe to call concurrently. The world must be stopped
+// or metricsSema must be held.
+func (m *consistentHeapStats) read(out *heapStatsDelta) {
+ // Getting preempted after this point is not safe because
+ // we read allp. We need to make sure a STW can't happen
+ // so it doesn't change out from under us.
+ mp := acquirem()
+
+ // Get the current generation. We can be confident that this
+ // will not change since read is serialized and is the only
+ // one that modifies currGen.
+ currGen := atomic.Load(&m.gen)
+ prevGen := currGen - 1
+ if currGen == 0 {
+ prevGen = 2
}
- if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), uintptr(-int64(n))); val+n < n {
- print("runtime: stat underflow: val ", val, ", n ", n, "\n")
- exit(2)
+
+ // Prevent writers without a P from writing while we update gen.
+ lock(&m.noPLock)
+
+ // Rotate gen, effectively taking a snapshot of the state of
+ // these statistics at the point of the exchange by moving
+ // writers to the next set of deltas.
+ //
+ // This exchange is safe to do because we won't race
+ // with anyone else trying to update this value.
+ atomic.Xchg(&m.gen, (currGen+1)%3)
+
+ // Allow P-less writers to continue. They'll be writing to the
+ // next generation now.
+ unlock(&m.noPLock)
+
+ for _, p := range allp {
+ // Spin until there are no more writers.
+ for atomic.Load(&p.statsSeq)%2 != 0 {
+ }
}
+
+ // At this point we've observed that each sequence
+ // number is even, so any future writers will observe
+ // the new gen value. That means it's safe to read from
+ // the other deltas in the stats buffer.
+
+ // Perform our responsibilities and free up
+ // stats[prevGen] for the next time we want to take
+ // a snapshot.
+ m.stats[currGen].merge(&m.stats[prevGen])
+ m.stats[prevGen] = heapStatsDelta{}
+
+ // Finally, copy out the complete delta.
+ *out = m.stats[currGen]
+
+ releasem(mp)
}
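
consistentHeapStats above is, in effect, a per-P seqlock feeding a three-slot ring: writers bracket each update with an odd/even sequence number and write into the slot named by gen, while the single reader rotates gen, waits for every sequence number to go even, folds the retired slot forward, and only then treats the current slot as a snapshot. The sketch below is a deliberately simplified, hypothetical model of that protocol, using plain goroutines in place of Ps and omitting noPLock and acquirem; it is not the runtime's code.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

const nWriters = 4

var (
	gen      uint32             // current write slot: 0, 1, or 2
	slots    [3][nWriters]int64 // per-writer deltas; one column per writer
	writeSeq [nWriters]uint32   // odd while writer w has an update in flight
)

// add mirrors acquire/release: mark the sequence number odd, write into
// the slot selected by gen, then mark it even again.
func add(w int, delta int64) {
	atomic.AddUint32(&writeSeq[w], 1) // odd: update in progress
	g := atomic.LoadUint32(&gen)
	atomic.AddInt64(&slots[g][w], delta)
	atomic.AddUint32(&writeSeq[w], 1) // even: update finished
}

// read mirrors consistentHeapStats.read: rotate gen, wait for every
// writer to go even, fold the retired slot into the snapshot slot,
// and return the aggregated total.
func read() int64 {
	cur := atomic.LoadUint32(&gen)
	prev := (cur + 2) % 3
	atomic.StoreUint32(&gen, (cur+1)%3) // writers now target the next slot
	for w := 0; w < nWriters; w++ {
		for atomic.LoadUint32(&writeSeq[w])%2 != 0 {
			// spin: writer w is finishing an update against some gen
		}
	}
	var total int64
	for w := 0; w < nWriters; w++ {
		atomic.AddInt64(&slots[cur][w], atomic.LoadInt64(&slots[prev][w]))
		atomic.StoreInt64(&slots[prev][w], 0)
		total += atomic.LoadInt64(&slots[cur][w])
	}
	return total
}

func main() {
	var wg sync.WaitGroup
	for w := 0; w < nWriters; w++ {
		w := w
		wg.Add(1)
		go func() {
			defer wg.Done()
			for i := 0; i < 1000; i++ {
				add(w, 1)
			}
		}()
	}
	wg.Wait()
	fmt.Println("total:", read()) // 4 writers * 1000 = 4000
}
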
diff --git a/libgo/go/runtime/mwbbuf.go b/libgo/go/runtime/mwbbuf.go
index 548d4c5..b8d4fc2 100644
--- a/libgo/go/runtime/mwbbuf.go
+++ b/libgo/go/runtime/mwbbuf.go
@@ -57,12 +57,6 @@ type wbBuf struct {
// on. This must be a multiple of wbBufEntryPointers because
// the write barrier only checks for overflow once per entry.
buf [wbBufEntryPointers * wbBufEntries]uintptr
-
- // debugGen causes the write barrier buffer to flush after
- // every write barrier if equal to gcWorkPauseGen. This is for
- // debugging #27993. This is only set if debugCachedWork is
- // set.
- debugGen uint32
}
const (
@@ -86,7 +80,7 @@ const (
func (b *wbBuf) reset() {
start := uintptr(unsafe.Pointer(&b.buf[0]))
b.next = start
- if writeBarrier.cgo || (debugCachedWork && (throwOnGCWork || b.debugGen == atomic.Load(&gcWorkPauseGen))) {
+ if writeBarrier.cgo {
// Effectively disable the buffer by forcing a flush
// on every barrier.
b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
@@ -204,32 +198,10 @@ func wbBufFlush(dst *uintptr, src uintptr) {
// Switch to the system stack so we don't have to worry about
// the untyped stack slots or safe points.
systemstack(func() {
- if debugCachedWork {
- // For debugging, include the old value of the
- // slot and some other data in the traceback.
- wbBuf := &getg().m.p.ptr().wbBuf
- var old uintptr
- if dst != nil {
- // dst may be nil in direct calls to wbBufFlush.
- old = *dst
- }
- wbBufFlush1Debug(old, wbBuf.buf[0], wbBuf.buf[1], &wbBuf.buf[0], wbBuf.next)
- } else {
- wbBufFlush1(getg().m.p.ptr())
- }
+ wbBufFlush1(getg().m.p.ptr())
})
}
-// wbBufFlush1Debug is a temporary function for debugging issue
-// #27993. It exists solely to add some context to the traceback.
-//
-//go:nowritebarrierrec
-//go:systemstack
-//go:noinline
-func wbBufFlush1Debug(old, buf1, buf2 uintptr, start *uintptr, next uintptr) {
- wbBufFlush1(getg().m.p.ptr())
-}
-
// wbBufFlush1 flushes p's write barrier buffer to the GC work queue.
//
// This must not have write barriers because it is part of the write
diff --git a/libgo/go/runtime/os_aix.go b/libgo/go/runtime/os_aix.go
index f49b83c..630a1c2 100644
--- a/libgo/go/runtime/os_aix.go
+++ b/libgo/go/runtime/os_aix.go
@@ -7,7 +7,6 @@
package runtime
import (
- "internal/cpu"
"unsafe"
)
@@ -122,28 +121,9 @@ func semawakeup(mp *m) {
func osinit() {
ncpu = int32(sysconf(__SC_NPROCESSORS_ONLN))
physPageSize = uintptr(sysconf(__SC_PAGE_SIZE))
- setupSystemConf()
}
const (
_CLOCK_REALTIME = 9
_CLOCK_MONOTONIC = 10
)
-
-const (
- // getsystemcfg constants
- _IMPL_POWER8 = 0x10000
- _IMPL_POWER9 = 0x20000
-)
-
-// setupSystemConf retrieves information about the CPU and updates
-// cpu.HWCap variables.
-func setupSystemConf() {
- impl := getsystemcfg(_SC_IMPL)
- if impl&_IMPL_POWER8 != 0 {
- cpu.HWCap2 |= cpu.PPC_FEATURE2_ARCH_2_07
- }
- if impl&_IMPL_POWER9 != 0 {
- cpu.HWCap2 |= cpu.PPC_FEATURE2_ARCH_3_00
- }
-}
diff --git a/libgo/go/runtime/os_freebsd_arm64.go b/libgo/go/runtime/os_freebsd_arm64.go
index 51ebf9d..b5b25f0 100644
--- a/libgo/go/runtime/os_freebsd_arm64.go
+++ b/libgo/go/runtime/os_freebsd_arm64.go
@@ -4,149 +4,6 @@
package runtime
-import "internal/cpu"
-
-const (
- hwcap_FP = 1 << 0
- hwcap_ASIMD = 1 << 1
- hwcap_EVTSTRM = 1 << 2
- hwcap_AES = 1 << 3
- hwcap_PMULL = 1 << 4
- hwcap_SHA1 = 1 << 5
- hwcap_SHA2 = 1 << 6
- hwcap_CRC32 = 1 << 7
- hwcap_ATOMICS = 1 << 8
- hwcap_FPHP = 1 << 9
- hwcap_ASIMDHP = 1 << 10
- hwcap_CPUID = 1 << 11
- hwcap_ASIMDRDM = 1 << 12
- hwcap_JSCVT = 1 << 13
- hwcap_FCMA = 1 << 14
- hwcap_LRCPC = 1 << 15
- hwcap_DCPOP = 1 << 16
- hwcap_SHA3 = 1 << 17
- hwcap_SM3 = 1 << 18
- hwcap_SM4 = 1 << 19
- hwcap_ASIMDDP = 1 << 20
- hwcap_SHA512 = 1 << 21
- hwcap_SVE = 1 << 22
- hwcap_ASIMDFHM = 1 << 23
-)
-
-func getisar0() uint64
-func getisar1() uint64
-func getpfr0() uint64
-
-// no hwcap support on FreeBSD aarch64, we need to retrieve the info from
-// ID_AA64ISAR0_EL1, ID_AA64ISAR1_EL1 and ID_AA64PFR0_EL1
-func archauxv(tag, val uintptr) {
- var isar0, isar1, pfr0 uint64
-
- isar0 = getisar0()
- isar1 = getisar1()
- pfr0 = getpfr0()
-
- // ID_AA64ISAR0_EL1
- switch extractBits(isar0, 4, 7) {
- case 1:
- cpu.HWCap |= hwcap_AES
- case 2:
- cpu.HWCap |= hwcap_PMULL | hwcap_AES
- }
-
- switch extractBits(isar0, 8, 11) {
- case 1:
- cpu.HWCap |= hwcap_SHA1
- }
-
- switch extractBits(isar0, 12, 15) {
- case 1:
- cpu.HWCap |= hwcap_SHA2
- case 2:
- cpu.HWCap |= hwcap_SHA2 | hwcap_SHA512
- }
-
- switch extractBits(isar0, 16, 19) {
- case 1:
- cpu.HWCap |= hwcap_CRC32
- }
-
- switch extractBits(isar0, 20, 23) {
- case 2:
- cpu.HWCap |= hwcap_ATOMICS
- }
-
- switch extractBits(isar0, 28, 31) {
- case 1:
- cpu.HWCap |= hwcap_ASIMDRDM
- }
-
- switch extractBits(isar0, 32, 35) {
- case 1:
- cpu.HWCap |= hwcap_SHA3
- }
-
- switch extractBits(isar0, 36, 39) {
- case 1:
- cpu.HWCap |= hwcap_SM3
- }
-
- switch extractBits(isar0, 40, 43) {
- case 1:
- cpu.HWCap |= hwcap_SM4
- }
-
- switch extractBits(isar0, 44, 47) {
- case 1:
- cpu.HWCap |= hwcap_ASIMDDP
- }
-
- // ID_AA64ISAR1_EL1
- switch extractBits(isar1, 0, 3) {
- case 1:
- cpu.HWCap |= hwcap_DCPOP
- }
-
- switch extractBits(isar1, 12, 15) {
- case 1:
- cpu.HWCap |= hwcap_JSCVT
- }
-
- switch extractBits(isar1, 16, 19) {
- case 1:
- cpu.HWCap |= hwcap_FCMA
- }
-
- switch extractBits(isar1, 20, 23) {
- case 1:
- cpu.HWCap |= hwcap_LRCPC
- }
-
- // ID_AA64PFR0_EL1
- switch extractBits(pfr0, 16, 19) {
- case 0:
- cpu.HWCap |= hwcap_FP
- case 1:
- cpu.HWCap |= hwcap_FP | hwcap_FPHP
- }
-
- switch extractBits(pfr0, 20, 23) {
- case 0:
- cpu.HWCap |= hwcap_ASIMD
- case 1:
- cpu.HWCap |= hwcap_ASIMD | hwcap_ASIMDHP
- }
-
- switch extractBits(pfr0, 32, 35) {
- case 1:
- cpu.HWCap |= hwcap_SVE
- }
-}
-
-func extractBits(data uint64, start, end uint) uint {
- return (uint)(data>>start) & ((1 << (end - start + 1)) - 1)
-}
-
//go:nosplit
func cputicks() int64 {
// Currently cputicks() is used in blocking profiler and to seed fastrand().
diff --git a/libgo/go/runtime/os_linux.go b/libgo/go/runtime/os_linux.go
index 627b6d6..0eebdfa 100644
--- a/libgo/go/runtime/os_linux.go
+++ b/libgo/go/runtime/os_linux.go
@@ -178,6 +178,10 @@ func sysargs(argc int32, argv **byte) {
sysauxv(buf[:])
}
+// startupRandomData holds random bytes initialized at startup. These come from
+// the ELF AT_RANDOM auxiliary vector.
+var startupRandomData []byte
+
func sysauxv(auxv []uintptr) int {
var i int
for ; auxv[i] != _AT_NULL; i += 2 {
diff --git a/libgo/go/runtime/os_linux_arm64.go b/libgo/go/runtime/os_linux_arm64.go
index a482d47..3844620 100644
--- a/libgo/go/runtime/os_linux_arm64.go
+++ b/libgo/go/runtime/os_linux_arm64.go
@@ -11,18 +11,6 @@ import "internal/cpu"
func archauxv(tag, val uintptr) {
switch tag {
case _AT_HWCAP:
- // arm64 doesn't have a 'cpuid' instruction equivalent and relies on
- // HWCAP/HWCAP2 bits for hardware capabilities.
- hwcap := uint(val)
- if GOOS == "android" {
- // The Samsung S9+ kernel reports support for atomics, but not all cores
- // actually support them, resulting in SIGILL. See issue #28431.
- // TODO(elias.naur): Only disable the optimization on bad chipsets.
- const hwcap_ATOMICS = 1 << 8
- hwcap &= ^uint(hwcap_ATOMICS)
- }
- cpu.HWCap = hwcap
- case _AT_HWCAP2:
- cpu.HWCap2 = uint(val)
+ cpu.HWCap = uint(val)
}
}
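
With HWCAP now handed straight to internal/cpu, ordinary programs that want the same feature bits should not parse the aux vector themselves; golang.org/x/sys/cpu exposes the decoded flags. A minimal example (assuming linux/arm64 and that the x/sys module is available):

    package main

    import (
        "fmt"

        "golang.org/x/sys/cpu"
    )

    func main() {
        // On linux/arm64 these flags are decoded from the same AT_HWCAP
        // auxv entry that archauxv passes to internal/cpu above.
        fmt.Println("LSE atomics:", cpu.ARM64.HasATOMICS)
        fmt.Println("AES:        ", cpu.ARM64.HasAES)
        fmt.Println("CRC32:      ", cpu.ARM64.HasCRC32)
    }
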
diff --git a/libgo/go/runtime/os_linux_s390x.go b/libgo/go/runtime/os_linux_s390x.go
index 46fe817..2d57d87 100644
--- a/libgo/go/runtime/os_linux_s390x.go
+++ b/libgo/go/runtime/os_linux_s390x.go
@@ -6,30 +6,9 @@ package runtime
import "internal/cpu"
-const (
- // bit masks taken from bits/hwcap.h
- _HWCAP_S390_ZARCH = 2
- _HWCAP_S390_STFLE = 4
- _HWCAP_S390_MSA = 8
- _HWCAP_S390_LDISP = 16
- _HWCAP_S390_EIMM = 32
- _HWCAP_S390_DFP = 64
- _HWCAP_S390_ETF3EH = 256
- _HWCAP_S390_VX = 2048 // vector facility
- _HWCAP_S390_VXE = 8192
-)
-
func archauxv(tag, val uintptr) {
switch tag {
- case _AT_HWCAP: // CPU capability bit flags
- cpu.S390X.HasZARCH = val&_HWCAP_S390_ZARCH != 0
- cpu.S390X.HasSTFLE = val&_HWCAP_S390_STFLE != 0
- cpu.S390X.HasLDISP = val&_HWCAP_S390_LDISP != 0
- cpu.S390X.HasEIMM = val&_HWCAP_S390_EIMM != 0
- cpu.S390X.HasDFP = val&_HWCAP_S390_DFP != 0
- cpu.S390X.HasETF3EH = val&_HWCAP_S390_ETF3EH != 0
- cpu.S390X.HasMSA = val&_HWCAP_S390_MSA != 0
- cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
- cpu.S390X.HasVXE = val&_HWCAP_S390_VXE != 0
+ case _AT_HWCAP:
+ cpu.HWCap = uint(val)
}
}
diff --git a/libgo/go/runtime/os_netbsd.go b/libgo/go/runtime/os_netbsd.go
index 00c3285..b2c815e 100644
--- a/libgo/go/runtime/os_netbsd.go
+++ b/libgo/go/runtime/os_netbsd.go
@@ -33,13 +33,22 @@ func lwp_unpark(lwp int32, hint unsafe.Pointer) int32
//extern-sysinfo sysctl
func sysctl(*uint32, uint32, *byte, *uintptr, *byte, uintptr) int32
-func getncpu() int32 {
- mib := [2]uint32{_CTL_HW, _HW_NCPU}
- out := uint32(0)
+func sysctlInt(mib []uint32) (int32, bool) {
+ var out int32
nout := unsafe.Sizeof(out)
- ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
- if ret >= 0 {
- return int32(out)
+ ret := sysctl(&mib[0], uint32(len(mib)), (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+ if ret < 0 {
+ return 0, false
+ }
+ return out, true
+}
+
+func getncpu() int32 {
+ if n, ok := sysctlInt([]uint32{_CTL_HW, _HW_NCPUONLINE}); ok {
+ return int32(n)
+ }
+ if n, ok := sysctlInt([]uint32{_CTL_HW, _HW_NCPU}); ok {
+ return int32(n)
}
return 1
}
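
The same preference order (hw.ncpuonline first, hw.ncpu as a fallback) can be expressed in user code with golang.org/x/sys/unix; a hedged sketch of the pattern, not part of the patch:

    package main

    import (
        "fmt"

        "golang.org/x/sys/unix"
    )

    // ncpu prefers the count of online CPUs and falls back to the
    // configured CPU count, mirroring the getncpu change above.
    func ncpu() uint32 {
        if n, err := unix.SysctlUint32("hw.ncpuonline"); err == nil {
            return n
        }
        if n, err := unix.SysctlUint32("hw.ncpu"); err == nil {
            return n
        }
        return 1
    }

    func main() { fmt.Println("cpus:", ncpu()) }
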
diff --git a/libgo/go/runtime/os_openbsd_arm64.go b/libgo/go/runtime/os_openbsd_arm64.go
deleted file mode 100644
index d559a2a..0000000
--- a/libgo/go/runtime/os_openbsd_arm64.go
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import (
- "internal/cpu"
-)
-
-//go:nosplit
-func cputicks() int64 {
- // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
- // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
- return nanotime()
-}
-
-func sysargs(argc int32, argv **byte) {
- // OpenBSD does not have auxv, however we still need to initialise cpu.HWCaps.
- // For now specify the bare minimum until we add some form of capabilities
- // detection. See issue #31746.
- cpu.HWCap = 1<<1 | 1<<0 // ASIMD, FP
-}
diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go
index f6747f5..11396b4 100644
--- a/libgo/go/runtime/panic.go
+++ b/libgo/go/runtime/panic.go
@@ -210,6 +210,11 @@ func panicmem() {
panic(memoryError)
}
+func panicmemAddr(addr uintptr) {
+ panicCheck2("invalid memory address or nil pointer dereference")
+ panic(errorAddressString{msg: "invalid memory address or nil pointer dereference", addr: addr})
+}
+
// deferproc creates a new deferred function.
// The compiler turns a defer statement into a call to this.
// frame points into the stack frame; it is used to determine which
@@ -292,15 +297,6 @@ func newdefer() *_defer {
systemstack(func() {
d = new(_defer)
})
- if debugCachedWork {
- // Duplicate the tail below so if there's a
- // crash in checkPut we can tell if d was just
- // allocated or came from the pool.
- d.heap = true
- d.link = gp._defer
- gp._defer = d
- return d
- }
}
d.heap = true
return d
@@ -1176,12 +1172,6 @@ func startpanic_m() bool {
}
}
-// throwReportQuirk, if non-nil, is called by throw after dumping the stacks.
-//
-// TODO(austin): Remove this after Go 1.15 when we remove the
-// mlockGsignal workaround.
-var throwReportQuirk func()
-
var didothers bool
var deadlock mutex
@@ -1228,10 +1218,6 @@ func dopanic_m(gp *g, pc, sp uintptr) bool {
printDebugLog()
- if throwReportQuirk != nil {
- throwReportQuirk()
- }
-
return docrash
}
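
panicmemAddr only changes which error value the runtime panics with; from user code a bad dereference still surfaces as a recoverable runtime.Error. A small, standard-Go illustration (not specific to this patch):

    package main

    import (
        "fmt"
        "runtime"
    )

    func main() {
        defer func() {
            if r := recover(); r != nil {
                // A nil-pointer dereference arrives as a runtime.Error.
                if err, ok := r.(runtime.Error); ok {
                    fmt.Println("recovered:", err)
                }
            }
        }()
        var p *int
        _ = *p // faults; the runtime turns this into a panic
    }
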
diff --git a/libgo/go/runtime/pprof/mprof_test.go b/libgo/go/runtime/pprof/mprof_test.go
index 83bf572..9551ea7 100644
--- a/libgo/go/runtime/pprof/mprof_test.go
+++ b/libgo/go/runtime/pprof/mprof_test.go
@@ -70,7 +70,7 @@ func TestMemoryProfiler(t *testing.T) {
runtime.MemProfileRate = oldRate
}()
- // Allocate a meg to ensure that mcache.next_sample is updated to 1.
+ // Allocate a meg to ensure that mcache.nextSample is updated to 1.
for i := 0; i < 1024; i++ {
memSink = make([]byte, 1024)
}
diff --git a/libgo/go/runtime/pprof/pprof_rusage.go b/libgo/go/runtime/pprof/pprof_rusage.go
index d42e6ed..7954673 100644
--- a/libgo/go/runtime/pprof/pprof_rusage.go
+++ b/libgo/go/runtime/pprof/pprof_rusage.go
@@ -19,7 +19,7 @@ func addMaxRSS(w io.Writer) {
switch runtime.GOOS {
case "linux", "android":
rssToBytes = 1024
- case "darwin":
+ case "darwin", "ios":
rssToBytes = 1
default:
panic("unsupported OS")
diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go
index 7adf891..42411e9 100644
--- a/libgo/go/runtime/pprof/pprof_test.go
+++ b/libgo/go/runtime/pprof/pprof_test.go
@@ -13,7 +13,6 @@ import (
"internal/profile"
"internal/testenv"
"io"
- "io/ioutil"
"math/big"
"os"
"os/exec"
@@ -262,7 +261,7 @@ func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Loca
// as interpreted by matches, and returns the parsed profile.
func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) *profile.Profile {
switch runtime.GOOS {
- case "darwin":
+ case "darwin", "ios":
switch runtime.GOARCH {
case "arm64":
// nothing
@@ -280,12 +279,16 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri
broken := false
switch runtime.GOOS {
- case "darwin", "dragonfly", "netbsd", "illumos", "solaris":
+ case "darwin", "ios", "dragonfly", "netbsd", "illumos", "solaris":
broken = true
case "openbsd":
if runtime.GOARCH == "arm" || runtime.GOARCH == "arm64" {
broken = true
}
+ case "windows":
+ if runtime.GOARCH == "arm" {
+ broken = true // See https://golang.org/issues/42862
+ }
}
maxDuration := 5 * time.Second
@@ -1195,7 +1198,7 @@ func TestLabelRace(t *testing.T) {
// Check that there is no deadlock when the program receives SIGPROF while in
// 64bit atomics' critical section. Used to happen on mips{,le}. See #20146.
func TestAtomicLoadStore64(t *testing.T) {
- f, err := ioutil.TempFile("", "profatomic")
+ f, err := os.CreateTemp("", "profatomic")
if err != nil {
t.Fatalf("TempFile: %v", err)
}
@@ -1224,7 +1227,7 @@ func TestAtomicLoadStore64(t *testing.T) {
func TestTracebackAll(t *testing.T) {
// With gccgo, if a profiling signal arrives at the wrong time
// during traceback, it may crash or hang. See issue #29448.
- f, err := ioutil.TempFile("", "proftraceback")
+ f, err := os.CreateTemp("", "proftraceback")
if err != nil {
t.Fatalf("TempFile: %v", err)
}
diff --git a/libgo/go/runtime/pprof/proto.go b/libgo/go/runtime/pprof/proto.go
index bd269e7..6c5dd31 100644
--- a/libgo/go/runtime/pprof/proto.go
+++ b/libgo/go/runtime/pprof/proto.go
@@ -10,7 +10,7 @@ import (
"fmt"
internalcpu "internal/cpu"
"io"
- "io/ioutil"
+ "os"
"runtime"
"strconv"
"time"
@@ -588,7 +588,7 @@ func (b *profileBuilder) emitLocation() uint64 {
// It saves the address ranges of the mappings in b.mem for use
// when emitting locations.
func (b *profileBuilder) readMapping() {
- data, _ := ioutil.ReadFile("/proc/self/maps")
+ data, _ := os.ReadFile("/proc/self/maps")
parseProcSelfMaps(data, b.addMapping)
if len(b.mem) == 0 { // pprof expects a map entry, so fake one.
b.addMappingEntry(0, 0, 0, "", "", true)
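
The ioutil changes in this and the surrounding test files all follow the Go 1.16 relocation of the io/ioutil helpers into os; the mapping, with a small runnable example:

    package main

    import (
        "fmt"
        "os"
    )

    func main() {
        // ioutil.ReadFile  -> os.ReadFile
        // ioutil.WriteFile -> os.WriteFile
        // ioutil.TempFile  -> os.CreateTemp
        // ioutil.TempDir   -> os.MkdirTemp
        dir, err := os.MkdirTemp("", "example")
        if err != nil {
            fmt.Println("MkdirTemp:", err)
            return
        }
        defer os.RemoveAll(dir)

        path := dir + "/data.txt"
        if err := os.WriteFile(path, []byte("hello"), 0644); err != nil {
            fmt.Println("WriteFile:", err)
            return
        }
        data, _ := os.ReadFile(path)
        fmt.Println(string(data))
    }
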
diff --git a/libgo/go/runtime/pprof/proto_test.go b/libgo/go/runtime/pprof/proto_test.go
index e8efd4a..9290210 100644
--- a/libgo/go/runtime/pprof/proto_test.go
+++ b/libgo/go/runtime/pprof/proto_test.go
@@ -10,7 +10,6 @@ import (
"fmt"
"internal/profile"
"internal/testenv"
- "io/ioutil"
"os"
"os/exec"
"reflect"
@@ -80,7 +79,7 @@ func testPCs(t *testing.T) (addr1, addr2 uint64, map1, map2 *profile.Mapping) {
switch runtime.GOOS {
case "linux", "android", "netbsd":
// Figure out two addresses from /proc/self/maps.
- mmap, err := ioutil.ReadFile("/proc/self/maps")
+ mmap, err := os.ReadFile("/proc/self/maps")
if err != nil {
t.Fatal(err)
}
diff --git a/libgo/go/runtime/preempt.go b/libgo/go/runtime/preempt.go
index 8452076..40586ca 100644
--- a/libgo/go/runtime/preempt.go
+++ b/libgo/go/runtime/preempt.go
@@ -56,11 +56,6 @@ import (
"runtime/internal/atomic"
)
-// Keep in sync with cmd/compile/internal/gc/plive.go:go115ReduceLiveness.
-const go115ReduceLiveness = true
-
-const go115RestartSeq = go115ReduceLiveness && true // enable restartable sequences
-
type suspendGState struct {
g *g
diff --git a/libgo/go/runtime/print.go b/libgo/go/runtime/print.go
index e42023f..be51ac0 100644
--- a/libgo/go/runtime/print.go
+++ b/libgo/go/runtime/print.go
@@ -259,6 +259,9 @@ func printhex(v uint64) {
func printpointer(p unsafe.Pointer) {
printhex(uint64(uintptr(p)))
}
+func printuintptr(p uintptr) {
+ printhex(uint64(p))
+}
func printstring(s string) {
gwrite(bytes(s))
diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go
index e0b4b50..1696a1b 100644
--- a/libgo/go/runtime/proc.go
+++ b/libgo/go/runtime/proc.go
@@ -171,10 +171,20 @@ func main(unsafe.Pointer) {
maxstacksize = 250000000
}
+ // An upper limit for max stack size. Used to avoid random crashes
+ // after calling SetMaxStack and trying to allocate a stack that is too big,
+ // since stackalloc works with 32-bit sizes.
+ // Not used by gofrontend.
+ // maxstackceiling = 2 * maxstacksize
+
// Allow newproc to start new Ms.
mainStarted = true
if GOARCH != "wasm" { // no threads on wasm yet, so no sysmon
+ // For runtime_syscall_doAllThreadsSyscall, we record that sysmon
+ // is not yet ready for the world to be stopped.
+ atomic.Store(&sched.sysmonStarting, 1)
systemstack(func() {
newm(sysmon, nil, -1)
})
@@ -191,11 +201,22 @@ func main(unsafe.Pointer) {
if g.m != &m0 {
throw("runtime.main not on m0")
}
+ m0.doesPark = true
- if nanotime() == 0 {
+ // Record when the world started.
+ // Must be before doInit for tracing init.
+ runtimeInitTime = nanotime()
+ if runtimeInitTime == 0 {
throw("nanotime returning zero")
}
+ if debug.inittrace != 0 {
+ inittrace.id = getg().goid
+ inittrace.active = true
+ }
+
+ // doInit(&runtime_inittask) // Must be before defer.
+
// Defer unlock so that runtime.Goexit during init does the unlock too.
needUnlock := true
defer func() {
@@ -204,9 +225,6 @@ func main(unsafe.Pointer) {
}
}()
- // Record when the world started.
- runtimeInitTime = nanotime()
-
main_init_done = make(chan bool)
if iscgo {
// Start the template thread in case we enter Go from
@@ -218,12 +236,17 @@ func main(unsafe.Pointer) {
fn := main_init // make an indirect call, as the linker doesn't know the address of the main package when laying down the runtime
fn()
createGcRootsIndex()
- close(main_init_done)
// For gccgo we have to wait until after main is initialized
// to enable GC, because initializing main registers the GC roots.
gcenable()
+ // Disable init tracing after main init is done to avoid the
+ // overhead of collecting statistics in malloc and newproc.
+ inittrace.active = false
+
+ close(main_init_done)
+
needUnlock = false
unlockOSThread()
@@ -313,14 +336,23 @@ func goschedguarded() {
mcall(goschedguarded_m)
}
-// Puts the current goroutine into a waiting state and calls unlockf.
+// Puts the current goroutine into a waiting state and calls unlockf on the
+// system stack.
+//
// If unlockf returns false, the goroutine is resumed.
+//
// unlockf must not access this G's stack, as it may be moved between
// the call to gopark and the call to unlockf.
-// Reason explains why the goroutine has been parked.
-// It is displayed in stack traces and heap dumps.
-// Reasons should be unique and descriptive.
-// Do not re-use reasons, add new ones.
+//
+// Note that because unlockf is called after putting the G into a waiting
+// state, the G may have already been readied by the time unlockf is called
+// unless there is external synchronization preventing the G from being
+// readied. If unlockf returns false, it must guarantee that the G cannot be
+// externally readied.
+//
+// Reason explains why the goroutine has been parked. It is displayed in stack
+// traces and heap dumps. Reasons should be unique and descriptive. Do not
+// re-use reasons, add new ones.
func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceEv byte, traceskip int) {
if reason != waitReasonSleep {
checkTimeouts() // timeouts may expire while two goroutines keep the scheduler busy
@@ -501,7 +533,7 @@ func cpuinit() {
var env string
switch GOOS {
- case "aix", "darwin", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
+ case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
cpu.DebugOptions = true
// Similar to goenv_unix but extracts the environment value for
@@ -559,12 +591,19 @@ func schedinit() {
lockInit(&trace.lock, lockRankTrace)
lockInit(&cpuprof.lock, lockRankCpuprof)
lockInit(&trace.stackTab.lock, lockRankTraceStackTab)
+ // Enforce that this lock is always a leaf lock.
+ // All of this lock's critical sections should be
+ // extremely short.
+ lockInit(&memstats.heapStats.noPLock, lockRankLeafRank)
_g_ := getg()
sched.maxmcount = 10000
usestackmaps = probestackmaps()
+ // The world starts stopped.
+ worldStopped()
+
mallocinit()
fastrandinit() // must run before mcommoninit
mcommoninit(_g_.m, -1)
@@ -579,6 +618,7 @@ func schedinit() {
parsedebugvars()
gcinit()
+ lock(&sched.lock)
sched.lastpoll = uint64(nanotime())
procs := ncpu
@@ -595,6 +635,10 @@ func schedinit() {
if procresize(procs) != nil {
throw("unknown runnable goroutine during bootstrap")
}
+ unlock(&sched.lock)
+
+ // World is effectively started now, as P's can run.
+ worldStarted()
// For cgocheck > 1, we turn on the write barrier at all times
// and check all pointer writes. We can't do this until after
@@ -625,8 +669,10 @@ func dumpgstatus(gp *g) {
print("runtime: g: g=", _g_, ", goid=", _g_.goid, ", g->atomicstatus=", readgstatus(_g_), "\n")
}
+// sched.lock must be held.
func checkmcount() {
- // sched lock is held
+ assertLockHeld(&sched.lock)
+
if mcount() > sched.maxmcount {
print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n")
throw("thread exhaustion")
@@ -638,6 +684,8 @@ func checkmcount() {
//
// sched.lock must be held.
func mReserveID() int64 {
+ assertLockHeld(&sched.lock)
+
if sched.mnext+1 < sched.mnext {
throw("runtime: thread ID overflow")
}
@@ -904,10 +952,26 @@ func stopTheWorld(reason string) {
// startTheWorld undoes the effects of stopTheWorld.
func startTheWorld() {
systemstack(func() { startTheWorldWithSema(false) })
+
// worldsema must be held over startTheWorldWithSema to ensure
// gomaxprocs cannot change while worldsema is held.
- semrelease(&worldsema)
- getg().m.preemptoff = ""
+ //
+ // Release worldsema with direct handoff to the next waiter, but
+ // acquirem so that semrelease1 doesn't try to yield our time.
+ //
+ // Otherwise if e.g. ReadMemStats is being called in a loop,
+ // it might stomp on other attempts to stop the world, such as
+ // for starting or ending GC. The operation this blocks is
+ // so heavy-weight that we should just try to be as fair as
+ // possible here.
+ //
+ // We don't want to just allow us to get preempted between now
+ // and releasing the semaphore because then we keep everyone
+ // (including, for example, GCs) waiting longer.
+ mp := acquirem()
+ mp.preemptoff = ""
+ semrelease1(&worldsema, true, 0)
+ releasem(mp)
}
// stopTheWorldGC has the same effect as stopTheWorld, but blocks
@@ -1031,9 +1095,13 @@ func stopTheWorldWithSema() {
if bad != "" {
throw(bad)
}
+
+ worldStopped()
}
func startTheWorldWithSema(emitTraceEvent bool) int64 {
+ assertWorldStopped()
+
mp := acquirem() // disable preemption because it can be holding p in a local var
if netpollinited() {
list := netpoll(0) // non-blocking
@@ -1054,6 +1122,8 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 {
}
unlock(&sched.lock)
+ worldStarted()
+
for p1 != nil {
p := p1
p1 = p1.link.ptr()
@@ -1173,6 +1243,21 @@ func mstartm0() {
initsig(false)
}
+// mPark causes a thread to park itself - temporarily waking for
+// fixups but otherwise waiting to be fully woken. This is the
+// only way that m's should park themselves.
+//go:nosplit
+func mPark() {
+ g := getg()
+ for {
+ notesleep(&g.m.park)
+ noteclear(&g.m.park)
+ if !mDoFixup() {
+ return
+ }
+ }
+}
+
// mexit tears down and exits the current thread.
//
// Don't call this directly to exit the thread, since it must run at
@@ -1204,7 +1289,7 @@ func mexit(osStack bool) {
sched.nmfreed++
checkdead()
unlock(&sched.lock)
- notesleep(&m.park)
+ mPark()
throw("locked m0 woke up")
}
@@ -1258,7 +1343,7 @@ found:
checkdead()
unlock(&sched.lock)
- if GOOS == "darwin" {
+ if GOOS == "darwin" || GOOS == "ios" {
// Make sure pendingPreemptSignals is correct when an M exits.
// For #41702.
if atomic.Load(&m.signalPending) != 0 {
@@ -1428,7 +1513,12 @@ func allocm(_p_ *p, fn func(), id int64, allocatestack bool) (mp *m, g0Stack uns
freem = next
continue
}
- stackfree(freem.g0)
+ // stackfree must be on the system stack, but allocm is
+ // reachable off the system stack transitively from
+ // startm.
+ systemstack(func() {
+ stackfree(freem.g0)
+ })
freem = freem.freelink
}
sched.freem = newList
@@ -1484,7 +1574,7 @@ func allocm(_p_ *p, fn func(), id int64, allocatestack bool) (mp *m, g0Stack uns
// When the callback is done with the m, it calls dropm to
// put the m back on the list.
//go:nosplit
-func needm(x byte) {
+func needm() {
if (iscgo || GOOS == "windows") && !cgoHasExtraM {
// Can happen if C/C++ code calls Go from a global ctor.
// Can also happen on Windows if a global ctor uses a
@@ -1750,6 +1840,7 @@ var newmHandoff struct {
//go:nowritebarrierrec
func newm(fn func(), _p_ *p, id int64) {
mp, _, _ := allocm(_p_, fn, id, false)
+ mp.doesPark = (_p_ != nil)
mp.nextp.set(_p_)
mp.sigmask = initSigmask
if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" {
@@ -1806,6 +1897,44 @@ func startTemplateThread() {
releasem(mp)
}
+// mFixupRace is used to temporarily borrow the race context from the
+// coordinating m during a syscall_runtime_doAllThreadsSyscall and
+// loan it out to each of the m's of the runtime so they can execute a
+// mFixup.fn in that context.
+var mFixupRace struct {
+ lock mutex
+ ctx uintptr
+}
+
+// mDoFixup runs any outstanding fixup function for the running m.
+// Returns true if a fixup was outstanding and actually executed.
+//
+//go:nosplit
+func mDoFixup() bool {
+ _g_ := getg()
+ lock(&_g_.m.mFixup.lock)
+ fn := _g_.m.mFixup.fn
+ if fn != nil {
+ if gcphase != _GCoff {
+ // We can't have a write barrier in this
+ // context since we may not have a P, but we
+ // clear fn to signal that we've executed the
+ // fixup. As long as fn is kept alive
+ // elsewhere, technically we should have no
+ // issues with the GC, but fn is likely
+ // generated in a different package altogether
+ // that may change independently. Just assert
+ // the GC is off so this lack of write barrier
+ // is more obviously safe.
+ throw("GC must be disabled to protect validity of fn value")
+ }
+ *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
+ fn(false)
+ }
+ unlock(&_g_.m.mFixup.lock)
+ return fn != nil
+}
+
// templateThread is a thread in a known-good state that exists solely
// to start new threads in known-good states when the calling thread
// may not be in a good state.
@@ -1842,6 +1971,7 @@ func templateThread() {
noteclear(&newmHandoff.wake)
unlock(&newmHandoff.lock)
notesleep(&newmHandoff.wake)
+ mDoFixup()
}
}
@@ -1863,8 +1993,7 @@ func stopm() {
lock(&sched.lock)
mput(_g_.m)
unlock(&sched.lock)
- notesleep(&_g_.m.park)
- noteclear(&_g_.m.park)
+ mPark()
acquirep(_g_.m.nextp.ptr())
_g_.m.nextp = 0
}
@@ -1879,8 +2008,30 @@ func mspinning() {
// May run with m.p==nil, so write barriers are not allowed.
// If spinning is set, the caller has incremented nmspinning and startm will
// either decrement nmspinning or set m.spinning in the newly started M.
+//
+// Callers passing a non-nil P must call from a non-preemptible context. See
+// comment on acquirem below.
+//
+// Must not have write barriers because this may be called without a P.
//go:nowritebarrierrec
func startm(_p_ *p, spinning bool) {
+ // Disable preemption.
+ //
+ // Every owned P must have an owner that will eventually stop it in the
+ // event of a GC stop request. startm takes transient ownership of a P
+ // (either from argument or pidleget below) and transfers ownership to
+ // a started M, which will be responsible for performing the stop.
+ //
+ // Preemption must be disabled during this transient ownership,
+ // otherwise the P this is running on may enter GC stop while still
+ // holding the transient P, leaving that P in limbo and deadlocking the
+ // STW.
+ //
+ // Callers passing a non-nil P must already be in non-preemptible
+ // context, otherwise such preemption could occur on function entry to
+ // startm. Callers passing a nil P may be preemptible, so we must
+ // disable preemption before acquiring a P from pidleget below.
+ mp := acquirem()
lock(&sched.lock)
if _p_ == nil {
_p_ = pidleget()
@@ -1893,11 +2044,12 @@ func startm(_p_ *p, spinning bool) {
throw("startm: negative nmspinning")
}
}
+ releasem(mp)
return
}
}
- mp := mget()
- if mp == nil {
+ nmp := mget()
+ if nmp == nil {
// No M is available, we must drop sched.lock and call newm.
// However, we already own a P to assign to the M.
//
@@ -1919,22 +2071,28 @@ func startm(_p_ *p, spinning bool) {
fn = mspinning
}
newm(fn, _p_, id)
+ // Ownership transfer of _p_ committed by start in newm.
+ // Preemption is now safe.
+ releasem(mp)
return
}
unlock(&sched.lock)
- if mp.spinning {
+ if nmp.spinning {
throw("startm: m is spinning")
}
- if mp.nextp != 0 {
+ if nmp.nextp != 0 {
throw("startm: m has p")
}
if spinning && !runqempty(_p_) {
throw("startm: p has runnable gs")
}
// The caller incremented nmspinning, so set m.spinning in the new M.
- mp.spinning = spinning
- mp.nextp.set(_p_)
- notewakeup(&mp.park)
+ nmp.spinning = spinning
+ nmp.nextp.set(_p_)
+ notewakeup(&nmp.park)
+ // Ownership transfer of _p_ committed by wakeup. Preemption is now
+ // safe.
+ releasem(mp)
}
// Hands off P from syscall or locked M.
@@ -1989,11 +2147,16 @@ func handoffp(_p_ *p) {
startm(_p_, false)
return
}
- if when := nobarrierWakeTime(_p_); when != 0 {
- wakeNetPoller(when)
- }
+
+ // The scheduler lock cannot be held when calling wakeNetPoller below
+ // because wakeNetPoller may call wakep which may call startm.
+ when := nobarrierWakeTime(_p_)
pidleput(_p_)
unlock(&sched.lock)
+
+ if when != 0 {
+ wakeNetPoller(when)
+ }
}
// Tries to add one more P to execute G's.
@@ -2024,12 +2187,11 @@ func stoplockedm() {
}
incidlelocked(1)
// Wait until another thread schedules lockedg again.
- notesleep(&_g_.m.park)
- noteclear(&_g_.m.park)
+ mPark()
status := readgstatus(_g_.m.lockedg.ptr())
if status&^_Gscan != _Grunnable {
- print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n")
- dumpgstatus(_g_)
+ print("runtime:stoplockedm: lockedg (atomicstatus=", status, ") is not Grunnable or Gscanrunnable\n")
+ dumpgstatus(_g_.m.lockedg.ptr())
throw("stoplockedm: not runnable")
}
acquirep(_g_.m.nextp.ptr())
@@ -2202,31 +2364,33 @@ top:
_g_.m.spinning = true
atomic.Xadd(&sched.nmspinning, 1)
}
- for i := 0; i < 4; i++ {
+ const stealTries = 4
+ for i := 0; i < stealTries; i++ {
+ stealTimersOrRunNextG := i == stealTries-1
+
for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
if sched.gcwaiting != 0 {
goto top
}
- stealRunNextG := i > 2 // first look for ready queues with more than 1 g
p2 := allp[enum.position()]
if _p_ == p2 {
continue
}
- if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil {
- return gp, false
- }
- // Consider stealing timers from p2.
- // This call to checkTimers is the only place where
- // we hold a lock on a different P's timers.
- // Lock contention can be a problem here, so
- // initially avoid grabbing the lock if p2 is running
- // and is not marked for preemption. If p2 is running
- // and not being preempted we assume it will handle its
- // own timers.
- // If we're still looking for work after checking all
- // the P's, then go ahead and steal from an active P.
- if i > 2 || (i > 1 && shouldStealTimers(p2)) {
+ // Steal timers from p2. This call to checkTimers is the only place
+ // where we might hold a lock on a different P's timers. We do this
+ // once on the last pass before checking runnext because stealing
+ // from the other P's runnext should be the last resort, so if there
+ // are timers to steal do that first.
+ //
+ // We only check timers on one of the stealing iterations because
+ // the time stored in now doesn't change in this loop and checking
+ // the timers for each P more than once with the same value of now
+ // is probably a waste of time.
+ //
+ // timerpMask tells us whether the P may have timers at all. If it
+ // can't, no need to check at all.
+ if stealTimersOrRunNextG && timerpMask.read(enum.position()) {
tnow, w, ran := checkTimers(p2, now)
now = tnow
if w != 0 && (pollUntil == 0 || w < pollUntil) {
@@ -2247,6 +2411,13 @@ top:
ranTimer = true
}
}
+
+ // Don't bother to attempt to steal if p2 is idle.
+ if !idlepMask.read(enum.position()) {
+ if gp := runqsteal(_p_, p2, stealTimersOrRunNextG); gp != nil {
+ return gp, false
+ }
+ }
}
}
if ranTimer {
@@ -2259,14 +2430,17 @@ stop:
// We have nothing to do. If we're in the GC mark phase, can
// safely scan and blacken objects, and have work to do, run
// idle-time marking rather than give up the P.
- if gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != 0 && gcMarkWorkAvailable(_p_) {
- _p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
- gp := _p_.gcBgMarkWorker.ptr()
- casgstatus(gp, _Gwaiting, _Grunnable)
- if trace.enabled {
- traceGoUnpark(gp, 0)
+ if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) {
+ node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
+ if node != nil {
+ _p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
+ gp := node.gp.ptr()
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ if trace.enabled {
+ traceGoUnpark(gp, 0)
+ }
+ return gp, false
}
- return gp, false
}
delta := int64(-1)
@@ -2296,6 +2470,10 @@ stop:
// safe-points. We don't need to snapshot the contents because
// everything up to cap(allp) is immutable.
allpSnapshot := allp
+ // Also snapshot masks. Value changes are OK, but we can't allow
+ // len to change out from under us.
+ idlepMaskSnapshot := idlepMask
+ timerpMaskSnapshot := timerpMask
// return P and block
lock(&sched.lock)
@@ -2319,7 +2497,7 @@ stop:
// drop nmspinning first and then check all per-P queues again (with
// #StoreLoad memory barrier in between). If we do it the other way around,
// another thread can submit a goroutine after we've checked all run queues
- // but before we drop nmspinning; as the result nobody will unpark a thread
+ // but before we drop nmspinning; as a result nobody will unpark a thread
// to run the goroutine.
// If we discover new work below, we need to restore m.spinning as a signal
// for resetspinning to unpark a new worker thread (because there can be more
@@ -2336,8 +2514,8 @@ stop:
}
// check all runqueues once again
- for _, _p_ := range allpSnapshot {
- if !runqempty(_p_) {
+ for id, _p_ := range allpSnapshot {
+ if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(_p_) {
lock(&sched.lock)
_p_ = pidleget()
unlock(&sched.lock)
@@ -2353,13 +2531,56 @@ stop:
}
}
+ // Similar to above, check for timer creation or expiry concurrently with
+ // transitioning from spinning to non-spinning. Note that we cannot use
+ // checkTimers here because it calls adjusttimers which may need to allocate
+ // memory, and that isn't allowed when we don't have an active P.
+ for id, _p_ := range allpSnapshot {
+ if timerpMaskSnapshot.read(uint32(id)) {
+ w := nobarrierWakeTime(_p_)
+ if w != 0 && (pollUntil == 0 || w < pollUntil) {
+ pollUntil = w
+ }
+ }
+ }
+ if pollUntil != 0 {
+ if now == 0 {
+ now = nanotime()
+ }
+ delta = pollUntil - now
+ if delta < 0 {
+ delta = 0
+ }
+ }
+
// Check for idle-priority GC work again.
- if gcBlackenEnabled != 0 && gcMarkWorkAvailable(nil) {
+ //
+ // N.B. Since we have no P, gcBlackenEnabled may change at any time; we
+ // must check again after acquiring a P.
+ if atomic.Load(&gcBlackenEnabled) != 0 && gcMarkWorkAvailable(nil) {
+ // Work is available; we can start an idle GC worker only if
+ // there is an available P and available worker G.
+ //
+ // We can attempt to acquire these in either order. Workers are
+ // almost always available (see comment in findRunnableGCWorker
+ // for the one case there may be none). Since we're slightly
+ // less likely to find a P, check for that first.
lock(&sched.lock)
+ var node *gcBgMarkWorkerNode
_p_ = pidleget()
- if _p_ != nil && _p_.gcBgMarkWorker == 0 {
- pidleput(_p_)
- _p_ = nil
+ if _p_ != nil {
+ // Now that we own a P, gcBlackenEnabled can't change
+ // (as it requires STW).
+ if gcBlackenEnabled != 0 {
+ node = (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
+ if node == nil {
+ pidleput(_p_)
+ _p_ = nil
+ }
+ } else {
+ pidleput(_p_)
+ _p_ = nil
+ }
}
unlock(&sched.lock)
if _p_ != nil {
@@ -2368,8 +2589,15 @@ stop:
_g_.m.spinning = true
atomic.Xadd(&sched.nmspinning, 1)
}
- // Go back to idle GC check.
- goto stop
+
+ // Run the idle worker.
+ _p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
+ gp := node.gp.ptr()
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ if trace.enabled {
+ traceGoUnpark(gp, 0)
+ }
+ return gp, false
}
}
@@ -2448,9 +2676,9 @@ func pollWork() bool {
return false
}
-// wakeNetPoller wakes up the thread sleeping in the network poller,
-// if there is one, and if it isn't going to wake up anyhow before
-// the when argument.
+// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't
+// going to wake up before the when argument; or it wakes an idle P to service
+// timers and the network poller if there isn't one already.
func wakeNetPoller(when int64) {
if atomic.Load64(&sched.lastpoll) == 0 {
// In findrunnable we ensure that when polling the pollUntil
@@ -2461,6 +2689,10 @@ func wakeNetPoller(when int64) {
if pollerPollUntil == 0 || pollerPollUntil > when {
netpollBreak()
}
+ } else {
+ // There are no threads in the network poller, try to get
+ // one there so it can handle new timers.
+ wakep()
}
}
@@ -2486,7 +2718,7 @@ func resetspinning() {
// Otherwise, for each idle P, this adds a G to the global queue
// and starts an M. Any remaining G's are added to the current P's
// local run queue.
-// This may temporarily acquire the scheduler lock.
+// This may temporarily acquire sched.lock.
// Can run concurrently with GC.
func injectglist(glist *gList) {
if glist.empty() {
@@ -2530,15 +2762,20 @@ func injectglist(glist *gList) {
return
}
- lock(&sched.lock)
- npidle := int(sched.npidle)
+ npidle := int(atomic.Load(&sched.npidle))
+ var globq gQueue
var n int
for n = 0; n < npidle && !q.empty(); n++ {
- globrunqput(q.pop())
+ g := q.pop()
+ globq.pushBack(g)
+ }
+ if n > 0 {
+ lock(&sched.lock)
+ globrunqputbatch(&globq, int32(n))
+ unlock(&sched.lock)
+ startIdle(n)
+ qsize -= n
}
- unlock(&sched.lock)
- startIdle(n)
- qsize -= n
if !q.empty() {
runqputbatch(pp, &q, qsize)
@@ -2702,40 +2939,40 @@ func dropg() {
// We pass now in and out to avoid extra calls of nanotime.
//go:yeswritebarrierrec
func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
- // If there are no timers to adjust, and the first timer on
- // the heap is not yet ready to run, then there is nothing to do.
- if atomic.Load(&pp.adjustTimers) == 0 {
- next := int64(atomic.Load64(&pp.timer0When))
- if next == 0 {
- return now, 0, false
- }
- if now == 0 {
- now = nanotime()
- }
- if now < next {
- // Next timer is not ready to run.
- // But keep going if we would clear deleted timers.
- // This corresponds to the condition below where
- // we decide whether to call clearDeletedTimers.
- if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) {
- return now, next, false
- }
+ // If it's not yet time for the first timer, or the first adjusted
+ // timer, then there is nothing to do.
+ next := int64(atomic.Load64(&pp.timer0When))
+ nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest))
+ if next == 0 || (nextAdj != 0 && nextAdj < next) {
+ next = nextAdj
+ }
+
+ if next == 0 {
+ // No timers to run or adjust.
+ return now, 0, false
+ }
+
+ if now == 0 {
+ now = nanotime()
+ }
+ if now < next {
+ // Next timer is not ready to run, but keep going
+ // if we would clear deleted timers.
+ // This corresponds to the condition below where
+ // we decide whether to call clearDeletedTimers.
+ if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) {
+ return now, next, false
}
}
lock(&pp.timersLock)
- adjusttimers(pp)
-
- rnow = now
if len(pp.timers) > 0 {
- if rnow == 0 {
- rnow = nanotime()
- }
+ adjusttimers(pp, now)
for len(pp.timers) > 0 {
// Note that runtimer may temporarily unlock
// pp.timersLock.
- if tw := runtimer(pp, rnow); tw != 0 {
+ if tw := runtimer(pp, now); tw != 0 {
if tw > 0 {
pollUntil = tw
}
@@ -2754,26 +2991,7 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
unlock(&pp.timersLock)
- return rnow, pollUntil, ran
-}
-
-// shouldStealTimers reports whether we should try stealing the timers from p2.
-// We don't steal timers from a running P that is not marked for preemption,
-// on the assumption that it will run its own timers. This reduces
-// contention on the timers lock.
-func shouldStealTimers(p2 *p) bool {
- if p2.status != _Prunning {
- return true
- }
- mp := p2.m.ptr()
- if mp == nil || mp.locks > 0 {
- return false
- }
- gp := mp.curg
- if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt {
- return false
- }
- return true
+ return now, pollUntil, ran
}
func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
@@ -2930,7 +3148,8 @@ func goexit0(gp *g) {
// Flush assist credit to the global pool. This gives
// better information to pacing if the application is
// rapidly creating and exiting goroutines.
- scanCredit := int64(gcController.assistWorkPerByte * float64(gp.gcAssistBytes))
+ assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte))
+ scanCredit := int64(assistWorkPerByte * float64(gp.gcAssistBytes))
atomic.Xaddint64(&gcController.bgScanCredit, scanCredit)
gp.gcAssistBytes = 0
}
@@ -3376,7 +3595,7 @@ func syscall_runtime_BeforeExec() {
// On Darwin, wait for all pending preemption signals to
// be received. See issue #41702.
- if GOOS == "darwin" {
+ if GOOS == "darwin" || GOOS == "ios" {
for int32(atomic.Load(&pendingPreemptSignals)) > 0 {
osyield()
}
@@ -3760,6 +3979,12 @@ func sigprof(pc uintptr, gp *g, mp *m) {
return
}
+ // If mp.profilehz is 0, then profiling is not enabled for this thread.
+ // We must check this to avoid a deadlock between setcpuprofilerate
+ // and the call to cpuprof.add, below.
+ if mp != nil && mp.profilehz == 0 {
+ return
+ }
// Profiling runs concurrently with GC, so it must not allocate.
// Set a trap in case the code does allocate.
// Note that on windows, one thread takes profiles of all the
@@ -3768,6 +3993,71 @@ func sigprof(pc uintptr, gp *g, mp *m) {
// See golang.org/issue/17165.
getg().m.mallocing++
+ // Define that a "user g" is a user-created goroutine, and a "system g"
+ // is one that is m->g0 or m->gsignal.
+ //
+ // We might be interrupted for profiling halfway through a
+ // goroutine switch. The switch involves updating three (or four) values:
+ // g, PC, SP, and (on arm) LR. The PC must be the last to be updated,
+ // because once it gets updated the new g is running.
+ //
+ // When switching from a user g to a system g, LR is not considered live,
+ // so the update only affects g, SP, and PC. Since PC must be last, there
+ // the possible partial transitions in ordinary execution are (1) g alone is updated,
+ // (2) both g and SP are updated, and (3) SP alone is updated.
+ // If SP or g alone is updated, we can detect the partial transition by checking
+ // whether the SP is within g's stack bounds. (We could also require that SP
+ // be changed only after g, but the stack bounds check is needed by other
+ // cases, so there is no need to impose an additional requirement.)
+ //
+ // There is one exceptional transition to a system g, not in ordinary execution.
+ // When a signal arrives, the operating system starts the signal handler running
+ // with an updated PC and SP. The g is updated last, at the beginning of the
+ // handler. There are two reasons this is okay. First, until g is updated the
+ // g and SP do not match, so the stack bounds check detects the partial transition.
+ // Second, signal handlers currently run with signals disabled, so a profiling
+ // signal cannot arrive during the handler.
+ //
+ // When switching from a system g to a user g, there are three possibilities.
+ //
+ // First, it may be that the g switch has no PC update, because the SP
+ // either corresponds to a user g throughout (as in asmcgocall)
+ // or because it has been arranged to look like a user g frame
+ // (as in cgocallback). In this case, since the entire
+ // transition is a g+SP update, a partial transition updating just one of
+ // those will be detected by the stack bounds check.
+ //
+ // Second, when returning from a signal handler, the PC and SP updates
+ // are performed by the operating system in an atomic update, so the g
+ // update must be done before them. The stack bounds check detects
+ // the partial transition here, and (again) signal handlers run with signals
+ // disabled, so a profiling signal cannot arrive then anyway.
+ //
+ // Third, the common case: it may be that the switch updates g, SP, and PC
+ // separately. If the PC is within any of the functions that do this,
+ // we don't ask for a traceback; cf. the function setsSP for more about this.
+ //
+ // There is another apparently viable approach, recorded here in case
+ // the "PC within setsSP function" check turns out not to be usable.
+ // It would be possible to delay the update of either g or SP until immediately
+ // before the PC update instruction. Then, because of the stack bounds check,
+ // the only problematic interrupt point is just before that PC update instruction,
+ // and the sigprof handler can detect that instruction and simulate stepping past
+ // it in order to reach a consistent state. On ARM, the update of g must be made
+ // in two places (in R10 and also in a TLS slot), so the delayed update would
+ // need to be the SP update. The sigprof handler must read the instruction at
+ // the current PC and if it was the known instruction (for example, JMP BX or
+ // MOV R2, PC), use that other register in place of the PC value.
+ // The biggest drawback to this solution is that it requires that we can tell
+ // whether it's safe to read from the memory pointed at by PC.
+ // In a correct program, we can test PC == nil and otherwise read,
+ // but if a profiling signal happens at the instant that a program executes
+ // a bad jump (before the program manages to handle the resulting fault)
+ // the profiling handler could fault trying to read nonexistent memory.
+ //
+ // To recap, there are no constraints on the assembly being used for the
+ // transition. We simply require that g and SP match and that the PC is not
+ // in gogo.
traceback := true
// If SIGPROF arrived while already fetching runtime callers
@@ -3953,6 +4243,13 @@ func (pp *p) init(id int32) {
}
}
lockInit(&pp.timersLock, lockRankTimers)
+
+ // This P may get timers when it starts running. Set the mask here
+ // since the P may not go through pidleget (notably P 0 on startup).
+ timerpMask.set(id)
+ // Similarly, we may not go through pidleget before this P starts
+ // running if it is P 0 on startup.
+ idlepMask.clear(id)
}
// destroy releases all of the resources associated with pp and
@@ -3960,6 +4257,9 @@ func (pp *p) init(id int32) {
//
// sched.lock must be held and the world must be stopped.
func (pp *p) destroy() {
+ assertLockHeld(&sched.lock)
+ assertWorldStopped()
+
// Move all runnable goroutines to the global queue
for pp.runqhead != pp.runqtail {
// Pop from tail of local queue
@@ -3989,18 +4289,6 @@ func (pp *p) destroy() {
unlock(&pp.timersLock)
unlock(&plocal.timersLock)
}
- // If there's a background worker, make it runnable and put
- // it on the global queue so it can clean itself up.
- if gp := pp.gcBgMarkWorker.ptr(); gp != nil {
- casgstatus(gp, _Gwaiting, _Grunnable)
- if trace.enabled {
- traceGoUnpark(gp, 0)
- }
- globrunqput(gp)
- // This assignment doesn't race because the
- // world is stopped.
- pp.gcBgMarkWorker.set(nil)
- }
// Flush p's write barrier buffer.
if gcphase != _GCoff {
wbBufFlush1(pp)
@@ -4020,7 +4308,9 @@ func (pp *p) destroy() {
mheap_.spanalloc.free(unsafe.Pointer(pp.mspancache.buf[i]))
}
pp.mspancache.len = 0
+ lock(&mheap_.lock)
pp.pcache.flush(&mheap_.pages)
+ unlock(&mheap_.lock)
})
freemcache(pp.mcache)
pp.mcache = nil
@@ -4030,11 +4320,18 @@ func (pp *p) destroy() {
pp.status = _Pdead
}
-// Change number of processors. The world is stopped, sched is locked.
-// gcworkbufs are not being modified by either the GC or
-// the write barrier code.
+// Change number of processors.
+//
+// sched.lock must be held, and the world must be stopped.
+//
+// gcworkbufs must not be being modified by either the GC or the write barrier
+// code, so the GC must not be running if the number of Ps actually changes.
+//
// Returns list of Ps with local work, they need to be scheduled by the caller.
func procresize(nprocs int32) *p {
+ assertLockHeld(&sched.lock)
+ assertWorldStopped()
+
old := gomaxprocs
if old < 0 || nprocs <= 0 {
throw("procresize: invalid arg")
@@ -4050,6 +4347,8 @@ func procresize(nprocs int32) *p {
}
sched.procresizetime = now
+ maskWords := (nprocs + 31) / 32
+
// Grow allp if necessary.
if nprocs > int32(len(allp)) {
// Synchronize with retake, which could be running
@@ -4064,6 +4363,20 @@ func procresize(nprocs int32) *p {
copy(nallp, allp[:cap(allp)])
allp = nallp
}
+
+ if maskWords <= int32(cap(idlepMask)) {
+ idlepMask = idlepMask[:maskWords]
+ timerpMask = timerpMask[:maskWords]
+ } else {
+ nidlepMask := make([]uint32, maskWords)
+ // No need to copy beyond len, old Ps are irrelevant.
+ copy(nidlepMask, idlepMask)
+ idlepMask = nidlepMask
+
+ ntimerpMask := make([]uint32, maskWords)
+ copy(ntimerpMask, timerpMask)
+ timerpMask = ntimerpMask
+ }
unlock(&allpLock)
}
@@ -4122,6 +4435,8 @@ func procresize(nprocs int32) *p {
if int32(len(allp)) != nprocs {
lock(&allpLock)
allp = allp[:nprocs]
+ idlepMask = idlepMask[:maskWords]
+ timerpMask = timerpMask[:maskWords]
unlock(&allpLock)
}
@@ -4226,6 +4541,8 @@ func incidlelocked(v int32) {
// The check is based on the number of running M's; if 0 -> deadlock.
// sched.lock must be held.
func checkdead() {
+ assertLockHeld(&sched.lock)
+
// For -buildmode=c-shared or -buildmode=c-archive it's OK if
// there are no running goroutines. The calling program is
// assumed to be running.
@@ -4341,9 +4658,14 @@ func sysmon() {
checkdead()
unlock(&sched.lock)
+ // For syscall_runtime_doAllThreadsSyscall, sysmon is
+ // sufficiently up to participate in fixups.
+ atomic.Store(&sched.sysmonStarting, 0)
+
lasttrace := int64(0)
idle := 0 // how many cycles in succession we had not woken up somebody
delay := uint32(0)
+
for {
if idle == 0 { // start with 20us sleep...
delay = 20
@@ -4354,11 +4676,29 @@ func sysmon() {
delay = 10 * 1000
}
usleep(delay)
+ mDoFixup()
+
+ // sysmon should not enter deep sleep if schedtrace is enabled so that
+ // it can print that information at the right time.
+ //
+ // It should also not enter deep sleep if there are any active P's so
+ // that it can retake P's from syscalls, preempt long running G's, and
+ // poll the network if all P's are busy for long stretches.
+ //
+ // It should wake up from deep sleep if any P's become active either due
+ // to exiting a syscall or waking up due to a timer expiring so that it
+ // can resume performing those duties. If it wakes from a syscall it
+ // resets idle and delay as a bet that since it had retaken a P from a
+ // syscall before, it may need to do it again shortly after the
+ // application starts work again. It does not reset idle when waking
+ // from a timer to avoid adding system load to applications that spend
+ // most of their time sleeping.
now := nanotime()
- next, _ := timeSleepUntil()
if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
lock(&sched.lock)
if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
+ syscallWake := false
+ next, _ := timeSleepUntil()
if next > now {
atomic.Store(&sched.sysmonwait, 1)
unlock(&sched.lock)
@@ -4372,32 +4712,27 @@ func sysmon() {
if shouldRelax {
osRelax(true)
}
- notetsleep(&sched.sysmonnote, sleep)
+ syscallWake = notetsleep(&sched.sysmonnote, sleep)
+ mDoFixup()
if shouldRelax {
osRelax(false)
}
- now = nanotime()
- next, _ = timeSleepUntil()
lock(&sched.lock)
atomic.Store(&sched.sysmonwait, 0)
noteclear(&sched.sysmonnote)
}
- idle = 0
- delay = 20
+ if syscallWake {
+ idle = 0
+ delay = 20
+ }
}
unlock(&sched.lock)
}
+
lock(&sched.sysmonlock)
- {
- // If we spent a long time blocked on sysmonlock
- // then we want to update now and next since it's
- // likely stale.
- now1 := nanotime()
- if now1-now > 50*1000 /* 50µs */ {
- next, _ = timeSleepUntil()
- }
- now = now1
- }
+ // Update now in case we blocked on sysmonnote or spent a long time
+ // blocked on schedlock or sysmonlock above.
+ now = nanotime()
// trigger libc interceptors if needed
if *cgo_yield != nil {
@@ -4421,12 +4756,7 @@ func sysmon() {
incidlelocked(1)
}
}
- if next < now {
- // There are timers that should have already run,
- // perhaps because there is an unpreemptible P.
- // Try to start an M to run them.
- startm(nil, false)
- }
+ mDoFixup()
if atomic.Load(&scavenge.sysmonWake) != 0 {
// Kick the scavenger awake if someone requested it.
wakeScavenger()
@@ -4709,7 +5039,11 @@ func schedEnableUser(enable bool) {
// schedEnabled reports whether gp should be scheduled. It returns
// false if scheduling of gp is disabled.
+//
+// sched.lock must be held.
func schedEnabled(gp *g) bool {
+ assertLockHeld(&sched.lock)
+
if sched.disable.user {
return isSystemGoroutine(gp, true)
}
@@ -4717,10 +5051,12 @@ func schedEnabled(gp *g) bool {
}
// Put mp on midle list.
-// Sched must be locked.
+// sched.lock must be held.
// May run during STW, so write barriers are not allowed.
//go:nowritebarrierrec
func mput(mp *m) {
+ assertLockHeld(&sched.lock)
+
mp.schedlink = sched.midle
sched.midle.set(mp)
sched.nmidle++
@@ -4728,10 +5064,12 @@ func mput(mp *m) {
}
// Try to get an m from midle list.
-// Sched must be locked.
+// sched.lock must be held.
// May run during STW, so write barriers are not allowed.
//go:nowritebarrierrec
func mget() *m {
+ assertLockHeld(&sched.lock)
+
mp := sched.midle.ptr()
if mp != nil {
sched.midle = mp.schedlink
@@ -4741,35 +5079,43 @@ func mget() *m {
}
// Put gp on the global runnable queue.
-// Sched must be locked.
+// sched.lock must be held.
// May run during STW, so write barriers are not allowed.
//go:nowritebarrierrec
func globrunqput(gp *g) {
+ assertLockHeld(&sched.lock)
+
sched.runq.pushBack(gp)
sched.runqsize++
}
// Put gp at the head of the global runnable queue.
-// Sched must be locked.
+// sched.lock must be held.
// May run during STW, so write barriers are not allowed.
//go:nowritebarrierrec
func globrunqputhead(gp *g) {
+ assertLockHeld(&sched.lock)
+
sched.runq.push(gp)
sched.runqsize++
}
// Put a batch of runnable goroutines on the global runnable queue.
// This clears *batch.
-// Sched must be locked.
+// sched.lock must be held.
func globrunqputbatch(batch *gQueue, n int32) {
+ assertLockHeld(&sched.lock)
+
sched.runq.pushBackAll(*batch)
sched.runqsize += n
*batch = gQueue{}
}
// Try to get a batch of G's from the global runnable queue.
-// Sched must be locked.
+// sched.lock must be held.
func globrunqget(_p_ *p, max int32) *g {
+ assertLockHeld(&sched.lock)
+
if sched.runqsize == 0 {
return nil
}
@@ -4796,26 +5142,106 @@ func globrunqget(_p_ *p, max int32) *g {
return gp
}
-// Put p to on _Pidle list.
-// Sched must be locked.
+// pMask is an atomic bitstring with one bit per P.
+type pMask []uint32
+
+// read returns true if P id's bit is set.
+func (p pMask) read(id uint32) bool {
+ word := id / 32
+ mask := uint32(1) << (id % 32)
+ return (atomic.Load(&p[word]) & mask) != 0
+}
+
+// set sets P id's bit.
+func (p pMask) set(id int32) {
+ word := id / 32
+ mask := uint32(1) << (id % 32)
+ atomic.Or(&p[word], mask)
+}
+
+// clear clears P id's bit.
+func (p pMask) clear(id int32) {
+ word := id / 32
+ mask := uint32(1) << (id % 32)
+ atomic.And(&p[word], ^mask)
+}
+
+// updateTimerPMask clears pp's timer mask if it has no timers on its heap.
+//
+// Ideally, the timer mask would be kept immediately consistent on any timer
+// operations. Unfortunately, updating a shared global data structure in the
+// timer hot path adds too much overhead in applications frequently switching
+// between no timers and some timers.
+//
+// As a compromise, the timer mask is updated only on pidleget / pidleput. A
+// running P (returned by pidleget) may add a timer at any time, so its mask
+// must be set. An idle P (passed to pidleput) cannot add new timers while
+// idle, so if it has no timers at that time, its mask may be cleared.
+//
+// Thus, we get the following effects on timer-stealing in findrunnable:
+//
+// * Idle Ps with no timers when they go idle are never checked in findrunnable
+// (for work- or timer-stealing; this is the ideal case).
+// * Running Ps must always be checked.
+// * Idle Ps whose timers are stolen must continue to be checked until they run
+// again, even after timer expiration.
+//
+// When the P starts running again, the mask should be set, as a timer may be
+// added at any time.
+//
+// TODO(prattmic): Additional targeted updates may improve the above cases.
+// e.g., updating the mask when stealing a timer.
+func updateTimerPMask(pp *p) {
+ if atomic.Load(&pp.numTimers) > 0 {
+ return
+ }
+
+ // Looks like there are no timers; however, another P may transiently
+ // decrement numTimers when handling a timerModified timer in
+ // checkTimers. We must take timersLock to serialize with these changes.
+ lock(&pp.timersLock)
+ if atomic.Load(&pp.numTimers) == 0 {
+ timerpMask.clear(pp.id)
+ }
+ unlock(&pp.timersLock)
+}
+
+// pidleput puts p on the _Pidle list.
+//
+// This releases ownership of p. Once sched.lock is released it is no longer
+// safe to use p.
+//
+// sched.lock must be held.
+//
// May run during STW, so write barriers are not allowed.
//go:nowritebarrierrec
func pidleput(_p_ *p) {
+ assertLockHeld(&sched.lock)
+
if !runqempty(_p_) {
throw("pidleput: P has non-empty run queue")
}
+ updateTimerPMask(_p_) // clear if there are no timers.
+ idlepMask.set(_p_.id)
_p_.link = sched.pidle
sched.pidle.set(_p_)
atomic.Xadd(&sched.npidle, 1) // TODO: fast atomic
}
-// Try get a p from _Pidle list.
-// Sched must be locked.
+// pidleget tries to get a p from the _Pidle list, acquiring ownership.
+//
+// sched.lock must be held.
+//
// May run during STW, so write barriers are not allowed.
//go:nowritebarrierrec
func pidleget() *p {
+ assertLockHeld(&sched.lock)
+
_p_ := sched.pidle.ptr()
if _p_ != nil {
+ // Timer may get added at any time now.
+ timerpMask.set(_p_.id)
+ idlepMask.clear(_p_.id)
sched.pidle = _p_.link
atomic.Xadd(&sched.npidle, -1) // TODO: fast atomic
}
@@ -5308,3 +5734,14 @@ func gcd(a, b uint32) uint32 {
}
return a
}
+
+// inittrace stores statistics for init functions which are
+// updated by malloc and newproc when active is true.
+var inittrace tracestat
+
+type tracestat struct {
+ active bool // init tracing activation status
+ id int64 // init goroutine id
+ allocs uint64 // heap allocations
+ bytes uint64 // heap allocated bytes
+}
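
The pMask helpers added above pack one bit per P into 32-bit words; procresize sizes the slice as (nprocs+31)/32 words. A standalone sketch of the same indexing arithmetic, using sync/atomic in place of the runtime-internal atomics:

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // pMask mirrors the runtime's one-bit-per-P bitmap (sketch only).
    type pMask []uint32

    func newPMask(nprocs int32) pMask {
        return make(pMask, (nprocs+31)/32) // same word count as procresize
    }

    func (p pMask) read(id uint32) bool {
        word, mask := id/32, uint32(1)<<(id%32)
        return atomic.LoadUint32(&p[word])&mask != 0
    }

    // set uses a CAS loop; the runtime uses its internal atomic.Or instead.
    func (p pMask) set(id uint32) {
        word, mask := id/32, uint32(1)<<(id%32)
        for {
            old := atomic.LoadUint32(&p[word])
            if atomic.CompareAndSwapUint32(&p[word], old, old|mask) {
                return
            }
        }
    }

    func main() {
        m := newPMask(70) // 70 Ps -> 3 words
        m.set(69)         // word 2, bit 5
        fmt.Println(len(m), m.read(69), m.read(68)) // 3 true false
    }
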
diff --git a/libgo/go/runtime/race0.go b/libgo/go/runtime/race0.go
index 6f26afa..180f707 100644
--- a/libgo/go/runtime/race0.go
+++ b/libgo/go/runtime/race0.go
@@ -32,6 +32,8 @@ func raceacquireg(gp *g, addr unsafe.Pointer) { th
func raceacquirectx(racectx uintptr, addr unsafe.Pointer) { throw("race") }
func racerelease(addr unsafe.Pointer) { throw("race") }
func racereleaseg(gp *g, addr unsafe.Pointer) { throw("race") }
+func racereleaseacquire(addr unsafe.Pointer) { throw("race") }
+func racereleaseacquireg(gp *g, addr unsafe.Pointer) { throw("race") }
func racereleasemerge(addr unsafe.Pointer) { throw("race") }
func racereleasemergeg(gp *g, addr unsafe.Pointer) { throw("race") }
func racefingo() { throw("race") }
diff --git a/libgo/go/runtime/runtime-lldb_test.go b/libgo/go/runtime/runtime-lldb_test.go
index 1e2e5d5..c923b87 100644
--- a/libgo/go/runtime/runtime-lldb_test.go
+++ b/libgo/go/runtime/runtime-lldb_test.go
@@ -6,7 +6,6 @@ package runtime_test
import (
"internal/testenv"
- "io/ioutil"
"os"
"os/exec"
"path/filepath"
@@ -143,20 +142,20 @@ func TestLldbPython(t *testing.T) {
checkLldbPython(t)
- dir, err := ioutil.TempDir("", "go-build")
+ dir, err := os.MkdirTemp("", "go-build")
if err != nil {
t.Fatalf("failed to create temp directory: %v", err)
}
defer os.RemoveAll(dir)
src := filepath.Join(dir, "main.go")
- err = ioutil.WriteFile(src, []byte(lldbHelloSource), 0644)
+ err = os.WriteFile(src, []byte(lldbHelloSource), 0644)
if err != nil {
t.Fatalf("failed to create src file: %v", err)
}
mod := filepath.Join(dir, "go.mod")
- err = ioutil.WriteFile(mod, []byte("module lldbtest"), 0644)
+ err = os.WriteFile(mod, []byte("module lldbtest"), 0644)
if err != nil {
t.Fatalf("failed to create mod file: %v", err)
}
@@ -172,7 +171,7 @@ func TestLldbPython(t *testing.T) {
}
src = filepath.Join(dir, "script.py")
- err = ioutil.WriteFile(src, []byte(lldbScriptSource), 0755)
+ err = os.WriteFile(src, []byte(lldbScriptSource), 0755)
if err != nil {
t.Fatalf("failed to create script: %v", err)
}
diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go
index 39969d1..143d19e 100644
--- a/libgo/go/runtime/runtime1.go
+++ b/libgo/go/runtime/runtime1.go
@@ -5,6 +5,7 @@
package runtime
import (
+ "internal/bytealg"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
@@ -310,7 +311,6 @@ type dbgVar struct {
// existing int var for that value, which may
// already have an initial value.
var debug struct {
- allocfreetrace int32
cgocheck int32
clobberfree int32
efence int32
@@ -321,13 +321,20 @@ var debug struct {
gctrace int32
invalidptr int32
madvdontneed int32 // for Linux; issue 28466
- sbrk int32
scavenge int32
scavtrace int32
scheddetail int32
schedtrace int32
tracebackancestors int32
asyncpreemptoff int32
+
+ // debug.malloc is used as a combined debug check
+ // in the malloc function and should be set
+ // if any of the below debug options is != 0.
+ malloc bool
+ allocfreetrace int32
+ inittrace int32
+ sbrk int32
}
var dbgvars = []dbgVar{
@@ -349,6 +356,7 @@ var dbgvars = []dbgVar{
{"schedtrace", &debug.schedtrace},
{"tracebackancestors", &debug.tracebackancestors},
{"asyncpreemptoff", &debug.asyncpreemptoff},
+ {"inittrace", &debug.inittrace},
}
func parsedebugvars() {
@@ -361,16 +369,27 @@ func parsedebugvars() {
// ensure that we only trace allocated heap objects, which should
// not contain invalid pointers.
debug.invalidptr = 1
+ if GOOS == "linux" {
+ // On Linux, MADV_FREE is faster than MADV_DONTNEED,
+ // but doesn't affect many of the statistics that
+ // MADV_DONTNEED does until the memory is actually
+ // reclaimed. This generally leads to poor user
+ // experience, like confusing stats in top and other
+ // monitoring tools; and bad integration with
+ // management systems that respond to memory usage.
+ // Hence, default to MADV_DONTNEED.
+ debug.madvdontneed = 1
+ }
for p := gogetenv("GODEBUG"); p != ""; {
field := ""
- i := index(p, ",")
+ i := bytealg.IndexByteString(p, ',')
if i < 0 {
field, p = p, ""
} else {
field, p = p[:i], p[i+1:]
}
- i = index(field, "=")
+ i = bytealg.IndexByteString(field, '=')
if i < 0 {
continue
}
@@ -394,6 +413,8 @@ func parsedebugvars() {
}
}
+ debug.malloc = (debug.allocfreetrace | debug.inittrace | debug.sbrk) != 0
+
setTraceback(gogetenv("GOTRACEBACK"))
traceback_env = traceback_cache
}
diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go
index f30d1bc..2e55015 100644
--- a/libgo/go/runtime/runtime2.go
+++ b/libgo/go/runtime/runtime2.go
@@ -378,23 +378,6 @@ type libcall struct {
err uintptr // error number
}
-*/
-
-/*
-Not used by gccgo.
-
-// describes how to handle callback
-type wincallbackcontext struct {
- gobody unsafe.Pointer // go function to call
- argsize uintptr // callback arguments size (in bytes)
- restorestack uintptr // adjust stack on return by (in bytes) (386 only)
- cleanstack bool
-}
-*/
-
-/*
-Not used by gccgo.
-
// Stack describes a Go execution stack.
// The bounds of the stack are exactly [lo, hi),
// with no implicit data structures on either side.
@@ -584,6 +567,7 @@ type m struct {
ncgo int32 // number of cgo calls currently in progress
// Not for gccgo: cgoCallersUse uint32 // if non-zero, cgoCallers in use temporarily
// Not for gccgo: cgoCallers *cgoCallers // cgo traceback if crashing in cgo call
+ doesPark bool // non-P running threads: sysmon and newmHandoff never use .park
park note
alllink *m // on allm
schedlink muintptr
@@ -600,6 +584,13 @@ type m struct {
syscalltick uint32
freelink *m // on sched.freem
+ // mFixup is used to synchronize OS related m state (credentials etc).
+ // Use the mutex to access it.
+ mFixup struct {
+ lock mutex
+ fn func(bool) bool
+ }
+
// these are here because they are too large to be on the stack
// of low-level NOSPLIT functions.
// Not for gccgo: libcall libcall
@@ -710,14 +701,25 @@ type p struct {
// This is 0 if the timer heap is empty.
timer0When uint64
- // Per-P GC state
- gcAssistTime int64 // Nanoseconds in assistAlloc
- gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic)
- gcBgMarkWorker guintptr // (atomic)
- gcMarkWorkerMode gcMarkWorkerMode
+ // The earliest known nextwhen field of a timer with
+ // timerModifiedEarlier status. Because the timer may have been
+ // modified again, there need not be any timer with this value.
+ // This is updated using atomic functions.
+ // This is 0 if the value is unknown.
+ timerModifiedEarliest uint64
- // gcMarkWorkerStartTime is the nanotime() at which this mark
- // worker started.
+ // Per-P GC state
+ gcAssistTime int64 // Nanoseconds in assistAlloc
+ gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic)
+
+ // gcMarkWorkerMode is the mode for the next mark worker to run in.
+ // That is, this is used to communicate with the worker goroutine
+ // selected for immediate execution by
+ // gcController.findRunnableGCWorker. When scheduling other goroutines,
+ // this field must be set to gcMarkWorkerNotWorker.
+ gcMarkWorkerMode gcMarkWorkerMode
+ // gcMarkWorkerStartTime is the nanotime() at which the most recent
+ // mark worker started.
gcMarkWorkerStartTime int64
// gcw is this P's GC work buffer cache. The work buffer is
@@ -732,6 +734,10 @@ type p struct {
runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point
+ // statsSeq is a counter indicating whether this P is currently
+ // writing any stats. Its value is even when not, odd when it is.
+ statsSeq uint32
+
// Lock for timers. We normally access the timers while running
// on this P, but the scheduler can also do it from a different P.
timersLock mutex
@@ -831,6 +837,10 @@ type schedt struct {
sysmonwait uint32
sysmonnote note
+ // While true, sysmon is not ready for mFixup calls.
+ // Accessed atomically.
+ sysmonStarting uint32
+
// safepointFn should be called on each P at the next GC
// safepoint if p.runSafePointFn is set.
safePointFn func(*p)
@@ -875,10 +885,6 @@ type forcegcstate struct {
idle uint32
}
-// startup_random_data holds random bytes initialized at startup. These come from
-// the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.go or os_linux_386.go).
-var startupRandomData []byte
-
// extendRandom extends the random numbers in r[:n] to the whole slice r.
// Treats n<0 as n==0.
func extendRandom(r []byte, n int) {
@@ -1062,14 +1068,40 @@ func (w waitReason) String() string {
var (
allglen uintptr
allm *m
- allp []*p // len(allp) == gomaxprocs; may change at safe points, otherwise immutable
- allpLock mutex // Protects P-less reads of allp and all writes
gomaxprocs int32
ncpu int32
forcegc forcegcstate
sched schedt
newprocs int32
+ // allpLock protects P-less reads and size changes of allp, idlepMask,
+ // and timerpMask, and all writes to allp.
+ allpLock mutex
+ // len(allp) == gomaxprocs; may change at safe points, otherwise
+ // immutable.
+ allp []*p
+ // Bitmask of Ps in _Pidle list, one bit per P. Reads and writes must
+ // be atomic. Length may change at safe points.
+ //
+ // Each P must update only its own bit. In order to maintain
+ // consistency, a P going idle must update the idle mask simultaneously with
+ // updates to the idle P list under the sched.lock, otherwise a racing
+ // pidleget may clear the mask before pidleput sets the mask,
+ // corrupting the bitmap.
+ //
+ // N.B., procresize takes ownership of all Ps in stopTheWorldWithSema.
+ idlepMask pMask
+ // Bitmask of Ps that may have a timer, one bit per P. Reads and writes
+ // must be atomic. Length may change at safe points.
+ timerpMask pMask
+
+ // Pool of GC parked background workers. Entries are type
+ // *gcBgMarkWorkerNode.
+ gcBgMarkWorkerPool lfstack
+
+ // Total number of gcBgMarkWorker goroutines. Protected by worldsema.
+ gcBgMarkWorkerCount int32
+
support_aes bool
)
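idlepMask and timerpMask are per-P bitmasks (type pMask, defined elsewhere in the runtime) whose length may change at safe points and whose words are read and written atomically. A rough sketch of that shape using sync/atomic; this illustrates the idea only and is not the runtime's actual pMask implementation:

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	// pMask holds one bit per P, packed into 32-bit words so each word can
	// be read and updated atomically.
	type pMask []uint32

	func (p pMask) read(id uint32) bool {
		word, mask := id/32, uint32(1)<<(id%32)
		return atomic.LoadUint32(&p[word])&mask != 0
	}

	func (p pMask) set(id uint32) {
		word, mask := id/32, uint32(1)<<(id%32)
		for {
			old := atomic.LoadUint32(&p[word])
			if atomic.CompareAndSwapUint32(&p[word], old, old|mask) {
				return
			}
		}
	}

	func (p pMask) clear(id uint32) {
		word, mask := id/32, uint32(1)<<(id%32)
		for {
			old := atomic.LoadUint32(&p[word])
			if atomic.CompareAndSwapUint32(&p[word], old, old&^mask) {
				return
			}
		}
	}

	func main() {
		m := make(pMask, 1) // enough bits for 32 Ps
		m.set(3)
		fmt.Println(m.read(3), m.read(4)) // true false
		m.clear(3)
		fmt.Println(m.read(3)) // false
	}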
diff --git a/libgo/go/runtime/select.go b/libgo/go/runtime/select.go
index 45cc284..31272b0 100644
--- a/libgo/go/runtime/select.go
+++ b/libgo/go/runtime/select.go
@@ -106,6 +106,10 @@ func block() {
// Both reside on the goroutine's stack (regardless of any escaping in
// selectgo).
//
+// For race detector builds, pc0 points to an array of type
+// [ncases]uintptr (also on the stack); for other builds, it's set to
+// nil.
+//
// selectgo returns the index of the chosen scase, which matches the
// ordinal position of its respective select{recv,send,default} call.
// Also, if the chosen scase was a receive operation, it reports whether
@@ -124,6 +128,7 @@ func selectgo(cas0 *scase, order0 *uint16, nsends, nrecvs int, block bool) (int,
scases := cas1[:ncases:ncases]
pollorder := order1[:ncases:ncases]
lockorder := order1[ncases:][:ncases:ncases]
+ // NOTE: pollorder/lockorder's underlying array was not zero-initialized by the compiler.
var t0 int64
if blockprofilerate > 0 {
diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go
index ec7c647..1c040f7 100644
--- a/libgo/go/runtime/signal_unix.go
+++ b/libgo/go/runtime/signal_unix.go
@@ -281,6 +281,12 @@ func setProcessCPUProfiler(hz int32) {
atomic.Storeuintptr(&fwdSig[_SIGPROF], getsig(_SIGPROF))
setsig(_SIGPROF, getSigtramp())
}
+
+ var it _itimerval
+ it.it_interval.tv_sec = 0
+ it.it_interval.set_usec(1000000 / hz)
+ it.it_value = it.it_interval
+ setitimer(_ITIMER_PROF, &it, nil)
} else {
// If the Go signal handler should be disabled by default,
// switch back to the signal handler that was installed
@@ -305,23 +311,16 @@ func setProcessCPUProfiler(hz int32) {
setsig(_SIGPROF, h)
}
}
+
+ setitimer(_ITIMER_PROF, &_itimerval{}, nil)
}
}
// setThreadCPUProfiler makes any thread-specific changes required to
// implement profiling at a rate of hz.
+// No changes required on Unix systems.
func setThreadCPUProfiler(hz int32) {
- var it _itimerval
- if hz == 0 {
- setitimer(_ITIMER_PROF, &it, nil)
- } else {
- it.it_interval.tv_sec = 0
- it.it_interval.set_usec(1000000 / hz)
- it.it_value = it.it_interval
- setitimer(_ITIMER_PROF, &it, nil)
- }
- _g_ := getg()
- _g_.m.profilehz = hz
+ getg().m.profilehz = hz
}
func sigpipe() {
@@ -348,7 +347,7 @@ func doSigPreempt(gp *g, ctxt *sigctxt, sigpc uintptr) {
atomic.Xadd(&gp.m.preemptGen, 1)
atomic.Store(&gp.m.signalPending, 0)
- if GOOS == "darwin" {
+ if GOOS == "darwin" || GOOS == "ios" {
atomic.Xadd(&pendingPreemptSignals, -1)
}
}
@@ -363,18 +362,29 @@ const preemptMSupported = false
// safe-point, it will preempt the goroutine. It always atomically
// increments mp.preemptGen after handling a preemption request.
func preemptM(mp *m) {
- if GOOS == "darwin" && GOARCH == "arm64" && !iscgo {
- // On darwin, we use libc calls, and cgo is required on ARM64
- // so we have TLS set up to save/restore G during C calls. If cgo is
- // absent, we cannot save/restore G in TLS, and if a signal is
- // received during C execution we cannot get the G. Therefore don't
- // send signals.
- // This can only happen in the go_bootstrap program (otherwise cgo is
- // required).
- return
+ // On Darwin, don't try to preempt threads during exec.
+ // Issue #41702.
+ if GOOS == "darwin" || GOOS == "ios" {
+ execLock.rlock()
+ }
+
+ if atomic.Cas(&mp.signalPending, 0, 1) {
+ if GOOS == "darwin" || GOOS == "ios" {
+ atomic.Xadd(&pendingPreemptSignals, 1)
+ }
+
+ // If multiple threads are preempting the same M, they may send so
+ // many signals to it that it hardly makes progress, causing a
+ // live-lock problem. Apparently this could happen on darwin. See
+ // issue #37741.
+ // Only send a signal if there isn't already one pending.
+ // signalM(mp, sigPreempt)
+ throw("signalM not implemented")
+ }
+
+ if GOOS == "darwin" || GOOS == "ios" {
+ execLock.runlock()
}
- // signalM(mp, sigPreempt)
- throw("signalM not implemented")
}
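The compare-and-swap on signalPending is the guard described in the comment: only the first preemption request for an M would actually deliver a signal, and later requests see the flag already set and return. A generic sketch of that at-most-one-pending pattern using sync/atomic (names are illustrative, not the runtime's):

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	type target struct {
		signalPending uint32
	}

	// requestPreempt reports whether this caller is the one that should
	// deliver the signal; concurrent callers find the flag already set.
	func requestPreempt(t *target) bool {
		return atomic.CompareAndSwapUint32(&t.signalPending, 0, 1)
	}

	// acknowledge clears the flag once the target has handled the request.
	func acknowledge(t *target) {
		atomic.StoreUint32(&t.signalPending, 0)
	}

	func main() {
		var t target
		fmt.Println(requestPreempt(&t)) // true: send the signal
		fmt.Println(requestPreempt(&t)) // false: one is already pending
		acknowledge(&t)
		fmt.Println(requestPreempt(&t)) // true again
	}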
// sigtrampgo is called from the signal handler function, sigtramp,
@@ -408,7 +418,7 @@ func sigtrampgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) {
// no non-Go signal handler for sigPreempt.
// The default behavior for sigPreempt is to ignore
// the signal, so badsignal will be a no-op anyway.
- if GOOS == "darwin" {
+ if GOOS == "darwin" || GOOS == "ios" {
atomic.Xadd(&pendingPreemptSignals, -1)
}
return
@@ -565,7 +575,7 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) {
print("signal arrived during cgo execution\n")
gp = _g_.m.lockedg.ptr()
}
- if sig == _SIGILL {
+ if sig == _SIGILL || sig == _SIGFPE {
// It would be nice to know how long the instruction is.
// Unfortunately, that's complicated to do in general (mostly for x86
// and s390x, but other archs have non-standard instruction lengths also).
@@ -656,7 +666,7 @@ func sigpanic() {
}
// Support runtime/debug.SetPanicOnFault.
if g.paniconfault {
- panicmem()
+ panicmemAddr(g.sigcode1)
}
print("unexpected fault address ", hex(g.sigcode1), "\n")
throw("fault")
@@ -666,7 +676,7 @@ func sigpanic() {
}
// Support runtime/debug.SetPanicOnFault.
if g.paniconfault {
- panicmem()
+ panicmemAddr(g.sigcode1)
}
print("unexpected fault address ", hex(g.sigcode1), "\n")
throw("fault")
@@ -879,7 +889,7 @@ func badsignal(sig uintptr, c *sigctxt) {
exit(2)
*(*uintptr)(unsafe.Pointer(uintptr(123))) = 2
}
- needm(0)
+ needm()
if !sigsend(uint32(sig)) {
// A foreign thread received the signal sig, and the
// Go code does not want to handle it.
@@ -922,7 +932,7 @@ func sigfwdgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) bool {
// This function and its caller sigtrampgo assumes SIGPIPE is delivered on the
// originating thread. This property does not hold on macOS (golang.org/issue/33384),
// so we have no choice but to ignore SIGPIPE.
- if GOOS == "darwin" && sig == _SIGPIPE {
+ if (GOOS == "darwin" || GOOS == "ios") && sig == _SIGPIPE {
return true
}
diff --git a/libgo/go/runtime/signal_windows_test.go b/libgo/go/runtime/signal_windows_test.go
index f998571..a5a885c 100644
--- a/libgo/go/runtime/signal_windows_test.go
+++ b/libgo/go/runtime/signal_windows_test.go
@@ -7,7 +7,6 @@ import (
"bytes"
"fmt"
"internal/testenv"
- "io/ioutil"
"os"
"os/exec"
"path/filepath"
@@ -28,7 +27,7 @@ func TestVectoredHandlerDontCrashOnLibrary(t *testing.T) {
testenv.MustHaveExecPath(t, "gcc")
testprog.Lock()
defer testprog.Unlock()
- dir, err := ioutil.TempDir("", "go-build")
+ dir, err := os.MkdirTemp("", "go-build")
if err != nil {
t.Fatalf("failed to create temp directory: %v", err)
}
@@ -93,7 +92,7 @@ func TestLibraryCtrlHandler(t *testing.T) {
testenv.MustHaveExecPath(t, "gcc")
testprog.Lock()
defer testprog.Unlock()
- dir, err := ioutil.TempDir("", "go-build")
+ dir, err := os.MkdirTemp("", "go-build")
if err != nil {
t.Fatalf("failed to create temp directory: %v", err)
}
diff --git a/libgo/go/runtime/sigqueue.go b/libgo/go/runtime/sigqueue.go
index ed024e1..7f9badd 100644
--- a/libgo/go/runtime/sigqueue.go
+++ b/libgo/go/runtime/sigqueue.go
@@ -105,7 +105,7 @@ Send:
break Send
case sigReceiving:
if atomic.Cas(&sig.state, sigReceiving, sigIdle) {
- if GOOS == "darwin" {
+ if GOOS == "darwin" || GOOS == "ios" {
sigNoteWakeup(&sig.note)
break Send
}
@@ -140,7 +140,7 @@ func signal_recv() uint32 {
throw("signal_recv: inconsistent state")
case sigIdle:
if atomic.Cas(&sig.state, sigIdle, sigReceiving) {
- if GOOS == "darwin" {
+ if GOOS == "darwin" || GOOS == "ios" {
sigNoteSleep(&sig.note)
break Receive
}
@@ -194,7 +194,7 @@ func signal_enable(s uint32) {
if !sig.inuse {
// This is the first call to signal_enable. Initialize.
sig.inuse = true // enable reception of signals; cannot disable
- if GOOS == "darwin" {
+ if GOOS == "darwin" || GOOS == "ios" {
sigNoteSetup(&sig.note)
} else {
noteclear(&sig.note)
diff --git a/libgo/go/runtime/sizeclasses.go b/libgo/go/runtime/sizeclasses.go
index 9c1b44f..c5521ce 100644
--- a/libgo/go/runtime/sizeclasses.go
+++ b/libgo/go/runtime/sizeclasses.go
@@ -6,82 +6,83 @@ package runtime
// class bytes/obj bytes/span objects tail waste max waste
// 1 8 8192 1024 0 87.50%
// 2 16 8192 512 0 43.75%
-// 3 32 8192 256 0 46.88%
-// 4 48 8192 170 32 31.52%
-// 5 64 8192 128 0 23.44%
-// 6 80 8192 102 32 19.07%
-// 7 96 8192 85 32 15.95%
-// 8 112 8192 73 16 13.56%
-// 9 128 8192 64 0 11.72%
-// 10 144 8192 56 128 11.82%
-// 11 160 8192 51 32 9.73%
-// 12 176 8192 46 96 9.59%
-// 13 192 8192 42 128 9.25%
-// 14 208 8192 39 80 8.12%
-// 15 224 8192 36 128 8.15%
-// 16 240 8192 34 32 6.62%
-// 17 256 8192 32 0 5.86%
-// 18 288 8192 28 128 12.16%
-// 19 320 8192 25 192 11.80%
-// 20 352 8192 23 96 9.88%
-// 21 384 8192 21 128 9.51%
-// 22 416 8192 19 288 10.71%
-// 23 448 8192 18 128 8.37%
-// 24 480 8192 17 32 6.82%
-// 25 512 8192 16 0 6.05%
-// 26 576 8192 14 128 12.33%
-// 27 640 8192 12 512 15.48%
-// 28 704 8192 11 448 13.93%
-// 29 768 8192 10 512 13.94%
-// 30 896 8192 9 128 15.52%
-// 31 1024 8192 8 0 12.40%
-// 32 1152 8192 7 128 12.41%
-// 33 1280 8192 6 512 15.55%
-// 34 1408 16384 11 896 14.00%
-// 35 1536 8192 5 512 14.00%
-// 36 1792 16384 9 256 15.57%
-// 37 2048 8192 4 0 12.45%
-// 38 2304 16384 7 256 12.46%
-// 39 2688 8192 3 128 15.59%
-// 40 3072 24576 8 0 12.47%
-// 41 3200 16384 5 384 6.22%
-// 42 3456 24576 7 384 8.83%
-// 43 4096 8192 2 0 15.60%
-// 44 4864 24576 5 256 16.65%
-// 45 5376 16384 3 256 10.92%
-// 46 6144 24576 4 0 12.48%
-// 47 6528 32768 5 128 6.23%
-// 48 6784 40960 6 256 4.36%
-// 49 6912 49152 7 768 3.37%
-// 50 8192 8192 1 0 15.61%
-// 51 9472 57344 6 512 14.28%
-// 52 9728 49152 5 512 3.64%
-// 53 10240 40960 4 0 4.99%
-// 54 10880 32768 3 128 6.24%
-// 55 12288 24576 2 0 11.45%
-// 56 13568 40960 3 256 9.99%
-// 57 14336 57344 4 0 5.35%
-// 58 16384 16384 1 0 12.49%
-// 59 18432 73728 4 0 11.11%
-// 60 19072 57344 3 128 3.57%
-// 61 20480 40960 2 0 6.87%
-// 62 21760 65536 3 256 6.25%
-// 63 24576 24576 1 0 11.45%
-// 64 27264 81920 3 128 10.00%
-// 65 28672 57344 2 0 4.91%
-// 66 32768 32768 1 0 12.50%
+// 3 24 8192 341 8 29.24%
+// 4 32 8192 256 0 21.88%
+// 5 48 8192 170 32 31.52%
+// 6 64 8192 128 0 23.44%
+// 7 80 8192 102 32 19.07%
+// 8 96 8192 85 32 15.95%
+// 9 112 8192 73 16 13.56%
+// 10 128 8192 64 0 11.72%
+// 11 144 8192 56 128 11.82%
+// 12 160 8192 51 32 9.73%
+// 13 176 8192 46 96 9.59%
+// 14 192 8192 42 128 9.25%
+// 15 208 8192 39 80 8.12%
+// 16 224 8192 36 128 8.15%
+// 17 240 8192 34 32 6.62%
+// 18 256 8192 32 0 5.86%
+// 19 288 8192 28 128 12.16%
+// 20 320 8192 25 192 11.80%
+// 21 352 8192 23 96 9.88%
+// 22 384 8192 21 128 9.51%
+// 23 416 8192 19 288 10.71%
+// 24 448 8192 18 128 8.37%
+// 25 480 8192 17 32 6.82%
+// 26 512 8192 16 0 6.05%
+// 27 576 8192 14 128 12.33%
+// 28 640 8192 12 512 15.48%
+// 29 704 8192 11 448 13.93%
+// 30 768 8192 10 512 13.94%
+// 31 896 8192 9 128 15.52%
+// 32 1024 8192 8 0 12.40%
+// 33 1152 8192 7 128 12.41%
+// 34 1280 8192 6 512 15.55%
+// 35 1408 16384 11 896 14.00%
+// 36 1536 8192 5 512 14.00%
+// 37 1792 16384 9 256 15.57%
+// 38 2048 8192 4 0 12.45%
+// 39 2304 16384 7 256 12.46%
+// 40 2688 8192 3 128 15.59%
+// 41 3072 24576 8 0 12.47%
+// 42 3200 16384 5 384 6.22%
+// 43 3456 24576 7 384 8.83%
+// 44 4096 8192 2 0 15.60%
+// 45 4864 24576 5 256 16.65%
+// 46 5376 16384 3 256 10.92%
+// 47 6144 24576 4 0 12.48%
+// 48 6528 32768 5 128 6.23%
+// 49 6784 40960 6 256 4.36%
+// 50 6912 49152 7 768 3.37%
+// 51 8192 8192 1 0 15.61%
+// 52 9472 57344 6 512 14.28%
+// 53 9728 49152 5 512 3.64%
+// 54 10240 40960 4 0 4.99%
+// 55 10880 32768 3 128 6.24%
+// 56 12288 24576 2 0 11.45%
+// 57 13568 40960 3 256 9.99%
+// 58 14336 57344 4 0 5.35%
+// 59 16384 16384 1 0 12.49%
+// 60 18432 73728 4 0 11.11%
+// 61 19072 57344 3 128 3.57%
+// 62 20480 40960 2 0 6.87%
+// 63 21760 65536 3 256 6.25%
+// 64 24576 24576 1 0 11.45%
+// 65 27264 81920 3 128 10.00%
+// 66 28672 57344 2 0 4.91%
+// 67 32768 32768 1 0 12.50%
const (
_MaxSmallSize = 32768
smallSizeDiv = 8
smallSizeMax = 1024
largeSizeDiv = 128
- _NumSizeClasses = 67
+ _NumSizeClasses = 68
_PageShift = 13
)
-var class_to_size = [_NumSizeClasses]uint16{0, 8, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768}
-var class_to_allocnpages = [_NumSizeClasses]uint8{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 3, 4, 5, 6, 1, 7, 6, 5, 4, 3, 5, 7, 2, 9, 7, 5, 8, 3, 10, 7, 4}
+var class_to_size = [_NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768}
+var class_to_allocnpages = [_NumSizeClasses]uint8{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 3, 4, 5, 6, 1, 7, 6, 5, 4, 3, 5, 7, 2, 9, 7, 5, 8, 3, 10, 7, 4}
type divMagic struct {
shift uint8
@@ -90,6 +91,6 @@ type divMagic struct {
baseMask uint16
}
-var class_to_divmagic = [_NumSizeClasses]divMagic{{0, 0, 0, 0}, {3, 0, 1, 65528}, {4, 0, 1, 65520}, {5, 0, 1, 65504}, {4, 11, 683, 0}, {6, 0, 1, 65472}, {4, 10, 205, 0}, {5, 9, 171, 0}, {4, 11, 293, 0}, {7, 0, 1, 65408}, {4, 13, 911, 0}, {5, 10, 205, 0}, {4, 12, 373, 0}, {6, 9, 171, 0}, {4, 13, 631, 0}, {5, 11, 293, 0}, {4, 13, 547, 0}, {8, 0, 1, 65280}, {5, 9, 57, 0}, {6, 9, 103, 0}, {5, 12, 373, 0}, {7, 7, 43, 0}, {5, 10, 79, 0}, {6, 10, 147, 0}, {5, 11, 137, 0}, {9, 0, 1, 65024}, {6, 9, 57, 0}, {7, 9, 103, 0}, {6, 11, 187, 0}, {8, 7, 43, 0}, {7, 8, 37, 0}, {10, 0, 1, 64512}, {7, 9, 57, 0}, {8, 6, 13, 0}, {7, 11, 187, 0}, {9, 5, 11, 0}, {8, 8, 37, 0}, {11, 0, 1, 63488}, {8, 9, 57, 0}, {7, 10, 49, 0}, {10, 5, 11, 0}, {7, 10, 41, 0}, {7, 9, 19, 0}, {12, 0, 1, 61440}, {8, 9, 27, 0}, {8, 10, 49, 0}, {11, 5, 11, 0}, {7, 13, 161, 0}, {7, 13, 155, 0}, {8, 9, 19, 0}, {13, 0, 1, 57344}, {8, 12, 111, 0}, {9, 9, 27, 0}, {11, 6, 13, 0}, {7, 14, 193, 0}, {12, 3, 3, 0}, {8, 13, 155, 0}, {11, 8, 37, 0}, {14, 0, 1, 49152}, {11, 8, 29, 0}, {7, 13, 55, 0}, {12, 5, 7, 0}, {8, 14, 193, 0}, {13, 3, 3, 0}, {7, 14, 77, 0}, {12, 7, 19, 0}, {15, 0, 1, 32768}}
-var size_to_class8 = [smallSizeMax/smallSizeDiv + 1]uint8{0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}
-var size_to_class128 = [(_MaxSmallSize-smallSizeMax)/largeSizeDiv + 1]uint8{31, 32, 33, 34, 35, 36, 36, 37, 37, 38, 38, 39, 39, 39, 40, 40, 40, 41, 42, 42, 43, 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66}
+var class_to_divmagic = [_NumSizeClasses]divMagic{{0, 0, 0, 0}, {3, 0, 1, 65528}, {4, 0, 1, 65520}, {3, 11, 683, 0}, {5, 0, 1, 65504}, {4, 11, 683, 0}, {6, 0, 1, 65472}, {4, 10, 205, 0}, {5, 9, 171, 0}, {4, 11, 293, 0}, {7, 0, 1, 65408}, {4, 13, 911, 0}, {5, 10, 205, 0}, {4, 12, 373, 0}, {6, 9, 171, 0}, {4, 13, 631, 0}, {5, 11, 293, 0}, {4, 13, 547, 0}, {8, 0, 1, 65280}, {5, 9, 57, 0}, {6, 9, 103, 0}, {5, 12, 373, 0}, {7, 7, 43, 0}, {5, 10, 79, 0}, {6, 10, 147, 0}, {5, 11, 137, 0}, {9, 0, 1, 65024}, {6, 9, 57, 0}, {7, 9, 103, 0}, {6, 11, 187, 0}, {8, 7, 43, 0}, {7, 8, 37, 0}, {10, 0, 1, 64512}, {7, 9, 57, 0}, {8, 6, 13, 0}, {7, 11, 187, 0}, {9, 5, 11, 0}, {8, 8, 37, 0}, {11, 0, 1, 63488}, {8, 9, 57, 0}, {7, 10, 49, 0}, {10, 5, 11, 0}, {7, 10, 41, 0}, {7, 9, 19, 0}, {12, 0, 1, 61440}, {8, 9, 27, 0}, {8, 10, 49, 0}, {11, 5, 11, 0}, {7, 13, 161, 0}, {7, 13, 155, 0}, {8, 9, 19, 0}, {13, 0, 1, 57344}, {8, 12, 111, 0}, {9, 9, 27, 0}, {11, 6, 13, 0}, {7, 14, 193, 0}, {12, 3, 3, 0}, {8, 13, 155, 0}, {11, 8, 37, 0}, {14, 0, 1, 49152}, {11, 8, 29, 0}, {7, 13, 55, 0}, {12, 5, 7, 0}, {8, 14, 193, 0}, {13, 3, 3, 0}, {7, 14, 77, 0}, {12, 7, 19, 0}, {15, 0, 1, 32768}}
+var size_to_class8 = [smallSizeMax/smallSizeDiv + 1]uint8{0, 1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}
+var size_to_class128 = [(_MaxSmallSize-smallSizeMax)/largeSizeDiv + 1]uint8{32, 33, 34, 35, 36, 37, 37, 38, 38, 39, 39, 40, 40, 40, 41, 41, 41, 42, 43, 43, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67}
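Class 3 (24 bytes) is new, so every later class index shifts up by one and the size_to_class8/size_to_class128 lookup tables are regenerated to match. A small sketch, not taken from this patch, of how these tables are consumed when a small allocation request is rounded up to its class; only the first few entries of the new tables are reproduced here:

	package main

	import "fmt"

	const smallSizeDiv = 8

	// Leading entries of the regenerated tables, enough for sizes <= 48.
	var sizeToClass8 = []uint8{0, 1, 2, 3, 4, 5, 5}
	var classToSize = []uint16{0, 8, 16, 24, 32, 48}

	func divRoundUp(n, a uintptr) uintptr { return (n + a - 1) / a }

	func main() {
		for _, size := range []uintptr{1, 9, 20, 33} {
			class := sizeToClass8[divRoundUp(size, smallSizeDiv)]
			fmt.Printf("request %2d bytes -> class %d (%d bytes/obj)\n",
				size, class, classToSize[class])
		}
	}

With the new 24-byte class, a 20-byte request, for example, is now served from a 24-byte slot rather than the previous 32-byte one.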
diff --git a/libgo/go/runtime/slice.go b/libgo/go/runtime/slice.go
index ddd588e..27a9777 100644
--- a/libgo/go/runtime/slice.go
+++ b/libgo/go/runtime/slice.go
@@ -18,8 +18,6 @@ import (
//go:linkname checkMakeSlice
//go:linkname makeslice64
//go:linkname growslice
-//go:linkname slicecopy
-//go:linkname slicestringcopy
type slice struct {
array unsafe.Pointer
@@ -158,7 +156,7 @@ func growslice(et *_type, oldarray unsafe.Pointer, oldlen, oldcap, cap int) slic
if cap > doublecap {
newcap = cap
} else {
- if oldlen < 1024 {
+ if oldcap < 1024 {
newcap = doublecap
} else {
// Check 0 < newcap to detect overflow
@@ -255,12 +253,13 @@ func isPowerOfTwo(x uintptr) bool {
return x&(x-1) == 0
}
-func slicecopy(toPtr unsafe.Pointer, toLen int, fmPtr unsafe.Pointer, fmLen int, width uintptr) int {
- if fmLen == 0 || toLen == 0 {
+// slicecopy is used to copy from a string or slice of pointerless elements into a slice.
+func slicecopy(toPtr unsafe.Pointer, toLen int, fromPtr unsafe.Pointer, fromLen int, width uintptr) int {
+ if fromLen == 0 || toLen == 0 {
return 0
}
- n := fmLen
+ n := fromLen
if toLen < n {
n = toLen
}
@@ -269,46 +268,23 @@ func slicecopy(toPtr unsafe.Pointer, toLen int, fmPtr unsafe.Pointer, fmLen int,
return n
}
+ size := uintptr(n) * width
if raceenabled {
callerpc := getcallerpc()
pc := funcPC(slicecopy)
- racereadrangepc(fmPtr, uintptr(n*int(width)), callerpc, pc)
- racewriterangepc(toPtr, uintptr(n*int(width)), callerpc, pc)
+ racereadrangepc(fromPtr, size, callerpc, pc)
+ racewriterangepc(toPtr, size, callerpc, pc)
}
if msanenabled {
- msanread(fmPtr, uintptr(n*int(width)))
- msanwrite(toPtr, uintptr(n*int(width)))
+ msanread(fromPtr, size)
+ msanwrite(toPtr, size)
}
- size := uintptr(n) * width
if size == 1 { // common case worth about 2x to do here
// TODO: is this still worth it with new memmove impl?
- *(*byte)(toPtr) = *(*byte)(fmPtr) // known to be a byte pointer
+ *(*byte)(toPtr) = *(*byte)(fromPtr) // known to be a byte pointer
} else {
- memmove(toPtr, fmPtr, size)
- }
- return n
-}
-
-func slicestringcopy(toPtr *byte, toLen int, fm string) int {
- if len(fm) == 0 || toLen == 0 {
- return 0
- }
-
- n := len(fm)
- if toLen < n {
- n = toLen
+ memmove(toPtr, fromPtr, size)
}
-
- if raceenabled {
- callerpc := getcallerpc()
- pc := funcPC(slicestringcopy)
- racewriterangepc(unsafe.Pointer(toPtr), uintptr(n), callerpc, pc)
- }
- if msanenabled {
- msanwrite(unsafe.Pointer(toPtr), uintptr(n))
- }
-
- memmove(unsafe.Pointer(toPtr), stringStructOf(&fm).str, uintptr(n))
return n
}
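With slicestringcopy removed, both slice-to-slice and string-to-slice copies are served by the single slicecopy helper documented above. A user-level illustration of the two forms of the built-in copy that end up there:

	package main

	import "fmt"

	func main() {
		src := []byte{1, 2, 3, 4}
		dst := make([]byte, 3)
		n := copy(dst, src) // slice -> slice
		fmt.Println(n, dst) // 3 [1 2 3]

		s := "hello"
		buf := make([]byte, 8)
		n = copy(buf, s) // string -> slice, same runtime path now
		fmt.Println(n, string(buf[:n])) // 5 hello
	}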
diff --git a/libgo/go/runtime/slice_test.go b/libgo/go/runtime/slice_test.go
index e963a43..cd2bc26 100644
--- a/libgo/go/runtime/slice_test.go
+++ b/libgo/go/runtime/slice_test.go
@@ -1,6 +1,7 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+
package runtime_test
import (
diff --git a/libgo/go/runtime/stack_test.go b/libgo/go/runtime/stack_test.go
index 169dde2..d4ee52c 100644
--- a/libgo/go/runtime/stack_test.go
+++ b/libgo/go/runtime/stack_test.go
@@ -17,6 +17,7 @@ import (
"sync/atomic"
"testing"
"time"
+ _ "unsafe" // for go:linkname
)
// TestStackMem measures per-thread stack segment cache behavior.
@@ -871,3 +872,43 @@ func deferHeapAndStack(n int) (r int) {
// Pass a value to escapeMe to force it to escape.
var escapeMe = func(x interface{}) {}
+
+// Test that when F -> G is inlined and F is excluded from stack
+// traces, G still appears.
+func TestTracebackInlineExcluded(t *testing.T) {
+ defer func() {
+ recover()
+ buf := make([]byte, 4<<10)
+ stk := string(buf[:Stack(buf, false)])
+
+ t.Log(stk)
+
+ if not := "tracebackExcluded"; strings.Contains(stk, not) {
+ t.Errorf("found but did not expect %q", not)
+ }
+ if want := "tracebackNotExcluded"; !strings.Contains(stk, want) {
+ t.Errorf("expected %q in stack", want)
+ }
+ }()
+ tracebackExcluded()
+}
+
+// tracebackExcluded should be excluded from tracebacks. There are
+// various ways this could come up. Linking it to a "runtime." name is
+// rather synthetic, but it's easy and reliable. See issue #42754 for
+// one way this happened in real code.
+//
+//go:linkname tracebackExcluded runtime.tracebackExcluded
+//go:noinline
+func tracebackExcluded() {
+ // Call an inlined function that should not itself be excluded
+ // from tracebacks.
+ tracebackNotExcluded()
+}
+
+// tracebackNotExcluded should be inlined into tracebackExcluded, but
+// should not itself be excluded from the traceback.
+func tracebackNotExcluded() {
+ var x *int
+ *x = 0
+}
diff --git a/libgo/go/runtime/string.go b/libgo/go/runtime/string.go
index c0058be..44b9314 100644
--- a/libgo/go/runtime/string.go
+++ b/libgo/go/runtime/string.go
@@ -327,22 +327,6 @@ func gostringn(p *byte, l int) string {
return s
}
-func index(s, t string) int {
- if len(t) == 0 {
- return 0
- }
- for i := 0; i < len(s); i++ {
- if s[i] == t[0] && hasPrefix(s[i:], t) {
- return i
- }
- }
- return -1
-}
-
-func contains(s, t string) bool {
- return index(s, t) >= 0
-}
-
func hasPrefix(s, prefix string) bool {
return len(s) >= len(prefix) && s[:len(prefix)] == prefix
}
@@ -512,37 +496,3 @@ func __go_byte_array_to_string(p unsafe.Pointer, l int) string {
func __go_string_to_byte_array(s string) []byte {
return stringtoslicebyte(nil, s)
}
-
-// parseRelease parses a dot-separated version number. It follows the
-// semver syntax, but allows the minor and patch versions to be
-// elided.
-func parseRelease(rel string) (major, minor, patch int, ok bool) {
- // Strip anything after a dash or plus.
- for i := 0; i < len(rel); i++ {
- if rel[i] == '-' || rel[i] == '+' {
- rel = rel[:i]
- break
- }
- }
-
- next := func() (int, bool) {
- for i := 0; i < len(rel); i++ {
- if rel[i] == '.' {
- ver, ok := atoi(rel[:i])
- rel = rel[i+1:]
- return ver, ok
- }
- }
- ver, ok := atoi(rel)
- rel = ""
- return ver, ok
- }
- if major, ok = next(); !ok || rel == "" {
- return
- }
- if minor, ok = next(); !ok || rel == "" {
- return
- }
- patch, ok = next()
- return
-}
diff --git a/libgo/go/runtime/string_test.go b/libgo/go/runtime/string_test.go
index b979973..2a65416 100644
--- a/libgo/go/runtime/string_test.go
+++ b/libgo/go/runtime/string_test.go
@@ -462,34 +462,3 @@ func TestAtoi32(t *testing.T) {
}
}
}
-
-type parseReleaseTest struct {
- in string
- major, minor, patch int
-}
-
-var parseReleaseTests = []parseReleaseTest{
- {"", -1, -1, -1},
- {"x", -1, -1, -1},
- {"5", 5, 0, 0},
- {"5.12", 5, 12, 0},
- {"5.12-x", 5, 12, 0},
- {"5.12.1", 5, 12, 1},
- {"5.12.1-x", 5, 12, 1},
- {"5.12.1.0", 5, 12, 1},
- {"5.20496382327982653440", -1, -1, -1},
-}
-
-func TestParseRelease(t *testing.T) {
- for _, test := range parseReleaseTests {
- major, minor, patch, ok := runtime.ParseRelease(test.in)
- if !ok {
- major, minor, patch = -1, -1, -1
- }
- if test.major != major || test.minor != minor || test.patch != patch {
- t.Errorf("parseRelease(%q) = (%v, %v, %v) want (%v, %v, %v)",
- test.in, major, minor, patch,
- test.major, test.minor, test.patch)
- }
- }
-}
diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go
index d0fe551..b0c5b38 100644
--- a/libgo/go/runtime/stubs.go
+++ b/libgo/go/runtime/stubs.go
@@ -140,6 +140,12 @@ func fastrandn(n uint32) uint32 {
//go:linkname sync_fastrand sync.fastrand
func sync_fastrand() uint32 { return fastrand() }
+//go:linkname net_fastrand net.fastrand
+func net_fastrand() uint32 { return fastrand() }
+
+//go:linkname os_fastrand os.fastrand
+func os_fastrand() uint32 { return fastrand() }
+
// in internal/bytealg/equal_*.s
//go:noescape
func memequal(a, b unsafe.Pointer, size uintptr) bool
diff --git a/libgo/go/runtime/testdata/testprog/memprof.go b/libgo/go/runtime/testdata/testprog/memprof.go
index 7b134bc..0392e60 100644
--- a/libgo/go/runtime/testdata/testprog/memprof.go
+++ b/libgo/go/runtime/testdata/testprog/memprof.go
@@ -7,7 +7,6 @@ package main
import (
"bytes"
"fmt"
- "io/ioutil"
"os"
"runtime"
"runtime/pprof"
@@ -31,7 +30,7 @@ func MemProf() {
runtime.GC()
- f, err := ioutil.TempFile("", "memprof")
+ f, err := os.CreateTemp("", "memprof")
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
diff --git a/libgo/go/runtime/testdata/testprog/syscalls_linux.go b/libgo/go/runtime/testdata/testprog/syscalls_linux.go
index b8ac087..48f8014 100644
--- a/libgo/go/runtime/testdata/testprog/syscalls_linux.go
+++ b/libgo/go/runtime/testdata/testprog/syscalls_linux.go
@@ -7,7 +7,6 @@ package main
import (
"bytes"
"fmt"
- "io/ioutil"
"os"
"syscall"
)
@@ -17,7 +16,7 @@ func gettid() int {
}
func tidExists(tid int) (exists, supported bool) {
- stat, err := ioutil.ReadFile(fmt.Sprintf("/proc/self/task/%d/stat", tid))
+ stat, err := os.ReadFile(fmt.Sprintf("/proc/self/task/%d/stat", tid))
if os.IsNotExist(err) {
return false, true
}
diff --git a/libgo/go/runtime/testdata/testprog/timeprof.go b/libgo/go/runtime/testdata/testprog/timeprof.go
index 0702885..1e90af4 100644
--- a/libgo/go/runtime/testdata/testprog/timeprof.go
+++ b/libgo/go/runtime/testdata/testprog/timeprof.go
@@ -6,7 +6,6 @@ package main
import (
"fmt"
- "io/ioutil"
"os"
"runtime/pprof"
"time"
@@ -17,7 +16,7 @@ func init() {
}
func TimeProf() {
- f, err := ioutil.TempFile("", "timeprof")
+ f, err := os.CreateTemp("", "timeprof")
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
diff --git a/libgo/go/runtime/testdata/testprog/vdso.go b/libgo/go/runtime/testdata/testprog/vdso.go
index ef92f48..d2a300d 100644
--- a/libgo/go/runtime/testdata/testprog/vdso.go
+++ b/libgo/go/runtime/testdata/testprog/vdso.go
@@ -8,7 +8,6 @@ package main
import (
"fmt"
- "io/ioutil"
"os"
"runtime/pprof"
"time"
@@ -19,7 +18,7 @@ func init() {
}
func signalInVDSO() {
- f, err := ioutil.TempFile("", "timeprofnow")
+ f, err := os.CreateTemp("", "timeprofnow")
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
diff --git a/libgo/go/runtime/testdata/testprogcgo/eintr.go b/libgo/go/runtime/testdata/testprogcgo/eintr.go
index 791ff1b..1722a75 100644
--- a/libgo/go/runtime/testdata/testprogcgo/eintr.go
+++ b/libgo/go/runtime/testdata/testprogcgo/eintr.go
@@ -32,7 +32,6 @@ import (
"errors"
"fmt"
"io"
- "io/ioutil"
"log"
"net"
"os"
@@ -242,5 +241,5 @@ func testExec(wg *sync.WaitGroup) {
// Block blocks until stdin is closed.
func Block() {
- io.Copy(ioutil.Discard, os.Stdin)
+ io.Copy(io.Discard, os.Stdin)
}
diff --git a/libgo/go/runtime/testdata/testprogcgo/exec.go b/libgo/go/runtime/testdata/testprogcgo/exec.go
index 94da5dc..15723c7 100644
--- a/libgo/go/runtime/testdata/testprogcgo/exec.go
+++ b/libgo/go/runtime/testdata/testprogcgo/exec.go
@@ -31,6 +31,7 @@ import "C"
import (
"fmt"
+ "io/fs"
"os"
"os/exec"
"os/signal"
@@ -98,7 +99,7 @@ func CgoExecSignalMask() {
// isEAGAIN reports whether err is an EAGAIN error from a process execution.
func isEAGAIN(err error) bool {
- if p, ok := err.(*os.PathError); ok {
+ if p, ok := err.(*fs.PathError); ok {
err = p.Err
}
return err == syscall.EAGAIN
diff --git a/libgo/go/runtime/testdata/testprogcgo/pprof.go b/libgo/go/runtime/testdata/testprogcgo/pprof.go
index 9fa5c1b..3a3176d 100644
--- a/libgo/go/runtime/testdata/testprogcgo/pprof.go
+++ b/libgo/go/runtime/testdata/testprogcgo/pprof.go
@@ -62,7 +62,6 @@ import "C"
import (
"fmt"
- "io/ioutil"
"os"
"runtime"
"runtime/pprof"
@@ -77,7 +76,7 @@ func init() {
func CgoPprof() {
runtime.SetCgoTraceback(0, unsafe.Pointer(C.pprofCgoTraceback), nil, nil)
- f, err := ioutil.TempFile("", "prof")
+ f, err := os.CreateTemp("", "prof")
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
diff --git a/libgo/go/runtime/testdata/testprogcgo/threadpprof.go b/libgo/go/runtime/testdata/testprogcgo/threadpprof.go
index 28b094f..3b36c30 100644
--- a/libgo/go/runtime/testdata/testprogcgo/threadpprof.go
+++ b/libgo/go/runtime/testdata/testprogcgo/threadpprof.go
@@ -75,7 +75,6 @@ import "C"
import (
"fmt"
- "io/ioutil"
"os"
"runtime"
"runtime/pprof"
@@ -98,7 +97,7 @@ func CgoPprofThreadNoTraceback() {
}
func pprofThread() {
- f, err := ioutil.TempFile("", "prof")
+ f, err := os.CreateTemp("", "prof")
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(2)
diff --git a/libgo/go/runtime/testdata/testprogcgo/traceback.go b/libgo/go/runtime/testdata/testprogcgo/traceback.go
index d21990e..46cf644 100644
--- a/libgo/go/runtime/testdata/testprogcgo/traceback.go
+++ b/libgo/go/runtime/testdata/testprogcgo/traceback.go
@@ -13,58 +13,11 @@ package main
/*
#cgo CFLAGS: -g -O0
-#include <stdint.h>
-
-char *p;
-
-static int f3(void) {
- *p = 0;
- return 0;
-}
-
-static int f2(void) {
- return f3();
-}
-
-static int f1(void) {
- return f2();
-}
-
-struct cgoTracebackArg {
- uintptr_t context;
- uintptr_t sigContext;
- uintptr_t* buf;
- uintptr_t max;
-};
-
-struct cgoSymbolizerArg {
- uintptr_t pc;
- const char* file;
- uintptr_t lineno;
- const char* func;
- uintptr_t entry;
- uintptr_t more;
- uintptr_t data;
-};
-
-void cgoTraceback(void* parg) {
- struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg);
- arg->buf[0] = 1;
- arg->buf[1] = 2;
- arg->buf[2] = 3;
- arg->buf[3] = 0;
-}
-
-void cgoSymbolizer(void* parg) {
- struct cgoSymbolizerArg* arg = (struct cgoSymbolizerArg*)(parg);
- if (arg->pc != arg->data + 1) {
- arg->file = "unexpected data";
- } else {
- arg->file = "cgo symbolizer";
- }
- arg->lineno = arg->data + 1;
- arg->data++;
-}
+// Defined in traceback_c.c.
+extern int crashInGo;
+int tracebackF1(void);
+void cgoTraceback(void* parg);
+void cgoSymbolizer(void* parg);
*/
import "C"
@@ -75,9 +28,29 @@ import (
func init() {
register("CrashTraceback", CrashTraceback)
+ register("CrashTracebackGo", CrashTracebackGo)
}
func CrashTraceback() {
runtime.SetCgoTraceback(0, unsafe.Pointer(C.cgoTraceback), nil, unsafe.Pointer(C.cgoSymbolizer))
- C.f1()
+ C.tracebackF1()
+}
+
+func CrashTracebackGo() {
+ C.crashInGo = 1
+ CrashTraceback()
+}
+
+//export h1
+func h1() {
+ h2()
+}
+
+func h2() {
+ h3()
+}
+
+func h3() {
+ var x *int
+ *x = 0
}
diff --git a/libgo/go/runtime/time.go b/libgo/go/runtime/time.go
index 3cf13f1..65a1ae0 100644
--- a/libgo/go/runtime/time.go
+++ b/libgo/go/runtime/time.go
@@ -22,6 +22,8 @@ type timer struct {
// Timer wakes up at when, and then at when+period, ... (period > 0 only)
// each time calling f(arg, now) in the timer goroutine, so f must be
// a well-behaved function and not block.
+ //
+ // when must be positive on an active timer.
when int64
period int64
f func(interface{}, uintptr)
@@ -184,6 +186,9 @@ func timeSleep(ns int64) {
t.f = goroutineReady
t.arg = gp
t.nextwhen = nanotime() + ns
+ if t.nextwhen < 0 { // check for overflow.
+ t.nextwhen = maxWhen
+ }
gopark(resetForSleep, unsafe.Pointer(t), waitReasonSleep, traceEvGoSleep, 1)
}
@@ -241,10 +246,14 @@ func goroutineReady(arg interface{}, seq uintptr) {
// That avoids the risk of changing the when field of a timer in some P's heap,
// which could cause the heap to become unsorted.
func addtimer(t *timer) {
- // when must never be negative; otherwise runtimer will overflow
- // during its delta calculation and never expire other runtime timers.
- if t.when < 0 {
- t.when = maxWhen
+ // when must be positive. A negative value will cause runtimer to
+ // overflow during its delta calculation and never expire other runtime
+ // timers. Zero will cause checkTimers to fail to notice the timer.
+ if t.when <= 0 {
+ throw("timer when must be positive")
+ }
+ if t.period < 0 {
+ throw("timer period must be non-negative")
}
if t.status != timerNoStatus {
throw("addtimer called with initialized timer")
@@ -402,11 +411,14 @@ func dodeltimer0(pp *p) {
}
// modtimer modifies an existing timer.
-// This is called by the netpoll code or time.Ticker.Reset.
+// This is called by the netpoll code or time.Ticker.Reset or time.Timer.Reset.
// Reports whether the timer was modified before it was run.
func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) bool {
- if when < 0 {
- when = maxWhen
+ if when <= 0 {
+ throw("timer when must be positive")
+ }
+ if period < 0 {
+ throw("timer period must be non-negative")
}
status := uint32(timerNoStatus)
@@ -490,6 +502,8 @@ loop:
newStatus = timerModifiedEarlier
}
+ tpp := t.pp.ptr()
+
// Update the adjustTimers field. Subtract one if we
// are removing a timerModifiedEarlier, add one if we
// are adding a timerModifiedEarlier.
@@ -499,9 +513,10 @@ loop:
}
if newStatus == timerModifiedEarlier {
adjust++
+ updateTimerModifiedEarliest(tpp, when)
}
if adjust != 0 {
- atomic.Xadd(&t.pp.ptr().adjustTimers, adjust)
+ atomic.Xadd(&tpp.adjustTimers, adjust)
}
// Set the new status of the timer.
@@ -636,16 +651,36 @@ func moveTimers(pp *p, timers []*timer) {
// the correct place in the heap. While looking for those timers,
// it also moves timers that have been modified to run later,
// and removes deleted timers. The caller must have locked the timers for pp.
-func adjusttimers(pp *p) {
- if len(pp.timers) == 0 {
- return
- }
+func adjusttimers(pp *p, now int64) {
if atomic.Load(&pp.adjustTimers) == 0 {
if verifyTimers {
verifyTimerHeap(pp)
}
+ // There are no timers to adjust, so it is safe to clear
+ // timerModifiedEarliest. Do so in case it is stale.
+ // Everything will work if we don't do this,
+ // but clearing here may save future calls to adjusttimers.
+ atomic.Store64(&pp.timerModifiedEarliest, 0)
return
}
+
+ // If we haven't yet reached the time of the first timerModifiedEarlier
+ // timer, don't do anything. This speeds up programs that adjust
+ // a lot of timers back and forth if the timers rarely expire.
+ // We'll postpone looking through all the adjusted timers until
+ // one would actually expire.
+ if first := atomic.Load64(&pp.timerModifiedEarliest); first != 0 {
+ if int64(first) > now {
+ if verifyTimers {
+ verifyTimerHeap(pp)
+ }
+ return
+ }
+
+ // We are going to clear all timerModifiedEarlier timers.
+ atomic.Store64(&pp.timerModifiedEarliest, 0)
+ }
+
var moved []*timer
loop:
for i := 0; i < len(pp.timers); i++ {
@@ -718,16 +753,15 @@ func addAdjustedTimers(pp *p, moved []*timer) {
// nobarrierWakeTime looks at P's timers and returns the time when we
// should wake up the netpoller. It returns 0 if there are no timers.
// This function is invoked when dropping a P, and must run without
-// any write barriers. Therefore, if there are any timers that needs
-// to be moved earlier, it conservatively returns the current time.
-// The netpoller M will wake up and adjust timers before sleeping again.
+// any write barriers.
//go:nowritebarrierrec
func nobarrierWakeTime(pp *p) int64 {
- if atomic.Load(&pp.adjustTimers) > 0 {
- return nanotime()
- } else {
- return int64(atomic.Load64(&pp.timer0When))
+ next := int64(atomic.Load64(&pp.timer0When))
+ nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest))
+ if next == 0 || (nextAdj != 0 && nextAdj < next) {
+ next = nextAdj
}
+ return next
}
// runtimer examines the first timer in timers. If it is ready based on now,
@@ -815,6 +849,9 @@ func runOneTimer(pp *p, t *timer, now int64) {
// Leave in heap but adjust next time to fire.
delta := t.when - now
t.when += t.period * (1 + -delta/t.period)
+ if t.when < 0 { // check for overflow.
+ t.when = maxWhen
+ }
siftdownTimer(pp.timers, 0)
if !atomic.Cas(&t.status, timerRunning, timerWaiting) {
badTimer()
@@ -845,6 +882,10 @@ func runOneTimer(pp *p, t *timer, now int64) {
//
// The caller must have locked the timers for pp.
func clearDeletedTimers(pp *p) {
+ // We are going to clear all timerModifiedEarlier timers.
+ // Do this now in case new ones show up while we are looping.
+ atomic.Store64(&pp.timerModifiedEarliest, 0)
+
cdel := int32(0)
cearlier := int32(0)
to := 0
@@ -954,6 +995,21 @@ func updateTimer0When(pp *p) {
}
}
+// updateTimerModifiedEarliest updates the recorded nextwhen field of the
+// earliest known timerModifiedEarlier timer.
+// The timers for pp will not be locked.
+func updateTimerModifiedEarliest(pp *p, nextwhen int64) {
+ for {
+ old := atomic.Load64(&pp.timerModifiedEarliest)
+ if old != 0 && int64(old) < nextwhen {
+ return
+ }
+ if atomic.Cas64(&pp.timerModifiedEarliest, old, uint64(nextwhen)) {
+ return
+ }
+ }
+}
+
// timeSleepUntil returns the time when the next timer should fire,
// and the P that holds the timer heap that that timer is on.
// This is only called by sysmon and checkdead.
@@ -970,48 +1026,17 @@ func timeSleepUntil() (int64, *p) {
continue
}
- c := atomic.Load(&pp.adjustTimers)
- if c == 0 {
- w := int64(atomic.Load64(&pp.timer0When))
- if w != 0 && w < next {
- next = w
- pret = pp
- }
- continue
+ w := int64(atomic.Load64(&pp.timer0When))
+ if w != 0 && w < next {
+ next = w
+ pret = pp
}
- lock(&pp.timersLock)
- for _, t := range pp.timers {
- switch s := atomic.Load(&t.status); s {
- case timerWaiting:
- if t.when < next {
- next = t.when
- }
- case timerModifiedEarlier, timerModifiedLater:
- if t.nextwhen < next {
- next = t.nextwhen
- }
- if s == timerModifiedEarlier {
- c--
- }
- }
- // The timers are sorted, so we only have to check
- // the first timer for each P, unless there are
- // some timerModifiedEarlier timers. The number
- // of timerModifiedEarlier timers is in the adjustTimers
- // field, used to initialize c, above.
- //
- // We don't worry about cases like timerModifying.
- // New timers can show up at any time,
- // so this function is necessarily imprecise.
- // Do a signed check here since we aren't
- // synchronizing the read of pp.adjustTimers
- // with the check of a timer status.
- if int32(c) <= 0 {
- break
- }
+ w = int64(atomic.Load64(&pp.timerModifiedEarliest))
+ if w != 0 && w < next {
+ next = w
+ pret = pp
}
- unlock(&pp.timersLock)
}
unlock(&allpLock)
@@ -1031,6 +1056,9 @@ func siftupTimer(t []*timer, i int) {
badTimer()
}
when := t[i].when
+ if when <= 0 {
+ badTimer()
+ }
tmp := t[i]
for i > 0 {
p := (i - 1) / 4 // parent
@@ -1051,6 +1079,9 @@ func siftdownTimer(t []*timer, i int) {
badTimer()
}
when := t[i].when
+ if when <= 0 {
+ badTimer()
+ }
tmp := t[i]
for {
c := i*4 + 1 // left child
diff --git a/libgo/go/runtime/time_test.go b/libgo/go/runtime/time_test.go
index 9b1922d..254b0f1 100644
--- a/libgo/go/runtime/time_test.go
+++ b/libgo/go/runtime/time_test.go
@@ -23,6 +23,10 @@ func TestFakeTime(t *testing.T) {
t.Skip("faketime not supported for gccgo")
}
+ // Faketime is advanced in checkdead. External linking brings in cgo,
+ // which prevents checkdead from working.
+ testenv.MustInternalLink(t)
+
t.Parallel()
exe, err := buildTestProg(t, "testfaketime", "-tags=faketime")
@@ -41,7 +45,7 @@ func TestFakeTime(t *testing.T) {
}
t.Logf("raw stdout: %q", stdout.String())
- t.Logf("raw stderr: %q", stdout.String())
+ t.Logf("raw stderr: %q", stderr.String())
f1, err1 := parseFakeTime(stdout.Bytes())
if err1 != nil {
diff --git a/libgo/go/runtime/trace.go b/libgo/go/runtime/trace.go
index b05f30a..f13c81a 100644
--- a/libgo/go/runtime/trace.go
+++ b/libgo/go/runtime/trace.go
@@ -13,6 +13,7 @@
package runtime
import (
+ "runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
)
@@ -1058,7 +1059,7 @@ func traceGoStart() {
_g_ := getg().m.curg
_p_ := _g_.m.p
_g_.traceseq++
- if _g_ == _p_.ptr().gcBgMarkWorker.ptr() {
+ if _p_.ptr().gcMarkWorkerMode != gcMarkWorkerNotWorker {
traceEvent(traceEvGoStartLabel, -1, uint64(_g_.goid), _g_.traceseq, trace.markWorkerLabels[_p_.ptr().gcMarkWorkerMode])
} else if _g_.tracelastp == _p_ {
traceEvent(traceEvGoStartLocal, -1, uint64(_g_.goid))
@@ -1141,11 +1142,11 @@ func traceHeapAlloc() {
}
func traceNextGC() {
- if memstats.next_gc == ^uint64(0) {
+ if nextGC := atomic.Load64(&memstats.next_gc); nextGC == ^uint64(0) {
// Heap-based triggering is disabled.
traceEvent(traceEvNextGC, -1, 0)
} else {
- traceEvent(traceEvNextGC, -1, memstats.next_gc)
+ traceEvent(traceEvNextGC, -1, nextGC)
}
}
diff --git a/libgo/go/runtime/trace/annotation.go b/libgo/go/runtime/trace/annotation.go
index 82cb232..6e18bfb 100644
--- a/libgo/go/runtime/trace/annotation.go
+++ b/libgo/go/runtime/trace/annotation.go
@@ -1,3 +1,7 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
package trace
import (
diff --git a/libgo/go/runtime/trace/annotation_test.go b/libgo/go/runtime/trace/annotation_test.go
index 71abbfc..31fccef 100644
--- a/libgo/go/runtime/trace/annotation_test.go
+++ b/libgo/go/runtime/trace/annotation_test.go
@@ -1,3 +1,7 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
package trace_test
import (
diff --git a/libgo/go/runtime/trace/trace_stack_test.go b/libgo/go/runtime/trace/trace_stack_test.go
index f856fdc..be3adc9 100644
--- a/libgo/go/runtime/trace/trace_stack_test.go
+++ b/libgo/go/runtime/trace/trace_stack_test.go
@@ -252,8 +252,7 @@ func TestTraceSymbolize(t *testing.T) {
{trace.EvGoSysCall, []frame{
{"syscall.read", 0},
{"syscall.Read", 0},
- {"internal/poll.(*FD).Read.func1", 0},
- {"internal/poll.ignoringEINTR", 0},
+ {"internal/poll.ignoringEINTRIO", 0},
{"internal/poll.(*FD).Read", 0},
{"os.(*File).read", 0},
{"os.(*File).Read", 0},
diff --git a/libgo/go/runtime/trace/trace_test.go b/libgo/go/runtime/trace/trace_test.go
index 235845d..b316eaf 100644
--- a/libgo/go/runtime/trace/trace_test.go
+++ b/libgo/go/runtime/trace/trace_test.go
@@ -10,7 +10,6 @@ import (
"internal/race"
"internal/trace"
"io"
- "io/ioutil"
"net"
"os"
"runtime"
@@ -586,7 +585,7 @@ func saveTrace(t *testing.T, buf *bytes.Buffer, name string) {
if !*saveTraces {
return
}
- if err := ioutil.WriteFile(name+".trace", buf.Bytes(), 0600); err != nil {
+ if err := os.WriteFile(name+".trace", buf.Bytes(), 0600); err != nil {
t.Errorf("failed to write trace file: %s", err)
}
}
diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go
index ebdbefc..af1676f 100644
--- a/libgo/go/runtime/traceback_gccgo.go
+++ b/libgo/go/runtime/traceback_gccgo.go
@@ -8,6 +8,7 @@
package runtime
import (
+ "internal/bytealg"
"runtime/internal/sys"
"unsafe"
)
@@ -155,7 +156,7 @@ func showfuncinfo(name string, firstFrame bool) bool {
// We want to print those in the traceback.
// But unless GOTRACEBACK > 1 (checked below), still skip
// internal C functions and cgo-generated functions.
- if name != "" && !contains(name, ".") && !hasPrefix(name, "__go_") && !hasPrefix(name, "_cgo_") {
+ if name != "" && bytealg.IndexByteString(name, '.') < 0 && !hasPrefix(name, "__go_") && !hasPrefix(name, "_cgo_") {
return true
}
@@ -178,7 +179,7 @@ func showfuncinfo(name string, firstFrame bool) bool {
return true
}
- return contains(name, ".") && (!hasPrefix(name, "runtime.") || isExportedRuntime(name))
+ return bytealg.IndexByteString(name, '.') >= 0 && (!hasPrefix(name, "runtime.") || isExportedRuntime(name))
}
// isExportedRuntime reports whether name is an exported runtime function.