152 files changed, 14893 insertions, 909 deletions
diff --git a/libgo/go/bytes/bytes_test.go b/libgo/go/bytes/bytes_test.go
index 26eac5e..ad01952 100644
--- a/libgo/go/bytes/bytes_test.go
+++ b/libgo/go/bytes/bytes_test.go
@@ -10,6 +10,7 @@ import (
 	"internal/testenv"
 	"math/rand"
 	"reflect"
+	"runtime"
 	"strings"
 	"testing"
 	"unicode"
@@ -392,7 +393,11 @@ func TestIndexRune(t *testing.T) {
 		}
 	})
 	if allocs != 0 {
-		t.Errorf("expected no allocations, got %f", allocs)
+		if runtime.Compiler == "gccgo" {
+			t.Log("does not work on gccgo without better escape analysis")
+		} else {
+			t.Errorf("expected no allocations, got %f", allocs)
+		}
 	}
 }
 
diff --git a/libgo/go/crypto/rand/eagain.go b/libgo/go/crypto/rand/eagain.go
index 7ed2f47..045d037 100644
--- a/libgo/go/crypto/rand/eagain.go
+++ b/libgo/go/crypto/rand/eagain.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package rand
 
diff --git a/libgo/go/crypto/rand/rand_unix.go b/libgo/go/crypto/rand/rand_unix.go
index 631972b..ec474d3 100644
--- a/libgo/go/crypto/rand/rand_unix.go
+++ b/libgo/go/crypto/rand/rand_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd plan9 solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd plan9 solaris
 
 // Unix cryptographically secure pseudorandom number
 // generator.
diff --git a/libgo/go/crypto/x509/root_aix.go b/libgo/go/crypto/x509/root_aix.go
new file mode 100644
index 0000000..de5702d
--- /dev/null
+++ b/libgo/go/crypto/x509/root_aix.go
@@ -0,0 +1,8 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package x509
+
+// Possible certificate files; stop after finding one.
+var certFiles []string
diff --git a/libgo/go/crypto/x509/root_unix.go b/libgo/go/crypto/x509/root_unix.go
index 7bcb3d6..c44a524 100644
--- a/libgo/go/crypto/x509/root_unix.go
+++ b/libgo/go/crypto/x509/root_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package x509
 
@@ -16,6 +16,7 @@ import (
 var certDirectories = []string{
 	"/etc/ssl/certs",               // SLES10/SLES11, https://golang.org/issue/12139
 	"/system/etc/security/cacerts", // Android
+	"/var/ssl/certs",               // AIX
 }
 
 func (c *Certificate) systemVerify(opts *VerifyOptions) (chains [][]*Certificate, err error) {
diff --git a/libgo/go/go/build/syslist.go b/libgo/go/go/build/syslist.go
index ea316ea..fe9820e 100644
--- a/libgo/go/go/build/syslist.go
+++ b/libgo/go/go/build/syslist.go
@@ -4,5 +4,5 @@
 
 package build
 
-const goosList = "android darwin dragonfly freebsd linux nacl netbsd openbsd plan9 solaris windows zos "
+const goosList = "aix android darwin dragonfly freebsd linux nacl netbsd openbsd plan9 solaris windows zos "
 const goarchList = "386 amd64 amd64p32 arm armbe arm64 arm64be alpha m68k ppc64 ppc64le mips mipsle mips64 mips64le mips64p32 mips64p32le mipso32 mipsn32 mipsn64 mipso64 ppc s390 s390x sparc sparc64 "
diff --git a/libgo/go/math/atan.go b/libgo/go/math/atan.go
index d942bce..4c9eda4 100644
--- a/libgo/go/math/atan.go
+++ b/libgo/go/math/atan.go
@@ -97,6 +97,9 @@ func satan(x float64) float64 {
 func libc_atan(float64) float64
 
 func Atan(x float64) float64 {
+	if x == 0 {
+		return x
+	}
 	return libc_atan(x)
 }
 
diff --git a/libgo/go/math/expm1.go b/libgo/go/math/expm1.go
index a0a62d1..7494043 100644
--- a/libgo/go/math/expm1.go
+++ b/libgo/go/math/expm1.go
@@ -126,6 +126,9 @@ package math
 func libc_expm1(float64) float64
 
 func Expm1(x float64) float64 {
+	if x == 0 {
+		return x
+	}
 	return libc_expm1(x)
 }
 
diff --git a/libgo/go/math/log1p.go b/libgo/go/math/log1p.go
index ef1c7de..044495a 100644
--- a/libgo/go/math/log1p.go
+++ b/libgo/go/math/log1p.go
@@ -97,6 +97,9 @@ package math
 func libc_log1p(float64) float64
 
 func Log1p(x float64) float64 {
+	if x == 0 {
+		return x
+	}
 	return libc_log1p(x)
 }
 
diff --git a/libgo/go/mime/type_unix.go b/libgo/go/mime/type_unix.go
index bb06a77..8e177ca 100644
--- a/libgo/go/mime/type_unix.go
+++ b/libgo/go/mime/type_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package mime
 
diff --git a/libgo/go/net/addrselect.go b/libgo/go/net/addrselect.go
index 1ab9fc53..7c0dfe2 100644
--- a/libgo/go/net/addrselect.go
+++ b/libgo/go/net/addrselect.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 // Minimal RFC 6724 address selection.
 
diff --git a/libgo/go/net/cgo_aix.go b/libgo/go/net/cgo_aix.go
new file mode 100644
index 0000000..4f23d9b
--- /dev/null
+++ b/libgo/go/net/cgo_aix.go
@@ -0,0 +1,13 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build cgo,!netgo
+
+package net
+
+import (
+	"syscall"
+)
+
+const cgoAddrInfoFlags = syscall.AI_CANONNAME
diff --git a/libgo/go/net/cgo_resnew.go b/libgo/go/net/cgo_resnew.go
index ebca1bd..81b39c9 100644
--- a/libgo/go/net/cgo_resnew.go
+++ b/libgo/go/net/cgo_resnew.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build cgo,!netgo
-// +build darwin linux,!android netbsd solaris
+// +build aix darwin linux,!android netbsd solaris
 
 package net
 
diff --git a/libgo/go/net/cgo_sockold.go b/libgo/go/net/cgo_sockold.go
index 432634b..25d4f67 100644
--- a/libgo/go/net/cgo_sockold.go
+++ b/libgo/go/net/cgo_sockold.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build cgo,!netgo
-// +build darwin dragonfly freebsd netbsd openbsd
+// +build aix darwin dragonfly freebsd netbsd openbsd
 
 package net
 
diff --git a/libgo/go/net/cgo_unix.go b/libgo/go/net/cgo_unix.go
index a90aaa9..09cfb2a 100644
--- a/libgo/go/net/cgo_unix.go
+++ b/libgo/go/net/cgo_unix.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build cgo,!netgo
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/cgo_unix_test.go b/libgo/go/net/cgo_unix_test.go
index e861c7a..e579198 100644
--- a/libgo/go/net/cgo_unix_test.go
+++ b/libgo/go/net/cgo_unix_test.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build cgo,!netgo
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/conf.go b/libgo/go/net/conf.go
index c10aafe..a798699 100644
--- a/libgo/go/net/conf.go
+++ b/libgo/go/net/conf.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/dial_unix_test.go b/libgo/go/net/dial_unix_test.go
index 4705254..d5c6dde2 100644
--- a/libgo/go/net/dial_unix_test.go
+++ b/libgo/go/net/dial_unix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/dnsclient_unix.go b/libgo/go/net/dnsclient_unix.go
index 4dd4e16..0647b9c 100644
--- a/libgo/go/net/dnsclient_unix.go
+++ b/libgo/go/net/dnsclient_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 // DNS client: see RFC 1035.
 // Has to be linked into package net for Dial.
diff --git a/libgo/go/net/dnsclient_unix_test.go b/libgo/go/net/dnsclient_unix_test.go
index 85267bb..c66d2d1 100644
--- a/libgo/go/net/dnsclient_unix_test.go
+++ b/libgo/go/net/dnsclient_unix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/dnsconfig_unix.go b/libgo/go/net/dnsconfig_unix.go
index 9c8108d..24487af 100644
--- a/libgo/go/net/dnsconfig_unix.go
+++ b/libgo/go/net/dnsconfig_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 // Read system DNS config from /etc/resolv.conf
 
diff --git a/libgo/go/net/dnsconfig_unix_test.go b/libgo/go/net/dnsconfig_unix_test.go
index 37bdeb0..0797559 100644
--- a/libgo/go/net/dnsconfig_unix_test.go
+++ b/libgo/go/net/dnsconfig_unix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/fd_poll_runtime.go b/libgo/go/net/fd_poll_runtime.go
index 62b69fc..4ea92cb 100644
--- a/libgo/go/net/fd_poll_runtime.go
+++ b/libgo/go/net/fd_poll_runtime.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd windows solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd windows solaris
 
 package net
 
diff --git a/libgo/go/net/fd_posix.go b/libgo/go/net/fd_posix.go
index b4b908a..7230479 100644
--- a/libgo/go/net/fd_posix.go
+++ b/libgo/go/net/fd_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package net
 
diff --git a/libgo/go/net/fd_unix.go b/libgo/go/net/fd_unix.go
index 9bc5ebc..b6ee059 100644
--- a/libgo/go/net/fd_unix.go
+++ b/libgo/go/net/fd_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/file_unix.go b/libgo/go/net/file_unix.go
index 9e581fc..b47a614 100644
--- a/libgo/go/net/file_unix.go
+++ b/libgo/go/net/file_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/hook_unix.go b/libgo/go/net/hook_unix.go
index cf52567..b2522a2 100644
--- a/libgo/go/net/hook_unix.go
+++ b/libgo/go/net/hook_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/interface_stub.go b/libgo/go/net/interface_stub.go
index 3b0a1ae..6d7147e 100644
--- a/libgo/go/net/interface_stub.go
+++ b/libgo/go/net/interface_stub.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build nacl
+// +build aix nacl
 
 package net
 
diff --git a/libgo/go/net/internal/socktest/switch_unix.go b/libgo/go/net/internal/socktest/switch_unix.go
index 14c0c22..8fb15f3 100644
--- a/libgo/go/net/internal/socktest/switch_unix.go
+++ b/libgo/go/net/internal/socktest/switch_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package socktest
 
diff --git a/libgo/go/net/iprawsock_posix.go b/libgo/go/net/iprawsock_posix.go
index 8f4b702..16e65dc 100644
--- a/libgo/go/net/iprawsock_posix.go
+++ b/libgo/go/net/iprawsock_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package net
 
diff --git a/libgo/go/net/ipsock_posix.go b/libgo/go/net/ipsock_posix.go
index ff280c3..05bf939 100644
--- a/libgo/go/net/ipsock_posix.go
+++ b/libgo/go/net/ipsock_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package net
 
diff --git a/libgo/go/net/lookup_unix.go b/libgo/go/net/lookup_unix.go
index be2ced9..f96c8be 100644
--- a/libgo/go/net/lookup_unix.go
+++ b/libgo/go/net/lookup_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/main_unix_test.go b/libgo/go/net/main_unix_test.go
index 0cc129f..8c8f944 100644
--- a/libgo/go/net/main_unix_test.go
+++ b/libgo/go/net/main_unix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/nss.go b/libgo/go/net/nss.go
index 08c3e6a..1650f5e 100644
--- a/libgo/go/net/nss.go
+++ b/libgo/go/net/nss.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/port_unix.go b/libgo/go/net/port_unix.go
index 868d1e4..3120ba1 100644
--- a/libgo/go/net/port_unix.go
+++ b/libgo/go/net/port_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris nacl
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris nacl
 
 // Read system port mappings from /etc/services
 
diff --git a/libgo/go/net/sendfile_stub.go b/libgo/go/net/sendfile_stub.go
index 905f1d6..f043062 100644
--- a/libgo/go/net/sendfile_stub.go
+++ b/libgo/go/net/sendfile_stub.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin nacl netbsd openbsd
+// +build aix darwin nacl netbsd openbsd
 
 package net
 
diff --git a/libgo/go/net/sock_posix.go b/libgo/go/net/sock_posix.go
index 16351e1..6bbfd12 100644
--- a/libgo/go/net/sock_posix.go
+++ b/libgo/go/net/sock_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package net
 
diff --git a/libgo/go/net/sock_stub.go b/libgo/go/net/sock_stub.go
index 5ac1e86..d1ec029 100644
--- a/libgo/go/net/sock_stub.go
+++ b/libgo/go/net/sock_stub.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build nacl solaris
+// +build aix nacl solaris
 
 package net
 
diff --git a/libgo/go/net/sockopt_aix.go b/libgo/go/net/sockopt_aix.go
new file mode 100644
index 0000000..7aef64b
--- /dev/null
+++ b/libgo/go/net/sockopt_aix.go
@@ -0,0 +1,34 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package net
+
+import (
+	"os"
+	"syscall"
+)
+
+// This was copied from sockopt_linux.go
+
+func setDefaultSockopts(s, family, sotype int, ipv6only bool) error {
+	if family == syscall.AF_INET6 && sotype != syscall.SOCK_RAW {
+		// Allow both IP versions even if the OS default
+		// is otherwise. Note that some operating systems
+		// never admit this option.
+		syscall.SetsockoptInt(s, syscall.IPPROTO_IPV6, syscall.IPV6_V6ONLY, boolint(ipv6only))
+	}
+	// Allow broadcast.
+	return os.NewSyscallError("setsockopt", syscall.SetsockoptInt(s, syscall.SOL_SOCKET, syscall.SO_BROADCAST, 1))
+}
+
+func setDefaultListenerSockopts(s int) error {
+	// Allow reuse of recently-used addresses.
+	return os.NewSyscallError("setsockopt", syscall.SetsockoptInt(s, syscall.SOL_SOCKET, syscall.SO_REUSEADDR, 1))
+}
+
+func setDefaultMulticastSockopts(s int) error {
+	// Allow multicast UDP and raw IP datagram sockets to listen
+	// concurrently across multiple listeners.
+	return os.NewSyscallError("setsockopt", syscall.SetsockoptInt(s, syscall.SOL_SOCKET, syscall.SO_REUSEADDR, 1))
+}
diff --git a/libgo/go/net/sockopt_posix.go b/libgo/go/net/sockopt_posix.go
index cd3d562..cacd048 100644
--- a/libgo/go/net/sockopt_posix.go
+++ b/libgo/go/net/sockopt_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris windows
 
 package net
 
diff --git a/libgo/go/net/sockoptip_aix.go b/libgo/go/net/sockoptip_aix.go
new file mode 100644
index 0000000..1e28fe6
--- /dev/null
+++ b/libgo/go/net/sockoptip_aix.go
@@ -0,0 +1,15 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package net
+
+import "syscall"
+
+func setIPv4MulticastInterface(fd *netFD, ifi *Interface) error {
+	return syscall.ENOPROTOOPT
+}
+
+func setIPv4MulticastLoopback(fd *netFD, v bool) error {
+	return syscall.ENOPROTOOPT
+}
diff --git a/libgo/go/net/sockoptip_posix.go b/libgo/go/net/sockoptip_posix.go
index d508860..4afd4c8 100644
--- a/libgo/go/net/sockoptip_posix.go
+++ b/libgo/go/net/sockoptip_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd windows
+// +build aix darwin dragonfly freebsd linux netbsd openbsd windows
 
 package net
 
diff --git a/libgo/go/net/sys_cloexec.go b/libgo/go/net/sys_cloexec.go
index ba266e6..f2ea842 100644
--- a/libgo/go/net/sys_cloexec.go
+++ b/libgo/go/net/sys_cloexec.go
@@ -5,7 +5,7 @@
 // This file implements sysSocket and accept for platforms that do not
 // provide a fast path for setting SetNonblock and CloseOnExec.
 
-// +build darwin dragonfly nacl netbsd openbsd solaris
+// +build aix darwin dragonfly nacl netbsd openbsd solaris
 
 package net
 
diff --git a/libgo/go/net/tcpsock_posix.go b/libgo/go/net/tcpsock_posix.go
index 9641e5c..7533c24 100644
--- a/libgo/go/net/tcpsock_posix.go
+++ b/libgo/go/net/tcpsock_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package net
 
diff --git a/libgo/go/net/tcpsockopt_posix.go b/libgo/go/net/tcpsockopt_posix.go
index 805b56b..36866ac 100644
--- a/libgo/go/net/tcpsockopt_posix.go
+++ b/libgo/go/net/tcpsockopt_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris windows
 
 package net
 
diff --git a/libgo/go/net/tcpsockopt_unix.go b/libgo/go/net/tcpsockopt_unix.go
index 8d44fb2..46e5e6d 100644
--- a/libgo/go/net/tcpsockopt_unix.go
+++ b/libgo/go/net/tcpsockopt_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build freebsd linux netbsd
+// +build aix freebsd linux netbsd
 
 package net
 
diff --git a/libgo/go/net/udpsock_posix.go b/libgo/go/net/udpsock_posix.go
index 72aadca..0c905af 100644
--- a/libgo/go/net/udpsock_posix.go
+++ b/libgo/go/net/udpsock_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package net
 
diff --git a/libgo/go/net/unixsock_posix.go b/libgo/go/net/unixsock_posix.go
index a8f892e..945aa03 100644
--- a/libgo/go/net/unixsock_posix.go
+++ b/libgo/go/net/unixsock_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package net
 
diff --git a/libgo/go/os/dir_gccgo.go b/libgo/go/os/dir_gccgo.go
index d811c9f..8923f0e 100644
--- a/libgo/go/os/dir_gccgo.go
+++ b/libgo/go/os/dir_gccgo.go
@@ -6,6 +6,7 @@ package os
 
 import (
 	"io"
+	"runtime"
 	"sync/atomic"
 	"syscall"
 	"unsafe"
@@ -81,6 +82,11 @@ func (file *File) readdirnames(n int) (names []string, err error) {
 		syscall.Entersyscall()
 		i := libc_readdir_r(file.dirinfo.dir, entryDirent, pr)
 		syscall.Exitsyscall()
+		// On AIX when readdir_r hits EOF it sets dirent to nil and returns 9.
+		//  https://www.ibm.com/support/knowledgecenter/ssw_aix_71/com.ibm.aix.basetrf2/readdir_r.htm
+		if runtime.GOOS == "aix" && i == 9 && dirent == nil {
+			break
+		}
 		if i != 0 {
 			return names, NewSyscallError("readdir_r", i)
 		}
diff --git a/libgo/go/os/dir_largefile.go b/libgo/go/os/dir_largefile.go
index 2873342..75df6a4 100644
--- a/libgo/go/os/dir_largefile.go
+++ b/libgo/go/os/dir_largefile.go
@@ -5,7 +5,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build linux solaris,386 solaris,sparc
+// +build aix linux solaris,386 solaris,sparc
 
 package os
 
diff --git a/libgo/go/os/dir_regfile.go b/libgo/go/os/dir_regfile.go
index 8b17f38..02ddd7b 100644
--- a/libgo/go/os/dir_regfile.go
+++ b/libgo/go/os/dir_regfile.go
@@ -5,6 +5,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !aix
 // +build !linux
 // +build !solaris !386
 // +build !solaris !sparc
diff --git a/libgo/go/os/dir_unix.go b/libgo/go/os/dir_unix.go
index cd42f59..2dc6a89 100644
--- a/libgo/go/os/dir_unix.go
+++ b/libgo/go/os/dir_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package os
 
diff --git a/libgo/go/os/error_unix.go b/libgo/go/os/error_unix.go
index be1440c..2349851 100644
--- a/libgo/go/os/error_unix.go
+++ b/libgo/go/os/error_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package os
 
diff --git a/libgo/go/os/exec/lp_unix.go b/libgo/go/os/exec/lp_unix.go
index 7a30275..20ce7a4 100644
--- a/libgo/go/os/exec/lp_unix.go
+++ b/libgo/go/os/exec/lp_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package exec
 
diff --git a/libgo/go/os/exec_posix.go b/libgo/go/os/exec_posix.go
index 3cf38b68..9e792b4 100644
--- a/libgo/go/os/exec_posix.go
+++ b/libgo/go/os/exec_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package os
 
diff --git a/libgo/go/os/exec_unix.go b/libgo/go/os/exec_unix.go
index c4999db..d6433bf 100644
--- a/libgo/go/os/exec_unix.go
+++ b/libgo/go/os/exec_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package os
 
diff --git a/libgo/go/os/executable_path.go b/libgo/go/os/executable_path.go
new file mode 100644
index 0000000..117320d
--- /dev/null
+++ b/libgo/go/os/executable_path.go
@@ -0,0 +1,104 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix
+
+package os
+
+// We query the working directory at init, to use it later to search for the
+// executable file
+// errWd will be checked later, if we need to use initWd
+var initWd, errWd = Getwd()
+
+func executable() (string, error) {
+	var err error
+	var exePath string
+	if len(Args) == 0 || Args[0] == "" {
+		return "", ErrNotExist
+	}
+	// Args[0] is an absolute path : this is the executable
+	if IsPathSeparator(Args[0][0]) {
+		exePath = Args[0]
+	} else {
+		for i := 1; i < len(Args[0]); i++ {
+			// Args[0] is a relative path : append current directory
+			if IsPathSeparator(Args[0][i]) {
+				if errWd != nil {
+					return "", errWd
+				}
+				exePath = initWd + string(PathSeparator) + Args[0]
+				break
+			}
+		}
+	}
+	if exePath != "" {
+		err = isExecutable(exePath)
+		if err == nil {
+			return exePath, nil
+		}
+		// File does not exist or is not executable,
+		// this is an unexpected situation !
+		return "", err
+	}
+	// Search for executable in $PATH
+	for _, dir := range splitPathList(Getenv("PATH")) {
+		if len(dir) == 0 {
+			continue
+		}
+		if !IsPathSeparator(dir[0]) {
+			if errWd != nil {
+				return "", errWd
+			}
+			dir = initWd + string(PathSeparator) + dir
+		}
+		exePath = dir + string(PathSeparator) + Args[0]
+		err = isExecutable(exePath)
+		if err == nil {
+			return exePath, nil
+		}
+		if err == ErrPermission {
+			return "", err
+		}
+	}
+	return "", ErrNotExist
+}
+
+// isExecutable returns an error if a given file is not an executable.
+func isExecutable(path string) error {
+	stat, err := Stat(path)
+	if err != nil {
+		return err
+	}
+	mode := stat.Mode()
+	if !mode.IsRegular() {
+		return ErrPermission
+	}
+	if (mode & 0111) != 0 {
+		return nil
+	}
+	return ErrPermission
+}
+
+// splitPathList splits a path list.
+// This is based on genSplit from strings/strings.go
+func splitPathList(pathList string) []string {
+	n := 1
+	for i := 0; i < len(pathList); i++ {
+		if pathList[i] == PathListSeparator {
+			n++
+		}
+	}
+	start := 0
+	a := make([]string, n)
+	na := 0
+	for i := 0; i+1 <= len(pathList) && na+1 < n; i++ {
+		if pathList[i] == PathListSeparator {
+			a[na] = pathList[start:i]
+			na++
+			start = i + 1
+		}
+	}
+	a[na] = pathList[start:]
+	return a[:na+1]
+}
diff --git a/libgo/go/os/file_posix.go b/libgo/go/os/file_posix.go
index d817f34..6634112 100644
--- a/libgo/go/os/file_posix.go
+++ b/libgo/go/os/file_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package os
 
@@ -19,6 +19,10 @@ func Readlink(name string) (string, error) {
 	for len := 128; ; len *= 2 {
 		b := make([]byte, len)
 		n, e := fixCount(syscall.Readlink(fixLongPath(name), b))
+		// buffer too small
+		if e == syscall.ERANGE {
+			continue
+		}
 		if e != nil {
 			return "", &PathError{"readlink", name, e}
 		}
diff --git a/libgo/go/os/file_unix.go b/libgo/go/os/file_unix.go
index 54b5dfd..1bba4ed 100644
--- a/libgo/go/os/file_unix.go
+++ b/libgo/go/os/file_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package os
 
diff --git a/libgo/go/os/path_unix.go b/libgo/go/os/path_unix.go
index ecf098c..bc0f239 100644
--- a/libgo/go/os/path_unix.go
+++ b/libgo/go/os/path_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package os
 
diff --git a/libgo/go/os/pipe_bsd.go b/libgo/go/os/pipe_bsd.go
index 3b81ed2..ebe198b 100644
--- a/libgo/go/os/pipe_bsd.go
+++ b/libgo/go/os/pipe_bsd.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd nacl netbsd openbsd solaris
 
 package os
 
diff --git a/libgo/go/os/signal/signal_test.go b/libgo/go/os/signal/signal_test.go
index 406102c..c8409e7 100644
--- a/libgo/go/os/signal/signal_test.go
+++ b/libgo/go/os/signal/signal_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package signal
 
diff --git a/libgo/go/os/signal/signal_unix.go b/libgo/go/os/signal/signal_unix.go
index 01b1b14..5ec7e97 100644
--- a/libgo/go/os/signal/signal_unix.go
+++ b/libgo/go/os/signal/signal_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package signal
 
diff --git a/libgo/go/os/stat.go b/libgo/go/os/stat.go
index 59cac9c..564215b 100644
--- a/libgo/go/os/stat.go
+++ b/libgo/go/os/stat.go
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !aix
 // +build !darwin
 // +build !freebsd
 // +build !linux
diff --git a/libgo/go/os/stat_atim.go b/libgo/go/os/stat_atim.go
index ef8a574..82481c0 100644
--- a/libgo/go/os/stat_atim.go
+++ b/libgo/go/os/stat_atim.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build linux openbsd solaristag
+// +build aix linux openbsd solaristag
 
 package os
 
diff --git a/libgo/go/os/stat_unix.go b/libgo/go/os/stat_unix.go
index 1733d3f..043aefe 100644
--- a/libgo/go/os/stat_unix.go
+++ b/libgo/go/os/stat_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package os
 
diff --git a/libgo/go/os/sys_uname.go b/libgo/go/os/sys_uname.go
index 71fa867..89fbff8 100644
--- a/libgo/go/os/sys_uname.go
+++ b/libgo/go/os/sys_uname.go
@@ -4,7 +4,7 @@
 
 // For systems which only store the hostname in uname (Solaris).
 
-// +build solaris irix rtems
+// +build aix solaris irix rtems
 
 package os
 
diff --git a/libgo/go/os/sys_unix.go b/libgo/go/os/sys_unix.go
index 39c20dc..4caf8bd 100644
--- a/libgo/go/os/sys_unix.go
+++ b/libgo/go/os/sys_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build dragonfly linux netbsd openbsd solaris
+// +build aix dragonfly linux netbsd openbsd solaris
 
 package os
 
diff --git a/libgo/go/os/user/decls_aix.go b/libgo/go/os/user/decls_aix.go
new file mode 100644
index 0000000..64455b5
--- /dev/null
+++ b/libgo/go/os/user/decls_aix.go
@@ -0,0 +1,24 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package user
+
+import "syscall"
+
+// Declarations for the libc functions on AIX.
+
+//extern _posix_getpwnam_r
+func libc_getpwnam_r(name *byte, pwd *syscall.Passwd, buf *byte, buflen syscall.Size_t, result **syscall.Passwd) int
+
+//extern _posix_getpwuid_r
+func libc_getpwuid_r(uid syscall.Uid_t, pwd *syscall.Passwd, buf *byte, buflen syscall.Size_t, result **syscall.Passwd) int
+
+//extern _posix_getgrnam_r
+func libc_getgrnam_r(name *byte, grp *syscall.Group, buf *byte, buflen syscall.Size_t, result **syscall.Group) int
+
+//extern _posix_getgrgid_r
+func libc_getgrgid_r(gid syscall.Gid_t, grp *syscall.Group, buf *byte, buflen syscall.Size_t, result **syscall.Group) int
+
+//extern getgrset
+func libc_getgrset(user *byte) *byte
diff --git a/libgo/go/os/user/listgroups_aix.go b/libgo/go/os/user/listgroups_aix.go
new file mode 100644
index 0000000..5b9f3f9
--- /dev/null
+++ b/libgo/go/os/user/listgroups_aix.go
@@ -0,0 +1,11 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package user
+
+import "fmt"
+
+func listGroups(u *User) ([]string, error) {
+	return nil, fmt.Errorf("user: list groups for %s: not supported on AIX", u.Username)
+}
diff --git a/libgo/go/os/user/lookup_unix.go b/libgo/go/os/user/lookup_unix.go
index 8881366..9670ada 100644
--- a/libgo/go/os/user/lookup_unix.go
+++ b/libgo/go/os/user/lookup_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd !android,linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd !android,linux netbsd openbsd solaris
 // +build cgo
 
 package user
diff --git a/libgo/go/os/wait_unimp.go b/libgo/go/os/wait_unimp.go
index 7059e59..0378b83 100644
--- a/libgo/go/os/wait_unimp.go
+++ b/libgo/go/os/wait_unimp.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build dragonfly nacl netbsd openbsd solaris
+// +build aix dragonfly nacl netbsd openbsd solaris
 
 package os
 
diff --git a/libgo/go/path/filepath/path_unix.go b/libgo/go/path/filepath/path_unix.go
index d77ff24..2d407a8 100644
--- a/libgo/go/path/filepath/path_unix.go
+++ b/libgo/go/path/filepath/path_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package filepath
 
diff --git a/libgo/go/reflect/type.go b/libgo/go/reflect/type.go
index 0325260..3ae0f18 100644
--- a/libgo/go/reflect/type.go
+++ b/libgo/go/reflect/type.go
@@ -259,20 +259,21 @@ const (
 // with a unique tag like `reflect:"array"` or `reflect:"ptr"`
 // so that code cannot convert from, say, *arrayType to *ptrType.
 type rtype struct {
-	kind       uint8 // enumeration for C
-	align      int8  // alignment of variable with this type
-	fieldAlign uint8 // alignment of struct field with this type
-	_          uint8 // unused/padding
 	size       uintptr
-	hash       uint32 // hash of type; avoids computation in hash tables
+	ptrdata    uintptr // size of memory prefix holding all pointers
+	hash       uint32  // hash of type; avoids computation in hash tables
+	kind       uint8   // enumeration for C
+	align      int8    // alignment of variable with this type
+	fieldAlign uint8   // alignment of struct field with this type
+	_          uint8   // unused/padding
 
 	hashfn  func(unsafe.Pointer, uintptr) uintptr     // hash function
 	equalfn func(unsafe.Pointer, unsafe.Pointer) bool // equality function
 
-	gc            unsafe.Pointer // garbage collection data
-	string        *string        // string form; unnecessary  but undeniably useful
-	*uncommonType                // (relatively) uncommon fields
-	ptrToThis     *rtype         // type for pointer to this type, if used in binary or has methods
+	gcdata        *byte   // garbage collection data
+	string        *string // string form; unnecessary  but undeniably useful
+	*uncommonType         // (relatively) uncommon fields
+	ptrToThis     *rtype  // type for pointer to this type, if used in binary or has methods
 }
 
 // Method on non-interface type
@@ -382,24 +383,6 @@ type structType struct {
 	fields []structField // sorted by offset
 }
 
-// NOTE: These are copied from ../runtime/mgc0.h.
-// They must be kept in sync.
-const (
-	_GC_END = iota
-	_GC_PTR
-	_GC_APTR
-	_GC_ARRAY_START
-	_GC_ARRAY_NEXT
-	_GC_CALL
-	_GC_CHAN_PTR
-	_GC_STRING
-	_GC_EFACE
-	_GC_IFACE
-	_GC_SLICE
-	_GC_REGION
-	_GC_NUM_INSTR
-)
-
 /*
  * The compiler knows the exact layout of all the data structures above.
  * The compiler does not know about the data structures and methods below.
@@ -1098,32 +1081,6 @@ var ptrMap struct {
 	m map[*rtype]*ptrType
 }
 
-// garbage collection bytecode program for pointer to memory without pointers.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type ptrDataGC struct {
-	width uintptr // sizeof(ptr)
-	op    uintptr // _GC_APTR
-	off   uintptr // 0
-	end   uintptr // _GC_END
-}
-
-var ptrDataGCProg = ptrDataGC{
-	width: unsafe.Sizeof((*byte)(nil)),
-	op:    _GC_APTR,
-	off:   0,
-	end:   _GC_END,
-}
-
-// garbage collection bytecode program for pointer to memory with pointers.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type ptrGC struct {
-	width  uintptr        // sizeof(ptr)
-	op     uintptr        // _GC_PTR
-	off    uintptr        // 0
-	elemgc unsafe.Pointer // element gc type
-	end    uintptr        // _GC_END
-}
-
 // PtrTo returns the pointer type with element t.
 // For example, if t represents type Foo, PtrTo(t) represents *Foo.
 func PtrTo(t Type) Type {
@@ -1189,18 +1146,6 @@ func (t *rtype) ptrTo() *rtype {
 	pp.ptrToThis = nil
 	pp.elem = t
 
-	if t.kind&kindNoPointers != 0 {
-		pp.gc = unsafe.Pointer(&ptrDataGCProg)
-	} else {
-		pp.gc = unsafe.Pointer(&ptrGC{
-			width:  pp.size,
-			op:     _GC_PTR,
-			off:    0,
-			elemgc: t.gc,
-			end:    _GC_END,
-		})
-	}
-
 	q := canonicalize(&pp.rtype)
 	p = (*ptrType)(unsafe.Pointer(q.(*rtype)))
 
@@ -1507,16 +1452,6 @@ func cachePut(k cacheKey, t *rtype) Type {
 	return t
 }
 
-// garbage collection bytecode program for chan.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type chanGC struct {
-	width uintptr // sizeof(map)
-	op    uintptr // _GC_CHAN_PTR
-	off   uintptr // 0
-	typ   *rtype  // map type
-	end   uintptr // _GC_END
-}
-
 // The funcLookupCache caches FuncOf lookups.
 // FuncOf does not share the common lookupCache since cacheKey is not
 // sufficient to represent functions unambiguously.
@@ -1584,17 +1519,6 @@ func ChanOf(dir ChanDir, t Type) Type {
 	ch.uncommonType = nil
 	ch.ptrToThis = nil
 
-	ch.gc = unsafe.Pointer(&chanGC{
-		width: ch.size,
-		op:    _GC_CHAN_PTR,
-		off:   0,
-		typ:   &ch.rtype,
-		end:   _GC_END,
-	})
-
-	// INCORRECT. Uncomment to check that TestChanOfGC fails when ch.gc is wrong.
-	// ch.gc = unsafe.Pointer(&badGC{width: ch.size, end: _GC_END})
-
 	return cachePut(ckey, &ch.rtype)
 }
 
@@ -1733,9 +1657,6 @@ func FuncOf(in, out []Type, variadic bool) Type {
 	ft.uncommonType = nil
 	ft.ptrToThis = nil
 
-	// TODO(cmang): Generate GC data for funcs.
-	ft.gc = unsafe.Pointer(&ptrDataGCProg)
-
 	funcLookupCache.m[hash] = append(funcLookupCache.m[hash], &ft.rtype)
 
 	return toType(&ft.rtype)
@@ -1859,8 +1780,8 @@ func bucketOf(ktyp, etyp *rtype) *rtype {
 	// and it's easier to generate a pointer bitmap than a GC program.
 	// Note that since the key and value are known to be <= 128 bytes,
 	// they're guaranteed to have bitmaps instead of GC programs.
-	// var gcdata *byte
-	// var ptrdata uintptr
+	var gcdata *byte
+	var ptrdata uintptr
 
 	size := bucketSize
 	size = align(size, uintptr(ktyp.fieldAlign))
@@ -1875,37 +1796,63 @@ func bucketOf(ktyp, etyp *rtype) *rtype {
 	if maxAlign > ptrSize {
 		size = align(size, maxAlign)
 		size += align(ptrSize, maxAlign) - ptrSize
+	} else if maxAlign < ptrSize {
+		size = align(size, ptrSize)
+		maxAlign = ptrSize
 	}
 
 	ovoff := size
 	size += ptrSize
-	if maxAlign < ptrSize {
-		maxAlign = ptrSize
-	}
 
-	var gcPtr unsafe.Pointer
 	if kind != kindNoPointers {
-		gc := []uintptr{size}
-		base := bucketSize
-		base = align(base, uintptr(ktyp.fieldAlign))
+		nptr := size / ptrSize
+		mask := make([]byte, (nptr+7)/8)
+		psize := bucketSize
+		psize = align(psize, uintptr(ktyp.fieldAlign))
+		base := psize / ptrSize
+
 		if ktyp.kind&kindNoPointers == 0 {
-			gc = append(gc, _GC_ARRAY_START, base, bucketSize, ktyp.size)
-			gc = appendGCProgram(gc, ktyp, 0)
-			gc = append(gc, _GC_ARRAY_NEXT)
+			if ktyp.kind&kindGCProg != 0 {
+				panic("reflect: unexpected GC program in MapOf")
+			}
+			kmask := (*[16]byte)(unsafe.Pointer(ktyp.gcdata))
+			for i := uintptr(0); i < ktyp.size/ptrSize; i++ {
+				if (kmask[i/8]>>(i%8))&1 != 0 {
+					for j := uintptr(0); j < bucketSize; j++ {
+						word := base + j*ktyp.size/ptrSize + i
+						mask[word/8] |= 1 << (word % 8)
+					}
+				}
+			}
 		}
-		base += ktyp.size * bucketSize
-		base = align(base, uintptr(etyp.fieldAlign))
+		psize += bucketSize * ktyp.size
+		psize = align(psize, uintptr(etyp.fieldAlign))
+		base = psize / ptrSize
+
 		if etyp.kind&kindNoPointers == 0 {
-			gc = append(gc, _GC_ARRAY_START, base, bucketSize, etyp.size)
-			gc = appendGCProgram(gc, etyp, 0)
-			gc = append(gc, _GC_ARRAY_NEXT)
+			if etyp.kind&kindGCProg != 0 {
+				panic("reflect: unexpected GC program in MapOf")
+			}
+			emask := (*[16]byte)(unsafe.Pointer(etyp.gcdata))
+			for i := uintptr(0); i < etyp.size/ptrSize; i++ {
+				if (emask[i/8]>>(i%8))&1 != 0 {
+					for j := uintptr(0); j < bucketSize; j++ {
+						word := base + j*etyp.size/ptrSize + i
+						mask[word/8] |= 1 << (word % 8)
+					}
+				}
+			}
+		}
+
+		word := ovoff / ptrSize
+		mask[word/8] |= 1 << (word % 8)
+		gcdata = &mask[0]
+		ptrdata = (word + 1) * ptrSize
+
+		// overflow word must be last
+		if ptrdata != size {
+			panic("reflect: bad layout computation in MapOf")
 		}
-		gc = append(gc, _GC_APTR, ovoff, _GC_END)
-		gcPtr = unsafe.Pointer(&gc[0])
-	} else {
-		// No pointers in bucket.
-		gc := [...]uintptr{size, _GC_END}
-		gcPtr = unsafe.Pointer(&gc[0])
 	}
 
 	b := &rtype{
@@ -1913,102 +1860,14 @@ func bucketOf(ktyp, etyp *rtype) *rtype {
 		fieldAlign: uint8(maxAlign),
 		size:       size,
 		kind:       kind,
-		gc:         gcPtr,
+		ptrdata:    ptrdata,
+		gcdata:     gcdata,
 	}
 	s := "bucket(" + *ktyp.string + "," + *etyp.string + ")"
 	b.string = &s
 	return b
 }
 
-// Take the GC program for "t" and append it to the GC program "gc".
-func appendGCProgram(gc []uintptr, t *rtype, offset uintptr) []uintptr {
-	p := t.gc
-	p = unsafe.Pointer(uintptr(p) + unsafe.Sizeof(uintptr(0))) // skip size
-loop:
-	for {
-		var argcnt int
-		switch *(*uintptr)(p) {
-		case _GC_END:
-			// Note: _GC_END not included in append
-			break loop
-		case _GC_ARRAY_NEXT:
-			argcnt = 0
-		case _GC_APTR, _GC_STRING, _GC_EFACE, _GC_IFACE:
-			argcnt = 1
-		case _GC_PTR, _GC_CALL, _GC_CHAN_PTR, _GC_SLICE:
-			argcnt = 2
-		case _GC_ARRAY_START, _GC_REGION:
-			argcnt = 3
-		default:
-			panic("unknown GC program op for " + *t.string + ": " + strconv.FormatUint(*(*uint64)(p), 10))
-		}
-		for i := 0; i < argcnt+1; i++ {
-			v := *(*uintptr)(p)
-			if i == 1 {
-				v += offset
-			}
-			gc = append(gc, v)
-			p = unsafe.Pointer(uintptr(p) + unsafe.Sizeof(uintptr(0)))
-		}
-	}
-	return gc
-}
-func hMapOf(bucket *rtype) *rtype {
-	ptrsize := unsafe.Sizeof(uintptr(0))
-
-	// make gc program & compute hmap size
-	gc := make([]uintptr, 1)           // first entry is size, filled in at the end
-	offset := unsafe.Sizeof(uint(0))   // count
-	offset += unsafe.Sizeof(uint32(0)) // flags
-	offset += unsafe.Sizeof(uint32(0)) // hash0
-	offset += unsafe.Sizeof(uint8(0))  // B
-	offset += unsafe.Sizeof(uint8(0))  // keysize
-	offset += unsafe.Sizeof(uint8(0))  // valuesize
-	offset = (offset + 1) / 2 * 2
-	offset += unsafe.Sizeof(uint16(0)) // bucketsize
-	offset = (offset + ptrsize - 1) / ptrsize * ptrsize
-	// gc = append(gc, _GC_PTR, offset, uintptr(bucket.gc)) // buckets
-	offset += ptrsize
-	// gc = append(gc, _GC_PTR, offset, uintptr(bucket.gc)) // oldbuckets
-	offset += ptrsize
-	offset += ptrsize // nevacuate
-	gc = append(gc, _GC_END)
-	gc[0] = offset
-
-	h := new(rtype)
-	h.size = offset
-	// h.gc = unsafe.Pointer(&gc[0])
-	s := "hmap(" + *bucket.string + ")"
-	h.string = &s
-	return h
-}
-
-// garbage collection bytecode program for slice of non-zero-length values.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type sliceGC struct {
-	width  uintptr        // sizeof(slice)
-	op     uintptr        // _GC_SLICE
-	off    uintptr        // 0
-	elemgc unsafe.Pointer // element gc program
-	end    uintptr        // _GC_END
-}
-
-// garbage collection bytecode program for slice of zero-length values.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type sliceEmptyGC struct {
-	width uintptr // sizeof(slice)
-	op    uintptr // _GC_APTR
-	off   uintptr // 0
-	end   uintptr // _GC_END
-}
-
-var sliceEmptyGCProg = sliceEmptyGC{
-	width: unsafe.Sizeof([]byte(nil)),
-	op:    _GC_APTR,
-	off:   0,
-	end:   _GC_END,
-}
-
 // SliceOf returns the slice type with element type t.
 // For example, if t represents int, SliceOf(t) represents []int.
 func SliceOf(t Type) Type {
@@ -2037,21 +1896,6 @@ func SliceOf(t Type) Type {
 	slice.uncommonType = nil
 	slice.ptrToThis = nil
 
-	if typ.size == 0 {
-		slice.gc = unsafe.Pointer(&sliceEmptyGCProg)
-	} else {
-		slice.gc = unsafe.Pointer(&sliceGC{
-			width:  slice.size,
-			op:     _GC_SLICE,
-			off:    0,
-			elemgc: typ.gc,
-			end:    _GC_END,
-		})
-	}
-
-	// INCORRECT. Uncomment to check that TestSliceOfOfGC fails when slice.gc is wrong.
-	// slice.gc = unsafe.Pointer(&badGC{width: slice.size, end: _GC_END})
-
 	return cachePut(ckey, &slice.rtype)
 }
 
@@ -2073,15 +1917,18 @@ var structLookupCache struct {
 // This limitation may be lifted in a future version.
 func StructOf(fields []StructField) Type {
 	var (
-		hash     = uint32(0)
-		size     uintptr
-		typalign int8
+		hash       = uint32(0)
+		size       uintptr
+		typalign   int8
+		comparable = true
+		hashable   = true
 
 		fs   = make([]structField, len(fields))
 		repr = make([]byte, 0, 64)
 		fset = map[string]struct{}{} // fields' names
 
-		hasPtr = false // records whether at least one struct-field is a pointer
+		hasPtr    = false // records whether at least one struct-field is a pointer
+		hasGCProg = false // records whether a struct-field type has a GCProg
 	)
 
 	lastzero := uintptr(0)
@@ -2092,6 +1939,9 @@ func StructOf(fields []StructField) Type {
 		}
 		f := runtimeStructField(field)
 		ft := f.typ
+		if ft.kind&kindGCProg != 0 {
+			hasGCProg = true
+		}
 		if ft.pointers() {
 			hasPtr = true
 		}
@@ -2156,6 +2006,9 @@ func StructOf(fields []StructField) Type {
 			repr = append(repr, ';')
 		}
 
+		comparable = comparable && (ft.equalfn != nil)
+		hashable = hashable && (ft.hashfn != nil)
+
 		f.offset = align(size, uintptr(ft.fieldAlign))
 		if int8(ft.fieldAlign) > typalign {
 			typalign = int8(ft.fieldAlign)
@@ -2228,36 +2081,95 @@ func StructOf(fields []StructField) Type {
 	typ.fieldAlign = uint8(typalign)
 	if !hasPtr {
 		typ.kind |= kindNoPointers
-		gc := [...]uintptr{size, _GC_END}
-		typ.gc = unsafe.Pointer(&gc[0])
 	} else {
 		typ.kind &^= kindNoPointers
-		gc := []uintptr{size}
-		for _, ft := range fs {
-			gc = appendGCProgram(gc, ft.typ, ft.offset)
+	}
+
+	if hasGCProg {
+		lastPtrField := 0
+		for i, ft := range fs {
+			if ft.typ.pointers() {
+				lastPtrField = i
+			}
+		}
+		prog := []byte{0, 0, 0, 0} // will be length of prog
+		for i, ft := range fs {
+			if i > lastPtrField {
+				// gcprog should not include anything for any field after
+				// the last field that contains pointer data
+				break
+			}
+			// FIXME(sbinet) handle padding, fields smaller than a word
+			elemGC := (*[1 << 30]byte)(unsafe.Pointer(ft.typ.gcdata))[:]
+			elemPtrs := ft.typ.ptrdata / ptrSize
+			switch {
+			case ft.typ.kind&kindGCProg == 0 && ft.typ.ptrdata != 0:
+				// Element is small with pointer mask; use as literal bits.
+				mask := elemGC
+				// Emit 120-bit chunks of full bytes (max is 127 but we avoid using partial bytes).
+				var n uintptr
+				for n := elemPtrs; n > 120; n -= 120 {
+					prog = append(prog, 120)
+					prog = append(prog, mask[:15]...)
+					mask = mask[15:]
+				}
+				prog = append(prog, byte(n))
+				prog = append(prog, mask[:(n+7)/8]...)
+			case ft.typ.kind&kindGCProg != 0:
+				// Element has GC program; emit one element.
+				elemProg := elemGC[4 : 4+*(*uint32)(unsafe.Pointer(&elemGC[0]))-1]
+				prog = append(prog, elemProg...)
+			}
+			// Pad from ptrdata to size.
+			elemWords := ft.typ.size / ptrSize
+			if elemPtrs < elemWords {
+				// Emit literal 0 bit, then repeat as needed.
+				prog = append(prog, 0x01, 0x00)
+				if elemPtrs+1 < elemWords {
+					prog = append(prog, 0x81)
+					prog = appendVarint(prog, elemWords-elemPtrs-1)
+				}
+			}
+		}
+		*(*uint32)(unsafe.Pointer(&prog[0])) = uint32(len(prog) - 4)
+		typ.kind |= kindGCProg
+		typ.gcdata = &prog[0]
+	} else {
+		typ.kind &^= kindGCProg
+		bv := new(bitVector)
+		addTypeBits(bv, 0, typ.common())
+		if len(bv.data) > 0 {
+			typ.gcdata = &bv.data[0]
 		}
-		gc = append(gc, _GC_END)
-		typ.gc = unsafe.Pointer(&gc[0])
 	}
+	typ.ptrdata = typeptrdata(typ.common())
 
-	typ.hashfn = func(p unsafe.Pointer, seed uintptr) uintptr {
-		ret := seed
-		for _, ft := range typ.fields {
-			o := unsafe.Pointer(uintptr(p) + ft.offset)
-			ret = ft.typ.hashfn(o, ret)
+	if hashable {
+		typ.hashfn = func(p unsafe.Pointer, seed uintptr) uintptr {
+			o := seed
+			for _, ft := range typ.fields {
+				pi := unsafe.Pointer(uintptr(p) + ft.offset)
+				o = ft.typ.hashfn(pi, o)
+			}
+			return o
 		}
-		return ret
+	} else {
+		typ.hashfn = nil
 	}
 
-	typ.equalfn = func(p, q unsafe.Pointer) bool {
-		for _, ft := range typ.fields {
-			pi := unsafe.Pointer(uintptr(p) + ft.offset)
-			qi := unsafe.Pointer(uintptr(q) + ft.offset)
-			if !ft.typ.equalfn(pi, qi) {
-				return false
+	if comparable {
+		typ.equalfn = func(p, q unsafe.Pointer) bool {
+			for _, ft := range typ.fields {
+				pi := unsafe.Pointer(uintptr(p) + ft.offset)
+				qi := unsafe.Pointer(uintptr(q) + ft.offset)
+				if !ft.typ.equalfn(pi, qi) {
+					return false
+				}
 			}
+			return true
 		}
-		return true
+	} else {
+		typ.equalfn = nil
 	}
 
 	typ.kind &^= kindDirectIface
@@ -2308,6 +2220,35 @@ func runtimeStructField(field StructField) structField {
 	}
 }
 
+// typeptrdata returns the length in bytes of the prefix of t
+// containing pointer data. Anything after this offset is scalar data.
+// keep in sync with ../cmd/compile/internal/gc/reflect.go
+func typeptrdata(t *rtype) uintptr {
+	if !t.pointers() {
+		return 0
+	}
+	switch t.Kind() {
+	case Struct:
+		st := (*structType)(unsafe.Pointer(t))
+		// find the last field that has pointers.
+		field := 0
+		for i := range st.fields {
+			ft := st.fields[i].typ
+			if ft.pointers() {
+				field = i
+			}
+		}
+		f := st.fields[field]
+		return f.offset + f.typ.ptrdata
+
+	default:
+		panic("reflect.typeptrdata: unexpected type, " + t.String())
+	}
+}
+
+// See cmd/compile/internal/gc/reflect.go for derivation of constant.
+const maxPtrmaskBytes = 2048
+
 // ArrayOf returns the array type with the given count and element type.
 // For example, if t represents int, ArrayOf(5, t) represents [5]int.
 //
@@ -2350,9 +2291,9 @@ func ArrayOf(count int, elem Type) Type {
 		panic("reflect.ArrayOf: array size would exceed virtual address space")
 	}
 	array.size = typ.size * uintptr(count)
-	// if count > 0 && typ.ptrdata != 0 {
-	// 	array.ptrdata = typ.size*uintptr(count-1) + typ.ptrdata
-	// }
+	if count > 0 && typ.ptrdata != 0 {
+		array.ptrdata = typ.size*uintptr(count-1) + typ.ptrdata
+	}
 	array.align = typ.align
 	array.fieldAlign = typ.fieldAlign
 	array.uncommonType = nil
@@ -2364,41 +2305,111 @@ func ArrayOf(count int, elem Type) Type {
 	case typ.kind&kindNoPointers != 0 || array.size == 0:
 		// No pointers.
 		array.kind |= kindNoPointers
-		gc := [...]uintptr{array.size, _GC_END}
-		array.gc = unsafe.Pointer(&gc[0])
+		array.gcdata = nil
+		array.ptrdata = 0
 
 	case count == 1:
 		// In memory, 1-element array looks just like the element.
 		array.kind |= typ.kind & kindGCProg
-		array.gc = typ.gc
+		array.gcdata = typ.gcdata
+		array.ptrdata = typ.ptrdata
+
+	case typ.kind&kindGCProg == 0 && array.size <= maxPtrmaskBytes*8*ptrSize:
+		// Element is small with pointer mask; array is still small.
+		// Create direct pointer mask by turning each 1 bit in elem
+		// into count 1 bits in larger mask.
+		mask := make([]byte, (array.ptrdata/ptrSize+7)/8)
+		elemMask := (*[1 << 30]byte)(unsafe.Pointer(typ.gcdata))[:]
+		elemWords := typ.size / ptrSize
+		for j := uintptr(0); j < typ.ptrdata/ptrSize; j++ {
+			if (elemMask[j/8]>>(j%8))&1 != 0 {
+				for i := uintptr(0); i < array.len; i++ {
+					k := i*elemWords + j
+					mask[k/8] |= 1 << (k % 8)
+				}
+			}
+		}
+		array.gcdata = &mask[0]
 
 	default:
-		gc := []uintptr{array.size, _GC_ARRAY_START, 0, uintptr(count), typ.size}
-		gc = appendGCProgram(gc, typ, 0)
-		gc = append(gc, _GC_ARRAY_NEXT, _GC_END)
-		array.gc = unsafe.Pointer(&gc[0])
+		// Create program that emits one element
+		// and then repeats to make the array.
+		prog := []byte{0, 0, 0, 0} // will be length of prog
+		elemGC := (*[1 << 30]byte)(unsafe.Pointer(typ.gcdata))[:]
+		elemPtrs := typ.ptrdata / ptrSize
+		if typ.kind&kindGCProg == 0 {
+			// Element is small with pointer mask; use as literal bits.
+			mask := elemGC
+			// Emit 120-bit chunks of full bytes (max is 127 but we avoid using partial bytes).
+			var n uintptr
+			for n = elemPtrs; n > 120; n -= 120 {
+				prog = append(prog, 120)
+				prog = append(prog, mask[:15]...)
+				mask = mask[15:]
+			}
+			prog = append(prog, byte(n))
+			prog = append(prog, mask[:(n+7)/8]...)
+		} else {
+			// Element has GC program; emit one element.
+			elemProg := elemGC[4 : 4+*(*uint32)(unsafe.Pointer(&elemGC[0]))-1]
+			prog = append(prog, elemProg...)
+		}
+		// Pad from ptrdata to size.
+		elemWords := typ.size / ptrSize
+		if elemPtrs < elemWords {
+			// Emit literal 0 bit, then repeat as needed.
+			prog = append(prog, 0x01, 0x00)
+			if elemPtrs+1 < elemWords {
+				prog = append(prog, 0x81)
+				prog = appendVarint(prog, elemWords-elemPtrs-1)
+			}
+		}
+		// Repeat count-1 times.
+		if elemWords < 0x80 {
+			prog = append(prog, byte(elemWords|0x80))
+		} else {
+			prog = append(prog, 0x80)
+			prog = appendVarint(prog, elemWords)
+		}
+		prog = appendVarint(prog, uintptr(count)-1)
+		prog = append(prog, 0)
+		*(*uint32)(unsafe.Pointer(&prog[0])) = uint32(len(prog) - 4)
+		array.kind |= kindGCProg
+		array.gcdata = &prog[0]
+		array.ptrdata = array.size // overestimate but ok; must match program
 	}
 
 	array.kind &^= kindDirectIface
 
-	array.hashfn = func(p unsafe.Pointer, seed uintptr) uintptr {
-		ret := seed
-		for i := 0; i < count; i++ {
-			ret = typ.hashfn(p, ret)
-			p = unsafe.Pointer(uintptr(p) + typ.size)
+	esize := typ.size
+
+	if typ.equalfn == nil {
+		array.equalfn = nil
+	} else {
+		eequal := typ.equalfn
+		array.equalfn = func(p, q unsafe.Pointer) bool {
+			for i := 0; i < count; i++ {
+				pi := arrayAt(p, i, esize)
+				qi := arrayAt(q, i, esize)
+				if !eequal(pi, qi) {
+					return false
+				}
+			}
+			return true
 		}
-		return ret
 	}
 
-	array.equalfn = func(p1, p2 unsafe.Pointer) bool {
-		for i := 0; i < count; i++ {
-			if !typ.equalfn(p1, p2) {
-				return false
+	if typ.hashfn == nil {
+		array.hashfn = nil
+	} else {
+		ehash := typ.hashfn
+		array.hashfn = func(ptr unsafe.Pointer, seed uintptr) uintptr {
+			o := seed
+			for i := 0; i < count; i++ {
+				o = ehash(arrayAt(ptr, i, esize), o)
 			}
-			p1 = unsafe.Pointer(uintptr(p1) + typ.size)
-			p2 = unsafe.Pointer(uintptr(p2) + typ.size)
+			return o
 		}
-		return true
 	}
 
 	return cachePut(ckey, &array.rtype)
diff --git a/libgo/go/runtime/alg.go b/libgo/go/runtime/alg.go
index 4946269..174320f 100644
--- a/libgo/go/runtime/alg.go
+++ b/libgo/go/runtime/alg.go
@@ -131,7 +131,7 @@ func c128hash(p unsafe.Pointer, h uintptr) uintptr {
 	return f64hash(unsafe.Pointer(&x[1]), f64hash(unsafe.Pointer(&x[0]), h))
 }
 
-func interhash(p unsafe.Pointer, h uintptr, size uintptr) uintptr {
+func interhash(p unsafe.Pointer, h uintptr) uintptr {
 	a := (*iface)(p)
 	tab := a.tab
 	if tab == nil {
@@ -199,10 +199,10 @@ func c128equal(p, q unsafe.Pointer) bool {
 func strequal(p, q unsafe.Pointer) bool {
 	return *(*string)(p) == *(*string)(q)
 }
-func interequal(p, q unsafe.Pointer, size uintptr) bool {
+func interequal(p, q unsafe.Pointer) bool {
 	return ifaceeq(*(*iface)(p), *(*iface)(q))
 }
-func nilinterequal(p, q unsafe.Pointer, size uintptr) bool {
+func nilinterequal(p, q unsafe.Pointer) bool {
 	return efaceeq(*(*eface)(p), *(*eface)(q))
 }
 func efaceeq(x, y eface) bool {
@@ -361,6 +361,34 @@ var _ = nilinterequal
 var _ = pointerhash
 var _ = pointerequal
 
+// Testing adapters for hash quality tests (see hash_test.go)
+func stringHash(s string, seed uintptr) uintptr {
+	return strhash(noescape(unsafe.Pointer(&s)), seed)
+}
+
+func bytesHash(b []byte, seed uintptr) uintptr {
+	s := (*slice)(unsafe.Pointer(&b))
+	return memhash(s.array, seed, uintptr(s.len))
+}
+
+func int32Hash(i uint32, seed uintptr) uintptr {
+	return memhash32(noescape(unsafe.Pointer(&i)), seed)
+}
+
+func int64Hash(i uint64, seed uintptr) uintptr {
+	return memhash64(noescape(unsafe.Pointer(&i)), seed)
+}
+
+func efaceHash(i interface{}, seed uintptr) uintptr {
+	return nilinterhash(noescape(unsafe.Pointer(&i)), seed)
+}
+
+func ifaceHash(i interface {
+	F()
+}, seed uintptr) uintptr {
+	return interhash(noescape(unsafe.Pointer(&i)), seed)
+}
+
 const hashRandomBytes = sys.PtrSize / 4 * 64
 
 // used in asm_{386,amd64}.s to seed the hash function
diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go
new file mode 100644
index 0000000..2e0e591
--- /dev/null
+++ b/libgo/go/runtime/cgocall.go
@@ -0,0 +1,307 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Cgo call and callback support.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// Pointer checking for cgo code.
+
+// We want to detect all cases where a program that does not use
+// unsafe makes a cgo call passing a Go pointer to memory that
+// contains a Go pointer. Here a Go pointer is defined as a pointer
+// to memory allocated by the Go runtime. Programs that use unsafe
+// can evade this restriction easily, so we don't try to catch them.
+// The cgo program will rewrite all possibly bad pointer arguments to
+// call cgoCheckPointer, where we can catch cases of a Go pointer
+// pointing to a Go pointer.
+
+// Complicating matters, taking the address of a slice or array
+// element permits the C program to access all elements of the slice
+// or array. In that case we will see a pointer to a single element,
+// but we need to check the entire data structure.
+
+// The cgoCheckPointer call takes additional arguments indicating that
+// it was called on an address expression. An additional argument of
+// true means that it only needs to check a single element. An
+// additional argument of a slice or array means that it needs to
+// check the entire slice/array, but nothing else. Otherwise, the
+// pointer could be anything, and we check the entire heap object,
+// which is conservative but safe.
+
+// When and if we implement a moving garbage collector,
+// cgoCheckPointer will pin the pointer for the duration of the cgo
+// call.  (This is necessary but not sufficient; the cgo program will
+// also have to change to pin Go pointers that cannot point to Go
+// pointers.)
+
+// cgoCheckPointer checks if the argument contains a Go pointer that
+// points to a Go pointer, and panics if it does.
+func cgoCheckPointer(ptr interface{}, args ...interface{}) {
+	if debug.cgocheck == 0 {
+		return
+	}
+
+	ep := (*eface)(unsafe.Pointer(&ptr))
+	t := ep._type
+
+	top := true
+	if len(args) > 0 && (t.kind&kindMask == kindPtr || t.kind&kindMask == kindUnsafePointer) {
+		p := ep.data
+		if t.kind&kindDirectIface == 0 {
+			p = *(*unsafe.Pointer)(p)
+		}
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		aep := (*eface)(unsafe.Pointer(&args[0]))
+		switch aep._type.kind & kindMask {
+		case kindBool:
+			if t.kind&kindMask == kindUnsafePointer {
+				// We don't know the type of the element.
+				break
+			}
+			pt := (*ptrtype)(unsafe.Pointer(t))
+			cgoCheckArg(pt.elem, p, true, false, cgoCheckPointerFail)
+			return
+		case kindSlice:
+			// Check the slice rather than the pointer.
+			ep = aep
+			t = ep._type
+		case kindArray:
+			// Check the array rather than the pointer.
+			// Pass top as false since we have a pointer
+			// to the array.
+			ep = aep
+			t = ep._type
+			top = false
+		default:
+			throw("can't happen")
+		}
+	}
+
+	cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, top, cgoCheckPointerFail)
+}
+
+const cgoCheckPointerFail = "cgo argument has Go pointer to Go pointer"
+const cgoResultFail = "cgo result has Go pointer"
+
+// cgoCheckArg is the real work of cgoCheckPointer. The argument p
+// is either a pointer to the value (of type t), or the value itself,
+// depending on indir. The top parameter is whether we are at the top
+// level, where Go pointers are allowed.
+func cgoCheckArg(t *_type, p unsafe.Pointer, indir, top bool, msg string) {
+	if t.kind&kindNoPointers != 0 {
+		// If the type has no pointers there is nothing to do.
+		return
+	}
+
+	switch t.kind & kindMask {
+	default:
+		throw("can't happen")
+	case kindArray:
+		at := (*arraytype)(unsafe.Pointer(t))
+		if !indir {
+			if at.len != 1 {
+				throw("can't happen")
+			}
+			cgoCheckArg(at.elem, p, at.elem.kind&kindDirectIface == 0, top, msg)
+			return
+		}
+		for i := uintptr(0); i < at.len; i++ {
+			cgoCheckArg(at.elem, p, true, top, msg)
+			p = add(p, at.elem.size)
+		}
+	case kindChan, kindMap:
+		// These types contain internal pointers that will
+		// always be allocated in the Go heap. It's never OK
+		// to pass them to C.
+		panic(errorString(msg))
+	case kindFunc:
+		if indir {
+			p = *(*unsafe.Pointer)(p)
+		}
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		panic(errorString(msg))
+	case kindInterface:
+		it := *(**_type)(p)
+		if it == nil {
+			return
+		}
+		// A type known at compile time is OK since it's
+		// constant. A type not known at compile time will be
+		// in the heap and will not be OK.
+		if inheap(uintptr(unsafe.Pointer(it))) {
+			panic(errorString(msg))
+		}
+		p = *(*unsafe.Pointer)(add(p, sys.PtrSize))
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+		cgoCheckArg(it, p, it.kind&kindDirectIface == 0, false, msg)
+	case kindSlice:
+		st := (*slicetype)(unsafe.Pointer(t))
+		s := (*slice)(p)
+		p = s.array
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+		if st.elem.kind&kindNoPointers != 0 {
+			return
+		}
+		for i := 0; i < s.cap; i++ {
+			cgoCheckArg(st.elem, p, true, false, msg)
+			p = add(p, st.elem.size)
+		}
+	case kindString:
+		ss := (*stringStruct)(p)
+		if !cgoIsGoPointer(ss.str) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+	case kindStruct:
+		st := (*structtype)(unsafe.Pointer(t))
+		if !indir {
+			if len(st.fields) != 1 {
+				throw("can't happen")
+			}
+			cgoCheckArg(st.fields[0].typ, p, st.fields[0].typ.kind&kindDirectIface == 0, top, msg)
+			return
+		}
+		for _, f := range st.fields {
+			cgoCheckArg(f.typ, add(p, f.offset), true, top, msg)
+		}
+	case kindPtr, kindUnsafePointer:
+		if indir {
+			p = *(*unsafe.Pointer)(p)
+		}
+
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+
+		cgoCheckUnknownPointer(p, msg)
+	}
+}
+
+// cgoCheckUnknownPointer is called for an arbitrary pointer into Go
+// memory. It checks whether that Go memory contains any other
+// pointer into Go memory. If it does, we panic.
+// The return values are unused but useful to see in panic tracebacks.
+func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) {
+	if cgoInRange(p, mheap_.arena_start, mheap_.arena_used) {
+		if !inheap(uintptr(p)) {
+			// On 32-bit systems it is possible for C's allocated memory
+			// to have addresses between arena_start and arena_used.
+			// Either this pointer is a stack or an unused span or it's
+			// a C allocation. Escape analysis should prevent the first,
+			// garbage collection should prevent the second,
+			// and the third is completely OK.
+			return
+		}
+
+		b, hbits, span, _ := heapBitsForObject(uintptr(p), 0, 0, false)
+		base = b
+		if base == 0 {
+			return
+		}
+		n := span.elemsize
+		for i = uintptr(0); i < n; i += sys.PtrSize {
+			if i != 1*sys.PtrSize && !hbits.morePointers() {
+				// No more possible pointers.
+				break
+			}
+			if hbits.isPointer() {
+				if cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) {
+					panic(errorString(msg))
+				}
+			}
+			hbits = hbits.next()
+		}
+
+		return
+	}
+
+	roots := gcRoots
+	for roots != nil {
+		for j := 0; j < roots.count; j++ {
+			pr := roots.roots[j]
+			addr := uintptr(pr.decl)
+			if cgoInRange(p, addr, addr+pr.size) {
+				cgoCheckBits(pr.decl, pr.gcdata, 0, pr.ptrdata)
+				return
+			}
+		}
+		roots = roots.next
+	}
+
+	return
+}
+
+// cgoIsGoPointer returns whether the pointer is a Go pointer--a
+// pointer to Go memory. We only care about Go memory that might
+// contain pointers.
+//go:nosplit
+//go:nowritebarrierrec
+func cgoIsGoPointer(p unsafe.Pointer) bool {
+	if p == nil {
+		return false
+	}
+
+	if inHeapOrStack(uintptr(p)) {
+		return true
+	}
+
+	roots := gcRoots
+	for roots != nil {
+		for i := 0; i < roots.count; i++ {
+			pr := roots.roots[i]
+			addr := uintptr(pr.decl)
+			if cgoInRange(p, addr, addr+pr.size) {
+				return true
+			}
+		}
+		roots = roots.next
+	}
+
+	return false
+}
+
+// cgoInRange returns whether p is between start and end.
+//go:nosplit
+//go:nowritebarrierrec
+func cgoInRange(p unsafe.Pointer, start, end uintptr) bool {
+	return start <= uintptr(p) && uintptr(p) < end
+}
+
+// cgoCheckResult is called to check the result parameter of an
+// exported Go function. It panics if the result is or contains a Go
+// pointer.
+func cgoCheckResult(val interface{}) {
+	if debug.cgocheck == 0 {
+		return
+	}
+
+	ep := (*eface)(unsafe.Pointer(&val))
+	t := ep._type
+	cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, false, cgoResultFail)
+}
diff --git a/libgo/go/runtime/cgocheck.go b/libgo/go/runtime/cgocheck.go
index fec3646..09d444d 100644
--- a/libgo/go/runtime/cgocheck.go
+++ b/libgo/go/runtime/cgocheck.go
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build ignore
-
 // Code to check that pointer writes follow the cgo rules.
 // These functions are invoked via the write barrier when debug.cgocheck > 1.
 
@@ -110,17 +108,18 @@ func cgoCheckTypedBlock(typ *_type, src unsafe.Pointer, off, size uintptr) {
 	}
 
 	// The type has a GC program. Try to find GC bits somewhere else.
-	for _, datap := range activeModules() {
-		if cgoInRange(src, datap.data, datap.edata) {
-			doff := uintptr(src) - datap.data
-			cgoCheckBits(add(src, -doff), datap.gcdatamask.bytedata, off+doff, size)
-			return
-		}
-		if cgoInRange(src, datap.bss, datap.ebss) {
-			boff := uintptr(src) - datap.bss
-			cgoCheckBits(add(src, -boff), datap.gcbssmask.bytedata, off+boff, size)
-			return
+	roots := gcRoots
+	for roots != nil {
+		for i := 0; i < roots.count; i++ {
+			pr := roots.roots[i]
+			addr := uintptr(pr.decl)
+			if cgoInRange(src, addr, addr+pr.size) {
+				doff := uintptr(src) - addr
+				cgoCheckBits(add(src, -doff), pr.gcdata, off+doff, size)
+				return
+			}
 		}
+		roots = roots.next
 	}
 
 	aoff := uintptr(src) - mheap_.arena_start
diff --git a/libgo/go/runtime/crash_unix_test.go b/libgo/go/runtime/crash_unix_test.go
index 182c84b..67ef334 100644
--- a/libgo/go/runtime/crash_unix_test.go
+++ b/libgo/go/runtime/crash_unix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package runtime_test
 
diff --git a/libgo/go/runtime/debug.go b/libgo/go/runtime/debug.go
index a8827f2..6a9efcd 100644
--- a/libgo/go/runtime/debug.go
+++ b/libgo/go/runtime/debug.go
@@ -39,7 +39,9 @@ func GOMAXPROCS(n int) int {
 // The set of available CPUs is checked by querying the operating system
 // at process startup. Changes to operating system CPU allocation after
 // process startup are not reflected.
-func NumCPU() int
+func NumCPU() int {
+	return int(ncpu)
+}
 
 // NumCgoCall returns the number of cgo calls made by the current process.
 func NumCgoCall() int64 {
diff --git a/libgo/go/runtime/env_posix.go b/libgo/go/runtime/env_posix.go
index e076edb..9bf7ddc 100644
--- a/libgo/go/runtime/env_posix.go
+++ b/libgo/go/runtime/env_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package runtime
 
diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go
index cc4b188..bf435f4 100644
--- a/libgo/go/runtime/export_test.go
+++ b/libgo/go/runtime/export_test.go
@@ -26,10 +26,11 @@ import (
 var Entersyscall = entersyscall
 var Exitsyscall = exitsyscall
 var LockedOSThread = lockedOSThread
+var Xadduintptr = atomic.Xadduintptr
 
-// var Xadduintptr = xadduintptr
+var FuncPC = funcPC
 
-// var FuncPC = funcPC
+var Fastlog2 = fastlog2
 
 var Atoi = atoi
 var Atoi32 = atoi32
@@ -148,12 +149,12 @@ func RunSchedLocalQueueEmptyTest(iters int) {
 	}
 }
 
-//var StringHash = stringHash
-//var BytesHash = bytesHash
-//var Int32Hash = int32Hash
-//var Int64Hash = int64Hash
-//var EfaceHash = efaceHash
-//var IfaceHash = ifaceHash
+var StringHash = stringHash
+var BytesHash = bytesHash
+var Int32Hash = int32Hash
+var Int64Hash = int64Hash
+var EfaceHash = efaceHash
+var IfaceHash = ifaceHash
 
 func MemclrBytes(b []byte) {
 	s := (*slice)(unsafe.Pointer(&b))
@@ -182,7 +183,6 @@ func SetEnvs(e []string) { envs = e }
 
 // For benchmarking.
 
-/*
 func BenchSetType(n int, x interface{}) {
 	e := *efaceOf(&x)
 	t := e._type
@@ -213,7 +213,6 @@ func BenchSetType(n int, x interface{}) {
 const PtrSize = sys.PtrSize
 
 var ForceGCPeriod = &forcegcperiod
-*/
 
 // SetTracebackEnv is like runtime/debug.SetTraceback, but it raises
 // the "environment" traceback level, so later calls to
@@ -223,7 +222,6 @@ func SetTracebackEnv(level string) {
 	traceback_env = traceback_cache
 }
 
-/*
 var ReadUnaligned32 = readUnaligned32
 var ReadUnaligned64 = readUnaligned64
 
@@ -242,7 +240,6 @@ func CountPagesInUse() (pagesInUse, counted uintptr) {
 
 	return
 }
-*/
 
 // BlockOnSystemStack switches to the system stack, prints "x\n" to
 // stderr, and blocks in a stack containing
diff --git a/libgo/go/runtime/extern.go b/libgo/go/runtime/extern.go
index 5aa76ac..5c50760 100644
--- a/libgo/go/runtime/extern.go
+++ b/libgo/go/runtime/extern.go
@@ -183,93 +183,6 @@ func Caller(skip int) (pc uintptr, file string, line int, ok bool)
 // It returns the number of entries written to pc.
 func Callers(skip int, pc []uintptr) int
 
-// SetFinalizer sets the finalizer associated with obj to the provided
-// finalizer function. When the garbage collector finds an unreachable block
-// with an associated finalizer, it clears the association and runs
-// finalizer(obj) in a separate goroutine. This makes obj reachable again,
-// but now without an associated finalizer. Assuming that SetFinalizer
-// is not called again, the next time the garbage collector sees
-// that obj is unreachable, it will free obj.
-//
-// SetFinalizer(obj, nil) clears any finalizer associated with obj.
-//
-// The argument obj must be a pointer to an object allocated by
-// calling new or by taking the address of a composite literal.
-// The argument finalizer must be a function that takes a single argument
-// to which obj's type can be assigned, and can have arbitrary ignored return
-// values. If either of these is not true, SetFinalizer aborts the
-// program.
-//
-// Finalizers are run in dependency order: if A points at B, both have
-// finalizers, and they are otherwise unreachable, only the finalizer
-// for A runs; once A is freed, the finalizer for B can run.
-// If a cyclic structure includes a block with a finalizer, that
-// cycle is not guaranteed to be garbage collected and the finalizer
-// is not guaranteed to run, because there is no ordering that
-// respects the dependencies.
-//
-// The finalizer for obj is scheduled to run at some arbitrary time after
-// obj becomes unreachable.
-// There is no guarantee that finalizers will run before a program exits,
-// so typically they are useful only for releasing non-memory resources
-// associated with an object during a long-running program.
-// For example, an os.File object could use a finalizer to close the
-// associated operating system file descriptor when a program discards
-// an os.File without calling Close, but it would be a mistake
-// to depend on a finalizer to flush an in-memory I/O buffer such as a
-// bufio.Writer, because the buffer would not be flushed at program exit.
-//
-// It is not guaranteed that a finalizer will run if the size of *obj is
-// zero bytes.
-//
-// It is not guaranteed that a finalizer will run for objects allocated
-// in initializers for package-level variables. Such objects may be
-// linker-allocated, not heap-allocated.
-//
-// A finalizer may run as soon as an object becomes unreachable.
-// In order to use finalizers correctly, the program must ensure that
-// the object is reachable until it is no longer required.
-// Objects stored in global variables, or that can be found by tracing
-// pointers from a global variable, are reachable. For other objects,
-// pass the object to a call of the KeepAlive function to mark the
-// last point in the function where the object must be reachable.
-//
-// For example, if p points to a struct that contains a file descriptor d,
-// and p has a finalizer that closes that file descriptor, and if the last
-// use of p in a function is a call to syscall.Write(p.d, buf, size), then
-// p may be unreachable as soon as the program enters syscall.Write. The
-// finalizer may run at that moment, closing p.d, causing syscall.Write
-// to fail because it is writing to a closed file descriptor (or, worse,
-// to an entirely different file descriptor opened by a different goroutine).
-// To avoid this problem, call runtime.KeepAlive(p) after the call to
-// syscall.Write.
-//
-// A single goroutine runs all finalizers for a program, sequentially.
-// If a finalizer must run for a long time, it should do so by starting
-// a new goroutine.
-func SetFinalizer(obj interface{}, finalizer interface{})
-
-// KeepAlive marks its argument as currently reachable.
-// This ensures that the object is not freed, and its finalizer is not run,
-// before the point in the program where KeepAlive is called.
-//
-// A very simplified example showing where KeepAlive is required:
-// 	type File struct { d int }
-// 	d, err := syscall.Open("/file/path", syscall.O_RDONLY, 0)
-// 	// ... do something if err != nil ...
-// 	p := &File{d}
-// 	runtime.SetFinalizer(p, func(p *File) { syscall.Close(p.d) })
-// 	var buf [10]byte
-// 	n, err := syscall.Read(p.d, buf[:])
-// 	// Ensure p is not finalized until Read returns.
-// 	runtime.KeepAlive(p)
-// 	// No more uses of p after this point.
-//
-// Without the KeepAlive call, the finalizer could run at the start of
-// syscall.Read, closing the file descriptor before syscall.Read makes
-// the actual system call.
-func KeepAlive(interface{})
-
 // GOROOT returns the root of the Go tree.
 // It uses the GOROOT environment variable, if set,
 // or else the root used during the Go build.
diff --git a/libgo/go/runtime/fastlog2.go b/libgo/go/runtime/fastlog2.go
index 6fbe572f4..5f3fb53 100644
--- a/libgo/go/runtime/fastlog2.go
+++ b/libgo/go/runtime/fastlog2.go
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build ignore
-
 package runtime
 
 import "unsafe"
diff --git a/libgo/go/runtime/fastlog2_test.go b/libgo/go/runtime/fastlog2_test.go
index 6e9fcd4..ae0f40b 100644
--- a/libgo/go/runtime/fastlog2_test.go
+++ b/libgo/go/runtime/fastlog2_test.go
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build ignore
-
 package runtime_test
 
 import (
diff --git a/libgo/go/runtime/fastlog2table.go b/libgo/go/runtime/fastlog2table.go
index 47ae5e8..c36d583 100644
--- a/libgo/go/runtime/fastlog2table.go
+++ b/libgo/go/runtime/fastlog2table.go
@@ -2,8 +2,6 @@
 // Run go generate from src/runtime to update.
 // See mkfastlog2table.go for comments.
 
-// +build ignore
-
 package runtime
 
 const fastlogNumBits = 5
diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go
index 2a6acf0..ec043ed 100644
--- a/libgo/go/runtime/gc_test.go
+++ b/libgo/go/runtime/gc_test.go
@@ -434,8 +434,6 @@ func testIfaceEqual(x interface{}) {
 	}
 }
 
-/*
-
 func TestPageAccounting(t *testing.T) {
 	// Grow the heap in small increments. This used to drop the
 	// pages-in-use count below zero because of a rounding
@@ -452,5 +450,3 @@ func TestPageAccounting(t *testing.T) {
 		t.Fatalf("mheap_.pagesInUse is %d, but direct count is %d", pagesInUse, counted)
 	}
 }
-
-*/
diff --git a/libgo/go/runtime/hash_test.go b/libgo/go/runtime/hash_test.go
new file mode 100644
index 0000000..167c49e
--- /dev/null
+++ b/libgo/go/runtime/hash_test.go
@@ -0,0 +1,710 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"fmt"
+	"math"
+	"math/rand"
+	. "runtime"
+	"strings"
+	"testing"
+	"unsafe"
+)
+
+// Smhasher is a torture test for hash functions.
+// https://code.google.com/p/smhasher/
+// This code is a port of some of the Smhasher tests to Go.
+//
+// The current AES hash function passes Smhasher. Our fallback
+// hash functions don't, so we only enable the difficult tests when
+// we know the AES implementation is available.
+
+// Sanity checks.
+// hash should not depend on values outside key.
+// hash should not depend on alignment.
+func TestSmhasherSanity(t *testing.T) {
+	r := rand.New(rand.NewSource(1234))
+	const REP = 10
+	const KEYMAX = 128
+	const PAD = 16
+	const OFFMAX = 16
+	for k := 0; k < REP; k++ {
+		for n := 0; n < KEYMAX; n++ {
+			for i := 0; i < OFFMAX; i++ {
+				var b [KEYMAX + OFFMAX + 2*PAD]byte
+				var c [KEYMAX + OFFMAX + 2*PAD]byte
+				randBytes(r, b[:])
+				randBytes(r, c[:])
+				copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
+				if BytesHash(b[PAD:PAD+n], 0) != BytesHash(c[PAD+i:PAD+i+n], 0) {
+					t.Errorf("hash depends on bytes outside key")
+				}
+			}
+		}
+	}
+}
+
+type HashSet struct {
+	m map[uintptr]struct{} // set of hashes added
+	n int                  // number of hashes added
+}
+
+func newHashSet() *HashSet {
+	return &HashSet{make(map[uintptr]struct{}), 0}
+}
+func (s *HashSet) add(h uintptr) {
+	s.m[h] = struct{}{}
+	s.n++
+}
+func (s *HashSet) addS(x string) {
+	s.add(StringHash(x, 0))
+}
+func (s *HashSet) addB(x []byte) {
+	s.add(BytesHash(x, 0))
+}
+func (s *HashSet) addS_seed(x string, seed uintptr) {
+	s.add(StringHash(x, seed))
+}
+func (s *HashSet) check(t *testing.T) {
+	const SLOP = 10.0
+	collisions := s.n - len(s.m)
+	//fmt.Printf("%d/%d\n", len(s.m), s.n)
+	pairs := int64(s.n) * int64(s.n-1) / 2
+	expected := float64(pairs) / math.Pow(2.0, float64(hashSize))
+	stddev := math.Sqrt(expected)
+	if float64(collisions) > expected+SLOP*(3*stddev+1) {
+		t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev)
+	}
+}
+
+// a string plus adding zeros must make distinct hashes
+func TestSmhasherAppendedZeros(t *testing.T) {
+	s := "hello" + strings.Repeat("\x00", 256)
+	h := newHashSet()
+	for i := 0; i <= len(s); i++ {
+		h.addS(s[:i])
+	}
+	h.check(t)
+}
+
+// All 0-3 byte strings have distinct hashes.
+func TestSmhasherSmallKeys(t *testing.T) {
+	h := newHashSet()
+	var b [3]byte
+	for i := 0; i < 256; i++ {
+		b[0] = byte(i)
+		h.addB(b[:1])
+		for j := 0; j < 256; j++ {
+			b[1] = byte(j)
+			h.addB(b[:2])
+			if !testing.Short() {
+				for k := 0; k < 256; k++ {
+					b[2] = byte(k)
+					h.addB(b[:3])
+				}
+			}
+		}
+	}
+	h.check(t)
+}
+
+// Different length strings of all zeros have distinct hashes.
+func TestSmhasherZeros(t *testing.T) {
+	N := 256 * 1024
+	if testing.Short() {
+		N = 1024
+	}
+	h := newHashSet()
+	b := make([]byte, N)
+	for i := 0; i <= N; i++ {
+		h.addB(b[:i])
+	}
+	h.check(t)
+}
+
+// Strings with up to two nonzero bytes all have distinct hashes.
+func TestSmhasherTwoNonzero(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping in short mode")
+	}
+	h := newHashSet()
+	for n := 2; n <= 16; n++ {
+		twoNonZero(h, n)
+	}
+	h.check(t)
+}
+func twoNonZero(h *HashSet, n int) {
+	b := make([]byte, n)
+
+	// all zero
+	h.addB(b[:])
+
+	// one non-zero byte
+	for i := 0; i < n; i++ {
+		for x := 1; x < 256; x++ {
+			b[i] = byte(x)
+			h.addB(b[:])
+			b[i] = 0
+		}
+	}
+
+	// two non-zero bytes
+	for i := 0; i < n; i++ {
+		for x := 1; x < 256; x++ {
+			b[i] = byte(x)
+			for j := i + 1; j < n; j++ {
+				for y := 1; y < 256; y++ {
+					b[j] = byte(y)
+					h.addB(b[:])
+					b[j] = 0
+				}
+			}
+			b[i] = 0
+		}
+	}
+}
+
+// Test strings with repeats, like "abcdabcdabcdabcd..."
+func TestSmhasherCyclic(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping in short mode")
+	}
+	r := rand.New(rand.NewSource(1234))
+	const REPEAT = 8
+	const N = 1000000
+	for n := 4; n <= 12; n++ {
+		h := newHashSet()
+		b := make([]byte, REPEAT*n)
+		for i := 0; i < N; i++ {
+			b[0] = byte(i * 79 % 97)
+			b[1] = byte(i * 43 % 137)
+			b[2] = byte(i * 151 % 197)
+			b[3] = byte(i * 199 % 251)
+			randBytes(r, b[4:n])
+			for j := n; j < n*REPEAT; j++ {
+				b[j] = b[j-n]
+			}
+			h.addB(b)
+		}
+		h.check(t)
+	}
+}
+
+// Test strings with only a few bits set
+func TestSmhasherSparse(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping in short mode")
+	}
+	sparse(t, 32, 6)
+	sparse(t, 40, 6)
+	sparse(t, 48, 5)
+	sparse(t, 56, 5)
+	sparse(t, 64, 5)
+	sparse(t, 96, 4)
+	sparse(t, 256, 3)
+	sparse(t, 2048, 2)
+}
+func sparse(t *testing.T, n int, k int) {
+	b := make([]byte, n/8)
+	h := newHashSet()
+	setbits(h, b, 0, k)
+	h.check(t)
+}
+
+// set up to k bits at index i and greater
+func setbits(h *HashSet, b []byte, i int, k int) {
+	h.addB(b)
+	if k == 0 {
+		return
+	}
+	for j := i; j < len(b)*8; j++ {
+		b[j/8] |= byte(1 << uint(j&7))
+		setbits(h, b, j+1, k-1)
+		b[j/8] &= byte(^(1 << uint(j&7)))
+	}
+}
+
+// Test all possible combinations of n blocks from the set s.
+// "permutation" is a bad name here, but it is what Smhasher uses.
+func TestSmhasherPermutation(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping in short mode")
+	}
+	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8)
+	permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8)
+	permutation(t, []uint32{0, 1}, 20)
+	permutation(t, []uint32{0, 1 << 31}, 20)
+	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6)
+}
+func permutation(t *testing.T, s []uint32, n int) {
+	b := make([]byte, n*4)
+	h := newHashSet()
+	genPerm(h, b, s, 0)
+	h.check(t)
+}
+func genPerm(h *HashSet, b []byte, s []uint32, n int) {
+	h.addB(b[:n])
+	if n == len(b) {
+		return
+	}
+	for _, v := range s {
+		b[n] = byte(v)
+		b[n+1] = byte(v >> 8)
+		b[n+2] = byte(v >> 16)
+		b[n+3] = byte(v >> 24)
+		genPerm(h, b, s, n+4)
+	}
+}
+
+type Key interface {
+	clear()              // set bits all to 0
+	random(r *rand.Rand) // set key to something random
+	bits() int           // how many bits key has
+	flipBit(i int)       // flip bit i of the key
+	hash() uintptr       // hash the key
+	name() string        // for error reporting
+}
+
+type BytesKey struct {
+	b []byte
+}
+
+func (k *BytesKey) clear() {
+	for i := range k.b {
+		k.b[i] = 0
+	}
+}
+func (k *BytesKey) random(r *rand.Rand) {
+	randBytes(r, k.b)
+}
+func (k *BytesKey) bits() int {
+	return len(k.b) * 8
+}
+func (k *BytesKey) flipBit(i int) {
+	k.b[i>>3] ^= byte(1 << uint(i&7))
+}
+func (k *BytesKey) hash() uintptr {
+	return BytesHash(k.b, 0)
+}
+func (k *BytesKey) name() string {
+	return fmt.Sprintf("bytes%d", len(k.b))
+}
+
+type Int32Key struct {
+	i uint32
+}
+
+func (k *Int32Key) clear() {
+	k.i = 0
+}
+func (k *Int32Key) random(r *rand.Rand) {
+	k.i = r.Uint32()
+}
+func (k *Int32Key) bits() int {
+	return 32
+}
+func (k *Int32Key) flipBit(i int) {
+	k.i ^= 1 << uint(i)
+}
+func (k *Int32Key) hash() uintptr {
+	return Int32Hash(k.i, 0)
+}
+func (k *Int32Key) name() string {
+	return "int32"
+}
+
+type Int64Key struct {
+	i uint64
+}
+
+func (k *Int64Key) clear() {
+	k.i = 0
+}
+func (k *Int64Key) random(r *rand.Rand) {
+	k.i = uint64(r.Uint32()) + uint64(r.Uint32())<<32
+}
+func (k *Int64Key) bits() int {
+	return 64
+}
+func (k *Int64Key) flipBit(i int) {
+	k.i ^= 1 << uint(i)
+}
+func (k *Int64Key) hash() uintptr {
+	return Int64Hash(k.i, 0)
+}
+func (k *Int64Key) name() string {
+	return "int64"
+}
+
+type EfaceKey struct {
+	i interface{}
+}
+
+func (k *EfaceKey) clear() {
+	k.i = nil
+}
+func (k *EfaceKey) random(r *rand.Rand) {
+	k.i = uint64(r.Int63())
+}
+func (k *EfaceKey) bits() int {
+	// use 64 bits. This tests inlined interfaces
+	// on 64-bit targets and indirect interfaces on
+	// 32-bit targets.
+	return 64
+}
+func (k *EfaceKey) flipBit(i int) {
+	k.i = k.i.(uint64) ^ uint64(1)<<uint(i)
+}
+func (k *EfaceKey) hash() uintptr {
+	return EfaceHash(k.i, 0)
+}
+func (k *EfaceKey) name() string {
+	return "Eface"
+}
+
+type IfaceKey struct {
+	i interface {
+		F()
+	}
+}
+type fInter uint64
+
+func (x fInter) F() {
+}
+
+func (k *IfaceKey) clear() {
+	k.i = nil
+}
+func (k *IfaceKey) random(r *rand.Rand) {
+	k.i = fInter(r.Int63())
+}
+func (k *IfaceKey) bits() int {
+	// use 64 bits. This tests inlined interfaces
+	// on 64-bit targets and indirect interfaces on
+	// 32-bit targets.
+	return 64
+}
+func (k *IfaceKey) flipBit(i int) {
+	k.i = k.i.(fInter) ^ fInter(1)<<uint(i)
+}
+func (k *IfaceKey) hash() uintptr {
+	return IfaceHash(k.i, 0)
+}
+func (k *IfaceKey) name() string {
+	return "Iface"
+}
+
+// Flipping a single bit of a key should flip each output bit with 50% probability.
+func TestSmhasherAvalanche(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping in short mode")
+	}
+	avalancheTest1(t, &BytesKey{make([]byte, 2)})
+	avalancheTest1(t, &BytesKey{make([]byte, 4)})
+	avalancheTest1(t, &BytesKey{make([]byte, 8)})
+	avalancheTest1(t, &BytesKey{make([]byte, 16)})
+	avalancheTest1(t, &BytesKey{make([]byte, 32)})
+	avalancheTest1(t, &BytesKey{make([]byte, 200)})
+	avalancheTest1(t, &Int32Key{})
+	avalancheTest1(t, &Int64Key{})
+	avalancheTest1(t, &EfaceKey{})
+	avalancheTest1(t, &IfaceKey{})
+}
+func avalancheTest1(t *testing.T, k Key) {
+	const REP = 100000
+	r := rand.New(rand.NewSource(1234))
+	n := k.bits()
+
+	// grid[i][j] is a count of whether flipping
+	// input bit i affects output bit j.
+	grid := make([][hashSize]int, n)
+
+	for z := 0; z < REP; z++ {
+		// pick a random key, hash it
+		k.random(r)
+		h := k.hash()
+
+		// flip each bit, hash & compare the results
+		for i := 0; i < n; i++ {
+			k.flipBit(i)
+			d := h ^ k.hash()
+			k.flipBit(i)
+
+			// record the effects of that bit flip
+			g := &grid[i]
+			for j := 0; j < hashSize; j++ {
+				g[j] += int(d & 1)
+				d >>= 1
+			}
+		}
+	}
+
+	// Each entry in the grid should be about REP/2.
+	// More precisely, we did N = k.bits() * hashSize experiments where
+	// each is the sum of REP coin flips. We want to find bounds on the
+	// sum of coin flips such that a truly random experiment would have
+	// all sums inside those bounds with 99% probability.
+	N := n * hashSize
+	var c float64
+	// find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999
+	for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 {
+	}
+	c *= 4.0 // allowed slack - we don't need to be perfectly random
+	mean := .5 * REP
+	stddev := .5 * math.Sqrt(REP)
+	low := int(mean - c*stddev)
+	high := int(mean + c*stddev)
+	for i := 0; i < n; i++ {
+		for j := 0; j < hashSize; j++ {
+			x := grid[i][j]
+			if x < low || x > high {
+				t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP)
+			}
+		}
+	}
+}
+
+// All bit rotations of a set of distinct keys
+func TestSmhasherWindowed(t *testing.T) {
+	windowed(t, &Int32Key{})
+	windowed(t, &Int64Key{})
+	windowed(t, &BytesKey{make([]byte, 128)})
+}
+func windowed(t *testing.T, k Key) {
+	if testing.Short() {
+		t.Skip("Skipping in short mode")
+	}
+	const BITS = 16
+
+	for r := 0; r < k.bits(); r++ {
+		h := newHashSet()
+		for i := 0; i < 1<<BITS; i++ {
+			k.clear()
+			for j := 0; j < BITS; j++ {
+				if i>>uint(j)&1 != 0 {
+					k.flipBit((j + r) % k.bits())
+				}
+			}
+			h.add(k.hash())
+		}
+		h.check(t)
+	}
+}
+
+// All keys of the form prefix + [A-Za-z0-9]*N + suffix.
+func TestSmhasherText(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping in short mode")
+	}
+	text(t, "Foo", "Bar")
+	text(t, "FooBar", "")
+	text(t, "", "FooBar")
+}
+func text(t *testing.T, prefix, suffix string) {
+	const N = 4
+	const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789"
+	const L = len(S)
+	b := make([]byte, len(prefix)+N+len(suffix))
+	copy(b, prefix)
+	copy(b[len(prefix)+N:], suffix)
+	h := newHashSet()
+	c := b[len(prefix):]
+	for i := 0; i < L; i++ {
+		c[0] = S[i]
+		for j := 0; j < L; j++ {
+			c[1] = S[j]
+			for k := 0; k < L; k++ {
+				c[2] = S[k]
+				for x := 0; x < L; x++ {
+					c[3] = S[x]
+					h.addB(b)
+				}
+			}
+		}
+	}
+	h.check(t)
+}
+
+// Make sure different seed values generate different hashes.
+func TestSmhasherSeed(t *testing.T) {
+	h := newHashSet()
+	const N = 100000
+	s := "hello"
+	for i := 0; i < N; i++ {
+		h.addS_seed(s, uintptr(i))
+	}
+	h.check(t)
+}
+
+// size of the hash output (32 or 64 bits)
+const hashSize = 32 + int(^uintptr(0)>>63<<5)
+
+func randBytes(r *rand.Rand, b []byte) {
+	for i := range b {
+		b[i] = byte(r.Uint32())
+	}
+}
+
+func benchmarkHash(b *testing.B, n int) {
+	s := strings.Repeat("A", n)
+
+	for i := 0; i < b.N; i++ {
+		StringHash(s, 0)
+	}
+	b.SetBytes(int64(n))
+}
+
+func BenchmarkHash5(b *testing.B)     { benchmarkHash(b, 5) }
+func BenchmarkHash16(b *testing.B)    { benchmarkHash(b, 16) }
+func BenchmarkHash64(b *testing.B)    { benchmarkHash(b, 64) }
+func BenchmarkHash1024(b *testing.B)  { benchmarkHash(b, 1024) }
+func BenchmarkHash65536(b *testing.B) { benchmarkHash(b, 65536) }
+
+func TestArrayHash(t *testing.T) {
+	if Compiler == "gccgo" {
+		t.Skip("does not work on gccgo without better escape analysis")
+	}
+
+	// Make sure that "" in arrays hash correctly. The hash
+	// should at least scramble the input seed so that, e.g.,
+	// {"","foo"} and {"foo",""} have different hashes.
+
+	// If the hash is bad, then all (8 choose 4) = 70 keys
+	// have the same hash. If so, we allocate 70/8 = 8
+	// overflow buckets. If the hash is good we don't
+	// normally allocate any overflow buckets, and the
+	// probability of even one or two overflows goes down rapidly.
+	// (There is always 1 allocation of the bucket array. The map
+	// header is allocated on the stack.)
+	f := func() {
+		// Make the key type at most 128 bytes. Otherwise,
+		// we get an allocation per key.
+		type key [8]string
+		m := make(map[key]bool, 70)
+
+		// fill m with keys that have 4 "foo"s and 4 ""s.
+		for i := 0; i < 256; i++ {
+			var k key
+			cnt := 0
+			for j := uint(0); j < 8; j++ {
+				if i>>j&1 != 0 {
+					k[j] = "foo"
+					cnt++
+				}
+			}
+			if cnt == 4 {
+				m[k] = true
+			}
+		}
+		if len(m) != 70 {
+			t.Errorf("bad test: (8 choose 4) should be 70, not %d", len(m))
+		}
+	}
+	if n := testing.AllocsPerRun(10, f); n > 6 {
+		t.Errorf("too many allocs %f - hash not balanced", n)
+	}
+}
+func TestStructHash(t *testing.T) {
+	// See the comment in TestArrayHash.
+	f := func() {
+		type key struct {
+			a, b, c, d, e, f, g, h string
+		}
+		m := make(map[key]bool, 70)
+
+		// fill m with keys that have 4 "foo"s and 4 ""s.
+		for i := 0; i < 256; i++ {
+			var k key
+			cnt := 0
+			if i&1 != 0 {
+				k.a = "foo"
+				cnt++
+			}
+			if i&2 != 0 {
+				k.b = "foo"
+				cnt++
+			}
+			if i&4 != 0 {
+				k.c = "foo"
+				cnt++
+			}
+			if i&8 != 0 {
+				k.d = "foo"
+				cnt++
+			}
+			if i&16 != 0 {
+				k.e = "foo"
+				cnt++
+			}
+			if i&32 != 0 {
+				k.f = "foo"
+				cnt++
+			}
+			if i&64 != 0 {
+				k.g = "foo"
+				cnt++
+			}
+			if i&128 != 0 {
+				k.h = "foo"
+				cnt++
+			}
+			if cnt == 4 {
+				m[k] = true
+			}
+		}
+		if len(m) != 70 {
+			t.Errorf("bad test: (8 choose 4) should be 70, not %d", len(m))
+		}
+	}
+	if n := testing.AllocsPerRun(10, f); n > 6 {
+		t.Errorf("too many allocs %f - hash not balanced", n)
+	}
+}
+
+var sink uint64
+
+func BenchmarkAlignedLoad(b *testing.B) {
+	var buf [16]byte
+	p := unsafe.Pointer(&buf[0])
+	var s uint64
+	for i := 0; i < b.N; i++ {
+		s += ReadUnaligned64(p)
+	}
+	sink = s
+}
+
+func BenchmarkUnalignedLoad(b *testing.B) {
+	var buf [16]byte
+	p := unsafe.Pointer(&buf[1])
+	var s uint64
+	for i := 0; i < b.N; i++ {
+		s += ReadUnaligned64(p)
+	}
+	sink = s
+}
+
+func TestCollisions(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping in short mode")
+	}
+	for i := 0; i < 16; i++ {
+		for j := 0; j < 16; j++ {
+			if j == i {
+				continue
+			}
+			var a [16]byte
+			m := make(map[uint16]struct{}, 1<<16)
+			for n := 0; n < 1<<16; n++ {
+				a[i] = byte(n)
+				a[j] = byte(n >> 8)
+				m[uint16(BytesHash(a[:], 0))] = struct{}{}
+			}
+			if len(m) <= 1<<15 {
+				t.Errorf("too many collisions i=%d j=%d outputs=%d out of 65536\n", i, j, len(m))
+			}
+		}
+	}
+}
diff --git a/libgo/go/runtime/heapdump.go b/libgo/go/runtime/heapdump.go
new file mode 100644
index 0000000..0db53f5
--- /dev/null
+++ b/libgo/go/runtime/heapdump.go
@@ -0,0 +1,594 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Implementation of runtime/debug.WriteHeapDump. Writes all
+// objects in the heap plus additional info (roots, threads,
+// finalizers, etc.) to a file.
+
+// The format of the dumped file is described at
+// https://golang.org/s/go15heapdump.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+//go:linkname runtime_debug_WriteHeapDump runtime_debug.WriteHeapDump
+func runtime_debug_WriteHeapDump(fd uintptr) {
+	stopTheWorld("write heap dump")
+
+	systemstack(func() {
+		writeheapdump_m(fd)
+	})
+
+	startTheWorld()
+}
+
+const (
+	fieldKindEol       = 0
+	fieldKindPtr       = 1
+	fieldKindIface     = 2
+	fieldKindEface     = 3
+	tagEOF             = 0
+	tagObject          = 1
+	tagOtherRoot       = 2
+	tagType            = 3
+	tagGoroutine       = 4
+	tagStackFrame      = 5
+	tagParams          = 6
+	tagFinalizer       = 7
+	tagItab            = 8
+	tagOSThread        = 9
+	tagMemStats        = 10
+	tagQueuedFinalizer = 11
+	tagData            = 12
+	tagBSS             = 13
+	tagDefer           = 14
+	tagPanic           = 15
+	tagMemProf         = 16
+	tagAllocSample     = 17
+)
+
+var dumpfd uintptr // fd to write the dump to.
+var tmpbuf []byte
+
+// buffer of pending write data
+const (
+	bufSize = 4096
+)
+
+var buf [bufSize]byte
+var nbuf uintptr
+
+func dwrite(data unsafe.Pointer, len uintptr) {
+	if len == 0 {
+		return
+	}
+	if nbuf+len <= bufSize {
+		copy(buf[nbuf:], (*[bufSize]byte)(data)[:len])
+		nbuf += len
+		return
+	}
+
+	write(dumpfd, unsafe.Pointer(&buf), int32(nbuf))
+	if len >= bufSize {
+		write(dumpfd, data, int32(len))
+		nbuf = 0
+	} else {
+		copy(buf[:], (*[bufSize]byte)(data)[:len])
+		nbuf = len
+	}
+}
+
+func dwritebyte(b byte) {
+	dwrite(unsafe.Pointer(&b), 1)
+}
+
+func flush() {
+	write(dumpfd, unsafe.Pointer(&buf), int32(nbuf))
+	nbuf = 0
+}
+
+// Cache of types that have been serialized already.
+// We use a type's hash field to pick a bucket.
+// Inside a bucket, we keep a list of types that
+// have been serialized so far, most recently used first.
+// Note: when a bucket overflows we may end up
+// serializing a type more than once. That's ok.
+const (
+	typeCacheBuckets = 256
+	typeCacheAssoc   = 4
+)
+
+type typeCacheBucket struct {
+	t [typeCacheAssoc]*_type
+}
+
+var typecache [typeCacheBuckets]typeCacheBucket
+
+// dump a uint64 in a varint format parseable by encoding/binary
+func dumpint(v uint64) {
+	var buf [10]byte
+	var n int
+	for v >= 0x80 {
+		buf[n] = byte(v | 0x80)
+		n++
+		v >>= 7
+	}
+	buf[n] = byte(v)
+	n++
+	dwrite(unsafe.Pointer(&buf), uintptr(n))
+}
+
+func dumpbool(b bool) {
+	if b {
+		dumpint(1)
+	} else {
+		dumpint(0)
+	}
+}
+
+// dump varint uint64 length followed by memory contents
+func dumpmemrange(data unsafe.Pointer, len uintptr) {
+	dumpint(uint64(len))
+	dwrite(data, len)
+}
+
+func dumpslice(b []byte) {
+	dumpint(uint64(len(b)))
+	if len(b) > 0 {
+		dwrite(unsafe.Pointer(&b[0]), uintptr(len(b)))
+	}
+}
+
+func dumpstr(s string) {
+	sp := stringStructOf(&s)
+	dumpmemrange(sp.str, uintptr(sp.len))
+}
+
+// dump information for a type
+func dumptype(t *_type) {
+	if t == nil {
+		return
+	}
+
+	// If we've definitely serialized the type before,
+	// no need to do it again.
+	b := &typecache[t.hash&(typeCacheBuckets-1)]
+	if t == b.t[0] {
+		return
+	}
+	for i := 1; i < typeCacheAssoc; i++ {
+		if t == b.t[i] {
+			// Move-to-front
+			for j := i; j > 0; j-- {
+				b.t[j] = b.t[j-1]
+			}
+			b.t[0] = t
+			return
+		}
+	}
+
+	// Might not have been dumped yet. Dump it and
+	// remember we did so.
+	for j := typeCacheAssoc - 1; j > 0; j-- {
+		b.t[j] = b.t[j-1]
+	}
+	b.t[0] = t
+
+	// dump the type
+	dumpint(tagType)
+	dumpint(uint64(uintptr(unsafe.Pointer(t))))
+	dumpint(uint64(t.size))
+	if x := t.uncommontype; x == nil || t.pkgPath == nil || *t.pkgPath == "" {
+		dumpstr(*t.string)
+	} else {
+		pkgpathstr := *t.pkgPath
+		pkgpath := stringStructOf(&pkgpathstr)
+		namestr := *t.name
+		name := stringStructOf(&namestr)
+		dumpint(uint64(uintptr(pkgpath.len) + 1 + uintptr(name.len)))
+		dwrite(pkgpath.str, uintptr(pkgpath.len))
+		dwritebyte('.')
+		dwrite(name.str, uintptr(name.len))
+	}
+	dumpbool(t.kind&kindDirectIface == 0 || t.kind&kindNoPointers == 0)
+}
+
+// dump an object
+func dumpobj(obj unsafe.Pointer, size uintptr, bv bitvector) {
+	dumpbvtypes(&bv, obj)
+	dumpint(tagObject)
+	dumpint(uint64(uintptr(obj)))
+	dumpmemrange(obj, size)
+	dumpfields(bv)
+}
+
+func dumpotherroot(description string, to unsafe.Pointer) {
+	dumpint(tagOtherRoot)
+	dumpstr(description)
+	dumpint(uint64(uintptr(to)))
+}
+
+func dumpfinalizer(obj unsafe.Pointer, fn *funcval, ft *functype, ot *ptrtype) {
+	dumpint(tagFinalizer)
+	dumpint(uint64(uintptr(obj)))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn.fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ft))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ot))))
+}
+
+type childInfo struct {
+	// Information passed up from the callee frame about
+	// the layout of the outargs region.
+	argoff uintptr   // where the arguments start in the frame
+	arglen uintptr   // size of args region
+	args   bitvector // if args.n >= 0, pointer map of args region
+	sp     *uint8    // callee sp
+	depth  uintptr   // depth in call stack (0 == most recent)
+}
+
+// dump kinds & offsets of interesting fields in bv
+func dumpbv(cbv *bitvector, offset uintptr) {
+	bv := gobv(*cbv)
+	for i := uintptr(0); i < bv.n; i++ {
+		if bv.bytedata[i/8]>>(i%8)&1 == 1 {
+			dumpint(fieldKindPtr)
+			dumpint(uint64(offset + i*sys.PtrSize))
+		}
+	}
+}
+
+func dumpgoroutine(gp *g) {
+	sp := gp.syscallsp
+
+	dumpint(tagGoroutine)
+	dumpint(uint64(uintptr(unsafe.Pointer(gp))))
+	dumpint(uint64(sp))
+	dumpint(uint64(gp.goid))
+	dumpint(uint64(gp.gopc))
+	dumpint(uint64(readgstatus(gp)))
+	dumpbool(isSystemGoroutine(gp))
+	dumpbool(false) // isbackground
+	dumpint(uint64(gp.waitsince))
+	dumpstr(gp.waitreason)
+	dumpint(0)
+	dumpint(uint64(uintptr(unsafe.Pointer(gp.m))))
+	dumpint(uint64(uintptr(unsafe.Pointer(gp._defer))))
+	dumpint(uint64(uintptr(unsafe.Pointer(gp._panic))))
+
+	// dump defer & panic records
+	for d := gp._defer; d != nil; d = d.link {
+		dumpint(tagDefer)
+		dumpint(uint64(uintptr(unsafe.Pointer(d))))
+		dumpint(uint64(uintptr(unsafe.Pointer(gp))))
+		dumpint(0)
+		dumpint(0)
+		dumpint(uint64(uintptr(unsafe.Pointer(d.pfn))))
+		dumpint(0)
+		dumpint(uint64(uintptr(unsafe.Pointer(d.link))))
+	}
+	for p := gp._panic; p != nil; p = p.link {
+		dumpint(tagPanic)
+		dumpint(uint64(uintptr(unsafe.Pointer(p))))
+		dumpint(uint64(uintptr(unsafe.Pointer(gp))))
+		eface := efaceOf(&p.arg)
+		dumpint(uint64(uintptr(unsafe.Pointer(eface._type))))
+		dumpint(uint64(uintptr(unsafe.Pointer(eface.data))))
+		dumpint(0) // was p->defer, no longer recorded
+		dumpint(uint64(uintptr(unsafe.Pointer(p.link))))
+	}
+}
+
+func dumpgs() {
+	// goroutines & stacks
+	for i := 0; uintptr(i) < allglen; i++ {
+		gp := allgs[i]
+		status := readgstatus(gp) // The world is stopped so gp will not be in a scan state.
+		switch status {
+		default:
+			print("runtime: unexpected G.status ", hex(status), "\n")
+			throw("dumpgs in STW - bad status")
+		case _Gdead:
+			// ok
+		case _Grunnable,
+			_Gsyscall,
+			_Gwaiting:
+			dumpgoroutine(gp)
+		}
+	}
+}
+
+func finq_callback(fn *funcval, obj unsafe.Pointer, ft *functype, ot *ptrtype) {
+	dumpint(tagQueuedFinalizer)
+	dumpint(uint64(uintptr(obj)))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(fn.fn))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ft))))
+	dumpint(uint64(uintptr(unsafe.Pointer(ot))))
+}
+
+func dumproots() {
+	// MSpan.types
+	for _, s := range mheap_.allspans {
+		if s.state == _MSpanInUse {
+			// Finalizers
+			for sp := s.specials; sp != nil; sp = sp.next {
+				if sp.kind != _KindSpecialFinalizer {
+					continue
+				}
+				spf := (*specialfinalizer)(unsafe.Pointer(sp))
+				p := unsafe.Pointer(s.base() + uintptr(spf.special.offset))
+				dumpfinalizer(p, spf.fn, spf.ft, spf.ot)
+			}
+		}
+	}
+
+	// Finalizer queue
+	iterate_finq(finq_callback)
+}
+
+// Bit vector of free marks.
+// Needs to be as big as the largest number of objects per span.
+var freemark [_PageSize / 8]bool
+
+func dumpobjs() {
+	for _, s := range mheap_.allspans {
+		if s.state != _MSpanInUse {
+			continue
+		}
+		p := s.base()
+		size := s.elemsize
+		n := (s.npages << _PageShift) / size
+		if n > uintptr(len(freemark)) {
+			throw("freemark array doesn't have enough entries")
+		}
+
+		for freeIndex := uintptr(0); freeIndex < s.nelems; freeIndex++ {
+			if s.isFree(freeIndex) {
+				freemark[freeIndex] = true
+			}
+		}
+
+		for j := uintptr(0); j < n; j, p = j+1, p+size {
+			if freemark[j] {
+				freemark[j] = false
+				continue
+			}
+			dumpobj(unsafe.Pointer(p), size, makeheapobjbv(p, size))
+		}
+	}
+}
+
+func dumpparams() {
+	dumpint(tagParams)
+	x := uintptr(1)
+	if *(*byte)(unsafe.Pointer(&x)) == 1 {
+		dumpbool(false) // little-endian ptrs
+	} else {
+		dumpbool(true) // big-endian ptrs
+	}
+	dumpint(sys.PtrSize)
+	dumpint(uint64(mheap_.arena_start))
+	dumpint(uint64(mheap_.arena_used))
+	dumpstr(sys.GOARCH)
+	dumpstr(sys.Goexperiment)
+	dumpint(uint64(ncpu))
+}
+
+func dumpms() {
+	for mp := allm; mp != nil; mp = mp.alllink {
+		dumpint(tagOSThread)
+		dumpint(uint64(uintptr(unsafe.Pointer(mp))))
+		dumpint(uint64(mp.id))
+		dumpint(mp.procid)
+	}
+}
+
+func dumpmemstats() {
+	dumpint(tagMemStats)
+	dumpint(memstats.alloc)
+	dumpint(memstats.total_alloc)
+	dumpint(memstats.sys)
+	dumpint(memstats.nlookup)
+	dumpint(memstats.nmalloc)
+	dumpint(memstats.nfree)
+	dumpint(memstats.heap_alloc)
+	dumpint(memstats.heap_sys)
+	dumpint(memstats.heap_idle)
+	dumpint(memstats.heap_inuse)
+	dumpint(memstats.heap_released)
+	dumpint(memstats.heap_objects)
+	dumpint(memstats.stacks_inuse)
+	dumpint(memstats.stacks_sys)
+	dumpint(memstats.mspan_inuse)
+	dumpint(memstats.mspan_sys)
+	dumpint(memstats.mcache_inuse)
+	dumpint(memstats.mcache_sys)
+	dumpint(memstats.buckhash_sys)
+	dumpint(memstats.gc_sys)
+	dumpint(memstats.other_sys)
+	dumpint(memstats.next_gc)
+	dumpint(memstats.last_gc)
+	dumpint(memstats.pause_total_ns)
+	for i := 0; i < 256; i++ {
+		dumpint(memstats.pause_ns[i])
+	}
+	dumpint(uint64(memstats.numgc))
+}
+
+func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *location, size, allocs, frees uintptr) {
+	stk := (*[100000]location)(unsafe.Pointer(pstk))
+	dumpint(tagMemProf)
+	dumpint(uint64(uintptr(unsafe.Pointer(b))))
+	dumpint(uint64(size))
+	dumpint(uint64(nstk))
+	for i := uintptr(0); i < nstk; i++ {
+		pc := stk[i].pc
+		fn := stk[i].function
+		file := stk[i].filename
+		line := stk[i].lineno
+		if fn == "" {
+			var buf [64]byte
+			n := len(buf)
+			n--
+			buf[n] = ')'
+			if pc == 0 {
+				n--
+				buf[n] = '0'
+			} else {
+				for pc > 0 {
+					n--
+					buf[n] = "0123456789abcdef"[pc&15]
+					pc >>= 4
+				}
+			}
+			n--
+			buf[n] = 'x'
+			n--
+			buf[n] = '0'
+			n--
+			buf[n] = '('
+			dumpslice(buf[n:])
+			dumpstr("?")
+			dumpint(0)
+		} else {
+			dumpstr(fn)
+			dumpstr(file)
+			dumpint(uint64(line))
+		}
+	}
+	dumpint(uint64(allocs))
+	dumpint(uint64(frees))
+}
+
+func dumpmemprof() {
+	iterate_memprof(dumpmemprof_callback)
+	for _, s := range mheap_.allspans {
+		if s.state != _MSpanInUse {
+			continue
+		}
+		for sp := s.specials; sp != nil; sp = sp.next {
+			if sp.kind != _KindSpecialProfile {
+				continue
+			}
+			spp := (*specialprofile)(unsafe.Pointer(sp))
+			p := s.base() + uintptr(spp.special.offset)
+			dumpint(tagAllocSample)
+			dumpint(uint64(p))
+			dumpint(uint64(uintptr(unsafe.Pointer(spp.b))))
+		}
+	}
+}
+
+var dumphdr = []byte("go1.7 heap dump\n")
+
+func mdump() {
+	// make sure we're done sweeping
+	for _, s := range mheap_.allspans {
+		if s.state == _MSpanInUse {
+			s.ensureSwept()
+		}
+	}
+	memclrNoHeapPointers(unsafe.Pointer(&typecache), unsafe.Sizeof(typecache))
+	dwrite(unsafe.Pointer(&dumphdr[0]), uintptr(len(dumphdr)))
+	dumpparams()
+	dumpobjs()
+	dumpgs()
+	dumpms()
+	dumproots()
+	dumpmemstats()
+	dumpmemprof()
+	dumpint(tagEOF)
+	flush()
+}
+
+func writeheapdump_m(fd uintptr) {
+	_g_ := getg()
+	casgstatus(_g_.m.curg, _Grunning, _Gwaiting)
+	_g_.waitreason = "dumping heap"
+
+	// Update stats so we can dump them.
+	// As a side effect, flushes all the MCaches so the MSpan.freelist
+	// lists contain all the free objects.
+	updatememstats(nil)
+
+	// Set dump file.
+	dumpfd = fd
+
+	// Call dump routine.
+	mdump()
+
+	// Reset dump file.
+	dumpfd = 0
+	if tmpbuf != nil {
+		sysFree(unsafe.Pointer(&tmpbuf[0]), uintptr(len(tmpbuf)), &memstats.other_sys)
+		tmpbuf = nil
+	}
+
+	casgstatus(_g_.m.curg, _Gwaiting, _Grunning)
+}
+
+// dumpint() the kind & offset of each field in an object.
+func dumpfields(bv bitvector) {
+	dumpbv(&bv, 0)
+	dumpint(fieldKindEol)
+}
+
+// The heap dump reader needs to be able to disambiguate
+// Eface entries. So it needs to know every type that might
+// appear in such an entry. The following routine accomplishes that.
+// TODO(rsc, khr): Delete - no longer possible.
+
+// Dump all the types that appear in the type field of
+// any Eface described by this bit vector.
+func dumpbvtypes(bv *bitvector, base unsafe.Pointer) {
+}
+
+func makeheapobjbv(p uintptr, size uintptr) bitvector {
+	// Extend the temp buffer if necessary.
+	nptr := size / sys.PtrSize
+	if uintptr(len(tmpbuf)) < nptr/8+1 {
+		if tmpbuf != nil {
+			sysFree(unsafe.Pointer(&tmpbuf[0]), uintptr(len(tmpbuf)), &memstats.other_sys)
+		}
+		n := nptr/8 + 1
+		p := sysAlloc(n, &memstats.other_sys)
+		if p == nil {
+			throw("heapdump: out of memory")
+		}
+		tmpbuf = (*[1 << 30]byte)(p)[:n]
+	}
+	// Convert heap bitmap to pointer bitmap.
+	for i := uintptr(0); i < nptr/8+1; i++ {
+		tmpbuf[i] = 0
+	}
+	i := uintptr(0)
+	hbits := heapBitsForAddr(p)
+	for ; i < nptr; i++ {
+		if i != 1 && !hbits.morePointers() {
+			break // end of object
+		}
+		if hbits.isPointer() {
+			tmpbuf[i/8] |= 1 << (i % 8)
+		}
+		hbits = hbits.next()
+	}
+	return bitvector{int32(i), &tmpbuf[0]}
+}
+
+type gobitvector struct {
+	n        uintptr
+	bytedata []uint8
+}
+
+func gobv(bv bitvector) gobitvector {
+	return gobitvector{
+		uintptr(bv.n),
+		(*[1 << 30]byte)(unsafe.Pointer(bv.bytedata))[:(bv.n+7)/8],
+	}
+}
diff --git a/libgo/go/runtime/iface_test.go b/libgo/go/runtime/iface_test.go
index 7f27baa..3744a4f 100644
--- a/libgo/go/runtime/iface_test.go
+++ b/libgo/go/runtime/iface_test.go
@@ -223,6 +223,10 @@ func BenchmarkAssertE2E2Blank(b *testing.B) {
 }
 
 func TestNonEscapingConvT2E(t *testing.T) {
+	if runtime.Compiler == "gccgo" {
+		t.Skip("does not work on gccgo without better escape analysis")
+	}
+
 	m := make(map[interface{}]bool)
 	m[42] = true
 	if !m[42] {
@@ -243,6 +247,10 @@ func TestNonEscapingConvT2E(t *testing.T) {
 }
 
 func TestNonEscapingConvT2I(t *testing.T) {
+	if runtime.Compiler == "gccgo" {
+		t.Skip("does not work on gccgo without better escape analysis")
+	}
+
 	m := make(map[I1]bool)
 	m[TM(42)] = true
 	if !m[TM(42)] {
diff --git a/libgo/go/runtime/lock_futex.go b/libgo/go/runtime/lock_futex.go
index 4d914b2..9877bc3 100644
--- a/libgo/go/runtime/lock_futex.go
+++ b/libgo/go/runtime/lock_futex.go
@@ -198,13 +198,10 @@ func notetsleep_internal(n *note, ns int64) bool {
 }
 
 func notetsleep(n *note, ns int64) bool {
-	// Currently OK to sleep in non-g0 for gccgo.  It happens in
-	// stoptheworld because our version of systemstack does not
-	// change to g0.
-	// gp := getg()
-	// if gp != gp.m.g0 && gp.m.preemptoff != "" {
-	//	throw("notetsleep not on g0")
-	// }
+	gp := getg()
+	if gp != gp.m.g0 && gp.m.preemptoff != "" {
+		throw("notetsleep not on g0")
+	}
 
 	return notetsleep_internal(n, ns)
 }
diff --git a/libgo/go/runtime/lock_sema.go b/libgo/go/runtime/lock_sema.go
index 5c70a74..57fee19 100644
--- a/libgo/go/runtime/lock_sema.go
+++ b/libgo/go/runtime/lock_sema.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin nacl netbsd openbsd plan9 solaris windows
+// +build aix darwin nacl netbsd openbsd plan9 solaris windows
 
 package runtime
 
@@ -251,14 +251,9 @@ func notetsleep_internal(n *note, ns int64, gp *g, deadline int64) bool {
 
 func notetsleep(n *note, ns int64) bool {
 	gp := getg()
-
-	// Currently OK to sleep in non-g0 for gccgo.  It happens in
-	// stoptheworld because our version of systemstack does not
-	// change to g0.
-	// if gp != gp.m.g0 && gp.m.preemptoff != "" {
-	//	throw("notetsleep not on g0")
-	// }
-
+	if gp != gp.m.g0 && gp.m.preemptoff != "" {
+		throw("notetsleep not on g0")
+	}
 	semacreate(gp.m)
 	return notetsleep_internal(n, ns, nil, 0)
 }
diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go
new file mode 100644
index 0000000..ed25782
--- /dev/null
+++ b/libgo/go/runtime/malloc.go
@@ -0,0 +1,998 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Memory allocator.
+//
+// This was originally based on tcmalloc, but has diverged quite a bit.
+// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
+
+// The main allocator works in runs of pages.
+// Small allocation sizes (up to and including 32 kB) are
+// rounded to one of about 70 size classes, each of which
+// has its own free set of objects of exactly that size.
+// Any free page of memory can be split into a set of objects
+// of one size class, which are then managed using a free bitmap.
+//
+// The allocator's data structures are:
+//
+//	fixalloc: a free-list allocator for fixed-size off-heap objects,
+//		used to manage storage used by the allocator.
+//	mheap: the malloc heap, managed at page (8192-byte) granularity.
+//	mspan: a run of pages managed by the mheap.
+//	mcentral: collects all spans of a given size class.
+//	mcache: a per-P cache of mspans with free space.
+//	mstats: allocation statistics.
+//
+// Allocating a small object proceeds up a hierarchy of caches:
+//
+//	1. Round the size up to one of the small size classes
+//	   and look in the corresponding mspan in this P's mcache.
+//	   Scan the mspan's free bitmap to find a free slot.
+//	   If there is a free slot, allocate it.
+//	   This can all be done without acquiring a lock.
+//
+//	2. If the mspan has no free slots, obtain a new mspan
+//	   from the mcentral's list of mspans of the required size
+//	   class that have free space.
+//	   Obtaining a whole span amortizes the cost of locking
+//	   the mcentral.
+//
+//	3. If the mcentral's mspan list is empty, obtain a run
+//	   of pages from the mheap to use for the mspan.
+//
+//	4. If the mheap is empty or has no page runs large enough,
+//	   allocate a new group of pages (at least 1MB) from the
+//	   operating system. Allocating a large run of pages
+//	   amortizes the cost of talking to the operating system.
+//
+// Sweeping an mspan and freeing objects on it proceeds up a similar
+// hierarchy:
+//
+//	1. If the mspan is being swept in response to allocation, it
+//	   is returned to the mcache to satisfy the allocation.
+//
+//	2. Otherwise, if the mspan still has allocated objects in it,
+//	   it is placed on the mcentral free list for the mspan's size
+//	   class.
+//
+//	3. Otherwise, if all objects in the mspan are free, the mspan
+//	   is now "idle", so it is returned to the mheap and no longer
+//	   has a size class.
+//	   This may coalesce it with adjacent idle mspans.
+//
+//	4. If an mspan remains idle for long enough, return its pages
+//	   to the operating system.
+//
+// Allocating and freeing a large object uses the mheap
+// directly, bypassing the mcache and mcentral.
+//
+// Free object slots in an mspan are zeroed only if mspan.needzero is
+// false. If needzero is true, objects are zeroed as they are
+// allocated. There are various benefits to delaying zeroing this way:
+//
+//	1. Stack frame allocation can avoid zeroing altogether.
+//
+//	2. It exhibits better temporal locality, since the program is
+//	   probably about to write to the memory.
+//
+//	3. We don't zero pages that never get reused.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// C function to get the end of the program's memory.
+func getEnd() uintptr
+
+// For gccgo, use go:linkname to rename compiler-called functions to
+// themselves, so that the compiler will export them.
+//
+//go:linkname newobject runtime.newobject
+
+// Functions called by C code.
+//go:linkname mallocgc runtime.mallocgc
+
+const (
+	debugMalloc = false
+
+	maxTinySize   = _TinySize
+	tinySizeClass = _TinySizeClass
+	maxSmallSize  = _MaxSmallSize
+
+	pageShift = _PageShift
+	pageSize  = _PageSize
+	pageMask  = _PageMask
+	// By construction, single page spans of the smallest object class
+	// have the most objects per span.
+	maxObjsPerSpan = pageSize / 8
+
+	mSpanInUse = _MSpanInUse
+
+	concurrentSweep = _ConcurrentSweep
+
+	_PageSize = 1 << _PageShift
+	_PageMask = _PageSize - 1
+
+	// _64bit = 1 on 64-bit systems, 0 on 32-bit systems
+	_64bit = 1 << (^uintptr(0) >> 63) / 2
+
+	// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go.
+	_TinySize      = 16
+	_TinySizeClass = 2
+
+	_FixAllocChunk  = 16 << 10               // Chunk size for FixAlloc
+	_MaxMHeapList   = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
+	_HeapAllocChunk = 1 << 20                // Chunk size for heap growth
+
+	// Per-P, per order stack segment cache size.
+	_StackCacheSize = 32 * 1024
+
+	// Number of orders that get caching. Order 0 is FixedStack
+	// and each successive order is twice as large.
+	// We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks
+	// will be allocated directly.
+	// Since FixedStack is different on different systems, we
+	// must vary NumStackOrders to keep the same maximum cached size.
+	//   OS               | FixedStack | NumStackOrders
+	//   -----------------+------------+---------------
+	//   linux/darwin/bsd | 2KB        | 4
+	//   windows/32       | 4KB        | 3
+	//   windows/64       | 8KB        | 2
+	//   plan9            | 4KB        | 3
+	_NumStackOrders = 4 - sys.PtrSize/4*sys.GoosWindows - 1*sys.GoosPlan9
+
+	// Number of bits in page to span calculations (4k pages).
+	// On Windows 64-bit we limit the arena to 32GB or 35 bits.
+	// Windows counts memory used by page table into committed memory
+	// of the process, so we can't reserve too much memory.
+	// See https://golang.org/issue/5402 and https://golang.org/issue/5236.
+	// On other 64-bit platforms, we limit the arena to 512GB, or 39 bits.
+	// On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
+	// The only exception is mips32 which only has access to low 2GB of virtual memory.
+	// On Darwin/arm64, we cannot reserve more than ~5GB of virtual memory,
+	// but as most devices have less than 4GB of physical memory anyway, we
+	// try to be conservative here, and only ask for a 2GB heap.
+	_MHeapMap_TotalBits = (_64bit*sys.GoosWindows)*35 + (_64bit*(1-sys.GoosWindows)*(1-sys.GoosDarwin*sys.GoarchArm64))*39 + sys.GoosDarwin*sys.GoarchArm64*31 + (1-_64bit)*(32-(sys.GoarchMips+sys.GoarchMipsle))
+	_MHeapMap_Bits      = _MHeapMap_TotalBits - _PageShift
+
+	_MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
+
+	// Max number of threads to run garbage collection.
+	// 2, 3, and 4 are all plausible maximums depending
+	// on the hardware details of the machine. The garbage
+	// collector scales well to 32 cpus.
+	_MaxGcproc = 32
+
+	_MaxArena32 = 1<<32 - 1
+
+	// minLegalPointer is the smallest possible legal pointer.
+	// This is the smallest possible architectural page size,
+	// since we assume that the first page is never mapped.
+	//
+	// This should agree with minZeroPage in the compiler.
+	minLegalPointer uintptr = 4096
+)
+
+// physPageSize is the size in bytes of the OS's physical pages.
+// Mapping and unmapping operations must be done at multiples of
+// physPageSize.
+//
+// This must be set by the OS init code (typically in osinit) before
+// mallocinit.
+var physPageSize uintptr
+
+// OS-defined helpers:
+//
+// sysAlloc obtains a large chunk of zeroed memory from the
+// operating system, typically on the order of a hundred kilobytes
+// or a megabyte.
+// NOTE: sysAlloc returns OS-aligned memory, but the heap allocator
+// may use larger alignment, so the caller must be careful to realign the
+// memory obtained by sysAlloc.
+//
+// SysUnused notifies the operating system that the contents
+// of the memory region are no longer needed and can be reused
+// for other purposes.
+// SysUsed notifies the operating system that the contents
+// of the memory region are needed again.
+//
+// SysFree returns it unconditionally; this is only used if
+// an out-of-memory error has been detected midway through
+// an allocation. It is okay if SysFree is a no-op.
+//
+// SysReserve reserves address space without allocating memory.
+// If the pointer passed to it is non-nil, the caller wants the
+// reservation there, but SysReserve can still choose another
+// location if that one is unavailable. On some systems and in some
+// cases SysReserve will simply check that the address space is
+// available and not actually reserve it. If SysReserve returns
+// non-nil, it sets *reserved to true if the address space is
+// reserved, false if it has merely been checked.
+// NOTE: SysReserve returns OS-aligned memory, but the heap allocator
+// may use larger alignment, so the caller must be careful to realign the
+// memory obtained by sysAlloc.
+//
+// SysMap maps previously reserved address space for use.
+// The reserved argument is true if the address space was really
+// reserved, not merely checked.
+//
+// SysFault marks a (already sysAlloc'd) region to fault
+// if accessed. Used only for debugging the runtime.
+
+func mallocinit() {
+	if class_to_size[_TinySizeClass] != _TinySize {
+		throw("bad TinySizeClass")
+	}
+
+	// Not used for gccgo.
+	// testdefersizes()
+
+	// Copy class sizes out for statistics table.
+	for i := range class_to_size {
+		memstats.by_size[i].size = uint32(class_to_size[i])
+	}
+
+	// Check physPageSize.
+	if physPageSize == 0 {
+		// The OS init code failed to fetch the physical page size.
+		throw("failed to get system page size")
+	}
+	if physPageSize < minPhysPageSize {
+		print("system page size (", physPageSize, ") is smaller than minimum page size (", minPhysPageSize, ")\n")
+		throw("bad system page size")
+	}
+	if physPageSize&(physPageSize-1) != 0 {
+		print("system page size (", physPageSize, ") must be a power of 2\n")
+		throw("bad system page size")
+	}
+
+	var p, bitmapSize, spansSize, pSize, limit uintptr
+	var reserved bool
+
+	// limit = runtime.memlimit();
+	// See https://golang.org/issue/5049
+	// TODO(rsc): Fix after 1.1.
+	limit = 0
+
+	// Set up the allocation arena, a contiguous area of memory where
+	// allocated data will be found. The arena begins with a bitmap large
+	// enough to hold 2 bits per allocated word.
+	if sys.PtrSize == 8 && (limit == 0 || limit > 1<<30) {
+		// On a 64-bit machine, allocate from a single contiguous reservation.
+		// 512 GB (MaxMem) should be big enough for now.
+		//
+		// The code will work with the reservation at any address, but ask
+		// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
+		// Allocating a 512 GB region takes away 39 bits, and the amd64
+		// doesn't let us choose the top 17 bits, so that leaves the 9 bits
+		// in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means
+		// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
+		// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
+		// UTF-8 sequences, and they are otherwise as far away from
+		// ff (likely a common byte) as possible. If that fails, we try other 0xXXc0
+		// addresses. An earlier attempt to use 0x11f8 caused out of memory errors
+		// on OS X during thread allocations.  0x00c0 causes conflicts with
+		// AddressSanitizer which reserves all memory up to 0x0100.
+		// These choices are both for debuggability and to reduce the
+		// odds of a conservative garbage collector (as is still used in gccgo)
+		// not collecting memory because some non-pointer block of memory
+		// had a bit pattern that matched a memory address.
+		//
+		// Actually we reserve 544 GB (because the bitmap ends up being 32 GB)
+		// but it hardly matters: e0 00 is not valid UTF-8 either.
+		//
+		// If this fails we fall back to the 32 bit memory mechanism
+		//
+		// However, on arm64, we ignore all this advice above and slam the
+		// allocation at 0x40 << 32 because when using 4k pages with 3-level
+		// translation buffers, the user address space is limited to 39 bits
+		// On darwin/arm64, the address space is even smaller.
+		arenaSize := round(_MaxMem, _PageSize)
+		bitmapSize = arenaSize / (sys.PtrSize * 8 / 2)
+		spansSize = arenaSize / _PageSize * sys.PtrSize
+		spansSize = round(spansSize, _PageSize)
+		for i := 0; i <= 0x7f; i++ {
+			switch {
+			case GOARCH == "arm64" && GOOS == "darwin":
+				p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
+			case GOARCH == "arm64":
+				p = uintptr(i)<<40 | uintptrMask&(0x0040<<32)
+			default:
+				p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
+			}
+			pSize = bitmapSize + spansSize + arenaSize + _PageSize
+			p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
+			if p != 0 {
+				break
+			}
+		}
+	}
+
+	if p == 0 {
+		// On a 32-bit machine, we can't typically get away
+		// with a giant virtual address space reservation.
+		// Instead we map the memory information bitmap
+		// immediately after the data segment, large enough
+		// to handle the entire 4GB address space (256 MB),
+		// along with a reservation for an initial arena.
+		// When that gets used up, we'll start asking the kernel
+		// for any memory anywhere.
+
+		// If we fail to allocate, try again with a smaller arena.
+		// This is necessary on Android L where we share a process
+		// with ART, which reserves virtual memory aggressively.
+		// In the worst case, fall back to a 0-sized initial arena,
+		// in the hope that subsequent reservations will succeed.
+		arenaSizes := [...]uintptr{
+			512 << 20,
+			256 << 20,
+			128 << 20,
+			0,
+		}
+
+		for _, arenaSize := range &arenaSizes {
+			bitmapSize = (_MaxArena32 + 1) / (sys.PtrSize * 8 / 2)
+			spansSize = (_MaxArena32 + 1) / _PageSize * sys.PtrSize
+			if limit > 0 && arenaSize+bitmapSize+spansSize > limit {
+				bitmapSize = (limit / 9) &^ ((1 << _PageShift) - 1)
+				arenaSize = bitmapSize * 8
+				spansSize = arenaSize / _PageSize * sys.PtrSize
+			}
+			spansSize = round(spansSize, _PageSize)
+
+			// SysReserve treats the address we ask for, end, as a hint,
+			// not as an absolute requirement. If we ask for the end
+			// of the data segment but the operating system requires
+			// a little more space before we can start allocating, it will
+			// give out a slightly higher pointer. Except QEMU, which
+			// is buggy, as usual: it won't adjust the pointer upward.
+			// So adjust it upward a little bit ourselves: 1/4 MB to get
+			// away from the running binary image and then round up
+			// to a MB boundary.
+			p = round(getEnd()+(1<<18), 1<<20)
+			pSize = bitmapSize + spansSize + arenaSize + _PageSize
+			p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
+			if p != 0 {
+				break
+			}
+		}
+		if p == 0 {
+			throw("runtime: cannot reserve arena virtual address space")
+		}
+	}
+
+	// PageSize can be larger than OS definition of page size,
+	// so SysReserve can give us a PageSize-unaligned pointer.
+	// To overcome this we ask for PageSize more and round up the pointer.
+	p1 := round(p, _PageSize)
+
+	spansStart := p1
+	mheap_.bitmap = p1 + spansSize + bitmapSize
+	if sys.PtrSize == 4 {
+		// Set arena_start such that we can accept memory
+		// reservations located anywhere in the 4GB virtual space.
+		mheap_.arena_start = 0
+	} else {
+		mheap_.arena_start = p1 + (spansSize + bitmapSize)
+	}
+	mheap_.arena_end = p + pSize
+	mheap_.arena_used = p1 + (spansSize + bitmapSize)
+	mheap_.arena_reserved = reserved
+
+	if mheap_.arena_start&(_PageSize-1) != 0 {
+		println("bad pagesize", hex(p), hex(p1), hex(spansSize), hex(bitmapSize), hex(_PageSize), "start", hex(mheap_.arena_start))
+		throw("misrounded allocation in mallocinit")
+	}
+
+	// Initialize the rest of the allocator.
+	mheap_.init(spansStart, spansSize)
+	_g_ := getg()
+	_g_.m.mcache = allocmcache()
+}
+
+// sysAlloc allocates the next n bytes from the heap arena. The
+// returned pointer is always _PageSize aligned and between
+// h.arena_start and h.arena_end. sysAlloc returns nil on failure.
+// There is no corresponding free function.
+func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
+	if n > h.arena_end-h.arena_used {
+		// We are in 32-bit mode, maybe we didn't use all possible address space yet.
+		// Reserve some more space.
+		p_size := round(n+_PageSize, 256<<20)
+		new_end := h.arena_end + p_size // Careful: can overflow
+		if h.arena_end <= new_end && new_end-h.arena_start-1 <= _MaxArena32 {
+			// TODO: It would be bad if part of the arena
+			// is reserved and part is not.
+			var reserved bool
+			p := uintptr(sysReserve(unsafe.Pointer(h.arena_end), p_size, &reserved))
+			if p == 0 {
+				return nil
+			}
+			if p == h.arena_end {
+				h.arena_end = new_end
+				h.arena_reserved = reserved
+			} else if h.arena_start <= p && p+p_size-h.arena_start-1 <= _MaxArena32 {
+				// Keep everything page-aligned.
+				// Our pages are bigger than hardware pages.
+				h.arena_end = p + p_size
+				used := p + (-p & (_PageSize - 1))
+				h.mapBits(used)
+				h.mapSpans(used)
+				h.arena_used = used
+				h.arena_reserved = reserved
+			} else {
+				// We haven't added this allocation to
+				// the stats, so subtract it from a
+				// fake stat (but avoid underflow).
+				stat := uint64(p_size)
+				sysFree(unsafe.Pointer(p), p_size, &stat)
+			}
+		}
+	}
+
+	if n <= h.arena_end-h.arena_used {
+		// Keep taking from our reservation.
+		p := h.arena_used
+		sysMap(unsafe.Pointer(p), n, h.arena_reserved, &memstats.heap_sys)
+		h.mapBits(p + n)
+		h.mapSpans(p + n)
+		h.arena_used = p + n
+		if raceenabled {
+			racemapshadow(unsafe.Pointer(p), n)
+		}
+
+		if p&(_PageSize-1) != 0 {
+			throw("misrounded allocation in MHeap_SysAlloc")
+		}
+		return unsafe.Pointer(p)
+	}
+
+	// If using 64-bit, our reservation is all we have.
+	if h.arena_end-h.arena_start > _MaxArena32 {
+		return nil
+	}
+
+	// On 32-bit, once the reservation is gone we can
+	// try to get memory at a location chosen by the OS.
+	p_size := round(n, _PageSize) + _PageSize
+	p := uintptr(sysAlloc(p_size, &memstats.heap_sys))
+	if p == 0 {
+		return nil
+	}
+
+	if p < h.arena_start || p+p_size-h.arena_start > _MaxArena32 {
+		top := ^uintptr(0)
+		if top-h.arena_start-1 > _MaxArena32 {
+			top = h.arena_start + _MaxArena32 + 1
+		}
+		print("runtime: memory allocated by OS (", hex(p), ") not in usable range [", hex(h.arena_start), ",", hex(top), ")\n")
+		sysFree(unsafe.Pointer(p), p_size, &memstats.heap_sys)
+		return nil
+	}
+
+	p_end := p + p_size
+	p += -p & (_PageSize - 1)
+	if p+n > h.arena_used {
+		h.mapBits(p + n)
+		h.mapSpans(p + n)
+		h.arena_used = p + n
+		if p_end > h.arena_end {
+			h.arena_end = p_end
+		}
+		if raceenabled {
+			racemapshadow(unsafe.Pointer(p), n)
+		}
+	}
+
+	if p&(_PageSize-1) != 0 {
+		throw("misrounded allocation in MHeap_SysAlloc")
+	}
+	return unsafe.Pointer(p)
+}
+
+// base address for all 0-byte allocations
+var zerobase uintptr
+
+// nextFreeFast returns the next free object if one is quickly available.
+// Otherwise it returns 0.
+func nextFreeFast(s *mspan) gclinkptr {
+	theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache?
+	if theBit < 64 {
+		result := s.freeindex + uintptr(theBit)
+		if result < s.nelems {
+			freeidx := result + 1
+			if freeidx%64 == 0 && freeidx != s.nelems {
+				return 0
+			}
+			s.allocCache >>= (theBit + 1)
+			s.freeindex = freeidx
+			v := gclinkptr(result*s.elemsize + s.base())
+			s.allocCount++
+			return v
+		}
+	}
+	return 0
+}
+
+// nextFree returns the next free object from the cached span if one is available.
+// Otherwise it refills the cache with a span with an available object and
+// returns that object along with a flag indicating that this was a heavy
+// weight allocation. If it is a heavy weight allocation the caller must
+// determine whether a new GC cycle needs to be started or if the GC is active
+// whether this goroutine needs to assist the GC.
+func (c *mcache) nextFree(sizeclass uint8) (v gclinkptr, s *mspan, shouldhelpgc bool) {
+	s = c.alloc[sizeclass]
+	shouldhelpgc = false
+	freeIndex := s.nextFreeIndex()
+	if freeIndex == s.nelems {
+		// The span is full.
+		if uintptr(s.allocCount) != s.nelems {
+			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
+			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
+		}
+		systemstack(func() {
+			c.refill(int32(sizeclass))
+		})
+		shouldhelpgc = true
+		s = c.alloc[sizeclass]
+
+		freeIndex = s.nextFreeIndex()
+	}
+
+	if freeIndex >= s.nelems {
+		throw("freeIndex is not valid")
+	}
+
+	v = gclinkptr(freeIndex*s.elemsize + s.base())
+	s.allocCount++
+	if uintptr(s.allocCount) > s.nelems {
+		println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
+		throw("s.allocCount > s.nelems")
+	}
+	return
+}
+
+// Allocate an object of size bytes.
+// Small objects are allocated from the per-P cache's free lists.
+// Large objects (> 32 kB) are allocated straight from the heap.
+func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
+	if gcphase == _GCmarktermination {
+		throw("mallocgc called with gcphase == _GCmarktermination")
+	}
+
+	if size == 0 {
+		return unsafe.Pointer(&zerobase)
+	}
+
+	if debug.sbrk != 0 {
+		align := uintptr(16)
+		if typ != nil {
+			align = uintptr(typ.align)
+		}
+		return persistentalloc(size, align, &memstats.other_sys)
+	}
+
+	// When using gccgo, when a cgo or SWIG function has an
+	// interface return type and the function returns a
+	// non-pointer, memory allocation occurs after syscall.Cgocall
+	// but before syscall.CgocallDone. Treat this allocation as a
+	// callback.
+	incallback := false
+	if gomcache() == nil && getg().m.ncgo > 0 {
+		exitsyscall(0)
+		incallback = true
+	}
+
+	// assistG is the G to charge for this allocation, or nil if
+	// GC is not currently active.
+	var assistG *g
+	if gcBlackenEnabled != 0 {
+		// Charge the current user G for this allocation.
+		assistG = getg()
+		if assistG.m.curg != nil {
+			assistG = assistG.m.curg
+		}
+		// Charge the allocation against the G. We'll account
+		// for internal fragmentation at the end of mallocgc.
+		assistG.gcAssistBytes -= int64(size)
+
+		if assistG.gcAssistBytes < 0 {
+			// This G is in debt. Assist the GC to correct
+			// this before allocating. This must happen
+			// before disabling preemption.
+			gcAssistAlloc(assistG)
+		}
+	}
+
+	// Set mp.mallocing to keep from being preempted by GC.
+	mp := acquirem()
+	if mp.mallocing != 0 {
+		throw("malloc deadlock")
+	}
+	if mp.gsignal == getg() {
+		throw("malloc during signal")
+	}
+	mp.mallocing = 1
+
+	shouldhelpgc := false
+	dataSize := size
+	c := gomcache()
+	var x unsafe.Pointer
+	noscan := typ == nil || typ.kind&kindNoPointers != 0
+	if size <= maxSmallSize {
+		if noscan && size < maxTinySize {
+			// Tiny allocator.
+			//
+			// Tiny allocator combines several tiny allocation requests
+			// into a single memory block. The resulting memory block
+			// is freed when all subobjects are unreachable. The subobjects
+			// must be noscan (don't have pointers), this ensures that
+			// the amount of potentially wasted memory is bounded.
+			//
+			// Size of the memory block used for combining (maxTinySize) is tunable.
+			// Current setting is 16 bytes, which relates to 2x worst case memory
+			// wastage (when all but one subobjects are unreachable).
+			// 8 bytes would result in no wastage at all, but provides less
+			// opportunities for combining.
+			// 32 bytes provides more opportunities for combining,
+			// but can lead to 4x worst case wastage.
+			// The best case winning is 8x regardless of block size.
+			//
+			// Objects obtained from tiny allocator must not be freed explicitly.
+			// So when an object will be freed explicitly, we ensure that
+			// its size >= maxTinySize.
+			//
+			// SetFinalizer has a special case for objects potentially coming
+			// from tiny allocator, it such case it allows to set finalizers
+			// for an inner byte of a memory block.
+			//
+			// The main targets of tiny allocator are small strings and
+			// standalone escaping variables. On a json benchmark
+			// the allocator reduces number of allocations by ~12% and
+			// reduces heap size by ~20%.
+			off := c.tinyoffset
+			// Align tiny pointer for required (conservative) alignment.
+			if size&7 == 0 {
+				off = round(off, 8)
+			} else if size&3 == 0 {
+				off = round(off, 4)
+			} else if size&1 == 0 {
+				off = round(off, 2)
+			}
+			if off+size <= maxTinySize && c.tiny != 0 {
+				// The object fits into existing tiny block.
+				x = unsafe.Pointer(c.tiny + off)
+				c.tinyoffset = off + size
+				c.local_tinyallocs++
+				mp.mallocing = 0
+				releasem(mp)
+				if incallback {
+					entersyscall(0)
+				}
+				return x
+			}
+			// Allocate a new maxTinySize block.
+			span := c.alloc[tinySizeClass]
+			v := nextFreeFast(span)
+			if v == 0 {
+				v, _, shouldhelpgc = c.nextFree(tinySizeClass)
+			}
+			x = unsafe.Pointer(v)
+			(*[2]uint64)(x)[0] = 0
+			(*[2]uint64)(x)[1] = 0
+			// See if we need to replace the existing tiny block with the new one
+			// based on amount of remaining free space.
+			if size < c.tinyoffset || c.tiny == 0 {
+				c.tiny = uintptr(x)
+				c.tinyoffset = size
+			}
+			size = maxTinySize
+		} else {
+			var sizeclass uint8
+			if size <= smallSizeMax-8 {
+				sizeclass = size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv]
+			} else {
+				sizeclass = size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]
+			}
+			size = uintptr(class_to_size[sizeclass])
+			span := c.alloc[sizeclass]
+			v := nextFreeFast(span)
+			if v == 0 {
+				v, span, shouldhelpgc = c.nextFree(sizeclass)
+			}
+			x = unsafe.Pointer(v)
+			if needzero && span.needzero != 0 {
+				memclrNoHeapPointers(unsafe.Pointer(v), size)
+			}
+		}
+	} else {
+		var s *mspan
+		shouldhelpgc = true
+		systemstack(func() {
+			s = largeAlloc(size, needzero)
+		})
+		s.freeindex = 1
+		s.allocCount = 1
+		x = unsafe.Pointer(s.base())
+		size = s.elemsize
+	}
+
+	var scanSize uintptr
+	if noscan {
+		heapBitsSetTypeNoScan(uintptr(x))
+	} else {
+		heapBitsSetType(uintptr(x), size, dataSize, typ)
+		if dataSize > typ.size {
+			// Array allocation. If there are any
+			// pointers, GC has to scan to the last
+			// element.
+			if typ.ptrdata != 0 {
+				scanSize = dataSize - typ.size + typ.ptrdata
+			}
+		} else {
+			scanSize = typ.ptrdata
+		}
+		c.local_scan += scanSize
+	}
+
+	// Ensure that the stores above that initialize x to
+	// type-safe memory and set the heap bits occur before
+	// the caller can make x observable to the garbage
+	// collector. Otherwise, on weakly ordered machines,
+	// the garbage collector could follow a pointer to x,
+	// but see uninitialized memory or stale heap bits.
+	publicationBarrier()
+
+	// Allocate black during GC.
+	// All slots hold nil so no scanning is needed.
+	// This may be racing with GC so do it atomically if there can be
+	// a race marking the bit.
+	if gcphase != _GCoff {
+		gcmarknewobject(uintptr(x), size, scanSize)
+	}
+
+	if raceenabled {
+		racemalloc(x, size)
+	}
+
+	if msanenabled {
+		msanmalloc(x, size)
+	}
+
+	mp.mallocing = 0
+	releasem(mp)
+
+	if debug.allocfreetrace != 0 {
+		tracealloc(x, size, typ)
+	}
+
+	if rate := MemProfileRate; rate > 0 {
+		if size < uintptr(rate) && int32(size) < c.next_sample {
+			c.next_sample -= int32(size)
+		} else {
+			mp := acquirem()
+			profilealloc(mp, x, size)
+			releasem(mp)
+		}
+	}
+
+	if assistG != nil {
+		// Account for internal fragmentation in the assist
+		// debt now that we know it.
+		assistG.gcAssistBytes -= int64(size - dataSize)
+	}
+
+	if shouldhelpgc && gcShouldStart(false) {
+		gcStart(gcBackgroundMode, false)
+	}
+
+	if getg().preempt {
+		checkPreempt()
+	}
+
+	if incallback {
+		entersyscall(0)
+	}
+
+	return x
+}
+
+func largeAlloc(size uintptr, needzero bool) *mspan {
+	// print("largeAlloc size=", size, "\n")
+
+	if size+_PageSize < size {
+		throw("out of memory")
+	}
+	npages := size >> _PageShift
+	if size&_PageMask != 0 {
+		npages++
+	}
+
+	// Deduct credit for this span allocation and sweep if
+	// necessary. mHeap_Alloc will also sweep npages, so this only
+	// pays the debt down to npage pages.
+	deductSweepCredit(npages*_PageSize, npages)
+
+	s := mheap_.alloc(npages, 0, true, needzero)
+	if s == nil {
+		throw("out of memory")
+	}
+	s.limit = s.base() + size
+	heapBitsForSpan(s.base()).initSpan(s)
+	return s
+}
+
+// implementation of new builtin
+// compiler (both frontend and SSA backend) knows the signature
+// of this function
+func newobject(typ *_type) unsafe.Pointer {
+	return mallocgc(typ.size, typ, true)
+}
+
+//go:linkname reflect_unsafe_New reflect.unsafe_New
+func reflect_unsafe_New(typ *_type) unsafe.Pointer {
+	return newobject(typ)
+}
+
+// newarray allocates an array of n elements of type typ.
+func newarray(typ *_type, n int) unsafe.Pointer {
+	if n < 0 || uintptr(n) > maxSliceCap(typ.size) {
+		panic(plainError("runtime: allocation size out of range"))
+	}
+	return mallocgc(typ.size*uintptr(n), typ, true)
+}
+
+//go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray
+func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
+	return newarray(typ, n)
+}
+
+func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
+	mp.mcache.next_sample = nextSample()
+	mProf_Malloc(x, size)
+}
+
+// nextSample returns the next sampling point for heap profiling.
+// It produces a random variable with a geometric distribution and
+// mean MemProfileRate. This is done by generating a uniformly
+// distributed random number and applying the cumulative distribution
+// function for an exponential.
+func nextSample() int32 {
+	if GOOS == "plan9" {
+		// Plan 9 doesn't support floating point in note handler.
+		if g := getg(); g == g.m.gsignal {
+			return nextSampleNoFP()
+		}
+	}
+
+	period := MemProfileRate
+
+	// make nextSample not overflow. Maximum possible step is
+	// -ln(1/(1<<kRandomBitCount)) * period, approximately 20 * period.
+	switch {
+	case period > 0x7000000:
+		period = 0x7000000
+	case period == 0:
+		return 0
+	}
+
+	// Let m be the sample rate,
+	// the probability distribution function is m*exp(-mx), so the CDF is
+	// p = 1 - exp(-mx), so
+	// q = 1 - p == exp(-mx)
+	// log_e(q) = -mx
+	// -log_e(q)/m = x
+	// x = -log_e(q) * period
+	// x = log_2(q) * (-log_e(2)) * period    ; Using log_2 for efficiency
+	const randomBitCount = 26
+	q := fastrand()%(1<<randomBitCount) + 1
+	qlog := fastlog2(float64(q)) - randomBitCount
+	if qlog > 0 {
+		qlog = 0
+	}
+	const minusLog2 = -0.6931471805599453 // -ln(2)
+	return int32(qlog*(minusLog2*float64(period))) + 1
+}
+
+// nextSampleNoFP is similar to nextSample, but uses older,
+// simpler code to avoid floating point.
+func nextSampleNoFP() int32 {
+	// Set first allocation sample size.
+	rate := MemProfileRate
+	if rate > 0x3fffffff { // make 2*rate not overflow
+		rate = 0x3fffffff
+	}
+	if rate != 0 {
+		return int32(int(fastrand()) % (2 * rate))
+	}
+	return 0
+}
+
+type persistentAlloc struct {
+	base unsafe.Pointer
+	off  uintptr
+}
+
+var globalAlloc struct {
+	mutex
+	persistentAlloc
+}
+
+// Wrapper around sysAlloc that can allocate small chunks.
+// There is no associated free operation.
+// Intended for things like function/type/debug-related persistent data.
+// If align is 0, uses default align (currently 8).
+// The returned memory will be zeroed.
+//
+// Consider marking persistentalloc'd types go:notinheap.
+func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+	var p unsafe.Pointer
+	systemstack(func() {
+		p = persistentalloc1(size, align, sysStat)
+	})
+	return p
+}
+
+// Must run on system stack because stack growth can (re)invoke it.
+// See issue 9174.
+//go:systemstack
+func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+	const (
+		chunk    = 256 << 10
+		maxBlock = 64 << 10 // VM reservation granularity is 64K on windows
+	)
+
+	if size == 0 {
+		throw("persistentalloc: size == 0")
+	}
+	if align != 0 {
+		if align&(align-1) != 0 {
+			throw("persistentalloc: align is not a power of 2")
+		}
+		if align > _PageSize {
+			throw("persistentalloc: align is too large")
+		}
+	} else {
+		align = 8
+	}
+
+	if size >= maxBlock {
+		return sysAlloc(size, sysStat)
+	}
+
+	mp := acquirem()
+	var persistent *persistentAlloc
+	if mp != nil && mp.p != 0 {
+		persistent = &mp.p.ptr().palloc
+	} else {
+		lock(&globalAlloc.mutex)
+		persistent = &globalAlloc.persistentAlloc
+	}
+	persistent.off = round(persistent.off, align)
+	if persistent.off+size > chunk || persistent.base == nil {
+		persistent.base = sysAlloc(chunk, &memstats.other_sys)
+		if persistent.base == nil {
+			if persistent == &globalAlloc.persistentAlloc {
+				unlock(&globalAlloc.mutex)
+			}
+			throw("runtime: cannot allocate memory")
+		}
+		persistent.off = 0
+	}
+	p := add(persistent.base, persistent.off)
+	persistent.off += size
+	releasem(mp)
+	if persistent == &globalAlloc.persistentAlloc {
+		unlock(&globalAlloc.mutex)
+	}
+
+	if sysStat != &memstats.other_sys {
+		mSysStatInc(sysStat, size)
+		mSysStatDec(&memstats.other_sys, size)
+	}
+	return p
+}
diff --git a/libgo/go/runtime/mbarrier.go b/libgo/go/runtime/mbarrier.go
new file mode 100644
index 0000000..3a463c8
--- /dev/null
+++ b/libgo/go/runtime/mbarrier.go
@@ -0,0 +1,418 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: write barriers.
+//
+// For the concurrent garbage collector, the Go compiler implements
+// updates to pointer-valued fields that may be in heap objects by
+// emitting calls to write barriers. This file contains the actual write barrier
+// implementation, gcmarkwb_m, and the various wrappers called by the
+// compiler to implement pointer assignment, slice assignment,
+// typed memmove, and so on.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// For gccgo, use go:linkname to rename compiler-called functions to
+// themselves, so that the compiler will export them.
+//
+//go:linkname writebarrierptr runtime.writebarrierptr
+//go:linkname typedmemmove runtime.typedmemmove
+//go:linkname typedslicecopy runtime.typedslicecopy
+
+// gcmarkwb_m is the mark-phase write barrier, the only barrier we have.
+// The rest of this file exists only to make calls to this function.
+//
+// This is a hybrid barrier that combines a Yuasa-style deletion
+// barrier—which shades the object whose reference is being
+// overwritten—with Dijkstra insertion barrier—which shades the object
+// whose reference is being written. The insertion part of the barrier
+// is necessary while the calling goroutine's stack is grey. In
+// pseudocode, the barrier is:
+//
+//     writePointer(slot, ptr):
+//         shade(*slot)
+//         if current stack is grey:
+//             shade(ptr)
+//         *slot = ptr
+//
+// slot is the destination in Go code.
+// ptr is the value that goes into the slot in Go code.
+//
+// Shade indicates that it has seen a white pointer by adding the referent
+// to wbuf as well as marking it.
+//
+// The two shades and the condition work together to prevent a mutator
+// from hiding an object from the garbage collector:
+//
+// 1. shade(*slot) prevents a mutator from hiding an object by moving
+// the sole pointer to it from the heap to its stack. If it attempts
+// to unlink an object from the heap, this will shade it.
+//
+// 2. shade(ptr) prevents a mutator from hiding an object by moving
+// the sole pointer to it from its stack into a black object in the
+// heap. If it attempts to install the pointer into a black object,
+// this will shade it.
+//
+// 3. Once a goroutine's stack is black, the shade(ptr) becomes
+// unnecessary. shade(ptr) prevents hiding an object by moving it from
+// the stack to the heap, but this requires first having a pointer
+// hidden on the stack. Immediately after a stack is scanned, it only
+// points to shaded objects, so it's not hiding anything, and the
+// shade(*slot) prevents it from hiding any other pointers on its
+// stack.
+//
+// For a detailed description of this barrier and proof of
+// correctness, see https://github.com/golang/proposal/blob/master/design/17503-eliminate-rescan.md
+//
+//
+//
+// Dealing with memory ordering:
+//
+// Both the Yuasa and Dijkstra barriers can be made conditional on the
+// color of the object containing the slot. We chose not to make these
+// conditional because the cost of ensuring that the object holding
+// the slot doesn't concurrently change color without the mutator
+// noticing seems prohibitive.
+//
+// Consider the following example where the mutator writes into
+// a slot and then loads the slot's mark bit while the GC thread
+// writes to the slot's mark bit and then as part of scanning reads
+// the slot.
+//
+// Initially both [slot] and [slotmark] are 0 (nil)
+// Mutator thread          GC thread
+// st [slot], ptr          st [slotmark], 1
+//
+// ld r1, [slotmark]       ld r2, [slot]
+//
+// Without an expensive memory barrier between the st and the ld, the final
+// result on most HW (including 386/amd64) can be r1==r2==0. This is a classic
+// example of what can happen when loads are allowed to be reordered with older
+// stores (avoiding such reorderings lies at the heart of the classic
+// Peterson/Dekker algorithms for mutual exclusion). Rather than require memory
+// barriers, which will slow down both the mutator and the GC, we always grey
+// the ptr object regardless of the slot's color.
+//
+// Another place where we intentionally omit memory barriers is when
+// accessing mheap_.arena_used to check if a pointer points into the
+// heap. On relaxed memory machines, it's possible for a mutator to
+// extend the size of the heap by updating arena_used, allocate an
+// object from this new region, and publish a pointer to that object,
+// but for tracing running on another processor to observe the pointer
+// but use the old value of arena_used. In this case, tracing will not
+// mark the object, even though it's reachable. However, the mutator
+// is guaranteed to execute a write barrier when it publishes the
+// pointer, so it will take care of marking the object. A general
+// consequence of this is that the garbage collector may cache the
+// value of mheap_.arena_used. (See issue #9984.)
+//
+//
+// Stack writes:
+//
+// The compiler omits write barriers for writes to the current frame,
+// but if a stack pointer has been passed down the call stack, the
+// compiler will generate a write barrier for writes through that
+// pointer (because it doesn't know it's not a heap pointer).
+//
+// One might be tempted to ignore the write barrier if slot points
+// into to the stack. Don't do it! Mark termination only re-scans
+// frames that have potentially been active since the concurrent scan,
+// so it depends on write barriers to track changes to pointers in
+// stack frames that have not been active.
+//
+//
+// Global writes:
+//
+// The Go garbage collector requires write barriers when heap pointers
+// are stored in globals. Many garbage collectors ignore writes to
+// globals and instead pick up global -> heap pointers during
+// termination. This increases pause time, so we instead rely on write
+// barriers for writes to globals so that we don't have to rescan
+// global during mark termination.
+//
+//
+// Publication ordering:
+//
+// The write barrier is *pre-publication*, meaning that the write
+// barrier happens prior to the *slot = ptr write that may make ptr
+// reachable by some goroutine that currently cannot reach it.
+//
+//
+//go:nowritebarrierrec
+//go:systemstack
+func gcmarkwb_m(slot *uintptr, ptr uintptr) {
+	if writeBarrier.needed {
+		// Note: This turns bad pointer writes into bad
+		// pointer reads, which could be confusing. We avoid
+		// reading from obviously bad pointers, which should
+		// take care of the vast majority of these. We could
+		// patch this up in the signal handler, or use XCHG to
+		// combine the read and the write. Checking inheap is
+		// insufficient since we need to track changes to
+		// roots outside the heap.
+		if slot1 := uintptr(unsafe.Pointer(slot)); slot1 >= minPhysPageSize {
+			if optr := *slot; optr != 0 {
+				shade(optr)
+			}
+		}
+		// TODO: Make this conditional on the caller's stack color.
+		if ptr != 0 && inheap(ptr) {
+			shade(ptr)
+		}
+	}
+}
+
+// writebarrierptr_prewrite1 invokes a write barrier for *dst = src
+// prior to the write happening.
+//
+// Write barrier calls must not happen during critical GC and scheduler
+// related operations. In particular there are times when the GC assumes
+// that the world is stopped but scheduler related code is still being
+// executed, dealing with syscalls, dealing with putting gs on runnable
+// queues and so forth. This code cannot execute write barriers because
+// the GC might drop them on the floor. Stopping the world involves removing
+// the p associated with an m. We use the fact that m.p == nil to indicate
+// that we are in one these critical section and throw if the write is of
+// a pointer to a heap object.
+//go:nosplit
+func writebarrierptr_prewrite1(dst *uintptr, src uintptr) {
+	mp := acquirem()
+	if mp.inwb || mp.dying > 0 {
+		releasem(mp)
+		return
+	}
+	systemstack(func() {
+		if mp.p == 0 && memstats.enablegc && !mp.inwb && inheap(src) {
+			throw("writebarrierptr_prewrite1 called with mp.p == nil")
+		}
+		mp.inwb = true
+		gcmarkwb_m(dst, src)
+	})
+	mp.inwb = false
+	releasem(mp)
+}
+
+// NOTE: Really dst *unsafe.Pointer, src unsafe.Pointer,
+// but if we do that, Go inserts a write barrier on *dst = src.
+//go:nosplit
+func writebarrierptr(dst *uintptr, src uintptr) {
+	if writeBarrier.cgo {
+		cgoCheckWriteBarrier(dst, src)
+	}
+	if !writeBarrier.needed {
+		*dst = src
+		return
+	}
+	if src != 0 && src < minPhysPageSize {
+		systemstack(func() {
+			print("runtime: writebarrierptr *", dst, " = ", hex(src), "\n")
+			throw("bad pointer in write barrier")
+		})
+	}
+	writebarrierptr_prewrite1(dst, src)
+	*dst = src
+}
+
+// writebarrierptr_prewrite is like writebarrierptr, but the store
+// will be performed by the caller after this call. The caller must
+// not allow preemption between this call and the write.
+//
+//go:nosplit
+func writebarrierptr_prewrite(dst *uintptr, src uintptr) {
+	if writeBarrier.cgo {
+		cgoCheckWriteBarrier(dst, src)
+	}
+	if !writeBarrier.needed {
+		return
+	}
+	if src != 0 && src < minPhysPageSize {
+		systemstack(func() { throw("bad pointer in write barrier") })
+	}
+	writebarrierptr_prewrite1(dst, src)
+}
+
+// typedmemmove copies a value of type t to dst from src.
+//go:nosplit
+func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
+	if typ.kind&kindNoPointers == 0 {
+		bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.size)
+	}
+	// There's a race here: if some other goroutine can write to
+	// src, it may change some pointer in src after we've
+	// performed the write barrier but before we perform the
+	// memory copy. This safe because the write performed by that
+	// other goroutine must also be accompanied by a write
+	// barrier, so at worst we've unnecessarily greyed the old
+	// pointer that was in src.
+	memmove(dst, src, typ.size)
+	if writeBarrier.cgo {
+		cgoCheckMemmove(typ, dst, src, 0, typ.size)
+	}
+}
+
+//go:linkname reflect_typedmemmove reflect.typedmemmove
+func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) {
+	if raceenabled {
+		raceWriteObjectPC(typ, dst, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove))
+		raceReadObjectPC(typ, src, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove))
+	}
+	if msanenabled {
+		msanwrite(dst, typ.size)
+		msanread(src, typ.size)
+	}
+	typedmemmove(typ, dst, src)
+}
+
+// typedmemmovepartial is like typedmemmove but assumes that
+// dst and src point off bytes into the value and only copies size bytes.
+//go:linkname reflect_typedmemmovepartial reflect.typedmemmovepartial
+func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size uintptr) {
+	if writeBarrier.needed && typ.kind&kindNoPointers == 0 && size >= sys.PtrSize {
+		// Pointer-align start address for bulk barrier.
+		adst, asrc, asize := dst, src, size
+		if frag := -off & (sys.PtrSize - 1); frag != 0 {
+			adst = add(dst, frag)
+			asrc = add(src, frag)
+			asize -= frag
+		}
+		bulkBarrierPreWrite(uintptr(adst), uintptr(asrc), asize&^(sys.PtrSize-1))
+	}
+
+	memmove(dst, src, size)
+	if writeBarrier.cgo {
+		cgoCheckMemmove(typ, dst, src, off, size)
+	}
+}
+
+//go:nosplit
+func typedslicecopy(typ *_type, dst, src slice) int {
+	// TODO(rsc): If typedslicecopy becomes faster than calling
+	// typedmemmove repeatedly, consider using during func growslice.
+	n := dst.len
+	if n > src.len {
+		n = src.len
+	}
+	if n == 0 {
+		return 0
+	}
+	dstp := dst.array
+	srcp := src.array
+
+	if raceenabled {
+		callerpc := getcallerpc(unsafe.Pointer(&typ))
+		pc := funcPC(slicecopy)
+		racewriterangepc(dstp, uintptr(n)*typ.size, callerpc, pc)
+		racereadrangepc(srcp, uintptr(n)*typ.size, callerpc, pc)
+	}
+	if msanenabled {
+		msanwrite(dstp, uintptr(n)*typ.size)
+		msanread(srcp, uintptr(n)*typ.size)
+	}
+
+	if writeBarrier.cgo {
+		cgoCheckSliceCopy(typ, dst, src, n)
+	}
+
+	// Note: No point in checking typ.kind&kindNoPointers here:
+	// compiler only emits calls to typedslicecopy for types with pointers,
+	// and growslice and reflect_typedslicecopy check for pointers
+	// before calling typedslicecopy.
+	if !writeBarrier.needed {
+		memmove(dstp, srcp, uintptr(n)*typ.size)
+		return n
+	}
+
+	systemstack(func() {
+		if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) {
+			// Overlap with src before dst.
+			// Copy backward, being careful not to move dstp/srcp
+			// out of the array they point into.
+			dstp = add(dstp, uintptr(n-1)*typ.size)
+			srcp = add(srcp, uintptr(n-1)*typ.size)
+			i := 0
+			for {
+				typedmemmove(typ, dstp, srcp)
+				if i++; i >= n {
+					break
+				}
+				dstp = add(dstp, -typ.size)
+				srcp = add(srcp, -typ.size)
+			}
+		} else {
+			// Copy forward, being careful not to move dstp/srcp
+			// out of the array they point into.
+			i := 0
+			for {
+				typedmemmove(typ, dstp, srcp)
+				if i++; i >= n {
+					break
+				}
+				dstp = add(dstp, typ.size)
+				srcp = add(srcp, typ.size)
+			}
+		}
+	})
+	return n
+}
+
+//go:linkname reflect_typedslicecopy reflect.typedslicecopy
+func reflect_typedslicecopy(elemType *_type, dst, src slice) int {
+	if elemType.kind&kindNoPointers != 0 {
+		n := dst.len
+		if n > src.len {
+			n = src.len
+		}
+		if n == 0 {
+			return 0
+		}
+
+		size := uintptr(n) * elemType.size
+		if raceenabled {
+			callerpc := getcallerpc(unsafe.Pointer(&elemType))
+			pc := funcPC(reflect_typedslicecopy)
+			racewriterangepc(dst.array, size, callerpc, pc)
+			racereadrangepc(src.array, size, callerpc, pc)
+		}
+		if msanenabled {
+			msanwrite(dst.array, size)
+			msanread(src.array, size)
+		}
+
+		memmove(dst.array, src.array, size)
+		return n
+	}
+	return typedslicecopy(elemType, dst, src)
+}
+
+// typedmemclr clears the typed memory at ptr with type typ. The
+// memory at ptr must already be initialized (and hence in type-safe
+// state). If the memory is being initialized for the first time, see
+// memclrNoHeapPointers.
+//
+// If the caller knows that typ has pointers, it can alternatively
+// call memclrHasPointers.
+//
+//go:nosplit
+func typedmemclr(typ *_type, ptr unsafe.Pointer) {
+	if typ.kind&kindNoPointers == 0 {
+		bulkBarrierPreWrite(uintptr(ptr), 0, typ.size)
+	}
+	memclrNoHeapPointers(ptr, typ.size)
+}
+
+// memclrHasPointers clears n bytes of typed memory starting at ptr.
+// The caller must ensure that the type of the object at ptr has
+// pointers, usually by checking typ.kind&kindNoPointers. However, ptr
+// does not have to point to the start of the allocation.
+//
+//go:nosplit
+func memclrHasPointers(ptr unsafe.Pointer, n uintptr) {
+	bulkBarrierPreWrite(uintptr(ptr), 0, n)
+	memclrNoHeapPointers(ptr, n)
+}
diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go
new file mode 100644
index 0000000..2b00493
--- /dev/null
+++ b/libgo/go/runtime/mbitmap.go
@@ -0,0 +1,1874 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: type and heap bitmaps.
+//
+// Stack, data, and bss bitmaps
+//
+// Stack frames and global variables in the data and bss sections are described
+// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
+// to be visited during GC. The bits in each byte are consumed starting with
+// the low bit: 1<<0, 1<<1, and so on.
+//
+// Heap bitmap
+//
+// The allocated heap comes from a subset of the memory in the range [start, used),
+// where start == mheap_.arena_start and used == mheap_.arena_used.
+// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
+// stored in bytes indexed backward in memory from start.
+// That is, the byte at address start-1 holds the 2-bit entries for the four words
+// start through start+3*ptrSize, the byte at start-2 holds the entries for
+// start+4*ptrSize through start+7*ptrSize, and so on.
+//
+// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
+// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
+// The meaning of the high bit depends on the position of the word being described
+// in its allocated object. In all words *except* the second word, the
+// high bit indicates that the object is still being described. In
+// these words, if a bit pair with a high bit 0 is encountered, the
+// low bit can also be assumed to be 0, and the object description is
+// over. This 00 is called the ``dead'' encoding: it signals that the
+// rest of the words in the object are uninteresting to the garbage
+// collector.
+//
+// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
+//
+// The 2-bit entries are split when written into the byte, so that the top half
+// of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
+// bits.
+// This form allows a copy from the 1-bit to the 4-bit form to keep the
+// pointer bits contiguous, instead of having to space them out.
+//
+// The code makes use of the fact that the zero value for a heap bitmap
+// has no live pointer bit set and is (depending on position), not used,
+// not checkmarked, and is the dead encoding.
+// These properties must be preserved when modifying the encoding.
+//
+// Checkmarks
+//
+// In a concurrent garbage collector, one worries about failing to mark
+// a live object due to mutations without write barriers or bugs in the
+// collector implementation. As a sanity check, the GC has a 'checkmark'
+// mode that retraverses the object graph with the world stopped, to make
+// sure that everything that should be marked is marked.
+// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
+// for the second word of the object holds the checkmark bit.
+// When not in checkmark mode, this bit is set to 1.
+//
+// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
+// means every allocated object has two words, so there is room for the
+// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
+// just one word, so the second bit pair is not available for encoding the
+// checkmark. However, because non-pointer allocations are combined
+// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
+// must be a pointer, so the type bit in the first word is not actually needed.
+// It is still used in general, except in checkmark the type bit is repurposed
+// as the checkmark bit and then reinitialized (to 1) as the type bit when
+// finished.
+//
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const (
+	bitPointer = 1 << 0
+	bitScan    = 1 << 4
+
+	heapBitsShift   = 1                     // shift offset between successive bitPointer or bitScan entries
+	heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
+
+	// all scan/pointer bits in a byte
+	bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
+	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
+)
+
+// addb returns the byte pointer p+n.
+//go:nowritebarrier
+//go:nosplit
+func addb(p *byte, n uintptr) *byte {
+	// Note: wrote out full expression instead of calling add(p, n)
+	// to reduce the number of temporaries generated by the
+	// compiler for this trivial expression during inlining.
+	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
+}
+
+// subtractb returns the byte pointer p-n.
+// subtractb is typically used when traversing the pointer tables referred to by hbits
+// which are arranged in reverse order.
+//go:nowritebarrier
+//go:nosplit
+func subtractb(p *byte, n uintptr) *byte {
+	// Note: wrote out full expression instead of calling add(p, -n)
+	// to reduce the number of temporaries generated by the
+	// compiler for this trivial expression during inlining.
+	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
+}
+
+// add1 returns the byte pointer p+1.
+//go:nowritebarrier
+//go:nosplit
+func add1(p *byte) *byte {
+	// Note: wrote out full expression instead of calling addb(p, 1)
+	// to reduce the number of temporaries generated by the
+	// compiler for this trivial expression during inlining.
+	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
+}
+
+// subtract1 returns the byte pointer p-1.
+// subtract1 is typically used when traversing the pointer tables referred to by hbits
+// which are arranged in reverse order.
+//go:nowritebarrier
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//go:nosplit
+func subtract1(p *byte) *byte {
+	// Note: wrote out full expression instead of calling subtractb(p, 1)
+	// to reduce the number of temporaries generated by the
+	// compiler for this trivial expression during inlining.
+	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
+}
+
+// mHeap_MapBits is called each time arena_used is extended.
+// It maps any additional bitmap memory needed for the new arena memory.
+// It must be called with the expected new value of arena_used,
+// *before* h.arena_used has been updated.
+// Waiting to update arena_used until after the memory has been mapped
+// avoids faults when other threads try access the bitmap immediately
+// after observing the change to arena_used.
+//
+//go:nowritebarrier
+func (h *mheap) mapBits(arena_used uintptr) {
+	// Caller has added extra mappings to the arena.
+	// Add extra mappings of bitmap words as needed.
+	// We allocate extra bitmap pieces in chunks of bitmapChunk.
+	const bitmapChunk = 8192
+
+	n := (arena_used - mheap_.arena_start) / heapBitmapScale
+	n = round(n, bitmapChunk)
+	n = round(n, physPageSize)
+	if h.bitmap_mapped >= n {
+		return
+	}
+
+	sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
+	h.bitmap_mapped = n
+}
+
+// heapBits provides access to the bitmap bits for a single heap word.
+// The methods on heapBits take value receivers so that the compiler
+// can more easily inline calls to those methods and registerize the
+// struct fields independently.
+type heapBits struct {
+	bitp  *uint8
+	shift uint32
+}
+
+// markBits provides access to the mark bit for an object in the heap.
+// bytep points to the byte holding the mark bit.
+// mask is a byte with a single bit set that can be &ed with *bytep
+// to see if the bit has been set.
+// *m.byte&m.mask != 0 indicates the mark bit is set.
+// index can be used along with span information to generate
+// the address of the object in the heap.
+// We maintain one set of mark bits for allocation and one for
+// marking purposes.
+type markBits struct {
+	bytep *uint8
+	mask  uint8
+	index uintptr
+}
+
+//go:nosplit
+func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
+	whichByte := allocBitIndex / 8
+	whichBit := allocBitIndex % 8
+	bytePtr := addb(s.allocBits, whichByte)
+	return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex}
+}
+
+// refillaCache takes 8 bytes s.allocBits starting at whichByte
+// and negates them so that ctz (count trailing zeros) instructions
+// can be used. It then places these 8 bytes into the cached 64 bit
+// s.allocCache.
+func (s *mspan) refillAllocCache(whichByte uintptr) {
+	bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte)))
+	aCache := uint64(0)
+	aCache |= uint64(bytes[0])
+	aCache |= uint64(bytes[1]) << (1 * 8)
+	aCache |= uint64(bytes[2]) << (2 * 8)
+	aCache |= uint64(bytes[3]) << (3 * 8)
+	aCache |= uint64(bytes[4]) << (4 * 8)
+	aCache |= uint64(bytes[5]) << (5 * 8)
+	aCache |= uint64(bytes[6]) << (6 * 8)
+	aCache |= uint64(bytes[7]) << (7 * 8)
+	s.allocCache = ^aCache
+}
+
+// nextFreeIndex returns the index of the next free object in s at
+// or after s.freeindex.
+// There are hardware instructions that can be used to make this
+// faster if profiling warrants it.
+func (s *mspan) nextFreeIndex() uintptr {
+	sfreeindex := s.freeindex
+	snelems := s.nelems
+	if sfreeindex == snelems {
+		return sfreeindex
+	}
+	if sfreeindex > snelems {
+		throw("s.freeindex > s.nelems")
+	}
+
+	aCache := s.allocCache
+
+	bitIndex := sys.Ctz64(aCache)
+	for bitIndex == 64 {
+		// Move index to start of next cached bits.
+		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
+		if sfreeindex >= snelems {
+			s.freeindex = snelems
+			return snelems
+		}
+		whichByte := sfreeindex / 8
+		// Refill s.allocCache with the next 64 alloc bits.
+		s.refillAllocCache(whichByte)
+		aCache = s.allocCache
+		bitIndex = sys.Ctz64(aCache)
+		// nothing available in cached bits
+		// grab the next 8 bytes and try again.
+	}
+	result := sfreeindex + uintptr(bitIndex)
+	if result >= snelems {
+		s.freeindex = snelems
+		return snelems
+	}
+
+	s.allocCache >>= (bitIndex + 1)
+	sfreeindex = result + 1
+
+	if sfreeindex%64 == 0 && sfreeindex != snelems {
+		// We just incremented s.freeindex so it isn't 0.
+		// As each 1 in s.allocCache was encountered and used for allocation
+		// it was shifted away. At this point s.allocCache contains all 0s.
+		// Refill s.allocCache so that it corresponds
+		// to the bits at s.allocBits starting at s.freeindex.
+		whichByte := sfreeindex / 8
+		s.refillAllocCache(whichByte)
+	}
+	s.freeindex = sfreeindex
+	return result
+}
+
+// isFree returns whether the index'th object in s is unallocated.
+func (s *mspan) isFree(index uintptr) bool {
+	if index < s.freeindex {
+		return false
+	}
+	whichByte := index / 8
+	whichBit := index % 8
+	byteVal := *addb(s.allocBits, whichByte)
+	return byteVal&uint8(1<<whichBit) == 0
+}
+
+func (s *mspan) objIndex(p uintptr) uintptr {
+	byteOffset := p - s.base()
+	if byteOffset == 0 {
+		return 0
+	}
+	if s.baseMask != 0 {
+		// s.baseMask is 0, elemsize is a power of two, so shift by s.divShift
+		return byteOffset >> s.divShift
+	}
+	return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
+}
+
+func markBitsForAddr(p uintptr) markBits {
+	s := spanOf(p)
+	objIndex := s.objIndex(p)
+	return s.markBitsForIndex(objIndex)
+}
+
+func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
+	whichByte := objIndex / 8
+	bitMask := uint8(1 << (objIndex % 8)) // low 3 bits hold the bit index
+	bytePtr := addb(s.gcmarkBits, whichByte)
+	return markBits{bytePtr, bitMask, objIndex}
+}
+
+func (s *mspan) markBitsForBase() markBits {
+	return markBits{s.gcmarkBits, uint8(1), 0}
+}
+
+// isMarked reports whether mark bit m is set.
+func (m markBits) isMarked() bool {
+	return *m.bytep&m.mask != 0
+}
+
+// setMarked sets the marked bit in the markbits, atomically. Some compilers
+// are not able to inline atomic.Or8 function so if it appears as a hot spot consider
+// inlining it manually.
+func (m markBits) setMarked() {
+	// Might be racing with other updates, so use atomic update always.
+	// We used to be clever here and use a non-atomic update in certain
+	// cases, but it's not worth the risk.
+	atomic.Or8(m.bytep, m.mask)
+}
+
+// setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
+func (m markBits) setMarkedNonAtomic() {
+	*m.bytep |= m.mask
+}
+
+// clearMarked clears the marked bit in the markbits, atomically.
+func (m markBits) clearMarked() {
+	// Might be racing with other updates, so use atomic update always.
+	// We used to be clever here and use a non-atomic update in certain
+	// cases, but it's not worth the risk.
+	atomic.And8(m.bytep, ^m.mask)
+}
+
+// clearMarkedNonAtomic clears the marked bit non-atomically.
+func (m markBits) clearMarkedNonAtomic() {
+	*m.bytep ^= m.mask
+}
+
+// markBitsForSpan returns the markBits for the span base address base.
+func markBitsForSpan(base uintptr) (mbits markBits) {
+	if base < mheap_.arena_start || base >= mheap_.arena_used {
+		throw("markBitsForSpan: base out of range")
+	}
+	mbits = markBitsForAddr(base)
+	if mbits.mask != 1 {
+		throw("markBitsForSpan: unaligned start")
+	}
+	return mbits
+}
+
+// advance advances the markBits to the next object in the span.
+func (m *markBits) advance() {
+	if m.mask == 1<<7 {
+		m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
+		m.mask = 1
+	} else {
+		m.mask = m.mask << 1
+	}
+	m.index++
+}
+
+// heapBitsForAddr returns the heapBits for the address addr.
+// The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//go:nosplit
+func heapBitsForAddr(addr uintptr) heapBits {
+	// 2 bits per work, 4 pairs per byte, and a mask is hard coded.
+	off := (addr - mheap_.arena_start) / sys.PtrSize
+	return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap - off/4 - 1)), uint32(off & 3)}
+}
+
+// heapBitsForSpan returns the heapBits for the span base address base.
+func heapBitsForSpan(base uintptr) (hbits heapBits) {
+	if base < mheap_.arena_start || base >= mheap_.arena_used {
+		throw("heapBitsForSpan: base out of range")
+	}
+	return heapBitsForAddr(base)
+}
+
+// heapBitsForObject returns the base address for the heap object
+// containing the address p, the heapBits for base,
+// the object's span, and of the index of the object in s.
+// If p does not point into a heap object,
+// return base == 0
+// otherwise return the base of the object.
+//
+// For gccgo, the forStack parameter is true if the value came from the stack.
+// The stack is collected conservatively and may contain invalid pointers.
+//
+// refBase and refOff optionally give the base address of the object
+// in which the pointer p was found and the byte offset at which it
+// was found. These are used for error reporting.
+func heapBitsForObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
+	arenaStart := mheap_.arena_start
+	if p < arenaStart || p >= mheap_.arena_used {
+		return
+	}
+	off := p - arenaStart
+	idx := off >> _PageShift
+	// p points into the heap, but possibly to the middle of an object.
+	// Consult the span table to find the block beginning.
+	s = mheap_.spans[idx]
+	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
+		if s == nil || s.state == _MSpanStack || forStack {
+			// If s is nil, the virtual address has never been part of the heap.
+			// This pointer may be to some mmap'd region, so we allow it.
+			// Pointers into stacks are also ok, the runtime manages these explicitly.
+			return
+		}
+
+		// The following ensures that we are rigorous about what data
+		// structures hold valid pointers.
+		if debug.invalidptr != 0 {
+			// Typically this indicates an incorrect use
+			// of unsafe or cgo to store a bad pointer in
+			// the Go heap. It may also indicate a runtime
+			// bug.
+			//
+			// TODO(austin): We could be more aggressive
+			// and detect pointers to unallocated objects
+			// in allocated spans.
+			printlock()
+			print("runtime: pointer ", hex(p))
+			if s.state != mSpanInUse {
+				print(" to unallocated span")
+			} else {
+				print(" to unused region of span")
+			}
+			print(" idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
+			if refBase != 0 {
+				print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
+				gcDumpObject("object", refBase, refOff)
+			}
+			throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
+		}
+		return
+	}
+
+	if forStack {
+		// A span can be entered in mheap_.spans, and be set
+		// to mSpanInUse, before it is fully initialized.
+		// All we need in practice is allocBits and gcmarkBits,
+		// so make sure they are set.
+		if s.allocBits == nil || s.gcmarkBits == nil {
+			return
+		}
+	}
+
+	// If this span holds object of a power of 2 size, just mask off the bits to
+	// the interior of the object. Otherwise use the size to get the base.
+	if s.baseMask != 0 {
+		// optimize for power of 2 sized objects.
+		base = s.base()
+		base = base + (p-base)&uintptr(s.baseMask)
+		objIndex = (base - s.base()) >> s.divShift
+		// base = p & s.baseMask is faster for small spans,
+		// but doesn't work for large spans.
+		// Overall, it's faster to use the more general computation above.
+	} else {
+		base = s.base()
+		if p-base >= s.elemsize {
+			// n := (p - base) / s.elemsize, using division by multiplication
+			objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
+			base += objIndex * s.elemsize
+		}
+	}
+	// Now that we know the actual base, compute heapBits to return to caller.
+	hbits = heapBitsForAddr(base)
+	return
+}
+
+// prefetch the bits.
+func (h heapBits) prefetch() {
+	prefetchnta(uintptr(unsafe.Pointer((h.bitp))))
+}
+
+// next returns the heapBits describing the next pointer-sized word in memory.
+// That is, if h describes address p, h.next() describes p+ptrSize.
+// Note that next does not modify h. The caller must record the result.
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//go:nosplit
+func (h heapBits) next() heapBits {
+	if h.shift < 3*heapBitsShift {
+		return heapBits{h.bitp, h.shift + heapBitsShift}
+	}
+	return heapBits{subtract1(h.bitp), 0}
+}
+
+// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
+// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
+// h.forward(1) is equivalent to h.next(), just slower.
+// Note that forward does not modify h. The caller must record the result.
+// bits returns the heap bits for the current word.
+func (h heapBits) forward(n uintptr) heapBits {
+	n += uintptr(h.shift) / heapBitsShift
+	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsShift}
+}
+
+// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
+// The result includes in its higher bits the bits for subsequent words
+// described by the same bitmap byte.
+func (h heapBits) bits() uint32 {
+	// The (shift & 31) eliminates a test and conditional branch
+	// from the generated code.
+	return uint32(*h.bitp) >> (h.shift & 31)
+}
+
+// morePointers returns true if this word and all remaining words in this object
+// are scalars.
+// h must not describe the second word of the object.
+func (h heapBits) morePointers() bool {
+	return h.bits()&bitScan != 0
+}
+
+// isPointer reports whether the heap bits describe a pointer word.
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//go:nosplit
+func (h heapBits) isPointer() bool {
+	return h.bits()&bitPointer != 0
+}
+
+// hasPointers reports whether the given object has any pointers.
+// It must be told how large the object at h is for efficiency.
+// h must describe the initial word of the object.
+func (h heapBits) hasPointers(size uintptr) bool {
+	if size == sys.PtrSize { // 1-word objects are always pointers
+		return true
+	}
+	return (*h.bitp>>h.shift)&bitScan != 0
+}
+
+// isCheckmarked reports whether the heap bits have the checkmarked bit set.
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) isCheckmarked(size uintptr) bool {
+	if size == sys.PtrSize {
+		return (*h.bitp>>h.shift)&bitPointer != 0
+	}
+	// All multiword objects are 2-word aligned,
+	// so we know that the initial word's 2-bit pair
+	// and the second word's 2-bit pair are in the
+	// same heap bitmap byte, *h.bitp.
+	return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
+}
+
+// setCheckmarked sets the checkmarked bit.
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) setCheckmarked(size uintptr) {
+	if size == sys.PtrSize {
+		atomic.Or8(h.bitp, bitPointer<<h.shift)
+		return
+	}
+	atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
+}
+
+// bulkBarrierPreWrite executes writebarrierptr_prewrite1
+// for every pointer slot in the memory range [src, src+size),
+// using pointer/scalar information from [dst, dst+size).
+// This executes the write barriers necessary before a memmove.
+// src, dst, and size must be pointer-aligned.
+// The range [dst, dst+size) must lie within a single object.
+//
+// As a special case, src == 0 indicates that this is being used for a
+// memclr. bulkBarrierPreWrite will pass 0 for the src of each write
+// barrier.
+//
+// Callers should call bulkBarrierPreWrite immediately before
+// calling memmove(dst, src, size). This function is marked nosplit
+// to avoid being preempted; the GC must not stop the goroutine
+// between the memmove and the execution of the barriers.
+// The caller is also responsible for cgo pointer checks if this
+// may be writing Go pointers into non-Go memory.
+//
+// The pointer bitmap is not maintained for allocations containing
+// no pointers at all; any caller of bulkBarrierPreWrite must first
+// make sure the underlying allocation contains pointers, usually
+// by checking typ.kind&kindNoPointers.
+//
+//go:nosplit
+func bulkBarrierPreWrite(dst, src, size uintptr) {
+	if (dst|src|size)&(sys.PtrSize-1) != 0 {
+		throw("bulkBarrierPreWrite: unaligned arguments")
+	}
+	if !writeBarrier.needed {
+		return
+	}
+	if !inheap(dst) {
+		// If dst is a global, use the data or BSS bitmaps to
+		// execute write barriers.
+		roots := gcRoots
+		for roots != nil {
+			for i := 0; i < roots.count; i++ {
+				pr := roots.roots[i]
+				addr := uintptr(pr.decl)
+				if addr <= dst && dst < addr+pr.size {
+					if dst < addr+pr.ptrdata {
+						bulkBarrierBitmap(dst, src, size, dst-addr, pr.gcdata)
+					}
+					return
+				}
+			}
+			roots = roots.next
+		}
+		return
+	}
+
+	h := heapBitsForAddr(dst)
+	if src == 0 {
+		for i := uintptr(0); i < size; i += sys.PtrSize {
+			if h.isPointer() {
+				dstx := (*uintptr)(unsafe.Pointer(dst + i))
+				writebarrierptr_prewrite1(dstx, 0)
+			}
+			h = h.next()
+		}
+	} else {
+		for i := uintptr(0); i < size; i += sys.PtrSize {
+			if h.isPointer() {
+				dstx := (*uintptr)(unsafe.Pointer(dst + i))
+				srcx := (*uintptr)(unsafe.Pointer(src + i))
+				writebarrierptr_prewrite1(dstx, *srcx)
+			}
+			h = h.next()
+		}
+	}
+}
+
+// bulkBarrierBitmap executes write barriers for copying from [src,
+// src+size) to [dst, dst+size) using a 1-bit pointer bitmap. src is
+// assumed to start maskOffset bytes into the data covered by the
+// bitmap in bits (which may not be a multiple of 8).
+//
+// This is used by bulkBarrierPreWrite for writes to data and BSS.
+//
+//go:nosplit
+func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
+	word := maskOffset / sys.PtrSize
+	bits = addb(bits, word/8)
+	mask := uint8(1) << (word % 8)
+
+	for i := uintptr(0); i < size; i += sys.PtrSize {
+		if mask == 0 {
+			bits = addb(bits, 1)
+			if *bits == 0 {
+				// Skip 8 words.
+				i += 7 * sys.PtrSize
+				continue
+			}
+			mask = 1
+		}
+		if *bits&mask != 0 {
+			dstx := (*uintptr)(unsafe.Pointer(dst + i))
+			if src == 0 {
+				writebarrierptr_prewrite1(dstx, 0)
+			} else {
+				srcx := (*uintptr)(unsafe.Pointer(src + i))
+				writebarrierptr_prewrite1(dstx, *srcx)
+			}
+		}
+		mask <<= 1
+	}
+}
+
+// typeBitsBulkBarrier executes writebarrierptr_prewrite for every
+// pointer that would be copied from [src, src+size) to [dst,
+// dst+size) by a memmove using the type bitmap to locate those
+// pointer slots.
+//
+// The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
+// dst, src, and size must be pointer-aligned.
+// The type typ must have a plain bitmap, not a GC program.
+// The only use of this function is in channel sends, and the
+// 64 kB channel element limit takes care of this for us.
+//
+// Must not be preempted because it typically runs right before memmove,
+// and the GC must observe them as an atomic action.
+//
+//go:nosplit
+func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
+	if typ == nil {
+		throw("runtime: typeBitsBulkBarrier without type")
+	}
+	if typ.size != size {
+		println("runtime: typeBitsBulkBarrier with type ", *typ.string, " of size ", typ.size, " but memory size", size)
+		throw("runtime: invalid typeBitsBulkBarrier")
+	}
+	if typ.kind&kindGCProg != 0 {
+		println("runtime: typeBitsBulkBarrier with type ", *typ.string, " with GC prog")
+		throw("runtime: invalid typeBitsBulkBarrier")
+	}
+	if !writeBarrier.needed {
+		return
+	}
+	ptrmask := typ.gcdata
+	var bits uint32
+	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
+		if i&(sys.PtrSize*8-1) == 0 {
+			bits = uint32(*ptrmask)
+			ptrmask = addb(ptrmask, 1)
+		} else {
+			bits = bits >> 1
+		}
+		if bits&1 != 0 {
+			dstx := (*uintptr)(unsafe.Pointer(dst + i))
+			srcx := (*uintptr)(unsafe.Pointer(src + i))
+			writebarrierptr_prewrite(dstx, *srcx)
+		}
+	}
+}
+
+// The methods operating on spans all require that h has been returned
+// by heapBitsForSpan and that size, n, total are the span layout description
+// returned by the mspan's layout method.
+// If total > size*n, it means that there is extra leftover memory in the span,
+// usually due to rounding.
+//
+// TODO(rsc): Perhaps introduce a different heapBitsSpan type.
+
+// initSpan initializes the heap bitmap for a span.
+// It clears all checkmark bits.
+// If this is a span of pointer-sized objects, it initializes all
+// words to pointer/scan.
+// Otherwise, it initializes all words to scalar/dead.
+func (h heapBits) initSpan(s *mspan) {
+	size, n, total := s.layout()
+
+	// Init the markbit structures
+	s.freeindex = 0
+	s.allocCache = ^uint64(0) // all 1s indicating all free.
+	s.nelems = n
+	s.allocBits = nil
+	s.gcmarkBits = nil
+	s.gcmarkBits = newMarkBits(s.nelems)
+	s.allocBits = newAllocBits(s.nelems)
+
+	// Clear bits corresponding to objects.
+	if total%heapBitmapScale != 0 {
+		throw("initSpan: unaligned length")
+	}
+	nbyte := total / heapBitmapScale
+	if sys.PtrSize == 8 && size == sys.PtrSize {
+		end := h.bitp
+		bitp := subtractb(end, nbyte-1)
+		for {
+			*bitp = bitPointerAll | bitScanAll
+			if bitp == end {
+				break
+			}
+			bitp = add1(bitp)
+		}
+		return
+	}
+	memclrNoHeapPointers(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
+}
+
+// initCheckmarkSpan initializes a span for being checkmarked.
+// It clears the checkmark bits, which are set to 1 in normal operation.
+func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
+	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+	if sys.PtrSize == 8 && size == sys.PtrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
+		// Only possible on 64-bit system, since minimum size is 8.
+		// Must clear type bit (checkmark bit) of every word.
+		// The type bit is the lower of every two-bit pair.
+		bitp := h.bitp
+		for i := uintptr(0); i < n; i += 4 {
+			*bitp &^= bitPointerAll
+			bitp = subtract1(bitp)
+		}
+		return
+	}
+	for i := uintptr(0); i < n; i++ {
+		*h.bitp &^= bitScan << (heapBitsShift + h.shift)
+		h = h.forward(size / sys.PtrSize)
+	}
+}
+
+// clearCheckmarkSpan undoes all the checkmarking in a span.
+// The actual checkmark bits are ignored, so the only work to do
+// is to fix the pointer bits. (Pointer bits are ignored by scanobject
+// but consulted by typedmemmove.)
+func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
+	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+	if sys.PtrSize == 8 && size == sys.PtrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
+		// Only possible on 64-bit system, since minimum size is 8.
+		// Must clear type bit (checkmark bit) of every word.
+		// The type bit is the lower of every two-bit pair.
+		bitp := h.bitp
+		for i := uintptr(0); i < n; i += 4 {
+			*bitp |= bitPointerAll
+			bitp = subtract1(bitp)
+		}
+	}
+}
+
+// oneBitCount is indexed by byte and produces the
+// number of 1 bits in that byte. For example 128 has 1 bit set
+// and oneBitCount[128] will holds 1.
+var oneBitCount = [256]uint8{
+	0, 1, 1, 2, 1, 2, 2, 3,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	5, 6, 6, 7, 6, 7, 7, 8}
+
+// countFree runs through the mark bits in a span and counts the number of free objects
+// in the span.
+// TODO:(rlh) Use popcount intrinsic.
+func (s *mspan) countFree() int {
+	count := 0
+	maxIndex := s.nelems / 8
+	for i := uintptr(0); i < maxIndex; i++ {
+		mrkBits := *addb(s.gcmarkBits, i)
+		count += int(oneBitCount[mrkBits])
+	}
+	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
+		mrkBits := *addb(s.gcmarkBits, maxIndex)
+		mask := uint8((1 << bitsInLastByte) - 1)
+		bits := mrkBits & mask
+		count += int(oneBitCount[bits])
+	}
+	return int(s.nelems) - count
+}
+
+// heapBitsSetType records that the new allocation [x, x+size)
+// holds in [x, x+dataSize) one or more values of type typ.
+// (The number of values is given by dataSize / typ.size.)
+// If dataSize < size, the fragment [x+dataSize, x+size) is
+// recorded as non-pointer data.
+// It is known that the type has pointers somewhere;
+// malloc does not call heapBitsSetType when there are no pointers,
+// because all free objects are marked as noscan during
+// heapBitsSweepSpan.
+//
+// There can only be one allocation from a given span active at a time,
+// and the bitmap for a span always falls on byte boundaries,
+// so there are no write-write races for access to the heap bitmap.
+// Hence, heapBitsSetType can access the bitmap without atomics.
+//
+// There can be read-write races between heapBitsSetType and things
+// that read the heap bitmap like scanobject. However, since
+// heapBitsSetType is only used for objects that have not yet been
+// made reachable, readers will ignore bits being modified by this
+// function. This does mean this function cannot transiently modify
+// bits that belong to neighboring objects. Also, on weakly-ordered
+// machines, callers must execute a store/store (publication) barrier
+// between calling this function and making the object reachable.
+func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
+	const doubleCheck = false // slow but helpful; enable to test modifications to this code
+
+	// dataSize is always size rounded up to the next malloc size class,
+	// except in the case of allocating a defer block, in which case
+	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
+	// arbitrarily larger.
+	//
+	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
+	// assume that dataSize == size without checking it explicitly.
+
+	if sys.PtrSize == 8 && size == sys.PtrSize {
+		// It's one word and it has pointers, it must be a pointer.
+		// Since all allocated one-word objects are pointers
+		// (non-pointers are aggregated into tinySize allocations),
+		// initSpan sets the pointer bits for us. Nothing to do here.
+		if doubleCheck {
+			h := heapBitsForAddr(x)
+			if !h.isPointer() {
+				throw("heapBitsSetType: pointer bit missing")
+			}
+			if !h.morePointers() {
+				throw("heapBitsSetType: scan bit missing")
+			}
+		}
+		return
+	}
+
+	h := heapBitsForAddr(x)
+	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
+
+	// Heap bitmap bits for 2-word object are only 4 bits,
+	// so also shared with objects next to it.
+	// This is called out as a special case primarily for 32-bit systems,
+	// so that on 32-bit systems the code below can assume all objects
+	// are 4-word aligned (because they're all 16-byte aligned).
+	if size == 2*sys.PtrSize {
+		if typ.size == sys.PtrSize {
+			// We're allocating a block big enough to hold two pointers.
+			// On 64-bit, that means the actual object must be two pointers,
+			// or else we'd have used the one-pointer-sized block.
+			// On 32-bit, however, this is the 8-byte block, the smallest one.
+			// So it could be that we're allocating one pointer and this was
+			// just the smallest block available. Distinguish by checking dataSize.
+			// (In general the number of instances of typ being allocated is
+			// dataSize/typ.size.)
+			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
+				// 1 pointer object. On 32-bit machines clear the bit for the
+				// unused second word.
+				*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
+				*h.bitp |= (bitPointer | bitScan) << h.shift
+			} else {
+				// 2-element slice of pointer.
+				*h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
+			}
+			return
+		}
+		// Otherwise typ.size must be 2*sys.PtrSize,
+		// and typ.kind&kindGCProg == 0.
+		if doubleCheck {
+			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
+				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
+				throw("heapBitsSetType")
+			}
+		}
+		b := uint32(*ptrmask)
+		hb := (b & 3) | bitScan
+		// bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
+		// 110011 is shifted h.shift and complemented.
+		// This clears out the bits that are about to be
+		// ored into *h.hbitp in the next instructions.
+		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
+		*h.bitp |= uint8(hb << h.shift)
+		return
+	}
+
+	// Copy from 1-bit ptrmask into 2-bit bitmap.
+	// The basic approach is to use a single uintptr as a bit buffer,
+	// alternating between reloading the buffer and writing bitmap bytes.
+	// In general, one load can supply two bitmap byte writes.
+	// This is a lot of lines of code, but it compiles into relatively few
+	// machine instructions.
+
+	var (
+		// Ptrmask input.
+		p     *byte   // last ptrmask byte read
+		b     uintptr // ptrmask bits already loaded
+		nb    uintptr // number of bits in b at next read
+		endp  *byte   // final ptrmask byte to read (then repeat)
+		endnb uintptr // number of valid bits in *endp
+		pbits uintptr // alternate source of bits
+
+		// Heap bitmap output.
+		w     uintptr // words processed
+		nw    uintptr // number of words to process
+		hbitp *byte   // next heap bitmap byte to write
+		hb    uintptr // bits being prepared for *hbitp
+	)
+
+	hbitp = h.bitp
+
+	// Handle GC program. Delayed until this part of the code
+	// so that we can use the same double-checking mechanism
+	// as the 1-bit case. Nothing above could have encountered
+	// GC programs: the cases were all too small.
+	if typ.kind&kindGCProg != 0 {
+		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
+		if doubleCheck {
+			// Double-check the heap bits written by GC program
+			// by running the GC program to create a 1-bit pointer mask
+			// and then jumping to the double-check code below.
+			// This doesn't catch bugs shared between the 1-bit and 4-bit
+			// GC program execution, but it does catch mistakes specific
+			// to just one of those and bugs in heapBitsSetTypeGCProg's
+			// implementation of arrays.
+			lock(&debugPtrmask.lock)
+			if debugPtrmask.data == nil {
+				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
+			}
+			ptrmask = debugPtrmask.data
+			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
+			goto Phase4
+		}
+		return
+	}
+
+	// Note about sizes:
+	//
+	// typ.size is the number of words in the object,
+	// and typ.ptrdata is the number of words in the prefix
+	// of the object that contains pointers. That is, the final
+	// typ.size - typ.ptrdata words contain no pointers.
+	// This allows optimization of a common pattern where
+	// an object has a small header followed by a large scalar
+	// buffer. If we know the pointers are over, we don't have
+	// to scan the buffer's heap bitmap at all.
+	// The 1-bit ptrmasks are sized to contain only bits for
+	// the typ.ptrdata prefix, zero padded out to a full byte
+	// of bitmap. This code sets nw (below) so that heap bitmap
+	// bits are only written for the typ.ptrdata prefix; if there is
+	// more room in the allocated object, the next heap bitmap
+	// entry is a 00, indicating that there are no more pointers
+	// to scan. So only the ptrmask for the ptrdata bytes is needed.
+	//
+	// Replicated copies are not as nice: if there is an array of
+	// objects with scalar tails, all but the last tail does have to
+	// be initialized, because there is no way to say "skip forward".
+	// However, because of the possibility of a repeated type with
+	// size not a multiple of 4 pointers (one heap bitmap byte),
+	// the code already must handle the last ptrmask byte specially
+	// by treating it as containing only the bits for endnb pointers,
+	// where endnb <= 4. We represent large scalar tails that must
+	// be expanded in the replication by setting endnb larger than 4.
+	// This will have the effect of reading many bits out of b,
+	// but once the real bits are shifted out, b will supply as many
+	// zero bits as we try to read, which is exactly what we need.
+
+	p = ptrmask
+	if typ.size < dataSize {
+		// Filling in bits for an array of typ.
+		// Set up for repetition of ptrmask during main loop.
+		// Note that ptrmask describes only a prefix of
+		const maxBits = sys.PtrSize*8 - 7
+		if typ.ptrdata/sys.PtrSize <= maxBits {
+			// Entire ptrmask fits in uintptr with room for a byte fragment.
+			// Load into pbits and never read from ptrmask again.
+			// This is especially important when the ptrmask has
+			// fewer than 8 bits in it; otherwise the reload in the middle
+			// of the Phase 2 loop would itself need to loop to gather
+			// at least 8 bits.
+
+			// Accumulate ptrmask into b.
+			// ptrmask is sized to describe only typ.ptrdata, but we record
+			// it as describing typ.size bytes, since all the high bits are zero.
+			nb = typ.ptrdata / sys.PtrSize
+			for i := uintptr(0); i < nb; i += 8 {
+				b |= uintptr(*p) << i
+				p = add1(p)
+			}
+			nb = typ.size / sys.PtrSize
+
+			// Replicate ptrmask to fill entire pbits uintptr.
+			// Doubling and truncating is fewer steps than
+			// iterating by nb each time. (nb could be 1.)
+			// Since we loaded typ.ptrdata/sys.PtrSize bits
+			// but are pretending to have typ.size/sys.PtrSize,
+			// there might be no replication necessary/possible.
+			pbits = b
+			endnb = nb
+			if nb+nb <= maxBits {
+				for endnb <= sys.PtrSize*8 {
+					pbits |= pbits << endnb
+					endnb += endnb
+				}
+				// Truncate to a multiple of original ptrmask.
+				endnb = maxBits / nb * nb
+				pbits &= 1<<endnb - 1
+				b = pbits
+				nb = endnb
+			}
+
+			// Clear p and endp as sentinel for using pbits.
+			// Checked during Phase 2 loop.
+			p = nil
+			endp = nil
+		} else {
+			// Ptrmask is larger. Read it multiple times.
+			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
+			endp = addb(ptrmask, n)
+			endnb = typ.size/sys.PtrSize - n*8
+		}
+	}
+	if p != nil {
+		b = uintptr(*p)
+		p = add1(p)
+		nb = 8
+	}
+
+	if typ.size == dataSize {
+		// Single entry: can stop once we reach the non-pointer data.
+		nw = typ.ptrdata / sys.PtrSize
+	} else {
+		// Repeated instances of typ in an array.
+		// Have to process first N-1 entries in full, but can stop
+		// once we reach the non-pointer data in the final entry.
+		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
+	}
+	if nw == 0 {
+		// No pointers! Caller was supposed to check.
+		println("runtime: invalid type ", *typ.string)
+		throw("heapBitsSetType: called with non-pointer type")
+		return
+	}
+	if nw < 2 {
+		// Must write at least 2 words, because the "no scan"
+		// encoding doesn't take effect until the third word.
+		nw = 2
+	}
+
+	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
+	// The leading byte is special because it contains the bits for word 1,
+	// which does not have the scan bit set.
+	// The leading half-byte is special because it's a half a byte,
+	// so we have to be careful with the bits already there.
+	switch {
+	default:
+		throw("heapBitsSetType: unexpected shift")
+
+	case h.shift == 0:
+		// Ptrmask and heap bitmap are aligned.
+		// Handle first byte of bitmap specially.
+		//
+		// The first byte we write out covers the first four
+		// words of the object. The scan/dead bit on the first
+		// word must be set to scan since there are pointers
+		// somewhere in the object. The scan/dead bit on the
+		// second word is the checkmark, so we don't set it.
+		// In all following words, we set the scan/dead
+		// appropriately to indicate that the object contains
+		// to the next 2-bit entry in the bitmap.
+		//
+		// TODO: It doesn't matter if we set the checkmark, so
+		// maybe this case isn't needed any more.
+		hb = b & bitPointerAll
+		hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
+		if w += 4; w >= nw {
+			goto Phase3
+		}
+		*hbitp = uint8(hb)
+		hbitp = subtract1(hbitp)
+		b >>= 4
+		nb -= 4
+
+	case sys.PtrSize == 8 && h.shift == 2:
+		// Ptrmask and heap bitmap are misaligned.
+		// The bits for the first two words are in a byte shared
+		// with another object, so we must be careful with the bits
+		// already there.
+		// We took care of 1-word and 2-word objects above,
+		// so this is at least a 6-word object.
+		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
+		// This is not noscan, so set the scan bit in the
+		// first word.
+		hb |= bitScan << (2 * heapBitsShift)
+		b >>= 2
+		nb -= 2
+		// Note: no bitScan for second word because that's
+		// the checkmark.
+		*hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
+		*hbitp |= uint8(hb)
+		hbitp = subtract1(hbitp)
+		if w += 2; w >= nw {
+			// We know that there is more data, because we handled 2-word objects above.
+			// This must be at least a 6-word object. If we're out of pointer words,
+			// mark no scan in next bitmap byte and finish.
+			hb = 0
+			w += 4
+			goto Phase3
+		}
+	}
+
+	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
+	// The loop computes the bits for that last write but does not execute the write;
+	// it leaves the bits in hb for processing by phase 3.
+	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
+	// use in the first half of the loop right now, and then we only adjust nb explicitly
+	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
+	nb -= 4
+	for {
+		// Emit bitmap byte.
+		// b has at least nb+4 bits, with one exception:
+		// if w+4 >= nw, then b has only nw-w bits,
+		// but we'll stop at the break and then truncate
+		// appropriately in Phase 3.
+		hb = b & bitPointerAll
+		hb |= bitScanAll
+		if w += 4; w >= nw {
+			break
+		}
+		*hbitp = uint8(hb)
+		hbitp = subtract1(hbitp)
+		b >>= 4
+
+		// Load more bits. b has nb right now.
+		if p != endp {
+			// Fast path: keep reading from ptrmask.
+			// nb unmodified: we just loaded 8 bits,
+			// and the next iteration will consume 8 bits,
+			// leaving us with the same nb the next time we're here.
+			if nb < 8 {
+				b |= uintptr(*p) << nb
+				p = add1(p)
+			} else {
+				// Reduce the number of bits in b.
+				// This is important if we skipped
+				// over a scalar tail, since nb could
+				// be larger than the bit width of b.
+				nb -= 8
+			}
+		} else if p == nil {
+			// Almost as fast path: track bit count and refill from pbits.
+			// For short repetitions.
+			if nb < 8 {
+				b |= pbits << nb
+				nb += endnb
+			}
+			nb -= 8 // for next iteration
+		} else {
+			// Slow path: reached end of ptrmask.
+			// Process final partial byte and rewind to start.
+			b |= uintptr(*p) << nb
+			nb += endnb
+			if nb < 8 {
+				b |= uintptr(*ptrmask) << nb
+				p = add1(ptrmask)
+			} else {
+				nb -= 8
+				p = ptrmask
+			}
+		}
+
+		// Emit bitmap byte.
+		hb = b & bitPointerAll
+		hb |= bitScanAll
+		if w += 4; w >= nw {
+			break
+		}
+		*hbitp = uint8(hb)
+		hbitp = subtract1(hbitp)
+		b >>= 4
+	}
+
+Phase3:
+	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
+	if w > nw {
+		// Counting the 4 entries in hb not yet written to memory,
+		// there are more entries than possible pointer slots.
+		// Discard the excess entries (can't be more than 3).
+		mask := uintptr(1)<<(4-(w-nw)) - 1
+		hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
+	}
+
+	// Change nw from counting possibly-pointer words to total words in allocation.
+	nw = size / sys.PtrSize
+
+	// Write whole bitmap bytes.
+	// The first is hb, the rest are zero.
+	if w <= nw {
+		*hbitp = uint8(hb)
+		hbitp = subtract1(hbitp)
+		hb = 0 // for possible final half-byte below
+		for w += 4; w <= nw; w += 4 {
+			*hbitp = 0
+			hbitp = subtract1(hbitp)
+		}
+	}
+
+	// Write final partial bitmap byte if any.
+	// We know w > nw, or else we'd still be in the loop above.
+	// It can be bigger only due to the 4 entries in hb that it counts.
+	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
+	// and can discard the 4 sitting in hb.
+	// But if w == nw+2, we need to write first two in hb.
+	// The byte is shared with the next object, so be careful with
+	// existing bits.
+	if w == nw+2 {
+		*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
+	}
+
+Phase4:
+	// Phase 4: all done, but perhaps double check.
+	if doubleCheck {
+		end := heapBitsForAddr(x + size)
+		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
+			println("ended at wrong bitmap byte for", *typ.string, "x", dataSize/typ.size)
+			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
+			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
+			h0 := heapBitsForAddr(x)
+			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
+			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
+			throw("bad heapBitsSetType")
+		}
+
+		// Double-check that bits to be written were written correctly.
+		// Does not check that other bits were not written, unfortunately.
+		h := heapBitsForAddr(x)
+		nptr := typ.ptrdata / sys.PtrSize
+		ndata := typ.size / sys.PtrSize
+		count := dataSize / typ.size
+		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
+		for i := uintptr(0); i < size/sys.PtrSize; i++ {
+			j := i % ndata
+			var have, want uint8
+			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
+			if i >= totalptr {
+				want = 0 // deadmarker
+				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
+					want = bitScan
+				}
+			} else {
+				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
+					want |= bitPointer
+				}
+				if i != 1 {
+					want |= bitScan
+				} else {
+					have &^= bitScan
+				}
+			}
+			if have != want {
+				println("mismatch writing bits for", *typ.string, "x", dataSize/typ.size)
+				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
+				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
+				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
+				h0 := heapBitsForAddr(x)
+				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
+				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
+				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
+				println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
+				if typ.kind&kindGCProg != 0 {
+					println("GC program:")
+					dumpGCProg(addb(typ.gcdata, 4))
+				}
+				throw("bad heapBitsSetType")
+			}
+			h = h.next()
+		}
+		if ptrmask == debugPtrmask.data {
+			unlock(&debugPtrmask.lock)
+		}
+	}
+}
+
+// heapBitsSetTypeNoScan marks x as noscan by setting the first word
+// of x in the heap bitmap to scalar/dead.
+func heapBitsSetTypeNoScan(x uintptr) {
+	h := heapBitsForAddr(uintptr(x))
+	*h.bitp &^= (bitPointer | bitScan) << h.shift
+}
+
+var debugPtrmask struct {
+	lock mutex
+	data *byte
+}
+
+// heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
+// progSize is the size of the memory described by the program.
+// elemSize is the size of the element that the GC program describes (a prefix of).
+// dataSize is the total size of the intended data, a multiple of elemSize.
+// allocSize is the total size of the allocated memory.
+//
+// GC programs are only used for large allocations.
+// heapBitsSetType requires that allocSize is a multiple of 4 words,
+// so that the relevant bitmap bytes are not shared with surrounding
+// objects.
+func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
+	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
+		// Alignment will be wrong.
+		throw("heapBitsSetTypeGCProg: small allocation")
+	}
+	var totalBits uintptr
+	if elemSize == dataSize {
+		totalBits = runGCProg(prog, nil, h.bitp, 2)
+		if totalBits*sys.PtrSize != progSize {
+			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
+			throw("heapBitsSetTypeGCProg: unexpected bit count")
+		}
+	} else {
+		count := dataSize / elemSize
+
+		// Piece together program trailer to run after prog that does:
+		//	literal(0)
+		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
+		//	repeat(elemSize, count-1) // repeat that element for count
+		// This zero-pads the data remaining in the first element and then
+		// repeats that first element to fill the array.
+		var trailer [40]byte // 3 varints (max 10 each) + some bytes
+		i := 0
+		if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
+			// literal(0)
+			trailer[i] = 0x01
+			i++
+			trailer[i] = 0
+			i++
+			if n > 1 {
+				// repeat(1, n-1)
+				trailer[i] = 0x81
+				i++
+				n--
+				for ; n >= 0x80; n >>= 7 {
+					trailer[i] = byte(n | 0x80)
+					i++
+				}
+				trailer[i] = byte(n)
+				i++
+			}
+		}
+		// repeat(elemSize/ptrSize, count-1)
+		trailer[i] = 0x80
+		i++
+		n := elemSize / sys.PtrSize
+		for ; n >= 0x80; n >>= 7 {
+			trailer[i] = byte(n | 0x80)
+			i++
+		}
+		trailer[i] = byte(n)
+		i++
+		n = count - 1
+		for ; n >= 0x80; n >>= 7 {
+			trailer[i] = byte(n | 0x80)
+			i++
+		}
+		trailer[i] = byte(n)
+		i++
+		trailer[i] = 0
+		i++
+
+		runGCProg(prog, &trailer[0], h.bitp, 2)
+
+		// Even though we filled in the full array just now,
+		// record that we only filled in up to the ptrdata of the
+		// last element. This will cause the code below to
+		// memclr the dead section of the final array element,
+		// so that scanobject can stop early in the final element.
+		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
+	}
+	endProg := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
+	endAlloc := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
+	memclrNoHeapPointers(add(endAlloc, 1), uintptr(endProg)-uintptr(endAlloc))
+}
+
+// progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
+// size the size of the region described by prog, in bytes.
+// The resulting bitvector will have no more than size/sys.PtrSize bits.
+func progToPointerMask(prog *byte, size uintptr) bitvector {
+	n := (size/sys.PtrSize + 7) / 8
+	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
+	x[len(x)-1] = 0xa1 // overflow check sentinel
+	n = runGCProg(prog, nil, &x[0], 1)
+	if x[len(x)-1] != 0xa1 {
+		throw("progToPointerMask: overflow")
+	}
+	return bitvector{int32(n), &x[0]}
+}
+
+// Packed GC pointer bitmaps, aka GC programs.
+//
+// For large types containing arrays, the type information has a
+// natural repetition that can be encoded to save space in the
+// binary and in the memory representation of the type information.
+//
+// The encoding is a simple Lempel-Ziv style bytecode machine
+// with the following instructions:
+//
+//	00000000: stop
+//	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
+//	10000000 n c: repeat the previous n bits c times; n, c are varints
+//	1nnnnnnn c: repeat the previous n bits c times; c is a varint
+
+// runGCProg executes the GC program prog, and then trailer if non-nil,
+// writing to dst with entries of the given size.
+// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
+// If size == 2, dst is the 2-bit heap bitmap, and writes move backward
+// starting at dst (because the heap bitmap does). In this case, the caller guarantees
+// that only whole bytes in dst need to be written.
+//
+// runGCProg returns the number of 1- or 2-bit entries written to memory.
+func runGCProg(prog, trailer, dst *byte, size int) uintptr {
+	dstStart := dst
+
+	// Bits waiting to be written to memory.
+	var bits uintptr
+	var nbits uintptr
+
+	p := prog
+Run:
+	for {
+		// Flush accumulated full bytes.
+		// The rest of the loop assumes that nbits <= 7.
+		for ; nbits >= 8; nbits -= 8 {
+			if size == 1 {
+				*dst = uint8(bits)
+				dst = add1(dst)
+				bits >>= 8
+			} else {
+				v := bits&bitPointerAll | bitScanAll
+				*dst = uint8(v)
+				dst = subtract1(dst)
+				bits >>= 4
+				v = bits&bitPointerAll | bitScanAll
+				*dst = uint8(v)
+				dst = subtract1(dst)
+				bits >>= 4
+			}
+		}
+
+		// Process one instruction.
+		inst := uintptr(*p)
+		p = add1(p)
+		n := inst & 0x7F
+		if inst&0x80 == 0 {
+			// Literal bits; n == 0 means end of program.
+			if n == 0 {
+				// Program is over; continue in trailer if present.
+				if trailer != nil {
+					//println("trailer")
+					p = trailer
+					trailer = nil
+					continue
+				}
+				//println("done")
+				break Run
+			}
+			//println("lit", n, dst)
+			nbyte := n / 8
+			for i := uintptr(0); i < nbyte; i++ {
+				bits |= uintptr(*p) << nbits
+				p = add1(p)
+				if size == 1 {
+					*dst = uint8(bits)
+					dst = add1(dst)
+					bits >>= 8
+				} else {
+					v := bits&0xf | bitScanAll
+					*dst = uint8(v)
+					dst = subtract1(dst)
+					bits >>= 4
+					v = bits&0xf | bitScanAll
+					*dst = uint8(v)
+					dst = subtract1(dst)
+					bits >>= 4
+				}
+			}
+			if n %= 8; n > 0 {
+				bits |= uintptr(*p) << nbits
+				p = add1(p)
+				nbits += n
+			}
+			continue Run
+		}
+
+		// Repeat. If n == 0, it is encoded in a varint in the next bytes.
+		if n == 0 {
+			for off := uint(0); ; off += 7 {
+				x := uintptr(*p)
+				p = add1(p)
+				n |= (x & 0x7F) << off
+				if x&0x80 == 0 {
+					break
+				}
+			}
+		}
+
+		// Count is encoded in a varint in the next bytes.
+		c := uintptr(0)
+		for off := uint(0); ; off += 7 {
+			x := uintptr(*p)
+			p = add1(p)
+			c |= (x & 0x7F) << off
+			if x&0x80 == 0 {
+				break
+			}
+		}
+		c *= n // now total number of bits to copy
+
+		// If the number of bits being repeated is small, load them
+		// into a register and use that register for the entire loop
+		// instead of repeatedly reading from memory.
+		// Handling fewer than 8 bits here makes the general loop simpler.
+		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
+		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
+		// it will not overflow.
+		src := dst
+		const maxBits = sys.PtrSize*8 - 7
+		if n <= maxBits {
+			// Start with bits in output buffer.
+			pattern := bits
+			npattern := nbits
+
+			// If we need more bits, fetch them from memory.
+			if size == 1 {
+				src = subtract1(src)
+				for npattern < n {
+					pattern <<= 8
+					pattern |= uintptr(*src)
+					src = subtract1(src)
+					npattern += 8
+				}
+			} else {
+				src = add1(src)
+				for npattern < n {
+					pattern <<= 4
+					pattern |= uintptr(*src) & 0xf
+					src = add1(src)
+					npattern += 4
+				}
+			}
+
+			// We started with the whole bit output buffer,
+			// and then we loaded bits from whole bytes.
+			// Either way, we might now have too many instead of too few.
+			// Discard the extra.
+			if npattern > n {
+				pattern >>= npattern - n
+				npattern = n
+			}
+
+			// Replicate pattern to at most maxBits.
+			if npattern == 1 {
+				// One bit being repeated.
+				// If the bit is 1, make the pattern all 1s.
+				// If the bit is 0, the pattern is already all 0s,
+				// but we can claim that the number of bits
+				// in the word is equal to the number we need (c),
+				// because right shift of bits will zero fill.
+				if pattern == 1 {
+					pattern = 1<<maxBits - 1
+					npattern = maxBits
+				} else {
+					npattern = c
+				}
+			} else {
+				b := pattern
+				nb := npattern
+				if nb+nb <= maxBits {
+					// Double pattern until the whole uintptr is filled.
+					for nb <= sys.PtrSize*8 {
+						b |= b << nb
+						nb += nb
+					}
+					// Trim away incomplete copy of original pattern in high bits.
+					// TODO(rsc): Replace with table lookup or loop on systems without divide?
+					nb = maxBits / npattern * npattern
+					b &= 1<<nb - 1
+					pattern = b
+					npattern = nb
+				}
+			}
+
+			// Add pattern to bit buffer and flush bit buffer, c/npattern times.
+			// Since pattern contains >8 bits, there will be full bytes to flush
+			// on each iteration.
+			for ; c >= npattern; c -= npattern {
+				bits |= pattern << nbits
+				nbits += npattern
+				if size == 1 {
+					for nbits >= 8 {
+						*dst = uint8(bits)
+						dst = add1(dst)
+						bits >>= 8
+						nbits -= 8
+					}
+				} else {
+					for nbits >= 4 {
+						*dst = uint8(bits&0xf | bitScanAll)
+						dst = subtract1(dst)
+						bits >>= 4
+						nbits -= 4
+					}
+				}
+			}
+
+			// Add final fragment to bit buffer.
+			if c > 0 {
+				pattern &= 1<<c - 1
+				bits |= pattern << nbits
+				nbits += c
+			}
+			continue Run
+		}
+
+		// Repeat; n too large to fit in a register.
+		// Since nbits <= 7, we know the first few bytes of repeated data
+		// are already written to memory.
+		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
+		if size == 1 {
+			// Leading src fragment.
+			src = subtractb(src, (off+7)/8)
+			if frag := off & 7; frag != 0 {
+				bits |= uintptr(*src) >> (8 - frag) << nbits
+				src = add1(src)
+				nbits += frag
+				c -= frag
+			}
+			// Main loop: load one byte, write another.
+			// The bits are rotating through the bit buffer.
+			for i := c / 8; i > 0; i-- {
+				bits |= uintptr(*src) << nbits
+				src = add1(src)
+				*dst = uint8(bits)
+				dst = add1(dst)
+				bits >>= 8
+			}
+			// Final src fragment.
+			if c %= 8; c > 0 {
+				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
+				nbits += c
+			}
+		} else {
+			// Leading src fragment.
+			src = addb(src, (off+3)/4)
+			if frag := off & 3; frag != 0 {
+				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
+				src = subtract1(src)
+				nbits += frag
+				c -= frag
+			}
+			// Main loop: load one byte, write another.
+			// The bits are rotating through the bit buffer.
+			for i := c / 4; i > 0; i-- {
+				bits |= (uintptr(*src) & 0xf) << nbits
+				src = subtract1(src)
+				*dst = uint8(bits&0xf | bitScanAll)
+				dst = subtract1(dst)
+				bits >>= 4
+			}
+			// Final src fragment.
+			if c %= 4; c > 0 {
+				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
+				nbits += c
+			}
+		}
+	}
+
+	// Write any final bits out, using full-byte writes, even for the final byte.
+	var totalBits uintptr
+	if size == 1 {
+		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
+		nbits += -nbits & 7
+		for ; nbits > 0; nbits -= 8 {
+			*dst = uint8(bits)
+			dst = add1(dst)
+			bits >>= 8
+		}
+	} else {
+		totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
+		nbits += -nbits & 3
+		for ; nbits > 0; nbits -= 4 {
+			v := bits&0xf | bitScanAll
+			*dst = uint8(v)
+			dst = subtract1(dst)
+			bits >>= 4
+		}
+	}
+	return totalBits
+}
+
+func dumpGCProg(p *byte) {
+	nptr := 0
+	for {
+		x := *p
+		p = add1(p)
+		if x == 0 {
+			print("\t", nptr, " end\n")
+			break
+		}
+		if x&0x80 == 0 {
+			print("\t", nptr, " lit ", x, ":")
+			n := int(x+7) / 8
+			for i := 0; i < n; i++ {
+				print(" ", hex(*p))
+				p = add1(p)
+			}
+			print("\n")
+			nptr += int(x)
+		} else {
+			nbit := int(x &^ 0x80)
+			if nbit == 0 {
+				for nb := uint(0); ; nb += 7 {
+					x := *p
+					p = add1(p)
+					nbit |= int(x&0x7f) << nb
+					if x&0x80 == 0 {
+						break
+					}
+				}
+			}
+			count := 0
+			for nb := uint(0); ; nb += 7 {
+				x := *p
+				p = add1(p)
+				count |= int(x&0x7f) << nb
+				if x&0x80 == 0 {
+					break
+				}
+			}
+			print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
+			nptr += nbit * count
+		}
+	}
+}
+
+// Testing.
+
+// gcbits returns the GC type info for x, for testing.
+// The result is the bitmap entries (0 or 1), one entry per byte.
+//go:linkname reflect_gcbits reflect.gcbits
+func reflect_gcbits(x interface{}) []byte {
+	ret := getgcmask(x)
+	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
+	nptr := typ.ptrdata / sys.PtrSize
+	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
+		ret = ret[:len(ret)-1]
+	}
+	return ret
+}
+
+// Returns GC type info for object p for testing.
+func getgcmask(ep interface{}) (mask []byte) {
+	e := *efaceOf(&ep)
+	p := e.data
+	t := e._type
+	// data or bss
+	roots := gcRoots
+	for roots != nil {
+		for i := 0; i < roots.count; i++ {
+			pr := roots.roots[i]
+			addr := uintptr(pr.decl)
+			if addr <= uintptr(p) && uintptr(p) < addr+pr.size {
+				n := (*ptrtype)(unsafe.Pointer(t)).elem.size
+				mask = make([]byte, n/sys.PtrSize)
+				copy(mask, (*[1 << 29]uint8)(unsafe.Pointer(pr.gcdata))[:pr.ptrdata])
+			}
+			return
+		}
+		roots = roots.next
+	}
+
+	// heap
+	var n uintptr
+	var base uintptr
+	if mlookup(uintptr(p), &base, &n, nil) != 0 {
+		mask = make([]byte, n/sys.PtrSize)
+		for i := uintptr(0); i < n; i += sys.PtrSize {
+			hbits := heapBitsForAddr(base + i)
+			if hbits.isPointer() {
+				mask[i/sys.PtrSize] = 1
+			}
+			if i != 1*sys.PtrSize && !hbits.morePointers() {
+				mask = mask[:i/sys.PtrSize]
+				break
+			}
+		}
+		return
+	}
+
+	// otherwise, not something the GC knows about.
+	// possibly read-only data, like malloc(0).
+	// must not have pointers
+	// For gccgo, may live on the stack, which is collected conservatively.
+	return
+}
diff --git a/libgo/go/runtime/mcache.go b/libgo/go/runtime/mcache.go
index b65dd37..92dabef 100644
--- a/libgo/go/runtime/mcache.go
+++ b/libgo/go/runtime/mcache.go
@@ -4,16 +4,8 @@
 
 package runtime
 
-// This is a temporary mcache.go for gccgo.
-// At some point it will be replaced by the one in the gc runtime package.
-
 import "unsafe"
 
-type mcachelist struct {
-	list  *mlink
-	nlist uint32
-}
-
 // Per-thread (in Go, per-P) cache for small objects.
 // No locking needed because it is per-thread (per-P).
 //
@@ -24,8 +16,8 @@ type mcachelist struct {
 type mcache struct {
 	// The following members are accessed on every malloc,
 	// so they are grouped here for better caching.
-	next_sample      int32   // trigger heap sample after allocating this many bytes
-	local_cachealloc uintptr // bytes allocated (or freed) from cache since last lock of heap
+	next_sample int32   // trigger heap sample after allocating this many bytes
+	local_scan  uintptr // bytes of scannable heap allocated
 
 	// Allocator cache for tiny objects w/o pointers.
 	// See "Tiny allocator" comment in malloc.go.
@@ -36,12 +28,12 @@ type mcache struct {
 	// tiny is a heap pointer. Since mcache is in non-GC'd memory,
 	// we handle it by clearing it in releaseAll during mark
 	// termination.
-	tiny     unsafe.Pointer
-	tinysize uintptr
+	tiny             uintptr
+	tinyoffset       uintptr
+	local_tinyallocs uintptr // number of tiny allocs not counted in other stats
 
 	// The rest is not accessed on every malloc.
-	alloc [_NumSizeClasses]*mspan     // spans to allocate from
-	free  [_NumSizeClasses]mcachelist // lists of explicitly freed objects
+	alloc [_NumSizeClasses]*mspan // spans to allocate from
 
 	// Local allocator stats, flushed during GC.
 	local_nlookup    uintptr                  // number of pointer lookups
@@ -50,46 +42,98 @@ type mcache struct {
 	local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
 }
 
-type mtypes struct {
-	compression byte
-	data        uintptr
+// A gclink is a node in a linked list of blocks, like mlink,
+// but it is opaque to the garbage collector.
+// The GC does not trace the pointers during collection,
+// and the compiler does not emit write barriers for assignments
+// of gclinkptr values. Code should store references to gclinks
+// as gclinkptr, not as *gclink.
+type gclink struct {
+	next gclinkptr
 }
 
-type special struct {
-	next   *special
-	offset uint16
-	kind   byte
+// A gclinkptr is a pointer to a gclink, but it is opaque
+// to the garbage collector.
+type gclinkptr uintptr
+
+// ptr returns the *gclink form of p.
+// The result should be used for accessing fields, not stored
+// in other data structures.
+func (p gclinkptr) ptr() *gclink {
+	return (*gclink)(unsafe.Pointer(p))
 }
 
-type mspan struct {
-	next     *mspan // next span in list, or nil if none
-	prev     *mspan // previous span's next field, or list head's first field if none
-	start    uintptr
-	npages   uintptr // number of pages in span
-	freelist *mlink
-
-	// sweep generation:
-	// if sweepgen == h->sweepgen - 2, the span needs sweeping
-	// if sweepgen == h->sweepgen - 1, the span is currently being swept
-	// if sweepgen == h->sweepgen, the span is swept and ready to use
-	// h->sweepgen is incremented by 2 after every GC
-
-	sweepgen    uint32
-	ref         uint16
-	sizeclass   uint8   // size class
-	incache     bool    // being used by an mcache
-	state       uint8   // mspaninuse etc
-	needzero    uint8   // needs to be zeroed before allocation
-	elemsize    uintptr // computed from sizeclass or from npages
-	unusedsince int64   // first time spotted by gc in mspanfree state
-	npreleased  uintptr // number of pages released to the os
-	limit       uintptr // end of data in span
-	types       mtypes
-	speciallock mutex    // guards specials list
-	specials    *special // linked list of special records sorted by offset.
-	freebuf     *mlink
+// dummy MSpan that contains no free objects.
+var emptymspan mspan
+
+func allocmcache() *mcache {
+	lock(&mheap_.lock)
+	c := (*mcache)(mheap_.cachealloc.alloc())
+	unlock(&mheap_.lock)
+	for i := 0; i < _NumSizeClasses; i++ {
+		c.alloc[i] = &emptymspan
+	}
+	c.next_sample = nextSample()
+	return c
+}
+
+func freemcache(c *mcache) {
+	systemstack(func() {
+		c.releaseAll()
+
+		// NOTE(rsc,rlh): If gcworkbuffree comes back, we need to coordinate
+		// with the stealing of gcworkbufs during garbage collection to avoid
+		// a race where the workbuf is double-freed.
+		// gcworkbuffree(c.gcworkbuf)
+
+		lock(&mheap_.lock)
+		purgecachedstats(c)
+		mheap_.cachealloc.free(unsafe.Pointer(c))
+		unlock(&mheap_.lock)
+	})
+}
+
+// Gets a span that has a free object in it and assigns it
+// to be the cached span for the given sizeclass. Returns this span.
+func (c *mcache) refill(sizeclass int32) *mspan {
+	_g_ := getg()
+
+	_g_.m.locks++
+	// Return the current cached span to the central lists.
+	s := c.alloc[sizeclass]
+
+	if uintptr(s.allocCount) != s.nelems {
+		throw("refill of span with free space remaining")
+	}
+
+	if s != &emptymspan {
+		s.incache = false
+	}
+
+	// Get a new cached span from the central lists.
+	s = mheap_.central[sizeclass].mcentral.cacheSpan()
+	if s == nil {
+		throw("out of memory")
+	}
+
+	if uintptr(s.allocCount) == s.nelems {
+		throw("span has no free space")
+	}
+
+	c.alloc[sizeclass] = s
+	_g_.m.locks--
+	return s
 }
 
-type mlink struct {
-	next *mlink
+func (c *mcache) releaseAll() {
+	for i := 0; i < _NumSizeClasses; i++ {
+		s := c.alloc[i]
+		if s != &emptymspan {
+			mheap_.central[i].mcentral.uncacheSpan(s)
+			c.alloc[i] = &emptymspan
+		}
+	}
+	// Clear tinyalloc pool.
+	c.tiny = 0
+	c.tinyoffset = 0
 }
diff --git a/libgo/go/runtime/mcentral.go b/libgo/go/runtime/mcentral.go
new file mode 100644
index 0000000..ddcf81e
--- /dev/null
+++ b/libgo/go/runtime/mcentral.go
@@ -0,0 +1,222 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Central free lists.
+//
+// See malloc.go for an overview.
+//
+// The MCentral doesn't actually contain the list of free objects; the MSpan does.
+// Each MCentral is two lists of MSpans: those with free objects (c->nonempty)
+// and those that are completely allocated (c->empty).
+
+package runtime
+
+import "runtime/internal/atomic"
+
+// Central list of free objects of a given size.
+//
+//go:notinheap
+type mcentral struct {
+	lock      mutex
+	sizeclass int32
+	nonempty  mSpanList // list of spans with a free object, ie a nonempty free list
+	empty     mSpanList // list of spans with no free objects (or cached in an mcache)
+}
+
+// Initialize a single central free list.
+func (c *mcentral) init(sizeclass int32) {
+	c.sizeclass = sizeclass
+	c.nonempty.init()
+	c.empty.init()
+}
+
+// Allocate a span to use in an MCache.
+func (c *mcentral) cacheSpan() *mspan {
+	// Deduct credit for this span allocation and sweep if necessary.
+	spanBytes := uintptr(class_to_allocnpages[c.sizeclass]) * _PageSize
+	deductSweepCredit(spanBytes, 0)
+
+	lock(&c.lock)
+	sg := mheap_.sweepgen
+retry:
+	var s *mspan
+	for s = c.nonempty.first; s != nil; s = s.next {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			c.nonempty.remove(s)
+			c.empty.insertBack(s)
+			unlock(&c.lock)
+			s.sweep(true)
+			goto havespan
+		}
+		if s.sweepgen == sg-1 {
+			// the span is being swept by background sweeper, skip
+			continue
+		}
+		// we have a nonempty span that does not require sweeping, allocate from it
+		c.nonempty.remove(s)
+		c.empty.insertBack(s)
+		unlock(&c.lock)
+		goto havespan
+	}
+
+	for s = c.empty.first; s != nil; s = s.next {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			// we have an empty span that requires sweeping,
+			// sweep it and see if we can free some space in it
+			c.empty.remove(s)
+			// swept spans are at the end of the list
+			c.empty.insertBack(s)
+			unlock(&c.lock)
+			s.sweep(true)
+			freeIndex := s.nextFreeIndex()
+			if freeIndex != s.nelems {
+				s.freeindex = freeIndex
+				goto havespan
+			}
+			lock(&c.lock)
+			// the span is still empty after sweep
+			// it is already in the empty list, so just retry
+			goto retry
+		}
+		if s.sweepgen == sg-1 {
+			// the span is being swept by background sweeper, skip
+			continue
+		}
+		// already swept empty span,
+		// all subsequent ones must also be either swept or in process of sweeping
+		break
+	}
+	unlock(&c.lock)
+
+	// Replenish central list if empty.
+	s = c.grow()
+	if s == nil {
+		return nil
+	}
+	lock(&c.lock)
+	c.empty.insertBack(s)
+	unlock(&c.lock)
+
+	// At this point s is a non-empty span, queued at the end of the empty list,
+	// c is unlocked.
+havespan:
+	cap := int32((s.npages << _PageShift) / s.elemsize)
+	n := cap - int32(s.allocCount)
+	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
+		throw("span has no free objects")
+	}
+	usedBytes := uintptr(s.allocCount) * s.elemsize
+	if usedBytes > 0 {
+		reimburseSweepCredit(usedBytes)
+	}
+	atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
+	if trace.enabled {
+		// heap_live changed.
+		traceHeapAlloc()
+	}
+	if gcBlackenEnabled != 0 {
+		// heap_live changed.
+		gcController.revise()
+	}
+	s.incache = true
+	freeByteBase := s.freeindex &^ (64 - 1)
+	whichByte := freeByteBase / 8
+	// Init alloc bits cache.
+	s.refillAllocCache(whichByte)
+
+	// Adjust the allocCache so that s.freeindex corresponds to the low bit in
+	// s.allocCache.
+	s.allocCache >>= s.freeindex % 64
+
+	return s
+}
+
+// Return span from an MCache.
+func (c *mcentral) uncacheSpan(s *mspan) {
+	lock(&c.lock)
+
+	s.incache = false
+
+	if s.allocCount == 0 {
+		throw("uncaching span but s.allocCount == 0")
+	}
+
+	cap := int32((s.npages << _PageShift) / s.elemsize)
+	n := cap - int32(s.allocCount)
+	if n > 0 {
+		c.empty.remove(s)
+		c.nonempty.insert(s)
+		// mCentral_CacheSpan conservatively counted
+		// unallocated slots in heap_live. Undo this.
+		atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+	}
+	unlock(&c.lock)
+}
+
+// freeSpan updates c and s after sweeping s.
+// It sets s's sweepgen to the latest generation,
+// and, based on the number of free objects in s,
+// moves s to the appropriate list of c or returns it
+// to the heap.
+// freeSpan returns true if s was returned to the heap.
+// If preserve=true, it does not move s (the caller
+// must take care of it).
+func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool {
+	if s.incache {
+		throw("freeSpan given cached span")
+	}
+	s.needzero = 1
+
+	if preserve {
+		// preserve is set only when called from MCentral_CacheSpan above,
+		// the span must be in the empty list.
+		if !s.inList() {
+			throw("can't preserve unlinked span")
+		}
+		atomic.Store(&s.sweepgen, mheap_.sweepgen)
+		return false
+	}
+
+	lock(&c.lock)
+
+	// Move to nonempty if necessary.
+	if wasempty {
+		c.empty.remove(s)
+		c.nonempty.insert(s)
+	}
+
+	// delay updating sweepgen until here. This is the signal that
+	// the span may be used in an MCache, so it must come after the
+	// linked list operations above (actually, just after the
+	// lock of c above.)
+	atomic.Store(&s.sweepgen, mheap_.sweepgen)
+
+	if s.allocCount != 0 {
+		unlock(&c.lock)
+		return false
+	}
+
+	c.nonempty.remove(s)
+	unlock(&c.lock)
+	mheap_.freeSpan(s, 0)
+	return true
+}
+
+// grow allocates a new empty span from the heap and initializes it for c's size class.
+func (c *mcentral) grow() *mspan {
+	npages := uintptr(class_to_allocnpages[c.sizeclass])
+	size := uintptr(class_to_size[c.sizeclass])
+	n := (npages << _PageShift) / size
+
+	s := mheap_.alloc(npages, c.sizeclass, false, true)
+	if s == nil {
+		return nil
+	}
+
+	p := s.base()
+	s.limit = p + size*n
+
+	heapBitsForSpan(s.base()).initSpan(s)
+	return s
+}
diff --git a/libgo/go/runtime/mem_gccgo.go b/libgo/go/runtime/mem_gccgo.go
new file mode 100644
index 0000000..161ff26
--- /dev/null
+++ b/libgo/go/runtime/mem_gccgo.go
@@ -0,0 +1,280 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The gccgo version of mem_*.go.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// Functions called by C code.
+//go:linkname sysAlloc runtime.sysAlloc
+
+//extern mmap
+func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uintptr) unsafe.Pointer
+
+//extern munmap
+func munmap(addr unsafe.Pointer, length uintptr) int32
+
+//extern mincore
+func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32
+
+//extern madvise
+func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32
+
+var mmapFD = int32(-1)
+
+var devZero = []byte("/dev/zero\x00")
+
+func init() {
+	if _MAP_ANON == 0 {
+		mmapFD = open(&devZero[0], 0 /* O_RDONLY */, 0)
+		if mmapFD < 0 {
+			println("open /dev/zero: errno=", errno())
+			exit(2)
+		}
+	}
+}
+
+// NOTE: vec must be just 1 byte long here.
+// Mincore returns ENOMEM if any of the pages are unmapped,
+// but we want to know that all of the pages are unmapped.
+// To make these the same, we can only ask about one page
+// at a time. See golang.org/issue/7476.
+var addrspace_vec [1]byte
+
+func addrspace_free(v unsafe.Pointer, n uintptr) bool {
+	for off := uintptr(0); off < n; off += physPageSize {
+		// Use a length of 1 byte, which the kernel will round
+		// up to one physical page regardless of the true
+		// physical page size.
+		errval := 0
+		if mincore(unsafe.Pointer(uintptr(v)+off), 1, &addrspace_vec[0]) < 0 {
+			errval = errno()
+		}
+		if errval == _ENOSYS {
+			// mincore is not available on this system.
+			// Assume the address is available.
+			return true
+		}
+		if errval == _EINVAL {
+			// Address is not a multiple of the physical
+			// page size. Shouldn't happen, but just ignore it.
+			continue
+		}
+		// ENOMEM means unmapped, which is what we want.
+		// Anything else we assume means the pages are mapped.
+		if errval != _ENOMEM {
+			return false
+		}
+	}
+	return true
+}
+
+func mmap_fixed(v unsafe.Pointer, n uintptr, prot, flags, fd int32, offset uintptr) unsafe.Pointer {
+	p := mmap(v, n, prot, flags, fd, offset)
+	// On some systems, mmap ignores v without
+	// MAP_FIXED, so retry if the address space is free.
+	if p != v && addrspace_free(v, n) {
+		if uintptr(p) != _MAP_FAILED {
+			munmap(p, n)
+		}
+		p = mmap(v, n, prot, flags|_MAP_FIXED, fd, offset)
+	}
+	return p
+}
+
+// Don't split the stack as this method may be invoked without a valid G, which
+// prevents us from allocating more stack.
+//go:nosplit
+func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+	p := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0)
+	if uintptr(p) == _MAP_FAILED {
+		errval := errno()
+		if errval == _EACCES {
+			print("runtime: mmap: access denied\n")
+			exit(2)
+		}
+		if errval == _EAGAIN {
+			print("runtime: mmap: too much locked memory (check 'ulimit -l').\n")
+			exit(2)
+		}
+		return nil
+	}
+	mSysStatInc(sysStat, n)
+	return p
+}
+
+func sysUnused(v unsafe.Pointer, n uintptr) {
+	// By default, Linux's "transparent huge page" support will
+	// merge pages into a huge page if there's even a single
+	// present regular page, undoing the effects of the DONTNEED
+	// below. On amd64, that means khugepaged can turn a single
+	// 4KB page to 2MB, bloating the process's RSS by as much as
+	// 512X. (See issue #8832 and Linux kernel bug
+	// https://bugzilla.kernel.org/show_bug.cgi?id=93111)
+	//
+	// To work around this, we explicitly disable transparent huge
+	// pages when we release pages of the heap. However, we have
+	// to do this carefully because changing this flag tends to
+	// split the VMA (memory mapping) containing v in to three
+	// VMAs in order to track the different values of the
+	// MADV_NOHUGEPAGE flag in the different regions. There's a
+	// default limit of 65530 VMAs per address space (sysctl
+	// vm.max_map_count), so we must be careful not to create too
+	// many VMAs (see issue #12233).
+	//
+	// Since huge pages are huge, there's little use in adjusting
+	// the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid
+	// exploding the number of VMAs by only adjusting the
+	// MADV_NOHUGEPAGE flag on a large granularity. This still
+	// gets most of the benefit of huge pages while keeping the
+	// number of VMAs under control. With hugePageSize = 2MB, even
+	// a pessimal heap can reach 128GB before running out of VMAs.
+	if sys.HugePageSize != 0 && _MADV_NOHUGEPAGE != 0 {
+		var s uintptr = sys.HugePageSize // division by constant 0 is a compile-time error :(
+
+		// If it's a large allocation, we want to leave huge
+		// pages enabled. Hence, we only adjust the huge page
+		// flag on the huge pages containing v and v+n-1, and
+		// only if those aren't aligned.
+		var head, tail uintptr
+		if uintptr(v)%s != 0 {
+			// Compute huge page containing v.
+			head = uintptr(v) &^ (s - 1)
+		}
+		if (uintptr(v)+n)%s != 0 {
+			// Compute huge page containing v+n-1.
+			tail = (uintptr(v) + n - 1) &^ (s - 1)
+		}
+
+		// Note that madvise will return EINVAL if the flag is
+		// already set, which is quite likely. We ignore
+		// errors.
+		if head != 0 && head+sys.HugePageSize == tail {
+			// head and tail are different but adjacent,
+			// so do this in one call.
+			madvise(unsafe.Pointer(head), 2*sys.HugePageSize, _MADV_NOHUGEPAGE)
+		} else {
+			// Advise the huge pages containing v and v+n-1.
+			if head != 0 {
+				madvise(unsafe.Pointer(head), sys.HugePageSize, _MADV_NOHUGEPAGE)
+			}
+			if tail != 0 && tail != head {
+				madvise(unsafe.Pointer(tail), sys.HugePageSize, _MADV_NOHUGEPAGE)
+			}
+		}
+	}
+
+	if uintptr(v)&(physPageSize-1) != 0 || n&(physPageSize-1) != 0 {
+		// madvise will round this to any physical page
+		// *covered* by this range, so an unaligned madvise
+		// will release more memory than intended.
+		throw("unaligned sysUnused")
+	}
+
+	if _MADV_DONTNEED != 0 {
+		madvise(v, n, _MADV_DONTNEED)
+	} else if _MADV_FREE != 0 {
+		madvise(v, n, _MADV_FREE)
+	}
+}
+
+func sysUsed(v unsafe.Pointer, n uintptr) {
+	if sys.HugePageSize != 0 && _MADV_HUGEPAGE != 0 {
+		// Partially undo the NOHUGEPAGE marks from sysUnused
+		// for whole huge pages between v and v+n. This may
+		// leave huge pages off at the end points v and v+n
+		// even though allocations may cover these entire huge
+		// pages. We could detect this and undo NOHUGEPAGE on
+		// the end points as well, but it's probably not worth
+		// the cost because when neighboring allocations are
+		// freed sysUnused will just set NOHUGEPAGE again.
+		var s uintptr = sys.HugePageSize
+
+		// Round v up to a huge page boundary.
+		beg := (uintptr(v) + (s - 1)) &^ (s - 1)
+		// Round v+n down to a huge page boundary.
+		end := (uintptr(v) + n) &^ (s - 1)
+
+		if beg < end {
+			madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE)
+		}
+	}
+}
+
+// Don't split the stack as this function may be invoked without a valid G,
+// which prevents us from allocating more stack.
+//go:nosplit
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
+	mSysStatDec(sysStat, n)
+	munmap(v, n)
+}
+
+func sysFault(v unsafe.Pointer, n uintptr) {
+	mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, mmapFD, 0)
+}
+
+func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer {
+	// On 64-bit, people with ulimit -v set complain if we reserve too
+	// much address space. Instead, assume that the reservation is okay
+	// if we can reserve at least 64K and check the assumption in SysMap.
+	// Only user-mode Linux (UML) rejects these requests.
+	if sys.PtrSize == 8 && uint64(n) > 1<<32 {
+		p := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0)
+		if p != v {
+			if uintptr(p) != _MAP_FAILED {
+				munmap(p, 64<<10)
+			}
+			return nil
+		}
+		munmap(p, 64<<10)
+		*reserved = false
+		return v
+	}
+
+	p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0)
+	if uintptr(p) == _MAP_FAILED {
+		return nil
+	}
+	*reserved = true
+	return p
+}
+
+func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) {
+	mSysStatInc(sysStat, n)
+
+	// On 64-bit, we don't actually have v reserved, so tread carefully.
+	if !reserved {
+		flags := int32(_MAP_ANON | _MAP_PRIVATE)
+		if GOOS == "dragonfly" {
+			// TODO(jsing): For some reason DragonFly seems to return
+			// memory at a different address than we requested, even when
+			// there should be no reason for it to do so. This can be
+			// avoided by using MAP_FIXED, but I'm not sure we should need
+			// to do this - we do not on other platforms.
+			flags |= _MAP_FIXED
+		}
+		p := mmap_fixed(v, n, _PROT_READ|_PROT_WRITE, flags, mmapFD, 0)
+		if uintptr(p) == _MAP_FAILED && errno() == _ENOMEM {
+			throw("runtime: out of memory")
+		}
+		if p != v {
+			print("runtime: address space conflict: map(", v, ") = ", p, "\n")
+			throw("runtime: address space conflict")
+		}
+		return
+	}
+
+	p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, mmapFD, 0)
+	if uintptr(p) == _MAP_FAILED && errno() == _ENOMEM {
+		throw("runtime: out of memory")
+	}
+	if p != v {
+		throw("runtime: cannot map pages in arena address space")
+	}
+}
diff --git a/libgo/go/runtime/memmove_linux_amd64_test.go b/libgo/go/runtime/memmove_linux_amd64_test.go
new file mode 100644
index 0000000..d0e8b42a
--- /dev/null
+++ b/libgo/go/runtime/memmove_linux_amd64_test.go
@@ -0,0 +1,62 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"io/ioutil"
+	"os"
+	"reflect"
+	"syscall"
+	"testing"
+	"unsafe"
+)
+
+// TestMemmoveOverflow maps 3GB of memory and calls memmove on
+// the corresponding slice.
+func TestMemmoveOverflow(t *testing.T) {
+	t.Parallel()
+	// Create a temporary file.
+	tmp, err := ioutil.TempFile("", "go-memmovetest")
+	if err != nil {
+		t.Fatal(err)
+	}
+	_, err = tmp.Write(make([]byte, 65536))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.Remove(tmp.Name())
+	defer tmp.Close()
+
+	// Set up mappings.
+	base, _, errno := syscall.Syscall6(syscall.SYS_MMAP,
+		0xa0<<32, 3<<30, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS, ^uintptr(0), 0)
+	if errno != 0 {
+		t.Skipf("could not create memory mapping: %s", errno)
+	}
+	syscall.Syscall(syscall.SYS_MUNMAP, base, 3<<30, 0)
+
+	for off := uintptr(0); off < 3<<30; off += 65536 {
+		_, _, errno := syscall.Syscall6(syscall.SYS_MMAP,
+			base+off, 65536, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_FIXED, tmp.Fd(), 0)
+		if errno != 0 {
+			t.Skipf("could not map a page at requested 0x%x: %s", base+off, errno)
+		}
+		defer syscall.Syscall(syscall.SYS_MUNMAP, base+off, 65536, 0)
+	}
+
+	var s []byte
+	sp := (*reflect.SliceHeader)(unsafe.Pointer(&s))
+	sp.Data = base
+	sp.Len, sp.Cap = 3<<30, 3<<30
+
+	n := copy(s[1:], s)
+	if n != 3<<30-1 {
+		t.Fatalf("copied %d bytes, expected %d", n, 3<<30-1)
+	}
+	n = copy(s, s[1:])
+	if n != 3<<30-1 {
+		t.Fatalf("copied %d bytes, expected %d", n, 3<<30-1)
+	}
+}
diff --git a/libgo/go/runtime/memmove_test.go b/libgo/go/runtime/memmove_test.go
new file mode 100644
index 0000000..74b8753
--- /dev/null
+++ b/libgo/go/runtime/memmove_test.go
@@ -0,0 +1,469 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"crypto/rand"
+	"encoding/binary"
+	"fmt"
+	"internal/race"
+	. "runtime"
+	"testing"
+)
+
+func TestMemmove(t *testing.T) {
+	t.Parallel()
+	size := 256
+	if testing.Short() {
+		size = 128 + 16
+	}
+	src := make([]byte, size)
+	dst := make([]byte, size)
+	for i := 0; i < size; i++ {
+		src[i] = byte(128 + (i & 127))
+	}
+	for i := 0; i < size; i++ {
+		dst[i] = byte(i & 127)
+	}
+	for n := 0; n <= size; n++ {
+		for x := 0; x <= size-n; x++ { // offset in src
+			for y := 0; y <= size-n; y++ { // offset in dst
+				copy(dst[y:y+n], src[x:x+n])
+				for i := 0; i < y; i++ {
+					if dst[i] != byte(i&127) {
+						t.Fatalf("prefix dst[%d] = %d", i, dst[i])
+					}
+				}
+				for i := y; i < y+n; i++ {
+					if dst[i] != byte(128+((i-y+x)&127)) {
+						t.Fatalf("copied dst[%d] = %d", i, dst[i])
+					}
+					dst[i] = byte(i & 127) // reset dst
+				}
+				for i := y + n; i < size; i++ {
+					if dst[i] != byte(i&127) {
+						t.Fatalf("suffix dst[%d] = %d", i, dst[i])
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestMemmoveAlias(t *testing.T) {
+	t.Parallel()
+	size := 256
+	if testing.Short() {
+		size = 128 + 16
+	}
+	buf := make([]byte, size)
+	for i := 0; i < size; i++ {
+		buf[i] = byte(i)
+	}
+	for n := 0; n <= size; n++ {
+		for x := 0; x <= size-n; x++ { // src offset
+			for y := 0; y <= size-n; y++ { // dst offset
+				copy(buf[y:y+n], buf[x:x+n])
+				for i := 0; i < y; i++ {
+					if buf[i] != byte(i) {
+						t.Fatalf("prefix buf[%d] = %d", i, buf[i])
+					}
+				}
+				for i := y; i < y+n; i++ {
+					if buf[i] != byte(i-y+x) {
+						t.Fatalf("copied buf[%d] = %d", i, buf[i])
+					}
+					buf[i] = byte(i) // reset buf
+				}
+				for i := y + n; i < size; i++ {
+					if buf[i] != byte(i) {
+						t.Fatalf("suffix buf[%d] = %d", i, buf[i])
+					}
+				}
+			}
+		}
+	}
+}
+
+func TestMemmoveLarge0x180000(t *testing.T) {
+	t.Parallel()
+	if race.Enabled {
+		t.Skip("skipping large memmove test under race detector")
+	}
+	testSize(t, 0x180000)
+}
+
+func TestMemmoveOverlapLarge0x120000(t *testing.T) {
+	t.Parallel()
+	if race.Enabled {
+		t.Skip("skipping large memmove test under race detector")
+	}
+	testOverlap(t, 0x120000)
+}
+
+func testSize(t *testing.T, size int) {
+	src := make([]byte, size)
+	dst := make([]byte, size)
+	_, _ = rand.Read(src)
+	_, _ = rand.Read(dst)
+
+	ref := make([]byte, size)
+	copyref(ref, dst)
+
+	for n := size - 50; n > 1; n >>= 1 {
+		for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
+			for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
+				copy(dst[y:y+n], src[x:x+n])
+				copyref(ref[y:y+n], src[x:x+n])
+				p := cmpb(dst, ref)
+				if p >= 0 {
+					t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, dst[p], ref[p])
+				}
+			}
+		}
+	}
+}
+
+func testOverlap(t *testing.T, size int) {
+	src := make([]byte, size)
+	test := make([]byte, size)
+	ref := make([]byte, size)
+	_, _ = rand.Read(src)
+
+	for n := size - 50; n > 1; n >>= 1 {
+		for x := 0; x <= size-n; x = x*7 + 1 { // offset in src
+			for y := 0; y <= size-n; y = y*9 + 1 { // offset in dst
+				// Reset input
+				copyref(test, src)
+				copyref(ref, src)
+				copy(test[y:y+n], test[x:x+n])
+				if y <= x {
+					copyref(ref[y:y+n], ref[x:x+n])
+				} else {
+					copybw(ref[y:y+n], ref[x:x+n])
+				}
+				p := cmpb(test, ref)
+				if p >= 0 {
+					t.Fatalf("Copy failed, copying from src[%d:%d] to dst[%d:%d].\nOffset %d is different, %v != %v", x, x+n, y, y+n, p, test[p], ref[p])
+				}
+			}
+		}
+	}
+
+}
+
+// Forward copy.
+func copyref(dst, src []byte) {
+	for i, v := range src {
+		dst[i] = v
+	}
+}
+
+// Backwards copy
+func copybw(dst, src []byte) {
+	if len(src) == 0 {
+		return
+	}
+	for i := len(src) - 1; i >= 0; i-- {
+		dst[i] = src[i]
+	}
+}
+
+// Returns offset of difference
+func matchLen(a, b []byte, max int) int {
+	a = a[:max]
+	b = b[:max]
+	for i, av := range a {
+		if b[i] != av {
+			return i
+		}
+	}
+	return max
+}
+
+func cmpb(a, b []byte) int {
+	l := matchLen(a, b, len(a))
+	if l == len(a) {
+		return -1
+	}
+	return l
+}
+
+func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) {
+	for _, n := range sizes {
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n))
+			fn(b, n)
+		})
+	}
+}
+
+var bufSizes = []int{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+	32, 64, 128, 256, 512, 1024, 2048, 4096,
+}
+
+func BenchmarkMemmove(b *testing.B) {
+	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
+		x := make([]byte, n)
+		y := make([]byte, n)
+		for i := 0; i < b.N; i++ {
+			copy(x, y)
+		}
+	})
+}
+
+func BenchmarkMemmoveUnalignedDst(b *testing.B) {
+	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
+		x := make([]byte, n+1)
+		y := make([]byte, n)
+		for i := 0; i < b.N; i++ {
+			copy(x[1:], y)
+		}
+	})
+}
+
+func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
+	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
+		x := make([]byte, n)
+		y := make([]byte, n+1)
+		for i := 0; i < b.N; i++ {
+			copy(x, y[1:])
+		}
+	})
+}
+
+func TestMemclr(t *testing.T) {
+	size := 512
+	if testing.Short() {
+		size = 128 + 16
+	}
+	mem := make([]byte, size)
+	for i := 0; i < size; i++ {
+		mem[i] = 0xee
+	}
+	for n := 0; n < size; n++ {
+		for x := 0; x <= size-n; x++ { // offset in mem
+			MemclrBytes(mem[x : x+n])
+			for i := 0; i < x; i++ {
+				if mem[i] != 0xee {
+					t.Fatalf("overwrite prefix mem[%d] = %d", i, mem[i])
+				}
+			}
+			for i := x; i < x+n; i++ {
+				if mem[i] != 0 {
+					t.Fatalf("failed clear mem[%d] = %d", i, mem[i])
+				}
+				mem[i] = 0xee
+			}
+			for i := x + n; i < size; i++ {
+				if mem[i] != 0xee {
+					t.Fatalf("overwrite suffix mem[%d] = %d", i, mem[i])
+				}
+			}
+		}
+	}
+}
+
+func BenchmarkMemclr(b *testing.B) {
+	for _, n := range []int{5, 16, 64, 256, 4096, 65536} {
+		x := make([]byte, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n))
+			for i := 0; i < b.N; i++ {
+				MemclrBytes(x)
+			}
+		})
+	}
+	for _, m := range []int{1, 4, 8, 16, 64} {
+		x := make([]byte, m<<20)
+		b.Run(fmt.Sprint(m, "M"), func(b *testing.B) {
+			b.SetBytes(int64(m << 20))
+			for i := 0; i < b.N; i++ {
+				MemclrBytes(x)
+			}
+		})
+	}
+}
+
+func BenchmarkGoMemclr(b *testing.B) {
+	benchmarkSizes(b, []int{5, 16, 64, 256}, func(b *testing.B, n int) {
+		x := make([]byte, n)
+		for i := 0; i < b.N; i++ {
+			for j := range x {
+				x[j] = 0
+			}
+		}
+	})
+}
+
+func BenchmarkClearFat8(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [8 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat12(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [12 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat16(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [16 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat24(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [24 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat32(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [32 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat40(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [40 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat48(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [48 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat56(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [56 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat64(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [64 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat128(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [128 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat256(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [256 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat512(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [512 / 4]uint32
+		_ = x
+	}
+}
+func BenchmarkClearFat1024(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		var x [1024 / 4]uint32
+		_ = x
+	}
+}
+
+func BenchmarkCopyFat8(b *testing.B) {
+	var x [8 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+func BenchmarkCopyFat12(b *testing.B) {
+	var x [12 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+func BenchmarkCopyFat16(b *testing.B) {
+	var x [16 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+func BenchmarkCopyFat24(b *testing.B) {
+	var x [24 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+func BenchmarkCopyFat32(b *testing.B) {
+	var x [32 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+func BenchmarkCopyFat64(b *testing.B) {
+	var x [64 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+func BenchmarkCopyFat128(b *testing.B) {
+	var x [128 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+func BenchmarkCopyFat256(b *testing.B) {
+	var x [256 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+func BenchmarkCopyFat512(b *testing.B) {
+	var x [512 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+func BenchmarkCopyFat1024(b *testing.B) {
+	var x [1024 / 4]uint32
+	for i := 0; i < b.N; i++ {
+		y := x
+		_ = y
+	}
+}
+
+func BenchmarkIssue18740(b *testing.B) {
+	// This tests that memmove uses one 4-byte load/store to move 4 bytes.
+	// It used to do 2 2-byte load/stores, which leads to a pipeline stall
+	// when we try to read the result with one 4-byte load.
+	var buf [4]byte
+	for j := 0; j < b.N; j++ {
+		s := uint32(0)
+		for i := 0; i < 4096; i += 4 {
+			copy(buf[:], g[i:])
+			s += binary.LittleEndian.Uint32(buf[:])
+		}
+		sink = uint64(s)
+	}
+}
+
+// TODO: 2 byte and 8 byte benchmarks also.
+
+var g [4096]byte
diff --git a/libgo/go/runtime/mfinal.go b/libgo/go/runtime/mfinal.go
new file mode 100644
index 0000000..f0123b3
--- /dev/null
+++ b/libgo/go/runtime/mfinal.go
@@ -0,0 +1,424 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: finalizers and block profiling.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// finblock is allocated from non-GC'd memory, so any heap pointers
+// must be specially handled.
+//
+//go:notinheap
+type finblock struct {
+	alllink *finblock
+	next    *finblock
+	cnt     uint32
+	_       int32
+	fin     [(_FinBlockSize - 2*sys.PtrSize - 2*4) / unsafe.Sizeof(finalizer{})]finalizer
+}
+
+var finlock mutex  // protects the following variables
+var fing *g        // goroutine that runs finalizers
+var finq *finblock // list of finalizers that are to be executed
+var finc *finblock // cache of free blocks
+var finptrmask [_FinBlockSize / sys.PtrSize / 8]byte
+var fingwait bool
+var fingwake bool
+var allfin *finblock // list of all blocks
+
+// NOTE: Layout known to queuefinalizer.
+type finalizer struct {
+	fn  *funcval       // function to call (may be a heap pointer)
+	arg unsafe.Pointer // ptr to object (may be a heap pointer)
+	ft  *functype      // type of fn (unlikely, but may be a heap pointer)
+	ot  *ptrtype       // type of ptr to object (may be a heap pointer)
+}
+
+func queuefinalizer(p unsafe.Pointer, fn *funcval, ft *functype, ot *ptrtype) {
+	lock(&finlock)
+	if finq == nil || finq.cnt == uint32(len(finq.fin)) {
+		if finc == nil {
+			finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys))
+			finc.alllink = allfin
+			allfin = finc
+			if finptrmask[0] == 0 {
+				// Build pointer mask for Finalizer array in block.
+				// We allocate values of type finalizer in
+				// finblock values. Since these values are
+				// allocated by persistentalloc, they require
+				// special scanning during GC. finptrmask is a
+				// pointer mask to use while scanning.
+				// Since all the values in finalizer are
+				// pointers, just turn all bits on.
+				for i := range finptrmask {
+					finptrmask[i] = 0xff
+				}
+			}
+		}
+		block := finc
+		finc = block.next
+		block.next = finq
+		finq = block
+	}
+	f := &finq.fin[finq.cnt]
+	atomic.Xadd(&finq.cnt, +1) // Sync with markroots
+	f.fn = fn
+	f.ft = ft
+	f.ot = ot
+	f.arg = p
+	fingwake = true
+	unlock(&finlock)
+}
+
+//go:nowritebarrier
+func iterate_finq(callback func(*funcval, unsafe.Pointer, *functype, *ptrtype)) {
+	for fb := allfin; fb != nil; fb = fb.alllink {
+		for i := uint32(0); i < fb.cnt; i++ {
+			f := &fb.fin[i]
+			callback(f.fn, f.arg, f.ft, f.ot)
+		}
+	}
+}
+
+func wakefing() *g {
+	var res *g
+	lock(&finlock)
+	if fingwait && fingwake {
+		fingwait = false
+		fingwake = false
+		res = fing
+	}
+	unlock(&finlock)
+	return res
+}
+
+var (
+	fingCreate  uint32
+	fingRunning bool
+)
+
+func createfing() {
+	// start the finalizer goroutine exactly once
+	if fingCreate == 0 && atomic.Cas(&fingCreate, 0, 1) {
+		go runfinq()
+	}
+}
+
+// This is the goroutine that runs all of the finalizers
+func runfinq() {
+	var (
+		ef   eface
+		ifac iface
+	)
+
+	for {
+		lock(&finlock)
+		fb := finq
+		finq = nil
+		if fb == nil {
+			gp := getg()
+			fing = gp
+			fingwait = true
+			goparkunlock(&finlock, "finalizer wait", traceEvGoBlock, 1)
+			continue
+		}
+		unlock(&finlock)
+		for fb != nil {
+			for i := fb.cnt; i > 0; i-- {
+				f := &fb.fin[i-1]
+
+				if f.ft == nil {
+					throw("missing type in runfinq")
+				}
+				fint := f.ft.in[0]
+				var param unsafe.Pointer
+				switch fint.kind & kindMask {
+				case kindPtr:
+					// direct use of pointer
+					param = unsafe.Pointer(&f.arg)
+				case kindInterface:
+					ityp := (*interfacetype)(unsafe.Pointer(fint))
+					if len(ityp.methods) == 0 {
+						// set up with empty interface
+						ef._type = &f.ot.typ
+						ef.data = f.arg
+						param = unsafe.Pointer(&ef)
+					} else {
+						// convert to interface with methods
+						// this conversion is guaranteed to succeed - we checked in SetFinalizer
+						ifac.tab = getitab(fint, &f.ot.typ, true)
+						ifac.data = f.arg
+						param = unsafe.Pointer(&ifac)
+					}
+				default:
+					throw("bad kind in runfinq")
+				}
+				fingRunning = true
+				reflectcall(f.ft, f.fn, false, false, &param, nil)
+				fingRunning = false
+
+				// Drop finalizer queue heap references
+				// before hiding them from markroot.
+				// This also ensures these will be
+				// clear if we reuse the finalizer.
+				f.fn = nil
+				f.arg = nil
+				f.ot = nil
+				atomic.Store(&fb.cnt, i-1)
+			}
+			next := fb.next
+			lock(&finlock)
+			fb.next = finc
+			finc = fb
+			unlock(&finlock)
+			fb = next
+		}
+	}
+}
+
+// SetFinalizer sets the finalizer associated with obj to the provided
+// finalizer function. When the garbage collector finds an unreachable block
+// with an associated finalizer, it clears the association and runs
+// finalizer(obj) in a separate goroutine. This makes obj reachable again,
+// but now without an associated finalizer. Assuming that SetFinalizer
+// is not called again, the next time the garbage collector sees
+// that obj is unreachable, it will free obj.
+//
+// SetFinalizer(obj, nil) clears any finalizer associated with obj.
+//
+// The argument obj must be a pointer to an object allocated by calling
+// new, by taking the address of a composite literal, or by taking the
+// address of a local variable.
+// The argument finalizer must be a function that takes a single argument
+// to which obj's type can be assigned, and can have arbitrary ignored return
+// values. If either of these is not true, SetFinalizer may abort the
+// program.
+//
+// Finalizers are run in dependency order: if A points at B, both have
+// finalizers, and they are otherwise unreachable, only the finalizer
+// for A runs; once A is freed, the finalizer for B can run.
+// If a cyclic structure includes a block with a finalizer, that
+// cycle is not guaranteed to be garbage collected and the finalizer
+// is not guaranteed to run, because there is no ordering that
+// respects the dependencies.
+//
+// The finalizer for obj is scheduled to run at some arbitrary time after
+// obj becomes unreachable.
+// There is no guarantee that finalizers will run before a program exits,
+// so typically they are useful only for releasing non-memory resources
+// associated with an object during a long-running program.
+// For example, an os.File object could use a finalizer to close the
+// associated operating system file descriptor when a program discards
+// an os.File without calling Close, but it would be a mistake
+// to depend on a finalizer to flush an in-memory I/O buffer such as a
+// bufio.Writer, because the buffer would not be flushed at program exit.
+//
+// It is not guaranteed that a finalizer will run if the size of *obj is
+// zero bytes.
+//
+// It is not guaranteed that a finalizer will run for objects allocated
+// in initializers for package-level variables. Such objects may be
+// linker-allocated, not heap-allocated.
+//
+// A finalizer may run as soon as an object becomes unreachable.
+// In order to use finalizers correctly, the program must ensure that
+// the object is reachable until it is no longer required.
+// Objects stored in global variables, or that can be found by tracing
+// pointers from a global variable, are reachable. For other objects,
+// pass the object to a call of the KeepAlive function to mark the
+// last point in the function where the object must be reachable.
+//
+// For example, if p points to a struct that contains a file descriptor d,
+// and p has a finalizer that closes that file descriptor, and if the last
+// use of p in a function is a call to syscall.Write(p.d, buf, size), then
+// p may be unreachable as soon as the program enters syscall.Write. The
+// finalizer may run at that moment, closing p.d, causing syscall.Write
+// to fail because it is writing to a closed file descriptor (or, worse,
+// to an entirely different file descriptor opened by a different goroutine).
+// To avoid this problem, call runtime.KeepAlive(p) after the call to
+// syscall.Write.
+//
+// A single goroutine runs all finalizers for a program, sequentially.
+// If a finalizer must run for a long time, it should do so by starting
+// a new goroutine.
+func SetFinalizer(obj interface{}, finalizer interface{}) {
+	if debug.sbrk != 0 {
+		// debug.sbrk never frees memory, so no finalizers run
+		// (and we don't have the data structures to record them).
+		return
+	}
+	e := efaceOf(&obj)
+	etyp := e._type
+	if etyp == nil {
+		throw("runtime.SetFinalizer: first argument is nil")
+	}
+	if etyp.kind&kindMask != kindPtr {
+		throw("runtime.SetFinalizer: first argument is " + *etyp.string + ", not pointer")
+	}
+	ot := (*ptrtype)(unsafe.Pointer(etyp))
+	if ot.elem == nil {
+		throw("nil elem type!")
+	}
+
+	// find the containing object
+	_, base, _ := findObject(e.data)
+
+	if base == nil {
+		// 0-length objects are okay.
+		if e.data == unsafe.Pointer(&zerobase) {
+			return
+		}
+
+		// Global initializers might be linker-allocated.
+		//	var Foo = &Object{}
+		//	func main() {
+		//		runtime.SetFinalizer(Foo, nil)
+		//	}
+		// The relevant segments are: noptrdata, data, bss, noptrbss.
+		// We cannot assume they are in any order or even contiguous,
+		// due to external linking.
+		//
+		// For gccgo we have no reliable way to detect them,
+		// so we just return.
+		return
+	}
+
+	if e.data != base {
+		// As an implementation detail we allow to set finalizers for an inner byte
+		// of an object if it could come from tiny alloc (see mallocgc for details).
+		if ot.elem == nil || ot.elem.kind&kindNoPointers == 0 || ot.elem.size >= maxTinySize {
+			throw("runtime.SetFinalizer: pointer not at beginning of allocated block")
+		}
+	}
+
+	f := efaceOf(&finalizer)
+	ftyp := f._type
+	if ftyp == nil {
+		// switch to system stack and remove finalizer
+		systemstack(func() {
+			removefinalizer(e.data)
+		})
+		return
+	}
+
+	if ftyp.kind&kindMask != kindFunc {
+		throw("runtime.SetFinalizer: second argument is " + *ftyp.string + ", not a function")
+	}
+	ft := (*functype)(unsafe.Pointer(ftyp))
+	if ft.dotdotdot {
+		throw("runtime.SetFinalizer: cannot pass " + *etyp.string + " to finalizer " + *ftyp.string + " because dotdotdot")
+	}
+	if len(ft.in) != 1 {
+		throw("runtime.SetFinalizer: cannot pass " + *etyp.string + " to finalizer " + *ftyp.string)
+	}
+	fint := ft.in[0]
+	switch {
+	case fint == etyp:
+		// ok - same type
+		goto okarg
+	case fint.kind&kindMask == kindPtr:
+		if (fint.uncommontype == nil || etyp.uncommontype == nil) && (*ptrtype)(unsafe.Pointer(fint)).elem == ot.elem {
+			// ok - not same type, but both pointers,
+			// one or the other is unnamed, and same element type, so assignable.
+			goto okarg
+		}
+	case fint.kind&kindMask == kindInterface:
+		ityp := (*interfacetype)(unsafe.Pointer(fint))
+		if len(ityp.methods) == 0 {
+			// ok - satisfies empty interface
+			goto okarg
+		}
+		if getitab(fint, etyp, true) == nil {
+			goto okarg
+		}
+	}
+	throw("runtime.SetFinalizer: cannot pass " + *etyp.string + " to finalizer " + *ftyp.string)
+okarg:
+	// make sure we have a finalizer goroutine
+	createfing()
+
+	systemstack(func() {
+		data := f.data
+		if !isDirectIface(ftyp) {
+			data = *(*unsafe.Pointer)(data)
+		}
+		if !addfinalizer(e.data, (*funcval)(data), ft, ot) {
+			throw("runtime.SetFinalizer: finalizer already set")
+		}
+	})
+}
+
+// Look up pointer v in heap. Return the span containing the object,
+// the start of the object, and the size of the object. If the object
+// does not exist, return nil, nil, 0.
+func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) {
+	c := gomcache()
+	c.local_nlookup++
+	if sys.PtrSize == 4 && c.local_nlookup >= 1<<30 {
+		// purge cache stats to prevent overflow
+		lock(&mheap_.lock)
+		purgecachedstats(c)
+		unlock(&mheap_.lock)
+	}
+
+	// find span
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
+	if uintptr(v) < arena_start || uintptr(v) >= arena_used {
+		return
+	}
+	p := uintptr(v) >> pageShift
+	q := p - arena_start>>pageShift
+	s = mheap_.spans[q]
+	if s == nil {
+		return
+	}
+	x = unsafe.Pointer(s.base())
+
+	if uintptr(v) < uintptr(x) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != mSpanInUse {
+		s = nil
+		x = nil
+		return
+	}
+
+	n = s.elemsize
+	if s.sizeclass != 0 {
+		x = add(x, (uintptr(v)-uintptr(x))/n*n)
+	}
+	return
+}
+
+// Mark KeepAlive as noinline so that the current compiler will ensure
+// that the argument is alive at the point of the function call.
+// If it were inlined, it would disappear, and there would be nothing
+// keeping the argument alive. Perhaps a future compiler will recognize
+// runtime.KeepAlive specially and do something more efficient.
+//go:noinline
+
+// KeepAlive marks its argument as currently reachable.
+// This ensures that the object is not freed, and its finalizer is not run,
+// before the point in the program where KeepAlive is called.
+//
+// A very simplified example showing where KeepAlive is required:
+// 	type File struct { d int }
+// 	d, err := syscall.Open("/file/path", syscall.O_RDONLY, 0)
+// 	// ... do something if err != nil ...
+// 	p := &File{d}
+// 	runtime.SetFinalizer(p, func(p *File) { syscall.Close(p.d) })
+// 	var buf [10]byte
+// 	n, err := syscall.Read(p.d, buf[:])
+// 	// Ensure p is not finalized until Read returns.
+// 	runtime.KeepAlive(p)
+// 	// No more uses of p after this point.
+//
+// Without the KeepAlive call, the finalizer could run at the start of
+// syscall.Read, closing the file descriptor before syscall.Read makes
+// the actual system call.
+func KeepAlive(interface{}) {}
diff --git a/libgo/go/runtime/mfixalloc.go b/libgo/go/runtime/mfixalloc.go
new file mode 100644
index 0000000..fe4b0fc
--- /dev/null
+++ b/libgo/go/runtime/mfixalloc.go
@@ -0,0 +1,99 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fixed-size object allocator. Returned memory is not zeroed.
+//
+// See malloc.go for overview.
+
+package runtime
+
+import "unsafe"
+
+// FixAlloc is a simple free-list allocator for fixed size objects.
+// Malloc uses a FixAlloc wrapped around sysAlloc to manages its
+// MCache and MSpan objects.
+//
+// Memory returned by fixalloc.alloc is zeroed by default, but the
+// caller may take responsibility for zeroing allocations by setting
+// the zero flag to false. This is only safe if the memory never
+// contains heap pointers.
+//
+// The caller is responsible for locking around FixAlloc calls.
+// Callers can keep state in the object but the first word is
+// smashed by freeing and reallocating.
+//
+// Consider marking fixalloc'd types go:notinheap.
+type fixalloc struct {
+	size   uintptr
+	first  func(arg, p unsafe.Pointer) // called first time p is returned
+	arg    unsafe.Pointer
+	list   *mlink
+	chunk  unsafe.Pointer
+	nchunk uint32
+	inuse  uintptr // in-use bytes now
+	stat   *uint64
+	zero   bool // zero allocations
+}
+
+// A generic linked list of blocks.  (Typically the block is bigger than sizeof(MLink).)
+// Since assignments to mlink.next will result in a write barrier being performed
+// this cannot be used by some of the internal GC structures. For example when
+// the sweeper is placing an unmarked object on the free list it does not want the
+// write barrier to be called since that could result in the object being reachable.
+//
+//go:notinheap
+type mlink struct {
+	next *mlink
+}
+
+// Initialize f to allocate objects of the given size,
+// using the allocator to obtain chunks of memory.
+func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *uint64) {
+	f.size = size
+	f.first = first
+	f.arg = arg
+	f.list = nil
+	f.chunk = nil
+	f.nchunk = 0
+	f.inuse = 0
+	f.stat = stat
+	f.zero = true
+}
+
+func (f *fixalloc) alloc() unsafe.Pointer {
+	if f.size == 0 {
+		print("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n")
+		throw("runtime: internal error")
+	}
+
+	if f.list != nil {
+		v := unsafe.Pointer(f.list)
+		f.list = f.list.next
+		f.inuse += f.size
+		if f.zero {
+			memclrNoHeapPointers(v, f.size)
+		}
+		return v
+	}
+	if uintptr(f.nchunk) < f.size {
+		f.chunk = persistentalloc(_FixAllocChunk, 0, f.stat)
+		f.nchunk = _FixAllocChunk
+	}
+
+	v := f.chunk
+	if f.first != nil {
+		f.first(f.arg, v)
+	}
+	f.chunk = add(f.chunk, f.size)
+	f.nchunk -= uint32(f.size)
+	f.inuse += f.size
+	return v
+}
+
+func (f *fixalloc) free(p unsafe.Pointer) {
+	f.inuse -= f.size
+	v := (*mlink)(p)
+	v.next = f.list
+	f.list = v
+}
diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go
new file mode 100644
index 0000000..abec9d3
--- /dev/null
+++ b/libgo/go/runtime/mgc.go
@@ -0,0 +1,1963 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector (GC).
+//
+// The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple
+// GC thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
+// non-generational and non-compacting. Allocation is done using size segregated per P allocation
+// areas to minimize fragmentation while eliminating locks in the common case.
+//
+// The algorithm decomposes into several steps.
+// This is a high level description of the algorithm being used. For an overview of GC a good
+// place to start is Richard Jones' gchandbook.org.
+//
+// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
+// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
+// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
+// 966-975.
+// For journal quality proofs that these steps are complete, correct, and terminate see
+// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
+// Concurrency and Computation: Practice and Experience 15(3-5), 2003.
+//
+// 1. GC performs sweep termination.
+//
+//    a. Stop the world. This causes all Ps to reach a GC safe-point.
+//
+//    b. Sweep any unswept spans. There will only be unswept spans if
+//    this GC cycle was forced before the expected time.
+//
+// 2. GC performs the "mark 1" sub-phase. In this sub-phase, Ps are
+// allowed to locally cache parts of the work queue.
+//
+//    a. Prepare for the mark phase by setting gcphase to _GCmark
+//    (from _GCoff), enabling the write barrier, enabling mutator
+//    assists, and enqueueing root mark jobs. No objects may be
+//    scanned until all Ps have enabled the write barrier, which is
+//    accomplished using STW.
+//
+//    b. Start the world. From this point, GC work is done by mark
+//    workers started by the scheduler and by assists performed as
+//    part of allocation. The write barrier shades both the
+//    overwritten pointer and the new pointer value for any pointer
+//    writes (see mbarrier.go for details). Newly allocated objects
+//    are immediately marked black.
+//
+//    c. GC performs root marking jobs. This includes scanning all
+//    stacks, shading all globals, and shading any heap pointers in
+//    off-heap runtime data structures. Scanning a stack stops a
+//    goroutine, shades any pointers found on its stack, and then
+//    resumes the goroutine.
+//
+//    d. GC drains the work queue of grey objects, scanning each grey
+//    object to black and shading all pointers found in the object
+//    (which in turn may add those pointers to the work queue).
+//
+// 3. Once the global work queue is empty (but local work queue caches
+// may still contain work), GC performs the "mark 2" sub-phase.
+//
+//    a. GC stops all workers, disables local work queue caches,
+//    flushes each P's local work queue cache to the global work queue
+//    cache, and reenables workers.
+//
+//    b. GC again drains the work queue, as in 2d above.
+//
+// 4. Once the work queue is empty, GC performs mark termination.
+//
+//    a. Stop the world.
+//
+//    b. Set gcphase to _GCmarktermination, and disable workers and
+//    assists.
+//
+//    c. Drain any remaining work from the work queue (typically there
+//    will be none).
+//
+//    d. Perform other housekeeping like flushing mcaches.
+//
+// 5. GC performs the sweep phase.
+//
+//    a. Prepare for the sweep phase by setting gcphase to _GCoff,
+//    setting up sweep state and disabling the write barrier.
+//
+//    b. Start the world. From this point on, newly allocated objects
+//    are white, and allocating sweeps spans before use if necessary.
+//
+//    c. GC does concurrent sweeping in the background and in response
+//    to allocation. See description below.
+//
+// 6. When sufficient allocation has taken place, replay the sequence
+// starting with 1 above. See discussion of GC rate below.
+
+// Concurrent sweep.
+//
+// The sweep phase proceeds concurrently with normal program execution.
+// The heap is swept span-by-span both lazily (when a goroutine needs another span)
+// and concurrently in a background goroutine (this helps programs that are not CPU bound).
+// At the end of STW mark termination all spans are marked as "needs sweeping".
+//
+// The background sweeper goroutine simply sweeps spans one-by-one.
+//
+// To avoid requesting more OS memory while there are unswept spans, when a
+// goroutine needs another span, it first attempts to reclaim that much memory
+// by sweeping. When a goroutine needs to allocate a new small-object span, it
+// sweeps small-object spans for the same object size until it frees at least
+// one object. When a goroutine needs to allocate large-object span from heap,
+// it sweeps spans until it frees at least that many pages into heap. There is
+// one case where this may not suffice: if a goroutine sweeps and frees two
+// nonadjacent one-page spans to the heap, it will allocate a new two-page
+// span, but there can still be other one-page unswept spans which could be
+// combined into a two-page span.
+//
+// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
+// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
+// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
+// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
+// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
+// The finalizer goroutine is kicked off only when all spans are swept.
+// When the next GC starts, it sweeps all not-yet-swept spans (if any).
+
+// GC rate.
+// Next GC is after we've allocated an extra amount of memory proportional to
+// the amount already in use. The proportion is controlled by GOGC environment variable
+// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
+// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
+// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
+// (and also the amount of extra memory used).
+
+// Oblets
+//
+// In order to prevent long pauses while scanning large objects and to
+// improve parallelism, the garbage collector breaks up scan jobs for
+// objects larger than maxObletBytes into "oblets" of at most
+// maxObletBytes. When scanning encounters the beginning of a large
+// object, it scans only the first oblet and enqueues the remaining
+// oblets as new scan jobs.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const (
+	_DebugGC         = 0
+	_ConcurrentSweep = true
+	_FinBlockSize    = 4 * 1024
+
+	// sweepMinHeapDistance is a lower bound on the heap distance
+	// (in bytes) reserved for concurrent sweeping between GC
+	// cycles. This will be scaled by gcpercent/100.
+	sweepMinHeapDistance = 1024 * 1024
+)
+
+// heapminimum is the minimum heap size at which to trigger GC.
+// For small heaps, this overrides the usual GOGC*live set rule.
+//
+// When there is a very small live set but a lot of allocation, simply
+// collecting when the heap reaches GOGC*live results in many GC
+// cycles and high total per-GC overhead. This minimum amortizes this
+// per-GC overhead while keeping the heap reasonably small.
+//
+// During initialization this is set to 4MB*GOGC/100. In the case of
+// GOGC==0, this will set heapminimum to 0, resulting in constant
+// collection even when the heap size is small, which is useful for
+// debugging.
+var heapminimum uint64 = defaultHeapMinimum
+
+// defaultHeapMinimum is the value of heapminimum for GOGC==100.
+const defaultHeapMinimum = 4 << 20
+
+// Initialized from $GOGC.  GOGC=off means no GC.
+var gcpercent int32
+
+func gcinit() {
+	if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
+		throw("size of Workbuf is suboptimal")
+	}
+
+	_ = setGCPercent(readgogc())
+	memstats.gc_trigger = heapminimum
+	// Compute the goal heap size based on the trigger:
+	//   trigger = marked * (1 + triggerRatio)
+	//   marked = trigger / (1 + triggerRatio)
+	//   goal = marked * (1 + GOGC/100)
+	//        = trigger / (1 + triggerRatio) * (1 + GOGC/100)
+	memstats.next_gc = uint64(float64(memstats.gc_trigger) / (1 + gcController.triggerRatio) * (1 + float64(gcpercent)/100))
+	if gcpercent < 0 {
+		memstats.next_gc = ^uint64(0)
+	}
+	work.startSema = 1
+	work.markDoneSema = 1
+}
+
+func readgogc() int32 {
+	p := gogetenv("GOGC")
+	if p == "off" {
+		return -1
+	}
+	if n, ok := atoi32(p); ok {
+		return n
+	}
+	return 100
+}
+
+// gcenable is called after the bulk of the runtime initialization,
+// just before we're about to start letting user code run.
+// It kicks off the background sweeper goroutine and enables GC.
+func gcenable() {
+	c := make(chan int, 1)
+	go bgsweep(c)
+	<-c
+	memstats.enablegc = true // now that runtime is initialized, GC is okay
+}
+
+//go:linkname setGCPercent runtime_debug.setGCPercent
+func setGCPercent(in int32) (out int32) {
+	lock(&mheap_.lock)
+	out = gcpercent
+	if in < 0 {
+		in = -1
+	}
+	gcpercent = in
+	heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
+	if gcController.triggerRatio > float64(gcpercent)/100 {
+		gcController.triggerRatio = float64(gcpercent) / 100
+	}
+	// This is either in gcinit or followed by a STW GC, both of
+	// which will reset other stats like memstats.gc_trigger and
+	// memstats.next_gc to appropriate values.
+	unlock(&mheap_.lock)
+	return out
+}
+
+// Garbage collector phase.
+// Indicates to write barrier and synchronization task to perform.
+var gcphase uint32
+
+// The compiler knows about this variable.
+// If you change it, you must change the compiler too.
+var writeBarrier struct {
+	enabled bool    // compiler emits a check of this before calling write barrier
+	pad     [3]byte // compiler uses 32-bit load for "enabled" field
+	needed  bool    // whether we need a write barrier for current GC phase
+	cgo     bool    // whether we need a write barrier for a cgo check
+	alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
+}
+
+// gcBlackenEnabled is 1 if mutator assists and background mark
+// workers are allowed to blacken objects. This must only be set when
+// gcphase == _GCmark.
+var gcBlackenEnabled uint32
+
+// gcBlackenPromptly indicates that optimizations that may
+// hide work from the global work queue should be disabled.
+//
+// If gcBlackenPromptly is true, per-P gcWork caches should
+// be flushed immediately and new objects should be allocated black.
+//
+// There is a tension between allocating objects white and
+// allocating them black. If white and the objects die before being
+// marked they can be collected during this GC cycle. On the other
+// hand allocating them black will reduce _GCmarktermination latency
+// since more work is done in the mark phase. This tension is resolved
+// by allocating white until the mark phase is approaching its end and
+// then allocating black for the remainder of the mark phase.
+var gcBlackenPromptly bool
+
+const (
+	_GCoff             = iota // GC not running; sweeping in background, write barrier disabled
+	_GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
+	_GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
+)
+
+//go:nosplit
+func setGCPhase(x uint32) {
+	atomic.Store(&gcphase, x)
+	writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
+	writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
+}
+
+// gcMarkWorkerMode represents the mode that a concurrent mark worker
+// should operate in.
+//
+// Concurrent marking happens through four different mechanisms. One
+// is mutator assists, which happen in response to allocations and are
+// not scheduled. The other three are variations in the per-P mark
+// workers and are distinguished by gcMarkWorkerMode.
+type gcMarkWorkerMode int
+
+const (
+	// gcMarkWorkerDedicatedMode indicates that the P of a mark
+	// worker is dedicated to running that mark worker. The mark
+	// worker should run without preemption.
+	gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota
+
+	// gcMarkWorkerFractionalMode indicates that a P is currently
+	// running the "fractional" mark worker. The fractional worker
+	// is necessary when GOMAXPROCS*gcGoalUtilization is not an
+	// integer. The fractional worker should run until it is
+	// preempted and will be scheduled to pick up the fractional
+	// part of GOMAXPROCS*gcGoalUtilization.
+	gcMarkWorkerFractionalMode
+
+	// gcMarkWorkerIdleMode indicates that a P is running the mark
+	// worker because it has nothing else to do. The idle worker
+	// should run until it is preempted and account its time
+	// against gcController.idleMarkTime.
+	gcMarkWorkerIdleMode
+)
+
+// gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes
+// to use in execution traces.
+var gcMarkWorkerModeStrings = [...]string{
+	"GC (dedicated)",
+	"GC (fractional)",
+	"GC (idle)",
+}
+
+// gcController implements the GC pacing controller that determines
+// when to trigger concurrent garbage collection and how much marking
+// work to do in mutator assists and background marking.
+//
+// It uses a feedback control algorithm to adjust the memstats.gc_trigger
+// trigger based on the heap growth and GC CPU utilization each cycle.
+// This algorithm optimizes for heap growth to match GOGC and for CPU
+// utilization between assist and background marking to be 25% of
+// GOMAXPROCS. The high-level design of this algorithm is documented
+// at https://golang.org/s/go15gcpacing.
+var gcController = gcControllerState{
+	// Initial trigger ratio guess.
+	triggerRatio: 7 / 8.0,
+}
+
+type gcControllerState struct {
+	// scanWork is the total scan work performed this cycle. This
+	// is updated atomically during the cycle. Updates occur in
+	// bounded batches, since it is both written and read
+	// throughout the cycle. At the end of the cycle, this is how
+	// much of the retained heap is scannable.
+	//
+	// Currently this is the bytes of heap scanned. For most uses,
+	// this is an opaque unit of work, but for estimation the
+	// definition is important.
+	scanWork int64
+
+	// bgScanCredit is the scan work credit accumulated by the
+	// concurrent background scan. This credit is accumulated by
+	// the background scan and stolen by mutator assists. This is
+	// updated atomically. Updates occur in bounded batches, since
+	// it is both written and read throughout the cycle.
+	bgScanCredit int64
+
+	// assistTime is the nanoseconds spent in mutator assists
+	// during this cycle. This is updated atomically. Updates
+	// occur in bounded batches, since it is both written and read
+	// throughout the cycle.
+	assistTime int64
+
+	// dedicatedMarkTime is the nanoseconds spent in dedicated
+	// mark workers during this cycle. This is updated atomically
+	// at the end of the concurrent mark phase.
+	dedicatedMarkTime int64
+
+	// fractionalMarkTime is the nanoseconds spent in the
+	// fractional mark worker during this cycle. This is updated
+	// atomically throughout the cycle and will be up-to-date if
+	// the fractional mark worker is not currently running.
+	fractionalMarkTime int64
+
+	// idleMarkTime is the nanoseconds spent in idle marking
+	// during this cycle. This is updated atomically throughout
+	// the cycle.
+	idleMarkTime int64
+
+	// markStartTime is the absolute start time in nanoseconds
+	// that assists and background mark workers started.
+	markStartTime int64
+
+	// dedicatedMarkWorkersNeeded is the number of dedicated mark
+	// workers that need to be started. This is computed at the
+	// beginning of each cycle and decremented atomically as
+	// dedicated mark workers get started.
+	dedicatedMarkWorkersNeeded int64
+
+	// assistWorkPerByte is the ratio of scan work to allocated
+	// bytes that should be performed by mutator assists. This is
+	// computed at the beginning of each cycle and updated every
+	// time heap_scan is updated.
+	assistWorkPerByte float64
+
+	// assistBytesPerWork is 1/assistWorkPerByte.
+	assistBytesPerWork float64
+
+	// fractionalUtilizationGoal is the fraction of wall clock
+	// time that should be spent in the fractional mark worker.
+	// For example, if the overall mark utilization goal is 25%
+	// and GOMAXPROCS is 6, one P will be a dedicated mark worker
+	// and this will be set to 0.5 so that 50% of the time some P
+	// is in a fractional mark worker. This is computed at the
+	// beginning of each cycle.
+	fractionalUtilizationGoal float64
+
+	// triggerRatio is the heap growth ratio at which the garbage
+	// collection cycle should start. E.g., if this is 0.6, then
+	// GC should start when the live heap has reached 1.6 times
+	// the heap size marked by the previous cycle. This should be
+	// ≤ GOGC/100 so the trigger heap size is less than the goal
+	// heap size. This is updated at the end of of each cycle.
+	triggerRatio float64
+
+	_ [sys.CacheLineSize]byte
+
+	// fractionalMarkWorkersNeeded is the number of fractional
+	// mark workers that need to be started. This is either 0 or
+	// 1. This is potentially updated atomically at every
+	// scheduling point (hence it gets its own cache line).
+	fractionalMarkWorkersNeeded int64
+
+	_ [sys.CacheLineSize]byte
+}
+
+// startCycle resets the GC controller's state and computes estimates
+// for a new GC cycle. The caller must hold worldsema.
+func (c *gcControllerState) startCycle() {
+	c.scanWork = 0
+	c.bgScanCredit = 0
+	c.assistTime = 0
+	c.dedicatedMarkTime = 0
+	c.fractionalMarkTime = 0
+	c.idleMarkTime = 0
+
+	// If this is the first GC cycle or we're operating on a very
+	// small heap, fake heap_marked so it looks like gc_trigger is
+	// the appropriate growth from heap_marked, even though the
+	// real heap_marked may not have a meaningful value (on the
+	// first cycle) or may be much smaller (resulting in a large
+	// error response).
+	if memstats.gc_trigger <= heapminimum {
+		memstats.heap_marked = uint64(float64(memstats.gc_trigger) / (1 + c.triggerRatio))
+	}
+
+	// Re-compute the heap goal for this cycle in case something
+	// changed. This is the same calculation we use elsewhere.
+	memstats.next_gc = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
+	if gcpercent < 0 {
+		memstats.next_gc = ^uint64(0)
+	}
+
+	// Ensure that the heap goal is at least a little larger than
+	// the current live heap size. This may not be the case if GC
+	// start is delayed or if the allocation that pushed heap_live
+	// over gc_trigger is large or if the trigger is really close to
+	// GOGC. Assist is proportional to this distance, so enforce a
+	// minimum distance, even if it means going over the GOGC goal
+	// by a tiny bit.
+	if memstats.next_gc < memstats.heap_live+1024*1024 {
+		memstats.next_gc = memstats.heap_live + 1024*1024
+	}
+
+	// Compute the total mark utilization goal and divide it among
+	// dedicated and fractional workers.
+	totalUtilizationGoal := float64(gomaxprocs) * gcGoalUtilization
+	c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal)
+	c.fractionalUtilizationGoal = totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)
+	if c.fractionalUtilizationGoal > 0 {
+		c.fractionalMarkWorkersNeeded = 1
+	} else {
+		c.fractionalMarkWorkersNeeded = 0
+	}
+
+	// Clear per-P state
+	for _, p := range &allp {
+		if p == nil {
+			break
+		}
+		p.gcAssistTime = 0
+	}
+
+	// Compute initial values for controls that are updated
+	// throughout the cycle.
+	c.revise()
+
+	if debug.gcpacertrace > 0 {
+		print("pacer: assist ratio=", c.assistWorkPerByte,
+			" (scan ", memstats.heap_scan>>20, " MB in ",
+			work.initialHeapLive>>20, "->",
+			memstats.next_gc>>20, " MB)",
+			" workers=", c.dedicatedMarkWorkersNeeded,
+			"+", c.fractionalMarkWorkersNeeded, "\n")
+	}
+}
+
+// revise updates the assist ratio during the GC cycle to account for
+// improved estimates. This should be called either under STW or
+// whenever memstats.heap_scan or memstats.heap_live is updated (with
+// mheap_.lock held).
+//
+// It should only be called when gcBlackenEnabled != 0 (because this
+// is when assists are enabled and the necessary statistics are
+// available).
+//
+// TODO: Consider removing the periodic controller update altogether.
+// Since we switched to allocating black, in theory we shouldn't have
+// to change the assist ratio. However, this is still a useful hook
+// that we've found many uses for when experimenting.
+func (c *gcControllerState) revise() {
+	// Compute the expected scan work remaining.
+	//
+	// Note that we currently count allocations during GC as both
+	// scannable heap (heap_scan) and scan work completed
+	// (scanWork), so this difference won't be changed by
+	// allocations during GC.
+	//
+	// This particular estimate is a strict upper bound on the
+	// possible remaining scan work for the current heap.
+	// You might consider dividing this by 2 (or by
+	// (100+GOGC)/100) to counter this over-estimation, but
+	// benchmarks show that this has almost no effect on mean
+	// mutator utilization, heap size, or assist time and it
+	// introduces the danger of under-estimating and letting the
+	// mutator outpace the garbage collector.
+	scanWorkExpected := int64(memstats.heap_scan) - c.scanWork
+	if scanWorkExpected < 1000 {
+		// We set a somewhat arbitrary lower bound on
+		// remaining scan work since if we aim a little high,
+		// we can miss by a little.
+		//
+		// We *do* need to enforce that this is at least 1,
+		// since marking is racy and double-scanning objects
+		// may legitimately make the expected scan work
+		// negative.
+		scanWorkExpected = 1000
+	}
+
+	// Compute the heap distance remaining.
+	heapDistance := int64(memstats.next_gc) - int64(memstats.heap_live)
+	if heapDistance <= 0 {
+		// This shouldn't happen, but if it does, avoid
+		// dividing by zero or setting the assist negative.
+		heapDistance = 1
+	}
+
+	// Compute the mutator assist ratio so by the time the mutator
+	// allocates the remaining heap bytes up to next_gc, it will
+	// have done (or stolen) the remaining amount of scan work.
+	c.assistWorkPerByte = float64(scanWorkExpected) / float64(heapDistance)
+	c.assistBytesPerWork = float64(heapDistance) / float64(scanWorkExpected)
+}
+
+// endCycle updates the GC controller state at the end of the
+// concurrent part of the GC cycle.
+func (c *gcControllerState) endCycle() {
+	h_t := c.triggerRatio // For debugging
+
+	// Proportional response gain for the trigger controller. Must
+	// be in [0, 1]. Lower values smooth out transient effects but
+	// take longer to respond to phase changes. Higher values
+	// react to phase changes quickly, but are more affected by
+	// transient changes. Values near 1 may be unstable.
+	const triggerGain = 0.5
+
+	// Compute next cycle trigger ratio. First, this computes the
+	// "error" for this cycle; that is, how far off the trigger
+	// was from what it should have been, accounting for both heap
+	// growth and GC CPU utilization. We compute the actual heap
+	// growth during this cycle and scale that by how far off from
+	// the goal CPU utilization we were (to estimate the heap
+	// growth if we had the desired CPU utilization). The
+	// difference between this estimate and the GOGC-based goal
+	// heap growth is the error.
+	goalGrowthRatio := float64(gcpercent) / 100
+	actualGrowthRatio := float64(memstats.heap_live)/float64(memstats.heap_marked) - 1
+	assistDuration := nanotime() - c.markStartTime
+
+	// Assume background mark hit its utilization goal.
+	utilization := gcGoalUtilization
+	// Add assist utilization; avoid divide by zero.
+	if assistDuration > 0 {
+		utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
+	}
+
+	triggerError := goalGrowthRatio - c.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-c.triggerRatio)
+
+	// Finally, we adjust the trigger for next time by this error,
+	// damped by the proportional gain.
+	c.triggerRatio += triggerGain * triggerError
+	if c.triggerRatio < 0 {
+		// This can happen if the mutator is allocating very
+		// quickly or the GC is scanning very slowly.
+		c.triggerRatio = 0
+	} else if c.triggerRatio > goalGrowthRatio*0.95 {
+		// Ensure there's always a little margin so that the
+		// mutator assist ratio isn't infinity.
+		c.triggerRatio = goalGrowthRatio * 0.95
+	}
+
+	if debug.gcpacertrace > 0 {
+		// Print controller state in terms of the design
+		// document.
+		H_m_prev := memstats.heap_marked
+		H_T := memstats.gc_trigger
+		h_a := actualGrowthRatio
+		H_a := memstats.heap_live
+		h_g := goalGrowthRatio
+		H_g := int64(float64(H_m_prev) * (1 + h_g))
+		u_a := utilization
+		u_g := gcGoalUtilization
+		W_a := c.scanWork
+		print("pacer: H_m_prev=", H_m_prev,
+			" h_t=", h_t, " H_T=", H_T,
+			" h_a=", h_a, " H_a=", H_a,
+			" h_g=", h_g, " H_g=", H_g,
+			" u_a=", u_a, " u_g=", u_g,
+			" W_a=", W_a,
+			" goalΔ=", goalGrowthRatio-h_t,
+			" actualΔ=", h_a-h_t,
+			" u_a/u_g=", u_a/u_g,
+			"\n")
+	}
+}
+
+// enlistWorker encourages another dedicated mark worker to start on
+// another P if there are spare worker slots. It is used by putfull
+// when more work is made available.
+//
+//go:nowritebarrier
+func (c *gcControllerState) enlistWorker() {
+	// If there are idle Ps, wake one so it will run an idle worker.
+	// NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
+	//
+	//	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
+	//		wakep()
+	//		return
+	//	}
+
+	// There are no idle Ps. If we need more dedicated workers,
+	// try to preempt a running P so it will switch to a worker.
+	if c.dedicatedMarkWorkersNeeded <= 0 {
+		return
+	}
+	// Pick a random other P to preempt.
+	if gomaxprocs <= 1 {
+		return
+	}
+	gp := getg()
+	if gp == nil || gp.m == nil || gp.m.p == 0 {
+		return
+	}
+	myID := gp.m.p.ptr().id
+	for tries := 0; tries < 5; tries++ {
+		id := int32(fastrand() % uint32(gomaxprocs-1))
+		if id >= myID {
+			id++
+		}
+		p := allp[id]
+		if p.status != _Prunning {
+			continue
+		}
+		if preemptone(p) {
+			return
+		}
+	}
+}
+
+// findRunnableGCWorker returns the background mark worker for _p_ if it
+// should be run. This must only be called when gcBlackenEnabled != 0.
+func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
+	if gcBlackenEnabled == 0 {
+		throw("gcControllerState.findRunnable: blackening not enabled")
+	}
+	if _p_.gcBgMarkWorker == 0 {
+		// The mark worker associated with this P is blocked
+		// performing a mark transition. We can't run it
+		// because it may be on some other run or wait queue.
+		return nil
+	}
+
+	if !gcMarkWorkAvailable(_p_) {
+		// No work to be done right now. This can happen at
+		// the end of the mark phase when there are still
+		// assists tapering off. Don't bother running a worker
+		// now because it'll just return immediately.
+		return nil
+	}
+
+	decIfPositive := func(ptr *int64) bool {
+		if *ptr > 0 {
+			if atomic.Xaddint64(ptr, -1) >= 0 {
+				return true
+			}
+			// We lost a race
+			atomic.Xaddint64(ptr, +1)
+		}
+		return false
+	}
+
+	if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
+		// This P is now dedicated to marking until the end of
+		// the concurrent mark phase.
+		_p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
+		// TODO(austin): This P isn't going to run anything
+		// else for a while, so kick everything out of its run
+		// queue.
+	} else {
+		if !decIfPositive(&c.fractionalMarkWorkersNeeded) {
+			// No more workers are need right now.
+			return nil
+		}
+
+		// This P has picked the token for the fractional worker.
+		// Is the GC currently under or at the utilization goal?
+		// If so, do more work.
+		//
+		// We used to check whether doing one time slice of work
+		// would remain under the utilization goal, but that has the
+		// effect of delaying work until the mutator has run for
+		// enough time slices to pay for the work. During those time
+		// slices, write barriers are enabled, so the mutator is running slower.
+		// Now instead we do the work whenever we're under or at the
+		// utilization work and pay for it by letting the mutator run later.
+		// This doesn't change the overall utilization averages, but it
+		// front loads the GC work so that the GC finishes earlier and
+		// write barriers can be turned off sooner, effectively giving
+		// the mutator a faster machine.
+		//
+		// The old, slower behavior can be restored by setting
+		//	gcForcePreemptNS = forcePreemptNS.
+		const gcForcePreemptNS = 0
+
+		// TODO(austin): We could fast path this and basically
+		// eliminate contention on c.fractionalMarkWorkersNeeded by
+		// precomputing the minimum time at which it's worth
+		// next scheduling the fractional worker. Then Ps
+		// don't have to fight in the window where we've
+		// passed that deadline and no one has started the
+		// worker yet.
+		//
+		// TODO(austin): Shorter preemption interval for mark
+		// worker to improve fairness and give this
+		// finer-grained control over schedule?
+		now := nanotime() - gcController.markStartTime
+		then := now + gcForcePreemptNS
+		timeUsed := c.fractionalMarkTime + gcForcePreemptNS
+		if then > 0 && float64(timeUsed)/float64(then) > c.fractionalUtilizationGoal {
+			// Nope, we'd overshoot the utilization goal
+			atomic.Xaddint64(&c.fractionalMarkWorkersNeeded, +1)
+			return nil
+		}
+		_p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
+	}
+
+	// Run the background mark worker
+	gp := _p_.gcBgMarkWorker.ptr()
+	casgstatus(gp, _Gwaiting, _Grunnable)
+	if trace.enabled {
+		traceGoUnpark(gp, 0)
+	}
+	return gp
+}
+
+// gcGoalUtilization is the goal CPU utilization for background
+// marking as a fraction of GOMAXPROCS.
+const gcGoalUtilization = 0.25
+
+// gcCreditSlack is the amount of scan work credit that can can
+// accumulate locally before updating gcController.scanWork and,
+// optionally, gcController.bgScanCredit. Lower values give a more
+// accurate assist ratio and make it more likely that assists will
+// successfully steal background credit. Higher values reduce memory
+// contention.
+const gcCreditSlack = 2000
+
+// gcAssistTimeSlack is the nanoseconds of mutator assist time that
+// can accumulate on a P before updating gcController.assistTime.
+const gcAssistTimeSlack = 5000
+
+// gcOverAssistWork determines how many extra units of scan work a GC
+// assist does when an assist happens. This amortizes the cost of an
+// assist by pre-paying for this many bytes of future allocations.
+const gcOverAssistWork = 64 << 10
+
+var work struct {
+	full  uint64                   // lock-free list of full blocks workbuf
+	empty uint64                   // lock-free list of empty blocks workbuf
+	pad0  [sys.CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+
+	// bytesMarked is the number of bytes marked this cycle. This
+	// includes bytes blackened in scanned objects, noscan objects
+	// that go straight to black, and permagrey objects scanned by
+	// markroot during the concurrent scan phase. This is updated
+	// atomically during the cycle. Updates may be batched
+	// arbitrarily, since the value is only read at the end of the
+	// cycle.
+	//
+	// Because of benign races during marking, this number may not
+	// be the exact number of marked bytes, but it should be very
+	// close.
+	//
+	// Put this field here because it needs 64-bit atomic access
+	// (and thus 8-byte alignment even on 32-bit architectures).
+	bytesMarked uint64
+
+	markrootNext uint32 // next markroot job
+	markrootJobs uint32 // number of markroot jobs
+
+	nproc   uint32
+	tstart  int64
+	nwait   uint32
+	ndone   uint32
+	alldone note
+
+	// helperDrainBlock indicates that GC mark termination helpers
+	// should pass gcDrainBlock to gcDrain to block in the
+	// getfull() barrier. Otherwise, they should pass gcDrainNoBlock.
+	//
+	// TODO: This is a temporary fallback to support
+	// debug.gcrescanstacks > 0 and to work around some known
+	// races. Remove this when we remove the debug option and fix
+	// the races.
+	helperDrainBlock bool
+
+	// Number of roots of various root types. Set by gcMarkRootPrepare.
+	nFlushCacheRoots                                  int
+	nDataRoots, nSpanRoots, nStackRoots, nRescanRoots int
+
+	// markrootDone indicates that roots have been marked at least
+	// once during the current GC cycle. This is checked by root
+	// marking operations that have to happen only during the
+	// first root marking pass, whether that's during the
+	// concurrent mark phase in current GC or mark termination in
+	// STW GC.
+	markrootDone bool
+
+	// Each type of GC state transition is protected by a lock.
+	// Since multiple threads can simultaneously detect the state
+	// transition condition, any thread that detects a transition
+	// condition must acquire the appropriate transition lock,
+	// re-check the transition condition and return if it no
+	// longer holds or perform the transition if it does.
+	// Likewise, any transition must invalidate the transition
+	// condition before releasing the lock. This ensures that each
+	// transition is performed by exactly one thread and threads
+	// that need the transition to happen block until it has
+	// happened.
+	//
+	// startSema protects the transition from "off" to mark or
+	// mark termination.
+	startSema uint32
+	// markDoneSema protects transitions from mark 1 to mark 2 and
+	// from mark 2 to mark termination.
+	markDoneSema uint32
+
+	bgMarkReady note   // signal background mark worker has started
+	bgMarkDone  uint32 // cas to 1 when at a background mark completion point
+	// Background mark completion signaling
+
+	// mode is the concurrency mode of the current GC cycle.
+	mode gcMode
+
+	// totaltime is the CPU nanoseconds spent in GC since the
+	// program started if debug.gctrace > 0.
+	totaltime int64
+
+	// initialHeapLive is the value of memstats.heap_live at the
+	// beginning of this GC cycle.
+	initialHeapLive uint64
+
+	// assistQueue is a queue of assists that are blocked because
+	// there was neither enough credit to steal or enough work to
+	// do.
+	assistQueue struct {
+		lock       mutex
+		head, tail guintptr
+	}
+
+	// rescan is a list of G's that need to be rescanned during
+	// mark termination. A G adds itself to this list when it
+	// first invalidates its stack scan.
+	rescan struct {
+		lock mutex
+		list []guintptr
+	}
+
+	// Timing/utilization stats for this cycle.
+	stwprocs, maxprocs                 int32
+	tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
+
+	pauseNS    int64 // total STW time this cycle
+	pauseStart int64 // nanotime() of last STW
+
+	// debug.gctrace heap sizes for this cycle.
+	heap0, heap1, heap2, heapGoal uint64
+}
+
+// GC runs a garbage collection and blocks the caller until the
+// garbage collection is complete. It may also block the entire
+// program.
+func GC() {
+	gcStart(gcForceBlockMode, false)
+}
+
+// gcMode indicates how concurrent a GC cycle should be.
+type gcMode int
+
+const (
+	gcBackgroundMode gcMode = iota // concurrent GC and sweep
+	gcForceMode                    // stop-the-world GC now, concurrent sweep
+	gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
+)
+
+// gcShouldStart returns true if the exit condition for the _GCoff
+// phase has been met. The exit condition should be tested when
+// allocating.
+//
+// If forceTrigger is true, it ignores the current heap size, but
+// checks all other conditions. In general this should be false.
+func gcShouldStart(forceTrigger bool) bool {
+	return gcphase == _GCoff && (forceTrigger || memstats.heap_live >= memstats.gc_trigger) && memstats.enablegc && panicking == 0 && gcpercent >= 0
+}
+
+// gcStart transitions the GC from _GCoff to _GCmark (if mode ==
+// gcBackgroundMode) or _GCmarktermination (if mode !=
+// gcBackgroundMode) by performing sweep termination and GC
+// initialization.
+//
+// This may return without performing this transition in some cases,
+// such as when called on a system stack or with locks held.
+func gcStart(mode gcMode, forceTrigger bool) {
+	// Since this is called from malloc and malloc is called in
+	// the guts of a number of libraries that might be holding
+	// locks, don't attempt to start GC in non-preemptible or
+	// potentially unstable situations.
+	mp := acquirem()
+	if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
+		releasem(mp)
+		return
+	}
+	releasem(mp)
+	mp = nil
+
+	// Pick up the remaining unswept/not being swept spans concurrently
+	//
+	// This shouldn't happen if we're being invoked in background
+	// mode since proportional sweep should have just finished
+	// sweeping everything, but rounding errors, etc, may leave a
+	// few spans unswept. In forced mode, this is necessary since
+	// GC can be forced at any point in the sweeping cycle.
+	//
+	// We check the transition condition continuously here in case
+	// this G gets delayed in to the next GC cycle.
+	for (mode != gcBackgroundMode || gcShouldStart(forceTrigger)) && gosweepone() != ^uintptr(0) {
+		sweep.nbgsweep++
+	}
+
+	// Perform GC initialization and the sweep termination
+	// transition.
+	//
+	// If this is a forced GC, don't acquire the transition lock
+	// or re-check the transition condition because we
+	// specifically *don't* want to share the transition with
+	// another thread.
+	useStartSema := mode == gcBackgroundMode
+	if useStartSema {
+		semacquire(&work.startSema, 0)
+		// Re-check transition condition under transition lock.
+		if !gcShouldStart(forceTrigger) {
+			semrelease(&work.startSema)
+			return
+		}
+	}
+
+	// For stats, check if this GC was forced by the user.
+	forced := mode != gcBackgroundMode
+
+	// In gcstoptheworld debug mode, upgrade the mode accordingly.
+	// We do this after re-checking the transition condition so
+	// that multiple goroutines that detect the heap trigger don't
+	// start multiple STW GCs.
+	if mode == gcBackgroundMode {
+		if debug.gcstoptheworld == 1 {
+			mode = gcForceMode
+		} else if debug.gcstoptheworld == 2 {
+			mode = gcForceBlockMode
+		}
+	}
+
+	// Ok, we're doing it!  Stop everybody else
+	semacquire(&worldsema, 0)
+
+	if trace.enabled {
+		traceGCStart()
+	}
+
+	if mode == gcBackgroundMode {
+		gcBgMarkStartWorkers()
+	}
+
+	gcResetMarkState()
+
+	now := nanotime()
+	work.stwprocs, work.maxprocs = gcprocs(), gomaxprocs
+	work.tSweepTerm = now
+	work.heap0 = memstats.heap_live
+	work.pauseNS = 0
+	work.mode = mode
+
+	work.pauseStart = now
+	systemstack(stopTheWorldWithSema)
+	// Finish sweep before we start concurrent scan.
+	systemstack(func() {
+		finishsweep_m()
+	})
+	// clearpools before we start the GC. If we wait they memory will not be
+	// reclaimed until the next GC cycle.
+	clearpools()
+
+	if mode == gcBackgroundMode { // Do as much work concurrently as possible
+		gcController.startCycle()
+		work.heapGoal = memstats.next_gc
+
+		// Enter concurrent mark phase and enable
+		// write barriers.
+		//
+		// Because the world is stopped, all Ps will
+		// observe that write barriers are enabled by
+		// the time we start the world and begin
+		// scanning.
+		//
+		// It's necessary to enable write barriers
+		// during the scan phase for several reasons:
+		//
+		// They must be enabled for writes to higher
+		// stack frames before we scan stacks and
+		// install stack barriers because this is how
+		// we track writes to inactive stack frames.
+		// (Alternatively, we could not install stack
+		// barriers over frame boundaries with
+		// up-pointers).
+		//
+		// They must be enabled before assists are
+		// enabled because they must be enabled before
+		// any non-leaf heap objects are marked. Since
+		// allocations are blocked until assists can
+		// happen, we want enable assists as early as
+		// possible.
+		setGCPhase(_GCmark)
+
+		gcBgMarkPrepare() // Must happen before assist enable.
+		gcMarkRootPrepare()
+
+		// Mark all active tinyalloc blocks. Since we're
+		// allocating from these, they need to be black like
+		// other allocations. The alternative is to blacken
+		// the tiny block on every allocation from it, which
+		// would slow down the tiny allocator.
+		gcMarkTinyAllocs()
+
+		// At this point all Ps have enabled the write
+		// barrier, thus maintaining the no white to
+		// black invariant. Enable mutator assists to
+		// put back-pressure on fast allocating
+		// mutators.
+		atomic.Store(&gcBlackenEnabled, 1)
+
+		// Assists and workers can start the moment we start
+		// the world.
+		gcController.markStartTime = now
+
+		// Concurrent mark.
+		systemstack(startTheWorldWithSema)
+		now = nanotime()
+		work.pauseNS += now - work.pauseStart
+		work.tMark = now
+	} else {
+		t := nanotime()
+		work.tMark, work.tMarkTerm = t, t
+		work.heapGoal = work.heap0
+
+		if forced {
+			memstats.numforcedgc++
+		}
+
+		// Perform mark termination. This will restart the world.
+		gcMarkTermination()
+	}
+
+	if useStartSema {
+		semrelease(&work.startSema)
+	}
+}
+
+// gcMarkDone transitions the GC from mark 1 to mark 2 and from mark 2
+// to mark termination.
+//
+// This should be called when all mark work has been drained. In mark
+// 1, this includes all root marking jobs, global work buffers, and
+// active work buffers in assists and background workers; however,
+// work may still be cached in per-P work buffers. In mark 2, per-P
+// caches are disabled.
+//
+// The calling context must be preemptible.
+//
+// Note that it is explicitly okay to have write barriers in this
+// function because completion of concurrent mark is best-effort
+// anyway. Any work created by write barriers here will be cleaned up
+// by mark termination.
+func gcMarkDone() {
+top:
+	semacquire(&work.markDoneSema, 0)
+
+	// Re-check transition condition under transition lock.
+	if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
+		semrelease(&work.markDoneSema)
+		return
+	}
+
+	// Disallow starting new workers so that any remaining workers
+	// in the current mark phase will drain out.
+	//
+	// TODO(austin): Should dedicated workers keep an eye on this
+	// and exit gcDrain promptly?
+	atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff)
+	atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff)
+
+	if !gcBlackenPromptly {
+		// Transition from mark 1 to mark 2.
+		//
+		// The global work list is empty, but there can still be work
+		// sitting in the per-P work caches.
+		// Flush and disable work caches.
+
+		// Disallow caching workbufs and indicate that we're in mark 2.
+		gcBlackenPromptly = true
+
+		// Prevent completion of mark 2 until we've flushed
+		// cached workbufs.
+		atomic.Xadd(&work.nwait, -1)
+
+		// GC is set up for mark 2. Let Gs blocked on the
+		// transition lock go while we flush caches.
+		semrelease(&work.markDoneSema)
+
+		systemstack(func() {
+			// Flush all currently cached workbufs and
+			// ensure all Ps see gcBlackenPromptly. This
+			// also blocks until any remaining mark 1
+			// workers have exited their loop so we can
+			// start new mark 2 workers.
+			forEachP(func(_p_ *p) {
+				_p_.gcw.dispose()
+			})
+		})
+
+		// Check that roots are marked. We should be able to
+		// do this before the forEachP, but based on issue
+		// #16083 there may be a (harmless) race where we can
+		// enter mark 2 while some workers are still scanning
+		// stacks. The forEachP ensures these scans are done.
+		//
+		// TODO(austin): Figure out the race and fix this
+		// properly.
+		gcMarkRootCheck()
+
+		// Now we can start up mark 2 workers.
+		atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff)
+		atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff)
+
+		incnwait := atomic.Xadd(&work.nwait, +1)
+		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
+			// This loop will make progress because
+			// gcBlackenPromptly is now true, so it won't
+			// take this same "if" branch.
+			goto top
+		}
+	} else {
+		// Transition to mark termination.
+		now := nanotime()
+		work.tMarkTerm = now
+		work.pauseStart = now
+		getg().m.preemptoff = "gcing"
+		systemstack(stopTheWorldWithSema)
+		// The gcphase is _GCmark, it will transition to _GCmarktermination
+		// below. The important thing is that the wb remains active until
+		// all marking is complete. This includes writes made by the GC.
+
+		// Record that one root marking pass has completed.
+		work.markrootDone = true
+
+		// Disable assists and background workers. We must do
+		// this before waking blocked assists.
+		atomic.Store(&gcBlackenEnabled, 0)
+
+		// Wake all blocked assists. These will run when we
+		// start the world again.
+		gcWakeAllAssists()
+
+		// Likewise, release the transition lock. Blocked
+		// workers and assists will run when we start the
+		// world again.
+		semrelease(&work.markDoneSema)
+
+		// endCycle depends on all gcWork cache stats being
+		// flushed. This is ensured by mark 2.
+		gcController.endCycle()
+
+		// Perform mark termination. This will restart the world.
+		gcMarkTermination()
+	}
+}
+
+func gcMarkTermination() {
+	// World is stopped.
+	// Start marktermination which includes enabling the write barrier.
+	atomic.Store(&gcBlackenEnabled, 0)
+	gcBlackenPromptly = false
+	setGCPhase(_GCmarktermination)
+
+	work.heap1 = memstats.heap_live
+	startTime := nanotime()
+
+	mp := acquirem()
+	mp.preemptoff = "gcing"
+	_g_ := getg()
+	_g_.m.traceback = 2
+	gp := _g_.m.curg
+	casgstatus(gp, _Grunning, _Gwaiting)
+	gp.waitreason = "garbage collection"
+
+	// Run gc on the g0 stack. We do this so that the g stack
+	// we're currently running on will no longer change. Cuts
+	// the root set down a bit (g0 stacks are not scanned, and
+	// we don't need to scan gc's internal state).  We also
+	// need to switch to g0 so we can shrink the stack.
+	systemstack(func() {
+		gcMark(startTime)
+		// Must return immediately.
+		// The outer function's stack may have moved
+		// during gcMark (it shrinks stacks, including the
+		// outer function's stack), so we must not refer
+		// to any of its variables. Return back to the
+		// non-system stack to pick up the new addresses
+		// before continuing.
+	})
+
+	systemstack(func() {
+		work.heap2 = work.bytesMarked
+		if debug.gccheckmark > 0 {
+			// Run a full stop-the-world mark using checkmark bits,
+			// to check that we didn't forget to mark anything during
+			// the concurrent mark process.
+			gcResetMarkState()
+			initCheckmarks()
+			gcMark(startTime)
+			clearCheckmarks()
+		}
+
+		// marking is complete so we can turn the write barrier off
+		setGCPhase(_GCoff)
+		gcSweep(work.mode)
+
+		if debug.gctrace > 1 {
+			startTime = nanotime()
+			// The g stacks have been scanned so
+			// they have gcscanvalid==true and gcworkdone==true.
+			// Reset these so that all stacks will be rescanned.
+			gcResetMarkState()
+			finishsweep_m()
+
+			// Still in STW but gcphase is _GCoff, reset to _GCmarktermination
+			// At this point all objects will be found during the gcMark which
+			// does a complete STW mark and object scan.
+			setGCPhase(_GCmarktermination)
+			gcMark(startTime)
+			setGCPhase(_GCoff) // marking is done, turn off wb.
+			gcSweep(work.mode)
+		}
+	})
+
+	_g_.m.traceback = 0
+	casgstatus(gp, _Gwaiting, _Grunning)
+
+	if trace.enabled {
+		traceGCDone()
+	}
+
+	// all done
+	mp.preemptoff = ""
+
+	if gcphase != _GCoff {
+		throw("gc done but gcphase != _GCoff")
+	}
+
+	// Update timing memstats
+	now, unixNow := nanotime(), unixnanotime()
+	work.pauseNS += now - work.pauseStart
+	work.tEnd = now
+	atomic.Store64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
+	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
+	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
+	memstats.pause_total_ns += uint64(work.pauseNS)
+
+	// Update work.totaltime.
+	sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
+	// We report idle marking time below, but omit it from the
+	// overall utilization here since it's "free".
+	markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
+	markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
+	cycleCpu := sweepTermCpu + markCpu + markTermCpu
+	work.totaltime += cycleCpu
+
+	// Compute overall GC CPU utilization.
+	totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
+	memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
+
+	memstats.numgc++
+
+	// Reset sweep state.
+	sweep.nbgsweep = 0
+	sweep.npausesweep = 0
+
+	systemstack(startTheWorldWithSema)
+
+	// Update heap profile stats if gcSweep didn't do it. This is
+	// relatively expensive, so we don't want to do it while the
+	// world is stopped, but it needs to happen ASAP after
+	// starting the world to prevent too many allocations from the
+	// next cycle leaking in. It must happen before releasing
+	// worldsema since there are applications that do a
+	// runtime.GC() to update the heap profile and then
+	// immediately collect the profile.
+	if _ConcurrentSweep && work.mode != gcForceBlockMode {
+		mProf_GC()
+	}
+
+	// Print gctrace before dropping worldsema. As soon as we drop
+	// worldsema another cycle could start and smash the stats
+	// we're trying to print.
+	if debug.gctrace > 0 {
+		util := int(memstats.gc_cpu_fraction * 100)
+
+		var sbuf [24]byte
+		printlock()
+		print("gc ", memstats.numgc,
+			" @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
+			util, "%: ")
+		prev := work.tSweepTerm
+		for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
+			if i != 0 {
+				print("+")
+			}
+			print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
+			prev = ns
+		}
+		print(" ms clock, ")
+		for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
+			if i == 2 || i == 3 {
+				// Separate mark time components with /.
+				print("/")
+			} else if i != 0 {
+				print("+")
+			}
+			print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
+		}
+		print(" ms cpu, ",
+			work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
+			work.heapGoal>>20, " MB goal, ",
+			work.maxprocs, " P")
+		if work.mode != gcBackgroundMode {
+			print(" (forced)")
+		}
+		print("\n")
+		printunlock()
+	}
+
+	semrelease(&worldsema)
+	// Careful: another GC cycle may start now.
+
+	releasem(mp)
+	mp = nil
+
+	// now that gc is done, kick off finalizer thread if needed
+	if !concurrentSweep {
+		// give the queued finalizers, if any, a chance to run
+		Gosched()
+	}
+}
+
+// gcBgMarkStartWorkers prepares background mark worker goroutines.
+// These goroutines will not run until the mark phase, but they must
+// be started while the work is not stopped and from a regular G
+// stack. The caller must hold worldsema.
+func gcBgMarkStartWorkers() {
+	// Background marking is performed by per-P G's. Ensure that
+	// each P has a background GC G.
+	for _, p := range &allp {
+		if p == nil || p.status == _Pdead {
+			break
+		}
+		if p.gcBgMarkWorker == 0 {
+			go gcBgMarkWorker(p)
+			notetsleepg(&work.bgMarkReady, -1)
+			noteclear(&work.bgMarkReady)
+		}
+	}
+}
+
+// gcBgMarkPrepare sets up state for background marking.
+// Mutator assists must not yet be enabled.
+func gcBgMarkPrepare() {
+	// Background marking will stop when the work queues are empty
+	// and there are no more workers (note that, since this is
+	// concurrent, this may be a transient state, but mark
+	// termination will clean it up). Between background workers
+	// and assists, we don't really know how many workers there
+	// will be, so we pretend to have an arbitrarily large number
+	// of workers, almost all of which are "waiting". While a
+	// worker is working it decrements nwait. If nproc == nwait,
+	// there are no workers.
+	work.nproc = ^uint32(0)
+	work.nwait = ^uint32(0)
+}
+
+func gcBgMarkWorker(_p_ *p) {
+	gp := getg()
+
+	type parkInfo struct {
+		m      muintptr // Release this m on park.
+		attach puintptr // If non-nil, attach to this p on park.
+	}
+	// We pass park to a gopark unlock function, so it can't be on
+	// the stack (see gopark). Prevent deadlock from recursively
+	// starting GC by disabling preemption.
+	gp.m.preemptoff = "GC worker init"
+	park := new(parkInfo)
+	gp.m.preemptoff = ""
+
+	park.m.set(acquirem())
+	park.attach.set(_p_)
+	// Inform gcBgMarkStartWorkers that this worker is ready.
+	// After this point, the background mark worker is scheduled
+	// cooperatively by gcController.findRunnable. Hence, it must
+	// never be preempted, as this would put it into _Grunnable
+	// and put it on a run queue. Instead, when the preempt flag
+	// is set, this puts itself into _Gwaiting to be woken up by
+	// gcController.findRunnable at the appropriate time.
+	notewakeup(&work.bgMarkReady)
+
+	for {
+		// Go to sleep until woken by gcController.findRunnable.
+		// We can't releasem yet since even the call to gopark
+		// may be preempted.
+		gopark(func(g *g, parkp unsafe.Pointer) bool {
+			park := (*parkInfo)(parkp)
+
+			// The worker G is no longer running, so it's
+			// now safe to allow preemption.
+			releasem(park.m.ptr())
+
+			// If the worker isn't attached to its P,
+			// attach now. During initialization and after
+			// a phase change, the worker may have been
+			// running on a different P. As soon as we
+			// attach, the owner P may schedule the
+			// worker, so this must be done after the G is
+			// stopped.
+			if park.attach != 0 {
+				p := park.attach.ptr()
+				park.attach.set(nil)
+				// cas the worker because we may be
+				// racing with a new worker starting
+				// on this P.
+				if !p.gcBgMarkWorker.cas(0, guintptr(unsafe.Pointer(g))) {
+					// The P got a new worker.
+					// Exit this worker.
+					return false
+				}
+			}
+			return true
+		}, unsafe.Pointer(park), "GC worker (idle)", traceEvGoBlock, 0)
+
+		// Loop until the P dies and disassociates this
+		// worker (the P may later be reused, in which case
+		// it will get a new worker) or we failed to associate.
+		if _p_.gcBgMarkWorker.ptr() != gp {
+			break
+		}
+
+		// Disable preemption so we can use the gcw. If the
+		// scheduler wants to preempt us, we'll stop draining,
+		// dispose the gcw, and then preempt.
+		park.m.set(acquirem())
+
+		if gcBlackenEnabled == 0 {
+			throw("gcBgMarkWorker: blackening not enabled")
+		}
+
+		startTime := nanotime()
+
+		decnwait := atomic.Xadd(&work.nwait, -1)
+		if decnwait == work.nproc {
+			println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+			throw("work.nwait was > work.nproc")
+		}
+
+		systemstack(func() {
+			// Mark our goroutine preemptible so its stack
+			// can be scanned. This lets two mark workers
+			// scan each other (otherwise, they would
+			// deadlock). We must not modify anything on
+			// the G stack. However, stack shrinking is
+			// disabled for mark workers, so it is safe to
+			// read from the G stack.
+			casgstatus(gp, _Grunning, _Gwaiting)
+			switch _p_.gcMarkWorkerMode {
+			default:
+				throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
+			case gcMarkWorkerDedicatedMode:
+				gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit)
+			case gcMarkWorkerFractionalMode:
+				gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
+			case gcMarkWorkerIdleMode:
+				gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+			}
+			casgstatus(gp, _Gwaiting, _Grunning)
+		})
+
+		// If we are nearing the end of mark, dispose
+		// of the cache promptly. We must do this
+		// before signaling that we're no longer
+		// working so that other workers can't observe
+		// no workers and no work while we have this
+		// cached, and before we compute done.
+		if gcBlackenPromptly {
+			_p_.gcw.dispose()
+		}
+
+		// Account for time.
+		duration := nanotime() - startTime
+		switch _p_.gcMarkWorkerMode {
+		case gcMarkWorkerDedicatedMode:
+			atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
+			atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
+		case gcMarkWorkerFractionalMode:
+			atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
+			atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
+		case gcMarkWorkerIdleMode:
+			atomic.Xaddint64(&gcController.idleMarkTime, duration)
+		}
+
+		// Was this the last worker and did we run out
+		// of work?
+		incnwait := atomic.Xadd(&work.nwait, +1)
+		if incnwait > work.nproc {
+			println("runtime: p.gcMarkWorkerMode=", _p_.gcMarkWorkerMode,
+				"work.nwait=", incnwait, "work.nproc=", work.nproc)
+			throw("work.nwait > work.nproc")
+		}
+
+		// If this worker reached a background mark completion
+		// point, signal the main GC goroutine.
+		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
+			// Make this G preemptible and disassociate it
+			// as the worker for this P so
+			// findRunnableGCWorker doesn't try to
+			// schedule it.
+			_p_.gcBgMarkWorker.set(nil)
+			releasem(park.m.ptr())
+
+			gcMarkDone()
+
+			// Disable preemption and prepare to reattach
+			// to the P.
+			//
+			// We may be running on a different P at this
+			// point, so we can't reattach until this G is
+			// parked.
+			park.m.set(acquirem())
+			park.attach.set(_p_)
+		}
+	}
+}
+
+// gcMarkWorkAvailable returns true if executing a mark worker
+// on p is potentially useful. p may be nil, in which case it only
+// checks the global sources of work.
+func gcMarkWorkAvailable(p *p) bool {
+	if p != nil && !p.gcw.empty() {
+		return true
+	}
+	if atomic.Load64(&work.full) != 0 {
+		return true // global work available
+	}
+	if work.markrootNext < work.markrootJobs {
+		return true // root scan work available
+	}
+	return false
+}
+
+// gcMark runs the mark (or, for concurrent GC, mark termination)
+// All gcWork caches must be empty.
+// STW is in effect at this point.
+//TODO go:nowritebarrier
+func gcMark(start_time int64) {
+	if debug.allocfreetrace > 0 {
+		tracegc()
+	}
+
+	if gcphase != _GCmarktermination {
+		throw("in gcMark expecting to see gcphase as _GCmarktermination")
+	}
+	work.tstart = start_time
+
+	// Queue root marking jobs.
+	gcMarkRootPrepare()
+
+	work.nwait = 0
+	work.ndone = 0
+	work.nproc = uint32(gcprocs())
+
+	if debug.gcrescanstacks == 0 && work.full == 0 && work.nDataRoots+work.nSpanRoots+work.nStackRoots+work.nRescanRoots == 0 {
+		// There's no work on the work queue and no root jobs
+		// that can produce work, so don't bother entering the
+		// getfull() barrier.
+		//
+		// With the hybrid barrier enabled, this will be the
+		// situation the vast majority of the time after
+		// concurrent mark. However, we still need a fallback
+		// for STW GC and because there are some known races
+		// that occasionally leave work around for mark
+		// termination.
+		//
+		// We're still hedging our bets here: if we do
+		// accidentally produce some work, we'll still process
+		// it, just not necessarily in parallel.
+		//
+		// TODO(austin): When we eliminate
+		// debug.gcrescanstacks: fix the races, and remove
+		// work draining from mark termination so we don't
+		// need the fallback path.
+		work.helperDrainBlock = false
+	} else {
+		work.helperDrainBlock = true
+	}
+
+	if trace.enabled {
+		traceGCScanStart()
+	}
+
+	if work.nproc > 1 {
+		noteclear(&work.alldone)
+		helpgc(int32(work.nproc))
+	}
+
+	gchelperstart()
+
+	gcw := &getg().m.p.ptr().gcw
+	if work.helperDrainBlock {
+		gcDrain(gcw, gcDrainBlock)
+	} else {
+		gcDrain(gcw, gcDrainNoBlock)
+	}
+	gcw.dispose()
+
+	if debug.gccheckmark > 0 {
+		// This is expensive when there's a large number of
+		// Gs, so only do it if checkmark is also enabled.
+		gcMarkRootCheck()
+	}
+	if work.full != 0 {
+		throw("work.full != 0")
+	}
+
+	if work.nproc > 1 {
+		notesleep(&work.alldone)
+	}
+
+	// Record that at least one root marking pass has completed.
+	work.markrootDone = true
+
+	// Double-check that all gcWork caches are empty. This should
+	// be ensured by mark 2 before we enter mark termination.
+	for i := 0; i < int(gomaxprocs); i++ {
+		gcw := &allp[i].gcw
+		if !gcw.empty() {
+			throw("P has cached GC work at end of mark termination")
+		}
+		if gcw.scanWork != 0 || gcw.bytesMarked != 0 {
+			throw("P has unflushed stats at end of mark termination")
+		}
+	}
+
+	if trace.enabled {
+		traceGCScanDone()
+	}
+
+	cachestats()
+
+	// Update the marked heap stat.
+	memstats.heap_marked = work.bytesMarked
+
+	// Trigger the next GC cycle when the allocated heap has grown
+	// by triggerRatio over the marked heap size. Assume that
+	// we're in steady state, so the marked heap size is the
+	// same now as it was at the beginning of the GC cycle.
+	memstats.gc_trigger = uint64(float64(memstats.heap_marked) * (1 + gcController.triggerRatio))
+	if memstats.gc_trigger < heapminimum {
+		memstats.gc_trigger = heapminimum
+	}
+	if int64(memstats.gc_trigger) < 0 {
+		print("next_gc=", memstats.next_gc, " bytesMarked=", work.bytesMarked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "\n")
+		throw("gc_trigger underflow")
+	}
+
+	// Update other GC heap size stats. This must happen after
+	// cachestats (which flushes local statistics to these) and
+	// flushallmcaches (which modifies heap_live).
+	memstats.heap_live = work.bytesMarked
+	memstats.heap_scan = uint64(gcController.scanWork)
+
+	minTrigger := memstats.heap_live + sweepMinHeapDistance*uint64(gcpercent)/100
+	if memstats.gc_trigger < minTrigger {
+		// The allocated heap is already past the trigger.
+		// This can happen if the triggerRatio is very low and
+		// the marked heap is less than the live heap size.
+		//
+		// Concurrent sweep happens in the heap growth from
+		// heap_live to gc_trigger, so bump gc_trigger up to ensure
+		// that concurrent sweep has some heap growth in which
+		// to perform sweeping before we start the next GC
+		// cycle.
+		memstats.gc_trigger = minTrigger
+	}
+
+	// The next GC cycle should finish before the allocated heap
+	// has grown by GOGC/100.
+	memstats.next_gc = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
+	if gcpercent < 0 {
+		memstats.next_gc = ^uint64(0)
+	}
+	if memstats.next_gc < memstats.gc_trigger {
+		memstats.next_gc = memstats.gc_trigger
+	}
+
+	if trace.enabled {
+		traceHeapAlloc()
+		traceNextGC()
+	}
+}
+
+func gcSweep(mode gcMode) {
+	if gcphase != _GCoff {
+		throw("gcSweep being done but phase is not GCoff")
+	}
+
+	lock(&mheap_.lock)
+	mheap_.sweepgen += 2
+	mheap_.sweepdone = 0
+	if mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 {
+		// We should have drained this list during the last
+		// sweep phase. We certainly need to start this phase
+		// with an empty swept list.
+		throw("non-empty swept list")
+	}
+	unlock(&mheap_.lock)
+
+	if !_ConcurrentSweep || mode == gcForceBlockMode {
+		// Special case synchronous sweep.
+		// Record that no proportional sweeping has to happen.
+		lock(&mheap_.lock)
+		mheap_.sweepPagesPerByte = 0
+		mheap_.pagesSwept = 0
+		unlock(&mheap_.lock)
+		// Sweep all spans eagerly.
+		for sweepone() != ^uintptr(0) {
+			sweep.npausesweep++
+		}
+		// Do an additional mProf_GC, because all 'free' events are now real as well.
+		mProf_GC()
+		mProf_GC()
+		return
+	}
+
+	// Concurrent sweep needs to sweep all of the in-use pages by
+	// the time the allocated heap reaches the GC trigger. Compute
+	// the ratio of in-use pages to sweep per byte allocated.
+	heapDistance := int64(memstats.gc_trigger) - int64(memstats.heap_live)
+	// Add a little margin so rounding errors and concurrent
+	// sweep are less likely to leave pages unswept when GC starts.
+	heapDistance -= 1024 * 1024
+	if heapDistance < _PageSize {
+		// Avoid setting the sweep ratio extremely high
+		heapDistance = _PageSize
+	}
+	lock(&mheap_.lock)
+	mheap_.sweepPagesPerByte = float64(mheap_.pagesInUse) / float64(heapDistance)
+	mheap_.pagesSwept = 0
+	mheap_.spanBytesAlloc = 0
+	unlock(&mheap_.lock)
+
+	// Background sweep.
+	lock(&sweep.lock)
+	if sweep.parked {
+		sweep.parked = false
+		ready(sweep.g, 0, true)
+	}
+	unlock(&sweep.lock)
+}
+
+// gcResetMarkState resets global state prior to marking (concurrent
+// or STW) and resets the stack scan state of all Gs.
+//
+// This is safe to do without the world stopped because any Gs created
+// during or after this will start out in the reset state.
+func gcResetMarkState() {
+	// This may be called during a concurrent phase, so make sure
+	// allgs doesn't change.
+	if !(gcphase == _GCoff || gcphase == _GCmarktermination) {
+		// Accessing gcRescan is unsafe.
+		throw("bad GC phase")
+	}
+	lock(&allglock)
+	for _, gp := range allgs {
+		gp.gcscandone = false  // set to true in gcphasework
+		gp.gcscanvalid = false // stack has not been scanned
+		gp.gcRescan = -1
+		gp.gcAssistBytes = 0
+	}
+	unlock(&allglock)
+
+	// Clear rescan list.
+	work.rescan.list = work.rescan.list[:0]
+
+	work.bytesMarked = 0
+	work.initialHeapLive = memstats.heap_live
+	work.markrootDone = false
+}
+
+// Hooks for other packages
+
+var poolcleanup func()
+
+//go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
+func sync_runtime_registerPoolCleanup(f func()) {
+	poolcleanup = f
+}
+
+func clearpools() {
+	// clear sync.Pools
+	if poolcleanup != nil {
+		poolcleanup()
+	}
+
+	// Clear central sudog cache.
+	// Leave per-P caches alone, they have strictly bounded size.
+	// Disconnect cached list before dropping it on the floor,
+	// so that a dangling ref to one entry does not pin all of them.
+	lock(&sched.sudoglock)
+	var sg, sgnext *sudog
+	for sg = sched.sudogcache; sg != nil; sg = sgnext {
+		sgnext = sg.next
+		sg.next = nil
+	}
+	sched.sudogcache = nil
+	unlock(&sched.sudoglock)
+
+	// Clear central defer pools.
+	// Leave per-P pools alone, they have strictly bounded size.
+	lock(&sched.deferlock)
+	// disconnect cached list before dropping it on the floor,
+	// so that a dangling ref to one entry does not pin all of them.
+	var d, dlink *_defer
+	for d = sched.deferpool; d != nil; d = dlink {
+		dlink = d.link
+		d.link = nil
+	}
+	sched.deferpool = nil
+	unlock(&sched.deferlock)
+}
+
+// Timing
+
+//go:nowritebarrier
+func gchelper() {
+	_g_ := getg()
+	_g_.m.traceback = 2
+	gchelperstart()
+
+	if trace.enabled {
+		traceGCScanStart()
+	}
+
+	// Parallel mark over GC roots and heap
+	if gcphase == _GCmarktermination {
+		gcw := &_g_.m.p.ptr().gcw
+		if work.helperDrainBlock {
+			gcDrain(gcw, gcDrainBlock) // blocks in getfull
+		} else {
+			gcDrain(gcw, gcDrainNoBlock)
+		}
+		gcw.dispose()
+	}
+
+	if trace.enabled {
+		traceGCScanDone()
+	}
+
+	nproc := work.nproc // work.nproc can change right after we increment work.ndone
+	if atomic.Xadd(&work.ndone, +1) == nproc-1 {
+		notewakeup(&work.alldone)
+	}
+	_g_.m.traceback = 0
+}
+
+func gchelperstart() {
+	_g_ := getg()
+
+	if _g_.m.helpgc < 0 || _g_.m.helpgc >= _MaxGcproc {
+		throw("gchelperstart: bad m->helpgc")
+	}
+	// For gccgo we run gchelper on the normal g stack.
+	// if _g_ != _g_.m.g0 {
+	// 	throw("gchelper not running on g0 stack")
+	// }
+}
+
+// itoaDiv formats val/(10**dec) into buf.
+func itoaDiv(buf []byte, val uint64, dec int) []byte {
+	i := len(buf) - 1
+	idec := i - dec
+	for val >= 10 || i >= idec {
+		buf[i] = byte(val%10 + '0')
+		i--
+		if i == idec {
+			buf[i] = '.'
+			i--
+		}
+		val /= 10
+	}
+	buf[i] = byte(val + '0')
+	return buf[i:]
+}
+
+// fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
+func fmtNSAsMS(buf []byte, ns uint64) []byte {
+	if ns >= 10e6 {
+		// Format as whole milliseconds.
+		return itoaDiv(buf, ns/1e6, 0)
+	}
+	// Format two digits of precision, with at most three decimal places.
+	x := ns / 1e3
+	if x == 0 {
+		buf[0] = '0'
+		return buf[:1]
+	}
+	dec := 3
+	for x >= 100 {
+		x /= 10
+		dec--
+	}
+	return itoaDiv(buf, x, dec)
+}
diff --git a/libgo/go/runtime/mgc_gccgo.go b/libgo/go/runtime/mgc_gccgo.go
new file mode 100644
index 0000000..c1fa154
--- /dev/null
+++ b/libgo/go/runtime/mgc_gccgo.go
@@ -0,0 +1,87 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// gccgo-specific support for GC.
+
+package runtime
+
+import "unsafe"
+
+// gcRoot is a single GC root: a variable plus a ptrmask.
+type gcRoot struct {
+	decl    unsafe.Pointer // Pointer to variable.
+	size    uintptr        // Size of variable.
+	ptrdata uintptr        // Length of gcdata.
+	gcdata  *uint8         // Pointer mask.
+}
+
+// gcRootList is the set of GC roots for a package.
+// The next field is used to put this all into a linked list.
+// count gives the real length of the array.
+type gcRootList struct {
+	next  *gcRootList
+	count int
+	roots [1 << 26]gcRoot
+}
+
+// roots is the list of GC roots for the program.
+// The compiler keeps this variable itself off the list.
+var gcRoots *gcRootList
+
+// registerGCRoots is called by compiler-generated code.
+//go:linkname registerGCRoots runtime.registerGCRoots
+
+// registerGCRoots is called by init functions to register the GC
+// roots for a package.  The init functions are run sequentially at
+// the start of the program, so no locking is needed.
+func registerGCRoots(r *gcRootList) {
+	r.next = gcRoots
+	gcRoots = r
+}
+
+// checkPreempt is called when the preempt field in the running G is true.
+// It preempts the goroutine if it is safe to do so.
+// If preemptscan is true, this scans the stack for the garbage collector
+// and carries on.
+func checkPreempt() {
+	gp := getg()
+	if !gp.preempt || gp != gp.m.curg || gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" {
+		return
+	}
+
+	// Synchronize with scang.
+	gp.scanningself = true
+	casgstatus(gp, _Grunning, _Gwaiting)
+	if gp.preemptscan {
+		for !castogscanstatus(gp, _Gwaiting, _Gscanwaiting) {
+			// Likely to be racing with the GC as
+			// it sees a _Gwaiting and does the
+			// stack scan. If so, gcworkdone will
+			// be set and gcphasework will simply
+			// return.
+		}
+		if !gp.gcscandone {
+			mp := acquirem()
+			gcw := &gp.m.p.ptr().gcw
+			scanstack(gp, gcw)
+			if gcBlackenPromptly {
+				gcw.dispose()
+			}
+			releasem(mp)
+			gp.gcscandone = true
+		}
+		gp.preemptscan = false
+		gp.preempt = false
+		casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting)
+		// This clears gcscanvalid.
+		casgstatus(gp, _Gwaiting, _Grunning)
+		gp.scanningself = false
+		return
+	}
+
+	// Act like goroutine called runtime.Gosched.
+	casgstatus(gp, _Gwaiting, _Grunning)
+	gp.scanningself = false
+	mcall(gopreempt_m)
+}
diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go
new file mode 100644
index 0000000..93252ba
--- /dev/null
+++ b/libgo/go/runtime/mgcmark.go
@@ -0,0 +1,1374 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: marking and scanning
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const (
+	fixedRootFinalizers = iota
+	fixedRootFreeGStacks
+	fixedRootCount
+
+	// rootBlockBytes is the number of bytes to scan per data or
+	// BSS root.
+	rootBlockBytes = 256 << 10
+
+	// rootBlockSpans is the number of spans to scan per span
+	// root.
+	rootBlockSpans = 8 * 1024 // 64MB worth of spans
+
+	// maxObletBytes is the maximum bytes of an object to scan at
+	// once. Larger objects will be split up into "oblets" of at
+	// most this size. Since we can scan 1–2 MB/ms, 128 KB bounds
+	// scan preemption at ~100 µs.
+	//
+	// This must be > _MaxSmallSize so that the object base is the
+	// span base.
+	maxObletBytes = 128 << 10
+
+	// idleCheckThreshold specifies how many units of work to do
+	// between run queue checks in an idle worker. Assuming a scan
+	// rate of 1 MB/ms, this is ~100 µs. Lower values have higher
+	// overhead in the scan loop (the scheduler check may perform
+	// a syscall, so its overhead is nontrivial). Higher values
+	// make the system less responsive to incoming work.
+	idleCheckThreshold = 100000
+)
+
+// gcMarkRootPrepare queues root scanning jobs (stacks, globals, and
+// some miscellany) and initializes scanning-related state.
+//
+// The caller must have call gcCopySpans().
+//
+// The world must be stopped.
+//
+//go:nowritebarrier
+func gcMarkRootPrepare() {
+	if gcphase == _GCmarktermination {
+		work.nFlushCacheRoots = int(gomaxprocs)
+	} else {
+		work.nFlushCacheRoots = 0
+	}
+
+	work.nDataRoots = 0
+
+	// Only scan globals once per cycle; preferably concurrently.
+	if !work.markrootDone {
+		roots := gcRoots
+		for roots != nil {
+			work.nDataRoots++
+			roots = roots.next
+		}
+	}
+
+	if !work.markrootDone {
+		// On the first markroot, we need to scan span roots.
+		// In concurrent GC, this happens during concurrent
+		// mark and we depend on addfinalizer to ensure the
+		// above invariants for objects that get finalizers
+		// after concurrent mark. In STW GC, this will happen
+		// during mark termination.
+		//
+		// We're only interested in scanning the in-use spans,
+		// which will all be swept at this point. More spans
+		// may be added to this list during concurrent GC, but
+		// we only care about spans that were allocated before
+		// this mark phase.
+		work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks()
+
+		// On the first markroot, we need to scan all Gs. Gs
+		// may be created after this point, but it's okay that
+		// we ignore them because they begin life without any
+		// roots, so there's nothing to scan, and any roots
+		// they create during the concurrent phase will be
+		// scanned during mark termination. During mark
+		// termination, allglen isn't changing, so we'll scan
+		// all Gs.
+		work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+		work.nRescanRoots = 0
+	} else {
+		// We've already scanned span roots and kept the scan
+		// up-to-date during concurrent mark.
+		work.nSpanRoots = 0
+
+		// On the second pass of markroot, we're just scanning
+		// dirty stacks. It's safe to access rescan since the
+		// world is stopped.
+		work.nStackRoots = 0
+		work.nRescanRoots = len(work.rescan.list)
+	}
+
+	work.markrootNext = 0
+	work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
+}
+
+// gcMarkRootCheck checks that all roots have been scanned. It is
+// purely for debugging.
+func gcMarkRootCheck() {
+	if work.markrootNext < work.markrootJobs {
+		print(work.markrootNext, " of ", work.markrootJobs, " markroot jobs done\n")
+		throw("left over markroot jobs")
+	}
+
+	lock(&allglock)
+	// Check that stacks have been scanned.
+	var gp *g
+	if gcphase == _GCmarktermination && debug.gcrescanstacks > 0 {
+		for i := 0; i < len(allgs); i++ {
+			gp = allgs[i]
+			if !(gp.gcscandone && gp.gcscanvalid) && readgstatus(gp) != _Gdead {
+				goto fail
+			}
+		}
+	} else {
+		for i := 0; i < work.nStackRoots; i++ {
+			gp = allgs[i]
+			if !gp.gcscandone {
+				goto fail
+			}
+		}
+	}
+	unlock(&allglock)
+	return
+
+fail:
+	println("gp", gp, "goid", gp.goid,
+		"status", readgstatus(gp),
+		"gcscandone", gp.gcscandone,
+		"gcscanvalid", gp.gcscanvalid)
+	unlock(&allglock) // Avoid self-deadlock with traceback.
+	throw("scan missed a g")
+}
+
+// ptrmask for an allocation containing a single pointer.
+var oneptrmask = [...]uint8{1}
+
+// markroot scans the i'th root.
+//
+// Preemption must be disabled (because this uses a gcWork).
+//
+// nowritebarrier is only advisory here.
+//
+//go:nowritebarrier
+func markroot(gcw *gcWork, i uint32) {
+	// TODO(austin): This is a bit ridiculous. Compute and store
+	// the bases in gcMarkRootPrepare instead of the counts.
+	baseFlushCache := uint32(fixedRootCount)
+	baseData := baseFlushCache + uint32(work.nFlushCacheRoots)
+	baseSpans := baseData + uint32(work.nDataRoots)
+	baseStacks := baseSpans + uint32(work.nSpanRoots)
+	baseRescan := baseStacks + uint32(work.nStackRoots)
+	end := baseRescan + uint32(work.nRescanRoots)
+
+	// Note: if you add a case here, please also update heapdump.go:dumproots.
+	switch {
+	case baseFlushCache <= i && i < baseData:
+		flushmcache(int(i - baseFlushCache))
+
+	case baseData <= i && i < baseSpans:
+		roots := gcRoots
+		c := baseData
+		for roots != nil {
+			if i == c {
+				markrootBlock(roots, gcw)
+				break
+			}
+			roots = roots.next
+			c++
+		}
+
+	case i == fixedRootFinalizers:
+		for fb := allfin; fb != nil; fb = fb.alllink {
+			cnt := uintptr(atomic.Load(&fb.cnt))
+			scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), cnt*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw)
+		}
+
+	case i == fixedRootFreeGStacks:
+		// FIXME: We don't do this for gccgo.
+
+	case baseSpans <= i && i < baseStacks:
+		// mark MSpan.specials
+		markrootSpans(gcw, int(i-baseSpans))
+
+	default:
+		// the rest is scanning goroutine stacks
+		var gp *g
+		if baseStacks <= i && i < baseRescan {
+			gp = allgs[i-baseStacks]
+		} else if baseRescan <= i && i < end {
+			gp = work.rescan.list[i-baseRescan].ptr()
+			if gp.gcRescan != int32(i-baseRescan) {
+				// Looking for issue #17099.
+				println("runtime: gp", gp, "found at rescan index", i-baseRescan, "but should be at", gp.gcRescan)
+				throw("bad g rescan index")
+			}
+		} else {
+			throw("markroot: bad index")
+		}
+
+		// remember when we've first observed the G blocked
+		// needed only to output in traceback
+		status := readgstatus(gp) // We are not in a scan state
+		if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 {
+			gp.waitsince = work.tstart
+		}
+
+		// scang must be done on the system stack in case
+		// we're trying to scan our own stack.
+		systemstack(func() {
+			// If this is a self-scan, put the user G in
+			// _Gwaiting to prevent self-deadlock. It may
+			// already be in _Gwaiting if this is a mark
+			// worker or we're in mark termination.
+			userG := getg().m.curg
+			selfScan := gp == userG && readgstatus(userG) == _Grunning
+			if selfScan {
+				casgstatus(userG, _Grunning, _Gwaiting)
+				userG.waitreason = "garbage collection scan"
+			}
+
+			// TODO: scang blocks until gp's stack has
+			// been scanned, which may take a while for
+			// running goroutines. Consider doing this in
+			// two phases where the first is non-blocking:
+			// we scan the stacks we can and ask running
+			// goroutines to scan themselves; and the
+			// second blocks.
+			scang(gp, gcw)
+
+			if selfScan {
+				casgstatus(userG, _Gwaiting, _Grunning)
+			}
+		})
+	}
+}
+
+// markrootBlock scans one element of the list of GC roots.
+//
+//go:nowritebarrier
+func markrootBlock(roots *gcRootList, gcw *gcWork) {
+	for i := 0; i < roots.count; i++ {
+		r := &roots.roots[i]
+		scanblock(uintptr(r.decl), r.ptrdata, r.gcdata, gcw)
+	}
+}
+
+// markrootSpans marks roots for one shard of work.spans.
+//
+//go:nowritebarrier
+func markrootSpans(gcw *gcWork, shard int) {
+	// Objects with finalizers have two GC-related invariants:
+	//
+	// 1) Everything reachable from the object must be marked.
+	// This ensures that when we pass the object to its finalizer,
+	// everything the finalizer can reach will be retained.
+	//
+	// 2) Finalizer specials (which are not in the garbage
+	// collected heap) are roots. In practice, this means the fn
+	// field must be scanned.
+	//
+	// TODO(austin): There are several ideas for making this more
+	// efficient in issue #11485.
+
+	if work.markrootDone {
+		throw("markrootSpans during second markroot")
+	}
+
+	sg := mheap_.sweepgen
+	spans := mheap_.sweepSpans[mheap_.sweepgen/2%2].block(shard)
+	// Note that work.spans may not include spans that were
+	// allocated between entering the scan phase and now. This is
+	// okay because any objects with finalizers in those spans
+	// must have been allocated and given finalizers after we
+	// entered the scan phase, so addfinalizer will have ensured
+	// the above invariants for them.
+	for _, s := range spans {
+		if s.state != mSpanInUse {
+			continue
+		}
+		if !useCheckmark && s.sweepgen != sg {
+			// sweepgen was updated (+2) during non-checkmark GC pass
+			print("sweep ", s.sweepgen, " ", sg, "\n")
+			throw("gc: unswept span")
+		}
+
+		// Speculatively check if there are any specials
+		// without acquiring the span lock. This may race with
+		// adding the first special to a span, but in that
+		// case addfinalizer will observe that the GC is
+		// active (which is globally synchronized) and ensure
+		// the above invariants. We may also ensure the
+		// invariants, but it's okay to scan an object twice.
+		if s.specials == nil {
+			continue
+		}
+
+		// Lock the specials to prevent a special from being
+		// removed from the list while we're traversing it.
+		lock(&s.speciallock)
+
+		for sp := s.specials; sp != nil; sp = sp.next {
+			if sp.kind != _KindSpecialFinalizer {
+				continue
+			}
+			// don't mark finalized object, but scan it so we
+			// retain everything it points to.
+			spf := (*specialfinalizer)(unsafe.Pointer(sp))
+			// A finalizer can be set for an inner byte of an object, find object beginning.
+			p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize
+
+			// Mark everything that can be reached from
+			// the object (but *not* the object itself or
+			// we'll never collect it).
+			scanobject(p, gcw)
+
+			// The special itself is a root.
+			scanblock(uintptr(unsafe.Pointer(&spf.fn)), sys.PtrSize, &oneptrmask[0], gcw)
+		}
+
+		unlock(&s.speciallock)
+	}
+}
+
+// gcAssistAlloc performs GC work to make gp's assist debt positive.
+// gp must be the calling user gorountine.
+//
+// This must be called with preemption enabled.
+func gcAssistAlloc(gp *g) {
+	// Don't assist in non-preemptible contexts. These are
+	// generally fragile and won't allow the assist to block.
+	if getg() == gp.m.g0 {
+		return
+	}
+	if mp := getg().m; mp.locks > 0 || mp.preemptoff != "" {
+		return
+	}
+
+retry:
+	// Compute the amount of scan work we need to do to make the
+	// balance positive. When the required amount of work is low,
+	// we over-assist to build up credit for future allocations
+	// and amortize the cost of assisting.
+	debtBytes := -gp.gcAssistBytes
+	scanWork := int64(gcController.assistWorkPerByte * float64(debtBytes))
+	if scanWork < gcOverAssistWork {
+		scanWork = gcOverAssistWork
+		debtBytes = int64(gcController.assistBytesPerWork * float64(scanWork))
+	}
+
+	// Steal as much credit as we can from the background GC's
+	// scan credit. This is racy and may drop the background
+	// credit below 0 if two mutators steal at the same time. This
+	// will just cause steals to fail until credit is accumulated
+	// again, so in the long run it doesn't really matter, but we
+	// do have to handle the negative credit case.
+	bgScanCredit := atomic.Loadint64(&gcController.bgScanCredit)
+	stolen := int64(0)
+	if bgScanCredit > 0 {
+		if bgScanCredit < scanWork {
+			stolen = bgScanCredit
+			gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(stolen))
+		} else {
+			stolen = scanWork
+			gp.gcAssistBytes += debtBytes
+		}
+		atomic.Xaddint64(&gcController.bgScanCredit, -stolen)
+
+		scanWork -= stolen
+
+		if scanWork == 0 {
+			// We were able to steal all of the credit we
+			// needed.
+			return
+		}
+	}
+
+	// Perform assist work
+	systemstack(func() {
+		gcAssistAlloc1(gp, scanWork)
+		// The user stack may have moved, so this can't touch
+		// anything on it until it returns from systemstack.
+	})
+
+	completed := gp.param != nil
+	gp.param = nil
+	if completed {
+		gcMarkDone()
+	}
+
+	if gp.gcAssistBytes < 0 {
+		// We were unable steal enough credit or perform
+		// enough work to pay off the assist debt. We need to
+		// do one of these before letting the mutator allocate
+		// more to prevent over-allocation.
+		//
+		// If this is because we were preempted, reschedule
+		// and try some more.
+		if gp.preempt {
+			Gosched()
+			goto retry
+		}
+
+		// Add this G to an assist queue and park. When the GC
+		// has more background credit, it will satisfy queued
+		// assists before flushing to the global credit pool.
+		//
+		// Note that this does *not* get woken up when more
+		// work is added to the work list. The theory is that
+		// there wasn't enough work to do anyway, so we might
+		// as well let background marking take care of the
+		// work that is available.
+		if !gcParkAssist() {
+			goto retry
+		}
+
+		// At this point either background GC has satisfied
+		// this G's assist debt, or the GC cycle is over.
+	}
+}
+
+// gcAssistAlloc1 is the part of gcAssistAlloc that runs on the system
+// stack. This is a separate function to make it easier to see that
+// we're not capturing anything from the user stack, since the user
+// stack may move while we're in this function.
+//
+// gcAssistAlloc1 indicates whether this assist completed the mark
+// phase by setting gp.param to non-nil. This can't be communicated on
+// the stack since it may move.
+//
+//go:systemstack
+func gcAssistAlloc1(gp *g, scanWork int64) {
+	// Clear the flag indicating that this assist completed the
+	// mark phase.
+	gp.param = nil
+
+	if atomic.Load(&gcBlackenEnabled) == 0 {
+		// The gcBlackenEnabled check in malloc races with the
+		// store that clears it but an atomic check in every malloc
+		// would be a performance hit.
+		// Instead we recheck it here on the non-preemptable system
+		// stack to determine if we should preform an assist.
+
+		// GC is done, so ignore any remaining debt.
+		gp.gcAssistBytes = 0
+		return
+	}
+	// Track time spent in this assist. Since we're on the
+	// system stack, this is non-preemptible, so we can
+	// just measure start and end time.
+	startTime := nanotime()
+
+	decnwait := atomic.Xadd(&work.nwait, -1)
+	if decnwait == work.nproc {
+		println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
+		throw("nwait > work.nprocs")
+	}
+
+	// gcDrainN requires the caller to be preemptible.
+	casgstatus(gp, _Grunning, _Gwaiting)
+	gp.waitreason = "GC assist marking"
+
+	// drain own cached work first in the hopes that it
+	// will be more cache friendly.
+	gcw := &getg().m.p.ptr().gcw
+	workDone := gcDrainN(gcw, scanWork)
+	// If we are near the end of the mark phase
+	// dispose of the gcw.
+	if gcBlackenPromptly {
+		gcw.dispose()
+	}
+
+	casgstatus(gp, _Gwaiting, _Grunning)
+
+	// Record that we did this much scan work.
+	//
+	// Back out the number of bytes of assist credit that
+	// this scan work counts for. The "1+" is a poor man's
+	// round-up, to ensure this adds credit even if
+	// assistBytesPerWork is very low.
+	gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone))
+
+	// If this is the last worker and we ran out of work,
+	// signal a completion point.
+	incnwait := atomic.Xadd(&work.nwait, +1)
+	if incnwait > work.nproc {
+		println("runtime: work.nwait=", incnwait,
+			"work.nproc=", work.nproc,
+			"gcBlackenPromptly=", gcBlackenPromptly)
+		throw("work.nwait > work.nproc")
+	}
+
+	if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
+		// This has reached a background completion point. Set
+		// gp.param to a non-nil value to indicate this. It
+		// doesn't matter what we set it to (it just has to be
+		// a valid pointer).
+		gp.param = unsafe.Pointer(gp)
+	}
+	duration := nanotime() - startTime
+	_p_ := gp.m.p.ptr()
+	_p_.gcAssistTime += duration
+	if _p_.gcAssistTime > gcAssistTimeSlack {
+		atomic.Xaddint64(&gcController.assistTime, _p_.gcAssistTime)
+		_p_.gcAssistTime = 0
+	}
+}
+
+// gcWakeAllAssists wakes all currently blocked assists. This is used
+// at the end of a GC cycle. gcBlackenEnabled must be false to prevent
+// new assists from going to sleep after this point.
+func gcWakeAllAssists() {
+	lock(&work.assistQueue.lock)
+	injectglist(work.assistQueue.head.ptr())
+	work.assistQueue.head.set(nil)
+	work.assistQueue.tail.set(nil)
+	unlock(&work.assistQueue.lock)
+}
+
+// gcParkAssist puts the current goroutine on the assist queue and parks.
+//
+// gcParkAssist returns whether the assist is now satisfied. If it
+// returns false, the caller must retry the assist.
+//
+//go:nowritebarrier
+func gcParkAssist() bool {
+	lock(&work.assistQueue.lock)
+	// If the GC cycle finished while we were getting the lock,
+	// exit the assist. The cycle can't finish while we hold the
+	// lock.
+	if atomic.Load(&gcBlackenEnabled) == 0 {
+		unlock(&work.assistQueue.lock)
+		return true
+	}
+
+	gp := getg()
+	oldHead, oldTail := work.assistQueue.head, work.assistQueue.tail
+	if oldHead == 0 {
+		work.assistQueue.head.set(gp)
+	} else {
+		oldTail.ptr().schedlink.set(gp)
+	}
+	work.assistQueue.tail.set(gp)
+	gp.schedlink.set(nil)
+
+	// Recheck for background credit now that this G is in
+	// the queue, but can still back out. This avoids a
+	// race in case background marking has flushed more
+	// credit since we checked above.
+	if atomic.Loadint64(&gcController.bgScanCredit) > 0 {
+		work.assistQueue.head = oldHead
+		work.assistQueue.tail = oldTail
+		if oldTail != 0 {
+			oldTail.ptr().schedlink.set(nil)
+		}
+		unlock(&work.assistQueue.lock)
+		return false
+	}
+	// Park.
+	goparkunlock(&work.assistQueue.lock, "GC assist wait", traceEvGoBlockGC, 2)
+	return true
+}
+
+// gcFlushBgCredit flushes scanWork units of background scan work
+// credit. This first satisfies blocked assists on the
+// work.assistQueue and then flushes any remaining credit to
+// gcController.bgScanCredit.
+//
+// Write barriers are disallowed because this is used by gcDrain after
+// it has ensured that all work is drained and this must preserve that
+// condition.
+//
+//go:nowritebarrierrec
+func gcFlushBgCredit(scanWork int64) {
+	if work.assistQueue.head == 0 {
+		// Fast path; there are no blocked assists. There's a
+		// small window here where an assist may add itself to
+		// the blocked queue and park. If that happens, we'll
+		// just get it on the next flush.
+		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+		return
+	}
+
+	scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork)
+
+	lock(&work.assistQueue.lock)
+	gp := work.assistQueue.head.ptr()
+	for gp != nil && scanBytes > 0 {
+		// Note that gp.gcAssistBytes is negative because gp
+		// is in debt. Think carefully about the signs below.
+		if scanBytes+gp.gcAssistBytes >= 0 {
+			// Satisfy this entire assist debt.
+			scanBytes += gp.gcAssistBytes
+			gp.gcAssistBytes = 0
+			xgp := gp
+			gp = gp.schedlink.ptr()
+			// It's important that we *not* put xgp in
+			// runnext. Otherwise, it's possible for user
+			// code to exploit the GC worker's high
+			// scheduler priority to get itself always run
+			// before other goroutines and always in the
+			// fresh quantum started by GC.
+			ready(xgp, 0, false)
+		} else {
+			// Partially satisfy this assist.
+			gp.gcAssistBytes += scanBytes
+			scanBytes = 0
+			// As a heuristic, we move this assist to the
+			// back of the queue so that large assists
+			// can't clog up the assist queue and
+			// substantially delay small assists.
+			xgp := gp
+			gp = gp.schedlink.ptr()
+			if gp == nil {
+				// gp is the only assist in the queue.
+				gp = xgp
+			} else {
+				xgp.schedlink = 0
+				work.assistQueue.tail.ptr().schedlink.set(xgp)
+				work.assistQueue.tail.set(xgp)
+			}
+			break
+		}
+	}
+	work.assistQueue.head.set(gp)
+	if gp == nil {
+		work.assistQueue.tail.set(nil)
+	}
+
+	if scanBytes > 0 {
+		// Convert from scan bytes back to work.
+		scanWork = int64(float64(scanBytes) * gcController.assistWorkPerByte)
+		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+	}
+	unlock(&work.assistQueue.lock)
+}
+
+// We use a C function to find the stack.
+func doscanstack(*g, *gcWork)
+
+// scanstack scans gp's stack, greying all pointers found on the stack.
+//
+// During mark phase, it also installs stack barriers while traversing
+// gp's stack. During mark termination, it stops scanning when it
+// reaches an unhit stack barrier.
+//
+// scanstack is marked go:systemstack because it must not be preempted
+// while using a workbuf.
+//
+//go:nowritebarrier
+//go:systemstack
+func scanstack(gp *g, gcw *gcWork) {
+	if gp.gcscanvalid {
+		return
+	}
+
+	if readgstatus(gp)&_Gscan == 0 {
+		print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
+		throw("scanstack - bad status")
+	}
+
+	switch readgstatus(gp) &^ _Gscan {
+	default:
+		print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+		throw("mark - bad status")
+	case _Gdead:
+		return
+	case _Grunning:
+		// ok for gccgo, though not for gc.
+	case _Grunnable, _Gsyscall, _Gwaiting:
+		// ok
+	}
+
+	mp := gp.m
+	if mp != nil && mp.helpgc != 0 {
+		throw("can't scan gchelper stack")
+	}
+
+	// Scan the stack.
+	doscanstack(gp, gcw)
+
+	// Conservatively scan the saved register values.
+	scanstackblock(uintptr(unsafe.Pointer(&gp.gcregs)), unsafe.Sizeof(gp.gcregs), gcw)
+	scanstackblock(uintptr(unsafe.Pointer(&gp.context)), unsafe.Sizeof(gp.context), gcw)
+
+	if gcphase == _GCmark {
+		// gp may have added itself to the rescan list between
+		// when GC started and now. It's clean now, so remove
+		// it. This isn't safe during mark termination because
+		// mark termination is consuming this list, but it's
+		// also not necessary.
+		dequeueRescan(gp)
+	}
+	gp.gcscanvalid = true
+}
+
+// queueRescan adds gp to the stack rescan list and clears
+// gp.gcscanvalid. The caller must own gp and ensure that gp isn't
+// already on the rescan list.
+func queueRescan(gp *g) {
+	if debug.gcrescanstacks == 0 {
+		// Clear gcscanvalid to keep assertions happy.
+		//
+		// TODO: Remove gcscanvalid entirely when we remove
+		// stack rescanning.
+		gp.gcscanvalid = false
+		return
+	}
+
+	if gcphase == _GCoff {
+		gp.gcscanvalid = false
+		return
+	}
+	if gp.gcRescan != -1 {
+		throw("g already on rescan list")
+	}
+
+	lock(&work.rescan.lock)
+	gp.gcscanvalid = false
+
+	// Recheck gcphase under the lock in case there was a phase change.
+	if gcphase == _GCoff {
+		unlock(&work.rescan.lock)
+		return
+	}
+	if len(work.rescan.list) == cap(work.rescan.list) {
+		throw("rescan list overflow")
+	}
+	n := len(work.rescan.list)
+	gp.gcRescan = int32(n)
+	work.rescan.list = work.rescan.list[:n+1]
+	work.rescan.list[n].set(gp)
+	unlock(&work.rescan.lock)
+}
+
+// dequeueRescan removes gp from the stack rescan list, if gp is on
+// the rescan list. The caller must own gp.
+func dequeueRescan(gp *g) {
+	if debug.gcrescanstacks == 0 {
+		return
+	}
+
+	if gp.gcRescan == -1 {
+		return
+	}
+	if gcphase == _GCoff {
+		gp.gcRescan = -1
+		return
+	}
+
+	lock(&work.rescan.lock)
+	if work.rescan.list[gp.gcRescan].ptr() != gp {
+		throw("bad dequeueRescan")
+	}
+	// Careful: gp may itself be the last G on the list.
+	last := work.rescan.list[len(work.rescan.list)-1]
+	work.rescan.list[gp.gcRescan] = last
+	last.ptr().gcRescan = gp.gcRescan
+	gp.gcRescan = -1
+	work.rescan.list = work.rescan.list[:len(work.rescan.list)-1]
+	unlock(&work.rescan.lock)
+}
+
+type gcDrainFlags int
+
+const (
+	gcDrainUntilPreempt gcDrainFlags = 1 << iota
+	gcDrainNoBlock
+	gcDrainFlushBgCredit
+	gcDrainIdle
+
+	// gcDrainBlock means neither gcDrainUntilPreempt or
+	// gcDrainNoBlock. It is the default, but callers should use
+	// the constant for documentation purposes.
+	gcDrainBlock gcDrainFlags = 0
+)
+
+// gcDrain scans roots and objects in work buffers, blackening grey
+// objects until all roots and work buffers have been drained.
+//
+// If flags&gcDrainUntilPreempt != 0, gcDrain returns when g.preempt
+// is set. This implies gcDrainNoBlock.
+//
+// If flags&gcDrainIdle != 0, gcDrain returns when there is other work
+// to do. This implies gcDrainNoBlock.
+//
+// If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is
+// unable to get more work. Otherwise, it will block until all
+// blocking calls are blocked in gcDrain.
+//
+// If flags&gcDrainFlushBgCredit != 0, gcDrain flushes scan work
+// credit to gcController.bgScanCredit every gcCreditSlack units of
+// scan work.
+//
+//go:nowritebarrier
+func gcDrain(gcw *gcWork, flags gcDrainFlags) {
+	if !writeBarrier.needed {
+		throw("gcDrain phase incorrect")
+	}
+
+	gp := getg().m.curg
+	preemptible := flags&gcDrainUntilPreempt != 0
+	blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainNoBlock) == 0
+	flushBgCredit := flags&gcDrainFlushBgCredit != 0
+	idle := flags&gcDrainIdle != 0
+
+	initScanWork := gcw.scanWork
+	// idleCheck is the scan work at which to perform the next
+	// idle check with the scheduler.
+	idleCheck := initScanWork + idleCheckThreshold
+
+	// Drain root marking jobs.
+	if work.markrootNext < work.markrootJobs {
+		for !(preemptible && gp.preempt) {
+			job := atomic.Xadd(&work.markrootNext, +1) - 1
+			if job >= work.markrootJobs {
+				break
+			}
+			markroot(gcw, job)
+			if idle && pollWork() {
+				goto done
+			}
+		}
+	}
+
+	// Drain heap marking jobs.
+	for !(preemptible && gp.preempt) {
+		// Try to keep work available on the global queue. We used to
+		// check if there were waiting workers, but it's better to
+		// just keep work available than to make workers wait. In the
+		// worst case, we'll do O(log(_WorkbufSize)) unnecessary
+		// balances.
+		if work.full == 0 {
+			gcw.balance()
+		}
+
+		var b uintptr
+		if blocking {
+			b = gcw.get()
+		} else {
+			b = gcw.tryGetFast()
+			if b == 0 {
+				b = gcw.tryGet()
+			}
+		}
+		if b == 0 {
+			// work barrier reached or tryGet failed.
+			break
+		}
+		scanobject(b, gcw)
+
+		// Flush background scan work credit to the global
+		// account if we've accumulated enough locally so
+		// mutator assists can draw on it.
+		if gcw.scanWork >= gcCreditSlack {
+			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+			if flushBgCredit {
+				gcFlushBgCredit(gcw.scanWork - initScanWork)
+				initScanWork = 0
+			}
+			idleCheck -= gcw.scanWork
+			gcw.scanWork = 0
+
+			if idle && idleCheck <= 0 {
+				idleCheck += idleCheckThreshold
+				if pollWork() {
+					break
+				}
+			}
+		}
+	}
+
+	// In blocking mode, write barriers are not allowed after this
+	// point because we must preserve the condition that the work
+	// buffers are empty.
+
+done:
+	// Flush remaining scan work credit.
+	if gcw.scanWork > 0 {
+		atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+		if flushBgCredit {
+			gcFlushBgCredit(gcw.scanWork - initScanWork)
+		}
+		gcw.scanWork = 0
+	}
+}
+
+// gcDrainN blackens grey objects until it has performed roughly
+// scanWork units of scan work or the G is preempted. This is
+// best-effort, so it may perform less work if it fails to get a work
+// buffer. Otherwise, it will perform at least n units of work, but
+// may perform more because scanning is always done in whole object
+// increments. It returns the amount of scan work performed.
+//
+// The caller goroutine must be in a preemptible state (e.g.,
+// _Gwaiting) to prevent deadlocks during stack scanning. As a
+// consequence, this must be called on the system stack.
+//
+//go:nowritebarrier
+//go:systemstack
+func gcDrainN(gcw *gcWork, scanWork int64) int64 {
+	if !writeBarrier.needed {
+		throw("gcDrainN phase incorrect")
+	}
+
+	// There may already be scan work on the gcw, which we don't
+	// want to claim was done by this call.
+	workFlushed := -gcw.scanWork
+
+	gp := getg().m.curg
+	for !gp.preempt && workFlushed+gcw.scanWork < scanWork {
+		// See gcDrain comment.
+		if work.full == 0 {
+			gcw.balance()
+		}
+
+		// This might be a good place to add prefetch code...
+		// if(wbuf.nobj > 4) {
+		//         PREFETCH(wbuf->obj[wbuf.nobj - 3];
+		//  }
+		//
+		b := gcw.tryGetFast()
+		if b == 0 {
+			b = gcw.tryGet()
+		}
+
+		if b == 0 {
+			// Try to do a root job.
+			//
+			// TODO: Assists should get credit for this
+			// work.
+			if work.markrootNext < work.markrootJobs {
+				job := atomic.Xadd(&work.markrootNext, +1) - 1
+				if job < work.markrootJobs {
+					markroot(gcw, job)
+					continue
+				}
+			}
+			// No heap or root jobs.
+			break
+		}
+		scanobject(b, gcw)
+
+		// Flush background scan work credit.
+		if gcw.scanWork >= gcCreditSlack {
+			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+			workFlushed += gcw.scanWork
+			gcw.scanWork = 0
+		}
+	}
+
+	// Unlike gcDrain, there's no need to flush remaining work
+	// here because this never flushes to bgScanCredit and
+	// gcw.dispose will flush any remaining work to scanWork.
+
+	return workFlushed + gcw.scanWork
+}
+
+// scanblock scans b as scanobject would, but using an explicit
+// pointer bitmap instead of the heap bitmap.
+//
+// This is used to scan non-heap roots, so it does not update
+// gcw.bytesMarked or gcw.scanWork.
+//
+//go:nowritebarrier
+func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) {
+	// Use local copies of original parameters, so that a stack trace
+	// due to one of the throws below shows the original block
+	// base and extent.
+	b := b0
+	n := n0
+
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
+
+	for i := uintptr(0); i < n; {
+		// Find bits for the next word.
+		bits := uint32(*addb(ptrmask, i/(sys.PtrSize*8)))
+		if bits == 0 {
+			i += sys.PtrSize * 8
+			continue
+		}
+		for j := 0; j < 8 && i < n; j++ {
+			if bits&1 != 0 {
+				// Same work as in scanobject; see comments there.
+				obj := *(*uintptr)(unsafe.Pointer(b + i))
+				if obj != 0 && arena_start <= obj && obj < arena_used {
+					if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i, false); obj != 0 {
+						greyobject(obj, b, i, hbits, span, gcw, objIndex, false)
+					}
+				}
+			}
+			bits >>= 1
+			i += sys.PtrSize
+		}
+	}
+}
+
+// scanobject scans the object starting at b, adding pointers to gcw.
+// b must point to the beginning of a heap object or an oblet.
+// scanobject consults the GC bitmap for the pointer mask and the
+// spans for the size of the object.
+//
+//go:nowritebarrier
+func scanobject(b uintptr, gcw *gcWork) {
+	// Note that arena_used may change concurrently during
+	// scanobject and hence scanobject may encounter a pointer to
+	// a newly allocated heap object that is *not* in
+	// [start,used). It will not mark this object; however, we
+	// know that it was just installed by a mutator, which means
+	// that mutator will execute a write barrier and take care of
+	// marking it. This is even more pronounced on relaxed memory
+	// architectures since we access arena_used without barriers
+	// or synchronization, but the same logic applies.
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
+
+	// Find the bits for b and the size of the object at b.
+	//
+	// b is either the beginning of an object, in which case this
+	// is the size of the object to scan, or it points to an
+	// oblet, in which case we compute the size to scan below.
+	hbits := heapBitsForAddr(b)
+	s := spanOfUnchecked(b)
+	n := s.elemsize
+	if n == 0 {
+		throw("scanobject n == 0")
+	}
+
+	if n > maxObletBytes {
+		// Large object. Break into oblets for better
+		// parallelism and lower latency.
+		if b == s.base() {
+			// It's possible this is a noscan object (not
+			// from greyobject, but from other code
+			// paths), in which case we must *not* enqueue
+			// oblets since their bitmaps will be
+			// uninitialized.
+			if !hbits.hasPointers(n) {
+				// Bypass the whole scan.
+				gcw.bytesMarked += uint64(n)
+				return
+			}
+
+			// Enqueue the other oblets to scan later.
+			// Some oblets may be in b's scalar tail, but
+			// these will be marked as "no more pointers",
+			// so we'll drop out immediately when we go to
+			// scan those.
+			for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes {
+				if !gcw.putFast(oblet) {
+					gcw.put(oblet)
+				}
+			}
+		}
+
+		// Compute the size of the oblet. Since this object
+		// must be a large object, s.base() is the beginning
+		// of the object.
+		n = s.base() + s.elemsize - b
+		if n > maxObletBytes {
+			n = maxObletBytes
+		}
+	}
+
+	var i uintptr
+	for i = 0; i < n; i += sys.PtrSize {
+		// Find bits for this word.
+		if i != 0 {
+			// Avoid needless hbits.next() on last iteration.
+			hbits = hbits.next()
+		}
+		// Load bits once. See CL 22712 and issue 16973 for discussion.
+		bits := hbits.bits()
+		// During checkmarking, 1-word objects store the checkmark
+		// in the type bit for the one word. The only one-word objects
+		// are pointers, or else they'd be merged with other non-pointer
+		// data into larger allocations.
+		if i != 1*sys.PtrSize && bits&bitScan == 0 {
+			break // no more pointers in this object
+		}
+		if bits&bitPointer == 0 {
+			continue // not a pointer
+		}
+
+		// Work here is duplicated in scanblock and above.
+		// If you make changes here, make changes there too.
+		obj := *(*uintptr)(unsafe.Pointer(b + i))
+
+		// At this point we have extracted the next potential pointer.
+		// Check if it points into heap and not back at the current object.
+		if obj != 0 && arena_start <= obj && obj < arena_used && obj-b >= n {
+			// Mark the object.
+			if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i, false); obj != 0 {
+				greyobject(obj, b, i, hbits, span, gcw, objIndex, false)
+			}
+		}
+	}
+	gcw.bytesMarked += uint64(n)
+	gcw.scanWork += int64(i)
+}
+
+//go:linkname scanstackblock runtime.scanstackblock
+
+// scanstackblock is called by the stack scanning code in C to
+// actually find and mark pointers in the stack block. This is like
+// scanblock, but we scan the stack conservatively, so there is no
+// bitmask of pointers.
+func scanstackblock(b, n uintptr, gcw *gcWork) {
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
+
+	for i := uintptr(0); i < n; i += sys.PtrSize {
+		// Same work as in scanobject; see comments there.
+		obj := *(*uintptr)(unsafe.Pointer(b + i))
+		if obj != 0 && arena_start <= obj && obj < arena_used {
+			if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i, true); obj != 0 {
+				greyobject(obj, b, i, hbits, span, gcw, objIndex, true)
+			}
+		}
+	}
+}
+
+// Shade the object if it isn't already.
+// The object is not nil and known to be in the heap.
+// Preemption must be disabled.
+//go:nowritebarrier
+func shade(b uintptr) {
+	// shade can be called to shade a pointer found on the stack,
+	// so pass forStack as true to heapBitsForObject and greyobject.
+	if obj, hbits, span, objIndex := heapBitsForObject(b, 0, 0, true); obj != 0 {
+		gcw := &getg().m.p.ptr().gcw
+		greyobject(obj, 0, 0, hbits, span, gcw, objIndex, true)
+		if gcphase == _GCmarktermination || gcBlackenPromptly {
+			// Ps aren't allowed to cache work during mark
+			// termination.
+			gcw.dispose()
+		}
+	}
+}
+
+// obj is the start of an object with mark mbits.
+// If it isn't already marked, mark it and enqueue into gcw.
+// base and off are for debugging only and could be removed.
+//go:nowritebarrierrec
+func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr, forStack bool) {
+	// obj should be start of allocation, and so must be at least pointer-aligned.
+	if obj&(sys.PtrSize-1) != 0 {
+		throw("greyobject: obj not pointer-aligned")
+	}
+	mbits := span.markBitsForIndex(objIndex)
+
+	if useCheckmark {
+		if !mbits.isMarked() {
+			// Stack scanning is conservative, so we can
+			// see a reference to an object not previously
+			// found. Assume the object was correctly not
+			// marked and ignore the pointer.
+			if forStack {
+				return
+			}
+			printlock()
+			print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n")
+			print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
+
+			// Dump the source (base) object
+			gcDumpObject("base", base, off)
+
+			// Dump the object
+			gcDumpObject("obj", obj, ^uintptr(0))
+
+			throw("checkmark found unmarked object")
+		}
+		if hbits.isCheckmarked(span.elemsize) {
+			return
+		}
+		hbits.setCheckmarked(span.elemsize)
+		if !hbits.isCheckmarked(span.elemsize) {
+			throw("setCheckmarked and isCheckmarked disagree")
+		}
+	} else {
+		// Stack scanning is conservative, so we can see a
+		// pointer to a free object. Assume the object was
+		// correctly freed and we must ignore the pointer.
+		if forStack && span.isFree(objIndex) {
+			return
+		}
+
+		if debug.gccheckmark > 0 && span.isFree(objIndex) {
+			print("runtime: marking free object ", hex(obj), " found at *(", hex(base), "+", hex(off), ")\n")
+			gcDumpObject("base", base, off)
+			gcDumpObject("obj", obj, ^uintptr(0))
+			throw("marking free object")
+		}
+
+		// If marked we have nothing to do.
+		if mbits.isMarked() {
+			return
+		}
+		// mbits.setMarked() // Avoid extra call overhead with manual inlining.
+		atomic.Or8(mbits.bytep, mbits.mask)
+		// If this is a noscan object, fast-track it to black
+		// instead of greying it.
+		if !hbits.hasPointers(span.elemsize) {
+			gcw.bytesMarked += uint64(span.elemsize)
+			return
+		}
+	}
+
+	// Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
+	// seems like a nice optimization that can be added back in.
+	// There needs to be time between the PREFETCH and the use.
+	// Previously we put the obj in an 8 element buffer that is drained at a rate
+	// to give the PREFETCH time to do its work.
+	// Use of PREFETCHNTA might be more appropriate than PREFETCH
+	if !gcw.putFast(obj) {
+		gcw.put(obj)
+	}
+}
+
+// gcDumpObject dumps the contents of obj for debugging and marks the
+// field at byte offset off in obj.
+func gcDumpObject(label string, obj, off uintptr) {
+	if obj < mheap_.arena_start || obj >= mheap_.arena_used {
+		print(label, "=", hex(obj), " is not in the Go heap\n")
+		return
+	}
+	k := obj >> _PageShift
+	x := k
+	x -= mheap_.arena_start >> _PageShift
+	s := mheap_.spans[x]
+	print(label, "=", hex(obj), " k=", hex(k))
+	if s == nil {
+		print(" s=nil\n")
+		return
+	}
+	print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, " s.state=")
+	if 0 <= s.state && int(s.state) < len(mSpanStateNames) {
+		print(mSpanStateNames[s.state], "\n")
+	} else {
+		print("unknown(", s.state, ")\n")
+	}
+
+	skipped := false
+	size := s.elemsize
+	if s.state == _MSpanStack && size == 0 {
+		// We're printing something from a stack frame. We
+		// don't know how big it is, so just show up to an
+		// including off.
+		size = off + sys.PtrSize
+	}
+	for i := uintptr(0); i < size; i += sys.PtrSize {
+		// For big objects, just print the beginning (because
+		// that usually hints at the object's type) and the
+		// fields around off.
+		if !(i < 128*sys.PtrSize || off-16*sys.PtrSize < i && i < off+16*sys.PtrSize) {
+			skipped = true
+			continue
+		}
+		if skipped {
+			print(" ...\n")
+			skipped = false
+		}
+		print(" *(", label, "+", i, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + i))))
+		if i == off {
+			print(" <==")
+		}
+		print("\n")
+	}
+	if skipped {
+		print(" ...\n")
+	}
+}
+
+// gcmarknewobject marks a newly allocated object black. obj must
+// not contain any non-nil pointers.
+//
+// This is nosplit so it can manipulate a gcWork without preemption.
+//
+//go:nowritebarrier
+//go:nosplit
+func gcmarknewobject(obj, size, scanSize uintptr) {
+	if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen.
+		throw("gcmarknewobject called while doing checkmark")
+	}
+	markBitsForAddr(obj).setMarked()
+	gcw := &getg().m.p.ptr().gcw
+	gcw.bytesMarked += uint64(size)
+	gcw.scanWork += int64(scanSize)
+	if gcBlackenPromptly {
+		// There shouldn't be anything in the work queue, but
+		// we still need to flush stats.
+		gcw.dispose()
+	}
+}
+
+// gcMarkTinyAllocs greys all active tiny alloc blocks.
+//
+// The world must be stopped.
+func gcMarkTinyAllocs() {
+	for _, p := range &allp {
+		if p == nil || p.status == _Pdead {
+			break
+		}
+		c := p.mcache
+		if c == nil || c.tiny == 0 {
+			continue
+		}
+		_, hbits, span, objIndex := heapBitsForObject(c.tiny, 0, 0, false)
+		gcw := &p.gcw
+		greyobject(c.tiny, 0, 0, hbits, span, gcw, objIndex, false)
+		if gcBlackenPromptly {
+			gcw.dispose()
+		}
+	}
+}
+
+// Checkmarking
+
+// To help debug the concurrent GC we remark with the world
+// stopped ensuring that any object encountered has their normal
+// mark bit set. To do this we use an orthogonal bit
+// pattern to indicate the object is marked. The following pattern
+// uses the upper two bits in the object's boundary nibble.
+// 01: scalar  not marked
+// 10: pointer not marked
+// 11: pointer     marked
+// 00: scalar      marked
+// Xoring with 01 will flip the pattern from marked to unmarked and vica versa.
+// The higher bit is 1 for pointers and 0 for scalars, whether the object
+// is marked or not.
+// The first nibble no longer holds the typeDead pattern indicating that the
+// there are no more pointers in the object. This information is held
+// in the second nibble.
+
+// If useCheckmark is true, marking of an object uses the
+// checkmark bits (encoding above) instead of the standard
+// mark bits.
+var useCheckmark = false
+
+//go:nowritebarrier
+func initCheckmarks() {
+	useCheckmark = true
+	for _, s := range mheap_.allspans {
+		if s.state == _MSpanInUse {
+			heapBitsForSpan(s.base()).initCheckmarkSpan(s.layout())
+		}
+	}
+}
+
+func clearCheckmarks() {
+	useCheckmark = false
+	for _, s := range mheap_.allspans {
+		if s.state == _MSpanInUse {
+			heapBitsForSpan(s.base()).clearCheckmarkSpan(s.layout())
+		}
+	}
+}
diff --git a/libgo/go/runtime/mgcsweep.go b/libgo/go/runtime/mgcsweep.go
new file mode 100644
index 0000000..9f24fb1
--- /dev/null
+++ b/libgo/go/runtime/mgcsweep.go
@@ -0,0 +1,428 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: sweeping
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
+
+var sweep sweepdata
+
+// State of background sweep.
+type sweepdata struct {
+	lock    mutex
+	g       *g
+	parked  bool
+	started bool
+
+	nbgsweep    uint32
+	npausesweep uint32
+
+	// pacertracegen is the sweepgen at which the last pacer trace
+	// "sweep finished" message was printed.
+	pacertracegen uint32
+}
+
+// finishsweep_m ensures that all spans are swept.
+//
+// The world must be stopped. This ensures there are no sweeps in
+// progress.
+//
+//go:nowritebarrier
+func finishsweep_m() {
+	// Sweeping must be complete before marking commences, so
+	// sweep any unswept spans. If this is a concurrent GC, there
+	// shouldn't be any spans left to sweep, so this should finish
+	// instantly. If GC was forced before the concurrent sweep
+	// finished, there may be spans to sweep.
+	for sweepone() != ^uintptr(0) {
+		sweep.npausesweep++
+	}
+
+	nextMarkBitArenaEpoch()
+}
+
+func bgsweep(c chan int) {
+	sweep.g = getg()
+
+	lock(&sweep.lock)
+	sweep.parked = true
+	c <- 1
+	goparkunlock(&sweep.lock, "GC sweep wait", traceEvGoBlock, 1)
+
+	for {
+		for gosweepone() != ^uintptr(0) {
+			sweep.nbgsweep++
+			Gosched()
+		}
+		lock(&sweep.lock)
+		if !gosweepdone() {
+			// This can happen if a GC runs between
+			// gosweepone returning ^0 above
+			// and the lock being acquired.
+			unlock(&sweep.lock)
+			continue
+		}
+		sweep.parked = true
+		goparkunlock(&sweep.lock, "GC sweep wait", traceEvGoBlock, 1)
+	}
+}
+
+// sweeps one span
+// returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep
+//go:nowritebarrier
+func sweepone() uintptr {
+	_g_ := getg()
+
+	// increment locks to ensure that the goroutine is not preempted
+	// in the middle of sweep thus leaving the span in an inconsistent state for next GC
+	_g_.m.locks++
+	sg := mheap_.sweepgen
+	for {
+		s := mheap_.sweepSpans[1-sg/2%2].pop()
+		if s == nil {
+			mheap_.sweepdone = 1
+			_g_.m.locks--
+			if debug.gcpacertrace > 0 && atomic.Cas(&sweep.pacertracegen, sg-2, sg) {
+				print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", mheap_.spanBytesAlloc>>20, "MB of spans; swept ", mheap_.pagesSwept, " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
+			}
+			return ^uintptr(0)
+		}
+		if s.state != mSpanInUse {
+			// This can happen if direct sweeping already
+			// swept this span, but in that case the sweep
+			// generation should always be up-to-date.
+			if s.sweepgen != sg {
+				print("runtime: bad span s.state=", s.state, " s.sweepgen=", s.sweepgen, " sweepgen=", sg, "\n")
+				throw("non in-use span in unswept list")
+			}
+			continue
+		}
+		if s.sweepgen != sg-2 || !atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			continue
+		}
+		npages := s.npages
+		if !s.sweep(false) {
+			// Span is still in-use, so this returned no
+			// pages to the heap and the span needs to
+			// move to the swept in-use list.
+			npages = 0
+		}
+		_g_.m.locks--
+		return npages
+	}
+}
+
+//go:nowritebarrier
+func gosweepone() uintptr {
+	var ret uintptr
+	systemstack(func() {
+		ret = sweepone()
+	})
+	return ret
+}
+
+//go:nowritebarrier
+func gosweepdone() bool {
+	return mheap_.sweepdone != 0
+}
+
+// Returns only when span s has been swept.
+//go:nowritebarrier
+func (s *mspan) ensureSwept() {
+	// Caller must disable preemption.
+	// Otherwise when this function returns the span can become unswept again
+	// (if GC is triggered on another goroutine).
+	_g_ := getg()
+	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+		throw("MSpan_EnsureSwept: m is not locked")
+	}
+
+	sg := mheap_.sweepgen
+	if atomic.Load(&s.sweepgen) == sg {
+		return
+	}
+	// The caller must be sure that the span is a MSpanInUse span.
+	if atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+		s.sweep(false)
+		return
+	}
+	// unfortunate condition, and we don't have efficient means to wait
+	for atomic.Load(&s.sweepgen) != sg {
+		osyield()
+	}
+}
+
+// Sweep frees or collects finalizers for blocks not marked in the mark phase.
+// It clears the mark bits in preparation for the next GC round.
+// Returns true if the span was returned to heap.
+// If preserve=true, don't return it to heap nor relink in MCentral lists;
+// caller takes care of it.
+//TODO go:nowritebarrier
+func (s *mspan) sweep(preserve bool) bool {
+	// It's critical that we enter this function with preemption disabled,
+	// GC must not start while we are in the middle of this function.
+	_g_ := getg()
+	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+		throw("MSpan_Sweep: m is not locked")
+	}
+	sweepgen := mheap_.sweepgen
+	if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
+		print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+		throw("MSpan_Sweep: bad span state")
+	}
+
+	if trace.enabled {
+		traceGCSweepStart()
+	}
+
+	atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
+
+	cl := s.sizeclass
+	size := s.elemsize
+	res := false
+	nfree := 0
+
+	c := _g_.m.mcache
+	freeToHeap := false
+
+	// The allocBits indicate which unmarked objects don't need to be
+	// processed since they were free at the end of the last GC cycle
+	// and were not allocated since then.
+	// If the allocBits index is >= s.freeindex and the bit
+	// is not marked then the object remains unallocated
+	// since the last GC.
+	// This situation is analogous to being on a freelist.
+
+	// Unlink & free special records for any objects we're about to free.
+	// Two complications here:
+	// 1. An object can have both finalizer and profile special records.
+	//    In such case we need to queue finalizer for execution,
+	//    mark the object as live and preserve the profile special.
+	// 2. A tiny object can have several finalizers setup for different offsets.
+	//    If such object is not marked, we need to queue all finalizers at once.
+	// Both 1 and 2 are possible at the same time.
+	specialp := &s.specials
+	special := *specialp
+	for special != nil {
+		// A finalizer can be set for an inner byte of an object, find object beginning.
+		objIndex := uintptr(special.offset) / size
+		p := s.base() + objIndex*size
+		mbits := s.markBitsForIndex(objIndex)
+		if !mbits.isMarked() {
+			// This object is not marked and has at least one special record.
+			// Pass 1: see if it has at least one finalizer.
+			hasFin := false
+			endOffset := p - s.base() + size
+			for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
+				if tmp.kind == _KindSpecialFinalizer {
+					// Stop freeing of object if it has a finalizer.
+					mbits.setMarkedNonAtomic()
+					hasFin = true
+					break
+				}
+			}
+			// Pass 2: queue all finalizers _or_ handle profile record.
+			for special != nil && uintptr(special.offset) < endOffset {
+				// Find the exact byte for which the special was setup
+				// (as opposed to object beginning).
+				p := s.base() + uintptr(special.offset)
+				if special.kind == _KindSpecialFinalizer || !hasFin {
+					// Splice out special record.
+					y := special
+					special = special.next
+					*specialp = special
+					freespecial(y, unsafe.Pointer(p), size)
+				} else {
+					// This is profile record, but the object has finalizers (so kept alive).
+					// Keep special record.
+					specialp = &special.next
+					special = *specialp
+				}
+			}
+		} else {
+			// object is still live: keep special record
+			specialp = &special.next
+			special = *specialp
+		}
+	}
+
+	if debug.allocfreetrace != 0 || raceenabled || msanenabled {
+		// Find all newly freed objects. This doesn't have to
+		// efficient; allocfreetrace has massive overhead.
+		mbits := s.markBitsForBase()
+		abits := s.allocBitsForIndex(0)
+		for i := uintptr(0); i < s.nelems; i++ {
+			if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) {
+				x := s.base() + i*s.elemsize
+				if debug.allocfreetrace != 0 {
+					tracefree(unsafe.Pointer(x), size)
+				}
+				if raceenabled {
+					racefree(unsafe.Pointer(x), size)
+				}
+				if msanenabled {
+					msanfree(unsafe.Pointer(x), size)
+				}
+			}
+			mbits.advance()
+			abits.advance()
+		}
+	}
+
+	// Count the number of free objects in this span.
+	nfree = s.countFree()
+	if cl == 0 && nfree != 0 {
+		s.needzero = 1
+		freeToHeap = true
+	}
+	nalloc := uint16(s.nelems) - uint16(nfree)
+	nfreed := s.allocCount - nalloc
+
+	// This test is not reliable with gccgo, because of
+	// conservative stack scanning. The test boils down to
+	// checking that no new bits have been set in gcmarkBits since
+	// the span was added to the sweep count. New bits are set by
+	// greyobject. Seeing a new bit means that a live pointer has
+	// appeared that was not found during the mark phase. That can
+	// not happen when pointers are followed strictly. However,
+	// with conservative checking, it is possible for a pointer
+	// that will never be used to appear live and to cause a mark
+	// to be added. That is unfortunate in that it causes this
+	// check to be inaccurate, and it will keep an object live
+	// unnecessarily, but provided the pointer is not really live
+	// it is not otherwise a problem. So we disable the test for gccgo.
+	if false && nalloc > s.allocCount {
+		print("runtime: nelems=", s.nelems, " nfree=", nfree, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
+		throw("sweep increased allocation count")
+	}
+
+	s.allocCount = nalloc
+	wasempty := s.nextFreeIndex() == s.nelems
+	s.freeindex = 0 // reset allocation index to start of span.
+
+	// gcmarkBits becomes the allocBits.
+	// get a fresh cleared gcmarkBits in preparation for next GC
+	s.allocBits = s.gcmarkBits
+	s.gcmarkBits = newMarkBits(s.nelems)
+
+	// Initialize alloc bits cache.
+	s.refillAllocCache(0)
+
+	// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
+	// because of the potential for a concurrent free/SetFinalizer.
+	// But we need to set it before we make the span available for allocation
+	// (return it to heap or mcentral), because allocation code assumes that a
+	// span is already swept if available for allocation.
+	if freeToHeap || nfreed == 0 {
+		// The span must be in our exclusive ownership until we update sweepgen,
+		// check for potential races.
+		if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
+			print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+			throw("MSpan_Sweep: bad span state after sweep")
+		}
+		// Serialization point.
+		// At this point the mark bits are cleared and allocation ready
+		// to go so release the span.
+		atomic.Store(&s.sweepgen, sweepgen)
+	}
+
+	if nfreed > 0 && cl != 0 {
+		c.local_nsmallfree[cl] += uintptr(nfreed)
+		res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty)
+		// MCentral_FreeSpan updates sweepgen
+	} else if freeToHeap {
+		// Free large span to heap
+
+		// NOTE(rsc,dvyukov): The original implementation of efence
+		// in CL 22060046 used SysFree instead of SysFault, so that
+		// the operating system would eventually give the memory
+		// back to us again, so that an efence program could run
+		// longer without running out of memory. Unfortunately,
+		// calling SysFree here without any kind of adjustment of the
+		// heap data structures means that when the memory does
+		// come back to us, we have the wrong metadata for it, either in
+		// the MSpan structures or in the garbage collection bitmap.
+		// Using SysFault here means that the program will run out of
+		// memory fairly quickly in efence mode, but at least it won't
+		// have mysterious crashes due to confused memory reuse.
+		// It should be possible to switch back to SysFree if we also
+		// implement and then call some kind of MHeap_DeleteSpan.
+		if debug.efence > 0 {
+			s.limit = 0 // prevent mlookup from finding this span
+			sysFault(unsafe.Pointer(s.base()), size)
+		} else {
+			mheap_.freeSpan(s, 1)
+		}
+		c.local_nlargefree++
+		c.local_largefree += size
+		res = true
+	}
+	if !res {
+		// The span has been swept and is still in-use, so put
+		// it on the swept in-use list.
+		mheap_.sweepSpans[sweepgen/2%2].push(s)
+	}
+	if trace.enabled {
+		traceGCSweepDone()
+	}
+	return res
+}
+
+// deductSweepCredit deducts sweep credit for allocating a span of
+// size spanBytes. This must be performed *before* the span is
+// allocated to ensure the system has enough credit. If necessary, it
+// performs sweeping to prevent going in to debt. If the caller will
+// also sweep pages (e.g., for a large allocation), it can pass a
+// non-zero callerSweepPages to leave that many pages unswept.
+//
+// deductSweepCredit makes a worst-case assumption that all spanBytes
+// bytes of the ultimately allocated span will be available for object
+// allocation. The caller should call reimburseSweepCredit if that
+// turns out not to be the case once the span is allocated.
+//
+// deductSweepCredit is the core of the "proportional sweep" system.
+// It uses statistics gathered by the garbage collector to perform
+// enough sweeping so that all pages are swept during the concurrent
+// sweep phase between GC cycles.
+//
+// mheap_ must NOT be locked.
+func deductSweepCredit(spanBytes uintptr, callerSweepPages uintptr) {
+	if mheap_.sweepPagesPerByte == 0 {
+		// Proportional sweep is done or disabled.
+		return
+	}
+
+	// Account for this span allocation.
+	spanBytesAlloc := atomic.Xadd64(&mheap_.spanBytesAlloc, int64(spanBytes))
+
+	// Fix debt if necessary.
+	pagesOwed := int64(mheap_.sweepPagesPerByte * float64(spanBytesAlloc))
+	for pagesOwed-int64(atomic.Load64(&mheap_.pagesSwept)) > int64(callerSweepPages) {
+		if gosweepone() == ^uintptr(0) {
+			mheap_.sweepPagesPerByte = 0
+			break
+		}
+	}
+}
+
+// reimburseSweepCredit records that unusableBytes bytes of a
+// just-allocated span are not available for object allocation. This
+// offsets the worst-case charge performed by deductSweepCredit.
+func reimburseSweepCredit(unusableBytes uintptr) {
+	if mheap_.sweepPagesPerByte == 0 {
+		// Nobody cares about the credit. Avoid the atomic.
+		return
+	}
+	nval := atomic.Xadd64(&mheap_.spanBytesAlloc, -int64(unusableBytes))
+	if int64(nval) < 0 {
+		// Debugging for #18043.
+		print("runtime: bad spanBytesAlloc=", nval, " (was ", nval+uint64(unusableBytes), ") unusableBytes=", unusableBytes, " sweepPagesPerByte=", mheap_.sweepPagesPerByte, "\n")
+		throw("spanBytesAlloc underflow")
+	}
+}
diff --git a/libgo/go/runtime/mgcsweepbuf.go b/libgo/go/runtime/mgcsweepbuf.go
new file mode 100644
index 0000000..6c1118e
--- /dev/null
+++ b/libgo/go/runtime/mgcsweepbuf.go
@@ -0,0 +1,178 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// A gcSweepBuf is a set of *mspans.
+//
+// gcSweepBuf is safe for concurrent push operations *or* concurrent
+// pop operations, but not both simultaneously.
+type gcSweepBuf struct {
+	// A gcSweepBuf is a two-level data structure consisting of a
+	// growable spine that points to fixed-sized blocks. The spine
+	// can be accessed without locks, but adding a block or
+	// growing it requires taking the spine lock.
+	//
+	// Because each mspan covers at least 8K of heap and takes at
+	// most 8 bytes in the gcSweepBuf, the growth of the spine is
+	// quite limited.
+	//
+	// The spine and all blocks are allocated off-heap, which
+	// allows this to be used in the memory manager and avoids the
+	// need for write barriers on all of these. We never release
+	// this memory because there could be concurrent lock-free
+	// access and we're likely to reuse it anyway. (In principle,
+	// we could do this during STW.)
+
+	spineLock mutex
+	spine     unsafe.Pointer // *[N]*gcSweepBlock, accessed atomically
+	spineLen  uintptr        // Spine array length, accessed atomically
+	spineCap  uintptr        // Spine array cap, accessed under lock
+
+	// index is the first unused slot in the logical concatenation
+	// of all blocks. It is accessed atomically.
+	index uint32
+}
+
+const (
+	gcSweepBlockEntries    = 512 // 4KB on 64-bit
+	gcSweepBufInitSpineCap = 256 // Enough for 1GB heap on 64-bit
+)
+
+type gcSweepBlock struct {
+	spans [gcSweepBlockEntries]*mspan
+}
+
+// push adds span s to buffer b. push is safe to call concurrently
+// with other push operations, but NOT to call concurrently with pop.
+func (b *gcSweepBuf) push(s *mspan) {
+	// Obtain our slot.
+	cursor := uintptr(atomic.Xadd(&b.index, +1) - 1)
+	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
+
+	// Do we need to add a block?
+	spineLen := atomic.Loaduintptr(&b.spineLen)
+	var block *gcSweepBlock
+retry:
+	if top < spineLen {
+		spine := atomic.Loadp(unsafe.Pointer(&b.spine))
+		blockp := add(spine, sys.PtrSize*top)
+		block = (*gcSweepBlock)(atomic.Loadp(blockp))
+	} else {
+		// Add a new block to the spine, potentially growing
+		// the spine.
+		lock(&b.spineLock)
+		// spineLen cannot change until we release the lock,
+		// but may have changed while we were waiting.
+		spineLen = atomic.Loaduintptr(&b.spineLen)
+		if top < spineLen {
+			unlock(&b.spineLock)
+			goto retry
+		}
+
+		if spineLen == b.spineCap {
+			// Grow the spine.
+			newCap := b.spineCap * 2
+			if newCap == 0 {
+				newCap = gcSweepBufInitSpineCap
+			}
+			newSpine := persistentalloc(newCap*sys.PtrSize, sys.CacheLineSize, &memstats.gc_sys)
+			if b.spineCap != 0 {
+				// Blocks are allocated off-heap, so
+				// no write barriers.
+				memmove(newSpine, b.spine, b.spineCap*sys.PtrSize)
+			}
+			// Spine is allocated off-heap, so no write barrier.
+			atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine)
+			b.spineCap = newCap
+			// We can't immediately free the old spine
+			// since a concurrent push with a lower index
+			// could still be reading from it. We let it
+			// leak because even a 1TB heap would waste
+			// less than 2MB of memory on old spines. If
+			// this is a problem, we could free old spines
+			// during STW.
+		}
+
+		// Allocate a new block and add it to the spine.
+		block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), sys.CacheLineSize, &memstats.gc_sys))
+		blockp := add(b.spine, sys.PtrSize*top)
+		// Blocks are allocated off-heap, so no write barrier.
+		atomic.StorepNoWB(blockp, unsafe.Pointer(block))
+		atomic.Storeuintptr(&b.spineLen, spineLen+1)
+		unlock(&b.spineLock)
+	}
+
+	// We have a block. Insert the span.
+	block.spans[bottom] = s
+}
+
+// pop removes and returns a span from buffer b, or nil if b is empty.
+// pop is safe to call concurrently with other pop operations, but NOT
+// to call concurrently with push.
+func (b *gcSweepBuf) pop() *mspan {
+	cursor := atomic.Xadd(&b.index, -1)
+	if int32(cursor) < 0 {
+		atomic.Xadd(&b.index, +1)
+		return nil
+	}
+
+	// There are no concurrent spine or block modifications during
+	// pop, so we can omit the atomics.
+	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
+	blockp := (**gcSweepBlock)(add(b.spine, sys.PtrSize*uintptr(top)))
+	block := *blockp
+	s := block.spans[bottom]
+	// Clear the pointer for block(i).
+	block.spans[bottom] = nil
+	return s
+}
+
+// numBlocks returns the number of blocks in buffer b. numBlocks is
+// safe to call concurrently with any other operation. Spans that have
+// been pushed prior to the call to numBlocks are guaranteed to appear
+// in some block in the range [0, numBlocks()), assuming there are no
+// intervening pops. Spans that are pushed after the call may also
+// appear in these blocks.
+func (b *gcSweepBuf) numBlocks() int {
+	return int((atomic.Load(&b.index) + gcSweepBlockEntries - 1) / gcSweepBlockEntries)
+}
+
+// block returns the spans in the i'th block of buffer b. block is
+// safe to call concurrently with push.
+func (b *gcSweepBuf) block(i int) []*mspan {
+	// Perform bounds check before loading spine address since
+	// push ensures the allocated length is at least spineLen.
+	if i < 0 || uintptr(i) >= atomic.Loaduintptr(&b.spineLen) {
+		throw("block index out of range")
+	}
+
+	// Get block i.
+	spine := atomic.Loadp(unsafe.Pointer(&b.spine))
+	blockp := add(spine, sys.PtrSize*uintptr(i))
+	block := (*gcSweepBlock)(atomic.Loadp(blockp))
+
+	// Slice the block if necessary.
+	cursor := uintptr(atomic.Load(&b.index))
+	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
+	var spans []*mspan
+	if uintptr(i) < top {
+		spans = block.spans[:]
+	} else {
+		spans = block.spans[:bottom]
+	}
+
+	// push may have reserved a slot but not filled it yet, so
+	// trim away unused entries.
+	for len(spans) > 0 && spans[len(spans)-1] == nil {
+		spans = spans[:len(spans)-1]
+	}
+	return spans
+}
diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go
new file mode 100644
index 0000000..5eb05a7
--- /dev/null
+++ b/libgo/go/runtime/mgcwork.go
@@ -0,0 +1,444 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const (
+	_WorkbufSize = 2048 // in bytes; larger values result in less contention
+)
+
+// Garbage collector work pool abstraction.
+//
+// This implements a producer/consumer model for pointers to grey
+// objects. A grey object is one that is marked and on a work
+// queue. A black object is marked and not on a work queue.
+//
+// Write barriers, root discovery, stack scanning, and object scanning
+// produce pointers to grey objects. Scanning consumes pointers to
+// grey objects, thus blackening them, and then scans them,
+// potentially producing new pointers to grey objects.
+
+// A wbufptr holds a workbuf*, but protects it from write barriers.
+// workbufs never live on the heap, so write barriers are unnecessary.
+// Write barriers on workbuf pointers may also be dangerous in the GC.
+//
+// TODO: Since workbuf is now go:notinheap, this isn't necessary.
+type wbufptr uintptr
+
+func wbufptrOf(w *workbuf) wbufptr {
+	return wbufptr(unsafe.Pointer(w))
+}
+
+func (wp wbufptr) ptr() *workbuf {
+	return (*workbuf)(unsafe.Pointer(wp))
+}
+
+// A gcWork provides the interface to produce and consume work for the
+// garbage collector.
+//
+// A gcWork can be used on the stack as follows:
+//
+//     (preemption must be disabled)
+//     gcw := &getg().m.p.ptr().gcw
+//     .. call gcw.put() to produce and gcw.get() to consume ..
+//     if gcBlackenPromptly {
+//         gcw.dispose()
+//     }
+//
+// It's important that any use of gcWork during the mark phase prevent
+// the garbage collector from transitioning to mark termination since
+// gcWork may locally hold GC work buffers. This can be done by
+// disabling preemption (systemstack or acquirem).
+type gcWork struct {
+	// wbuf1 and wbuf2 are the primary and secondary work buffers.
+	//
+	// This can be thought of as a stack of both work buffers'
+	// pointers concatenated. When we pop the last pointer, we
+	// shift the stack up by one work buffer by bringing in a new
+	// full buffer and discarding an empty one. When we fill both
+	// buffers, we shift the stack down by one work buffer by
+	// bringing in a new empty buffer and discarding a full one.
+	// This way we have one buffer's worth of hysteresis, which
+	// amortizes the cost of getting or putting a work buffer over
+	// at least one buffer of work and reduces contention on the
+	// global work lists.
+	//
+	// wbuf1 is always the buffer we're currently pushing to and
+	// popping from and wbuf2 is the buffer that will be discarded
+	// next.
+	//
+	// Invariant: Both wbuf1 and wbuf2 are nil or neither are.
+	wbuf1, wbuf2 wbufptr
+
+	// Bytes marked (blackened) on this gcWork. This is aggregated
+	// into work.bytesMarked by dispose.
+	bytesMarked uint64
+
+	// Scan work performed on this gcWork. This is aggregated into
+	// gcController by dispose and may also be flushed by callers.
+	scanWork int64
+}
+
+func (w *gcWork) init() {
+	w.wbuf1 = wbufptrOf(getempty())
+	wbuf2 := trygetfull()
+	if wbuf2 == nil {
+		wbuf2 = getempty()
+	}
+	w.wbuf2 = wbufptrOf(wbuf2)
+}
+
+// put enqueues a pointer for the garbage collector to trace.
+// obj must point to the beginning of a heap object or an oblet.
+//go:nowritebarrier
+func (w *gcWork) put(obj uintptr) {
+	flushed := false
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	} else if wbuf.nobj == len(wbuf.obj) {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == len(wbuf.obj) {
+			putfull(wbuf)
+			wbuf = getempty()
+			w.wbuf1 = wbufptrOf(wbuf)
+			flushed = true
+		}
+	}
+
+	wbuf.obj[wbuf.nobj] = obj
+	wbuf.nobj++
+
+	// If we put a buffer on full, let the GC controller know so
+	// it can encourage more workers to run. We delay this until
+	// the end of put so that w is in a consistent state, since
+	// enlistWorker may itself manipulate w.
+	if flushed && gcphase == _GCmark {
+		gcController.enlistWorker()
+	}
+}
+
+// putFast does a put and returns true if it can be done quickly
+// otherwise it returns false and the caller needs to call put.
+//go:nowritebarrier
+func (w *gcWork) putFast(obj uintptr) bool {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		return false
+	} else if wbuf.nobj == len(wbuf.obj) {
+		return false
+	}
+
+	wbuf.obj[wbuf.nobj] = obj
+	wbuf.nobj++
+	return true
+}
+
+// tryGet dequeues a pointer for the garbage collector to trace.
+//
+// If there are no pointers remaining in this gcWork or in the global
+// queue, tryGet returns 0.  Note that there may still be pointers in
+// other gcWork instances or other caches.
+//go:nowritebarrier
+func (w *gcWork) tryGet() uintptr {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	}
+	if wbuf.nobj == 0 {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == 0 {
+			owbuf := wbuf
+			wbuf = trygetfull()
+			if wbuf == nil {
+				return 0
+			}
+			putempty(owbuf)
+			w.wbuf1 = wbufptrOf(wbuf)
+		}
+	}
+
+	wbuf.nobj--
+	return wbuf.obj[wbuf.nobj]
+}
+
+// tryGetFast dequeues a pointer for the garbage collector to trace
+// if one is readily available. Otherwise it returns 0 and
+// the caller is expected to call tryGet().
+//go:nowritebarrier
+func (w *gcWork) tryGetFast() uintptr {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		return 0
+	}
+	if wbuf.nobj == 0 {
+		return 0
+	}
+
+	wbuf.nobj--
+	return wbuf.obj[wbuf.nobj]
+}
+
+// get dequeues a pointer for the garbage collector to trace, blocking
+// if necessary to ensure all pointers from all queues and caches have
+// been retrieved.  get returns 0 if there are no pointers remaining.
+//go:nowritebarrier
+func (w *gcWork) get() uintptr {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	}
+	if wbuf.nobj == 0 {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == 0 {
+			owbuf := wbuf
+			wbuf = getfull()
+			if wbuf == nil {
+				return 0
+			}
+			putempty(owbuf)
+			w.wbuf1 = wbufptrOf(wbuf)
+		}
+	}
+
+	// TODO: This might be a good place to add prefetch code
+
+	wbuf.nobj--
+	return wbuf.obj[wbuf.nobj]
+}
+
+// dispose returns any cached pointers to the global queue.
+// The buffers are being put on the full queue so that the
+// write barriers will not simply reacquire them before the
+// GC can inspect them. This helps reduce the mutator's
+// ability to hide pointers during the concurrent mark phase.
+//
+//go:nowritebarrier
+func (w *gcWork) dispose() {
+	if wbuf := w.wbuf1.ptr(); wbuf != nil {
+		if wbuf.nobj == 0 {
+			putempty(wbuf)
+		} else {
+			putfull(wbuf)
+		}
+		w.wbuf1 = 0
+
+		wbuf = w.wbuf2.ptr()
+		if wbuf.nobj == 0 {
+			putempty(wbuf)
+		} else {
+			putfull(wbuf)
+		}
+		w.wbuf2 = 0
+	}
+	if w.bytesMarked != 0 {
+		// dispose happens relatively infrequently. If this
+		// atomic becomes a problem, we should first try to
+		// dispose less and if necessary aggregate in a per-P
+		// counter.
+		atomic.Xadd64(&work.bytesMarked, int64(w.bytesMarked))
+		w.bytesMarked = 0
+	}
+	if w.scanWork != 0 {
+		atomic.Xaddint64(&gcController.scanWork, w.scanWork)
+		w.scanWork = 0
+	}
+}
+
+// balance moves some work that's cached in this gcWork back on the
+// global queue.
+//go:nowritebarrier
+func (w *gcWork) balance() {
+	if w.wbuf1 == 0 {
+		return
+	}
+	if wbuf := w.wbuf2.ptr(); wbuf.nobj != 0 {
+		putfull(wbuf)
+		w.wbuf2 = wbufptrOf(getempty())
+	} else if wbuf := w.wbuf1.ptr(); wbuf.nobj > 4 {
+		w.wbuf1 = wbufptrOf(handoff(wbuf))
+	} else {
+		return
+	}
+	// We flushed a buffer to the full list, so wake a worker.
+	if gcphase == _GCmark {
+		gcController.enlistWorker()
+	}
+}
+
+// empty returns true if w has no mark work available.
+//go:nowritebarrier
+func (w *gcWork) empty() bool {
+	return w.wbuf1 == 0 || (w.wbuf1.ptr().nobj == 0 && w.wbuf2.ptr().nobj == 0)
+}
+
+// Internally, the GC work pool is kept in arrays in work buffers.
+// The gcWork interface caches a work buffer until full (or empty) to
+// avoid contending on the global work buffer lists.
+
+type workbufhdr struct {
+	node lfnode // must be first
+	nobj int
+}
+
+//go:notinheap
+type workbuf struct {
+	workbufhdr
+	// account for the above fields
+	obj [(_WorkbufSize - unsafe.Sizeof(workbufhdr{})) / sys.PtrSize]uintptr
+}
+
+// workbuf factory routines. These funcs are used to manage the
+// workbufs.
+// If the GC asks for some work these are the only routines that
+// make wbufs available to the GC.
+
+func (b *workbuf) checknonempty() {
+	if b.nobj == 0 {
+		throw("workbuf is empty")
+	}
+}
+
+func (b *workbuf) checkempty() {
+	if b.nobj != 0 {
+		throw("workbuf is not empty")
+	}
+}
+
+// getempty pops an empty work buffer off the work.empty list,
+// allocating new buffers if none are available.
+//go:nowritebarrier
+func getempty() *workbuf {
+	var b *workbuf
+	if work.empty != 0 {
+		b = (*workbuf)(lfstackpop(&work.empty))
+		if b != nil {
+			b.checkempty()
+		}
+	}
+	if b == nil {
+		b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), sys.CacheLineSize, &memstats.gc_sys))
+	}
+	return b
+}
+
+// putempty puts a workbuf onto the work.empty list.
+// Upon entry this go routine owns b. The lfstackpush relinquishes ownership.
+//go:nowritebarrier
+func putempty(b *workbuf) {
+	b.checkempty()
+	lfstackpush(&work.empty, &b.node)
+}
+
+// putfull puts the workbuf on the work.full list for the GC.
+// putfull accepts partially full buffers so the GC can avoid competing
+// with the mutators for ownership of partially full buffers.
+//go:nowritebarrier
+func putfull(b *workbuf) {
+	b.checknonempty()
+	lfstackpush(&work.full, &b.node)
+}
+
+// trygetfull tries to get a full or partially empty workbuffer.
+// If one is not immediately available return nil
+//go:nowritebarrier
+func trygetfull() *workbuf {
+	b := (*workbuf)(lfstackpop(&work.full))
+	if b != nil {
+		b.checknonempty()
+		return b
+	}
+	return b
+}
+
+// Get a full work buffer off the work.full list.
+// If nothing is available wait until all the other gc helpers have
+// finished and then return nil.
+// getfull acts as a barrier for work.nproc helpers. As long as one
+// gchelper is actively marking objects it
+// may create a workbuffer that the other helpers can work on.
+// The for loop either exits when a work buffer is found
+// or when _all_ of the work.nproc GC helpers are in the loop
+// looking for work and thus not capable of creating new work.
+// This is in fact the termination condition for the STW mark
+// phase.
+//go:nowritebarrier
+func getfull() *workbuf {
+	b := (*workbuf)(lfstackpop(&work.full))
+	if b != nil {
+		b.checknonempty()
+		return b
+	}
+
+	incnwait := atomic.Xadd(&work.nwait, +1)
+	if incnwait > work.nproc {
+		println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+		throw("work.nwait > work.nproc")
+	}
+	for i := 0; ; i++ {
+		if work.full != 0 {
+			decnwait := atomic.Xadd(&work.nwait, -1)
+			if decnwait == work.nproc {
+				println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+				throw("work.nwait > work.nproc")
+			}
+			b = (*workbuf)(lfstackpop(&work.full))
+			if b != nil {
+				b.checknonempty()
+				return b
+			}
+			incnwait := atomic.Xadd(&work.nwait, +1)
+			if incnwait > work.nproc {
+				println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+				throw("work.nwait > work.nproc")
+			}
+		}
+		if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs {
+			return nil
+		}
+		_g_ := getg()
+		if i < 10 {
+			_g_.m.gcstats.nprocyield++
+			procyield(20)
+		} else if i < 20 {
+			_g_.m.gcstats.nosyield++
+			osyield()
+		} else {
+			_g_.m.gcstats.nsleep++
+			usleep(100)
+		}
+	}
+}
+
+//go:nowritebarrier
+func handoff(b *workbuf) *workbuf {
+	// Make new buffer with half of b's pointers.
+	b1 := getempty()
+	n := b.nobj / 2
+	b.nobj -= n
+	b1.nobj = n
+	memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), uintptr(n)*unsafe.Sizeof(b1.obj[0]))
+	_g_ := getg()
+	_g_.m.gcstats.nhandoff++
+	_g_.m.gcstats.nhandoffcnt += uint64(n)
+
+	// Put b on full list - let first half of b get stolen.
+	putfull(b)
+	return b1
+}
diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go
new file mode 100644
index 0000000..7262748
--- /dev/null
+++ b/libgo/go/runtime/mheap.go
@@ -0,0 +1,1427 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Page heap.
+//
+// See malloc.go for overview.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// minPhysPageSize is a lower-bound on the physical page size. The
+// true physical page size may be larger than this. In contrast,
+// sys.PhysPageSize is an upper-bound on the physical page size.
+const minPhysPageSize = 4096
+
+// Main malloc heap.
+// The heap itself is the "free[]" and "large" arrays,
+// but all the other global data is here too.
+//
+// mheap must not be heap-allocated because it contains mSpanLists,
+// which must not be heap-allocated.
+//
+//go:notinheap
+type mheap struct {
+	lock      mutex
+	free      [_MaxMHeapList]mSpanList // free lists of given length
+	freelarge mSpanList                // free lists length >= _MaxMHeapList
+	busy      [_MaxMHeapList]mSpanList // busy lists of large objects of given length
+	busylarge mSpanList                // busy lists of large objects length >= _MaxMHeapList
+	sweepgen  uint32                   // sweep generation, see comment in mspan
+	sweepdone uint32                   // all spans are swept
+
+	// allspans is a slice of all mspans ever created. Each mspan
+	// appears exactly once.
+	//
+	// The memory for allspans is manually managed and can be
+	// reallocated and move as the heap grows.
+	//
+	// In general, allspans is protected by mheap_.lock, which
+	// prevents concurrent access as well as freeing the backing
+	// store. Accesses during STW might not hold the lock, but
+	// must ensure that allocation cannot happen around the
+	// access (since that may free the backing store).
+	allspans []*mspan // all spans out there
+
+	// spans is a lookup table to map virtual address page IDs to *mspan.
+	// For allocated spans, their pages map to the span itself.
+	// For free spans, only the lowest and highest pages map to the span itself.
+	// Internal pages map to an arbitrary span.
+	// For pages that have never been allocated, spans entries are nil.
+	//
+	// This is backed by a reserved region of the address space so
+	// it can grow without moving. The memory up to len(spans) is
+	// mapped. cap(spans) indicates the total reserved memory.
+	spans []*mspan
+
+	// sweepSpans contains two mspan stacks: one of swept in-use
+	// spans, and one of unswept in-use spans. These two trade
+	// roles on each GC cycle. Since the sweepgen increases by 2
+	// on each cycle, this means the swept spans are in
+	// sweepSpans[sweepgen/2%2] and the unswept spans are in
+	// sweepSpans[1-sweepgen/2%2]. Sweeping pops spans from the
+	// unswept stack and pushes spans that are still in-use on the
+	// swept stack. Likewise, allocating an in-use span pushes it
+	// on the swept stack.
+	sweepSpans [2]gcSweepBuf
+
+	_ uint32 // align uint64 fields on 32-bit for atomics
+
+	// Proportional sweep
+	pagesInUse        uint64  // pages of spans in stats _MSpanInUse; R/W with mheap.lock
+	spanBytesAlloc    uint64  // bytes of spans allocated this cycle; updated atomically
+	pagesSwept        uint64  // pages swept this cycle; updated atomically
+	sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
+	// TODO(austin): pagesInUse should be a uintptr, but the 386
+	// compiler can't 8-byte align fields.
+
+	// Malloc stats.
+	largefree  uint64                  // bytes freed for large objects (>maxsmallsize)
+	nlargefree uint64                  // number of frees for large objects (>maxsmallsize)
+	nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
+
+	// range of addresses we might see in the heap
+	bitmap         uintptr // Points to one byte past the end of the bitmap
+	bitmap_mapped  uintptr
+	arena_start    uintptr
+	arena_used     uintptr // always mHeap_Map{Bits,Spans} before updating
+	arena_end      uintptr
+	arena_reserved bool
+
+	// central free lists for small size classes.
+	// the padding makes sure that the MCentrals are
+	// spaced CacheLineSize bytes apart, so that each MCentral.lock
+	// gets its own cache line.
+	central [_NumSizeClasses]struct {
+		mcentral mcentral
+		pad      [sys.CacheLineSize]byte
+	}
+
+	spanalloc             fixalloc // allocator for span*
+	cachealloc            fixalloc // allocator for mcache*
+	specialfinalizeralloc fixalloc // allocator for specialfinalizer*
+	specialprofilealloc   fixalloc // allocator for specialprofile*
+	speciallock           mutex    // lock for special record allocators.
+}
+
+var mheap_ mheap
+
+// An MSpan is a run of pages.
+//
+// When a MSpan is in the heap free list, state == MSpanFree
+// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span.
+//
+// When a MSpan is allocated, state == MSpanInUse or MSpanStack
+// and heapmap(i) == span for all s->start <= i < s->start+s->npages.
+
+// Every MSpan is in one doubly-linked list,
+// either one of the MHeap's free lists or one of the
+// MCentral's span lists.
+
+// An MSpan representing actual memory has state _MSpanInUse,
+// _MSpanStack, or _MSpanFree. Transitions between these states are
+// constrained as follows:
+//
+// * A span may transition from free to in-use or stack during any GC
+//   phase.
+//
+// * During sweeping (gcphase == _GCoff), a span may transition from
+//   in-use to free (as a result of sweeping) or stack to free (as a
+//   result of stacks being freed).
+//
+// * During GC (gcphase != _GCoff), a span *must not* transition from
+//   stack or in-use to free. Because concurrent GC may read a pointer
+//   and then look up its span, the span state must be monotonic.
+type mSpanState uint8
+
+const (
+	_MSpanDead  mSpanState = iota
+	_MSpanInUse            // allocated for garbage collected heap
+	_MSpanStack            // allocated for use by stack allocator
+	_MSpanFree
+)
+
+// mSpanStateNames are the names of the span states, indexed by
+// mSpanState.
+var mSpanStateNames = []string{
+	"_MSpanDead",
+	"_MSpanInUse",
+	"_MSpanStack",
+	"_MSpanFree",
+}
+
+// mSpanList heads a linked list of spans.
+//
+//go:notinheap
+type mSpanList struct {
+	first *mspan // first span in list, or nil if none
+	last  *mspan // last span in list, or nil if none
+}
+
+//go:notinheap
+type mspan struct {
+	next *mspan     // next span in list, or nil if none
+	prev *mspan     // previous span in list, or nil if none
+	list *mSpanList // For debugging. TODO: Remove.
+
+	startAddr     uintptr   // address of first byte of span aka s.base()
+	npages        uintptr   // number of pages in span
+	stackfreelist gclinkptr // list of free stacks, avoids overloading freelist
+
+	// freeindex is the slot index between 0 and nelems at which to begin scanning
+	// for the next free object in this span.
+	// Each allocation scans allocBits starting at freeindex until it encounters a 0
+	// indicating a free object. freeindex is then adjusted so that subsequent scans begin
+	// just past the the newly discovered free object.
+	//
+	// If freeindex == nelem, this span has no free objects.
+	//
+	// allocBits is a bitmap of objects in this span.
+	// If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0
+	// then object n is free;
+	// otherwise, object n is allocated. Bits starting at nelem are
+	// undefined and should never be referenced.
+	//
+	// Object n starts at address n*elemsize + (start << pageShift).
+	freeindex uintptr
+	// TODO: Look up nelems from sizeclass and remove this field if it
+	// helps performance.
+	nelems uintptr // number of object in the span.
+
+	// Cache of the allocBits at freeindex. allocCache is shifted
+	// such that the lowest bit corresponds to the bit freeindex.
+	// allocCache holds the complement of allocBits, thus allowing
+	// ctz (count trailing zero) to use it directly.
+	// allocCache may contain bits beyond s.nelems; the caller must ignore
+	// these.
+	allocCache uint64
+
+	// allocBits and gcmarkBits hold pointers to a span's mark and
+	// allocation bits. The pointers are 8 byte aligned.
+	// There are three arenas where this data is held.
+	// free: Dirty arenas that are no longer accessed
+	//       and can be reused.
+	// next: Holds information to be used in the next GC cycle.
+	// current: Information being used during this GC cycle.
+	// previous: Information being used during the last GC cycle.
+	// A new GC cycle starts with the call to finishsweep_m.
+	// finishsweep_m moves the previous arena to the free arena,
+	// the current arena to the previous arena, and
+	// the next arena to the current arena.
+	// The next arena is populated as the spans request
+	// memory to hold gcmarkBits for the next GC cycle as well
+	// as allocBits for newly allocated spans.
+	//
+	// The pointer arithmetic is done "by hand" instead of using
+	// arrays to avoid bounds checks along critical performance
+	// paths.
+	// The sweep will free the old allocBits and set allocBits to the
+	// gcmarkBits. The gcmarkBits are replaced with a fresh zeroed
+	// out memory.
+	allocBits  *uint8
+	gcmarkBits *uint8
+
+	// sweep generation:
+	// if sweepgen == h->sweepgen - 2, the span needs sweeping
+	// if sweepgen == h->sweepgen - 1, the span is currently being swept
+	// if sweepgen == h->sweepgen, the span is swept and ready to use
+	// h->sweepgen is incremented by 2 after every GC
+
+	sweepgen    uint32
+	divMul      uint16     // for divide by elemsize - divMagic.mul
+	baseMask    uint16     // if non-0, elemsize is a power of 2, & this will get object allocation base
+	allocCount  uint16     // capacity - number of objects in freelist
+	sizeclass   uint8      // size class
+	incache     bool       // being used by an mcache
+	state       mSpanState // mspaninuse etc
+	needzero    uint8      // needs to be zeroed before allocation
+	divShift    uint8      // for divide by elemsize - divMagic.shift
+	divShift2   uint8      // for divide by elemsize - divMagic.shift2
+	elemsize    uintptr    // computed from sizeclass or from npages
+	unusedsince int64      // first time spotted by gc in mspanfree state
+	npreleased  uintptr    // number of pages released to the os
+	limit       uintptr    // end of data in span
+	speciallock mutex      // guards specials list
+	specials    *special   // linked list of special records sorted by offset.
+}
+
+func (s *mspan) base() uintptr {
+	return s.startAddr
+}
+
+func (s *mspan) layout() (size, n, total uintptr) {
+	total = s.npages << _PageShift
+	size = s.elemsize
+	if size > 0 {
+		n = total / size
+	}
+	return
+}
+
+func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
+	h := (*mheap)(vh)
+	s := (*mspan)(p)
+	if len(h.allspans) >= cap(h.allspans) {
+		n := 64 * 1024 / sys.PtrSize
+		if n < cap(h.allspans)*3/2 {
+			n = cap(h.allspans) * 3 / 2
+		}
+		var new []*mspan
+		sp := (*slice)(unsafe.Pointer(&new))
+		sp.array = sysAlloc(uintptr(n)*sys.PtrSize, &memstats.other_sys)
+		if sp.array == nil {
+			throw("runtime: cannot allocate memory")
+		}
+		sp.len = len(h.allspans)
+		sp.cap = n
+		if len(h.allspans) > 0 {
+			copy(new, h.allspans)
+		}
+		oldAllspans := h.allspans
+		h.allspans = new
+		if len(oldAllspans) != 0 {
+			sysFree(unsafe.Pointer(&oldAllspans[0]), uintptr(cap(oldAllspans))*unsafe.Sizeof(oldAllspans[0]), &memstats.other_sys)
+		}
+	}
+	h.allspans = append(h.allspans, s)
+}
+
+// inheap reports whether b is a pointer into a (potentially dead) heap object.
+// It returns false for pointers into stack spans.
+// Non-preemptible because it is used by write barriers.
+//go:nowritebarrier
+//go:nosplit
+func inheap(b uintptr) bool {
+	if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
+		return false
+	}
+	// Not a beginning of a block, consult span table to find the block beginning.
+	s := mheap_.spans[(b-mheap_.arena_start)>>_PageShift]
+	if s == nil || b < s.base() || b >= s.limit || s.state != mSpanInUse {
+		return false
+	}
+	return true
+}
+
+// inHeapOrStack is a variant of inheap that returns true for pointers into stack spans.
+//go:nowritebarrier
+//go:nosplit
+func inHeapOrStack(b uintptr) bool {
+	if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
+		return false
+	}
+	// Not a beginning of a block, consult span table to find the block beginning.
+	s := mheap_.spans[(b-mheap_.arena_start)>>_PageShift]
+	if s == nil || b < s.base() {
+		return false
+	}
+	switch s.state {
+	case mSpanInUse:
+		return b < s.limit
+	case _MSpanStack:
+		return b < s.base()+s.npages<<_PageShift
+	default:
+		return false
+	}
+}
+
+// TODO: spanOf and spanOfUnchecked are open-coded in a lot of places.
+// Use the functions instead.
+
+// spanOf returns the span of p. If p does not point into the heap or
+// no span contains p, spanOf returns nil.
+func spanOf(p uintptr) *mspan {
+	if p == 0 || p < mheap_.arena_start || p >= mheap_.arena_used {
+		return nil
+	}
+	return spanOfUnchecked(p)
+}
+
+// spanOfUnchecked is equivalent to spanOf, but the caller must ensure
+// that p points into the heap (that is, mheap_.arena_start <= p <
+// mheap_.arena_used).
+func spanOfUnchecked(p uintptr) *mspan {
+	return mheap_.spans[(p-mheap_.arena_start)>>_PageShift]
+}
+
+func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 {
+	_g_ := getg()
+
+	_g_.m.mcache.local_nlookup++
+	if sys.PtrSize == 4 && _g_.m.mcache.local_nlookup >= 1<<30 {
+		// purge cache stats to prevent overflow
+		lock(&mheap_.lock)
+		purgecachedstats(_g_.m.mcache)
+		unlock(&mheap_.lock)
+	}
+
+	s := mheap_.lookupMaybe(unsafe.Pointer(v))
+	if sp != nil {
+		*sp = s
+	}
+	if s == nil {
+		if base != nil {
+			*base = 0
+		}
+		if size != nil {
+			*size = 0
+		}
+		return 0
+	}
+
+	p := s.base()
+	if s.sizeclass == 0 {
+		// Large object.
+		if base != nil {
+			*base = p
+		}
+		if size != nil {
+			*size = s.npages << _PageShift
+		}
+		return 1
+	}
+
+	n := s.elemsize
+	if base != nil {
+		i := (v - p) / n
+		*base = p + i*n
+	}
+	if size != nil {
+		*size = n
+	}
+
+	return 1
+}
+
+// Initialize the heap.
+func (h *mheap) init(spansStart, spansBytes uintptr) {
+	h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
+	h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
+	h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
+	h.specialprofilealloc.init(unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys)
+
+	// Don't zero mspan allocations. Background sweeping can
+	// inspect a span concurrently with allocating it, so it's
+	// important that the span's sweepgen survive across freeing
+	// and re-allocating a span to prevent background sweeping
+	// from improperly cas'ing it from 0.
+	//
+	// This is safe because mspan contains no heap pointers.
+	h.spanalloc.zero = false
+
+	// h->mapcache needs no init
+	for i := range h.free {
+		h.free[i].init()
+		h.busy[i].init()
+	}
+
+	h.freelarge.init()
+	h.busylarge.init()
+	for i := range h.central {
+		h.central[i].mcentral.init(int32(i))
+	}
+
+	sp := (*slice)(unsafe.Pointer(&h.spans))
+	sp.array = unsafe.Pointer(spansStart)
+	sp.len = 0
+	sp.cap = int(spansBytes / sys.PtrSize)
+}
+
+// mHeap_MapSpans makes sure that the spans are mapped
+// up to the new value of arena_used.
+//
+// It must be called with the expected new value of arena_used,
+// *before* h.arena_used has been updated.
+// Waiting to update arena_used until after the memory has been mapped
+// avoids faults when other threads try access the bitmap immediately
+// after observing the change to arena_used.
+func (h *mheap) mapSpans(arena_used uintptr) {
+	// Map spans array, PageSize at a time.
+	n := arena_used
+	n -= h.arena_start
+	n = n / _PageSize * sys.PtrSize
+	n = round(n, physPageSize)
+	need := n / unsafe.Sizeof(h.spans[0])
+	have := uintptr(len(h.spans))
+	if have >= need {
+		return
+	}
+	h.spans = h.spans[:need]
+	sysMap(unsafe.Pointer(&h.spans[have]), (need-have)*unsafe.Sizeof(h.spans[0]), h.arena_reserved, &memstats.other_sys)
+}
+
+// Sweeps spans in list until reclaims at least npages into heap.
+// Returns the actual number of pages reclaimed.
+func (h *mheap) reclaimList(list *mSpanList, npages uintptr) uintptr {
+	n := uintptr(0)
+	sg := mheap_.sweepgen
+retry:
+	for s := list.first; s != nil; s = s.next {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			list.remove(s)
+			// swept spans are at the end of the list
+			list.insertBack(s)
+			unlock(&h.lock)
+			snpages := s.npages
+			if s.sweep(false) {
+				n += snpages
+			}
+			lock(&h.lock)
+			if n >= npages {
+				return n
+			}
+			// the span could have been moved elsewhere
+			goto retry
+		}
+		if s.sweepgen == sg-1 {
+			// the span is being sweept by background sweeper, skip
+			continue
+		}
+		// already swept empty span,
+		// all subsequent ones must also be either swept or in process of sweeping
+		break
+	}
+	return n
+}
+
+// Sweeps and reclaims at least npage pages into heap.
+// Called before allocating npage pages.
+func (h *mheap) reclaim(npage uintptr) {
+	// First try to sweep busy spans with large objects of size >= npage,
+	// this has good chances of reclaiming the necessary space.
+	for i := int(npage); i < len(h.busy); i++ {
+		if h.reclaimList(&h.busy[i], npage) != 0 {
+			return // Bingo!
+		}
+	}
+
+	// Then -- even larger objects.
+	if h.reclaimList(&h.busylarge, npage) != 0 {
+		return // Bingo!
+	}
+
+	// Now try smaller objects.
+	// One such object is not enough, so we need to reclaim several of them.
+	reclaimed := uintptr(0)
+	for i := 0; i < int(npage) && i < len(h.busy); i++ {
+		reclaimed += h.reclaimList(&h.busy[i], npage-reclaimed)
+		if reclaimed >= npage {
+			return
+		}
+	}
+
+	// Now sweep everything that is not yet swept.
+	unlock(&h.lock)
+	for {
+		n := sweepone()
+		if n == ^uintptr(0) { // all spans are swept
+			break
+		}
+		reclaimed += n
+		if reclaimed >= npage {
+			break
+		}
+	}
+	lock(&h.lock)
+}
+
+// Allocate a new span of npage pages from the heap for GC'd memory
+// and record its size class in the HeapMap and HeapMapCache.
+func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
+	_g_ := getg()
+	lock(&h.lock)
+
+	// To prevent excessive heap growth, before allocating n pages
+	// we need to sweep and reclaim at least n pages.
+	if h.sweepdone == 0 {
+		// TODO(austin): This tends to sweep a large number of
+		// spans in order to find a few completely free spans
+		// (for example, in the garbage benchmark, this sweeps
+		// ~30x the number of pages its trying to allocate).
+		// If GC kept a bit for whether there were any marks
+		// in a span, we could release these free spans
+		// at the end of GC and eliminate this entirely.
+		h.reclaim(npage)
+	}
+
+	// transfer stats from cache to global
+	memstats.heap_scan += uint64(_g_.m.mcache.local_scan)
+	_g_.m.mcache.local_scan = 0
+	memstats.tinyallocs += uint64(_g_.m.mcache.local_tinyallocs)
+	_g_.m.mcache.local_tinyallocs = 0
+
+	s := h.allocSpanLocked(npage)
+	if s != nil {
+		// Record span info, because gc needs to be
+		// able to map interior pointer to containing span.
+		atomic.Store(&s.sweepgen, h.sweepgen)
+		h.sweepSpans[h.sweepgen/2%2].push(s) // Add to swept in-use list.
+		s.state = _MSpanInUse
+		s.allocCount = 0
+		s.sizeclass = uint8(sizeclass)
+		if sizeclass == 0 {
+			s.elemsize = s.npages << _PageShift
+			s.divShift = 0
+			s.divMul = 0
+			s.divShift2 = 0
+			s.baseMask = 0
+		} else {
+			s.elemsize = uintptr(class_to_size[sizeclass])
+			m := &class_to_divmagic[sizeclass]
+			s.divShift = m.shift
+			s.divMul = m.mul
+			s.divShift2 = m.shift2
+			s.baseMask = m.baseMask
+		}
+
+		// update stats, sweep lists
+		h.pagesInUse += uint64(npage)
+		if large {
+			memstats.heap_objects++
+			atomic.Xadd64(&memstats.heap_live, int64(npage<<_PageShift))
+			// Swept spans are at the end of lists.
+			if s.npages < uintptr(len(h.free)) {
+				h.busy[s.npages].insertBack(s)
+			} else {
+				h.busylarge.insertBack(s)
+			}
+		}
+	}
+	// heap_scan and heap_live were updated.
+	if gcBlackenEnabled != 0 {
+		gcController.revise()
+	}
+
+	if trace.enabled {
+		traceHeapAlloc()
+	}
+
+	// h.spans is accessed concurrently without synchronization
+	// from other threads. Hence, there must be a store/store
+	// barrier here to ensure the writes to h.spans above happen
+	// before the caller can publish a pointer p to an object
+	// allocated from s. As soon as this happens, the garbage
+	// collector running on another processor could read p and
+	// look up s in h.spans. The unlock acts as the barrier to
+	// order these writes. On the read side, the data dependency
+	// between p and the index in h.spans orders the reads.
+	unlock(&h.lock)
+	return s
+}
+
+func (h *mheap) alloc(npage uintptr, sizeclass int32, large bool, needzero bool) *mspan {
+	// Don't do any operations that lock the heap on the G stack.
+	// It might trigger stack growth, and the stack growth code needs
+	// to be able to allocate heap.
+	var s *mspan
+	systemstack(func() {
+		s = h.alloc_m(npage, sizeclass, large)
+	})
+
+	if s != nil {
+		if needzero && s.needzero != 0 {
+			memclrNoHeapPointers(unsafe.Pointer(s.base()), s.npages<<_PageShift)
+		}
+		s.needzero = 0
+	}
+	return s
+}
+
+func (h *mheap) allocStack(npage uintptr) *mspan {
+	_g_ := getg()
+	if _g_ != _g_.m.g0 {
+		throw("mheap_allocstack not on g0 stack")
+	}
+	lock(&h.lock)
+	s := h.allocSpanLocked(npage)
+	if s != nil {
+		s.state = _MSpanStack
+		s.stackfreelist = 0
+		s.allocCount = 0
+		memstats.stacks_inuse += uint64(s.npages << _PageShift)
+	}
+
+	// This unlock acts as a release barrier. See mHeap_Alloc_m.
+	unlock(&h.lock)
+	return s
+}
+
+// Allocates a span of the given size.  h must be locked.
+// The returned span has been removed from the
+// free list, but its state is still MSpanFree.
+func (h *mheap) allocSpanLocked(npage uintptr) *mspan {
+	var list *mSpanList
+	var s *mspan
+
+	// Try in fixed-size lists up to max.
+	for i := int(npage); i < len(h.free); i++ {
+		list = &h.free[i]
+		if !list.isEmpty() {
+			s = list.first
+			goto HaveSpan
+		}
+	}
+
+	// Best fit in list of large spans.
+	list = &h.freelarge
+	s = h.allocLarge(npage)
+	if s == nil {
+		if !h.grow(npage) {
+			return nil
+		}
+		s = h.allocLarge(npage)
+		if s == nil {
+			return nil
+		}
+	}
+
+HaveSpan:
+	// Mark span in use.
+	if s.state != _MSpanFree {
+		throw("MHeap_AllocLocked - MSpan not free")
+	}
+	if s.npages < npage {
+		throw("MHeap_AllocLocked - bad npages")
+	}
+	list.remove(s)
+	if s.inList() {
+		throw("still in list")
+	}
+	if s.npreleased > 0 {
+		sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift)
+		memstats.heap_released -= uint64(s.npreleased << _PageShift)
+		s.npreleased = 0
+	}
+
+	if s.npages > npage {
+		// Trim extra and put it back in the heap.
+		t := (*mspan)(h.spanalloc.alloc())
+		t.init(s.base()+npage<<_PageShift, s.npages-npage)
+		s.npages = npage
+		p := (t.base() - h.arena_start) >> _PageShift
+		if p > 0 {
+			h.spans[p-1] = s
+		}
+		h.spans[p] = t
+		h.spans[p+t.npages-1] = t
+		t.needzero = s.needzero
+		s.state = _MSpanStack // prevent coalescing with s
+		t.state = _MSpanStack
+		h.freeSpanLocked(t, false, false, s.unusedsince)
+		s.state = _MSpanFree
+	}
+	s.unusedsince = 0
+
+	p := (s.base() - h.arena_start) >> _PageShift
+	for n := uintptr(0); n < npage; n++ {
+		h.spans[p+n] = s
+	}
+
+	memstats.heap_inuse += uint64(npage << _PageShift)
+	memstats.heap_idle -= uint64(npage << _PageShift)
+
+	//println("spanalloc", hex(s.start<<_PageShift))
+	if s.inList() {
+		throw("still in list")
+	}
+	return s
+}
+
+// Allocate a span of exactly npage pages from the list of large spans.
+func (h *mheap) allocLarge(npage uintptr) *mspan {
+	return bestFit(&h.freelarge, npage, nil)
+}
+
+// Search list for smallest span with >= npage pages.
+// If there are multiple smallest spans, take the one
+// with the earliest starting address.
+func bestFit(list *mSpanList, npage uintptr, best *mspan) *mspan {
+	for s := list.first; s != nil; s = s.next {
+		if s.npages < npage {
+			continue
+		}
+		if best == nil || s.npages < best.npages || (s.npages == best.npages && s.base() < best.base()) {
+			best = s
+		}
+	}
+	return best
+}
+
+// Try to add at least npage pages of memory to the heap,
+// returning whether it worked.
+//
+// h must be locked.
+func (h *mheap) grow(npage uintptr) bool {
+	// Ask for a big chunk, to reduce the number of mappings
+	// the operating system needs to track; also amortizes
+	// the overhead of an operating system mapping.
+	// Allocate a multiple of 64kB.
+	npage = round(npage, (64<<10)/_PageSize)
+	ask := npage << _PageShift
+	if ask < _HeapAllocChunk {
+		ask = _HeapAllocChunk
+	}
+
+	v := h.sysAlloc(ask)
+	if v == nil {
+		if ask > npage<<_PageShift {
+			ask = npage << _PageShift
+			v = h.sysAlloc(ask)
+		}
+		if v == nil {
+			print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n")
+			return false
+		}
+	}
+
+	// Create a fake "in use" span and free it, so that the
+	// right coalescing happens.
+	s := (*mspan)(h.spanalloc.alloc())
+	s.init(uintptr(v), ask>>_PageShift)
+	p := (s.base() - h.arena_start) >> _PageShift
+	for i := p; i < p+s.npages; i++ {
+		h.spans[i] = s
+	}
+	atomic.Store(&s.sweepgen, h.sweepgen)
+	s.state = _MSpanInUse
+	h.pagesInUse += uint64(s.npages)
+	h.freeSpanLocked(s, false, true, 0)
+	return true
+}
+
+// Look up the span at the given address.
+// Address is guaranteed to be in map
+// and is guaranteed to be start or end of span.
+func (h *mheap) lookup(v unsafe.Pointer) *mspan {
+	p := uintptr(v)
+	p -= h.arena_start
+	return h.spans[p>>_PageShift]
+}
+
+// Look up the span at the given address.
+// Address is *not* guaranteed to be in map
+// and may be anywhere in the span.
+// Map entries for the middle of a span are only
+// valid for allocated spans. Free spans may have
+// other garbage in their middles, so we have to
+// check for that.
+func (h *mheap) lookupMaybe(v unsafe.Pointer) *mspan {
+	if uintptr(v) < h.arena_start || uintptr(v) >= h.arena_used {
+		return nil
+	}
+	s := h.spans[(uintptr(v)-h.arena_start)>>_PageShift]
+	if s == nil || uintptr(v) < s.base() || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != _MSpanInUse {
+		return nil
+	}
+	return s
+}
+
+// Free the span back into the heap.
+func (h *mheap) freeSpan(s *mspan, acct int32) {
+	systemstack(func() {
+		mp := getg().m
+		lock(&h.lock)
+		memstats.heap_scan += uint64(mp.mcache.local_scan)
+		mp.mcache.local_scan = 0
+		memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs)
+		mp.mcache.local_tinyallocs = 0
+		if msanenabled {
+			// Tell msan that this entire span is no longer in use.
+			base := unsafe.Pointer(s.base())
+			bytes := s.npages << _PageShift
+			msanfree(base, bytes)
+		}
+		if acct != 0 {
+			memstats.heap_objects--
+		}
+		if gcBlackenEnabled != 0 {
+			// heap_scan changed.
+			gcController.revise()
+		}
+		h.freeSpanLocked(s, true, true, 0)
+		unlock(&h.lock)
+	})
+}
+
+func (h *mheap) freeStack(s *mspan) {
+	_g_ := getg()
+	if _g_ != _g_.m.g0 {
+		throw("mheap_freestack not on g0 stack")
+	}
+	s.needzero = 1
+	lock(&h.lock)
+	memstats.stacks_inuse -= uint64(s.npages << _PageShift)
+	h.freeSpanLocked(s, true, true, 0)
+	unlock(&h.lock)
+}
+
+// s must be on a busy list (h.busy or h.busylarge) or unlinked.
+func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince int64) {
+	switch s.state {
+	case _MSpanStack:
+		if s.allocCount != 0 {
+			throw("MHeap_FreeSpanLocked - invalid stack free")
+		}
+	case _MSpanInUse:
+		if s.allocCount != 0 || s.sweepgen != h.sweepgen {
+			print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n")
+			throw("MHeap_FreeSpanLocked - invalid free")
+		}
+		h.pagesInUse -= uint64(s.npages)
+	default:
+		throw("MHeap_FreeSpanLocked - invalid span state")
+	}
+
+	if acctinuse {
+		memstats.heap_inuse -= uint64(s.npages << _PageShift)
+	}
+	if acctidle {
+		memstats.heap_idle += uint64(s.npages << _PageShift)
+	}
+	s.state = _MSpanFree
+	if s.inList() {
+		h.busyList(s.npages).remove(s)
+	}
+
+	// Stamp newly unused spans. The scavenger will use that
+	// info to potentially give back some pages to the OS.
+	s.unusedsince = unusedsince
+	if unusedsince == 0 {
+		s.unusedsince = nanotime()
+	}
+	s.npreleased = 0
+
+	// Coalesce with earlier, later spans.
+	p := (s.base() - h.arena_start) >> _PageShift
+	if p > 0 {
+		t := h.spans[p-1]
+		if t != nil && t.state == _MSpanFree {
+			s.startAddr = t.startAddr
+			s.npages += t.npages
+			s.npreleased = t.npreleased // absorb released pages
+			s.needzero |= t.needzero
+			p -= t.npages
+			h.spans[p] = s
+			h.freeList(t.npages).remove(t)
+			t.state = _MSpanDead
+			h.spanalloc.free(unsafe.Pointer(t))
+		}
+	}
+	if (p + s.npages) < uintptr(len(h.spans)) {
+		t := h.spans[p+s.npages]
+		if t != nil && t.state == _MSpanFree {
+			s.npages += t.npages
+			s.npreleased += t.npreleased
+			s.needzero |= t.needzero
+			h.spans[p+s.npages-1] = s
+			h.freeList(t.npages).remove(t)
+			t.state = _MSpanDead
+			h.spanalloc.free(unsafe.Pointer(t))
+		}
+	}
+
+	// Insert s into appropriate list.
+	h.freeList(s.npages).insert(s)
+}
+
+func (h *mheap) freeList(npages uintptr) *mSpanList {
+	if npages < uintptr(len(h.free)) {
+		return &h.free[npages]
+	}
+	return &h.freelarge
+}
+
+func (h *mheap) busyList(npages uintptr) *mSpanList {
+	if npages < uintptr(len(h.free)) {
+		return &h.busy[npages]
+	}
+	return &h.busylarge
+}
+
+func scavengelist(list *mSpanList, now, limit uint64) uintptr {
+	if list.isEmpty() {
+		return 0
+	}
+
+	var sumreleased uintptr
+	for s := list.first; s != nil; s = s.next {
+		if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages {
+			start := s.base()
+			end := start + s.npages<<_PageShift
+			if physPageSize > _PageSize {
+				// We can only release pages in
+				// physPageSize blocks, so round start
+				// and end in. (Otherwise, madvise
+				// will round them *out* and release
+				// more memory than we want.)
+				start = (start + physPageSize - 1) &^ (physPageSize - 1)
+				end &^= physPageSize - 1
+				if end <= start {
+					// start and end don't span a
+					// whole physical page.
+					continue
+				}
+			}
+			len := end - start
+
+			released := len - (s.npreleased << _PageShift)
+			if physPageSize > _PageSize && released == 0 {
+				continue
+			}
+			memstats.heap_released += uint64(released)
+			sumreleased += released
+			s.npreleased = len >> _PageShift
+			sysUnused(unsafe.Pointer(start), len)
+		}
+	}
+	return sumreleased
+}
+
+func (h *mheap) scavenge(k int32, now, limit uint64) {
+	lock(&h.lock)
+	var sumreleased uintptr
+	for i := 0; i < len(h.free); i++ {
+		sumreleased += scavengelist(&h.free[i], now, limit)
+	}
+	sumreleased += scavengelist(&h.freelarge, now, limit)
+	unlock(&h.lock)
+
+	if debug.gctrace > 0 {
+		if sumreleased > 0 {
+			print("scvg", k, ": ", sumreleased>>20, " MB released\n")
+		}
+		// TODO(dvyukov): these stats are incorrect as we don't subtract stack usage from heap.
+		// But we can't call ReadMemStats on g0 holding locks.
+		print("scvg", k, ": inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
+	}
+}
+
+//go:linkname runtime_debug_freeOSMemory runtime_debug.freeOSMemory
+func runtime_debug_freeOSMemory() {
+	gcStart(gcForceBlockMode, false)
+	systemstack(func() { mheap_.scavenge(-1, ^uint64(0), 0) })
+}
+
+// Initialize a new span with the given start and npages.
+func (span *mspan) init(base uintptr, npages uintptr) {
+	// span is *not* zeroed.
+	span.next = nil
+	span.prev = nil
+	span.list = nil
+	span.startAddr = base
+	span.npages = npages
+	span.allocCount = 0
+	span.sizeclass = 0
+	span.incache = false
+	span.elemsize = 0
+	span.state = _MSpanDead
+	span.unusedsince = 0
+	span.npreleased = 0
+	span.speciallock.key = 0
+	span.specials = nil
+	span.needzero = 0
+	span.freeindex = 0
+	span.allocBits = nil
+	span.gcmarkBits = nil
+}
+
+func (span *mspan) inList() bool {
+	return span.list != nil
+}
+
+// Initialize an empty doubly-linked list.
+func (list *mSpanList) init() {
+	list.first = nil
+	list.last = nil
+}
+
+func (list *mSpanList) remove(span *mspan) {
+	if span.list != list {
+		println("runtime: failed MSpanList_Remove", span, span.prev, span.list, list)
+		throw("MSpanList_Remove")
+	}
+	if list.first == span {
+		list.first = span.next
+	} else {
+		span.prev.next = span.next
+	}
+	if list.last == span {
+		list.last = span.prev
+	} else {
+		span.next.prev = span.prev
+	}
+	span.next = nil
+	span.prev = nil
+	span.list = nil
+}
+
+func (list *mSpanList) isEmpty() bool {
+	return list.first == nil
+}
+
+func (list *mSpanList) insert(span *mspan) {
+	if span.next != nil || span.prev != nil || span.list != nil {
+		println("runtime: failed MSpanList_Insert", span, span.next, span.prev, span.list)
+		throw("MSpanList_Insert")
+	}
+	span.next = list.first
+	if list.first != nil {
+		// The list contains at least one span; link it in.
+		// The last span in the list doesn't change.
+		list.first.prev = span
+	} else {
+		// The list contains no spans, so this is also the last span.
+		list.last = span
+	}
+	list.first = span
+	span.list = list
+}
+
+func (list *mSpanList) insertBack(span *mspan) {
+	if span.next != nil || span.prev != nil || span.list != nil {
+		println("failed MSpanList_InsertBack", span, span.next, span.prev, span.list)
+		throw("MSpanList_InsertBack")
+	}
+	span.prev = list.last
+	if list.last != nil {
+		// The list contains at least one span.
+		list.last.next = span
+	} else {
+		// The list contains no spans, so this is also the first span.
+		list.first = span
+	}
+	list.last = span
+	span.list = list
+}
+
+const (
+	_KindSpecialFinalizer = 1
+	_KindSpecialProfile   = 2
+	// Note: The finalizer special must be first because if we're freeing
+	// an object, a finalizer special will cause the freeing operation
+	// to abort, and we want to keep the other special records around
+	// if that happens.
+)
+
+//go:notinheap
+type special struct {
+	next   *special // linked list in span
+	offset uint16   // span offset of object
+	kind   byte     // kind of special
+}
+
+// Adds the special record s to the list of special records for
+// the object p. All fields of s should be filled in except for
+// offset & next, which this routine will fill in.
+// Returns true if the special was successfully added, false otherwise.
+// (The add will fail only if a record with the same p and s->kind
+//  already exists.)
+func addspecial(p unsafe.Pointer, s *special) bool {
+	span := mheap_.lookupMaybe(p)
+	if span == nil {
+		throw("addspecial on invalid pointer")
+	}
+
+	// Ensure that the span is swept.
+	// Sweeping accesses the specials list w/o locks, so we have
+	// to synchronize with it. And it's just much safer.
+	mp := acquirem()
+	span.ensureSwept()
+
+	offset := uintptr(p) - span.base()
+	kind := s.kind
+
+	lock(&span.speciallock)
+
+	// Find splice point, check for existing record.
+	t := &span.specials
+	for {
+		x := *t
+		if x == nil {
+			break
+		}
+		if offset == uintptr(x.offset) && kind == x.kind {
+			unlock(&span.speciallock)
+			releasem(mp)
+			return false // already exists
+		}
+		if offset < uintptr(x.offset) || (offset == uintptr(x.offset) && kind < x.kind) {
+			break
+		}
+		t = &x.next
+	}
+
+	// Splice in record, fill in offset.
+	s.offset = uint16(offset)
+	s.next = *t
+	*t = s
+	unlock(&span.speciallock)
+	releasem(mp)
+
+	return true
+}
+
+// Removes the Special record of the given kind for the object p.
+// Returns the record if the record existed, nil otherwise.
+// The caller must FixAlloc_Free the result.
+func removespecial(p unsafe.Pointer, kind uint8) *special {
+	span := mheap_.lookupMaybe(p)
+	if span == nil {
+		throw("removespecial on invalid pointer")
+	}
+
+	// Ensure that the span is swept.
+	// Sweeping accesses the specials list w/o locks, so we have
+	// to synchronize with it. And it's just much safer.
+	mp := acquirem()
+	span.ensureSwept()
+
+	offset := uintptr(p) - span.base()
+
+	lock(&span.speciallock)
+	t := &span.specials
+	for {
+		s := *t
+		if s == nil {
+			break
+		}
+		// This function is used for finalizers only, so we don't check for
+		// "interior" specials (p must be exactly equal to s->offset).
+		if offset == uintptr(s.offset) && kind == s.kind {
+			*t = s.next
+			unlock(&span.speciallock)
+			releasem(mp)
+			return s
+		}
+		t = &s.next
+	}
+	unlock(&span.speciallock)
+	releasem(mp)
+	return nil
+}
+
+// The described object has a finalizer set for it.
+//
+// specialfinalizer is allocated from non-GC'd memory, so any heap
+// pointers must be specially handled.
+//
+//go:notinheap
+type specialfinalizer struct {
+	special special
+	fn      *funcval  // May be a heap pointer.
+	ft      *functype // May be a heap pointer, but always live.
+	ot      *ptrtype  // May be a heap pointer, but always live.
+}
+
+// Adds a finalizer to the object p. Returns true if it succeeded.
+func addfinalizer(p unsafe.Pointer, f *funcval, ft *functype, ot *ptrtype) bool {
+	lock(&mheap_.speciallock)
+	s := (*specialfinalizer)(mheap_.specialfinalizeralloc.alloc())
+	unlock(&mheap_.speciallock)
+	s.special.kind = _KindSpecialFinalizer
+	s.fn = f
+	s.ft = ft
+	s.ot = ot
+	if addspecial(p, &s.special) {
+		// This is responsible for maintaining the same
+		// GC-related invariants as markrootSpans in any
+		// situation where it's possible that markrootSpans
+		// has already run but mark termination hasn't yet.
+		if gcphase != _GCoff {
+			_, base, _ := findObject(p)
+			mp := acquirem()
+			gcw := &mp.p.ptr().gcw
+			// Mark everything reachable from the object
+			// so it's retained for the finalizer.
+			scanobject(uintptr(base), gcw)
+			// Mark the finalizer itself, since the
+			// special isn't part of the GC'd heap.
+			scanblock(uintptr(unsafe.Pointer(&s.fn)), sys.PtrSize, &oneptrmask[0], gcw)
+			if gcBlackenPromptly {
+				gcw.dispose()
+			}
+			releasem(mp)
+		}
+		return true
+	}
+
+	// There was an old finalizer
+	lock(&mheap_.speciallock)
+	mheap_.specialfinalizeralloc.free(unsafe.Pointer(s))
+	unlock(&mheap_.speciallock)
+	return false
+}
+
+// Removes the finalizer (if any) from the object p.
+func removefinalizer(p unsafe.Pointer) {
+	s := (*specialfinalizer)(unsafe.Pointer(removespecial(p, _KindSpecialFinalizer)))
+	if s == nil {
+		return // there wasn't a finalizer to remove
+	}
+	lock(&mheap_.speciallock)
+	mheap_.specialfinalizeralloc.free(unsafe.Pointer(s))
+	unlock(&mheap_.speciallock)
+}
+
+// The described object is being heap profiled.
+//
+//go:notinheap
+type specialprofile struct {
+	special special
+	b       *bucket
+}
+
+// Set the heap profile bucket associated with addr to b.
+func setprofilebucket(p unsafe.Pointer, b *bucket) {
+	lock(&mheap_.speciallock)
+	s := (*specialprofile)(mheap_.specialprofilealloc.alloc())
+	unlock(&mheap_.speciallock)
+	s.special.kind = _KindSpecialProfile
+	s.b = b
+	if !addspecial(p, &s.special) {
+		throw("setprofilebucket: profile already set")
+	}
+}
+
+// Do whatever cleanup needs to be done to deallocate s. It has
+// already been unlinked from the MSpan specials list.
+func freespecial(s *special, p unsafe.Pointer, size uintptr) {
+	switch s.kind {
+	case _KindSpecialFinalizer:
+		sf := (*specialfinalizer)(unsafe.Pointer(s))
+		queuefinalizer(p, sf.fn, sf.ft, sf.ot)
+		lock(&mheap_.speciallock)
+		mheap_.specialfinalizeralloc.free(unsafe.Pointer(sf))
+		unlock(&mheap_.speciallock)
+	case _KindSpecialProfile:
+		sp := (*specialprofile)(unsafe.Pointer(s))
+		mProf_Free(sp.b, size)
+		lock(&mheap_.speciallock)
+		mheap_.specialprofilealloc.free(unsafe.Pointer(sp))
+		unlock(&mheap_.speciallock)
+	default:
+		throw("bad special kind")
+		panic("not reached")
+	}
+}
+
+const gcBitsChunkBytes = uintptr(64 << 10)
+const gcBitsHeaderBytes = unsafe.Sizeof(gcBitsHeader{})
+
+type gcBitsHeader struct {
+	free uintptr // free is the index into bits of the next free byte.
+	next uintptr // *gcBits triggers recursive type bug. (issue 14620)
+}
+
+//go:notinheap
+type gcBits struct {
+	// gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand.
+	free uintptr // free is the index into bits of the next free byte.
+	next *gcBits
+	bits [gcBitsChunkBytes - gcBitsHeaderBytes]uint8
+}
+
+var gcBitsArenas struct {
+	lock     mutex
+	free     *gcBits
+	next     *gcBits
+	current  *gcBits
+	previous *gcBits
+}
+
+// newMarkBits returns a pointer to 8 byte aligned bytes
+// to be used for a span's mark bits.
+func newMarkBits(nelems uintptr) *uint8 {
+	lock(&gcBitsArenas.lock)
+	blocksNeeded := uintptr((nelems + 63) / 64)
+	bytesNeeded := blocksNeeded * 8
+	if gcBitsArenas.next == nil ||
+		gcBitsArenas.next.free+bytesNeeded > uintptr(len(gcBits{}.bits)) {
+		// Allocate a new arena.
+		fresh := newArena()
+		fresh.next = gcBitsArenas.next
+		gcBitsArenas.next = fresh
+	}
+	if gcBitsArenas.next.free >= gcBitsChunkBytes {
+		println("runtime: gcBitsArenas.next.free=", gcBitsArenas.next.free, gcBitsChunkBytes)
+		throw("markBits overflow")
+	}
+	result := &gcBitsArenas.next.bits[gcBitsArenas.next.free]
+	gcBitsArenas.next.free += bytesNeeded
+	unlock(&gcBitsArenas.lock)
+	return result
+}
+
+// newAllocBits returns a pointer to 8 byte aligned bytes
+// to be used for this span's alloc bits.
+// newAllocBits is used to provide newly initialized spans
+// allocation bits. For spans not being initialized the
+// the mark bits are repurposed as allocation bits when
+// the span is swept.
+func newAllocBits(nelems uintptr) *uint8 {
+	return newMarkBits(nelems)
+}
+
+// nextMarkBitArenaEpoch establishes a new epoch for the arenas
+// holding the mark bits. The arenas are named relative to the
+// current GC cycle which is demarcated by the call to finishweep_m.
+//
+// All current spans have been swept.
+// During that sweep each span allocated room for its gcmarkBits in
+// gcBitsArenas.next block. gcBitsArenas.next becomes the gcBitsArenas.current
+// where the GC will mark objects and after each span is swept these bits
+// will be used to allocate objects.
+// gcBitsArenas.current becomes gcBitsArenas.previous where the span's
+// gcAllocBits live until all the spans have been swept during this GC cycle.
+// The span's sweep extinguishes all the references to gcBitsArenas.previous
+// by pointing gcAllocBits into the gcBitsArenas.current.
+// The gcBitsArenas.previous is released to the gcBitsArenas.free list.
+func nextMarkBitArenaEpoch() {
+	lock(&gcBitsArenas.lock)
+	if gcBitsArenas.previous != nil {
+		if gcBitsArenas.free == nil {
+			gcBitsArenas.free = gcBitsArenas.previous
+		} else {
+			// Find end of previous arenas.
+			last := gcBitsArenas.previous
+			for last = gcBitsArenas.previous; last.next != nil; last = last.next {
+			}
+			last.next = gcBitsArenas.free
+			gcBitsArenas.free = gcBitsArenas.previous
+		}
+	}
+	gcBitsArenas.previous = gcBitsArenas.current
+	gcBitsArenas.current = gcBitsArenas.next
+	gcBitsArenas.next = nil // newMarkBits calls newArena when needed
+	unlock(&gcBitsArenas.lock)
+}
+
+// newArena allocates and zeroes a gcBits arena.
+func newArena() *gcBits {
+	var result *gcBits
+	if gcBitsArenas.free == nil {
+		result = (*gcBits)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
+		if result == nil {
+			throw("runtime: cannot allocate memory")
+		}
+	} else {
+		result = gcBitsArenas.free
+		gcBitsArenas.free = gcBitsArenas.free.next
+		memclrNoHeapPointers(unsafe.Pointer(result), gcBitsChunkBytes)
+	}
+	result.next = nil
+	// If result.bits is not 8 byte aligned adjust index so
+	// that &result.bits[result.free] is 8 byte aligned.
+	if uintptr(unsafe.Offsetof(gcBits{}.bits))&7 == 0 {
+		result.free = 0
+	} else {
+		result.free = 8 - (uintptr(unsafe.Pointer(&result.bits[0])) & 7)
+	}
+	return result
+}
diff --git a/libgo/go/runtime/mprof.go b/libgo/go/runtime/mprof.go
index 1bfdc39..87f84a7 100644
--- a/libgo/go/runtime/mprof.go
+++ b/libgo/go/runtime/mprof.go
@@ -12,15 +12,6 @@ import (
 	"unsafe"
 )
 
-// Export temporarily for gccgo's C code to call:
-//go:linkname mProf_Malloc runtime.mProf_Malloc
-//go:linkname mProf_Free runtime.mProf_Free
-//go:linkname mProf_GC runtime.mProf_GC
-//go:linkname tracealloc runtime.tracealloc
-//go:linkname tracefree runtime.tracefree
-//go:linkname tracegc runtime.tracegc
-//go:linkname iterate_memprof runtime.iterate_memprof
-
 // NOTE(rsc): Everything here could use cas if contention became an issue.
 var proflock mutex
 
diff --git a/libgo/go/runtime/msize.go b/libgo/go/runtime/msize.go
new file mode 100644
index 0000000..438c987
--- /dev/null
+++ b/libgo/go/runtime/msize.go
@@ -0,0 +1,47 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Malloc small size classes.
+//
+// See malloc.go for overview.
+// See also mksizeclasses.go for how we decide what size classes to use.
+
+package runtime
+
+// sizeToClass(0 <= n <= MaxSmallSize) returns the size class,
+//	1 <= sizeclass < NumSizeClasses, for n.
+//	Size class 0 is reserved to mean "not small".
+//
+// The sizeToClass lookup is implemented using two arrays,
+// one mapping sizes <= 1024 to their class and one mapping
+// sizes >= 1024 and <= MaxSmallSize to their class.
+// All objects are 8-aligned, so the first array is indexed by
+// the size divided by 8 (rounded up).  Objects >= 1024 bytes
+// are 128-aligned, so the second array is indexed by the
+// size divided by 128 (rounded up).  The arrays are constants
+// in sizeclass.go generated by mksizeclass.go.
+func sizeToClass(size uint32) uint32 {
+	if size > _MaxSmallSize {
+		throw("invalid size")
+	}
+	if size > smallSizeMax-8 {
+		return uint32(size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv])
+	}
+	return uint32(size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv])
+}
+
+// Returns size of the memory block that mallocgc will allocate if you ask for the size.
+func roundupsize(size uintptr) uintptr {
+	if size < _MaxSmallSize {
+		if size <= smallSizeMax-8 {
+			return uintptr(class_to_size[size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv]])
+		} else {
+			return uintptr(class_to_size[size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]])
+		}
+	}
+	if size+_PageSize < size {
+		return size
+	}
+	return round(size, _PageSize)
+}
diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go
index 178c32c..aa3cfef 100644
--- a/libgo/go/runtime/mstats.go
+++ b/libgo/go/runtime/mstats.go
@@ -467,10 +467,7 @@ func readmemstats_m(stats *MemStats) {
 // For gccgo this is in runtime/mgc0.c.
 func updatememstats(stats *gcstats)
 
-/*
-For gccgo these are still in runtime/mgc0.c.
-
-//go:linkname readGCStats runtime/debug.readGCStats
+//go:linkname readGCStats runtime_debug.readGCStats
 func readGCStats(pauses *[]uint64) {
 	systemstack(func() {
 		readGCStats_m(pauses)
@@ -618,7 +615,6 @@ func flushmcache(i int) {
 		return
 	}
 	c.releaseAll()
-	stackcache_clear(c)
 }
 
 // flushallmcaches flushes the mcaches of all Ps.
@@ -652,8 +648,6 @@ func purgecachedstats(c *mcache) {
 	}
 }
 
-*/
-
 // Atomically increases a given *system* memory stat. We are counting on this
 // stat never overflowing a uintptr, so this function must only be used for
 // system memory stats.
diff --git a/libgo/go/runtime/netpoll.go b/libgo/go/runtime/netpoll.go
index 876eaea..8932455 100644
--- a/libgo/go/runtime/netpoll.go
+++ b/libgo/go/runtime/netpoll.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows
 
 package runtime
 
diff --git a/libgo/go/runtime/netpoll_aix.go b/libgo/go/runtime/netpoll_aix.go
new file mode 100644
index 0000000..e40dfb6
--- /dev/null
+++ b/libgo/go/runtime/netpoll_aix.go
@@ -0,0 +1,173 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+// This is based on the former libgo/runtime/netpoll_select.c implementation
+// except that it uses poll instead of select and is written in Go.
+
+// These definitions should come from sysinfo.go as they may be OS-dependent.
+// These are the definitions for the AIX operating system.
+type pollfd struct {
+	fd      int32
+	events  int16
+	revents int16
+}
+
+const _POLLIN = 0x0001
+const _POLLOUT = 0x0002
+const _POLLHUP = 0x2000
+const _POLLERR = 0x4000
+
+//extern poll
+func libc_poll(pfds *pollfd, npfds uintptr, timeout uintptr) int32
+
+//extern pipe
+func libc_pipe(fd *int32) int32
+
+//extern __go_fcntl_uintptr
+func fcntlUintptr(fd, cmd, arg uintptr) (uintptr, uintptr)
+
+func closeonexec(fd int32) {
+	fcntlUintptr(uintptr(fd), _F_SETFD, _FD_CLOEXEC)
+}
+
+var (
+	allocated int
+	pfds      []pollfd
+	mpfds     map[uintptr]*pollDesc
+	pmtx      mutex
+	rdwake    int32
+	wrwake    int32
+)
+
+func netpollinit() {
+	var p [2]int32
+
+	// Create the pipe we use to wakeup poll.
+	if err := libc_pipe(&p[0]); err < 0 {
+		throw("netpollinit: failed to create pipe")
+	}
+	rdwake = p[0]
+	wrwake = p[1]
+
+	closeonexec(rdwake)
+	closeonexec(wrwake)
+
+	// Pre-allocate array of pollfd structures for poll.
+	allocated = 128
+	pfds = make([]pollfd, allocated)
+
+	mpfds = make(map[uintptr]*pollDesc)
+}
+
+func netpollopen(fd uintptr, pd *pollDesc) int32 {
+	lock(&pmtx)
+	mpfds[fd] = pd
+	unlock(&pmtx)
+
+	// Wakeup poll.
+	b := [1]byte{0}
+	write(uintptr(wrwake), unsafe.Pointer(&b[0]), 1)
+
+	return 0
+}
+
+func netpollclose(fd uintptr) int32 {
+	lock(&pmtx)
+	delete(mpfds, fd)
+	unlock(&pmtx)
+
+	// Wakeup poll.
+	b := [1]byte{0}
+	write(uintptr(wrwake), unsafe.Pointer(&b[0]), 1)
+
+	return 0
+}
+
+func netpollarm(pd *pollDesc, mode int) {
+	throw("unused")
+}
+
+func netpoll(block bool) *g {
+	if allocated == 0 {
+		return nil
+	}
+	timeout := ^uintptr(0)
+	if !block {
+		timeout = 0
+	}
+retry:
+	lock(&pmtx)
+	npfds := len(mpfds) + 1
+	unlock(&pmtx)
+
+	if npfds > allocated {
+		for npfds > allocated {
+			allocated *= 2
+		}
+		pfds = make([]pollfd, allocated)
+	}
+
+	// Poll the read side of the pipe.
+	pfds[0].fd = rdwake
+	pfds[0].events = _POLLIN
+	lock(&pmtx)
+	// Notice that npfds may have changed since we released the lock.
+	// Just copy what we can, new descriptors will be added at next
+	// iteration.
+	i := 1
+	for fd := range mpfds {
+		if i >= allocated {
+			break
+		}
+		pfds[i].fd = int32(fd)
+		pfds[i].events = _POLLIN | _POLLOUT
+		i++
+	}
+	npfds = i
+	unlock(&pmtx)
+
+	n := libc_poll(&pfds[0], uintptr(npfds), timeout)
+	if n < 0 {
+		e := errno()
+		if e != _EINTR {
+			throw("poll failed")
+		}
+		goto retry
+	}
+	var gp guintptr
+	for i = 0; i < npfds && n > 0; i++ {
+		pfd := pfds[i]
+
+		var mode int32
+		if pfd.revents&(_POLLIN|_POLLHUP|_POLLERR) != 0 {
+			if i == 0 {
+				var b [1]byte
+				read(pfd.fd, unsafe.Pointer(&b[0]), 1)
+				n--
+				continue
+			}
+			mode += 'r'
+		}
+		if pfd.revents&(_POLLOUT|_POLLHUP|_POLLERR) != 0 {
+			mode += 'w'
+		}
+		if mode != 0 {
+			lock(&pmtx)
+			pd := mpfds[uintptr(pfd.fd)]
+			unlock(&pmtx)
+			if pd != nil {
+				netpollready(&gp, pd, mode)
+			}
+			n--
+		}
+	}
+	if block && gp == 0 {
+		goto retry
+	}
+	return gp.ptr()
+}
diff --git a/libgo/go/runtime/os_aix.go b/libgo/go/runtime/os_aix.go
new file mode 100644
index 0000000..246b9c3
--- /dev/null
+++ b/libgo/go/runtime/os_aix.go
@@ -0,0 +1,98 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+type mOS struct {
+	waitsema uintptr // semaphore for parking on locks
+}
+
+//extern malloc
+func libc_malloc(uintptr) unsafe.Pointer
+
+//go:noescape
+//extern sem_init
+func sem_init(sem *semt, pshared int32, value uint32) int32
+
+//go:noescape
+//extern sem_wait
+func sem_wait(sem *semt) int32
+
+//go:noescape
+//extern sem_post
+func sem_post(sem *semt) int32
+
+//go:noescape
+//extern sem_timedwait
+func sem_timedwait(sem *semt, timeout *timespec) int32
+
+//go:noescape
+//extern clock_gettime
+func clock_gettime(clock_id int64, timeout *timespec) int32
+
+//go:nosplit
+func semacreate(mp *m) {
+	if mp.mos.waitsema != 0 {
+		return
+	}
+
+	var sem *semt
+
+	// Call libc's malloc rather than malloc. This will
+	// allocate space on the C heap. We can't call malloc
+	// here because it could cause a deadlock.
+	sem = (*semt)(libc_malloc(unsafe.Sizeof(*sem)))
+	if sem_init(sem, 0, 0) != 0 {
+		throw("sem_init")
+	}
+	mp.mos.waitsema = uintptr(unsafe.Pointer(sem))
+}
+
+//go:nosplit
+func semasleep(ns int64) int32 {
+	_m_ := getg().m
+	if ns >= 0 {
+		const CLOCK_REALTIME int64 = 9
+		var ts timespec
+
+		if clock_gettime(CLOCK_REALTIME, &ts) != 0 {
+			throw("clock_gettime")
+		}
+		ts.tv_sec += timespec_sec_t(ns / 1000000000)
+		ts.tv_nsec += timespec_nsec_t(ns % 1000000000)
+		if ts.tv_nsec >= 1000000000 {
+			ts.tv_sec += timespec_sec_t(1)
+			ts.tv_nsec -= timespec_nsec_t(1000000000)
+		}
+
+		if sem_timedwait((*semt)(unsafe.Pointer(_m_.mos.waitsema)), &ts) != 0 {
+			err := errno()
+			if err == _ETIMEDOUT || err == _EAGAIN || err == _EINTR {
+				return -1
+			}
+			throw("sem_timedwait")
+		}
+		return 0
+	}
+	for {
+		r1 := sem_wait((*semt)(unsafe.Pointer(_m_.mos.waitsema)))
+		if r1 == 0 {
+			break
+		}
+		if errno() == _EINTR {
+			continue
+		}
+		throw("sem_wait")
+	}
+	return 0
+}
+
+//go:nosplit
+func semawakeup(mp *m) {
+	if sem_post((*semt)(unsafe.Pointer(mp.mos.waitsema))) != 0 {
+		throw("sem_post")
+	}
+}
diff --git a/libgo/go/runtime/os_gccgo.go b/libgo/go/runtime/os_gccgo.go
index a8f05a4..358a38b 100644
--- a/libgo/go/runtime/os_gccgo.go
+++ b/libgo/go/runtime/os_gccgo.go
@@ -11,6 +11,10 @@ import (
 // Temporary for C code to call:
 //go:linkname minit runtime.minit
 
+func goenvs() {
+	goenvs_unix()
+}
+
 // Called to initialize a new m (including the bootstrap m).
 // Called on the parent thread (main thread in case of bootstrap), can allocate memory.
 func mpreinit(mp *m) {
diff --git a/libgo/go/runtime/os_linux.go b/libgo/go/runtime/os_linux.go
index ad33486..e1a6a30 100644
--- a/libgo/go/runtime/os_linux.go
+++ b/libgo/go/runtime/os_linux.go
@@ -166,6 +166,3 @@ func sysauxv(auxv []uintptr) int {
 	}
 	return i / 2
 }
-
-// Temporary for gccgo until we port mem_GOOS.go.
-var addrspace_vec [1]byte
diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go
index b76bb21..aa196ae 100644
--- a/libgo/go/runtime/panic.go
+++ b/libgo/go/runtime/panic.go
@@ -97,7 +97,6 @@ func deferproc(frame *bool, pfn uintptr, arg unsafe.Pointer) {
 	n.arg = arg
 	n.retaddr = 0
 	n.makefunccanrecover = false
-	n.special = false
 }
 
 // Allocate a Defer, usually using per-P pool.
@@ -141,10 +140,6 @@ func newdefer() *_defer {
 //
 //go:nosplit
 func freedefer(d *_defer) {
-	if d.special {
-		return
-	}
-
 	// When C code calls a Go function on a non-Go thread, the
 	// deferred call to cgocallBackDone will set g to nil.
 	// Don't crash trying to put d on the free list; just let it
diff --git a/libgo/go/runtime/pprof/mprof_test.go b/libgo/go/runtime/pprof/mprof_test.go
index 079af15..5ebd46b 100644
--- a/libgo/go/runtime/pprof/mprof_test.go
+++ b/libgo/go/runtime/pprof/mprof_test.go
@@ -103,9 +103,11 @@ func TestMemoryProfiler(t *testing.T) {
 #	0x[0-9,a-f]+	runtime_pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/mprof_test.go:74
 `, memoryProfilerRun, (2<<20)*memoryProfilerRun, memoryProfilerRun, (2<<20)*memoryProfilerRun),
 
-		fmt.Sprintf(`0: 0 \[%v: %v\] @( 0x[0-9,a-f]+)+
+		// This should start with "0: 0" but gccgo's imprecise
+		// GC means that sometimes the value is not collected.
+		fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @( 0x[0-9,a-f]+)+
 #	0x[0-9,a-f]+	pprof_test\.allocateReflectTransient\+0x[0-9,a-f]+	.*/mprof_test.go:49
-`, memoryProfilerRun, (2<<20)*memoryProfilerRun),
+`, memoryProfilerRun, (2<<20)*memoryProfilerRun, memoryProfilerRun, (2<<20)*memoryProfilerRun),
 	}
 
 	for _, test := range tests {
diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go
index ea7f84e..b28e26b 100644
--- a/libgo/go/runtime/proc.go
+++ b/libgo/go/runtime/proc.go
@@ -6,61 +6,128 @@ package runtime
 
 import (
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
-// Functions temporarily called by C code.
+// Functions called by C code.
+//go:linkname main runtime.main
+//go:linkname goparkunlock runtime.goparkunlock
 //go:linkname newextram runtime.newextram
 //go:linkname acquirep runtime.acquirep
 //go:linkname releasep runtime.releasep
 //go:linkname incidlelocked runtime.incidlelocked
-//go:linkname checkdead runtime.checkdead
-//go:linkname sysmon runtime.sysmon
-//go:linkname schedtrace runtime.schedtrace
-//go:linkname allgadd runtime.allgadd
-//go:linkname mcommoninit runtime.mcommoninit
+//go:linkname schedinit runtime.schedinit
 //go:linkname ready runtime.ready
 //go:linkname gcprocs runtime.gcprocs
-//go:linkname needaddgcproc runtime.needaddgcproc
 //go:linkname stopm runtime.stopm
 //go:linkname handoffp runtime.handoffp
 //go:linkname wakep runtime.wakep
 //go:linkname stoplockedm runtime.stoplockedm
 //go:linkname schedule runtime.schedule
 //go:linkname execute runtime.execute
-//go:linkname gfput runtime.gfput
+//go:linkname goexit1 runtime.goexit1
+//go:linkname reentersyscall runtime.reentersyscall
+//go:linkname reentersyscallblock runtime.reentersyscallblock
+//go:linkname exitsyscall runtime.exitsyscall
 //go:linkname gfget runtime.gfget
-//go:linkname lockOSThread runtime.lockOSThread
-//go:linkname unlockOSThread runtime.unlockOSThread
-//go:linkname procresize runtime.procresize
 //go:linkname helpgc runtime.helpgc
-//go:linkname stopTheWorldWithSema runtime.stopTheWorldWithSema
-//go:linkname startTheWorldWithSema runtime.startTheWorldWithSema
-//go:linkname mput runtime.mput
-//go:linkname mget runtime.mget
+//go:linkname kickoff runtime.kickoff
+//go:linkname mstart1 runtime.mstart1
 //go:linkname globrunqput runtime.globrunqput
 //go:linkname pidleget runtime.pidleget
-//go:linkname runqempty runtime.runqempty
-//go:linkname runqput runtime.runqput
 
 // Function called by misc/cgo/test.
 //go:linkname lockedOSThread runtime.lockedOSThread
 
-// Functions temporarily in C that have not yet been ported.
-func allocm(*p, bool, *unsafe.Pointer, *uintptr) *m
+// C functions for thread and context management.
+func newosproc(*m)
 func malg(bool, bool, *unsafe.Pointer, *uintptr) *g
-func startm(*p, bool)
-func newm(unsafe.Pointer, *p)
-func gchelper()
-func getfingwait() bool
-func getfingwake() bool
-func wakefing() *g
-
-// C functions for ucontext management.
+func resetNewG(*g, *unsafe.Pointer, *uintptr)
 func gogo(*g)
 func setGContext()
 func makeGContext(*g, unsafe.Pointer, uintptr)
 func getTraceback(me, gp *g)
+func gtraceback(*g)
+func _cgo_notify_runtime_init_done()
+func alreadyInCallers() bool
+
+// Functions created by the compiler.
+//extern __go_init_main
+func main_init()
+
+//extern main.main
+func main_main()
+
+var buildVersion = sys.TheVersion
+
+// Goroutine scheduler
+// The scheduler's job is to distribute ready-to-run goroutines over worker threads.
+//
+// The main concepts are:
+// G - goroutine.
+// M - worker thread, or machine.
+// P - processor, a resource that is required to execute Go code.
+//     M must have an associated P to execute Go code, however it can be
+//     blocked or in a syscall w/o an associated P.
+//
+// Design doc at https://golang.org/s/go11sched.
+
+// Worker thread parking/unparking.
+// We need to balance between keeping enough running worker threads to utilize
+// available hardware parallelism and parking excessive running worker threads
+// to conserve CPU resources and power. This is not simple for two reasons:
+// (1) scheduler state is intentionally distributed (in particular, per-P work
+// queues), so it is not possible to compute global predicates on fast paths;
+// (2) for optimal thread management we would need to know the future (don't park
+// a worker thread when a new goroutine will be readied in near future).
+//
+// Three rejected approaches that would work badly:
+// 1. Centralize all scheduler state (would inhibit scalability).
+// 2. Direct goroutine handoff. That is, when we ready a new goroutine and there
+//    is a spare P, unpark a thread and handoff it the thread and the goroutine.
+//    This would lead to thread state thrashing, as the thread that readied the
+//    goroutine can be out of work the very next moment, we will need to park it.
+//    Also, it would destroy locality of computation as we want to preserve
+//    dependent goroutines on the same thread; and introduce additional latency.
+// 3. Unpark an additional thread whenever we ready a goroutine and there is an
+//    idle P, but don't do handoff. This would lead to excessive thread parking/
+//    unparking as the additional threads will instantly park without discovering
+//    any work to do.
+//
+// The current approach:
+// We unpark an additional thread when we ready a goroutine if (1) there is an
+// idle P and there are no "spinning" worker threads. A worker thread is considered
+// spinning if it is out of local work and did not find work in global run queue/
+// netpoller; the spinning state is denoted in m.spinning and in sched.nmspinning.
+// Threads unparked this way are also considered spinning; we don't do goroutine
+// handoff so such threads are out of work initially. Spinning threads do some
+// spinning looking for work in per-P run queues before parking. If a spinning
+// thread finds work it takes itself out of the spinning state and proceeds to
+// execution. If it does not find work it takes itself out of the spinning state
+// and then parks.
+// If there is at least one spinning thread (sched.nmspinning>1), we don't unpark
+// new threads when readying goroutines. To compensate for that, if the last spinning
+// thread finds work and stops spinning, it must unpark a new spinning thread.
+// This approach smooths out unjustified spikes of thread unparking,
+// but at the same time guarantees eventual maximal CPU parallelism utilization.
+//
+// The main implementation complication is that we need to be very careful during
+// spinning->non-spinning thread transition. This transition can race with submission
+// of a new goroutine, and either one part or another needs to unpark another worker
+// thread. If they both fail to do that, we can end up with semi-persistent CPU
+// underutilization. The general pattern for goroutine readying is: submit a goroutine
+// to local work queue, #StoreLoad-style memory barrier, check sched.nmspinning.
+// The general pattern for spinning->non-spinning transition is: decrement nmspinning,
+// #StoreLoad-style memory barrier, check all per-P work queues for new work.
+// Note that all this complexity does not apply to global run queue as we are not
+// sloppy about thread unparking when submitting to global queue. Also see comments
+// for nmspinning manipulation.
+
+var (
+	m0 m
+	g0 g
+)
 
 // main_init_done is a signal used by cgocallbackg that initialization
 // has been completed. It is made before _cgo_notify_runtime_init_done,
@@ -68,6 +135,159 @@ func getTraceback(me, gp *g)
 // it is closed, meaning cgocallbackg can reliably receive from it.
 var main_init_done chan bool
 
+// runtimeInitTime is the nanotime() at which the runtime started.
+var runtimeInitTime int64
+
+// Value to use for signal mask for newly created M's.
+var initSigmask sigset
+
+// The main goroutine.
+func main() {
+	g := getg()
+
+	// Max stack size is 1 GB on 64-bit, 250 MB on 32-bit.
+	// Using decimal instead of binary GB and MB because
+	// they look nicer in the stack overflow failure message.
+	if sys.PtrSize == 8 {
+		maxstacksize = 1000000000
+	} else {
+		maxstacksize = 250000000
+	}
+
+	// Record when the world started.
+	runtimeInitTime = nanotime()
+
+	systemstack(func() {
+		newm(sysmon, nil)
+	})
+
+	// Lock the main goroutine onto this, the main OS thread,
+	// during initialization. Most programs won't care, but a few
+	// do require certain calls to be made by the main thread.
+	// Those can arrange for main.main to run in the main thread
+	// by calling runtime.LockOSThread during initialization
+	// to preserve the lock.
+	lockOSThread()
+
+	if g.m != &m0 {
+		throw("runtime.main not on m0")
+	}
+
+	// Defer unlock so that runtime.Goexit during init does the unlock too.
+	needUnlock := true
+	defer func() {
+		if needUnlock {
+			unlockOSThread()
+		}
+	}()
+
+	main_init_done = make(chan bool)
+	if iscgo {
+		_cgo_notify_runtime_init_done()
+	}
+
+	fn := main_init // make an indirect call, as the linker doesn't know the address of the main package when laying down the runtime
+	fn()
+	close(main_init_done)
+
+	needUnlock = false
+	unlockOSThread()
+
+	// For gccgo we have to wait until after main is initialized
+	// to enable GC, because initializing main registers the GC roots.
+	gcenable()
+
+	if isarchive || islibrary {
+		// A program compiled with -buildmode=c-archive or c-shared
+		// has a main, but it is not executed.
+		return
+	}
+	fn = main_main // make an indirect call, as the linker doesn't know the address of the main package when laying down the runtime
+	fn()
+	if raceenabled {
+		racefini()
+	}
+
+	// Make racy client program work: if panicking on
+	// another goroutine at the same time as main returns,
+	// let the other goroutine finish printing the panic trace.
+	// Once it does, it will exit. See issue 3934.
+	if panicking != 0 {
+		gopark(nil, nil, "panicwait", traceEvGoStop, 1)
+	}
+
+	exit(0)
+	for {
+		var x *int32
+		*x = 0
+	}
+}
+
+// os_beforeExit is called from os.Exit(0).
+//go:linkname os_beforeExit os.runtime_beforeExit
+func os_beforeExit() {
+	if raceenabled {
+		racefini()
+	}
+}
+
+// start forcegc helper goroutine
+func init() {
+	go forcegchelper()
+}
+
+func forcegchelper() {
+	forcegc.g = getg()
+	for {
+		lock(&forcegc.lock)
+		if forcegc.idle != 0 {
+			throw("forcegc: phase error")
+		}
+		atomic.Store(&forcegc.idle, 1)
+		goparkunlock(&forcegc.lock, "force gc (idle)", traceEvGoBlock, 1)
+		// this goroutine is explicitly resumed by sysmon
+		if debug.gctrace > 0 {
+			println("GC forced")
+		}
+		gcStart(gcBackgroundMode, true)
+	}
+}
+
+//go:nosplit
+
+// Gosched yields the processor, allowing other goroutines to run. It does not
+// suspend the current goroutine, so execution resumes automatically.
+func Gosched() {
+	mcall(gosched_m)
+}
+
+// Puts the current goroutine into a waiting state and calls unlockf.
+// If unlockf returns false, the goroutine is resumed.
+// unlockf must not access this G's stack, as it may be moved between
+// the call to gopark and the call to unlockf.
+func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason string, traceEv byte, traceskip int) {
+	mp := acquirem()
+	gp := mp.curg
+	status := readgstatus(gp)
+	if status != _Grunning && status != _Gscanrunning {
+		throw("gopark: bad g status")
+	}
+	mp.waitlock = lock
+	mp.waitunlockf = *(*unsafe.Pointer)(unsafe.Pointer(&unlockf))
+	gp.waitreason = reason
+	mp.waittraceev = traceEv
+	mp.waittraceskip = traceskip
+	releasem(mp)
+	// can't do anything that might move the G between Ms here.
+	mcall(park_m)
+}
+
+// Puts the current goroutine into a waiting state and unlocks the lock.
+// The goroutine can be made runnable again by calling goready(gp).
+func goparkunlock(lock *mutex, reason string, traceEv byte, traceskip int) {
+	gopark(parkunlock_c, unsafe.Pointer(lock), reason, traceEv, traceskip)
+}
+
 func goready(gp *g, traceskip int) {
 	systemstack(func() {
 		ready(gp, traceskip, true)
@@ -164,12 +384,11 @@ func releaseSudog(s *sudog) {
 
 // funcPC returns the entry PC of the function f.
 // It assumes that f is a func value. Otherwise the behavior is undefined.
-// For gccgo here unless and until we port proc.go.
-// Note that this differs from the gc implementation; the gc implementation
-// adds sys.PtrSize to the address of the interface value, but GCC's
-// alias analysis decides that that can not be a reference to the second
-// field of the interface, and in some cases it drops the initialization
-// of the second field as a dead store.
+// For gccgo note that this differs from the gc implementation; the gc
+// implementation adds sys.PtrSize to the address of the interface
+// value, but GCC's alias analysis decides that that can not be a
+// reference to the second field of the interface, and in some cases
+// it drops the initialization of the second field as a dead store.
 //go:nosplit
 func funcPC(f interface{}) uintptr {
 	i := (*iface)(unsafe.Pointer(&f))
@@ -207,6 +426,62 @@ func allgadd(gp *g) {
 	unlock(&allglock)
 }
 
+const (
+	// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
+	// 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
+	_GoidCacheBatch = 16
+)
+
+// The bootstrap sequence is:
+//
+//	call osinit
+//	call schedinit
+//	make & queue new G
+//	call runtime·mstart
+//
+// The new G calls runtime·main.
+func schedinit() {
+	_m_ := &m0
+	_g_ := &g0
+	_m_.g0 = _g_
+	_m_.curg = _g_
+	_g_.m = _m_
+	setg(_g_)
+
+	sched.maxmcount = 10000
+
+	tracebackinit()
+	mallocinit()
+	mcommoninit(_g_.m)
+	alginit() // maps must not be used before this call
+
+	msigsave(_g_.m)
+	initSigmask = _g_.m.sigmask
+
+	goargs()
+	goenvs()
+	parsedebugvars()
+	gcinit()
+
+	sched.lastpoll = uint64(nanotime())
+	procs := ncpu
+	if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 {
+		procs = n
+	}
+	if procs > _MaxGomaxprocs {
+		procs = _MaxGomaxprocs
+	}
+	if procresize(procs) != nil {
+		throw("unknown runnable goroutine during bootstrap")
+	}
+
+	if buildVersion == "" {
+		// Condition should never trigger. This code just serves
+		// to ensure runtime·buildVersion is kept in the resulting binary.
+		buildVersion = "unknown"
+	}
+}
+
 func dumpgstatus(gp *g) {
 	_g_ := getg()
 	print("runtime: gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
@@ -491,6 +766,122 @@ func casgstatus(gp *g, oldval, newval uint32) {
 	}
 }
 
+// scang blocks until gp's stack has been scanned.
+// It might be scanned by scang or it might be scanned by the goroutine itself.
+// Either way, the stack scan has completed when scang returns.
+func scang(gp *g, gcw *gcWork) {
+	// Invariant; we (the caller, markroot for a specific goroutine) own gp.gcscandone.
+	// Nothing is racing with us now, but gcscandone might be set to true left over
+	// from an earlier round of stack scanning (we scan twice per GC).
+	// We use gcscandone to record whether the scan has been done during this round.
+	// It is important that the scan happens exactly once: if called twice,
+	// the installation of stack barriers will detect the double scan and die.
+
+	gp.gcscandone = false
+
+	// See http://golang.org/cl/21503 for justification of the yield delay.
+	const yieldDelay = 10 * 1000
+	var nextYield int64
+
+	// Endeavor to get gcscandone set to true,
+	// either by doing the stack scan ourselves or by coercing gp to scan itself.
+	// gp.gcscandone can transition from false to true when we're not looking
+	// (if we asked for preemption), so any time we lock the status using
+	// castogscanstatus we have to double-check that the scan is still not done.
+loop:
+	for i := 0; !gp.gcscandone; i++ {
+		switch s := readgstatus(gp); s {
+		default:
+			dumpgstatus(gp)
+			throw("stopg: invalid status")
+
+		case _Gdead:
+			// No stack.
+			gp.gcscandone = true
+			break loop
+
+		case _Gcopystack:
+		// Stack being switched. Go around again.
+
+		case _Grunnable, _Gsyscall, _Gwaiting:
+			// Claim goroutine by setting scan bit.
+			// Racing with execution or readying of gp.
+			// The scan bit keeps them from running
+			// the goroutine until we're done.
+			if castogscanstatus(gp, s, s|_Gscan) {
+				if gp.scanningself {
+					// Don't try to scan the stack
+					// if the goroutine is going to do
+					// it itself.
+					restartg(gp)
+					break
+				}
+				if !gp.gcscandone {
+					scanstack(gp, gcw)
+					gp.gcscandone = true
+				}
+				restartg(gp)
+				break loop
+			}
+
+		case _Gscanwaiting:
+			// newstack is doing a scan for us right now. Wait.
+
+		case _Gscanrunning:
+			// checkPreempt is scanning. Wait.
+
+		case _Grunning:
+			// Goroutine running. Try to preempt execution so it can scan itself.
+			// The preemption handler (in newstack) does the actual scan.
+
+			// Optimization: if there is already a pending preemption request
+			// (from the previous loop iteration), don't bother with the atomics.
+			if gp.preemptscan && gp.preempt {
+				break
+			}
+
+			// Ask for preemption and self scan.
+			if castogscanstatus(gp, _Grunning, _Gscanrunning) {
+				if !gp.gcscandone {
+					gp.preemptscan = true
+					gp.preempt = true
+				}
+				casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
+			}
+		}
+
+		if i == 0 {
+			nextYield = nanotime() + yieldDelay
+		}
+		if nanotime() < nextYield {
+			procyield(10)
+		} else {
+			osyield()
+			nextYield = nanotime() + yieldDelay/2
+		}
+	}
+
+	gp.preemptscan = false // cancel scan request if no longer needed
+}
+
+// The GC requests that this routine be moved from a scanmumble state to a mumble state.
+func restartg(gp *g) {
+	s := readgstatus(gp)
+	switch s {
+	default:
+		dumpgstatus(gp)
+		throw("restartg: unexpected status")
+
+	case _Gdead:
+	// ok
+
+	case _Gscanrunnable,
+		_Gscanwaiting,
+		_Gscansyscall:
+		casfrom_Gscanstatus(gp, s, s&^_Gscan)
+	}
+}
+
 // stopTheWorld stops all P's from executing goroutines, interrupting
 // all goroutines at GC safe points and records reason as the reason
 // for the stop. On return, only the current goroutine's P is running.
@@ -684,11 +1075,64 @@ func startTheWorldWithSema() {
 		// coordinate. This lazy approach works out in practice:
 		// we don't mind if the first couple gc rounds don't have quite
 		// the maximum number of procs.
-		newm(unsafe.Pointer(funcPC(mhelpgc)), nil)
+		newm(mhelpgc, nil)
 	}
 	_g_.m.locks--
 }
 
+// First function run by a new goroutine.
+// This is passed to makecontext.
+func kickoff() {
+	gp := getg()
+
+	if gp.traceback != nil {
+		gtraceback(gp)
+	}
+
+	fv := gp.entry
+	param := gp.param
+	gp.entry = nil
+	gp.param = nil
+	fv(param)
+	goexit1()
+}
+
+// This is called from mstart.
+func mstart1() {
+	_g_ := getg()
+
+	if _g_ != _g_.m.g0 {
+		throw("bad runtime·mstart")
+	}
+
+	asminit()
+	minit()
+
+	// Install signal handlers; after minit so that minit can
+	// prepare the thread to be able to handle the signals.
+	if _g_.m == &m0 {
+		// Create an extra M for callbacks on threads not created by Go.
+		if iscgo && !cgoHasExtraM {
+			cgoHasExtraM = true
+			newextram()
+		}
+		initsig(false)
+	}
+
+	if fn := _g_.m.mstartfn; fn != nil {
+		fn()
+	}
+
+	if _g_.m.helpgc != 0 {
+		_g_.m.helpgc = 0
+		stopm()
+	} else if _g_.m != &m0 {
+		acquirep(_g_.m.nextp.ptr())
+		_g_.m.nextp = 0
+	}
+	schedule()
+}
+
 // forEachP calls fn(p) for every P p when p reaches a GC safe point.
 // If a P is currently executing code, this will bring the P to a GC
 // safe point and execute fn on that P. If the P is not executing code
@@ -811,6 +1255,35 @@ func runSafePointFn() {
 	unlock(&sched.lock)
 }
 
+// Allocate a new m unassociated with any thread.
+// Can use p for allocation context if needed.
+// fn is recorded as the new m's m.mstartfn.
+//
+// This function is allowed to have write barriers even if the caller
+// isn't because it borrows _p_.
+//
+//go:yeswritebarrierrec
+func allocm(_p_ *p, fn func(), allocatestack bool) (mp *m, g0Stack unsafe.Pointer, g0StackSize uintptr) {
+	_g_ := getg()
+	_g_.m.locks++ // disable GC because it can be called from sysmon
+	if _g_.m.p == 0 {
+		acquirep(_p_) // temporarily borrow p for mallocs in this function
+	}
+	mp = new(m)
+	mp.mstartfn = fn
+	mcommoninit(mp)
+
+	mp.g0 = malg(allocatestack, false, &g0Stack, &g0StackSize)
+	mp.g0.m = mp
+
+	if _p_ == _g_.m.p.ptr() {
+		releasep()
+	}
+	_g_.m.locks--
+
+	return mp, g0Stack, g0StackSize
+}
+
 // needm is called when a cgo callback happens on a
 // thread without an m (a thread not created by Go).
 // In this case, needm is expected to find an m to use
@@ -884,6 +1357,7 @@ func needm(x byte) {
 	setGContext()
 
 	// Initialize this thread to use the m.
+	asminit()
 	minit()
 }
 
@@ -915,9 +1389,7 @@ func oneNewExtraM() {
 	// The sched.pc will never be returned to, but setting it to
 	// goexit makes clear to the traceback routines where
 	// the goroutine stack ends.
-	var g0SP unsafe.Pointer
-	var g0SPSize uintptr
-	mp := allocm(nil, true, &g0SP, &g0SPSize)
+	mp, g0SP, g0SPSize := allocm(nil, nil, true)
 	gp := malg(true, false, nil, nil)
 	gp.gcscanvalid = true // fresh G, so no dequeueRescan necessary
 	gp.gcscandone = true
@@ -1051,6 +1523,17 @@ func unlockextra(mp *m) {
 	atomic.Storeuintptr(&extram, uintptr(unsafe.Pointer(mp)))
 }
 
+// Create a new m. It will start off with a call to fn, or else the scheduler.
+// fn needs to be static and not a heap allocated closure.
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrierrec
+func newm(fn func(), _p_ *p) {
+	mp, _, _ := allocm(_p_, fn, false)
+	mp.nextp.set(_p_)
+	mp.sigmask = initSigmask
+	newosproc(mp)
+}
+
 // Stops execution of the current m until new work is available.
 // Returns with acquired P.
 func stopm() {
@@ -1083,6 +1566,59 @@ retry:
 	_g_.m.nextp = 0
 }
 
+func mspinning() {
+	// startm's caller incremented nmspinning. Set the new M's spinning.
+	getg().m.spinning = true
+}
+
+// Schedules some M to run the p (creates an M if necessary).
+// If p==nil, tries to get an idle P, if no idle P's does nothing.
+// May run with m.p==nil, so write barriers are not allowed.
+// If spinning is set, the caller has incremented nmspinning and startm will
+// either decrement nmspinning or set m.spinning in the newly started M.
+//go:nowritebarrierrec
+func startm(_p_ *p, spinning bool) {
+	lock(&sched.lock)
+	if _p_ == nil {
+		_p_ = pidleget()
+		if _p_ == nil {
+			unlock(&sched.lock)
+			if spinning {
+				// The caller incremented nmspinning, but there are no idle Ps,
+				// so it's okay to just undo the increment and give up.
+				if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
+					throw("startm: negative nmspinning")
+				}
+			}
+			return
+		}
+	}
+	mp := mget()
+	unlock(&sched.lock)
+	if mp == nil {
+		var fn func()
+		if spinning {
+			// The caller incremented nmspinning, so set m.spinning in the new M.
+			fn = mspinning
+		}
+		newm(fn, _p_)
+		return
+	}
+	if mp.spinning {
+		throw("startm: m is spinning")
+	}
+	if mp.nextp != 0 {
+		throw("startm: m has p")
+	}
+	if spinning && !runqempty(_p_) {
+		throw("startm: p has runnable gs")
+	}
+	// The caller incremented nmspinning, so set m.spinning in the new M.
+	mp.spinning = spinning
+	mp.nextp.set(_p_)
+	notewakeup(&mp.park)
+}
+
 // Hands off P from syscall or locked M.
 // Always runs without a P, so write barriers are not allowed.
 //go:nowritebarrierrec
@@ -1281,7 +1817,7 @@ top:
 	if _p_.runSafePointFn != 0 {
 		runSafePointFn()
 	}
-	if getfingwait() && getfingwake() {
+	if fingwait && fingwake {
 		if gp := wakefing(); gp != nil {
 			ready(gp, 0, true)
 		}
@@ -1593,6 +2129,7 @@ top:
 		// goroutines on the global queue.
 		// Since we preempt by storing the goroutine on the global
 		// queue, this is the only place we need to check preempt.
+		// This does not call checkPreempt because gp is not running.
 		if gp != nil && gp.preempt {
 			gp.preempt = false
 			lock(&sched.lock)
@@ -1636,6 +2173,442 @@ func dropg() {
 	setGNoWB(&_g_.m.curg, nil)
 }
 
+func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
+	unlock((*mutex)(lock))
+	return true
+}
+
+// park continuation on g0.
+func park_m(gp *g) {
+	_g_ := getg()
+
+	if trace.enabled {
+		traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip, gp)
+	}
+
+	casgstatus(gp, _Grunning, _Gwaiting)
+	dropg()
+
+	if _g_.m.waitunlockf != nil {
+		fn := *(*func(*g, unsafe.Pointer) bool)(unsafe.Pointer(&_g_.m.waitunlockf))
+		ok := fn(gp, _g_.m.waitlock)
+		_g_.m.waitunlockf = nil
+		_g_.m.waitlock = nil
+		if !ok {
+			if trace.enabled {
+				traceGoUnpark(gp, 2)
+			}
+			casgstatus(gp, _Gwaiting, _Grunnable)
+			execute(gp, true) // Schedule it back, never returns.
+		}
+	}
+	schedule()
+}
+
+func goschedImpl(gp *g) {
+	status := readgstatus(gp)
+	if status&^_Gscan != _Grunning {
+		dumpgstatus(gp)
+		throw("bad g status")
+	}
+	casgstatus(gp, _Grunning, _Grunnable)
+	dropg()
+	lock(&sched.lock)
+	globrunqput(gp)
+	unlock(&sched.lock)
+
+	schedule()
+}
+
+// Gosched continuation on g0.
+func gosched_m(gp *g) {
+	if trace.enabled {
+		traceGoSched()
+	}
+	goschedImpl(gp)
+}
+
+func gopreempt_m(gp *g) {
+	if trace.enabled {
+		traceGoPreempt()
+	}
+	goschedImpl(gp)
+}
+
+// Finishes execution of the current goroutine.
+func goexit1() {
+	if trace.enabled {
+		traceGoEnd()
+	}
+	mcall(goexit0)
+}
+
+// goexit continuation on g0.
+func goexit0(gp *g) {
+	_g_ := getg()
+
+	casgstatus(gp, _Grunning, _Gdead)
+	if isSystemGoroutine(gp) {
+		atomic.Xadd(&sched.ngsys, -1)
+	}
+	gp.m = nil
+	gp.lockedm = nil
+	_g_.m.lockedg = nil
+	gp.entry = nil
+	gp.paniconfault = false
+	gp._defer = nil // should be true already but just in case.
+	gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data.
+	gp.writebuf = nil
+	gp.waitreason = ""
+	gp.param = nil
+
+	// Note that gp's stack scan is now "valid" because it has no
+	// stack. We could dequeueRescan, but that takes a lock and
+	// isn't really necessary.
+	gp.gcscanvalid = true
+	dropg()
+
+	if _g_.m.locked&^_LockExternal != 0 {
+		print("invalid m->locked = ", _g_.m.locked, "\n")
+		throw("internal lockOSThread error")
+	}
+	_g_.m.locked = 0
+	gfput(_g_.m.p.ptr(), gp)
+	schedule()
+}
+
+// The goroutine g is about to enter a system call.
+// Record that it's not using the cpu anymore.
+// This is called only from the go syscall library and cgocall,
+// not from the low-level system calls used by the runtime.
+//
+// The entersyscall function is written in C, so that it can save the
+// current register context so that the GC will see them.
+// It calls reentersyscall.
+//
+// Syscall tracing:
+// At the start of a syscall we emit traceGoSysCall to capture the stack trace.
+// If the syscall does not block, that is it, we do not emit any other events.
+// If the syscall blocks (that is, P is retaken), retaker emits traceGoSysBlock;
+// when syscall returns we emit traceGoSysExit and when the goroutine starts running
+// (potentially instantly, if exitsyscallfast returns true) we emit traceGoStart.
+// To ensure that traceGoSysExit is emitted strictly after traceGoSysBlock,
+// we remember current value of syscalltick in m (_g_.m.syscalltick = _g_.m.p.ptr().syscalltick),
+// whoever emits traceGoSysBlock increments p.syscalltick afterwards;
+// and we wait for the increment before emitting traceGoSysExit.
+// Note that the increment is done even if tracing is not enabled,
+// because tracing can be enabled in the middle of syscall. We don't want the wait to hang.
+//
+//go:nosplit
+//go:noinline
+func reentersyscall(pc, sp uintptr) {
+	_g_ := getg()
+
+	// Disable preemption because during this function g is in Gsyscall status,
+	// but can have inconsistent g->sched, do not let GC observe it.
+	_g_.m.locks++
+
+	_g_.syscallsp = sp
+	_g_.syscallpc = pc
+	casgstatus(_g_, _Grunning, _Gsyscall)
+
+	if trace.enabled {
+		systemstack(traceGoSysCall)
+	}
+
+	if atomic.Load(&sched.sysmonwait) != 0 {
+		systemstack(entersyscall_sysmon)
+	}
+
+	if _g_.m.p.ptr().runSafePointFn != 0 {
+		// runSafePointFn may stack split if run on this stack
+		systemstack(runSafePointFn)
+	}
+
+	_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
+	_g_.sysblocktraced = true
+	_g_.m.mcache = nil
+	_g_.m.p.ptr().m = 0
+	atomic.Store(&_g_.m.p.ptr().status, _Psyscall)
+	if sched.gcwaiting != 0 {
+		systemstack(entersyscall_gcwait)
+	}
+
+	_g_.m.locks--
+}
+
+func entersyscall_sysmon() {
+	lock(&sched.lock)
+	if atomic.Load(&sched.sysmonwait) != 0 {
+		atomic.Store(&sched.sysmonwait, 0)
+		notewakeup(&sched.sysmonnote)
+	}
+	unlock(&sched.lock)
+}
+
+func entersyscall_gcwait() {
+	_g_ := getg()
+	_p_ := _g_.m.p.ptr()
+
+	lock(&sched.lock)
+	if sched.stopwait > 0 && atomic.Cas(&_p_.status, _Psyscall, _Pgcstop) {
+		if trace.enabled {
+			traceGoSysBlock(_p_)
+			traceProcStop(_p_)
+		}
+		_p_.syscalltick++
+		if sched.stopwait--; sched.stopwait == 0 {
+			notewakeup(&sched.stopnote)
+		}
+	}
+	unlock(&sched.lock)
+}
+
+// The same as reentersyscall(), but with a hint that the syscall is blocking.
+//go:nosplit
+func reentersyscallblock(pc, sp uintptr) {
+	_g_ := getg()
+
+	_g_.m.locks++ // see comment in entersyscall
+	_g_.throwsplit = true
+	_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
+	_g_.sysblocktraced = true
+	_g_.m.p.ptr().syscalltick++
+
+	// Leave SP around for GC and traceback.
+	_g_.syscallsp = sp
+	_g_.syscallpc = pc
+	casgstatus(_g_, _Grunning, _Gsyscall)
+	systemstack(entersyscallblock_handoff)
+
+	_g_.m.locks--
+}
+
+func entersyscallblock_handoff() {
+	if trace.enabled {
+		traceGoSysCall()
+		traceGoSysBlock(getg().m.p.ptr())
+	}
+	handoffp(releasep())
+}
+
+// The goroutine g exited its system call.
+// Arrange for it to run on a cpu again.
+// This is called only from the go syscall library, not
+// from the low-level system calls used by the runtime.
+//
+// Write barriers are not allowed because our P may have been stolen.
+//
+//go:nosplit
+//go:nowritebarrierrec
+func exitsyscall(dummy int32) {
+	_g_ := getg()
+
+	_g_.m.locks++ // see comment in entersyscall
+
+	_g_.waitsince = 0
+	oldp := _g_.m.p.ptr()
+	if exitsyscallfast() {
+		if _g_.m.mcache == nil {
+			throw("lost mcache")
+		}
+		if trace.enabled {
+			if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
+				systemstack(traceGoStart)
+			}
+		}
+		// There's a cpu for us, so we can run.
+		_g_.m.p.ptr().syscalltick++
+		// We need to cas the status and scan before resuming...
+		casgstatus(_g_, _Gsyscall, _Grunning)
+
+		exitsyscallclear(_g_)
+		_g_.m.locks--
+		_g_.throwsplit = false
+		return
+	}
+
+	_g_.sysexitticks = 0
+	if trace.enabled {
+		// Wait till traceGoSysBlock event is emitted.
+		// This ensures consistency of the trace (the goroutine is started after it is blocked).
+		for oldp != nil && oldp.syscalltick == _g_.m.syscalltick {
+			osyield()
+		}
+		// We can't trace syscall exit right now because we don't have a P.
+		// Tracing code can invoke write barriers that cannot run without a P.
+		// So instead we remember the syscall exit time and emit the event
+		// in execute when we have a P.
+		_g_.sysexitticks = cputicks()
+	}
+
+	_g_.m.locks--
+
+	// Call the scheduler.
+	mcall(exitsyscall0)
+
+	if _g_.m.mcache == nil {
+		throw("lost mcache")
+	}
+
+	// Scheduler returned, so we're allowed to run now.
+	// Delete the syscallsp information that we left for
+	// the garbage collector during the system call.
+	// Must wait until now because until gosched returns
+	// we don't know for sure that the garbage collector
+	// is not running.
+	exitsyscallclear(_g_)
+
+	_g_.m.p.ptr().syscalltick++
+	_g_.throwsplit = false
+}
+
+//go:nosplit
+func exitsyscallfast() bool {
+	_g_ := getg()
+
+	// Freezetheworld sets stopwait but does not retake P's.
+	if sched.stopwait == freezeStopWait {
+		_g_.m.mcache = nil
+		_g_.m.p = 0
+		return false
+	}
+
+	// Try to re-acquire the last P.
+	if _g_.m.p != 0 && _g_.m.p.ptr().status == _Psyscall && atomic.Cas(&_g_.m.p.ptr().status, _Psyscall, _Prunning) {
+		// There's a cpu for us, so we can run.
+		exitsyscallfast_reacquired()
+		return true
+	}
+
+	// Try to get any other idle P.
+	oldp := _g_.m.p.ptr()
+	_g_.m.mcache = nil
+	_g_.m.p = 0
+	if sched.pidle != 0 {
+		var ok bool
+		systemstack(func() {
+			ok = exitsyscallfast_pidle()
+			if ok && trace.enabled {
+				if oldp != nil {
+					// Wait till traceGoSysBlock event is emitted.
+					// This ensures consistency of the trace (the goroutine is started after it is blocked).
+					for oldp.syscalltick == _g_.m.syscalltick {
+						osyield()
+					}
+				}
+				traceGoSysExit(0)
+			}
+		})
+		if ok {
+			return true
+		}
+	}
+	return false
+}
+
+// exitsyscallfast_reacquired is the exitsyscall path on which this G
+// has successfully reacquired the P it was running on before the
+// syscall.
+//
+// This function is allowed to have write barriers because exitsyscall
+// has acquired a P at this point.
+//
+//go:yeswritebarrierrec
+//go:nosplit
+func exitsyscallfast_reacquired() {
+	_g_ := getg()
+	_g_.m.mcache = _g_.m.p.ptr().mcache
+	_g_.m.p.ptr().m.set(_g_.m)
+	if _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
+		if trace.enabled {
+			// The p was retaken and then enter into syscall again (since _g_.m.syscalltick has changed).
+			// traceGoSysBlock for this syscall was already emitted,
+			// but here we effectively retake the p from the new syscall running on the same p.
+			systemstack(func() {
+				// Denote blocking of the new syscall.
+				traceGoSysBlock(_g_.m.p.ptr())
+				// Denote completion of the current syscall.
+				traceGoSysExit(0)
+			})
+		}
+		_g_.m.p.ptr().syscalltick++
+	}
+}
+
+func exitsyscallfast_pidle() bool {
+	lock(&sched.lock)
+	_p_ := pidleget()
+	if _p_ != nil && atomic.Load(&sched.sysmonwait) != 0 {
+		atomic.Store(&sched.sysmonwait, 0)
+		notewakeup(&sched.sysmonnote)
+	}
+	unlock(&sched.lock)
+	if _p_ != nil {
+		acquirep(_p_)
+		return true
+	}
+	return false
+}
+
+// exitsyscall slow path on g0.
+// Failed to acquire P, enqueue gp as runnable.
+//
+//go:nowritebarrierrec
+func exitsyscall0(gp *g) {
+	_g_ := getg()
+
+	casgstatus(gp, _Gsyscall, _Grunnable)
+	dropg()
+	lock(&sched.lock)
+	_p_ := pidleget()
+	if _p_ == nil {
+		globrunqput(gp)
+	} else if atomic.Load(&sched.sysmonwait) != 0 {
+		atomic.Store(&sched.sysmonwait, 0)
+		notewakeup(&sched.sysmonnote)
+	}
+	unlock(&sched.lock)
+	if _p_ != nil {
+		acquirep(_p_)
+		execute(gp, false) // Never returns.
+	}
+	if _g_.m.lockedg != nil {
+		// Wait until another thread schedules gp and so m again.
+		stoplockedm()
+		execute(gp, false) // Never returns.
+	}
+	stopm()
+	schedule() // Never returns.
+}
+
+// exitsyscallclear clears GC-related information that we only track
+// during a syscall.
+func exitsyscallclear(gp *g) {
+	// Garbage collector isn't running (since we are), so okay to
+	// clear syscallsp.
+	gp.syscallsp = 0
+
+	gp.gcstack = nil
+	gp.gcnextsp = nil
+	memclrNoHeapPointers(unsafe.Pointer(&gp.gcregs), unsafe.Sizeof(gp.gcregs))
+}
+
+// Code generated by cgo, and some library code, calls syscall.Entersyscall
+// and syscall.Exitsyscall.
+
+//go:linkname syscall_entersyscall syscall.Entersyscall
+//go:nosplit
+func syscall_entersyscall() {
+	entersyscall(0)
+}
+
+//go:linkname syscall_exitsyscall syscall.Exitsyscall
+//go:nosplit
+func syscall_exitsyscall() {
+	exitsyscall(0)
+}
+
 func beforefork() {
 	gp := getg().m.curg
 
@@ -1671,6 +2644,91 @@ func syscall_runtime_AfterFork() {
 	systemstack(afterfork)
 }
 
+// Create a new g running fn passing arg as the single argument.
+// Put it on the queue of g's waiting to run.
+// The compiler turns a go statement into a call to this.
+//go:linkname newproc __go_go
+func newproc(fn uintptr, arg unsafe.Pointer) *g {
+	_g_ := getg()
+
+	if fn == 0 {
+		_g_.m.throwing = -1 // do not dump full stacks
+		throw("go of nil func value")
+	}
+	_g_.m.locks++ // disable preemption because it can be holding p in a local var
+
+	_p_ := _g_.m.p.ptr()
+	newg := gfget(_p_)
+	var (
+		sp     unsafe.Pointer
+		spsize uintptr
+	)
+	if newg == nil {
+		newg = malg(true, false, &sp, &spsize)
+		casgstatus(newg, _Gidle, _Gdead)
+		newg.gcRescan = -1
+		allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
+	} else {
+		resetNewG(newg, &sp, &spsize)
+	}
+	newg.traceback = nil
+
+	if readgstatus(newg) != _Gdead {
+		throw("newproc1: new g is not Gdead")
+	}
+
+	// Store the C function pointer into entryfn, take the address
+	// of entryfn, convert it to a Go function value, and store
+	// that in entry.
+	newg.entryfn = fn
+	var entry func(unsafe.Pointer)
+	*(*unsafe.Pointer)(unsafe.Pointer(&entry)) = unsafe.Pointer(&newg.entryfn)
+	newg.entry = entry
+
+	newg.param = arg
+	newg.gopc = getcallerpc(unsafe.Pointer(&fn))
+	newg.startpc = fn
+	if isSystemGoroutine(newg) {
+		atomic.Xadd(&sched.ngsys, +1)
+	}
+	// The stack is dirty from the argument frame, so queue it for
+	// scanning. Do this before setting it to runnable so we still
+	// own the G. If we're recycling a G, it may already be on the
+	// rescan list.
+	if newg.gcRescan == -1 {
+		queueRescan(newg)
+	} else {
+		// The recycled G is already on the rescan list. Just
+		// mark the stack dirty.
+		newg.gcscanvalid = false
+	}
+	casgstatus(newg, _Gdead, _Grunnable)
+
+	if _p_.goidcache == _p_.goidcacheend {
+		// Sched.goidgen is the last allocated id,
+		// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
+		// At startup sched.goidgen=0, so main goroutine receives goid=1.
+		_p_.goidcache = atomic.Xadd64(&sched.goidgen, _GoidCacheBatch)
+		_p_.goidcache -= _GoidCacheBatch - 1
+		_p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
+	}
+	newg.goid = int64(_p_.goidcache)
+	_p_.goidcache++
+	if trace.enabled {
+		traceGoCreate(newg, newg.startpc)
+	}
+
+	makeGContext(newg, sp, spsize)
+
+	runqput(_p_, newg, true)
+
+	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && runtimeInitTime != 0 {
+		wakep()
+	}
+	_g_.m.locks--
+	return newg
+}
+
 // Put on gfree list.
 // If local list is too long, transfer a batch to the global list.
 func gfput(_p_ *p, gp *g) {
@@ -1738,6 +2796,11 @@ func gfpurge(_p_ *p) {
 	unlock(&sched.gflock)
 }
 
+// Breakpoint executes a breakpoint trap.
+func Breakpoint() {
+	breakpoint()
+}
+
 // dolockOSThread is called by LockOSThread and lockOSThread below
 // after they modify m.locked. Do not allow preemption during this call,
 // or else the m might be different in this function than in the caller.
@@ -1822,6 +2885,152 @@ func mcount() int32 {
 	return sched.mcount
 }
 
+var prof struct {
+	lock uint32
+	hz   int32
+}
+
+func _System()       { _System() }
+func _ExternalCode() { _ExternalCode() }
+func _GC()           { _GC() }
+
+var _SystemPC = funcPC(_System)
+var _ExternalCodePC = funcPC(_ExternalCode)
+var _GCPC = funcPC(_GC)
+
+// Called if we receive a SIGPROF signal.
+// Called by the signal handler, may run during STW.
+//go:nowritebarrierrec
+func sigprof(pc uintptr, gp *g, mp *m) {
+	if prof.hz == 0 {
+		return
+	}
+
+	// Profiling runs concurrently with GC, so it must not allocate.
+	// Set a trap in case the code does allocate.
+	// Note that on windows, one thread takes profiles of all the
+	// other threads, so mp is usually not getg().m.
+	// In fact mp may not even be stopped.
+	// See golang.org/issue/17165.
+	getg().m.mallocing++
+
+	traceback := true
+
+	// If SIGPROF arrived while already fetching runtime callers
+	// we can have trouble on older systems because the unwind
+	// library calls dl_iterate_phdr which was not reentrant in
+	// the past. alreadyInCallers checks for that.
+	if gp == nil || alreadyInCallers() {
+		traceback = false
+	}
+
+	var stk [maxCPUProfStack]uintptr
+	n := 0
+	if traceback {
+		var stklocs [maxCPUProfStack]location
+		n = callers(0, stklocs[:])
+
+		for i := 0; i < n; i++ {
+			stk[i] = stklocs[i].pc
+		}
+	}
+
+	if n <= 0 {
+		// Normal traceback is impossible or has failed.
+		// Account it against abstract "System" or "GC".
+		n = 2
+		stk[0] = pc
+		if mp.preemptoff != "" || mp.helpgc != 0 {
+			stk[1] = _GCPC + sys.PCQuantum
+		} else {
+			stk[1] = _SystemPC + sys.PCQuantum
+		}
+	}
+
+	if prof.hz != 0 {
+		// Simple cas-lock to coordinate with setcpuprofilerate.
+		for !atomic.Cas(&prof.lock, 0, 1) {
+			osyield()
+		}
+		if prof.hz != 0 {
+			cpuprof.add(stk[:n])
+		}
+		atomic.Store(&prof.lock, 0)
+	}
+	getg().m.mallocing--
+}
+
+// Use global arrays rather than using up lots of stack space in the
+// signal handler. This is safe since while we are executing a SIGPROF
+// signal other SIGPROF signals are blocked.
+var nonprofGoStklocs [maxCPUProfStack]location
+var nonprofGoStk [maxCPUProfStack]uintptr
+
+// sigprofNonGo is called if we receive a SIGPROF signal on a non-Go thread,
+// and the signal handler collected a stack trace in sigprofCallers.
+// When this is called, sigprofCallersUse will be non-zero.
+// g is nil, and what we can do is very limited.
+//go:nosplit
+//go:nowritebarrierrec
+func sigprofNonGo(pc uintptr) {
+	if prof.hz != 0 {
+		n := callers(0, nonprofGoStklocs[:])
+
+		for i := 0; i < n; i++ {
+			nonprofGoStk[i] = nonprofGoStklocs[i].pc
+		}
+
+		if n <= 0 {
+			n = 2
+			nonprofGoStk[0] = pc
+			nonprofGoStk[1] = _ExternalCodePC + sys.PCQuantum
+		}
+
+		// Simple cas-lock to coordinate with setcpuprofilerate.
+		for !atomic.Cas(&prof.lock, 0, 1) {
+			osyield()
+		}
+		if prof.hz != 0 {
+			cpuprof.addNonGo(nonprofGoStk[:n])
+		}
+		atomic.Store(&prof.lock, 0)
+	}
+}
+
+// Arrange to call fn with a traceback hz times a second.
+func setcpuprofilerate_m(hz int32) {
+	// Force sane arguments.
+	if hz < 0 {
+		hz = 0
+	}
+
+	// Disable preemption, otherwise we can be rescheduled to another thread
+	// that has profiling enabled.
+	_g_ := getg()
+	_g_.m.locks++
+
+	// Stop profiler on this thread so that it is safe to lock prof.
+	// if a profiling signal came in while we had prof locked,
+	// it would deadlock.
+	resetcpuprofiler(0)
+
+	for !atomic.Cas(&prof.lock, 0, 1) {
+		osyield()
+	}
+	prof.hz = hz
+	atomic.Store(&prof.lock, 0)
+
+	lock(&sched.lock)
+	sched.profilehz = hz
+	unlock(&sched.lock)
+
+	if hz != 0 {
+		resetcpuprofiler(hz)
+	}
+
+	_g_.m.locks--
+}
+
 // Change number of processors. The world is stopped, sched is locked.
 // gcworkbufs are not being modified by either the GC or
 // the write barrier code.
diff --git a/libgo/go/runtime/runtime.go b/libgo/go/runtime/runtime.go
index e63130b..58710de 100644
--- a/libgo/go/runtime/runtime.go
+++ b/libgo/go/runtime/runtime.go
@@ -19,20 +19,17 @@ import (
 //
 //go:linkname tickspersecond runtime.tickspersecond
 
-var ticks struct {
-	lock mutex
-	pad  uint32 // ensure 8-byte alignment of val on 386
-	val  uint64
-}
+var ticksLock mutex
+var ticksVal uint64
 
 // Note: Called by runtime/pprof in addition to runtime code.
 func tickspersecond() int64 {
-	r := int64(atomic.Load64(&ticks.val))
+	r := int64(atomic.Load64(&ticksVal))
 	if r != 0 {
 		return r
 	}
-	lock(&ticks.lock)
-	r = int64(ticks.val)
+	lock(&ticksLock)
+	r = int64(ticksVal)
 	if r == 0 {
 		t0 := nanotime()
 		c0 := cputicks()
@@ -46,9 +43,9 @@ func tickspersecond() int64 {
 		if r == 0 {
 			r++
 		}
-		atomic.Store64(&ticks.val, uint64(r))
+		atomic.Store64(&ticksVal, uint64(r))
 	}
-	unlock(&ticks.lock)
+	unlock(&ticksLock)
 	return r
 }
 
diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go
index 99c0f11..dd3f7b2 100644
--- a/libgo/go/runtime/runtime1.go
+++ b/libgo/go/runtime/runtime1.go
@@ -112,10 +112,10 @@ var test_z64, test_x64 uint64
 func testAtomic64() {
 	test_z64 = 42
 	test_x64 = 0
-	// prefetcht0(uintptr(unsafe.Pointer(&test_z64)))
-	// prefetcht1(uintptr(unsafe.Pointer(&test_z64)))
-	// prefetcht2(uintptr(unsafe.Pointer(&test_z64)))
-	// prefetchnta(uintptr(unsafe.Pointer(&test_z64)))
+	prefetcht0(uintptr(unsafe.Pointer(&test_z64)))
+	prefetcht1(uintptr(unsafe.Pointer(&test_z64)))
+	prefetcht2(uintptr(unsafe.Pointer(&test_z64)))
+	prefetchnta(uintptr(unsafe.Pointer(&test_z64)))
 	if atomic.Cas64(&test_z64, test_x64, 1) {
 		throw("cas64 failed")
 	}
@@ -151,14 +151,6 @@ func testAtomic64() {
 }
 
 func check() {
-
-	// This doesn't currently work for gccgo.  Because escape
-	// analysis is not turned on by default, the code below that
-	// takes the address of local variables causes memory
-	// allocation, but this function is called before the memory
-	// allocator has been initialized.
-	return
-
 	var (
 		a     int8
 		b     uint8
@@ -390,7 +382,18 @@ var dbgvars = []dbgVar{
 func parsedebugvars() {
 	// defaults
 	debug.cgocheck = 1
-	debug.invalidptr = 1
+
+	// Unfortunately, because gccgo uses conservative stack scanning,
+	// we can not enable invalid pointer checking. It is possible for
+	// memory block M1 to point to M2, and for both to be dead.
+	// We release M2, causing the entire span to be released.
+	// Before we release M1, a stack pointer appears that point into it.
+	// This stack pointer is presumably dead, but causes M1 to be marked.
+	// We scan M1 and see the pointer to M2 on a released span.
+	// At that point, if debug.invalidptr is set, we crash.
+	// This is not a problem, assuming that M1 really is dead and
+	// the pointer we discovered to it will not be used.
+	// debug.invalidptr = 1
 
 	for p := gogetenv("GODEBUG"); p != ""; {
 		field := ""
diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go
index 195d65b..22847ea 100644
--- a/libgo/go/runtime/runtime2.go
+++ b/libgo/go/runtime/runtime2.go
@@ -409,16 +409,16 @@ type g struct {
 	gcinitialsp   unsafe.Pointer
 	gcregs        g_ucontext_t
 
-	entry    unsafe.Pointer // goroutine entry point
-	fromgogo bool           // whether entered from gogo function
+	entry    func(unsafe.Pointer) // goroutine function to run
+	entryfn  uintptr              // function address passed to __go_go
+	fromgogo bool                 // whether entered from gogo function
 
-	issystem     bool // do not output in stack dump
-	isbackground bool // ignore in deadlock detector
+	scanningself bool // whether goroutine is scanning its own stack
 
 	traceback *tracebackg // stack traceback buffer
 
-	context      g_ucontext_t       // saved context for setcontext
-	stackcontext [10]unsafe.Pointer // split-stack context
+	context      g_ucontext_t // saved context for setcontext
+	stackcontext [10]uintptr  // split-stack context
 }
 
 type m struct {
@@ -431,7 +431,7 @@ type m struct {
 	gsignal *g     // signal-handling g
 	sigmask sigset // storage for saved signal mask
 	// Not for gccgo: tls           [6]uintptr // thread-local storage (for x86 extern register)
-	mstartfn    uintptr
+	mstartfn    func()
 	curg        *g       // current running goroutine
 	caughtsig   guintptr // goroutine running during fatal signal
 	p           puintptr // attached p for executing go code (nil if not executing go code)
@@ -541,7 +541,7 @@ type p struct {
 
 	tracebuf traceBufPtr
 
-	// Not for gccgo for now: palloc persistentAlloc // per-P to avoid mutex
+	palloc persistentAlloc // per-P to avoid mutex
 
 	// Per-P GC state
 	gcAssistTime     int64 // Nanoseconds in assistAlloc
@@ -551,7 +551,7 @@ type p struct {
 	// gcw is this P's GC work buffer cache. The work buffer is
 	// filled by write barriers, drained by mutator assists, and
 	// disposed on certain GC state transitions.
-	// Not for gccgo for now: gcw gcWork
+	gcw gcWork
 
 	runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point
 
@@ -714,10 +714,6 @@ type _defer struct {
 	// function function will be somewhere in libffi, so __retaddr
 	// is not useful.
 	makefunccanrecover bool
-
-	// Set to true if this defer stack entry is not part of the
-	// defer pool.
-	special bool
 }
 
 // panics
@@ -790,7 +786,7 @@ var (
 // aligned to a 16-byte boundary.  We implement this by increasing the
 // required size and picking an appropriate offset when we use the
 // array.
-type g_ucontext_t [(_sizeof_ucontext_t + 15) / unsafe.Sizeof(unsafe.Pointer(nil))]unsafe.Pointer
+type g_ucontext_t [(_sizeof_ucontext_t + 15) / unsafe.Sizeof(uintptr(0))]uintptr
 
 // sigset is the Go version of the C type sigset_t.
 // _sigset_t is defined by the Makefile from <signal.h>.
diff --git a/libgo/go/runtime/runtime_unix_test.go b/libgo/go/runtime/runtime_unix_test.go
index e912163..b0cbbbe 100644
--- a/libgo/go/runtime/runtime_unix_test.go
+++ b/libgo/go/runtime/runtime_unix_test.go
@@ -6,7 +6,7 @@
 // We need a fast system call to provoke the race,
 // and Close(-1) is nearly universally fast.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd plan9
+// +build aix darwin dragonfly freebsd linux netbsd openbsd plan9
 
 package runtime_test
 
diff --git a/libgo/go/runtime/signal_gccgo.go b/libgo/go/runtime/signal_gccgo.go
index b4257c9..056be36 100644
--- a/libgo/go/runtime/signal_gccgo.go
+++ b/libgo/go/runtime/signal_gccgo.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package runtime
 
diff --git a/libgo/go/runtime/signal_sighandler.go b/libgo/go/runtime/signal_sighandler.go
index 279001b..b71b21e 100644
--- a/libgo/go/runtime/signal_sighandler.go
+++ b/libgo/go/runtime/signal_sighandler.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package runtime
 
@@ -29,13 +29,13 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) {
 	_g_ := getg()
 	c := sigctxt{info, ctxt}
 
+	sigfault, sigpc := getSiginfo(info, ctxt)
+
 	if sig == _SIGPROF {
-		sigprof()
+		sigprof(sigpc, gp, _g_.m)
 		return
 	}
 
-	sigfault, sigpc := getSiginfo(info, ctxt)
-
 	flags := int32(_SigThrow)
 	if sig < uint32(len(sigtable)) {
 		flags = sigtable[sig].flags
diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go
index 13b7930..c8713b6 100644
--- a/libgo/go/runtime/signal_unix.go
+++ b/libgo/go/runtime/signal_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package runtime
 
@@ -216,7 +216,7 @@ func sigtrampgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) {
 		c := sigctxt{info, ctx}
 		if sig == _SIGPROF {
 			_, pc := getSiginfo(info, ctx)
-			sigprofNonGoPC(pc)
+			sigprofNonGo(pc)
 			return
 		}
 		badsignal(uintptr(sig), &c)
diff --git a/libgo/go/runtime/slice.go b/libgo/go/runtime/slice.go
index 55f4454..f61f85e 100644
--- a/libgo/go/runtime/slice.go
+++ b/libgo/go/runtime/slice.go
@@ -60,10 +60,7 @@ func makeslice(et *_type, len, cap int) slice {
 		panic(errorString("makeslice: cap out of range"))
 	}
 
-	// gccgo's current garbage collector requires using newarray,
-	// not mallocgc here.  This can change back to mallocgc when
-	// we port the garbage collector.
-	p := newarray(et, cap)
+	p := mallocgc(et.size*uintptr(cap), et, true)
 	return slice{p, len, cap}
 }
 
@@ -144,21 +141,14 @@ func growslice(et *_type, old slice, cap int) slice {
 
 	var p unsafe.Pointer
 	if et.kind&kindNoPointers != 0 {
-		// gccgo's current GC requires newarray, not mallocgc.
-		p = newarray(et, newcap)
+		p = mallocgc(capmem, nil, false)
 		memmove(p, old.array, lenmem)
-		// The call to memclr is not needed for gccgo since
-		// the newarray function will zero the memory.
-		// Calling memclr is also wrong since we allocated
-		// newcap*et.size bytes, which is not the same as capmem.
 		// The append() that calls growslice is going to overwrite from old.len to cap (which will be the new length).
 		// Only clear the part that will not be overwritten.
-		// memclrNoHeapPointers(add(p, newlenmem), capmem-newlenmem)
-		_ = newlenmem
+		memclrNoHeapPointers(add(p, newlenmem), capmem-newlenmem)
 	} else {
 		// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory.
-		// gccgo's current GC requires newarray, not mallocgc.
-		p = newarray(et, newcap)
+		p = mallocgc(capmem, et, true)
 		if !writeBarrier.enabled {
 			memmove(p, old.array, lenmem)
 		} else {
diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go
index bf9f62e..a3d0918 100644
--- a/libgo/go/runtime/stubs.go
+++ b/libgo/go/runtime/stubs.go
@@ -51,11 +51,22 @@ func mcall(fn func(*g))
 //
 // For the gc toolchain this permits running a function that requires
 // additional stack space in a context where the stack can not be
-// split.  For gccgo, however, stack splitting is not managed by the
-// Go runtime. In effect, all stacks are system stacks. So this gccgo
-// version just runs the function.
+// split. We don't really need additional stack space in gccgo, since
+// stack splitting is handled separately. But to keep things looking
+// the same, we do switch to the g0 stack here if necessary.
 func systemstack(fn func()) {
-	fn()
+	gp := getg()
+	mp := gp.m
+	if gp == mp.g0 || gp == mp.gsignal {
+		fn()
+	} else if gp == mp.curg {
+		mcall(func(origg *g) {
+			fn()
+			gogo(origg)
+		})
+	} else {
+		badsystemstack()
+	}
 }
 
 func badsystemstack() {
@@ -119,26 +130,18 @@ func noescape(p unsafe.Pointer) unsafe.Pointer {
 	return unsafe.Pointer(x ^ 0)
 }
 
-//extern mincore
-func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32
-
 //go:noescape
 func jmpdefer(fv *funcval, argp uintptr)
 func exit1(code int32)
-func asminit()
 func setg(gg *g)
+
+//extern __builtin_trap
 func breakpoint()
 
-// reflectcall calls fn with a copy of the n argument bytes pointed at by arg.
-// After fn returns, reflectcall copies n-retoffset result bytes
-// back into arg+retoffset before returning. If copying result bytes back,
-// the caller should pass the argument frame type as argtype, so that
-// call can execute appropriate write barriers during the copy.
-// Package reflect passes a frame type. In package runtime, there is only
-// one call that copies results back, in cgocallbackg1, and it does NOT pass a
-// frame type, meaning there are no write barriers invoked. See that call
-// site for justification.
-func reflectcall(argtype *_type, fn, arg unsafe.Pointer, argsize uint32, retoffset uint32)
+func asminit() {}
+
+//go:linkname reflectcall reflect.call
+func reflectcall(fntype *functype, fn *funcval, isInterface, isMethod bool, params, results *unsafe.Pointer)
 
 func procyield(cycles uint32)
 
@@ -216,6 +219,25 @@ const _NoArgs = ^uintptr(0)
 //go:linkname time_now time.now
 func time_now() (sec int64, nsec int32)
 
+//extern __builtin_prefetch
+func prefetch(addr unsafe.Pointer, rw int32, locality int32)
+
+func prefetcht0(addr uintptr) {
+	prefetch(unsafe.Pointer(addr), 0, 3)
+}
+
+func prefetcht1(addr uintptr) {
+	prefetch(unsafe.Pointer(addr), 0, 2)
+}
+
+func prefetcht2(addr uintptr) {
+	prefetch(unsafe.Pointer(addr), 0, 1)
+}
+
+func prefetchnta(addr uintptr) {
+	prefetch(unsafe.Pointer(addr), 0, 0)
+}
+
 // For gccgo, expose this for C callers.
 //go:linkname unixnanotime runtime.unixnanotime
 func unixnanotime() int64 {
@@ -252,32 +274,12 @@ func osyield()
 //extern syscall
 func syscall(trap uintptr, a1, a2, a3, a4, a5, a6 uintptr) uintptr
 
-// newobject allocates a new object.
-// For gccgo unless and until we port malloc.go.
-func newobject(*_type) unsafe.Pointer
-
-// newarray allocates a new array of objects.
-// For gccgo unless and until we port malloc.go.
-func newarray(*_type, int) unsafe.Pointer
-
 // For gccgo, to communicate from the C code to the Go code.
 //go:linkname setIsCgo runtime.setIsCgo
 func setIsCgo() {
 	iscgo = true
 }
 
-// Temporary for gccgo until we port proc.go.
-//go:linkname makeMainInitDone runtime.makeMainInitDone
-func makeMainInitDone() {
-	main_init_done = make(chan bool)
-}
-
-// Temporary for gccgo until we port proc.go.
-//go:linkname closeMainInitDone runtime.closeMainInitDone
-func closeMainInitDone() {
-	close(main_init_done)
-}
-
 // For gccgo, to communicate from the C code to the Go code.
 //go:linkname setCpuidECX runtime.setCpuidECX
 func setCpuidECX(v uint32) {
@@ -290,82 +292,6 @@ func setSupportAES(v bool) {
 	support_aes = v
 }
 
-// typedmemmove copies a typed value.
-// For gccgo for now.
-//go:linkname typedmemmove runtime.typedmemmove
-//go:nosplit
-func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
-	memmove(dst, src, typ.size)
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:linkname reflect_typedmemmove reflect.typedmemmove
-func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) {
-	typedmemmove(typ, dst, src)
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:nosplit
-func typedmemclr(typ *_type, ptr unsafe.Pointer) {
-	memclrNoHeapPointers(ptr, typ.size)
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:nosplit
-func memclrHasPointers(ptr unsafe.Pointer, n uintptr) {
-	memclrNoHeapPointers(ptr, n)
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:linkname typedslicecopy runtime.typedslicecopy
-func typedslicecopy(typ *_type, dst, src slice) int {
-	n := dst.len
-	if n > src.len {
-		n = src.len
-	}
-	if n == 0 {
-		return 0
-	}
-	memmove(dst.array, src.array, uintptr(n)*typ.size)
-	return n
-}
-
-// Temporary for gccgo until we port mbarrier.go.
-//go:linkname reflect_typedslicecopy reflect.typedslicecopy
-func reflect_typedslicecopy(elemType *_type, dst, src slice) int {
-	return typedslicecopy(elemType, dst, src)
-}
-
-// Here for gccgo until we port malloc.go.
-const (
-	_64bit              = 1 << (^uintptr(0) >> 63) / 2
-	_MHeapMap_TotalBits = (_64bit*sys.GoosWindows)*35 + (_64bit*(1-sys.GoosWindows)*(1-sys.GoosDarwin*sys.GoarchArm64))*39 + sys.GoosDarwin*sys.GoarchArm64*31 + (1-_64bit)*32
-	_MaxMem             = uintptr(1<<_MHeapMap_TotalBits - 1)
-	_MaxGcproc          = 32
-)
-
-// Here for gccgo until we port malloc.go.
-//extern runtime_mallocgc
-func c_mallocgc(size uintptr, typ uintptr, flag uint32) unsafe.Pointer
-func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
-	flag := uint32(0)
-	if !needzero {
-		flag = 1 << 3
-	}
-	return c_mallocgc(size, uintptr(unsafe.Pointer(typ)), flag)
-}
-
-// Here for gccgo until we port mgc.go.
-var writeBarrier struct {
-	enabled bool   // compiler emits a check of this before calling write barrier
-	needed  bool   // whether we need a write barrier for current GC phase
-	cgo     bool   // whether we need a write barrier for a cgo check
-	alignme uint64 // guarantee alignment so that compiler can use a 32 or 64-bit load
-}
-
-func queueRescan(*g) {
-}
-
 // Here for gccgo until we port atomic_pointer.go and mgc.go.
 //go:nosplit
 func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool {
@@ -379,21 +305,12 @@ func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool {
 func lock(l *mutex)
 func unlock(l *mutex)
 
-// Here for gccgo for netpoll and Solaris.
+// Here for gccgo.
 func errno() int
 
 // Temporary for gccgo until we port proc.go.
 func entersyscall(int32)
 func entersyscallblock(int32)
-func exitsyscall(int32)
-func gopark(func(*g, unsafe.Pointer) bool, unsafe.Pointer, string, byte, int)
-func goparkunlock(*mutex, string, byte, int)
-
-// Temporary hack for gccgo until we port the garbage collector.
-func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {}
-
-// Here for gccgo until we port msize.go.
-func roundupsize(uintptr) uintptr
 
 // Here for gccgo until we port mgc.go.
 func GC()
@@ -417,64 +334,22 @@ func getMstats() *mstats {
 	return &memstats
 }
 
-// Temporary for gccgo until we port proc.go.
-func setcpuprofilerate_m(hz int32)
-
 // Temporary for gccgo until we port mem_GOOS.go.
 func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer
 func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64)
 
-// Temporary for gccgo until we port proc.go, so that the C signal
-// handler can call into cpuprof.
-//go:linkname cpuprofAdd runtime.cpuprofAdd
-func cpuprofAdd(stk []uintptr) {
-	cpuprof.add(stk)
-}
-
-// For gccgo until we port proc.go.
-func Breakpoint()
-func LockOSThread()
-func UnlockOSThread()
-func lockOSThread()
-func unlockOSThread()
-
 // Temporary for gccgo until we port malloc.go
 func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer
 
 // Temporary for gccgo until we port mheap.go
 func setprofilebucket(p unsafe.Pointer, b *bucket)
 
-// Temporary for gccgo until we port mgc.go.
-func setgcpercent(int32) int32
-
-//go:linkname setGCPercent runtime_debug.setGCPercent
-func setGCPercent(in int32) (out int32) {
-	return setgcpercent(in)
-}
-
 // Temporary for gccgo until we port atomic_pointer.go.
 //go:nosplit
 func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) {
 	atomic.StorepNoWB(noescape(ptr), new)
 }
 
-// Temporary for gccgo until we port mbarrier.go
-func writebarrierptr(dst *uintptr, src uintptr) {
-	*dst = src
-}
-
-// Temporary for gccgo until we port malloc.go
-var zerobase uintptr
-
-//go:linkname getZerobase runtime.getZerobase
-func getZerobase() *uintptr {
-	return &zerobase
-}
-
-// Temporary for gccgo until we port proc.go.
-func sigprof()
-func goexit1()
-
 // Get signal trampoline, written in C.
 func getSigtramp() uintptr
 
@@ -547,79 +422,12 @@ func getPanicking() uint32 {
 	return panicking
 }
 
-// Temporary for gccgo until we port mcache.go.
-func allocmcache() *mcache
-func freemcache(*mcache)
-
-// Temporary for gccgo until we port mgc.go.
-// This is just so that allgadd will compile.
-var work struct {
-	rescan struct {
-		lock mutex
-		list []guintptr
-	}
-}
-
-// Temporary for gccgo until we port mgc.go.
-var gcBlackenEnabled uint32
-
-// Temporary for gccgo until we port mgc.go.
-func gcMarkWorkAvailable(p *p) bool {
-	return false
-}
-
-// Temporary for gccgo until we port mgc.go.
-var gcController gcControllerState
-
-// Temporary for gccgo until we port mgc.go.
-type gcControllerState struct {
-}
-
-// Temporary for gccgo until we port mgc.go.
-func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
-	return nil
-}
-
-// Temporary for gccgo until we port mgc.go.
-var gcphase uint32
-
-// Temporary for gccgo until we port mgc.go.
-const (
-	_GCoff = iota
-	_GCmark
-	_GCmarktermination
-)
-
-// Temporary for gccgo until we port mgc.go.
-type gcMarkWorkerMode int
-
-// Temporary for gccgo until we port mgc.go.
-const (
-	gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota
-	gcMarkWorkerFractionalMode
-	gcMarkWorkerIdleMode
-)
-
-// Temporary for gccgo until we port mheap.go.
-type mheap struct {
-}
-
-// Temporary for gccgo until we port mheap.go.
-var mheap_ mheap
-
-// Temporary for gccgo until we port mheap.go.
-func (h *mheap) scavenge(k int32, now, limit uint64) {
-}
-
 // Temporary for gccgo until we initialize ncpu in Go.
 //go:linkname setncpu runtime.setncpu
 func setncpu(n int32) {
 	ncpu = n
 }
 
-// Temporary for gccgo until we port malloc.go.
-var physPageSize uintptr
-
 // Temporary for gccgo until we reliably initialize physPageSize in Go.
 //go:linkname setpagesize runtime.setpagesize
 func setpagesize(s uintptr) {
@@ -633,10 +441,20 @@ func sigprofNonGoPC(pc uintptr) {
 }
 
 // Temporary for gccgo until we port mgc.go.
-// gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes
-// to use in execution traces.
-var gcMarkWorkerModeStrings = [...]string{
-	"GC (dedicated)",
-	"GC (fractional)",
-	"GC (idle)",
+//go:linkname runtime_m0 runtime.runtime_m0
+func runtime_m0() *m {
+	return &m0
+}
+
+// Temporary for gccgo until we port mgc.go.
+//go:linkname runtime_g0 runtime.runtime_g0
+func runtime_g0() *g {
+	return &g0
+}
+
+const uintptrMask = 1<<(8*sys.PtrSize) - 1
+
+type bitvector struct {
+	n        int32 // # of bits
+	bytedata *uint8
 }
diff --git a/libgo/go/runtime/stubs2.go b/libgo/go/runtime/stubs2.go
index e891fe5..490405d 100644
--- a/libgo/go/runtime/stubs2.go
+++ b/libgo/go/runtime/stubs2.go
@@ -18,16 +18,8 @@ func exit(code int32)
 func nanotime() int64
 func usleep(usec uint32)
 
-//extern mmap
-func mmap(addr unsafe.Pointer, length uintptr, prot, flags, fd int32, offset uintptr) unsafe.Pointer
-
-//extern munmap
-func munmap(addr unsafe.Pointer, n uintptr) int32
-
 //go:noescape
 func write(fd uintptr, p unsafe.Pointer, n int32) int32
 
 //go:noescape
 func open(name *byte, mode, perm int32) int32
-
-func madvise(addr unsafe.Pointer, n uintptr, flags int32)
diff --git a/libgo/go/runtime/symtab.go b/libgo/go/runtime/symtab.go
index 52e2d03..bad0347 100644
--- a/libgo/go/runtime/symtab.go
+++ b/libgo/go/runtime/symtab.go
@@ -115,11 +115,17 @@ func FuncForPC(pc uintptr) *Func {
 
 // Name returns the name of the function.
 func (f *Func) Name() string {
+	if f == nil {
+		return ""
+	}
 	return f.name
 }
 
 // Entry returns the entry address of the function.
 func (f *Func) Entry() uintptr {
+	if f == nil {
+		return 0
+	}
 	return f.entry
 }
 
diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go
index 611aba9..d060e09 100644
--- a/libgo/go/runtime/traceback_gccgo.go
+++ b/libgo/go/runtime/traceback_gccgo.go
@@ -9,7 +9,7 @@ package runtime
 
 import (
 	"runtime/internal/sys"
-	_ "unsafe" // for go:linkname
+	"unsafe"
 )
 
 // For gccgo, use go:linkname to rename compiler-called functions to
@@ -20,6 +20,34 @@ import (
 //go:linkname goroutineheader runtime.goroutineheader
 //go:linkname printcreatedby runtime.printcreatedby
 
+var (
+	// initialized in tracebackinit
+	runfinqPC        uintptr
+	bgsweepPC        uintptr
+	forcegchelperPC  uintptr
+	timerprocPC      uintptr
+	gcBgMarkWorkerPC uintptr
+)
+
+func tracebackinit() {
+	// Go variable initialization happens late during runtime startup.
+	// Instead of initializing the variables above in the declarations,
+	// schedinit calls this function so that the variables are
+	// initialized and available earlier in the startup sequence.
+	// This doesn't use funcPC to avoid memory allocation.
+	// FIXME: We should be able to use funcPC when escape analysis is on.
+	f1 := runfinq
+	runfinqPC = **(**uintptr)(unsafe.Pointer(&f1))
+	f2 := bgsweep
+	bgsweepPC = **(**uintptr)(unsafe.Pointer(&f2))
+	f3 := forcegchelper
+	forcegchelperPC = **(**uintptr)(unsafe.Pointer(&f3))
+	f4 := timerproc
+	timerprocPC = **(**uintptr)(unsafe.Pointer(&f4))
+	f5 := gcBgMarkWorker
+	gcBgMarkWorkerPC = **(**uintptr)(unsafe.Pointer(&f5))
+}
+
 func printcreatedby(gp *g) {
 	// Show what created goroutine, except main goroutine (goid 1).
 	pc := gp.gopc
@@ -168,14 +196,26 @@ func goroutineheader(gp *g) {
 // isSystemGoroutine reports whether the goroutine g must be omitted in
 // stack dumps and deadlock detector.
 func isSystemGoroutine(gp *g) bool {
-	// FIXME.
-	return false
+	// FIXME: This doesn't work reliably for gccgo because in many
+	// cases the startpc field will be set to a thunk rather than
+	// to one of these addresses.
+	pc := gp.startpc
+	return pc == runfinqPC && !fingRunning ||
+		pc == bgsweepPC ||
+		pc == forcegchelperPC ||
+		pc == timerprocPC ||
+		pc == gcBgMarkWorkerPC
 }
 
 func tracebackothers(me *g) {
 	var tb tracebackg
 	tb.gp = me
 
+	// The getTraceback function will modify me's stack context.
+	// Preserve it in case we have been called via systemstack.
+	context := me.context
+	stackcontext := me.stackcontext
+
 	level, _, _ := gotraceback()
 
 	// Show the current goroutine first, if we haven't already.
@@ -225,4 +265,7 @@ func tracebackothers(me *g) {
 		}
 	}
 	unlock(&allglock)
+
+	me.context = context
+	me.stackcontext = stackcontext
 }
diff --git a/libgo/go/runtime/type.go b/libgo/go/runtime/type.go
index cfee35a..6788f24 100644
--- a/libgo/go/runtime/type.go
+++ b/libgo/go/runtime/type.go
@@ -9,17 +9,18 @@ package runtime
 import "unsafe"
 
 type _type struct {
+	size       uintptr
+	ptrdata    uintptr
+	hash       uint32
 	kind       uint8
 	align      int8
 	fieldAlign uint8
 	_          uint8
-	size       uintptr
-	hash       uint32
 
 	hashfn  func(unsafe.Pointer, uintptr) uintptr
 	equalfn func(unsafe.Pointer, unsafe.Pointer) bool
 
-	gc     unsafe.Pointer
+	gcdata *byte
 	string *string
 	*uncommontype
 	ptrToThis *_type
diff --git a/libgo/go/syscall/env_unix.go b/libgo/go/syscall/env_unix.go
index 5bf3336..eb93e2e 100644
--- a/libgo/go/syscall/env_unix.go
+++ b/libgo/go/syscall/env_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 // Unix environment variables.
 
diff --git a/libgo/go/syscall/exec_bsd.go b/libgo/go/syscall/exec_bsd.go
index af025e4..80991ec 100644
--- a/libgo/go/syscall/exec_bsd.go
+++ b/libgo/go/syscall/exec_bsd.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd netbsd openbsd solaris
 
 package syscall
 
@@ -235,6 +235,10 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr
 
 	// Set the controlling TTY to Ctty
 	if sys.Setctty {
+		if TIOCSCTTY == 0 {
+			err1 = ENOSYS
+			goto childerror
+		}
 		_, err1 = raw_ioctl(sys.Ctty, TIOCSCTTY, 0)
 		if err1 != 0 {
 			goto childerror
diff --git a/libgo/go/syscall/exec_unix.go b/libgo/go/syscall/exec_unix.go
index c04005c..f2bc741 100644
--- a/libgo/go/syscall/exec_unix.go
+++ b/libgo/go/syscall/exec_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 // Fork, exec, wait, etc.
 
diff --git a/libgo/go/syscall/exec_unix_test.go b/libgo/go/syscall/exec_unix_test.go
index 69c4a1f..58708da 100644
--- a/libgo/go/syscall/exec_unix_test.go
+++ b/libgo/go/syscall/exec_unix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package syscall_test
 
diff --git a/libgo/go/syscall/export_unix_test.go b/libgo/go/syscall/export_unix_test.go
index 47ec544..120500c 100644
--- a/libgo/go/syscall/export_unix_test.go
+++ b/libgo/go/syscall/export_unix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package syscall
 
diff --git a/libgo/go/syscall/libcall_aix.go b/libgo/go/syscall/libcall_aix.go
new file mode 100644
index 0000000..992eeb4
--- /dev/null
+++ b/libgo/go/syscall/libcall_aix.go
@@ -0,0 +1,11 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix
+
+package syscall
+
+func raw_ptrace(request int, pid int, addr *byte, data *byte) Errno {
+	return ENOSYS
+}
diff --git a/libgo/go/syscall/libcall_posix_largefile.go b/libgo/go/syscall/libcall_posix_largefile.go
index 1f437b4..9b13735 100644
--- a/libgo/go/syscall/libcall_posix_largefile.go
+++ b/libgo/go/syscall/libcall_posix_largefile.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build linux solaris,386 solaris,sparc
+// +build aix linux solaris,386 solaris,sparc
 
 // POSIX library calls on systems which use the largefile interface.
 
diff --git a/libgo/go/syscall/libcall_posix_regfile.go b/libgo/go/syscall/libcall_posix_regfile.go
index d106a7b..5b8f75a 100644
--- a/libgo/go/syscall/libcall_posix_regfile.go
+++ b/libgo/go/syscall/libcall_posix_regfile.go
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !aix
 // +build !linux
 // +build !solaris !386
 // +build !solaris !sparc
diff --git a/libgo/go/syscall/libcall_posix_utimesnano.go b/libgo/go/syscall/libcall_posix_utimesnano.go
index 5d9d02e..372b0d7 100644
--- a/libgo/go/syscall/libcall_posix_utimesnano.go
+++ b/libgo/go/syscall/libcall_posix_utimesnano.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd openbsd netbsd solaris
+// +build aix darwin dragonfly freebsd openbsd netbsd solaris
 
 // General POSIX version of UtimesNano.
 
diff --git a/libgo/go/syscall/libcall_wait4.go b/libgo/go/syscall/libcall_wait4.go
index 559d780..00b6874 100644
--- a/libgo/go/syscall/libcall_wait4.go
+++ b/libgo/go/syscall/libcall_wait4.go
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !aix
+
 // For systems with the wait4 library call.
 
 package syscall
diff --git a/libgo/go/syscall/libcall_wait4_aix.go b/libgo/go/syscall/libcall_wait4_aix.go
new file mode 100644
index 0000000..9c25d04
--- /dev/null
+++ b/libgo/go/syscall/libcall_wait4_aix.go
@@ -0,0 +1,26 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Handle AIX's wait4 specific behavior
+
+package syscall
+
+//sys  wait4(pid Pid_t, status *_C_int, options int, rusage *Rusage) (wpid Pid_t, err error)
+//wait4(pid Pid_t, status *_C_int, options _C_int, rusage *Rusage) Pid_t
+
+func Wait4(pid int, wstatus *WaitStatus, options int, rusage *Rusage) (wpid int, err error) {
+	var status _C_int
+	var r Pid_t
+	err = ERESTART
+	// AIX wait4 may return with ERESTART errno, while the processus is still
+	// active.
+	for err == ERESTART {
+		r, err = wait4(Pid_t(pid), &status, options, rusage)
+	}
+	wpid = int(r)
+	if wstatus != nil {
+		*wstatus = WaitStatus(status)
+	}
+	return
+}
diff --git a/libgo/go/syscall/mmap_unix_test.go b/libgo/go/syscall/mmap_unix_test.go
index 01f7783..d0b3644 100644
--- a/libgo/go/syscall/mmap_unix_test.go
+++ b/libgo/go/syscall/mmap_unix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd
+// +build aix darwin dragonfly freebsd linux netbsd openbsd
 
 package syscall_test
 
diff --git a/libgo/go/syscall/signame.c b/libgo/go/syscall/signame.c
index 0453c06..dca92a9 100644
--- a/libgo/go/syscall/signame.c
+++ b/libgo/go/syscall/signame.c
@@ -31,7 +31,7 @@ Signame (intgo sig)
       s = buf;
     }
   len = __builtin_strlen (s);
-  data = runtime_mallocgc (len, 0, FlagNoScan);
+  data = runtime_mallocgc (len, nil, false);
   __builtin_memcpy (data, s, len);
   ret.str = data;
   ret.len = len;
diff --git a/libgo/go/syscall/sockcmsg_unix.go b/libgo/go/syscall/sockcmsg_unix.go
index 0161699..c01602f 100644
--- a/libgo/go/syscall/sockcmsg_unix.go
+++ b/libgo/go/syscall/sockcmsg_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 // Socket control messages
 
diff --git a/libgo/go/syscall/socket_bsd.go b/libgo/go/syscall/socket_bsd.go
index 0f09627..ecdab06 100644
--- a/libgo/go/syscall/socket_bsd.go
+++ b/libgo/go/syscall/socket_bsd.go
@@ -4,10 +4,12 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd openbsd netbsd
+// +build aix darwin dragonfly freebsd openbsd netbsd
 
 package syscall
 
+import "unsafe"
+
 const SizeofSockaddrInet4 = 16
 const SizeofSockaddrInet6 = 28
 const SizeofSockaddrUnix = 110
diff --git a/libgo/go/syscall/syscall_unix.go b/libgo/go/syscall/syscall_unix.go
index ddf7303..61aa1c4 100644
--- a/libgo/go/syscall/syscall_unix.go
+++ b/libgo/go/syscall/syscall_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package syscall
 
diff --git a/libgo/go/syscall/timestruct.go b/libgo/go/syscall/timestruct.go
index 49c3383..6ece338 100644
--- a/libgo/go/syscall/timestruct.go
+++ b/libgo/go/syscall/timestruct.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package syscall
 
diff --git a/libgo/go/syscall/wait.c b/libgo/go/syscall/wait.c
index 8c3b53f..a50f7d6 100644
--- a/libgo/go/syscall/wait.c
+++ b/libgo/go/syscall/wait.c
@@ -12,6 +12,10 @@
 
 #include "runtime.h"
 
+#ifndef WCOREDUMP
+#define WCOREDUMP(status) (((status) & 0200) != 0)
+#endif
+
 extern _Bool Exited (uint32_t *w)
   __asm__ (GOSYM_PREFIX "syscall.Exited.N18_syscall.WaitStatus");
 
diff --git a/libgo/go/time/sys_unix.go b/libgo/go/time/sys_unix.go
index 91d54c9..4c68bbd 100644
--- a/libgo/go/time/sys_unix.go
+++ b/libgo/go/time/sys_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
 
 package time
 
diff --git a/libgo/go/time/zoneinfo_unix.go b/libgo/go/time/zoneinfo_unix.go
index 7727488..a876e27 100644
--- a/libgo/go/time/zoneinfo_unix.go
+++ b/libgo/go/time/zoneinfo_unix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin,386 darwin,amd64 dragonfly freebsd linux,!android nacl netbsd openbsd solaris
+// +build aix darwin,386 darwin,amd64 dragonfly freebsd linux,!android nacl netbsd openbsd solaris
 
 // Parse "zoneinfo" time zone file.
 // This is a fairly standard file format used on OS X, Linux, BSD, Sun, and others.