Diffstat (limited to 'libgo/go/runtime')
-rw-r--r--  libgo/go/runtime/cgo_gccgo.go | 21
-rw-r--r--  libgo/go/runtime/cgocheck.go | 2
-rw-r--r--  libgo/go/runtime/chan.go | 12
-rw-r--r--  libgo/go/runtime/cpuprof.go | 501
-rw-r--r--  libgo/go/runtime/crash_cgo_test.go | 66
-rw-r--r--  libgo/go/runtime/crash_test.go | 138
-rw-r--r--  libgo/go/runtime/crash_unix_test.go | 30
-rw-r--r--  libgo/go/runtime/debug/garbage.go | 4
-rw-r--r--  libgo/go/runtime/debug/garbage_test.go | 68
-rw-r--r--  libgo/go/runtime/env_posix.go | 2
-rw-r--r--  libgo/go/runtime/error.go | 37
-rw-r--r--  libgo/go/runtime/example_test.go | 54
-rw-r--r--  libgo/go/runtime/export_test.go | 115
-rw-r--r--  libgo/go/runtime/export_unix_test.go | 19
-rw-r--r--  libgo/go/runtime/extern.go | 21
-rw-r--r--  libgo/go/runtime/fastlog2.go | 6
-rw-r--r--  libgo/go/runtime/float.go | 53
-rw-r--r--  libgo/go/runtime/gc_test.go | 51
-rw-r--r--  libgo/go/runtime/hashmap.go | 160
-rw-r--r--  libgo/go/runtime/hashmap_fast.go | 460
-rw-r--r--  libgo/go/runtime/heapdump.go | 4
-rw-r--r--  libgo/go/runtime/iface_test.go | 144
-rw-r--r--  libgo/go/runtime/internal/sys/intrinsics.go | 8
-rw-r--r--  libgo/go/runtime/internal/sys/intrinsics_test.go | 12
-rw-r--r--  libgo/go/runtime/lfstack.go | 37
-rw-r--r--  libgo/go/runtime/lock_futex.go | 26
-rw-r--r--  libgo/go/runtime/lock_sema.go | 27
-rw-r--r--  libgo/go/runtime/malloc.go | 196
-rw-r--r--  libgo/go/runtime/malloc_test.go | 76
-rw-r--r--  libgo/go/runtime/map_test.go | 92
-rw-r--r--  libgo/go/runtime/mapspeed_test.go | 15
-rw-r--r--  libgo/go/runtime/mbarrier.go | 6
-rw-r--r--  libgo/go/runtime/mbitmap.go | 80
-rw-r--r--  libgo/go/runtime/mcache.go | 15
-rw-r--r--  libgo/go/runtime/mcentral.go | 40
-rw-r--r--  libgo/go/runtime/mfinal.go | 18
-rw-r--r--  libgo/go/runtime/mfixalloc.go | 10
-rw-r--r--  libgo/go/runtime/mgc.go | 652
-rw-r--r--  libgo/go/runtime/mgclarge.go | 326
-rw-r--r--  libgo/go/runtime/mgcmark.go | 133
-rw-r--r--  libgo/go/runtime/mgcsweep.go | 98
-rw-r--r--  libgo/go/runtime/mgcwork.go | 180
-rw-r--r--  libgo/go/runtime/mheap.go | 589
-rw-r--r--  libgo/go/runtime/mksizeclasses.go | 2
-rw-r--r--  libgo/go/runtime/mprof.go | 221
-rw-r--r--  libgo/go/runtime/msize.go | 22
-rw-r--r--  libgo/go/runtime/mstats.go | 113
-rw-r--r--  libgo/go/runtime/mstkbar.go | 395
-rw-r--r--  libgo/go/runtime/net_plan9.go | 4
-rw-r--r--  libgo/go/runtime/netpoll.go | 91
-rw-r--r--  libgo/go/runtime/netpoll_epoll.go | 8
-rw-r--r--  libgo/go/runtime/netpoll_kqueue.go | 10
-rw-r--r--  libgo/go/runtime/netpoll_nacl.go | 4
-rw-r--r--  libgo/go/runtime/netpoll_solaris.go | 18
-rw-r--r--  libgo/go/runtime/netpoll_stub.go | 2
-rw-r--r--  libgo/go/runtime/netpoll_windows.go | 28
-rw-r--r--  libgo/go/runtime/numcpu_freebsd_test.go | 15
-rw-r--r--  libgo/go/runtime/os_gccgo.go | 2
-rw-r--r--  libgo/go/runtime/panic.go | 23
-rw-r--r--  libgo/go/runtime/pprof/elf.go | 109
-rw-r--r--  libgo/go/runtime/pprof/internal/profile/encode.go | 470
-rw-r--r--  libgo/go/runtime/pprof/internal/profile/filter.go | 158
-rw-r--r--  libgo/go/runtime/pprof/internal/profile/legacy_profile.go | 1266
-rw-r--r--  libgo/go/runtime/pprof/internal/profile/profile.go | 575
-rw-r--r--  libgo/go/runtime/pprof/internal/profile/profile_test.go | 79
-rw-r--r--  libgo/go/runtime/pprof/internal/profile/proto.go | 360
-rw-r--r--  libgo/go/runtime/pprof/internal/profile/proto_test.go | 67
-rw-r--r--  libgo/go/runtime/pprof/internal/profile/prune.go | 97
-rw-r--r--  libgo/go/runtime/pprof/internal/protopprof/protomemprofile.go | 83
-rw-r--r--  libgo/go/runtime/pprof/internal/protopprof/protomemprofile_test.go | 104
-rw-r--r--  libgo/go/runtime/pprof/internal/protopprof/protopprof.go | 105
-rw-r--r--  libgo/go/runtime/pprof/internal/protopprof/protopprof_test.go | 171
-rw-r--r--  libgo/go/runtime/pprof/label.go | 85
-rw-r--r--  libgo/go/runtime/pprof/label_test.go | 82
-rw-r--r--  libgo/go/runtime/pprof/map.go | 89
-rw-r--r--  libgo/go/runtime/pprof/mprof_test.go | 17
-rw-r--r--  libgo/go/runtime/pprof/pprof.go | 101
-rw-r--r--  libgo/go/runtime/pprof/pprof_test.go | 406
-rw-r--r--  libgo/go/runtime/pprof/proto.go | 515
-rw-r--r--  libgo/go/runtime/pprof/proto_test.go | 224
-rw-r--r--  libgo/go/runtime/pprof/protobuf.go | 141
-rw-r--r--  libgo/go/runtime/pprof/protomem.go | 93
-rw-r--r--  libgo/go/runtime/pprof/protomem_test.go | 74
-rw-r--r--  libgo/go/runtime/pprof/runtime.go | 36
-rw-r--r--  libgo/go/runtime/pprof/runtime_test.go | 96
-rw-r--r--  libgo/go/runtime/proc.go | 294
-rw-r--r--  libgo/go/runtime/proc_test.go | 17
-rw-r--r--  libgo/go/runtime/profbuf.go | 561
-rw-r--r--  libgo/go/runtime/profbuf_test.go | 182
-rw-r--r--  libgo/go/runtime/proflabel.go | 40
-rw-r--r--  libgo/go/runtime/rand_test.go | 45
-rw-r--r--  libgo/go/runtime/relax_stub.go | 17
-rw-r--r--  libgo/go/runtime/runtime1.go | 70
-rw-r--r--  libgo/go/runtime/runtime2.go | 67
-rw-r--r--  libgo/go/runtime/runtime_test.go | 18
-rw-r--r--  libgo/go/runtime/rwmutex.go | 125
-rw-r--r--  libgo/go/runtime/rwmutex_test.go | 178
-rw-r--r--  libgo/go/runtime/select.go | 11
-rw-r--r--  libgo/go/runtime/sema.go | 328
-rw-r--r--  libgo/go/runtime/signal_sighandler.go | 2
-rw-r--r--  libgo/go/runtime/signal_unix.go | 137
-rw-r--r--  libgo/go/runtime/sigqueue.go | 63
-rw-r--r--  libgo/go/runtime/sizeclasses.go | 2
-rw-r--r--  libgo/go/runtime/string.go | 20
-rw-r--r--  libgo/go/runtime/string_test.go | 15
-rw-r--r--  libgo/go/runtime/stubs.go | 62
-rw-r--r--  libgo/go/runtime/stubs_linux.go | 9
-rw-r--r--  libgo/go/runtime/stubs_nonlinux.go | 12
-rw-r--r--  libgo/go/runtime/symtab.go | 38
-rw-r--r--  libgo/go/runtime/symtab_test.go | 17
-rw-r--r--  libgo/go/runtime/testdata/testprog/numcpu_freebsd.go | 126
-rw-r--r--  libgo/go/runtime/testdata/testprog/panicrace.go | 27
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/cgo.go | 8
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/numgoroutine.go | 99
-rw-r--r--  libgo/go/runtime/testdata/testprognet/signalexec.go | 70
-rw-r--r--  libgo/go/runtime/time.go | 15
-rw-r--r--  libgo/go/runtime/timeasm.go | 17
-rw-r--r--  libgo/go/runtime/timestub.go | 21
-rw-r--r--  libgo/go/runtime/trace.go | 158
-rw-r--r--  libgo/go/runtime/trace/trace_stack_test.go | 7
-rw-r--r--  libgo/go/runtime/traceback_gccgo.go | 11
-rw-r--r--  libgo/go/runtime/write_err_android.go | 2
122 files changed, 11048 insertions, 3008 deletions
diff --git a/libgo/go/runtime/cgo_gccgo.go b/libgo/go/runtime/cgo_gccgo.go
index 8236eea..c3bf955 100644
--- a/libgo/go/runtime/cgo_gccgo.go
+++ b/libgo/go/runtime/cgo_gccgo.go
@@ -6,7 +6,7 @@ package runtime
import (
"runtime/internal/atomic"
- _ "unsafe"
+ "unsafe"
)
// For historical reasons these functions are called as though they
@@ -41,6 +41,7 @@ func Cgocall() {
mp := getg().m
mp.ncgocall++
mp.ncgo++
+ mp.incgo = true
entersyscall(0)
}
@@ -50,6 +51,7 @@ func CgocallDone() {
if gp == nil {
throw("no g in CgocallDone")
}
+ gp.m.incgo = false
gp.m.ncgo--
// If we are invoked because the C function called _cgo_panic,
@@ -68,15 +70,18 @@ func CgocallDone() {
// gofunction()
//go:nosplit
func CgocallBack() {
- if getg() == nil || getg().m == nil {
+ gp := getg()
+ if gp == nil || gp.m == nil {
needm(0)
- mp := getg().m
+ gp = getg()
+ mp := gp.m
mp.dropextram = true
}
exitsyscall(0)
+ gp.m.incgo = false
- if getg().m.ncgo == 0 {
+ if gp.m.ncgo == 0 {
// The C call to Go came from a thread created by C.
// The C call to Go came from a thread not currently running
// any Go. In the case of -buildmode=c-archive or c-shared,
@@ -85,7 +90,7 @@ func CgocallBack() {
<-main_init_done
}
- mp := getg().m
+ mp := gp.m
if mp.needextram || atomic.Load(&extraMWaiters) > 0 {
mp.needextram = false
newextram()
@@ -120,6 +125,7 @@ func CgocallBackDone() {
drop = true
}
+ gp.m.incgo = true
entersyscall(0)
if drop {
@@ -133,3 +139,8 @@ func _cgo_panic(p *byte) {
exitsyscall(0)
panic(gostringnocopy(p))
}
+
+// cgo_yield exists in the gc toolchain to let TSAN deliver a signal.
+// gccgo does not need this.
+var cgo_yield = &_cgo_yield
+var _cgo_yield unsafe.Pointer
diff --git a/libgo/go/runtime/cgocheck.go b/libgo/go/runtime/cgocheck.go
index 09d444d..30f054b 100644
--- a/libgo/go/runtime/cgocheck.go
+++ b/libgo/go/runtime/cgocheck.go
@@ -125,7 +125,7 @@ func cgoCheckTypedBlock(typ *_type, src unsafe.Pointer, off, size uintptr) {
aoff := uintptr(src) - mheap_.arena_start
idx := aoff >> _PageShift
s := mheap_.spans[idx]
- if s.state == _MSpanStack {
+ if s.state == _MSpanManual {
// There are no heap bits for value stored on the stack.
// For a channel receive src might be on the stack of some
// other goroutine, so we can't unwind the stack even if
diff --git a/libgo/go/runtime/chan.go b/libgo/go/runtime/chan.go
index d2470bd..7bb919c 100644
--- a/libgo/go/runtime/chan.go
+++ b/libgo/go/runtime/chan.go
@@ -185,7 +185,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
if sg := c.recvq.dequeue(); sg != nil {
// Found a waiting receiver. We pass the value we want to send
// directly to the receiver, bypassing the channel buffer (if any).
- send(c, sg, ep, func() { unlock(&c.lock) })
+ send(c, sg, ep, func() { unlock(&c.lock) }, 3)
return true
}
@@ -256,7 +256,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
// Channel c must be empty and locked. send unlocks c with unlockf.
// sg must already be dequeued from c.
// ep must be non-nil and point to the heap or the caller's stack.
-func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func()) {
+func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
if raceenabled {
if c.dataqsiz == 0 {
racesync(c, sg)
@@ -286,7 +286,7 @@ func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func()) {
if sg.releasetime != 0 {
sg.releasetime = cputicks()
}
- goready(gp, 4)
+ goready(gp, skip+1)
}
// Sends and receives on unbuffered or empty-buffered channels are the
@@ -466,7 +466,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
// directly from sender. Otherwise, receive from head of queue
// and add sender's value to the tail of the queue (both map to
// the same buffer slot because the queue is full).
- recv(c, sg, ep, func() { unlock(&c.lock) })
+ recv(c, sg, ep, func() { unlock(&c.lock) }, 3)
return true, true
}
@@ -542,7 +542,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
// Channel c must be full and locked. recv unlocks c with unlockf.
// sg must already be dequeued from c.
// A non-nil ep must point to the heap or the caller's stack.
-func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func()) {
+func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
if c.dataqsiz == 0 {
if raceenabled {
racesync(c, sg)
@@ -582,7 +582,7 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func()) {
if sg.releasetime != 0 {
sg.releasetime = cputicks()
}
- goready(gp, 4)
+ goready(gp, skip+1)
}
// compiler implements
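
Aside (not part of the patch): the chan.go change above threads a caller-skip count through send and recv so that goready attributes the wakeup to the user's channel operation rather than to the runtime helper that happens to sit in between. A minimal sketch of the same idiom in ordinary Go, with hypothetical helper names and runtime.Caller standing in for goready:

package main

import (
	"fmt"
	"runtime"
)

// report resolves the frame `skip` levels above its own caller,
// mirroring how goready(gp, skip+1) is handed an adjusted skip.
func report(skip int) {
	pc, _, _, _ := runtime.Caller(skip + 1) // +1 skips report's own frame
	fmt.Println(runtime.FuncForPC(pc).Name())
}

// helper stands in for send/recv: it adds one frame, so it forwards skip+1.
func helper(skip int) { report(skip + 1) }

func main() {
	helper(0) // prints main.main, not main.helper
}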
diff --git a/libgo/go/runtime/cpuprof.go b/libgo/go/runtime/cpuprof.go
index e1206f9..b031b1a 100644
--- a/libgo/go/runtime/cpuprof.go
+++ b/libgo/go/runtime/cpuprof.go
@@ -3,118 +3,45 @@
// license that can be found in the LICENSE file.
// CPU profiling.
-// Based on algorithms and data structures used in
-// https://github.com/google/pprof.
-//
-// The main difference between this code and the google-perftools
-// code is that this code is written to allow copying the profile data
-// to an arbitrary io.Writer, while the google-perftools code always
-// writes to an operating system file.
//
// The signal handler for the profiling clock tick adds a new stack trace
-// to a hash table tracking counts for recent traces. Most clock ticks
-// hit in the cache. In the event of a cache miss, an entry must be
-// evicted from the hash table, copied to a log that will eventually be
-// written as profile data. The google-perftools code flushed the
-// log itself during the signal handler. This code cannot do that, because
-// the io.Writer might block or need system calls or locks that are not
-// safe to use from within the signal handler. Instead, we split the log
-// into two halves and let the signal handler fill one half while a goroutine
-// is writing out the other half. When the signal handler fills its half, it
-// offers to swap with the goroutine. If the writer is not done with its half,
-// we lose the stack trace for this clock tick (and record that loss).
-// The goroutine interacts with the signal handler by calling getprofile() to
-// get the next log piece to write, implicitly handing back the last log
-// piece it obtained.
-//
-// The state of this dance between the signal handler and the goroutine
-// is encoded in the Profile.handoff field. If handoff == 0, then the goroutine
-// is not using either log half and is waiting (or will soon be waiting) for
-// a new piece by calling notesleep(&p.wait). If the signal handler
-// changes handoff from 0 to non-zero, it must call notewakeup(&p.wait)
-// to wake the goroutine. The value indicates the number of entries in the
-// log half being handed off. The goroutine leaves the non-zero value in
-// place until it has finished processing the log half and then flips the number
-// back to zero. Setting the high bit in handoff means that the profiling is over,
-// and the goroutine is now in charge of flushing the data left in the hash table
-// to the log and returning that data.
-//
-// The handoff field is manipulated using atomic operations.
-// For the most part, the manipulation of handoff is orderly: if handoff == 0
-// then the signal handler owns it and can change it to non-zero.
-// If handoff != 0 then the goroutine owns it and can change it to zero.
-// If that were the end of the story then we would not need to manipulate
-// handoff using atomic operations. The operations are needed, however,
-// in order to let the log closer set the high bit to indicate "EOF" safely
-// in the situation when normally the goroutine "owns" handoff.
+// to a log of recent traces. The log is read by a user goroutine that
+// turns it into formatted profile data. If the reader does not keep up
+// with the log, those writes will be recorded as a count of lost records.
+// The actual profile buffer is in profbuf.go.
package runtime
import (
"runtime/internal/atomic"
+ "runtime/internal/sys"
"unsafe"
)
-const (
- numBuckets = 1 << 10
- logSize = 1 << 17
- assoc = 4
- maxCPUProfStack = 64
-)
+const maxCPUProfStack = 64
-type cpuprofEntry struct {
- count uintptr
- depth int
- stack [maxCPUProfStack]uintptr
-}
-
-//go:notinheap
type cpuProfile struct {
- on bool // profiling is on
- wait note // goroutine waits here
- count uintptr // tick count
- evicts uintptr // eviction count
- lost uintptr // lost ticks that need to be logged
-
- // Active recent stack traces.
- hash [numBuckets]struct {
- entry [assoc]cpuprofEntry
- }
-
- // Log of traces evicted from hash.
- // Signal handler has filled log[toggle][:nlog].
- // Goroutine is writing log[1-toggle][:handoff].
- log [2][logSize / 2]uintptr
- nlog int
- toggle int32
- handoff uint32
-
- // Writer state.
- // Writer maintains its own toggle to avoid races
- // looking at signal handler's toggle.
- wtoggle uint32
- wholding bool // holding & need to release a log half
- flushing bool // flushing hash table - profile is over
- eodSent bool // special end-of-data record sent; => flushing
+ lock mutex
+ on bool // profiling is on
+ log *profBuf // profile events written here
+
+ // extra holds extra stacks accumulated in addNonGo
+ // corresponding to profiling signals arriving on
+ // non-Go-created threads. Those stacks are written
+ // to log the next time a normal Go thread gets the
+ // signal handler.
+ // Assuming the stacks are 2 words each (we don't get
+ // a full traceback from those threads), plus one word
+ // size for framing, 100 Hz profiling would generate
+ // 300 words per second.
+ // Hopefully a normal Go thread will get the profiling
+ // signal at least once every few seconds.
+ extra [1000]uintptr
+ numExtra int
+ lostExtra uint64 // count of frames lost because extra is full
}
-var (
- cpuprofLock mutex
- cpuprof *cpuProfile
-
- eod = [3]uintptr{0, 1, 0}
-)
-
-func setcpuprofilerate(hz int32) {
- systemstack(func() {
- setcpuprofilerate_m(hz)
- })
-}
-
-// lostProfileData is a no-op function used in profiles
-// to mark the number of profiling stack traces that were
-// discarded due to slow data writers.
-func lostProfileData() {}
+var cpuprof cpuProfile
// SetCPUProfileRate sets the CPU profiling rate to hz samples per second.
// If hz <= 0, SetCPUProfileRate turns off profiling.
@@ -132,323 +59,153 @@ func SetCPUProfileRate(hz int) {
hz = 1000000
}
- lock(&cpuprofLock)
+ lock(&cpuprof.lock)
if hz > 0 {
- if cpuprof == nil {
- cpuprof = (*cpuProfile)(sysAlloc(unsafe.Sizeof(cpuProfile{}), &memstats.other_sys))
- if cpuprof == nil {
- print("runtime: cpu profiling cannot allocate memory\n")
- unlock(&cpuprofLock)
- return
- }
- }
- if cpuprof.on || cpuprof.handoff != 0 {
+ if cpuprof.on || cpuprof.log != nil {
print("runtime: cannot set cpu profile rate until previous profile has finished.\n")
- unlock(&cpuprofLock)
+ unlock(&cpuprof.lock)
return
}
cpuprof.on = true
- // pprof binary header format.
- // https://github.com/gperftools/gperftools/blob/master/src/profiledata.cc#L119
- p := &cpuprof.log[0]
- p[0] = 0 // count for header
- p[1] = 3 // depth for header
- p[2] = 0 // version number
- p[3] = uintptr(1e6 / hz) // period (microseconds)
- p[4] = 0
- cpuprof.nlog = 5
- cpuprof.toggle = 0
- cpuprof.wholding = false
- cpuprof.wtoggle = 0
- cpuprof.flushing = false
- cpuprof.eodSent = false
- noteclear(&cpuprof.wait)
-
+ cpuprof.log = newProfBuf(1, 1<<17, 1<<14)
+ hdr := [1]uint64{uint64(hz)}
+ cpuprof.log.write(nil, nanotime(), hdr[:], nil)
setcpuprofilerate(int32(hz))
- } else if cpuprof != nil && cpuprof.on {
+ } else if cpuprof.on {
setcpuprofilerate(0)
cpuprof.on = false
-
- // Now add is not running anymore, and getprofile owns the entire log.
- // Set the high bit in cpuprof.handoff to tell getprofile.
- for {
- n := cpuprof.handoff
- if n&0x80000000 != 0 {
- print("runtime: setcpuprofile(off) twice\n")
- }
- if atomic.Cas(&cpuprof.handoff, n, n|0x80000000) {
- if n == 0 {
- // we did the transition from 0 -> nonzero so we wake getprofile
- notewakeup(&cpuprof.wait)
- }
- break
- }
- }
+ cpuprof.addExtra()
+ cpuprof.log.close()
}
- unlock(&cpuprofLock)
+ unlock(&cpuprof.lock)
}
// add adds the stack trace to the profile.
// It is called from signal handlers and other limited environments
// and cannot allocate memory or acquire locks that might be
// held at the time of the signal, nor can it use substantial amounts
-// of stack. It is allowed to call evict.
+// of stack.
//go:nowritebarrierrec
-func (p *cpuProfile) add(pc []uintptr) {
- p.addWithFlushlog(pc, p.flushlog)
-}
-
-// addWithFlushlog implements add and addNonGo.
-// It is called from signal handlers and other limited environments
-// and cannot allocate memory or acquire locks that might be
-// held at the time of the signal, nor can it use substantial amounts
-// of stack. It may be called by a signal handler with no g or m.
-// It is allowed to call evict, passing the flushlog parameter.
-//go:nosplit
-//go:nowritebarrierrec
-func (p *cpuProfile) addWithFlushlog(pc []uintptr, flushlog func() bool) {
- if len(pc) > maxCPUProfStack {
- pc = pc[:maxCPUProfStack]
- }
-
- // Compute hash.
- h := uintptr(0)
- for _, x := range pc {
- h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1)))
- h += x * 41
+func (p *cpuProfile) add(gp *g, stk []uintptr) {
+ // Simple cas-lock to coordinate with setcpuprofilerate.
+ for !atomic.Cas(&prof.signalLock, 0, 1) {
+ osyield()
}
- p.count++
- // Add to entry count if already present in table.
- b := &p.hash[h%numBuckets]
-Assoc:
- for i := range b.entry {
- e := &b.entry[i]
- if e.depth != len(pc) {
- continue
- }
- for j := range pc {
- if e.stack[j] != pc[j] {
- continue Assoc
- }
+ if prof.hz != 0 { // implies cpuprof.log != nil
+ if p.numExtra > 0 || p.lostExtra > 0 {
+ p.addExtra()
}
- e.count++
- return
+ hdr := [1]uint64{1}
+ // Note: write "knows" that the argument is &gp.labels,
+ // because otherwise its write barrier behavior may not
+ // be correct. See the long comment there before
+ // changing the argument here.
+ cpuprof.log.write(&gp.labels, nanotime(), hdr[:], stk)
}
- // Evict entry with smallest count.
- var e *cpuprofEntry
- for i := range b.entry {
- if e == nil || b.entry[i].count < e.count {
- e = &b.entry[i]
- }
- }
- if e.count > 0 {
- if !p.evict(e, flushlog) {
- // Could not evict entry. Record lost stack.
- p.lost++
- return
- }
- p.evicts++
- }
-
- // Reuse the newly evicted entry.
- e.depth = len(pc)
- e.count = 1
- copy(e.stack[:], pc)
+ atomic.Store(&prof.signalLock, 0)
}
-// evict copies the given entry's data into the log, so that
-// the entry can be reused. evict is called from add, which
-// is called from the profiling signal handler, so it must not
-// allocate memory or block, and it may be called with no g or m.
-// It is safe to call flushlog. evict returns true if the entry was
-// copied to the log, false if there was no room available.
+// addNonGo adds the non-Go stack trace to the profile.
+// It is called from a non-Go thread, so we cannot use much stack at all,
+// nor do anything that needs a g or an m.
+// In particular, we can't call cpuprof.log.write.
+// Instead, we copy the stack into cpuprof.extra,
+// which will be drained the next time a Go thread
+// gets the signal handling event.
//go:nosplit
//go:nowritebarrierrec
-func (p *cpuProfile) evict(e *cpuprofEntry, flushlog func() bool) bool {
- d := e.depth
- nslot := d + 2
- log := &p.log[p.toggle]
- if p.nlog+nslot > len(log) {
- if !flushlog() {
- return false
- }
- log = &p.log[p.toggle]
- }
-
- q := p.nlog
- log[q] = e.count
- q++
- log[q] = uintptr(d)
- q++
- copy(log[q:], e.stack[:d])
- q += d
- p.nlog = q
- e.count = 0
- return true
-}
-
-// flushlog tries to flush the current log and switch to the other one.
-// flushlog is called from evict, called from add, called from the signal handler,
-// so it cannot allocate memory or block. It can try to swap logs with
-// the writing goroutine, as explained in the comment at the top of this file.
-//go:nowritebarrierrec
-func (p *cpuProfile) flushlog() bool {
- if !atomic.Cas(&p.handoff, 0, uint32(p.nlog)) {
- return false
+func (p *cpuProfile) addNonGo(stk []uintptr) {
+ // Simple cas-lock to coordinate with SetCPUProfileRate.
+ // (Other calls to add or addNonGo should be blocked out
+ // by the fact that only one SIGPROF can be handled by the
+ // process at a time. If not, this lock will serialize those too.)
+ for !atomic.Cas(&prof.signalLock, 0, 1) {
+ osyield()
}
- notewakeup(&p.wait)
- p.toggle = 1 - p.toggle
- log := &p.log[p.toggle]
- q := 0
- if p.lost > 0 {
- lostPC := funcPC(lostProfileData)
- log[0] = p.lost
- log[1] = 1
- log[2] = lostPC
- q = 3
- p.lost = 0
+ if cpuprof.numExtra+1+len(stk) < len(cpuprof.extra) {
+ i := cpuprof.numExtra
+ cpuprof.extra[i] = uintptr(1 + len(stk))
+ copy(cpuprof.extra[i+1:], stk)
+ cpuprof.numExtra += 1 + len(stk)
+ } else {
+ cpuprof.lostExtra++
}
- p.nlog = q
- return true
-}
-// addNonGo is like add, but runs on a non-Go thread.
-// It can't do anything that might need a g or an m.
-// With this entry point, we don't try to flush the log when evicting an
-// old entry. Instead, we just drop the stack trace if we're out of space.
-//go:nosplit
-//go:nowritebarrierrec
-func (p *cpuProfile) addNonGo(pc []uintptr) {
- p.addWithFlushlog(pc, func() bool { return false })
+ atomic.Store(&prof.signalLock, 0)
}
-// getprofile blocks until the next block of profiling data is available
-// and returns it as a []byte. It is called from the writing goroutine.
-func (p *cpuProfile) getprofile() []byte {
- if p == nil {
- return nil
- }
-
- if p.wholding {
- // Release previous log to signal handling side.
- // Loop because we are racing against SetCPUProfileRate(0).
- for {
- n := p.handoff
- if n == 0 {
- print("runtime: phase error during cpu profile handoff\n")
- return nil
- }
- if n&0x80000000 != 0 {
- p.wtoggle = 1 - p.wtoggle
- p.wholding = false
- p.flushing = true
- goto Flush
- }
- if atomic.Cas(&p.handoff, n, 0) {
- break
- }
- }
- p.wtoggle = 1 - p.wtoggle
- p.wholding = false
- }
-
- if p.flushing {
- goto Flush
- }
-
- if !p.on && p.handoff == 0 {
- return nil
- }
-
- // Wait for new log.
- notetsleepg(&p.wait, -1)
- noteclear(&p.wait)
-
- switch n := p.handoff; {
- case n == 0:
- print("runtime: phase error during cpu profile wait\n")
- return nil
- case n == 0x80000000:
- p.flushing = true
- goto Flush
- default:
- n &^= 0x80000000
-
- // Return new log to caller.
- p.wholding = true
-
- return uintptrBytes(p.log[p.wtoggle][:n])
- }
-
- // In flush mode.
- // Add is no longer being called. We own the log.
- // Also, p.handoff is non-zero, so flushlog will return false.
- // Evict the hash table into the log and return it.
-Flush:
- for i := range p.hash {
- b := &p.hash[i]
- for j := range b.entry {
- e := &b.entry[j]
- if e.count > 0 && !p.evict(e, p.flushlog) {
- // Filled the log. Stop the loop and return what we've got.
- break Flush
- }
+// addExtra adds the "extra" profiling events,
+// queued by addNonGo, to the profile log.
+// addExtra is called either from a signal handler on a Go thread
+// or from an ordinary goroutine; either way it can use stack
+// and has a g. The world may be stopped, though.
+func (p *cpuProfile) addExtra() {
+ // Copy accumulated non-Go profile events.
+ hdr := [1]uint64{1}
+ for i := 0; i < p.numExtra; {
+ p.log.write(nil, 0, hdr[:], p.extra[i+1:i+int(p.extra[i])])
+ i += int(p.extra[i])
+ }
+ p.numExtra = 0
+
+ // Report any lost events.
+ if p.lostExtra > 0 {
+ hdr := [1]uint64{p.lostExtra}
+ lostStk := [2]uintptr{
+ funcPC(_LostExternalCode) + sys.PCQuantum,
+ funcPC(_ExternalCode) + sys.PCQuantum,
}
+ cpuprof.log.write(nil, 0, hdr[:], lostStk[:])
}
-
- // Return pending log data.
- if p.nlog > 0 {
- // Note that we're using toggle now, not wtoggle,
- // because we're working on the log directly.
- n := p.nlog
- p.nlog = 0
- return uintptrBytes(p.log[p.toggle][:n])
- }
-
- // Made it through the table without finding anything to log.
- if !p.eodSent {
- // We may not have space to append this to the partial log buf,
- // so we always return a new slice for the end-of-data marker.
- p.eodSent = true
- return uintptrBytes(eod[:])
- }
-
- // Finally done. Clean up and return nil.
- p.flushing = false
- if !atomic.Cas(&p.handoff, p.handoff, 0) {
- print("runtime: profile flush racing with something\n")
- }
- return nil
}
-func uintptrBytes(p []uintptr) (ret []byte) {
- pp := (*slice)(unsafe.Pointer(&p))
- rp := (*slice)(unsafe.Pointer(&ret))
-
- rp.array = pp.array
- rp.len = pp.len * int(unsafe.Sizeof(p[0]))
- rp.cap = rp.len
-
- return
+func (p *cpuProfile) addLostAtomic64(count uint64) {
+ hdr := [1]uint64{count}
+ lostStk := [2]uintptr{
+ funcPC(_LostSIGPROFDuringAtomic64) + sys.PCQuantum,
+ funcPC(_System) + sys.PCQuantum,
+ }
+ cpuprof.log.write(nil, 0, hdr[:], lostStk[:])
}
-// CPUProfile returns the next chunk of binary CPU profiling stack trace data,
-// blocking until data is available. If profiling is turned off and all the profile
-// data accumulated while it was on has been returned, CPUProfile returns nil.
-// The caller must save the returned data before calling CPUProfile again.
+// CPUProfile panics.
+// It formerly provided raw access to chunks of
+// a pprof-format profile generated by the runtime.
+// The details of generating that format have changed,
+// so this functionality has been removed.
//
-// Most clients should use the runtime/pprof package or
-// the testing package's -test.cpuprofile flag instead of calling
-// CPUProfile directly.
+// Deprecated: use the runtime/pprof package,
+// or the handlers in the net/http/pprof package,
+// or the testing package's -test.cpuprofile flag instead.
func CPUProfile() []byte {
- return cpuprof.getprofile()
+ panic("CPUProfile no longer available")
}
//go:linkname runtime_pprof_runtime_cyclesPerSecond runtime_pprof.runtime_cyclesPerSecond
func runtime_pprof_runtime_cyclesPerSecond() int64 {
return tickspersecond()
}
+
+// readProfile, provided to runtime/pprof, returns the next chunk of
+// binary CPU profiling stack trace data, blocking until data is available.
+// If profiling is turned off and all the profile data accumulated while it was
+// on has been returned, readProfile returns eof=true.
+// The caller must save the returned data and tags before calling readProfile again.
+//
+//go:linkname runtime_pprof_readProfile runtime_pprof.readProfile
+func runtime_pprof_readProfile() ([]uint64, []unsafe.Pointer, bool) {
+ lock(&cpuprof.lock)
+ log := cpuprof.log
+ unlock(&cpuprof.lock)
+ data, tags, eof := log.read(profBufBlocking)
+ if len(data) == 0 && eof {
+ lock(&cpuprof.lock)
+ cpuprof.log = nil
+ unlock(&cpuprof.lock)
+ }
+ return data, tags, eof
+}
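
Aside (not part of the patch): the rewritten profiler above drops the old hash-table/handoff machinery in favor of a profBuf that the signal handler writes and runtime_pprof_readProfile drains. The only writer-side coordination left is the tiny CAS spin-lock on prof.signalLock seen in add and addNonGo. A standalone sketch of that locking pattern, with hypothetical names and sync/atomic standing in for the runtime's internal atomics:

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

// signalLock plays the role of prof.signalLock: 0 means free, 1 means held.
var signalLock uint32

// withSignalLock spins with a CAS instead of blocking on a mutex, which is
// the only kind of lock that is safe to take from a signal handler.
func withSignalLock(fn func()) {
	for !atomic.CompareAndSwapUint32(&signalLock, 0, 1) {
		runtime.Gosched() // the runtime calls osyield(); Gosched stands in here
	}
	fn()
	atomic.StoreUint32(&signalLock, 0)
}

func main() {
	withSignalLock(func() { fmt.Println("sample recorded") })
}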
diff --git a/libgo/go/runtime/crash_cgo_test.go b/libgo/go/runtime/crash_cgo_test.go
index b338df9..b798731 100644
--- a/libgo/go/runtime/crash_cgo_test.go
+++ b/libgo/go/runtime/crash_cgo_test.go
@@ -24,7 +24,10 @@ func TestCgoCrashHandler(t *testing.T) {
}
func TestCgoSignalDeadlock(t *testing.T) {
- t.Parallel()
+ // Don't call t.Parallel, since too much work going on at the
+ // same time can cause the testprogcgo code to overrun its
+ // timeouts (issue #18598).
+
if testing.Short() && runtime.GOOS == "windows" {
t.Skip("Skipping in short mode") // takes up to 64 seconds
}
@@ -291,33 +294,43 @@ func testCgoPprof(t *testing.T, buildArg, runArg string) {
got, err := testEnv(exec.Command(exe, runArg)).CombinedOutput()
if err != nil {
+ if testenv.Builder() == "linux-amd64-alpine" {
+ // See Issue 18243 and Issue 19938.
+ t.Skipf("Skipping failing test on Alpine (golang.org/issue/18243). Ignoring error: %v", err)
+ }
t.Fatal(err)
}
fn := strings.TrimSpace(string(got))
defer os.Remove(fn)
- cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1", exe, fn))
-
- found := false
- for i, e := range cmd.Env {
- if strings.HasPrefix(e, "PPROF_TMPDIR=") {
- cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
- found = true
- break
+ for try := 0; try < 2; try++ {
+ cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1"))
+ // Check that pprof works both with and without explicit executable on command line.
+ if try == 0 {
+ cmd.Args = append(cmd.Args, exe, fn)
+ } else {
+ cmd.Args = append(cmd.Args, fn)
}
- }
- if !found {
- cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
- }
- top, err := cmd.CombinedOutput()
- t.Logf("%s", top)
- if err != nil {
- t.Fatal(err)
- }
+ found := false
+ for i, e := range cmd.Env {
+ if strings.HasPrefix(e, "PPROF_TMPDIR=") {
+ cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
+ found = true
+ break
+ }
+ }
+ if !found {
+ cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
+ }
- if !bytes.Contains(top, []byte("cpuHog")) {
- t.Error("missing cpuHog in pprof output")
+ top, err := cmd.CombinedOutput()
+ t.Logf("%s:\n%s", cmd.Args, top)
+ if err != nil {
+ t.Error(err)
+ } else if !bytes.Contains(top, []byte("cpuHog")) {
+ t.Error("missing cpuHog in pprof output")
+ }
}
}
@@ -397,3 +410,16 @@ func TestRaceSignal(t *testing.T) {
t.Errorf("expected %q got %s", want, got)
}
}
+
+func TestCgoNumGoroutine(t *testing.T) {
+ switch runtime.GOOS {
+ case "windows", "plan9":
+ t.Skipf("skipping numgoroutine test on %s", runtime.GOOS)
+ }
+ t.Parallel()
+ got := runTestProg(t, "testprogcgo", "NumGoroutine")
+ want := "OK\n"
+ if got != want {
+ t.Errorf("expected %q got %v", want, got)
+ }
+}
diff --git a/libgo/go/runtime/crash_test.go b/libgo/go/runtime/crash_test.go
index 4ba9d44..1cde6bf 100644
--- a/libgo/go/runtime/crash_test.go
+++ b/libgo/go/runtime/crash_test.go
@@ -164,6 +164,12 @@ func checkStaleRuntime(t *testing.T) {
return
}
if string(out) != "false\n" {
+ t.Logf("go list -f {{.Stale}} runtime:\n%s", out)
+ out, err := testEnv(exec.Command(testenv.GoToolPath(t), "list", "-f", "{{.StaleReason}}", "runtime")).CombinedOutput()
+ if err != nil {
+ t.Logf("go list -f {{.StaleReason}} failed: %v", err)
+ }
+ t.Logf("go list -f {{.StaleReason}} runtime:\n%s", out)
staleRuntimeErr = fmt.Errorf("Stale runtime.a. Run 'go install runtime'.")
}
})
@@ -305,6 +311,9 @@ func TestNoHelperGoroutines(t *testing.T) {
func TestBreakpoint(t *testing.T) {
output := runTestProg(t, "testprog", "Breakpoint")
+ // If runtime.Breakpoint() is inlined, then the stack trace prints
+ // "runtime.Breakpoint(...)" instead of "runtime.Breakpoint()".
+ // For gccgo, no parens.
want := "runtime.Breakpoint"
if !strings.Contains(output, want) {
t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want)
@@ -481,28 +490,33 @@ func TestMemPprof(t *testing.T) {
fn := strings.TrimSpace(string(got))
defer os.Remove(fn)
- cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top", exe, fn))
-
- found := false
- for i, e := range cmd.Env {
- if strings.HasPrefix(e, "PPROF_TMPDIR=") {
- cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
- found = true
- break
+ for try := 0; try < 2; try++ {
+ cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top"))
+ // Check that pprof works both with and without explicit executable on command line.
+ if try == 0 {
+ cmd.Args = append(cmd.Args, exe, fn)
+ } else {
+ cmd.Args = append(cmd.Args, fn)
+ }
+ found := false
+ for i, e := range cmd.Env {
+ if strings.HasPrefix(e, "PPROF_TMPDIR=") {
+ cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
+ found = true
+ break
+ }
+ }
+ if !found {
+ cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
}
- }
- if !found {
- cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
- }
-
- top, err := cmd.CombinedOutput()
- t.Logf("%s", top)
- if err != nil {
- t.Fatal(err)
- }
- if !bytes.Contains(top, []byte("MemProf")) {
- t.Error("missing MemProf in pprof output")
+ top, err := cmd.CombinedOutput()
+ t.Logf("%s:\n%s", cmd.Args, top)
+ if err != nil {
+ t.Error(err)
+ } else if !bytes.Contains(top, []byte("MemProf")) {
+ t.Error("missing MemProf in pprof output")
+ }
}
}
@@ -541,3 +555,87 @@ func TestConcurrentMapIterateWrite(t *testing.T) {
t.Fatalf("output does not start with %q:\n%s", want, output)
}
}
+
+type point struct {
+ x, y *int
+}
+
+func (p *point) negate() {
+ *p.x = *p.x * -1
+ *p.y = *p.y * -1
+}
+
+// Test for issue #10152.
+func TestPanicInlined(t *testing.T) {
+ defer func() {
+ r := recover()
+ if r == nil {
+ t.Fatalf("recover failed")
+ }
+ buf := make([]byte, 2048)
+ n := runtime.Stack(buf, false)
+ buf = buf[:n]
+ want := []byte("(*point).negate(")
+ if runtime.Compiler == "gccgo" {
+ want = []byte("negate.pN18_runtime_test.point")
+ }
+ if !bytes.Contains(buf, want) {
+ t.Logf("%s", buf)
+ t.Fatalf("expecting stack trace to contain call to %s", want)
+ }
+ }()
+
+ pt := new(point)
+ pt.negate()
+}
+
+// Test for issues #3934 and #20018.
+// We want to delay exiting until a panic print is complete.
+func TestPanicRace(t *testing.T) {
+ testenv.MustHaveGoRun(t)
+
+ exe, err := buildTestProg(t, "testprog")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // The test is intentionally racy, and in my testing does not
+ // produce the expected output about 0.05% of the time.
+ // So run the program in a loop and only fail the test if we
+ // get the wrong output ten times in a row.
+ const tries = 10
+retry:
+ for i := 0; i < tries; i++ {
+ got, err := testEnv(exec.Command(exe, "PanicRace")).CombinedOutput()
+ if err == nil {
+ t.Logf("try %d: program exited successfully, should have failed", i+1)
+ continue
+ }
+
+ if i > 0 {
+ t.Logf("try %d:\n", i+1)
+ }
+ t.Logf("%s\n", got)
+
+ wants := []string{
+ "panic: crash",
+ "PanicRace",
+ "created by ",
+ }
+ if runtime.Compiler == "gccgo" {
+ // gccgo will dump a function name like main.$nested30.
+ // Match on the file name instead.
+ wants[1] = "panicrace"
+ }
+ for _, want := range wants {
+ if !bytes.Contains(got, []byte(want)) {
+ t.Logf("did not find expected string %q", want)
+ continue retry
+ }
+ }
+
+ // Test generated expected output.
+ return
+ }
+ t.Errorf("test ran %d times without producing expected output", tries)
+}
diff --git a/libgo/go/runtime/crash_unix_test.go b/libgo/go/runtime/crash_unix_test.go
index 7a29c1e..09c2547 100644
--- a/libgo/go/runtime/crash_unix_test.go
+++ b/libgo/go/runtime/crash_unix_test.go
@@ -24,6 +24,15 @@ import (
// Send SIGQUIT to get a stack trace.
var sigquit = syscall.SIGQUIT
+func init() {
+ if runtime.Sigisblocked(int(syscall.SIGQUIT)) {
+ // We can't use SIGQUIT to kill subprocesses because
+ // it's blocked. Use SIGKILL instead. See issue
+ // #19196 for an example of when this happens.
+ sigquit = syscall.SIGKILL
+ }
+}
+
func TestCrashDumpsAllThreads(t *testing.T) {
switch runtime.GOOS {
case "darwin", "dragonfly", "freebsd", "linux", "netbsd", "openbsd", "solaris":
@@ -31,6 +40,10 @@ func TestCrashDumpsAllThreads(t *testing.T) {
t.Skipf("skipping; not supported on %v", runtime.GOOS)
}
+ if runtime.Sigisblocked(int(syscall.SIGQUIT)) {
+ t.Skip("skipping; SIGQUIT is blocked, see golang.org/issue/19196")
+ }
+
// We don't use executeTest because we need to kill the
// program while it is running.
@@ -165,6 +178,10 @@ func TestPanicSystemstack(t *testing.T) {
t.Skip("Skipping in short mode (GOTRACEBACK=crash is slow)")
}
+ if runtime.Sigisblocked(int(syscall.SIGQUIT)) {
+ t.Skip("skipping; SIGQUIT is blocked, see golang.org/issue/19196")
+ }
+
t.Parallel()
cmd := exec.Command(os.Args[0], "testPanicSystemstackInternal")
cmd = testEnv(cmd)
@@ -251,3 +268,16 @@ func TestSignalIgnoreSIGTRAP(t *testing.T) {
t.Fatalf("want %s, got %s\n", want, output)
}
}
+
+func TestSignalDuringExec(t *testing.T) {
+ switch runtime.GOOS {
+ case "darwin", "dragonfly", "freebsd", "linux", "netbsd", "openbsd":
+ default:
+ t.Skipf("skipping test on %s", runtime.GOOS)
+ }
+ output := runTestProg(t, "testprognet", "SignalDuringExec")
+ want := "OK\n"
+ if output != want {
+ t.Fatalf("want %s, got %s\n", want, output)
+ }
+}
diff --git a/libgo/go/runtime/debug/garbage.go b/libgo/go/runtime/debug/garbage.go
index c82c024..785e9d4 100644
--- a/libgo/go/runtime/debug/garbage.go
+++ b/libgo/go/runtime/debug/garbage.go
@@ -89,9 +89,7 @@ func ReadGCStats(stats *GCStats) {
// at startup, or 100 if the variable is not set.
// A negative percentage disables garbage collection.
func SetGCPercent(percent int) int {
- old := setGCPercent(int32(percent))
- runtime.GC()
- return int(old)
+ return int(setGCPercent(int32(percent)))
}
// FreeOSMemory forces a garbage collection followed by an
diff --git a/libgo/go/runtime/debug/garbage_test.go b/libgo/go/runtime/debug/garbage_test.go
index 04e954b..62eeb2c 100644
--- a/libgo/go/runtime/debug/garbage_test.go
+++ b/libgo/go/runtime/debug/garbage_test.go
@@ -5,6 +5,7 @@
package debug_test
import (
+ "internal/testenv"
"runtime"
. "runtime/debug"
"testing"
@@ -104,15 +105,78 @@ func TestFreeOSMemory(t *testing.T) {
}
}
+var (
+ setGCPercentBallast interface{}
+ setGCPercentSink interface{}
+)
+
func TestSetGCPercent(t *testing.T) {
+ testenv.SkipFlaky(t, 20076)
+
// Test that the variable is being set and returned correctly.
- // Assume the percentage itself is implemented fine during GC,
- // which is harder to test.
old := SetGCPercent(123)
new := SetGCPercent(old)
if new != 123 {
t.Errorf("SetGCPercent(123); SetGCPercent(x) = %d, want 123", new)
}
+
+ // Test that the percentage is implemented correctly.
+ defer func() {
+ SetGCPercent(old)
+ setGCPercentBallast, setGCPercentSink = nil, nil
+ }()
+ SetGCPercent(100)
+ runtime.GC()
+ // Create 100 MB of live heap as a baseline.
+ const baseline = 100 << 20
+ var ms runtime.MemStats
+ runtime.ReadMemStats(&ms)
+ setGCPercentBallast = make([]byte, baseline-ms.Alloc)
+ runtime.GC()
+ runtime.ReadMemStats(&ms)
+ if abs64(baseline-int64(ms.Alloc)) > 10<<20 {
+ t.Fatalf("failed to set up baseline live heap; got %d MB, want %d MB", ms.Alloc>>20, baseline>>20)
+ }
+ // NextGC should be ~200 MB.
+ const thresh = 20 << 20 // TODO: Figure out why this is so noisy on some builders
+ if want := int64(2 * baseline); abs64(want-int64(ms.NextGC)) > thresh {
+ t.Errorf("NextGC = %d MB, want %d±%d MB", ms.NextGC>>20, want>>20, thresh>>20)
+ }
+ // Create some garbage, but not enough to trigger another GC.
+ for i := 0; float64(i) < 1.2*baseline; i += 1 << 10 {
+ setGCPercentSink = make([]byte, 1<<10)
+ }
+ setGCPercentSink = nil
+ // Adjust GOGC to 50. NextGC should be ~150 MB.
+ SetGCPercent(50)
+ runtime.ReadMemStats(&ms)
+ if want := int64(1.5 * baseline); abs64(want-int64(ms.NextGC)) > thresh {
+ t.Errorf("NextGC = %d MB, want %d±%d MB", ms.NextGC>>20, want>>20, thresh>>20)
+ }
+
+ // Trigger a GC and get back to 100 MB live with GOGC=100.
+ SetGCPercent(100)
+ runtime.GC()
+ // Raise live to 120 MB.
+ setGCPercentSink = make([]byte, int(0.2*baseline))
+ // Lower GOGC to 10. This must force a GC.
+ runtime.ReadMemStats(&ms)
+ ngc1 := ms.NumGC
+ SetGCPercent(10)
+ // It may require an allocation to actually force the GC.
+ setGCPercentSink = make([]byte, 1<<20)
+ runtime.ReadMemStats(&ms)
+ ngc2 := ms.NumGC
+ if ngc1 == ngc2 {
+ t.Errorf("expected GC to run but it did not")
+ }
+}
+
+func abs64(a int64) int64 {
+ if a < 0 {
+ return -a
+ }
+ return a
}
func TestSetMaxThreadsOvf(t *testing.T) {
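
Aside (not part of the patch): the figures the new TestSetGCPercent checks come from the GC pacing rule that the next collection is targeted at roughly live heap × (1 + GOGC/100). A small sketch of that arithmetic, using an illustrative helper that is not part of the test:

package main

import "fmt"

// nextGCGoal computes the approximate heap trigger the test expects:
// goal = liveHeap * (1 + gogc/100).
func nextGCGoal(liveHeap, gogc uint64) uint64 {
	return liveHeap + liveHeap*gogc/100
}

func main() {
	const baseline = 100 << 20 // the test's ~100 MB live heap
	fmt.Println(nextGCGoal(baseline, 100)>>20, "MB") // ~200 MB with GOGC=100
	fmt.Println(nextGCGoal(baseline, 50)>>20, "MB")  // ~150 MB with GOGC=50
}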
diff --git a/libgo/go/runtime/env_posix.go b/libgo/go/runtime/env_posix.go
index 9bf7ddc..ddf3c02 100644
--- a/libgo/go/runtime/env_posix.go
+++ b/libgo/go/runtime/env_posix.go
@@ -11,7 +11,7 @@ func gogetenv(key string) string {
if env == nil {
throw("getenv before env init")
}
- for _, s := range environ() {
+ for _, s := range env {
if len(s) > len(key) && s[len(key)] == '=' && s[:len(key)] == key {
return s[len(key)+1:]
}
diff --git a/libgo/go/runtime/error.go b/libgo/go/runtime/error.go
index 9cf2230..44e63d8 100644
--- a/libgo/go/runtime/error.go
+++ b/libgo/go/runtime/error.go
@@ -140,8 +140,6 @@ func typestring(x interface{}) string {
// For calling from C.
// Prints an argument passed to panic.
-// There's room for arbitrary complexity here, but we keep it
-// simple and handle just a few important cases: int, string, and Stringer.
func printany(i interface{}) {
switch v := i.(type) {
case nil:
@@ -150,16 +148,41 @@ func printany(i interface{}) {
print(v.String())
case error:
print(v.Error())
+ case bool:
+ print(v)
case int:
print(v)
+ case int8:
+ print(v)
+ case int16:
+ print(v)
+ case int32:
+ print(v)
+ case int64:
+ print(v)
+ case uint:
+ print(v)
+ case uint8:
+ print(v)
+ case uint16:
+ print(v)
+ case uint32:
+ print(v)
+ case uint64:
+ print(v)
+ case uintptr:
+ print(v)
+ case float32:
+ print(v)
+ case float64:
+ print(v)
+ case complex64:
+ print(v)
+ case complex128:
+ print(v)
case string:
print(v)
default:
print("(", typestring(i), ") ", i)
}
}
-
-// called from generated code
-func panicwrap(pkg, typ, meth string) {
- panic(plainError("value method " + pkg + "." + typ + "." + meth + " called using nil *" + typ + " pointer"))
-}
diff --git a/libgo/go/runtime/example_test.go b/libgo/go/runtime/example_test.go
new file mode 100644
index 0000000..e4912a5
--- /dev/null
+++ b/libgo/go/runtime/example_test.go
@@ -0,0 +1,54 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ "fmt"
+ "runtime"
+ "strings"
+)
+
+func ExampleFrames() {
+ c := func() {
+ // Ask runtime.Callers for up to 10 pcs, including runtime.Callers itself.
+ pc := make([]uintptr, 10)
+ n := runtime.Callers(0, pc)
+ if n == 0 {
+ // No pcs available. Stop now.
+ // This can happen if the first argument to runtime.Callers is large.
+ return
+ }
+
+ pc = pc[:n] // pass only valid pcs to runtime.CallersFrames
+ frames := runtime.CallersFrames(pc)
+
+ // Loop to get frames.
+ // A fixed number of pcs can expand to an indefinite number of Frames.
+ for {
+ frame, more := frames.Next()
+ // To keep this example's output stable
+ // even if there are changes in the testing package,
+ // stop unwinding when we leave package runtime.
+ if !strings.Contains(frame.File, "runtime/") {
+ break
+ }
+ fmt.Printf("- more:%v | %s\n", more, frame.Function)
+ if !more {
+ break
+ }
+ }
+ }
+
+ b := func() { c() }
+ a := func() { b() }
+
+ a()
+ // Output:
+ // - more:true | runtime.Callers
+ // - more:true | runtime_test.ExampleFrames.func1
+ // - more:true | runtime_test.ExampleFrames.func2
+ // - more:true | runtime_test.ExampleFrames.func3
+ // - more:true | runtime_test.ExampleFrames
+}
diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go
index bf435f4..6325dcb 100644
--- a/libgo/go/runtime/export_test.go
+++ b/libgo/go/runtime/export_test.go
@@ -41,11 +41,11 @@ type LFNode struct {
}
func LFStackPush(head *uint64, node *LFNode) {
- lfstackpush(head, (*lfnode)(unsafe.Pointer(node)))
+ (*lfstack)(head).push((*lfnode)(unsafe.Pointer(node)))
}
func LFStackPop(head *uint64) *LFNode {
- return (*LFNode)(unsafe.Pointer(lfstackpop(head)))
+ return (*LFNode)(unsafe.Pointer((*lfstack)(head).pop()))
}
func GCMask(x interface{}) (ret []byte) {
@@ -241,6 +241,97 @@ func CountPagesInUse() (pagesInUse, counted uintptr) {
return
}
+func Fastrand() uint32 { return fastrand() }
+func Fastrandn(n uint32) uint32 { return fastrandn(n) }
+
+type ProfBuf profBuf
+
+func NewProfBuf(hdrsize, bufwords, tags int) *ProfBuf {
+ return (*ProfBuf)(newProfBuf(hdrsize, bufwords, tags))
+}
+
+func (p *ProfBuf) Write(tag *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
+ (*profBuf)(p).write(tag, now, hdr, stk)
+}
+
+const (
+ ProfBufBlocking = profBufBlocking
+ ProfBufNonBlocking = profBufNonBlocking
+)
+
+func (p *ProfBuf) Read(mode profBufReadMode) ([]uint64, []unsafe.Pointer, bool) {
+ return (*profBuf)(p).read(profBufReadMode(mode))
+}
+
+func (p *ProfBuf) Close() {
+ (*profBuf)(p).close()
+}
+
+// ReadMemStatsSlow returns both the runtime-computed MemStats and
+// MemStats accumulated by scanning the heap.
+func ReadMemStatsSlow() (base, slow MemStats) {
+ stopTheWorld("ReadMemStatsSlow")
+
+ // Run on the system stack to avoid stack growth allocation.
+ systemstack(func() {
+ // Make sure stats don't change.
+ getg().m.mallocing++
+
+ readmemstats_m(&base)
+
+ // Initialize slow from base and zero the fields we're
+ // recomputing.
+ slow = base
+ slow.Alloc = 0
+ slow.TotalAlloc = 0
+ slow.Mallocs = 0
+ slow.Frees = 0
+ var bySize [_NumSizeClasses]struct {
+ Mallocs, Frees uint64
+ }
+
+ // Add up current allocations in spans.
+ for _, s := range mheap_.allspans {
+ if s.state != mSpanInUse {
+ continue
+ }
+ if sizeclass := s.spanclass.sizeclass(); sizeclass == 0 {
+ slow.Mallocs++
+ slow.Alloc += uint64(s.elemsize)
+ } else {
+ slow.Mallocs += uint64(s.allocCount)
+ slow.Alloc += uint64(s.allocCount) * uint64(s.elemsize)
+ bySize[sizeclass].Mallocs += uint64(s.allocCount)
+ }
+ }
+
+ // Add in frees. readmemstats_m flushed the cached stats, so
+ // these are up-to-date.
+ var smallFree uint64
+ slow.Frees = mheap_.nlargefree
+ for i := range mheap_.nsmallfree {
+ slow.Frees += mheap_.nsmallfree[i]
+ bySize[i].Frees = mheap_.nsmallfree[i]
+ bySize[i].Mallocs += mheap_.nsmallfree[i]
+ smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
+ }
+ slow.Frees += memstats.tinyallocs
+ slow.Mallocs += slow.Frees
+
+ slow.TotalAlloc = slow.Alloc + mheap_.largefree + smallFree
+
+ for i := range slow.BySize {
+ slow.BySize[i].Mallocs = bySize[i].Mallocs
+ slow.BySize[i].Frees = bySize[i].Frees
+ }
+
+ getg().m.mallocing--
+ })
+
+ startTheWorld()
+ return
+}
+
// BlockOnSystemStack switches to the system stack, prints "x\n" to
// stderr, and blocks in a stack containing
// "runtime.blockOnSystemStackInternal".
@@ -253,3 +344,23 @@ func blockOnSystemStackInternal() {
lock(&deadlock)
lock(&deadlock)
}
+
+type RWMutex struct {
+ rw rwmutex
+}
+
+func (rw *RWMutex) RLock() {
+ rw.rw.rlock()
+}
+
+func (rw *RWMutex) RUnlock() {
+ rw.rw.runlock()
+}
+
+func (rw *RWMutex) Lock() {
+ rw.rw.lock()
+}
+
+func (rw *RWMutex) Unlock() {
+ rw.rw.unlock()
+}
diff --git a/libgo/go/runtime/export_unix_test.go b/libgo/go/runtime/export_unix_test.go
new file mode 100644
index 0000000..54d5770
--- /dev/null
+++ b/libgo/go/runtime/export_unix_test.go
@@ -0,0 +1,19 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+
+package runtime
+
+func sigismember(mask *sigset, i int) bool {
+ clear := *mask
+ sigdelset(&clear, i)
+ return clear != *mask
+}
+
+func Sigisblocked(i int) bool {
+ var sigmask sigset
+ sigprocmask(_SIG_SETMASK, nil, &sigmask)
+ return sigismember(&sigmask, i)
+}
diff --git a/libgo/go/runtime/extern.go b/libgo/go/runtime/extern.go
index 5c50760..6ca9789 100644
--- a/libgo/go/runtime/extern.go
+++ b/libgo/go/runtime/extern.go
@@ -50,13 +50,6 @@ It is a comma-separated list of name=val pairs setting these named variables:
gcshrinkstackoff: setting gcshrinkstackoff=1 disables moving goroutines
onto smaller stacks. In this mode, a goroutine's stack can only grow.
- gcstackbarrieroff: setting gcstackbarrieroff=1 disables the use of stack barriers
- that allow the garbage collector to avoid repeating a stack scan during the
- mark termination phase.
-
- gcstackbarrierall: setting gcstackbarrierall=1 installs stack barriers
- in every stack frame, rather than in exponentially-spaced frames.
-
gcrescanstacks: setting gcrescanstacks=1 enables stack
re-scanning during the STW mark termination phase. This is
helpful for debugging if objects are being prematurely
@@ -85,7 +78,7 @@ It is a comma-separated list of name=val pairs setting these named variables:
for mark/scan are broken down in to assist time (GC performed in
line with allocation), background GC time, and idle GC time.
If the line ends with "(forced)", this GC was forced by a
- runtime.GC() call and all phases are STW.
+ runtime.GC() call.
Setting gctrace to any value > 0 also causes the garbage collector
to emit a summary when memory is released back to the system.
@@ -173,7 +166,7 @@ func Gosched()
// to ascend, with 0 identifying the caller of Caller. (For historical reasons the
// meaning of skip differs between Caller and Callers.) The return values report the
// program counter, file name, and line number within the file of the corresponding
-// call. The boolean ok is false if it was not possible to recover the information.
+// call. The boolean ok is false if it was not possible to recover the information.
func Caller(skip int) (pc uintptr, file string, line int, ok bool)
// Callers fills the slice pc with the return program counters of function invocations
@@ -181,6 +174,14 @@ func Caller(skip int) (pc uintptr, file string, line int, ok bool)
// to skip before recording in pc, with 0 identifying the frame for Callers itself and
// 1 identifying the caller of Callers.
// It returns the number of entries written to pc.
+//
+// To translate these PCs into symbolic information such as function
+// names and line numbers, use CallersFrames. CallersFrames accounts
+// for inlined functions and adjusts the return program counters into
+// call program counters. Iterating over the returned slice of PCs
+// directly is discouraged, as is using FuncForPC on any of the
+// returned PCs, since these cannot account for inlining or return
+// program counter adjustment.
func Callers(skip int, pc []uintptr) int
// GOROOT returns the root of the Go tree.
@@ -206,7 +207,7 @@ func Version() string {
const GOOS string = sys.GOOS
// GOARCH is the running program's architecture target:
-// 386, amd64, arm, or s390x.
+// one of 386, amd64, arm, s390x, and so on.
const GOARCH string = sys.GOARCH
// GCCGOTOOLDIR is the Tool Dir for the gccgo build
diff --git a/libgo/go/runtime/fastlog2.go b/libgo/go/runtime/fastlog2.go
index 5f3fb53..1f251bf 100644
--- a/libgo/go/runtime/fastlog2.go
+++ b/libgo/go/runtime/fastlog2.go
@@ -4,8 +4,6 @@
package runtime
-import "unsafe"
-
// fastlog2 implements a fast approximation to the base 2 log of a
// float64. This is used to compute a geometric distribution for heap
// sampling, without introducing dependencies into package math. This
@@ -27,7 +25,3 @@ func fastlog2(x float64) float64 {
low, high := fastlog2Table[xManIndex], fastlog2Table[xManIndex+1]
return float64(xExp) + low + (high-low)*float64(xManScale)*fastlogScaleRatio
}
-
-// float64bits returns the IEEE 754 binary representation of f.
-// Taken from math.Float64bits to avoid dependencies into package math.
-func float64bits(f float64) uint64 { return *(*uint64)(unsafe.Pointer(&f)) }
diff --git a/libgo/go/runtime/float.go b/libgo/go/runtime/float.go
new file mode 100644
index 0000000..459e58d
--- /dev/null
+++ b/libgo/go/runtime/float.go
@@ -0,0 +1,53 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+var inf = float64frombits(0x7FF0000000000000)
+
+// isNaN reports whether f is an IEEE 754 ``not-a-number'' value.
+func isNaN(f float64) (is bool) {
+ // IEEE 754 says that only NaNs satisfy f != f.
+ return f != f
+}
+
+// isFinite reports whether f is neither NaN nor an infinity.
+func isFinite(f float64) bool {
+ return !isNaN(f - f)
+}
+
+// isInf reports whether f is an infinity.
+func isInf(f float64) bool {
+ return !isNaN(f) && !isFinite(f)
+}
+
+// Abs returns the absolute value of x.
+//
+// Special cases are:
+// Abs(±Inf) = +Inf
+// Abs(NaN) = NaN
+func abs(x float64) float64 {
+ const sign = 1 << 63
+ return float64frombits(float64bits(x) &^ sign)
+}
+
+// copysign returns a value with the magnitude
+// of x and the sign of y.
+func copysign(x, y float64) float64 {
+ const sign = 1 << 63
+ return float64frombits(float64bits(x)&^sign | float64bits(y)&sign)
+}
+
+// Float64bits returns the IEEE 754 binary representation of f.
+func float64bits(f float64) uint64 {
+ return *(*uint64)(unsafe.Pointer(&f))
+}
+
+// Float64frombits returns the floating point number corresponding
+// the IEEE 754 binary representation b.
+func float64frombits(b uint64) float64 {
+ return *(*float64)(unsafe.Pointer(&b))
+}
diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go
index ec043ed..f14e0d5 100644
--- a/libgo/go/runtime/gc_test.go
+++ b/libgo/go/runtime/gc_test.go
@@ -5,6 +5,7 @@
package runtime_test
import (
+ "fmt"
"os"
"reflect"
"runtime"
@@ -450,3 +451,53 @@ func TestPageAccounting(t *testing.T) {
t.Fatalf("mheap_.pagesInUse is %d, but direct count is %d", pagesInUse, counted)
}
}
+
+func TestReadMemStats(t *testing.T) {
+ base, slow := runtime.ReadMemStatsSlow()
+ if base != slow {
+ logDiff(t, "MemStats", reflect.ValueOf(base), reflect.ValueOf(slow))
+ t.Fatal("memstats mismatch")
+ }
+}
+
+func logDiff(t *testing.T, prefix string, got, want reflect.Value) {
+ typ := got.Type()
+ switch typ.Kind() {
+ case reflect.Array, reflect.Slice:
+ if got.Len() != want.Len() {
+ t.Logf("len(%s): got %v, want %v", prefix, got, want)
+ return
+ }
+ for i := 0; i < got.Len(); i++ {
+ logDiff(t, fmt.Sprintf("%s[%d]", prefix, i), got.Index(i), want.Index(i))
+ }
+ case reflect.Struct:
+ for i := 0; i < typ.NumField(); i++ {
+ gf, wf := got.Field(i), want.Field(i)
+ logDiff(t, prefix+"."+typ.Field(i).Name, gf, wf)
+ }
+ case reflect.Map:
+ t.Fatal("not implemented: logDiff for map")
+ default:
+ if got.Interface() != want.Interface() {
+ t.Logf("%s: got %v, want %v", prefix, got, want)
+ }
+ }
+}
+
+func BenchmarkReadMemStats(b *testing.B) {
+ var ms runtime.MemStats
+ const heapSize = 100 << 20
+ x := make([]*[1024]byte, heapSize/1024)
+ for i := range x {
+ x[i] = new([1024]byte)
+ }
+ hugeSink = x
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ runtime.ReadMemStats(&ms)
+ }
+
+ hugeSink = nil
+}
diff --git a/libgo/go/runtime/hashmap.go b/libgo/go/runtime/hashmap.go
index 5b191d4..a3e50cd 100644
--- a/libgo/go/runtime/hashmap.go
+++ b/libgo/go/runtime/hashmap.go
@@ -129,6 +129,11 @@ type hmap struct {
oldbuckets unsafe.Pointer // previous bucket array of half the size, non-nil only when growing
nevacuate uintptr // progress counter for evacuation (buckets less than this have been evacuated)
+ extra *mapextra // optional fields
+}
+
+// mapextra holds fields that are not present on all maps.
+type mapextra struct {
// If both key and value do not contain pointers and are inline, then we mark bucket
// type as containing no pointers. This avoids scanning such maps.
// However, bmap.overflow is a pointer. In order to keep overflow buckets
@@ -136,9 +141,11 @@ type hmap struct {
// Overflow is used only if key and value do not contain pointers.
// overflow[0] contains overflow buckets for hmap.buckets.
// overflow[1] contains overflow buckets for hmap.oldbuckets.
- // The first indirection allows us to reduce static size of hmap.
- // The second indirection allows to store a pointer to the slice in hiter.
- overflow *[2]*[]*bmap
+ // The indirection makes it possible to store a pointer to the slice in hiter.
+ overflow [2]*[]*bmap
+
+ // nextOverflow holds a pointer to a free overflow bucket.
+ nextOverflow *bmap
}
// A bucket for a Go map.
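The mapextra split matters because buckets of pointer-free maps are allocated as no-scan memory, yet each bucket still ends in a real *bmap overflow pointer, so the overflow slices hung off hmap.extra are what keep chained buckets alive. A rough sketch of that "scannable side table keeps GC-invisible links alive" pattern, with illustrative names that are not from this patch:

package main

import "fmt"

// node's link lives in memory the collector is told not to scan
// (think of the bucket array of a pointer-free map), so the link
// alone would not keep the next node reachable.
type node struct {
	payload [64]byte
	next    *node
}

// sideTable plays the role of mapextra.overflow: an ordinary slice
// that the collector does scan, holding a reference to every node
// we ever chained, which is what actually keeps them alive.
type sideTable struct {
	keepAlive []*node
}

func (s *sideTable) newNode() *node {
	n := &node{}
	s.keepAlive = append(s.keepAlive, n)
	return n
}

func main() {
	var s sideTable
	head := s.newNode()
	head.next = s.newNode()       // chained like an overflow bucket
	fmt.Println(len(s.keepAlive)) // 2
}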
@@ -183,6 +190,10 @@ func (b *bmap) overflow(t *maptype) *bmap {
return *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize))
}
+func (b *bmap) setoverflow(t *maptype, ovf *bmap) {
+ *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize)) = ovf
+}
+
// incrnoverflow increments h.noverflow.
// noverflow counts the number of overflow buckets.
// This is used to trigger same-size map growth.
@@ -209,21 +220,40 @@ func (h *hmap) incrnoverflow() {
}
}
-func (h *hmap) setoverflow(t *maptype, b, ovf *bmap) {
+func (h *hmap) newoverflow(t *maptype, b *bmap) *bmap {
+ var ovf *bmap
+ if h.extra != nil && h.extra.nextOverflow != nil {
+ // We have preallocated overflow buckets available.
+ // See makeBucketArray for more details.
+ ovf = h.extra.nextOverflow
+ if ovf.overflow(t) == nil {
+ // We're not at the end of the preallocated overflow buckets. Bump the pointer.
+ h.extra.nextOverflow = (*bmap)(add(unsafe.Pointer(ovf), uintptr(t.bucketsize)))
+ } else {
+ // This is the last preallocated overflow bucket.
+ // Reset the overflow pointer on this bucket,
+ // which was set to a non-nil sentinel value.
+ ovf.setoverflow(t, nil)
+ h.extra.nextOverflow = nil
+ }
+ } else {
+ ovf = (*bmap)(newobject(t.bucket))
+ }
h.incrnoverflow()
if t.bucket.kind&kindNoPointers != 0 {
h.createOverflow()
- *h.overflow[0] = append(*h.overflow[0], ovf)
+ *h.extra.overflow[0] = append(*h.extra.overflow[0], ovf)
}
- *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize)) = ovf
+ b.setoverflow(t, ovf)
+ return ovf
}
func (h *hmap) createOverflow() {
- if h.overflow == nil {
- h.overflow = new([2]*[]*bmap)
+ if h.extra == nil {
+ h.extra = new(mapextra)
}
- if h.overflow[0] == nil {
- h.overflow[0] = new([]*bmap)
+ if h.extra.overflow[0] == nil {
+ h.extra.overflow[0] = new([]*bmap)
}
}
@@ -238,9 +268,8 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap {
throw("bad hmap size")
}
- if hint < 0 || int64(int32(hint)) != hint {
- panic(plainError("makemap: size out of range"))
- // TODO: make hint an int, then none of this nonsense
+ if hint < 0 || hint > int64(maxSliceCap(t.bucket.size)) {
+ hint = 0
}
if !ismapkey(t.key) {
@@ -290,8 +319,14 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap {
// if B == 0, the buckets field is allocated lazily later (in mapassign)
 // If hint is large, zeroing this memory could take a while.
buckets := bucket
+ var extra *mapextra
if B != 0 {
- buckets = newarray(t.bucket, 1<<B)
+ var nextOverflow *bmap
+ buckets, nextOverflow = makeBucketArray(t, B)
+ if nextOverflow != nil {
+ extra = new(mapextra)
+ extra.nextOverflow = nextOverflow
+ }
}
// initialize Hmap
@@ -300,6 +335,7 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap {
}
h.count = 0
h.B = B
+ h.extra = extra
h.flags = 0
h.hash0 = fastrand()
h.buckets = buckets
@@ -514,12 +550,14 @@ func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
if h.flags&hashWriting != 0 {
throw("concurrent map writes")
}
- h.flags |= hashWriting
-
hashfn := t.key.hashfn
equalfn := t.key.equalfn
hash := hashfn(key, uintptr(h.hash0))
+ // Set hashWriting after calling alg.hash, since alg.hash may panic,
+ // in which case we have not actually done a write.
+ h.flags |= hashWriting
+
if h.buckets == nil {
h.buckets = newarray(t.bucket, 1)
}
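Setting hashWriting only after the hash call matters because the hash function can panic (for example on an unhashable dynamic key type behind an interface); with the old ordering a recovered panic left the flag set and the next write on the map was misreported as a concurrent map write. A small standalone sketch of the behavior the reordering preserves (illustration only):

package main

import "fmt"

func main() {
	m := map[interface{}]int{}
	defer func() {
		fmt.Println("recovered:", recover())
		// Because hashWriting was never set for the failed insert,
		// this write succeeds instead of throwing "concurrent map writes".
		m["ok"] = 1
		fmt.Println(m["ok"]) // 1
	}()
	m[[]int{1}] = 1 // the hash function panics: slices are unhashable
}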
@@ -580,8 +618,7 @@ again:
if inserti == nil {
// all current buckets are full, allocate a new one.
- newb := (*bmap)(newobject(t.bucket))
- h.setoverflow(t, b, newb)
+ newb := h.newoverflow(t, b)
inserti = &newb.tophash[0]
insertk = add(unsafe.Pointer(newb), dataOffset)
val = add(insertk, bucketCnt*uintptr(t.keysize))
@@ -628,11 +665,15 @@ func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) {
if h.flags&hashWriting != 0 {
throw("concurrent map writes")
}
- h.flags |= hashWriting
hashfn := t.key.hashfn
equalfn := t.key.equalfn
hash := hashfn(key, uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash, since alg.hash may panic,
+ // in which case we have not actually done a write (delete).
+ h.flags |= hashWriting
+
bucket := hash & (uintptr(1)<<h.B - 1)
if h.growing() {
growWork(t, h, bucket)
@@ -720,7 +761,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) {
// the table grows and/or overflow buckets are added to the table
// while we are iterating.
h.createOverflow()
- it.overflow = *h.overflow
+ it.overflow = h.extra.overflow
}
// decide where to start
@@ -886,6 +927,36 @@ next:
goto next
}
+func makeBucketArray(t *maptype, b uint8) (buckets unsafe.Pointer, nextOverflow *bmap) {
+ base := uintptr(1 << b)
+ nbuckets := base
+ // For small b, overflow buckets are unlikely.
+ // Avoid the overhead of the calculation.
+ if b >= 4 {
+ // Add on the estimated number of overflow buckets
+ // required to insert the median number of elements
+ // used with this value of b.
+ nbuckets += 1 << (b - 4)
+ sz := t.bucket.size * nbuckets
+ up := roundupsize(sz)
+ if up != sz {
+ nbuckets = up / t.bucket.size
+ }
+ }
+ buckets = newarray(t.bucket, int(nbuckets))
+ if base != nbuckets {
+ // We preallocated some overflow buckets.
+ // To keep the overhead of tracking these overflow buckets to a minimum,
+ // we use the convention that if a preallocated overflow bucket's overflow
+ // pointer is nil, then more spares follow and can be reached by bumping the pointer.
+ // We need a safe non-nil pointer for the last overflow bucket; just use buckets.
+ nextOverflow = (*bmap)(add(buckets, base*uintptr(t.bucketsize)))
+ last := (*bmap)(add(buckets, (nbuckets-1)*uintptr(t.bucketsize)))
+ last.setoverflow(t, (*bmap)(buckets))
+ }
+ return buckets, nextOverflow
+}
+
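makeBucketArray rounds the bucket allocation up to a size class and turns the slack into preallocated overflow buckets, encoding "more spares follow" as a nil overflow pointer and marking the last spare with a non-nil sentinel (the buckets pointer). The sketch below keeps the bump-allocation idea but uses an explicit index instead of the nil/sentinel encoding (illustrative only, not runtime code):

package main

import "fmt"

const bucketCnt = 8

type bucket struct {
	keys     [bucketCnt]uint64
	overflow *bucket // chaining between buckets; elided in this sketch
}

// table preallocates spare buckets right after the base array and hands
// them out one at a time before falling back to the heap. The runtime
// encodes "spares exhausted" in the overflow pointer itself; here an
// index does the same job.
type table struct {
	buckets []bucket // base buckets followed by preallocated spares
	next    int      // index of the next unused spare
}

func newTable(base, spare int) *table {
	return &table{buckets: make([]bucket, base+spare), next: base}
}

func (t *table) newOverflow() *bucket {
	if t.next < len(t.buckets) {
		b := &t.buckets[t.next] // reuse a preallocated spare
		t.next++
		return b
	}
	return new(bucket) // spares exhausted: allocate a fresh bucket
}

func main() {
	t := newTable(16, 2)
	fmt.Println(t.newOverflow() != nil, t.newOverflow() != nil, t.newOverflow() != nil) // true true true
}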
func hashGrow(t *maptype, h *hmap) {
// If we've hit the load factor, get bigger.
// Otherwise, there are too many overflow buckets,
@@ -896,7 +967,8 @@ func hashGrow(t *maptype, h *hmap) {
h.flags |= sameSizeGrow
}
oldbuckets := h.buckets
- newbuckets := newarray(t.bucket, 1<<(h.B+bigger))
+ newbuckets, nextOverflow := makeBucketArray(t, h.B+bigger)
+
flags := h.flags &^ (iterator | oldIterator)
if h.flags&iterator != 0 {
flags |= oldIterator
@@ -909,13 +981,19 @@ func hashGrow(t *maptype, h *hmap) {
h.nevacuate = 0
h.noverflow = 0
- if h.overflow != nil {
+ if h.extra != nil && h.extra.overflow[0] != nil {
// Promote current overflow buckets to the old generation.
- if h.overflow[1] != nil {
+ if h.extra.overflow[1] != nil {
throw("overflow is not nil")
}
- h.overflow[1] = h.overflow[0]
- h.overflow[0] = nil
+ h.extra.overflow[1] = h.extra.overflow[0]
+ h.extra.overflow[0] = nil
+ }
+ if nextOverflow != nil {
+ if h.extra == nil {
+ h.extra = new(mapextra)
+ }
+ h.extra.nextOverflow = nextOverflow
}
// the actual copying of the hash table data is done incrementally
@@ -925,7 +1003,7 @@ func hashGrow(t *maptype, h *hmap) {
// overLoadFactor reports whether count items placed in 1<<B buckets is over loadFactor.
func overLoadFactor(count int64, B uint8) bool {
// TODO: rewrite to use integer math and comparison?
- return count >= bucketCnt && float32(count) >= loadFactor*float32((uintptr(1)<<B))
+ return count >= bucketCnt && float32(count) >= loadFactor*float32((uint64(1)<<B))
}
// tooManyOverflowBuckets reports whether noverflow buckets is too many for a map with 1<<B buckets.
@@ -977,6 +1055,11 @@ func growWork(t *maptype, h *hmap, bucket uintptr) {
}
}
+func bucketEvacuated(t *maptype, h *hmap, bucket uintptr) bool {
+ b := (*bmap)(add(h.oldbuckets, bucket*uintptr(t.bucketsize)))
+ return evacuated(b)
+}
+
func evacuate(t *maptype, h *hmap, oldbucket uintptr) {
b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)))
newbit := h.noldbuckets()
@@ -1054,8 +1137,7 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) {
if useX {
b.tophash[i] = evacuatedX
if xi == bucketCnt {
- newx := (*bmap)(newobject(t.bucket))
- h.setoverflow(t, x, newx)
+ newx := h.newoverflow(t, x)
x = newx
xi = 0
xk = add(unsafe.Pointer(x), dataOffset)
@@ -1078,8 +1160,7 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) {
} else {
b.tophash[i] = evacuatedY
if yi == bucketCnt {
- newy := (*bmap)(newobject(t.bucket))
- h.setoverflow(t, y, newy)
+ newy := h.newoverflow(t, y)
y = newy
yi = 0
yk = add(unsafe.Pointer(y), dataOffset)
@@ -1118,14 +1199,23 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) {
// Advance evacuation mark
if oldbucket == h.nevacuate {
h.nevacuate = oldbucket + 1
- if oldbucket+1 == newbit { // newbit == # of oldbuckets
+ // Experiments suggest that 1024 is overkill by at least an order of magnitude.
+ // Put it in there as a safeguard anyway, to ensure O(1) behavior.
+ stop := h.nevacuate + 1024
+ if stop > newbit {
+ stop = newbit
+ }
+ for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) {
+ h.nevacuate++
+ }
+ if h.nevacuate == newbit { // newbit == # of oldbuckets
// Growing is all done. Free old main bucket array.
h.oldbuckets = nil
// Can discard old overflow buckets as well.
// If they are still referenced by an iterator,
 // then the iterator holds a pointer to the slice.
- if h.overflow != nil {
- h.overflow[1] = nil
+ if h.extra != nil {
+ h.extra.overflow[1] = nil
}
h.flags &^= sameSizeGrow
}
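The new advance loop lets nevacuate skip past buckets that were already evacuated out of order, while the 1024-bucket cap keeps the catch-up work bounded so each map write stays O(1). A standalone sketch of the bounded-advance pattern (illustration only):

package main

import "fmt"

// advance moves the progress counter past already-done work items,
// but looks at no more than limit items per call so each call stays O(1).
func advance(done []bool, progress, limit int) int {
	stop := progress + limit
	if stop > len(done) {
		stop = len(done)
	}
	for progress < stop && done[progress] {
		progress++
	}
	return progress
}

func main() {
	done := []bool{true, true, true, false, true}
	p := advance(done, 0, 1024)
	fmt.Println(p) // 3: stops at the first unfinished item
}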
@@ -1139,8 +1229,8 @@ func ismapkey(t *_type) bool {
// Reflect stubs. Called from ../reflect/asm_*.s
//go:linkname reflect_makemap reflect.makemap
-func reflect_makemap(t *maptype) *hmap {
- return makemap(t, 0, nil, nil)
+func reflect_makemap(t *maptype, cap int) *hmap {
+ return makemap(t, int64(cap), nil, nil)
}
//go:linkname reflect_mapaccess reflect.mapaccess
diff --git a/libgo/go/runtime/hashmap_fast.go b/libgo/go/runtime/hashmap_fast.go
index 853da70..bec8fda 100644
--- a/libgo/go/runtime/hashmap_fast.go
+++ b/libgo/go/runtime/hashmap_fast.go
@@ -45,7 +45,7 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer {
if k != key {
continue
}
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -94,7 +94,7 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) {
if k != key {
continue
}
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -143,7 +143,7 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer {
if k != key {
continue
}
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -192,7 +192,7 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) {
if k != key {
continue
}
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -223,7 +223,7 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
if key.len < 32 {
// short key, doing lots of comparisons is ok
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -240,7 +240,7 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
// long key, try not to do more comparisons than necessary
keymaybe := uintptr(bucketCnt)
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -252,8 +252,6 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
}
// check first 4 bytes
- // TODO: on amd64/386 at least, make this compile to one 4-byte comparison instead of
- // four 1-byte comparisons.
if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) {
continue
}
@@ -295,7 +293,7 @@ dohash:
}
for {
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x != top {
continue
}
@@ -332,7 +330,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) {
if key.len < 32 {
// short key, doing lots of comparisons is ok
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -349,7 +347,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) {
// long key, try not to do more comparisons than necessary
keymaybe := uintptr(bucketCnt)
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -402,7 +400,7 @@ dohash:
}
for {
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x != top {
continue
}
@@ -420,3 +418,441 @@ dohash:
}
}
}
+
+func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer {
+ if h == nil {
+ panic(plainError("assignment to entry in nil map"))
+ }
+ if raceenabled {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast32))
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+ hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapassign.
+ h.flags |= hashWriting
+
+ if h.buckets == nil {
+ h.buckets = newarray(t.bucket, 1)
+ }
+
+again:
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+
+ var inserti *uint8
+ var insertk unsafe.Pointer
+ var val unsafe.Pointer
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ if b.tophash[i] == empty && inserti == nil {
+ inserti = &b.tophash[i]
+ insertk = add(unsafe.Pointer(b), dataOffset+i*4)
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize))
+ }
+ continue
+ }
+ k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4)))
+ if k != key {
+ continue
+ }
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize))
+ goto done
+ }
+ ovf := b.overflow(t)
+ if ovf == nil {
+ break
+ }
+ b = ovf
+ }
+
+ // Did not find mapping for key. Allocate new cell & add entry.
+
+ // If we hit the max load factor or we have too many overflow buckets,
+ // and we're not already in the middle of growing, start growing.
+ if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
+ hashGrow(t, h)
+ goto again // Growing the table invalidates everything, so try again
+ }
+
+ if inserti == nil {
+ // all current buckets are full, allocate a new one.
+ newb := h.newoverflow(t, b)
+ inserti = &newb.tophash[0]
+ insertk = add(unsafe.Pointer(newb), dataOffset)
+ val = add(insertk, bucketCnt*4)
+ }
+
+ // store new key/value at insert position
+ typedmemmove(t.key, insertk, unsafe.Pointer(&key))
+ *inserti = top
+ h.count++
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+ return val
+}
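Each fast path recomputes the bucket's tophash byte the same way: take the top byte of the hash (sys.PtrSize*8 - 8 is 56 on 64-bit targets) and bump it out of the small range reserved for cell states. A standalone sketch of that computation; the exact reserved-value constant is an assumption for illustration:

package main

import "fmt"

// The first few tophash values are reserved for cell states (empty,
// evacuated, ...); 4 matches this generation of the map code but is
// used here only for illustration.
const minTopHash = 4

// tophash mirrors the fast-path computation: take the top byte of a
// 64-bit hash and shift it out of the reserved range.
func tophash(hash uint64) uint8 {
	top := uint8(hash >> 56) // sys.PtrSize*8 - 8 on a 64-bit target
	if top < minTopHash {
		top += minTopHash
	}
	return top
}

func main() {
	fmt.Println(tophash(0xdeadbeefcafebabe)) // 222 (0xde)
	fmt.Println(tophash(0x0100000000000000)) // 1 is reserved, remapped to 5
}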
+
+func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer {
+ if h == nil {
+ panic(plainError("assignment to entry in nil map"))
+ }
+ if raceenabled {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast64))
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+ hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapassign.
+ h.flags |= hashWriting
+
+ if h.buckets == nil {
+ h.buckets = newarray(t.bucket, 1)
+ }
+
+again:
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+
+ var inserti *uint8
+ var insertk unsafe.Pointer
+ var val unsafe.Pointer
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ if b.tophash[i] == empty && inserti == nil {
+ inserti = &b.tophash[i]
+ insertk = add(unsafe.Pointer(b), dataOffset+i*8)
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize))
+ }
+ continue
+ }
+ k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8)))
+ if k != key {
+ continue
+ }
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize))
+ goto done
+ }
+ ovf := b.overflow(t)
+ if ovf == nil {
+ break
+ }
+ b = ovf
+ }
+
+ // Did not find mapping for key. Allocate new cell & add entry.
+
+ // If we hit the max load factor or we have too many overflow buckets,
+ // and we're not already in the middle of growing, start growing.
+ if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
+ hashGrow(t, h)
+ goto again // Growing the table invalidates everything, so try again
+ }
+
+ if inserti == nil {
+ // all current buckets are full, allocate a new one.
+ newb := h.newoverflow(t, b)
+ inserti = &newb.tophash[0]
+ insertk = add(unsafe.Pointer(newb), dataOffset)
+ val = add(insertk, bucketCnt*8)
+ }
+
+ // store new key/value at insert position
+ typedmemmove(t.key, insertk, unsafe.Pointer(&key))
+ *inserti = top
+ h.count++
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+ return val
+}
+
+func mapassign_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
+ if h == nil {
+ panic(plainError("assignment to entry in nil map"))
+ }
+ if raceenabled {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_faststr))
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+ key := stringStructOf(&ky)
+ hash := t.key.hashfn(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapassign.
+ h.flags |= hashWriting
+
+ if h.buckets == nil {
+ h.buckets = newarray(t.bucket, 1)
+ }
+
+again:
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+
+ var inserti *uint8
+ var insertk unsafe.Pointer
+ var val unsafe.Pointer
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ if b.tophash[i] == empty && inserti == nil {
+ inserti = &b.tophash[i]
+ insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize))
+ }
+ continue
+ }
+ k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
+ if k.len != key.len {
+ continue
+ }
+ if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) {
+ continue
+ }
+ // already have a mapping for key. Update it.
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
+ goto done
+ }
+ ovf := b.overflow(t)
+ if ovf == nil {
+ break
+ }
+ b = ovf
+ }
+
+ // Did not find mapping for key. Allocate new cell & add entry.
+
+ // If we hit the max load factor or we have too many overflow buckets,
+ // and we're not already in the middle of growing, start growing.
+ if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
+ hashGrow(t, h)
+ goto again // Growing the table invalidates everything, so try again
+ }
+
+ if inserti == nil {
+ // all current buckets are full, allocate a new one.
+ newb := h.newoverflow(t, b)
+ inserti = &newb.tophash[0]
+ insertk = add(unsafe.Pointer(newb), dataOffset)
+ val = add(insertk, bucketCnt*2*sys.PtrSize)
+ }
+
+ // store new key/value at insert position
+ *((*stringStruct)(insertk)) = *key
+ *inserti = top
+ h.count++
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+ return val
+}
+
+func mapdelete_fast32(t *maptype, h *hmap, key uint32) {
+ if raceenabled && h != nil {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast32))
+ }
+ if h == nil || h.count == 0 {
+ return
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+
+ hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapdelete
+ h.flags |= hashWriting
+
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ continue
+ }
+ k := (*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))
+ if key != *k {
+ continue
+ }
+ typedmemclr(t.key, unsafe.Pointer(k))
+ v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*4 + i*uintptr(t.valuesize))
+ typedmemclr(t.elem, v)
+ b.tophash[i] = empty
+ h.count--
+ goto done
+ }
+ b = b.overflow(t)
+ if b == nil {
+ goto done
+ }
+ }
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+}
+
+func mapdelete_fast64(t *maptype, h *hmap, key uint64) {
+ if raceenabled && h != nil {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast64))
+ }
+ if h == nil || h.count == 0 {
+ return
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+
+ hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapdelete
+ h.flags |= hashWriting
+
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ continue
+ }
+ k := (*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))
+ if key != *k {
+ continue
+ }
+ typedmemclr(t.key, unsafe.Pointer(k))
+ v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*8 + i*uintptr(t.valuesize))
+ typedmemclr(t.elem, v)
+ b.tophash[i] = empty
+ h.count--
+ goto done
+ }
+ b = b.overflow(t)
+ if b == nil {
+ goto done
+ }
+ }
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+}
+
+func mapdelete_faststr(t *maptype, h *hmap, ky string) {
+ if raceenabled && h != nil {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_faststr))
+ }
+ if h == nil || h.count == 0 {
+ return
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+
+ key := stringStructOf(&ky)
+ hash := t.key.hashfn(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapdelete
+ h.flags |= hashWriting
+
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ continue
+ }
+ k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
+ if k.len != key.len {
+ continue
+ }
+ if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) {
+ continue
+ }
+ typedmemclr(t.key, unsafe.Pointer(k))
+ v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*2*sys.PtrSize + i*uintptr(t.valuesize))
+ typedmemclr(t.elem, v)
+ b.tophash[i] = empty
+ h.count--
+ goto done
+ }
+ b = b.overflow(t)
+ if b == nil {
+ goto done
+ }
+ }
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+}
diff --git a/libgo/go/runtime/heapdump.go b/libgo/go/runtime/heapdump.go
index 0db53f5..166199b 100644
--- a/libgo/go/runtime/heapdump.go
+++ b/libgo/go/runtime/heapdump.go
@@ -413,7 +413,7 @@ func dumpmemstats() {
dumpint(memstats.gc_sys)
dumpint(memstats.other_sys)
dumpint(memstats.next_gc)
- dumpint(memstats.last_gc)
+ dumpint(memstats.last_gc_unix)
dumpint(memstats.pause_total_ns)
for i := 0; i < 256; i++ {
dumpint(memstats.pause_ns[i])
@@ -515,7 +515,7 @@ func writeheapdump_m(fd uintptr) {
// Update stats so we can dump them.
// As a side effect, flushes all the MCaches so the MSpan.freelist
// lists contain all the free objects.
- updatememstats(nil)
+ updatememstats()
// Set dump file.
dumpfd = fd
diff --git a/libgo/go/runtime/iface_test.go b/libgo/go/runtime/iface_test.go
index 3744a4f..d63ea79 100644
--- a/libgo/go/runtime/iface_test.go
+++ b/libgo/go/runtime/iface_test.go
@@ -29,6 +29,20 @@ func (TM) Method2() {}
func (TL) Method1() {}
func (TL) Method2() {}
+type T8 uint8
+type T16 uint16
+type T32 uint32
+type T64 uint64
+type Tstr string
+type Tslice []byte
+
+func (T8) Method1() {}
+func (T16) Method1() {}
+func (T32) Method1() {}
+func (T64) Method1() {}
+func (Tstr) Method1() {}
+func (Tslice) Method1() {}
+
var (
e interface{}
e_ interface{}
@@ -269,3 +283,133 @@ func TestNonEscapingConvT2I(t *testing.T) {
t.Fatalf("want 0 allocs, got %v", n)
}
}
+
+func TestZeroConvT2x(t *testing.T) {
+ if runtime.Compiler == "gccgo" {
+ t.Skip("does not work on gccgo without better escape analysis")
+ }
+
+ tests := []struct {
+ name string
+ fn func()
+ }{
+ {name: "E8", fn: func() { e = eight8 }}, // any byte-sized value does not allocate
+ {name: "E16", fn: func() { e = zero16 }}, // zero values do not allocate
+ {name: "E32", fn: func() { e = zero32 }},
+ {name: "E64", fn: func() { e = zero64 }},
+ {name: "Estr", fn: func() { e = zerostr }},
+ {name: "Eslice", fn: func() { e = zeroslice }},
+ {name: "Econstflt", fn: func() { e = 99.0 }}, // constants do not allocate
+ {name: "Econststr", fn: func() { e = "change" }},
+ {name: "I8", fn: func() { i1 = eight8I }},
+ {name: "I16", fn: func() { i1 = zero16I }},
+ {name: "I32", fn: func() { i1 = zero32I }},
+ {name: "I64", fn: func() { i1 = zero64I }},
+ {name: "Istr", fn: func() { i1 = zerostrI }},
+ {name: "Islice", fn: func() { i1 = zerosliceI }},
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ n := testing.AllocsPerRun(1000, test.fn)
+ if n != 0 {
+ t.Errorf("want zero allocs, got %v", n)
+ }
+ })
+ }
+}
+
+var (
+ eight8 uint8 = 8
+ eight8I T8 = 8
+
+ zero16 uint16 = 0
+ zero16I T16 = 0
+ one16 uint16 = 1
+
+ zero32 uint32 = 0
+ zero32I T32 = 0
+ one32 uint32 = 1
+
+ zero64 uint64 = 0
+ zero64I T64 = 0
+ one64 uint64 = 1
+
+ zerostr string = ""
+ zerostrI Tstr = ""
+ nzstr string = "abc"
+
+ zeroslice []byte = nil
+ zerosliceI Tslice = nil
+ nzslice []byte = []byte("abc")
+
+ zerobig [512]byte
+ nzbig [512]byte = [512]byte{511: 1}
+)
+
+func BenchmarkConvT2Ezero(b *testing.B) {
+ b.Run("zero", func(b *testing.B) {
+ b.Run("16", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zero16
+ }
+ })
+ b.Run("32", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zero32
+ }
+ })
+ b.Run("64", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zero64
+ }
+ })
+ b.Run("str", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zerostr
+ }
+ })
+ b.Run("slice", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zeroslice
+ }
+ })
+ b.Run("big", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zerobig
+ }
+ })
+ })
+ b.Run("nonzero", func(b *testing.B) {
+ b.Run("16", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = one16
+ }
+ })
+ b.Run("32", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = one32
+ }
+ })
+ b.Run("64", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = one64
+ }
+ })
+ b.Run("str", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = nzstr
+ }
+ })
+ b.Run("slice", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = nzslice
+ }
+ })
+ b.Run("big", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = nzbig
+ }
+ })
+ })
+}
diff --git a/libgo/go/runtime/internal/sys/intrinsics.go b/libgo/go/runtime/internal/sys/intrinsics.go
index 43acf34..2928280 100644
--- a/libgo/go/runtime/internal/sys/intrinsics.go
+++ b/libgo/go/runtime/internal/sys/intrinsics.go
@@ -14,22 +14,22 @@ func builtinCtz64(uint64) int32
// Ctz64 counts trailing (low-order) zeroes,
// and if all are zero, then 64.
-func Ctz64(x uint64) uint64 {
+func Ctz64(x uint64) int {
if x == 0 {
return 64
}
- return uint64(builtinCtz64(x))
+ return int(builtinCtz64(x))
}
//go:nosplit
// Ctz32 counts trailing (low-order) zeroes,
// and if all are zero, then 32.
-func Ctz32(x uint32) uint32 {
+func Ctz32(x uint32) int {
if x == 0 {
return 32
}
- return uint32(builtinCtz32(x))
+ return int(builtinCtz32(x))
}
//extern __builtin_bswap64
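Returning int from Ctz64/Ctz32 matches how callers use the result (as shift counts and indexes) and avoids conversions at every call site. Outside the runtime, the math/bits package exposes the same operations; a minimal sketch (standard library, not part of this patch):

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	fmt.Println(bits.TrailingZeros64(0))       // 64: all bits are zero
	fmt.Println(bits.TrailingZeros64(5 << 7))  // 7
	fmt.Println(bits.TrailingZeros32(1 << 31)) // 31
}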
diff --git a/libgo/go/runtime/internal/sys/intrinsics_test.go b/libgo/go/runtime/internal/sys/intrinsics_test.go
index 1f2c8da..0444183 100644
--- a/libgo/go/runtime/internal/sys/intrinsics_test.go
+++ b/libgo/go/runtime/internal/sys/intrinsics_test.go
@@ -6,17 +6,17 @@ import (
)
func TestCtz64(t *testing.T) {
- for i := uint(0); i <= 64; i++ {
- x := uint64(5) << i
- if got := sys.Ctz64(x); got != uint64(i) {
+ for i := 0; i <= 64; i++ {
+ x := uint64(5) << uint(i)
+ if got := sys.Ctz64(x); got != i {
t.Errorf("Ctz64(%d)=%d, want %d", x, got, i)
}
}
}
func TestCtz32(t *testing.T) {
- for i := uint(0); i <= 32; i++ {
- x := uint32(5) << i
- if got := sys.Ctz32(x); got != uint32(i) {
+ for i := 0; i <= 32; i++ {
+ x := uint32(5) << uint(i)
+ if got := sys.Ctz32(x); got != i {
t.Errorf("Ctz32(%d)=%d, want %d", x, got, i)
}
}
diff --git a/libgo/go/runtime/lfstack.go b/libgo/go/runtime/lfstack.go
index 2f2958c..4787c5b 100644
--- a/libgo/go/runtime/lfstack.go
+++ b/libgo/go/runtime/lfstack.go
@@ -3,10 +3,6 @@
// license that can be found in the LICENSE file.
// Lock-free stack.
-// Initialize head to 0, compare with 0 to test for emptiness.
-// The stack does not keep pointers to nodes,
-// so they can be garbage collected if there are no other pointers to nodes.
-// The following code runs only in non-preemptible contexts.
package runtime
@@ -15,36 +11,47 @@ import (
"unsafe"
)
-// Temporary for C code to call:
-//go:linkname lfstackpush runtime.lfstackpush
-//go:linkname lfstackpop runtime.lfstackpop
+// lfstack is the head of a lock-free stack.
+//
+// The zero value of lfstack is an empty list.
+//
+// This stack is intrusive. Nodes must embed lfnode as the first field.
+//
+// The stack does not keep GC-visible pointers to nodes, so the caller
+// is responsible for ensuring the nodes are not garbage collected
+// (typically by allocating them from manually-managed memory).
+type lfstack uint64
-func lfstackpush(head *uint64, node *lfnode) {
+func (head *lfstack) push(node *lfnode) {
node.pushcnt++
new := lfstackPack(node, node.pushcnt)
if node1 := lfstackUnpack(new); node1 != node {
- print("runtime: lfstackpush invalid packing: node=", node, " cnt=", hex(node.pushcnt), " packed=", hex(new), " -> node=", node1, "\n")
- throw("lfstackpush")
+ print("runtime: lfstack.push invalid packing: node=", node, " cnt=", hex(node.pushcnt), " packed=", hex(new), " -> node=", node1, "\n")
+ throw("lfstack.push")
}
for {
- old := atomic.Load64(head)
+ old := atomic.Load64((*uint64)(head))
node.next = old
- if atomic.Cas64(head, old, new) {
+ if atomic.Cas64((*uint64)(head), old, new) {
break
}
}
}
-func lfstackpop(head *uint64) unsafe.Pointer {
+func (head *lfstack) pop() unsafe.Pointer {
for {
- old := atomic.Load64(head)
+ old := atomic.Load64((*uint64)(head))
if old == 0 {
return nil
}
node := lfstackUnpack(old)
next := atomic.Load64(&node.next)
- if atomic.Cas64(head, old, next) {
+ if atomic.Cas64((*uint64)(head), old, next) {
return unsafe.Pointer(node)
}
}
}
+
+func (head *lfstack) empty() bool {
+ return atomic.Load64((*uint64)(head)) == 0
+}
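lfstack packs a node pointer together with a per-node push counter into one uint64 so the CAS can detect a node being popped and re-pushed (the ABA problem), and the caller must keep nodes alive because the stack holds no GC-visible pointers. In ordinary Go code, where nodes are GC-managed and freshly allocated per push, a plain Treiber stack needs no counter; a rough sketch for comparison (assumes Go 1.19+ for atomic.Pointer, illustration only):

package main

import (
	"fmt"
	"sync/atomic"
)

type node[T any] struct {
	value T
	next  *node[T]
}

// stack is a Treiber (lock-free) stack. Because nodes are ordinary
// GC-managed objects allocated fresh on every push, a node cannot be
// reused while another goroutine still references it, which sidesteps
// the ABA problem the runtime's lfstack solves with its pack counter.
type stack[T any] struct {
	head atomic.Pointer[node[T]]
}

func (s *stack[T]) push(v T) {
	n := &node[T]{value: v}
	for {
		old := s.head.Load()
		n.next = old
		if s.head.CompareAndSwap(old, n) {
			return
		}
	}
}

func (s *stack[T]) pop() (T, bool) {
	for {
		old := s.head.Load()
		if old == nil {
			var zero T
			return zero, false
		}
		if s.head.CompareAndSwap(old, old.next) {
			return old.value, true
		}
	}
}

func main() {
	var s stack[int]
	s.push(1)
	s.push(2)
	v, ok := s.pop()
	fmt.Println(v, ok) // 2 true
}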
diff --git a/libgo/go/runtime/lock_futex.go b/libgo/go/runtime/lock_futex.go
index 9877bc3..7ddd378 100644
--- a/libgo/go/runtime/lock_futex.go
+++ b/libgo/go/runtime/lock_futex.go
@@ -50,6 +50,7 @@ const (
// affect mutex's state.
// We use the uintptr mutex.key and note.key as a uint32.
+//go:nosplit
func key32(p *uintptr) *uint32 {
return (*uint32)(unsafe.Pointer(p))
}
@@ -152,9 +153,17 @@ func notesleep(n *note) {
if gp != gp.m.g0 {
throw("notesleep not on g0")
}
+ ns := int64(-1)
+ if *cgo_yield != nil {
+ // Sleep for an arbitrary-but-moderate interval to poll libc interceptors.
+ ns = 10e6
+ }
for atomic.Load(key32(&n.key)) == 0 {
gp.m.blocked = true
- futexsleep(key32(&n.key), 0, -1)
+ futexsleep(key32(&n.key), 0, ns)
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
gp.m.blocked = false
}
}
@@ -168,9 +177,16 @@ func notetsleep_internal(n *note, ns int64) bool {
gp := getg()
if ns < 0 {
+ if *cgo_yield != nil {
+ // Sleep for an arbitrary-but-moderate interval to poll libc interceptors.
+ ns = 10e6
+ }
for atomic.Load(key32(&n.key)) == 0 {
gp.m.blocked = true
- futexsleep(key32(&n.key), 0, -1)
+ futexsleep(key32(&n.key), 0, ns)
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
gp.m.blocked = false
}
return true
@@ -182,8 +198,14 @@ func notetsleep_internal(n *note, ns int64) bool {
deadline := nanotime() + ns
for {
+ if *cgo_yield != nil && ns > 10e6 {
+ ns = 10e6
+ }
gp.m.blocked = true
futexsleep(key32(&n.key), 0, ns)
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
gp.m.blocked = false
if atomic.Load(key32(&n.key)) != 0 {
break
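When cgo_yield is set (for example under a sanitizer runtime that must be polled), the otherwise-unbounded futex sleeps above are capped at 10 ms so the thread wakes, calls the yield hook, and sleeps again until the note is signalled. A user-level sketch of the same "bounded wait plus periodic poll" shape, using channels and timers rather than futexes (illustration only):

package main

import (
	"fmt"
	"time"
)

// waitWithPoll blocks until done is closed, but never sleeps more than
// pollEvery at a time so it can invoke poll between waits.
func waitWithPoll(done <-chan struct{}, pollEvery time.Duration, poll func()) {
	timer := time.NewTimer(pollEvery)
	defer timer.Stop()
	for {
		select {
		case <-done:
			return
		case <-timer.C:
			poll()
			timer.Reset(pollEvery)
		}
	}
}

func main() {
	done := make(chan struct{})
	go func() {
		time.Sleep(35 * time.Millisecond)
		close(done)
	}()
	polls := 0
	waitWithPoll(done, 10*time.Millisecond, func() { polls++ })
	fmt.Println("polls:", polls) // typically 3 for a 35 ms wait
}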
diff --git a/libgo/go/runtime/lock_sema.go b/libgo/go/runtime/lock_sema.go
index 57fee19..52a2376 100644
--- a/libgo/go/runtime/lock_sema.go
+++ b/libgo/go/runtime/lock_sema.go
@@ -175,7 +175,16 @@ func notesleep(n *note) {
}
// Queued. Sleep.
gp.m.blocked = true
- semasleep(-1)
+ if *cgo_yield == nil {
+ semasleep(-1)
+ } else {
+ // Sleep for an arbitrary-but-moderate interval to poll libc interceptors.
+ const ns = 10e6
+ for atomic.Loaduintptr(&n.key) == 0 {
+ semasleep(ns)
+ asmcgocall(*cgo_yield, nil)
+ }
+ }
gp.m.blocked = false
}
@@ -198,7 +207,15 @@ func notetsleep_internal(n *note, ns int64, gp *g, deadline int64) bool {
if ns < 0 {
// Queued. Sleep.
gp.m.blocked = true
- semasleep(-1)
+ if *cgo_yield == nil {
+ semasleep(-1)
+ } else {
+ // Sleep in arbitrary-but-moderate intervals to poll libc interceptors.
+ const ns = 10e6
+ for semasleep(ns) < 0 {
+ asmcgocall(*cgo_yield, nil)
+ }
+ }
gp.m.blocked = false
return true
}
@@ -207,12 +224,18 @@ func notetsleep_internal(n *note, ns int64, gp *g, deadline int64) bool {
for {
// Registered. Sleep.
gp.m.blocked = true
+ if *cgo_yield != nil && ns > 10e6 {
+ ns = 10e6
+ }
if semasleep(ns) >= 0 {
gp.m.blocked = false
// Acquired semaphore, semawakeup unregistered us.
// Done.
return true
}
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
gp.m.blocked = false
// Interrupted or timed out. Still registered. Semaphore not acquired.
ns = deadline - nanotime()
diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go
index 3912fc2..796cd8a 100644
--- a/libgo/go/runtime/malloc.go
+++ b/libgo/go/runtime/malloc.go
@@ -122,7 +122,7 @@ const (
// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go.
_TinySize = 16
- _TinySizeClass = 2
+ _TinySizeClass = int8(2)
_FixAllocChunk = 16 << 10 // Chunk size for FixAlloc
_MaxMHeapList = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
@@ -159,7 +159,11 @@ const (
_MHeapMap_TotalBits = (_64bit*sys.GoosWindows)*35 + (_64bit*(1-sys.GoosWindows)*(1-sys.GoosDarwin*sys.GoarchArm64))*39 + sys.GoosDarwin*sys.GoarchArm64*31 + (1-_64bit)*(32-(sys.GoarchMips+sys.GoarchMipsle))
_MHeapMap_Bits = _MHeapMap_TotalBits - _PageShift
- _MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
+ // _MaxMem is the maximum heap arena size minus 1.
+ //
+ // On 32-bit, this is also the maximum heap pointer value,
+ // since the arena starts at address 0.
+ _MaxMem = 1<<_MHeapMap_TotalBits - 1
// Max number of threads to run garbage collection.
// 2, 3, and 4 are all plausible maximums depending
@@ -167,8 +171,6 @@ const (
// collector scales well to 32 cpus.
_MaxGcproc = 32
- _MaxArena32 = 1<<32 - 1
-
// minLegalPointer is the smallest possible legal pointer.
// This is the smallest possible architectural page size,
// since we assume that the first page is never mapped.
@@ -250,18 +252,21 @@ func mallocinit() {
throw("bad system page size")
}
- var p, bitmapSize, spansSize, pSize, limit uintptr
+ // The auxiliary regions start at p and are laid out in the
+ // following order: spans, bitmap, arena.
+ var p, pSize uintptr
var reserved bool
- // limit = runtime.memlimit();
- // See https://golang.org/issue/5049
- // TODO(rsc): Fix after 1.1.
- limit = 0
+ // The spans array holds one *mspan per _PageSize of arena.
+ var spansSize uintptr = (_MaxMem + 1) / _PageSize * sys.PtrSize
+ spansSize = round(spansSize, _PageSize)
+ // The bitmap holds 2 bits per word of arena.
+ var bitmapSize uintptr = (_MaxMem + 1) / (sys.PtrSize * 8 / 2)
+ bitmapSize = round(bitmapSize, _PageSize)
// Set up the allocation arena, a contiguous area of memory where
- // allocated data will be found. The arena begins with a bitmap large
- // enough to hold 2 bits per allocated word.
- if sys.PtrSize == 8 && (limit == 0 || limit > 1<<30) {
+ // allocated data will be found.
+ if sys.PtrSize == 8 {
// On a 64-bit machine, allocate from a single contiguous reservation.
// 512 GB (MaxMem) should be big enough for now.
//
@@ -294,9 +299,7 @@ func mallocinit() {
 // On AIX, mmap address ranges start at 0x07000000_00000000 for 64-bit
 // processes.
arenaSize := round(_MaxMem, _PageSize)
- bitmapSize = arenaSize / (sys.PtrSize * 8 / 2)
- spansSize = arenaSize / _PageSize * sys.PtrSize
- spansSize = round(spansSize, _PageSize)
+ pSize = bitmapSize + spansSize + arenaSize + _PageSize
for i := 0; i <= 0x7f; i++ {
switch {
case GOARCH == "arm64" && GOOS == "darwin":
@@ -309,7 +312,6 @@ func mallocinit() {
default:
p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
}
- pSize = bitmapSize + spansSize + arenaSize + _PageSize
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 || GOOS == "aix" { // Useless to loop on AIX, as i is forced to 1
break
@@ -327,6 +329,15 @@ func mallocinit() {
// When that gets used up, we'll start asking the kernel
// for any memory anywhere.
+ // We want to start the arena low, but if we're linked
+ // against C code, it's possible global constructors
+ // have called malloc and adjusted the process' brk.
+ // Query the brk so we can avoid trying to map the
+ // arena over it (which will cause the kernel to put
+ // the arena somewhere else, likely at a high
+ // address).
+ procBrk := sbrk0()
+
// If we fail to allocate, try again with a smaller arena.
// This is necessary on Android L where we share a process
// with ART, which reserves virtual memory aggressively.
@@ -340,15 +351,6 @@ func mallocinit() {
}
for _, arenaSize := range &arenaSizes {
- bitmapSize = (_MaxArena32 + 1) / (sys.PtrSize * 8 / 2)
- spansSize = (_MaxArena32 + 1) / _PageSize * sys.PtrSize
- if limit > 0 && arenaSize+bitmapSize+spansSize > limit {
- bitmapSize = (limit / 9) &^ ((1 << _PageShift) - 1)
- arenaSize = bitmapSize * 8
- spansSize = arenaSize / _PageSize * sys.PtrSize
- }
- spansSize = round(spansSize, _PageSize)
-
// SysReserve treats the address we ask for, end, as a hint,
// not as an absolute requirement. If we ask for the end
// of the data segment but the operating system requires
@@ -360,6 +362,12 @@ func mallocinit() {
// to a MB boundary.
p = round(getEnd()+(1<<18), 1<<20)
pSize = bitmapSize + spansSize + arenaSize + _PageSize
+ if p <= procBrk && procBrk < p+pSize {
+ // Move the start above the brk,
+ // leaving some room for future brk
+ // expansion.
+ p = round(procBrk+(1<<20), 1<<20)
+ }
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 {
break
@@ -374,18 +382,22 @@ func mallocinit() {
// so SysReserve can give us a PageSize-unaligned pointer.
// To overcome this we ask for PageSize more and round up the pointer.
p1 := round(p, _PageSize)
+ pSize -= p1 - p
spansStart := p1
- mheap_.bitmap = p1 + spansSize + bitmapSize
+ p1 += spansSize
+ mheap_.bitmap = p1 + bitmapSize
+ p1 += bitmapSize
if sys.PtrSize == 4 {
// Set arena_start such that we can accept memory
// reservations located anywhere in the 4GB virtual space.
mheap_.arena_start = 0
} else {
- mheap_.arena_start = p1 + (spansSize + bitmapSize)
+ mheap_.arena_start = p1
}
mheap_.arena_end = p + pSize
- mheap_.arena_used = p1 + (spansSize + bitmapSize)
+ mheap_.arena_used = p1
+ mheap_.arena_alloc = p1
mheap_.arena_reserved = reserved
if mheap_.arena_start&(_PageSize-1) != 0 {
@@ -404,62 +416,78 @@ func mallocinit() {
// h.arena_start and h.arena_end. sysAlloc returns nil on failure.
// There is no corresponding free function.
func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
- if n > h.arena_end-h.arena_used {
- // We are in 32-bit mode, maybe we didn't use all possible address space yet.
- // Reserve some more space.
+ // strandLimit is the maximum number of bytes to strand from
+ // the current arena block. If we would need to strand more
+ // than this, we fall back to sysAlloc'ing just enough for
+ // this allocation.
+ const strandLimit = 16 << 20
+
+ if n > h.arena_end-h.arena_alloc {
+ // If we haven't grown the arena to _MaxMem yet, try
+ // to reserve some more address space.
p_size := round(n+_PageSize, 256<<20)
new_end := h.arena_end + p_size // Careful: can overflow
- if h.arena_end <= new_end && new_end-h.arena_start-1 <= _MaxArena32 {
+ if h.arena_end <= new_end && new_end-h.arena_start-1 <= _MaxMem {
// TODO: It would be bad if part of the arena
// is reserved and part is not.
var reserved bool
p := uintptr(sysReserve(unsafe.Pointer(h.arena_end), p_size, &reserved))
if p == 0 {
- return nil
+ // TODO: Try smaller reservation
+ // growths in case we're in a crowded
+ // 32-bit address space.
+ goto reservationFailed
}
// p can be just about anywhere in the address
// space, including before arena_end.
if p == h.arena_end {
+ // The new block is contiguous with
+ // the current block. Extend the
+ // current arena block.
h.arena_end = new_end
h.arena_reserved = reserved
- } else if h.arena_end < p && p+p_size-h.arena_start-1 <= _MaxArena32 {
+ } else if h.arena_start <= p && p+p_size-h.arena_start-1 <= _MaxMem && h.arena_end-h.arena_alloc < strandLimit {
+ // We were able to reserve more memory
+ // within the arena space, but it's
+ // not contiguous with our previous
+ // reservation. It could be before or
+ // after our current arena_used.
+ //
// Keep everything page-aligned.
// Our pages are bigger than hardware pages.
h.arena_end = p + p_size
- used := p + (-p & (_PageSize - 1))
- h.mapBits(used)
- h.mapSpans(used)
- h.arena_used = used
+ p = round(p, _PageSize)
+ h.arena_alloc = p
h.arena_reserved = reserved
} else {
- // We got a mapping, but it's not
- // linear with our current arena, so
- // we can't use it.
+ // We got a mapping, but either
+ //
+ // 1) It's not in the arena, so we
+ // can't use it. (This should never
+ // happen on 32-bit.)
+ //
+ // 2) We would need to discard too
+ // much of our current arena block to
+ // use it.
//
- // TODO: Make it possible to allocate
- // from this. We can't decrease
- // arena_used, but we could introduce
- // a new variable for the current
- // allocation position.
-
// We haven't added this allocation to
// the stats, so subtract it from a
// fake stat (but avoid underflow).
+ //
+ // We'll fall back to a small sysAlloc.
stat := uint64(p_size)
sysFree(unsafe.Pointer(p), p_size, &stat)
}
}
}
- if n <= h.arena_end-h.arena_used {
+ if n <= h.arena_end-h.arena_alloc {
// Keep taking from our reservation.
- p := h.arena_used
+ p := h.arena_alloc
sysMap(unsafe.Pointer(p), n, h.arena_reserved, &memstats.heap_sys)
- h.mapBits(p + n)
- h.mapSpans(p + n)
- h.arena_used = p + n
- if raceenabled {
- racemapshadow(unsafe.Pointer(p), n)
+ h.arena_alloc += n
+ if h.arena_alloc > h.arena_used {
+ h.setArenaUsed(h.arena_alloc, true)
}
if p&(_PageSize-1) != 0 {
@@ -468,8 +496,9 @@ func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
return unsafe.Pointer(p)
}
+reservationFailed:
// If using 64-bit, our reservation is all we have.
- if h.arena_end-h.arena_start > _MaxArena32 {
+ if sys.PtrSize != 4 {
return nil
}
@@ -481,28 +510,18 @@ func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
return nil
}
- if p < h.arena_start || p+p_size-h.arena_start > _MaxArena32 {
- top := ^uintptr(0)
- if top-h.arena_start-1 > _MaxArena32 {
- top = h.arena_start + _MaxArena32 + 1
- }
+ if p < h.arena_start || p+p_size-h.arena_start > _MaxMem {
+ // This shouldn't be possible because _MaxMem is the
+ // whole address space on 32-bit.
+ top := uint64(h.arena_start) + _MaxMem
print("runtime: memory allocated by OS (", hex(p), ") not in usable range [", hex(h.arena_start), ",", hex(top), ")\n")
sysFree(unsafe.Pointer(p), p_size, &memstats.heap_sys)
return nil
}
- p_end := p + p_size
p += -p & (_PageSize - 1)
if p+n > h.arena_used {
- h.mapBits(p + n)
- h.mapSpans(p + n)
- h.arena_used = p + n
- if p_end > h.arena_end {
- h.arena_end = p_end
- }
- if raceenabled {
- racemapshadow(unsafe.Pointer(p), n)
- }
+ h.setArenaUsed(p+n, true)
}
if p&(_PageSize-1) != 0 {
@@ -525,7 +544,7 @@ func nextFreeFast(s *mspan) gclinkptr {
if freeidx%64 == 0 && freeidx != s.nelems {
return 0
}
- s.allocCache >>= (theBit + 1)
+ s.allocCache >>= uint(theBit + 1)
s.freeindex = freeidx
v := gclinkptr(result*s.elemsize + s.base())
s.allocCount++
@@ -541,8 +560,8 @@ func nextFreeFast(s *mspan) gclinkptr {
// weight allocation. If it is a heavy weight allocation the caller must
// determine whether a new GC cycle needs to be started or if the GC is active
// whether this goroutine needs to assist the GC.
-func (c *mcache) nextFree(sizeclass uint8) (v gclinkptr, s *mspan, shouldhelpgc bool) {
- s = c.alloc[sizeclass]
+func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, shouldhelpgc bool) {
+ s = c.alloc[spc]
shouldhelpgc = false
freeIndex := s.nextFreeIndex()
if freeIndex == s.nelems {
@@ -552,10 +571,10 @@ func (c *mcache) nextFree(sizeclass uint8) (v gclinkptr, s *mspan, shouldhelpgc
throw("s.allocCount != s.nelems && freeIndex == s.nelems")
}
systemstack(func() {
- c.refill(int32(sizeclass))
+ c.refill(spc)
})
shouldhelpgc = true
- s = c.alloc[sizeclass]
+ s = c.alloc[spc]
freeIndex = s.nextFreeIndex()
}
@@ -693,10 +712,10 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
return x
}
// Allocate a new maxTinySize block.
- span := c.alloc[tinySizeClass]
+ span := c.alloc[tinySpanClass]
v := nextFreeFast(span)
if v == 0 {
- v, _, shouldhelpgc = c.nextFree(tinySizeClass)
+ v, _, shouldhelpgc = c.nextFree(tinySpanClass)
}
x = unsafe.Pointer(v)
(*[2]uint64)(x)[0] = 0
@@ -716,10 +735,11 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
sizeclass = size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]
}
size = uintptr(class_to_size[sizeclass])
- span := c.alloc[sizeclass]
+ spc := makeSpanClass(sizeclass, noscan)
+ span := c.alloc[spc]
v := nextFreeFast(span)
if v == 0 {
- v, span, shouldhelpgc = c.nextFree(sizeclass)
+ v, span, shouldhelpgc = c.nextFree(spc)
}
x = unsafe.Pointer(v)
if needzero && span.needzero != 0 {
@@ -730,7 +750,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
var s *mspan
shouldhelpgc = true
systemstack(func() {
- s = largeAlloc(size, needzero)
+ s = largeAlloc(size, needzero, noscan)
})
s.freeindex = 1
s.allocCount = 1
@@ -739,9 +759,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
var scanSize uintptr
- if noscan {
- heapBitsSetTypeNoScan(uintptr(x))
- } else {
+ if !noscan {
heapBitsSetType(uintptr(x), size, dataSize, typ)
if dataSize > typ.size {
// Array allocation. If there are any
@@ -803,8 +821,10 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
assistG.gcAssistBytes -= int64(size - dataSize)
}
- if shouldhelpgc && gcShouldStart(false) {
- gcStart(gcBackgroundMode, false)
+ if shouldhelpgc {
+ if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
+ gcStart(gcBackgroundMode, t)
+ }
}
if getg().preempt {
@@ -818,7 +838,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
return x
}
-func largeAlloc(size uintptr, needzero bool) *mspan {
+func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
// print("largeAlloc size=", size, "\n")
if size+_PageSize < size {
@@ -834,7 +854,7 @@ func largeAlloc(size uintptr, needzero bool) *mspan {
// pays the debt down to npage pages.
deductSweepCredit(npages*_PageSize, npages)
- s := mheap_.alloc(npages, 0, true, needzero)
+ s := mheap_.alloc(npages, makeSpanClass(0, noscan), true, needzero)
if s == nil {
throw("out of memory")
}
@@ -924,7 +944,7 @@ func nextSampleNoFP() int32 {
rate = 0x3fffffff
}
if rate != 0 {
- return int32(int(fastrand()) % (2 * rate))
+ return int32(fastrand() % uint32(2*rate))
}
return 0
}
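The mcache is now indexed by a span class rather than a raw size class, so scannable and pointer-free (noscan) objects of the same size live in different spans and the collector can skip noscan spans entirely. The spanClass definition itself is outside the hunks shown here; the sketch below is one plausible reconstruction of the encoding, size class shifted left with the noscan bit in the low bit, and should be treated as an assumption:

package main

import "fmt"

// spanClass packs a size class and a noscan bit into one small integer
// so it can index arrays like mcache.alloc directly.
// NOTE: illustrative reconstruction; the real definition is not in this patch's hunks.
type spanClass uint8

func makeSpanClass(sizeclass uint8, noscan bool) spanClass {
	sc := spanClass(sizeclass) << 1
	if noscan {
		sc |= 1
	}
	return sc
}

func (sc spanClass) sizeclass() uint8 { return uint8(sc >> 1) }
func (sc spanClass) noscan() bool     { return sc&1 != 0 }

func main() {
	spc := makeSpanClass(5, true)
	fmt.Println(spc.sizeclass(), spc.noscan()) // 5 true
}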
diff --git a/libgo/go/runtime/malloc_test.go b/libgo/go/runtime/malloc_test.go
index bc5530c..0d43cf6 100644
--- a/libgo/go/runtime/malloc_test.go
+++ b/libgo/go/runtime/malloc_test.go
@@ -6,6 +6,8 @@ package runtime_test
import (
"flag"
+ "fmt"
+ "reflect"
. "runtime"
"testing"
"time"
@@ -22,24 +24,62 @@ func TestMemStats(t *testing.T) {
st := new(MemStats)
ReadMemStats(st)
- // Everything except HeapReleased, HeapIdle, and NumGC,
- // because they indeed can be 0.
- if st.Alloc == 0 || st.TotalAlloc == 0 || st.Sys == 0 || st.Lookups == 0 ||
- st.Mallocs == 0 || st.Frees == 0 || st.HeapAlloc == 0 || st.HeapSys == 0 ||
- st.HeapInuse == 0 || st.HeapObjects == 0 || st.StackInuse == 0 ||
- st.StackSys == 0 || st.MSpanInuse == 0 || st.MSpanSys == 0 || st.MCacheInuse == 0 ||
- st.MCacheSys == 0 || st.BuckHashSys == 0 || st.GCSys == 0 || st.OtherSys == 0 ||
- st.NextGC == 0 || st.NumForcedGC == 0 {
- t.Fatalf("Zero value: %+v", *st)
- }
-
- if st.Alloc > 1e10 || st.TotalAlloc > 1e11 || st.Sys > 1e10 || st.Lookups > 1e10 ||
- st.Mallocs > 1e10 || st.Frees > 1e10 || st.HeapAlloc > 1e10 || st.HeapSys > 1e10 ||
- st.HeapIdle > 1e10 || st.HeapInuse > 1e10 || st.HeapObjects > 1e10 || st.StackInuse > 1e10 ||
- st.StackSys > 1e10 || st.MSpanInuse > 1e10 || st.MSpanSys > 1e10 || st.MCacheInuse > 1e10 ||
- st.MCacheSys > 1e10 || st.BuckHashSys > 1e10 || st.GCSys > 1e10 || st.OtherSys > 1e10 ||
- st.NextGC > 1e10 || st.NumGC > 1e9 || st.NumForcedGC > 1e9 || st.PauseTotalNs > 1e11 {
- t.Fatalf("Insanely high value (overflow?): %+v", *st)
+ nz := func(x interface{}) error {
+ if x != reflect.Zero(reflect.TypeOf(x)).Interface() {
+ return nil
+ }
+ return fmt.Errorf("zero value")
+ }
+ le := func(thresh float64) func(interface{}) error {
+ return func(x interface{}) error {
+ if reflect.ValueOf(x).Convert(reflect.TypeOf(thresh)).Float() < thresh {
+ return nil
+ }
+ return fmt.Errorf("insanely high value (overflow?); want <= %v", thresh)
+ }
+ }
+ eq := func(x interface{}) func(interface{}) error {
+ return func(y interface{}) error {
+ if x == y {
+ return nil
+ }
+ return fmt.Errorf("want %v", x)
+ }
+ }
+ // Of the uint fields, HeapReleased and HeapIdle can be 0.
+ // PauseTotalNs can be 0 if timer resolution is poor.
+ //
+ // TODO: Test that GCCPUFraction is <= 0.99. This currently
+ // fails on windows/386. (Issue #19319)
+ fields := map[string][]func(interface{}) error{
+ "Alloc": {nz, le(1e10)}, "TotalAlloc": {nz, le(1e11)}, "Sys": {nz, le(1e10)},
+ "Lookups": {nz, le(1e10)}, "Mallocs": {nz, le(1e10)}, "Frees": {nz, le(1e10)},
+ "HeapAlloc": {nz, le(1e10)}, "HeapSys": {nz, le(1e10)}, "HeapIdle": {le(1e10)},
+ "HeapInuse": {nz, le(1e10)}, "HeapReleased": {le(1e10)}, "HeapObjects": {nz, le(1e10)},
+ "StackInuse": {nz, le(1e10)}, "StackSys": {nz, le(1e10)},
+ "MSpanInuse": {nz, le(1e10)}, "MSpanSys": {nz, le(1e10)},
+ "MCacheInuse": {nz, le(1e10)}, "MCacheSys": {nz, le(1e10)},
+ "BuckHashSys": {nz, le(1e10)}, "GCSys": {nz, le(1e10)}, "OtherSys": {nz, le(1e10)},
+ "NextGC": {nz, le(1e10)}, "LastGC": {nz},
+ "PauseTotalNs": {le(1e11)}, "PauseNs": nil, "PauseEnd": nil,
+ "NumGC": {nz, le(1e9)}, "NumForcedGC": {nz, le(1e9)},
+ "GCCPUFraction": nil, "EnableGC": {eq(true)}, "DebugGC": {eq(false)},
+ "BySize": nil,
+ }
+
+ rst := reflect.ValueOf(st).Elem()
+ for i := 0; i < rst.Type().NumField(); i++ {
+ name, val := rst.Type().Field(i).Name, rst.Field(i).Interface()
+ checks, ok := fields[name]
+ if !ok {
+ t.Errorf("unknown MemStats field %s", name)
+ continue
+ }
+ for _, check := range checks {
+ if err := check(val); err != nil {
+ t.Errorf("%s = %v: %s", name, val, err)
+ }
+ }
}
if st.Sys != st.HeapSys+st.StackSys+st.MSpanSys+st.MCacheSys+
st.BuckHashSys+st.GCSys+st.OtherSys {
diff --git a/libgo/go/runtime/map_test.go b/libgo/go/runtime/map_test.go
index 9b5b051..37c959f 100644
--- a/libgo/go/runtime/map_test.go
+++ b/libgo/go/runtime/map_test.go
@@ -10,6 +10,7 @@ import (
"reflect"
"runtime"
"sort"
+ "strconv"
"strings"
"sync"
"testing"
@@ -594,6 +595,14 @@ func TestMapLargeValNoPointer(t *testing.T) {
}
}
+// Test that making a map with a large or invalid hint
+// doesn't panic. (Issue 19926).
+func TestIgnoreBogusMapHint(t *testing.T) {
+ for _, hint := range []int64{-1, 1 << 62} {
+ _ = make(map[int]int, hint)
+ }
+}
+
func benchmarkMapPop(b *testing.B, n int) {
m := map[int]int{}
for i := 0; i < b.N; i++ {
@@ -625,3 +634,86 @@ func TestNonEscapingMap(t *testing.T) {
t.Fatalf("want 0 allocs, got %v", n)
}
}
+
+func benchmarkMapAssignInt32(b *testing.B, n int) {
+ a := make(map[int32]int)
+ for i := 0; i < b.N; i++ {
+ a[int32(i&(n-1))] = i
+ }
+}
+
+func benchmarkMapDeleteInt32(b *testing.B, n int) {
+ a := make(map[int32]int)
+ for i := 0; i < n*b.N; i++ {
+ a[int32(i)] = i
+ }
+ b.ResetTimer()
+ for i := 0; i < n*b.N; i = i + n {
+ delete(a, int32(i))
+ }
+}
+
+func benchmarkMapAssignInt64(b *testing.B, n int) {
+ a := make(map[int64]int)
+ for i := 0; i < b.N; i++ {
+ a[int64(i&(n-1))] = i
+ }
+}
+
+func benchmarkMapDeleteInt64(b *testing.B, n int) {
+ a := make(map[int64]int)
+ for i := 0; i < n*b.N; i++ {
+ a[int64(i)] = i
+ }
+ b.ResetTimer()
+ for i := 0; i < n*b.N; i = i + n {
+ delete(a, int64(i))
+ }
+}
+
+func benchmarkMapAssignStr(b *testing.B, n int) {
+ k := make([]string, n)
+ for i := 0; i < len(k); i++ {
+ k[i] = strconv.Itoa(i)
+ }
+ b.ResetTimer()
+ a := make(map[string]int)
+ for i := 0; i < b.N; i++ {
+ a[k[i&(n-1)]] = i
+ }
+}
+
+func benchmarkMapDeleteStr(b *testing.B, n int) {
+ k := make([]string, n*b.N)
+ for i := 0; i < n*b.N; i++ {
+ k[i] = strconv.Itoa(i)
+ }
+ a := make(map[string]int)
+ for i := 0; i < n*b.N; i++ {
+ a[k[i]] = i
+ }
+ b.ResetTimer()
+ for i := 0; i < n*b.N; i = i + n {
+ delete(a, k[i])
+ }
+}
+
+func runWith(f func(*testing.B, int), v ...int) func(*testing.B) {
+ return func(b *testing.B) {
+ for _, n := range v {
+ b.Run(strconv.Itoa(n), func(b *testing.B) { f(b, n) })
+ }
+ }
+}
+
+func BenchmarkMapAssign(b *testing.B) {
+ b.Run("Int32", runWith(benchmarkMapAssignInt32, 1<<8, 1<<16))
+ b.Run("Int64", runWith(benchmarkMapAssignInt64, 1<<8, 1<<16))
+ b.Run("Str", runWith(benchmarkMapAssignStr, 1<<8, 1<<16))
+}
+
+func BenchmarkMapDelete(b *testing.B) {
+ b.Run("Int32", runWith(benchmarkMapDeleteInt32, 1, 2, 4))
+ b.Run("Int64", runWith(benchmarkMapDeleteInt64, 1, 2, 4))
+ b.Run("Str", runWith(benchmarkMapDeleteStr, 1, 2, 4))
+}
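
The runWith helper above relies on testing.B.Run, which names each sub-benchmark Parent/Sub, so the new cases can be selected individually. Assuming the usual stdlib invocation (not part of this patch), running just the 64-bit assignment cases would look roughly like:

    go test -run '^$' -bench 'BenchmarkMapAssign/Int64' runtime
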
diff --git a/libgo/go/runtime/mapspeed_test.go b/libgo/go/runtime/mapspeed_test.go
index ac93119..aec0c51 100644
--- a/libgo/go/runtime/mapspeed_test.go
+++ b/libgo/go/runtime/mapspeed_test.go
@@ -5,6 +5,7 @@ package runtime_test
import (
"fmt"
+ "strconv"
"strings"
"testing"
)
@@ -308,6 +309,20 @@ func BenchmarkSmallKeyMap(b *testing.B) {
}
}
+func BenchmarkMapPopulate(b *testing.B) {
+ for size := 1; size < 1000000; size *= 10 {
+ b.Run(strconv.Itoa(size), func(b *testing.B) {
+ b.ReportAllocs()
+ for i := 0; i < b.N; i++ {
+ m := make(map[int]bool)
+ for j := 0; j < size; j++ {
+ m[j] = true
+ }
+ }
+ })
+ }
+}
+
type ComplexAlgKey struct {
a, b, c int64
_ int
diff --git a/libgo/go/runtime/mbarrier.go b/libgo/go/runtime/mbarrier.go
index 3a463c8..d54016f0 100644
--- a/libgo/go/runtime/mbarrier.go
+++ b/libgo/go/runtime/mbarrier.go
@@ -156,6 +156,11 @@ func gcmarkwb_m(slot *uintptr, ptr uintptr) {
// combine the read and the write. Checking inheap is
// insufficient since we need to track changes to
// roots outside the heap.
+ //
+ // Note: profbuf.go omits a barrier during signal handler
+ // profile logging; that's safe only because this deletion barrier exists.
+ // If we remove the deletion barrier, we'll have to work out
+ // a new way to handle the profile logging.
if slot1 := uintptr(unsafe.Pointer(slot)); slot1 >= minPhysPageSize {
if optr := *slot; optr != 0 {
shade(optr)
@@ -238,6 +243,7 @@ func writebarrierptr_prewrite(dst *uintptr, src uintptr) {
}
// typedmemmove copies a value of type t to dst from src.
+// Must be nosplit, see #16026.
//go:nosplit
func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
if typ.kind&kindNoPointers == 0 {
diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go
index a7ccc65..d1a5820 100644
--- a/libgo/go/runtime/mbitmap.go
+++ b/libgo/go/runtime/mbitmap.go
@@ -45,6 +45,11 @@
// not checkmarked, and is the dead encoding.
// These properties must be preserved when modifying the encoding.
//
+// The bitmap for noscan spans is not maintained. Code must ensure
+// that an object is scannable before consulting its bitmap by
+// checking either the noscan bit in the span or by consulting its
+// type's information.
+//
// Checkmarks
//
// In a concurrent garbage collector, one worries about failing to mark
@@ -134,13 +139,9 @@ func subtract1(p *byte) *byte {
return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
}
-// mHeap_MapBits is called each time arena_used is extended.
-// It maps any additional bitmap memory needed for the new arena memory.
-// It must be called with the expected new value of arena_used,
-// *before* h.arena_used has been updated.
-// Waiting to update arena_used until after the memory has been mapped
-// avoids faults when other threads try access the bitmap immediately
-// after observing the change to arena_used.
+// mapBits maps any additional bitmap memory needed for the new arena memory.
+//
+// Don't call this directly. Call mheap.setArenaUsed.
//
//go:nowritebarrier
func (h *mheap) mapBits(arena_used uintptr) {
@@ -186,10 +187,8 @@ type markBits struct {
//go:nosplit
func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
- whichByte := allocBitIndex / 8
- whichBit := allocBitIndex % 8
- bytePtr := addb(s.allocBits, whichByte)
- return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex}
+ bytep, mask := s.allocBits.bitp(allocBitIndex)
+ return markBits{bytep, mask, allocBitIndex}
}
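
The allocBits.bitp helper that replaces the open-coded whichByte/whichBit arithmetic is not shown in this patch; judging from the removed lines, it splits a bit index into a byte pointer plus a single-bit mask. A self-contained toy version of that split, for illustration only:

    package main

    import "fmt"

    // bitp mirrors the removed arithmetic: bit n of a packed bitmap lives in
    // byte n/8 under mask 1<<(n%8).
    func bitp(bits []uint8, n uint) (bytep *uint8, mask uint8) {
        return &bits[n/8], uint8(1) << (n % 8)
    }

    func main() {
        bits := make([]uint8, 2)
        bytep, mask := bitp(bits, 11) // byte 1, mask 0x08
        *bytep |= mask
        fmt.Printf("%08b %08b\n", bits[0], bits[1]) // 00000000 00001000
    }
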
// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte

@@ -197,7 +196,7 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
// can be used. It then places these 8 bytes into the cached 64 bit
// s.allocCache.
func (s *mspan) refillAllocCache(whichByte uintptr) {
- bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte)))
+ bytes := (*[8]uint8)(unsafe.Pointer(s.allocBits.bytep(whichByte)))
aCache := uint64(0)
aCache |= uint64(bytes[0])
aCache |= uint64(bytes[1]) << (1 * 8)
@@ -248,7 +247,7 @@ func (s *mspan) nextFreeIndex() uintptr {
return snelems
}
- s.allocCache >>= (bitIndex + 1)
+ s.allocCache >>= uint(bitIndex + 1)
sfreeindex = result + 1
if sfreeindex%64 == 0 && sfreeindex != snelems {
@@ -269,10 +268,8 @@ func (s *mspan) isFree(index uintptr) bool {
if index < s.freeindex {
return false
}
- whichByte := index / 8
- whichBit := index % 8
- byteVal := *addb(s.allocBits, whichByte)
- return byteVal&uint8(1<<whichBit) == 0
+ bytep, mask := s.allocBits.bitp(index)
+ return *bytep&mask == 0
}
func (s *mspan) objIndex(p uintptr) uintptr {
@@ -294,14 +291,12 @@ func markBitsForAddr(p uintptr) markBits {
}
func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
- whichByte := objIndex / 8
- bitMask := uint8(1 << (objIndex % 8)) // low 3 bits hold the bit index
- bytePtr := addb(s.gcmarkBits, whichByte)
- return markBits{bytePtr, bitMask, objIndex}
+ bytep, mask := s.gcmarkBits.bitp(objIndex)
+ return markBits{bytep, mask, objIndex}
}
func (s *mspan) markBitsForBase() markBits {
- return markBits{s.gcmarkBits, uint8(1), 0}
+ return markBits{(*uint8)(s.gcmarkBits), uint8(1), 0}
}
// isMarked reports whether mark bit m is set.
@@ -332,11 +327,6 @@ func (m markBits) clearMarked() {
atomic.And8(m.bytep, ^m.mask)
}
-// clearMarkedNonAtomic clears the marked bit non-atomically.
-func (m markBits) clearMarkedNonAtomic() {
- *m.bytep ^= m.mask
-}
-
// markBitsForSpan returns the markBits for the span base address base.
func markBitsForSpan(base uintptr) (mbits markBits) {
if base < mheap_.arena_start || base >= mheap_.arena_used {
@@ -404,7 +394,7 @@ func heapBitsForObject(p, refBase, refOff uintptr, forStack bool) (base uintptr,
// Consult the span table to find the block beginning.
s = mheap_.spans[idx]
if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
- if s == nil || s.state == _MSpanStack || forStack {
+ if s == nil || s.state == _MSpanManual || forStack {
// If s is nil, the virtual address has never been part of the heap.
// This pointer may be to some mmap'd region, so we allow it.
// Pointers into stacks are also ok, the runtime manages these explicitly.
@@ -434,6 +424,7 @@ func heapBitsForObject(p, refBase, refOff uintptr, forStack bool) (base uintptr,
print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
gcDumpObject("object", refBase, refOff)
}
+ getg().m.traceback = 2
throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
}
return
@@ -524,16 +515,6 @@ func (h heapBits) isPointer() bool {
return h.bits()&bitPointer != 0
}
-// hasPointers reports whether the given object has any pointers.
-// It must be told how large the object at h is for efficiency.
-// h must describe the initial word of the object.
-func (h heapBits) hasPointers(size uintptr) bool {
- if size == sys.PtrSize { // 1-word objects are always pointers
- return true
- }
- return (*h.bitp>>h.shift)&bitScan != 0
-}
-
// isCheckmarked reports whether the heap bits have the checkmarked bit set.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
@@ -839,23 +820,23 @@ var oneBitCount = [256]uint8{
4, 5, 5, 6, 5, 6, 6, 7,
5, 6, 6, 7, 6, 7, 7, 8}
-// countFree runs through the mark bits in a span and counts the number of free objects
-// in the span.
+// countAlloc returns the number of objects allocated in span s by
+// scanning the allocation bitmap.
// TODO:(rlh) Use popcount intrinsic.
-func (s *mspan) countFree() int {
+func (s *mspan) countAlloc() int {
count := 0
maxIndex := s.nelems / 8
for i := uintptr(0); i < maxIndex; i++ {
- mrkBits := *addb(s.gcmarkBits, i)
+ mrkBits := *s.gcmarkBits.bytep(i)
count += int(oneBitCount[mrkBits])
}
if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
- mrkBits := *addb(s.gcmarkBits, maxIndex)
+ mrkBits := *s.gcmarkBits.bytep(maxIndex)
mask := uint8((1 << bitsInLastByte) - 1)
bits := mrkBits & mask
count += int(oneBitCount[bits])
}
- return int(s.nelems) - count
+ return count
}
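
countAlloc still counts mark bits with the oneBitCount lookup table; the TODO above refers to a hardware popcount. math/bits.OnesCount8 (added in Go 1.9) computes the same per-byte count, shown here purely to illustrate the equivalence, not as part of the patch:

    package main

    import (
        "fmt"
        "math/bits"
    )

    func main() {
        b := uint8(0xb4)                // 10110100: four mark bits set in this byte
        fmt.Println(bits.OnesCount8(b)) // 4
    }
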
// heapBitsSetType records that the new allocation [x, x+size)
@@ -1076,7 +1057,9 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
endnb += endnb
}
// Truncate to a multiple of original ptrmask.
- endnb = maxBits / nb * nb
+ // Because nb+nb <= maxBits, nb fits in a byte.
+ // Byte division is cheaper than uintptr division.
+ endnb = uintptr(maxBits/byte(nb)) * nb
pbits &= 1<<endnb - 1
b = pbits
nb = endnb
@@ -1354,13 +1337,6 @@ Phase4:
}
}
-// heapBitsSetTypeNoScan marks x as noscan by setting the first word
-// of x in the heap bitmap to scalar/dead.
-func heapBitsSetTypeNoScan(x uintptr) {
- h := heapBitsForAddr(uintptr(x))
- *h.bitp &^= (bitPointer | bitScan) << h.shift
-}
-
var debugPtrmask struct {
lock mutex
data *byte
diff --git a/libgo/go/runtime/mcache.go b/libgo/go/runtime/mcache.go
index 92dabef..71a2f22 100644
--- a/libgo/go/runtime/mcache.go
+++ b/libgo/go/runtime/mcache.go
@@ -33,7 +33,8 @@ type mcache struct {
local_tinyallocs uintptr // number of tiny allocs not counted in other stats
// The rest is not accessed on every malloc.
- alloc [_NumSizeClasses]*mspan // spans to allocate from
+
+ alloc [numSpanClasses]*mspan // spans to allocate from, indexed by spanClass
// Local allocator stats, flushed during GC.
local_nlookup uintptr // number of pointer lookups
@@ -70,7 +71,7 @@ func allocmcache() *mcache {
lock(&mheap_.lock)
c := (*mcache)(mheap_.cachealloc.alloc())
unlock(&mheap_.lock)
- for i := 0; i < _NumSizeClasses; i++ {
+ for i := range c.alloc {
c.alloc[i] = &emptymspan
}
c.next_sample = nextSample()
@@ -95,12 +96,12 @@ func freemcache(c *mcache) {
// Gets a span that has a free object in it and assigns it
// to be the cached span for the given sizeclass. Returns this span.
-func (c *mcache) refill(sizeclass int32) *mspan {
+func (c *mcache) refill(spc spanClass) *mspan {
_g_ := getg()
_g_.m.locks++
// Return the current cached span to the central lists.
- s := c.alloc[sizeclass]
+ s := c.alloc[spc]
if uintptr(s.allocCount) != s.nelems {
throw("refill of span with free space remaining")
@@ -111,7 +112,7 @@ func (c *mcache) refill(sizeclass int32) *mspan {
}
// Get a new cached span from the central lists.
- s = mheap_.central[sizeclass].mcentral.cacheSpan()
+ s = mheap_.central[spc].mcentral.cacheSpan()
if s == nil {
throw("out of memory")
}
@@ -120,13 +121,13 @@ func (c *mcache) refill(sizeclass int32) *mspan {
throw("span has no free space")
}
- c.alloc[sizeclass] = s
+ c.alloc[spc] = s
_g_.m.locks--
return s
}
func (c *mcache) releaseAll() {
- for i := 0; i < _NumSizeClasses; i++ {
+ for i := range c.alloc {
s := c.alloc[i]
if s != &emptymspan {
mheap_.central[i].mcentral.uncacheSpan(s)
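
mcache.alloc is now indexed by spanClass rather than by size class, and numSpanClasses replaces _NumSizeClasses, but the spanClass type itself is defined elsewhere (not in this hunk). As a rough, hypothetical sketch of how such an index can fold an extra per-span property in next to the size class (the names and the exact property are assumptions, not taken from this hunk):

    package main

    import "fmt"

    type spanClass uint8

    // makeSpanClass packs a size class and one boolean property (for example
    // a "noscan" flag) into a single array index: sizeclass<<1 | flag.
    func makeSpanClass(sizeclass uint8, noscan bool) spanClass {
        sc := spanClass(sizeclass) << 1
        if noscan {
            sc |= 1
        }
        return sc
    }

    func (sc spanClass) sizeclass() int8 { return int8(sc >> 1) }
    func (sc spanClass) noscan() bool    { return sc&1 != 0 }

    func main() {
        sc := makeSpanClass(5, true)
        fmt.Println(sc, sc.sizeclass(), sc.noscan()) // 11 5 true
    }

If the index space is doubled this way, that would also explain why the cache loops now range over c.alloc rather than counting to _NumSizeClasses.
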
diff --git a/libgo/go/runtime/mcentral.go b/libgo/go/runtime/mcentral.go
index ddcf81e..eaabcb9 100644
--- a/libgo/go/runtime/mcentral.go
+++ b/libgo/go/runtime/mcentral.go
@@ -19,14 +19,19 @@ import "runtime/internal/atomic"
//go:notinheap
type mcentral struct {
lock mutex
- sizeclass int32
+ spanclass spanClass
nonempty mSpanList // list of spans with a free object, ie a nonempty free list
empty mSpanList // list of spans with no free objects (or cached in an mcache)
+
+ // nmalloc is the cumulative count of objects allocated from
+ // this mcentral, assuming all spans in mcaches are
+ // fully-allocated. Written atomically, read under STW.
+ nmalloc uint64
}
// Initialize a single central free list.
-func (c *mcentral) init(sizeclass int32) {
- c.sizeclass = sizeclass
+func (c *mcentral) init(spc spanClass) {
+ c.spanclass = spc
c.nonempty.init()
c.empty.init()
}
@@ -34,10 +39,14 @@ func (c *mcentral) init(sizeclass int32) {
// Allocate a span to use in an MCache.
func (c *mcentral) cacheSpan() *mspan {
// Deduct credit for this span allocation and sweep if necessary.
- spanBytes := uintptr(class_to_allocnpages[c.sizeclass]) * _PageSize
+ spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
deductSweepCredit(spanBytes, 0)
lock(&c.lock)
+ traceDone := false
+ if trace.enabled {
+ traceGCSweepStart()
+ }
sg := mheap_.sweepgen
retry:
var s *mspan
@@ -87,6 +96,10 @@ retry:
// all subsequent ones must also be either swept or in process of sweeping
break
}
+ if trace.enabled {
+ traceGCSweepDone()
+ traceDone = true
+ }
unlock(&c.lock)
// Replenish central list if empty.
@@ -101,15 +114,18 @@ retry:
// At this point s is a non-empty span, queued at the end of the empty list,
// c is unlocked.
havespan:
+ if trace.enabled && !traceDone {
+ traceGCSweepDone()
+ }
cap := int32((s.npages << _PageShift) / s.elemsize)
n := cap - int32(s.allocCount)
if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
throw("span has no free objects")
}
+ // Assume all objects from this span will be allocated in the
+ // mcache. If it gets uncached, we'll adjust this.
+ atomic.Xadd64(&c.nmalloc, int64(n))
usedBytes := uintptr(s.allocCount) * s.elemsize
- if usedBytes > 0 {
- reimburseSweepCredit(usedBytes)
- }
atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
if trace.enabled {
// heap_live changed.
@@ -150,6 +166,10 @@ func (c *mcentral) uncacheSpan(s *mspan) {
// mCentral_CacheSpan conservatively counted
// unallocated slots in heap_live. Undo this.
atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+ // cacheSpan updated nmalloc assuming all objects on s
+ // were going to be allocated. Adjust for any that
+ // weren't.
+ atomic.Xadd64(&c.nmalloc, -int64(n))
}
unlock(&c.lock)
}
@@ -205,11 +225,11 @@ func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool {
// grow allocates a new empty span from the heap and initializes it for c's size class.
func (c *mcentral) grow() *mspan {
- npages := uintptr(class_to_allocnpages[c.sizeclass])
- size := uintptr(class_to_size[c.sizeclass])
+ npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()])
+ size := uintptr(class_to_size[c.spanclass.sizeclass()])
n := (npages << _PageShift) / size
- s := mheap_.alloc(npages, c.sizeclass, false, true)
+ s := mheap_.alloc(npages, c.spanclass, false, true)
if s == nil {
return nil
}
diff --git a/libgo/go/runtime/mfinal.go b/libgo/go/runtime/mfinal.go
index 229ccb5..4353ee5 100644
--- a/libgo/go/runtime/mfinal.go
+++ b/libgo/go/runtime/mfinal.go
@@ -12,8 +12,12 @@ import (
"unsafe"
)
+// finblock is an array of finalizers to be executed. finblocks are
+// arranged in a linked list for the finalizer queue.
+//
// finblock is allocated from non-GC'd memory, so any heap pointers
-// must be specially handled.
+// must be specially handled. GC currently assumes that the finalizer
+// queue does not grow during marking (but it can shrink).
//
//go:notinheap
type finblock struct {
@@ -42,6 +46,16 @@ type finalizer struct {
}
func queuefinalizer(p unsafe.Pointer, fn *funcval, ft *functype, ot *ptrtype) {
+ if gcphase != _GCoff {
+ // Currently we assume that the finalizer queue won't
+ // grow during marking so we don't have to rescan it
+ // during mark termination. If we ever need to lift
+ // this assumption, we can do it by adding the
+ // necessary barriers to queuefinalizer (which it may
+ // have automatically).
+ throw("queuefinalizer during GC")
+ }
+
lock(&finlock)
if finq == nil || finq.cnt == uint32(len(finq.fin)) {
if finc == nil {
@@ -399,7 +413,7 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) {
}
n = s.elemsize
- if s.sizeclass != 0 {
+ if s.spanclass.sizeclass() != 0 {
x = add(x, (uintptr(v)-uintptr(x))/n*n)
}
return
diff --git a/libgo/go/runtime/mfixalloc.go b/libgo/go/runtime/mfixalloc.go
index fe4b0fc..7496671 100644
--- a/libgo/go/runtime/mfixalloc.go
+++ b/libgo/go/runtime/mfixalloc.go
@@ -29,7 +29,7 @@ type fixalloc struct {
first func(arg, p unsafe.Pointer) // called first time p is returned
arg unsafe.Pointer
list *mlink
- chunk unsafe.Pointer
+ chunk uintptr // use uintptr instead of unsafe.Pointer to avoid write barriers
nchunk uint32
inuse uintptr // in-use bytes now
stat *uint64
@@ -54,7 +54,7 @@ func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg uns
f.first = first
f.arg = arg
f.list = nil
- f.chunk = nil
+ f.chunk = 0
f.nchunk = 0
f.inuse = 0
f.stat = stat
@@ -77,15 +77,15 @@ func (f *fixalloc) alloc() unsafe.Pointer {
return v
}
if uintptr(f.nchunk) < f.size {
- f.chunk = persistentalloc(_FixAllocChunk, 0, f.stat)
+ f.chunk = uintptr(persistentalloc(_FixAllocChunk, 0, f.stat))
f.nchunk = _FixAllocChunk
}
- v := f.chunk
+ v := unsafe.Pointer(f.chunk)
if f.first != nil {
f.first(f.arg, v)
}
- f.chunk = add(f.chunk, f.size)
+ f.chunk = f.chunk + f.size
f.nchunk -= uint32(f.size)
f.inuse += f.size
return v
diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go
index a4fc2be..31c4be8 100644
--- a/libgo/go/runtime/mgc.go
+++ b/libgo/go/runtime/mgc.go
@@ -178,17 +178,21 @@ func gcinit() {
throw("size of Workbuf is suboptimal")
}
+ // No sweep on the first cycle.
+ mheap_.sweepdone = 1
+
+ // Set a reasonable initial GC trigger.
+ memstats.triggerRatio = 7 / 8.0
+
+ // Fake a heap_marked value so it looks like a trigger at
+ // heapminimum is the appropriate growth from heap_marked.
+ // This will go into computing the initial GC goal.
+ memstats.heap_marked = uint64(float64(heapminimum) / (1 + memstats.triggerRatio))
+
+ // Set gcpercent from the environment. This will also compute
+ // and set the GC trigger and goal.
_ = setGCPercent(readgogc())
- memstats.gc_trigger = heapminimum
- // Compute the goal heap size based on the trigger:
- // trigger = marked * (1 + triggerRatio)
- // marked = trigger / (1 + triggerRatio)
- // goal = marked * (1 + GOGC/100)
- // = trigger / (1 + triggerRatio) * (1 + GOGC/100)
- memstats.next_gc = uint64(float64(memstats.gc_trigger) / (1 + gcController.triggerRatio) * (1 + float64(gcpercent)/100))
- if gcpercent < 0 {
- memstats.next_gc = ^uint64(0)
- }
+
work.startSema = 1
work.markDoneSema = 1
}
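
A quick check of the faked heap_marked value: if heapminimum works out to, say, 4 MB, then heap_marked is set to 4 MB / (1 + 7/8) ≈ 2.13 MB, and the trigger later computed as heap_marked × (1 + triggerRatio) is 2.13 MB × 1.875 = 4 MB again. The first cycle therefore still fires at heapminimum, but now through the same pacing formulas used for every later cycle.
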
@@ -224,12 +228,8 @@ func setGCPercent(in int32) (out int32) {
}
gcpercent = in
heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
- if gcController.triggerRatio > float64(gcpercent)/100 {
- gcController.triggerRatio = float64(gcpercent) / 100
- }
- // This is either in gcinit or followed by a STW GC, both of
- // which will reset other stats like memstats.gc_trigger and
- // memstats.next_gc to appropriate values.
+ // Update pacing in response to gcpercent change.
+ gcSetTriggerRatio(memstats.triggerRatio)
unlock(&mheap_.lock)
return out
}
@@ -239,7 +239,9 @@ func setGCPercent(in int32) (out int32) {
var gcphase uint32
// The compiler knows about this variable.
-// If you change it, you must change the compiler too.
+// If you change it, you must change builtin/runtime.go, too.
+// If you change the first four bytes, you must also change the write
+// barrier insertion code.
var writeBarrier struct {
enabled bool // compiler emits a check of this before calling write barrier
pad [3]byte // compiler uses 32-bit load for "enabled" field
@@ -329,10 +331,10 @@ var gcMarkWorkerModeStrings = [...]string{
// utilization between assist and background marking to be 25% of
// GOMAXPROCS. The high-level design of this algorithm is documented
// at https://golang.org/s/go15gcpacing.
-var gcController = gcControllerState{
- // Initial trigger ratio guess.
- triggerRatio: 7 / 8.0,
-}
+//
+// All fields of gcController are used only during a single mark
+// cycle.
+var gcController gcControllerState
type gcControllerState struct {
// scanWork is the total scan work performed this cycle. This
@@ -403,14 +405,6 @@ type gcControllerState struct {
// beginning of each cycle.
fractionalUtilizationGoal float64
- // triggerRatio is the heap growth ratio at which the garbage
- // collection cycle should start. E.g., if this is 0.6, then
- // GC should start when the live heap has reached 1.6 times
- // the heap size marked by the previous cycle. This should be
- // ≤ GOGC/100 so the trigger heap size is less than the goal
- // heap size. This is updated at the end of of each cycle.
- triggerRatio float64
-
_ [sys.CacheLineSize]byte
// fractionalMarkWorkersNeeded is the number of fractional
@@ -439,7 +433,7 @@ func (c *gcControllerState) startCycle() {
// first cycle) or may be much smaller (resulting in a large
// error response).
if memstats.gc_trigger <= heapminimum {
- memstats.heap_marked = uint64(float64(memstats.gc_trigger) / (1 + c.triggerRatio))
+ memstats.heap_marked = uint64(float64(memstats.gc_trigger) / (1 + memstats.triggerRatio))
}
// Re-compute the heap goal for this cycle in case something
@@ -495,17 +489,12 @@ func (c *gcControllerState) startCycle() {
// revise updates the assist ratio during the GC cycle to account for
// improved estimates. This should be called either under STW or
-// whenever memstats.heap_scan or memstats.heap_live is updated (with
-// mheap_.lock held).
+// whenever memstats.heap_scan, memstats.heap_live, or
+// memstats.next_gc is updated (with mheap_.lock held).
//
// It should only be called when gcBlackenEnabled != 0 (because this
// is when assists are enabled and the necessary statistics are
// available).
-//
-// TODO: Consider removing the periodic controller update altogether.
-// Since we switched to allocating black, in theory we shouldn't have
-// to change the assist ratio. However, this is still a useful hook
-// that we've found many uses for when experimenting.
func (c *gcControllerState) revise() {
// Compute the expected scan work remaining.
//
@@ -536,7 +525,7 @@ func (c *gcControllerState) revise() {
}
// Compute the heap distance remaining.
- heapDistance := int64(memstats.next_gc) - int64(memstats.heap_live)
+ heapDistance := int64(memstats.next_gc) - int64(atomic.Load64(&memstats.heap_live))
if heapDistance <= 0 {
// This shouldn't happen, but if it does, avoid
// dividing by zero or setting the assist negative.
@@ -550,10 +539,15 @@ func (c *gcControllerState) revise() {
c.assistBytesPerWork = float64(heapDistance) / float64(scanWorkExpected)
}
-// endCycle updates the GC controller state at the end of the
-// concurrent part of the GC cycle.
-func (c *gcControllerState) endCycle() {
- h_t := c.triggerRatio // For debugging
+// endCycle computes the trigger ratio for the next cycle.
+func (c *gcControllerState) endCycle() float64 {
+ if work.userForced {
+ // Forced GC means this cycle didn't start at the
+ // trigger, so where it finished isn't good
+ // information about how to adjust the trigger.
+ // Just leave it where it is.
+ return memstats.triggerRatio
+ }
// Proportional response gain for the trigger controller. Must
// be in [0, 1]. Lower values smooth out transient effects but
@@ -582,25 +576,17 @@ func (c *gcControllerState) endCycle() {
utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
}
- triggerError := goalGrowthRatio - c.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-c.triggerRatio)
+ triggerError := goalGrowthRatio - memstats.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-memstats.triggerRatio)
// Finally, we adjust the trigger for next time by this error,
// damped by the proportional gain.
- c.triggerRatio += triggerGain * triggerError
- if c.triggerRatio < 0 {
- // This can happen if the mutator is allocating very
- // quickly or the GC is scanning very slowly.
- c.triggerRatio = 0
- } else if c.triggerRatio > goalGrowthRatio*0.95 {
- // Ensure there's always a little margin so that the
- // mutator assist ratio isn't infinity.
- c.triggerRatio = goalGrowthRatio * 0.95
- }
+ triggerRatio := memstats.triggerRatio + triggerGain*triggerError
if debug.gcpacertrace > 0 {
// Print controller state in terms of the design
// document.
H_m_prev := memstats.heap_marked
+ h_t := memstats.triggerRatio
H_T := memstats.gc_trigger
h_a := actualGrowthRatio
H_a := memstats.heap_live
@@ -620,6 +606,8 @@ func (c *gcControllerState) endCycle() {
" u_a/u_g=", u_a/u_g,
"\n")
}
+
+ return triggerRatio
}
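
To make the proportional controller concrete with round numbers: suppose the goal growth ratio (GOGC/100) is 1.0, this cycle ran with a trigger ratio of 0.7, the heap actually grew by 0.85 before marking finished, and utilization landed exactly on the 25% goal so the utilization factor is 1. Then triggerError = 1.0 − 0.7 − 1 × (0.85 − 0.7) = 0.15, and with a damping gain of, say, 0.5 (the constant itself is outside this hunk) the ratio handed to the next cycle is 0.7 + 0.5 × 0.15 = 0.775.
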
// enlistWorker encourages another dedicated mark worker to start on
@@ -651,7 +639,7 @@ func (c *gcControllerState) enlistWorker() {
}
myID := gp.m.p.ptr().id
for tries := 0; tries < 5; tries++ {
- id := int32(fastrand() % uint32(gomaxprocs-1))
+ id := int32(fastrandn(uint32(gomaxprocs - 1)))
if id >= myID {
id++
}
@@ -701,9 +689,6 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
// This P is now dedicated to marking until the end of
// the concurrent mark phase.
_p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
- // TODO(austin): This P isn't going to run anything
- // else for a while, so kick everything out of its run
- // queue.
} else {
if !decIfPositive(&c.fractionalMarkWorkersNeeded) {
// No more workers are needed right now.
@@ -761,6 +746,120 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
return gp
}
+// gcSetTriggerRatio sets the trigger ratio and updates everything
+// derived from it: the absolute trigger, the heap goal, mark pacing,
+// and sweep pacing.
+//
+// This can be called any time. If GC is in the middle of a
+// concurrent phase, it will adjust the pacing of that phase.
+//
+// This depends on gcpercent, memstats.heap_marked, and
+// memstats.heap_live. These must be up to date.
+//
+// mheap_.lock must be held or the world must be stopped.
+func gcSetTriggerRatio(triggerRatio float64) {
+ // Set the trigger ratio, capped to reasonable bounds.
+ if triggerRatio < 0 {
+ // This can happen if the mutator is allocating very
+ // quickly or the GC is scanning very slowly.
+ triggerRatio = 0
+ } else if gcpercent >= 0 {
+ // Ensure there's always a little margin so that the
+ // mutator assist ratio isn't infinity.
+ maxTriggerRatio := 0.95 * float64(gcpercent) / 100
+ if triggerRatio > maxTriggerRatio {
+ triggerRatio = maxTriggerRatio
+ }
+ }
+ memstats.triggerRatio = triggerRatio
+
+ // Compute the absolute GC trigger from the trigger ratio.
+ //
+ // We trigger the next GC cycle when the allocated heap has
+ // grown by the trigger ratio over the marked heap size.
+ trigger := ^uint64(0)
+ if gcpercent >= 0 {
+ trigger = uint64(float64(memstats.heap_marked) * (1 + triggerRatio))
+ // Don't trigger below the minimum heap size.
+ minTrigger := heapminimum
+ if !gosweepdone() {
+ // Concurrent sweep happens in the heap growth
+ // from heap_live to gc_trigger, so ensure
+ // that concurrent sweep has some heap growth
+ // in which to perform sweeping before we
+ // start the next GC cycle.
+ sweepMin := atomic.Load64(&memstats.heap_live) + sweepMinHeapDistance*uint64(gcpercent)/100
+ if sweepMin > minTrigger {
+ minTrigger = sweepMin
+ }
+ }
+ if trigger < minTrigger {
+ trigger = minTrigger
+ }
+ if int64(trigger) < 0 {
+ print("runtime: next_gc=", memstats.next_gc, " heap_marked=", memstats.heap_marked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
+ throw("gc_trigger underflow")
+ }
+ }
+ memstats.gc_trigger = trigger
+
+ // Compute the next GC goal, which is when the allocated heap
+ // has grown by GOGC/100 over the heap marked by the last
+ // cycle.
+ goal := ^uint64(0)
+ if gcpercent >= 0 {
+ goal = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
+ if goal < trigger {
+ // The trigger ratio is always less than GOGC/100, but
+ // other bounds on the trigger may have raised it.
+ // Push up the goal, too.
+ goal = trigger
+ }
+ }
+ memstats.next_gc = goal
+ if trace.enabled {
+ traceNextGC()
+ }
+
+ // Update mark pacing.
+ if gcphase != _GCoff {
+ gcController.revise()
+ }
+
+ // Update sweep pacing.
+ if gosweepdone() {
+ mheap_.sweepPagesPerByte = 0
+ } else {
+ // Concurrent sweep needs to sweep all of the in-use
+ // pages by the time the allocated heap reaches the GC
+ // trigger. Compute the ratio of in-use pages to sweep
+ // per byte allocated, accounting for the fact that
+ // some might already be swept.
+ heapLiveBasis := atomic.Load64(&memstats.heap_live)
+ heapDistance := int64(trigger) - int64(heapLiveBasis)
+ // Add a little margin so rounding errors and
+ // concurrent sweep are less likely to leave pages
+ // unswept when GC starts.
+ heapDistance -= 1024 * 1024
+ if heapDistance < _PageSize {
+ // Avoid setting the sweep ratio extremely high
+ heapDistance = _PageSize
+ }
+ pagesSwept := atomic.Load64(&mheap_.pagesSwept)
+ sweepDistancePages := int64(mheap_.pagesInUse) - int64(pagesSwept)
+ if sweepDistancePages <= 0 {
+ mheap_.sweepPagesPerByte = 0
+ } else {
+ mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
+ mheap_.sweepHeapLiveBasis = heapLiveBasis
+ // Write pagesSweptBasis last, since this
+ // signals concurrent sweeps to recompute
+ // their debt.
+ atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
+ }
+ }
+}
+
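Plugging numbers into the formulas above: with gcpercent = 100 and heap_marked = 100 MB, a trigger ratio of 0.7 yields gc_trigger = 100 MB × 1.7 = 170 MB and next_gc = 100 MB + 100 MB = 200 MB. A ratio of 0.97 would first be capped to 0.95 × 100/100 = 0.95, so the trigger (195 MB) always stays below the goal and the assist ratio remains finite.
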
// gcGoalUtilization is the goal CPU utilization for background
// marking as a fraction of GOMAXPROCS.
const gcGoalUtilization = 0.25
@@ -783,10 +882,23 @@ const gcAssistTimeSlack = 5000
const gcOverAssistWork = 64 << 10
var work struct {
- full uint64 // lock-free list of full blocks workbuf
- empty uint64 // lock-free list of empty blocks workbuf
+ full lfstack // lock-free list of full blocks workbuf
+ empty lfstack // lock-free list of empty blocks workbuf
pad0 [sys.CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+ wbufSpans struct {
+ lock mutex
+ // free is a list of spans dedicated to workbufs, but
+ // that don't currently contain any workbufs.
+ free mSpanList
+ // busy is a list of all spans containing workbufs on
+ // one of the workbuf lists.
+ busy mSpanList
+ }
+
+ // Restore 64-bit alignment on 32-bit.
+ _ uint32
+
// bytesMarked is the number of bytes marked this cycle. This
// includes bytes blackened in scanned objects, noscan objects
// that go straight to black, and permagrey objects scanned by
@@ -816,15 +928,13 @@ var work struct {
// should pass gcDrainBlock to gcDrain to block in the
// getfull() barrier. Otherwise, they should pass gcDrainNoBlock.
//
- // TODO: This is a temporary fallback to support
- // debug.gcrescanstacks > 0 and to work around some known
- // races. Remove this when we remove the debug option and fix
- // the races.
+ // TODO: This is a temporary fallback to work around races
+ // that cause early mark termination.
helperDrainBlock bool
// Number of roots of various root types. Set by gcMarkRootPrepare.
- nFlushCacheRoots int
- nDataRoots, nSpanRoots, nStackRoots, nRescanRoots int
+ nFlushCacheRoots int
+ nDataRoots, nSpanRoots, nStackRoots int
// markrootDone indicates that roots have been marked at least
// once during the current GC cycle. This is checked by root
@@ -860,6 +970,10 @@ var work struct {
// mode is the concurrency mode of the current GC cycle.
mode gcMode
+ // userForced indicates the current GC cycle was forced by an
+ // explicit user call.
+ userForced bool
+
// totaltime is the CPU nanoseconds spent in GC since the
// program started if debug.gctrace > 0.
totaltime int64
@@ -876,14 +990,19 @@ var work struct {
head, tail guintptr
}
- // rescan is a list of G's that need to be rescanned during
- // mark termination. A G adds itself to this list when it
- // first invalidates its stack scan.
- rescan struct {
+ // sweepWaiters is a list of blocked goroutines to wake when
+ // we transition from mark termination to sweep.
+ sweepWaiters struct {
lock mutex
- list []guintptr
+ head guintptr
}
+ // cycles is the number of completed GC cycles, where a GC
+ // cycle is sweep termination, mark, mark termination, and
+ // sweep. This differs from memstats.numgc, which is
+ // incremented at mark termination.
+ cycles uint32
+
// Timing/utilization stats for this cycle.
stwprocs, maxprocs int32
tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
@@ -899,7 +1018,94 @@ var work struct {
// garbage collection is complete. It may also block the entire
// program.
func GC() {
- gcStart(gcForceBlockMode, false)
+ // We consider a cycle to be: sweep termination, mark, mark
+ // termination, and sweep. This function shouldn't return
+ // until a full cycle has been completed, from beginning to
+ // end. Hence, we always want to finish up the current cycle
+ // and start a new one. That means:
+ //
+ // 1. In sweep termination, mark, or mark termination of cycle
+ // N, wait until mark termination N completes and transitions
+ // to sweep N.
+ //
+ // 2. In sweep N, help with sweep N.
+ //
+ // At this point we can begin a full cycle N+1.
+ //
+ // 3. Trigger cycle N+1 by starting sweep termination N+1.
+ //
+ // 4. Wait for mark termination N+1 to complete.
+ //
+ // 5. Help with sweep N+1 until it's done.
+ //
+ // This all has to be written to deal with the fact that the
+ // GC may move ahead on its own. For example, when we block
+ // until mark termination N, we may wake up in cycle N+2.
+
+ gp := getg()
+
+ // Prevent the GC phase or cycle count from changing.
+ lock(&work.sweepWaiters.lock)
+ n := atomic.Load(&work.cycles)
+ if gcphase == _GCmark {
+ // Wait until sweep termination, mark, and mark
+ // termination of cycle N complete.
+ gp.schedlink = work.sweepWaiters.head
+ work.sweepWaiters.head.set(gp)
+ goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1)
+ } else {
+ // We're in sweep N already.
+ unlock(&work.sweepWaiters.lock)
+ }
+
+ // We're now in sweep N or later. Trigger GC cycle N+1, which
+ // will first finish sweep N if necessary and then enter sweep
+ // termination N+1.
+ gcStart(gcBackgroundMode, gcTrigger{kind: gcTriggerCycle, n: n + 1})
+
+ // Wait for mark termination N+1 to complete.
+ lock(&work.sweepWaiters.lock)
+ if gcphase == _GCmark && atomic.Load(&work.cycles) == n+1 {
+ gp.schedlink = work.sweepWaiters.head
+ work.sweepWaiters.head.set(gp)
+ goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1)
+ } else {
+ unlock(&work.sweepWaiters.lock)
+ }
+
+ // Finish sweep N+1 before returning. We do this both to
+ // complete the cycle and because runtime.GC() is often used
+ // as part of tests and benchmarks to get the system into a
+ // relatively stable and isolated state.
+ for atomic.Load(&work.cycles) == n+1 && gosweepone() != ^uintptr(0) {
+ sweep.nbgsweep++
+ Gosched()
+ }
+
+ // Callers may assume that the heap profile reflects the
+ // just-completed cycle when this returns (historically this
+ // happened because this was a STW GC), but right now the
+ // profile still reflects mark termination N, not N+1.
+ //
+ // As soon as all of the sweep frees from cycle N+1 are done,
+ // we can go ahead and publish the heap profile.
+ //
+ // First, wait for sweeping to finish. (We know there are no
+ // more spans on the sweep queue, but we may be concurrently
+ // sweeping spans, so we have to wait.)
+ for atomic.Load(&work.cycles) == n+1 && atomic.Load(&mheap_.sweepers) != 0 {
+ Gosched()
+ }
+
+ // Now we're really done with sweeping, so we can publish the
+ // stable heap profile. Only do this if we haven't already hit
+ // another mark termination.
+ mp := acquirem()
+ cycle := atomic.Load(&work.cycles)
+ if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
+ mProf_PostSweep()
+ }
+ releasem(mp)
}
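
The comment above notes that runtime.GC() is often used to put the system in a stable state before measuring it; with this change that also means the heap profile reflects the completed cycle. A small, self-contained example of that pattern (ordinary user code, not runtime internals):

    package main

    import (
        "fmt"
        "runtime"
    )

    func main() {
        runtime.GC() // returns only after a full sweep-termination..sweep cycle
        var ms runtime.MemStats
        runtime.ReadMemStats(&ms)
        fmt.Println("heap in use after GC:", ms.HeapInuse)
    }
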
// gcMode indicates how concurrent a GC cycle should be.
@@ -911,24 +1117,75 @@ const (
gcForceBlockMode // stop-the-world GC now and STW sweep (forced by user)
)
-// gcShouldStart returns true if the exit condition for the _GCoff
-// phase has been met. The exit condition should be tested when
-// allocating.
-//
-// If forceTrigger is true, it ignores the current heap size, but
-// checks all other conditions. In general this should be false.
-func gcShouldStart(forceTrigger bool) bool {
- return gcphase == _GCoff && (forceTrigger || memstats.heap_live >= memstats.gc_trigger) && memstats.enablegc && panicking == 0 && gcpercent >= 0
+// A gcTrigger is a predicate for starting a GC cycle. Specifically,
+// it is an exit condition for the _GCoff phase.
+type gcTrigger struct {
+ kind gcTriggerKind
+ now int64 // gcTriggerTime: current time
+ n uint32 // gcTriggerCycle: cycle number to start
+}
+
+type gcTriggerKind int
+
+const (
+ // gcTriggerAlways indicates that a cycle should be started
+ // unconditionally, even if GOGC is off or we're in a cycle
+ // right now. This cannot be consolidated with other cycles.
+ gcTriggerAlways gcTriggerKind = iota
+
+ // gcTriggerHeap indicates that a cycle should be started when
+ // the heap size reaches the trigger heap size computed by the
+ // controller.
+ gcTriggerHeap
+
+ // gcTriggerTime indicates that a cycle should be started when
+ // it's been more than forcegcperiod nanoseconds since the
+ // previous GC cycle.
+ gcTriggerTime
+
+ // gcTriggerCycle indicates that a cycle should be started if
+ // we have not yet started cycle number gcTrigger.n (relative
+ // to work.cycles).
+ gcTriggerCycle
+)
+
+// test returns true if the trigger condition is satisfied, meaning
+// that the exit condition for the _GCoff phase has been met. The exit
+// condition should be tested when allocating.
+func (t gcTrigger) test() bool {
+ if !memstats.enablegc || panicking != 0 {
+ return false
+ }
+ if t.kind == gcTriggerAlways {
+ return true
+ }
+ if gcphase != _GCoff || gcpercent < 0 {
+ return false
+ }
+ switch t.kind {
+ case gcTriggerHeap:
+ // Non-atomic access to heap_live for performance. If
+ // we are going to trigger on this, this thread just
+ // atomically wrote heap_live anyway and we'll see our
+ // own write.
+ return memstats.heap_live >= memstats.gc_trigger
+ case gcTriggerTime:
+ lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
+ return lastgc != 0 && t.now-lastgc > forcegcperiod
+ case gcTriggerCycle:
+ // t.n > work.cycles, but accounting for wraparound.
+ return int32(t.n-work.cycles) > 0
+ }
+ return true
}
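
GC() above shows the intended pattern: describe the start condition as a gcTrigger, test it cheaply at the call site, and pass the same value to gcStart, which re-tests it under the transition lock. For the heap-size trigger, the allocation path presumably ends up with a fragment along these lines (the exact call site is not shown in this patch, so treat it as an assumption):

    // Hypothetical malloc-path check, mirroring how GC() uses gcTriggerCycle.
    if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
        gcStart(gcBackgroundMode, t)
    }
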
-// gcStart transitions the GC from _GCoff to _GCmark (if mode ==
-// gcBackgroundMode) or _GCmarktermination (if mode !=
-// gcBackgroundMode) by performing sweep termination and GC
-// initialization.
+// gcStart transitions the GC from _GCoff to _GCmark (if
+// !mode.stwMark) or _GCmarktermination (if mode.stwMark) by
+// performing sweep termination and GC initialization.
//
// This may return without performing this transition in some cases,
// such as when called on a system stack or with locks held.
-func gcStart(mode gcMode, forceTrigger bool) {
+func gcStart(mode gcMode, trigger gcTrigger) {
// Since this is called from malloc and malloc is called in
// the guts of a number of libraries that might be holding
// locks, don't attempt to start GC in non-preemptible or
@@ -951,29 +1208,21 @@ func gcStart(mode gcMode, forceTrigger bool) {
//
// We check the transition condition continuously here in case
// this G gets delayed into the next GC cycle.
- for (mode != gcBackgroundMode || gcShouldStart(forceTrigger)) && gosweepone() != ^uintptr(0) {
+ for trigger.test() && gosweepone() != ^uintptr(0) {
sweep.nbgsweep++
}
// Perform GC initialization and the sweep termination
// transition.
- //
- // If this is a forced GC, don't acquire the transition lock
- // or re-check the transition condition because we
- // specifically *don't* want to share the transition with
- // another thread.
- useStartSema := mode == gcBackgroundMode
- if useStartSema {
- semacquire(&work.startSema, 0)
- // Re-check transition condition under transition lock.
- if !gcShouldStart(forceTrigger) {
- semrelease(&work.startSema)
- return
- }
+ semacquire(&work.startSema)
+ // Re-check transition condition under transition lock.
+ if !trigger.test() {
+ semrelease(&work.startSema)
+ return
}
// For stats, check if this GC was forced by the user.
- forced := mode != gcBackgroundMode
+ work.userForced = trigger.kind == gcTriggerAlways || trigger.kind == gcTriggerCycle
// In gcstoptheworld debug mode, upgrade the mode accordingly.
// We do this after re-checking the transition condition so
@@ -988,7 +1237,7 @@ func gcStart(mode gcMode, forceTrigger bool) {
}
// Ok, we're doing it! Stop everybody else
- semacquire(&worldsema, 0)
+ semacquire(&worldsema)
if trace.enabled {
traceGCStart()
@@ -1000,13 +1249,13 @@ func gcStart(mode gcMode, forceTrigger bool) {
gcResetMarkState()
- now := nanotime()
work.stwprocs, work.maxprocs = gcprocs(), gomaxprocs
- work.tSweepTerm = now
- work.heap0 = memstats.heap_live
+ work.heap0 = atomic.Load64(&memstats.heap_live)
work.pauseNS = 0
work.mode = mode
+ now := nanotime()
+ work.tSweepTerm = now
work.pauseStart = now
systemstack(stopTheWorldWithSema)
// Finish sweep before we start concurrent scan.
@@ -1017,6 +1266,7 @@ func gcStart(mode gcMode, forceTrigger bool) {
// reclaimed until the next GC cycle.
clearpools()
+ work.cycles++
if mode == gcBackgroundMode { // Do as much work concurrently as possible
gcController.startCycle()
work.heapGoal = memstats.next_gc
@@ -1029,18 +1279,7 @@ func gcStart(mode gcMode, forceTrigger bool) {
// the time we start the world and begin
// scanning.
//
- // It's necessary to enable write barriers
- // during the scan phase for several reasons:
- //
- // They must be enabled for writes to higher
- // stack frames before we scan stacks and
- // install stack barriers because this is how
- // we track writes to inactive stack frames.
- // (Alternatively, we could not install stack
- // barriers over frame boundaries with
- // up-pointers).
- //
- // They must be enabled before assists are
+ // Write barriers must be enabled before assists are
// enabled because they must be enabled before
// any non-leaf heap objects are marked. Since
// allocations are blocked until assists can
@@ -1079,17 +1318,11 @@ func gcStart(mode gcMode, forceTrigger bool) {
work.tMark, work.tMarkTerm = t, t
work.heapGoal = work.heap0
- if forced {
- memstats.numforcedgc++
- }
-
// Perform mark termination. This will restart the world.
- gcMarkTermination()
+ gcMarkTermination(memstats.triggerRatio)
}
- if useStartSema {
- semrelease(&work.startSema)
- }
+ semrelease(&work.startSema)
}
// gcMarkDone transitions the GC from mark 1 to mark 2 and from mark 2
@@ -1109,7 +1342,7 @@ func gcStart(mode gcMode, forceTrigger bool) {
// by mark termination.
func gcMarkDone() {
top:
- semacquire(&work.markDoneSema, 0)
+ semacquire(&work.markDoneSema)
// Re-check transition condition under transition lock.
if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
@@ -1204,14 +1437,14 @@ top:
// endCycle depends on all gcWork cache stats being
// flushed. This is ensured by mark 2.
- gcController.endCycle()
+ nextTriggerRatio := gcController.endCycle()
// Perform mark termination. This will restart the world.
- gcMarkTermination()
+ gcMarkTermination(nextTriggerRatio)
}
}
-func gcMarkTermination() {
+func gcMarkTermination(nextTriggerRatio float64) {
// World is stopped.
// Start marktermination which includes enabling the write barrier.
atomic.Store(&gcBlackenEnabled, 0)
@@ -1293,11 +1526,17 @@ func gcMarkTermination() {
throw("gc done but gcphase != _GCoff")
}
+ // Update GC trigger and pacing for the next cycle.
+ gcSetTriggerRatio(nextTriggerRatio)
+
// Update timing memstats
- now, unixNow := nanotime(), unixnanotime()
+ now := nanotime()
+ sec, nsec, _ := time_now()
+ unixNow := sec*1e9 + int64(nsec)
work.pauseNS += now - work.pauseStart
work.tEnd = now
- atomic.Store64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
+ atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
+ atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
memstats.pause_total_ns += uint64(work.pauseNS)
@@ -1315,25 +1554,36 @@ func gcMarkTermination() {
totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
- memstats.numgc++
-
// Reset sweep state.
sweep.nbgsweep = 0
sweep.npausesweep = 0
+ if work.userForced {
+ memstats.numforcedgc++
+ }
+
+ // Bump GC cycle count and wake goroutines waiting on sweep.
+ lock(&work.sweepWaiters.lock)
+ memstats.numgc++
+ injectglist(work.sweepWaiters.head.ptr())
+ work.sweepWaiters.head = 0
+ unlock(&work.sweepWaiters.lock)
+
+ // Finish the current heap profiling cycle and start a new
+ // heap profiling cycle. We do this before starting the world
+ // so events don't leak into the wrong cycle.
+ mProf_NextCycle()
+
systemstack(startTheWorldWithSema)
- // Update heap profile stats if gcSweep didn't do it. This is
- // relatively expensive, so we don't want to do it while the
- // world is stopped, but it needs to happen ASAP after
- // starting the world to prevent too many allocations from the
- // next cycle leaking in. It must happen before releasing
- // worldsema since there are applications that do a
- // runtime.GC() to update the heap profile and then
- // immediately collect the profile.
- if _ConcurrentSweep && work.mode != gcForceBlockMode {
- mProf_GC()
- }
+ // Flush the heap profile so we can start a new cycle next GC.
+ // This is relatively expensive, so we don't do it with the
+ // world stopped.
+ mProf_Flush()
+
+ // Prepare workbufs for freeing by the sweeper. We do this
+ // asynchronously because it can take non-trivial time.
+ prepareFreeWorkbufs()
// Print gctrace before dropping worldsema. As soon as we drop
// worldsema another cycle could start and smash the stats
@@ -1368,7 +1618,7 @@ func gcMarkTermination() {
work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
work.heapGoal>>20, " MB goal, ",
work.maxprocs, " P")
- if work.mode != gcBackgroundMode {
+ if work.userForced {
print(" (forced)")
}
print("\n")
@@ -1521,6 +1771,25 @@ func gcBgMarkWorker(_p_ *p) {
default:
throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
case gcMarkWorkerDedicatedMode:
+ gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
+ if gp.preempt {
+ // We were preempted. This is
+ // a useful signal to kick
+ // everything out of the run
+ // queue so it can run
+ // somewhere else.
+ lock(&sched.lock)
+ for {
+ gp, _ := runqget(_p_)
+ if gp == nil {
+ break
+ }
+ globrunqput(gp)
+ }
+ unlock(&sched.lock)
+ }
+ // Go back to draining, this time
+ // without preemption.
gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit)
case gcMarkWorkerFractionalMode:
gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
@@ -1593,7 +1862,7 @@ func gcMarkWorkAvailable(p *p) bool {
if p != nil && !p.gcw.empty() {
return true
}
- if atomic.Load64(&work.full) != 0 {
+ if !work.full.empty() {
return true // global work available
}
if work.markrootNext < work.markrootJobs {
@@ -1623,24 +1892,22 @@ func gcMark(start_time int64) {
work.ndone = 0
work.nproc = uint32(gcprocs())
- if debug.gcrescanstacks == 0 && work.full == 0 && work.nDataRoots+work.nSpanRoots+work.nStackRoots+work.nRescanRoots == 0 {
+ if work.full == 0 && work.nDataRoots+work.nSpanRoots+work.nStackRoots == 0 {
// There's no work on the work queue and no root jobs
// that can produce work, so don't bother entering the
// getfull() barrier.
//
- // With the hybrid barrier enabled, this will be the
- // situation the vast majority of the time after
- // concurrent mark. However, we still need a fallback
- // for STW GC and because there are some known races
- // that occasionally leave work around for mark
- // termination.
+ // This will be the situation the vast majority of the
+ // time after concurrent mark. However, we still need
+ // a fallback for STW GC and because there are some
+ // known races that occasionally leave work around for
+ // mark termination.
//
// We're still hedging our bets here: if we do
// accidentally produce some work, we'll still process
// it, just not necessarily in parallel.
//
- // TODO(austin): When we eliminate
- // debug.gcrescanstacks: fix the races, and remove
+ // TODO(austin): Fix the races and remove
// work draining from mark termination so we don't
// need the fallback path.
work.helperDrainBlock = false
@@ -1704,52 +1971,14 @@ func gcMark(start_time int64) {
// Update the marked heap stat.
memstats.heap_marked = work.bytesMarked
- // Trigger the next GC cycle when the allocated heap has grown
- // by triggerRatio over the marked heap size. Assume that
- // we're in steady state, so the marked heap size is the
- // same now as it was at the beginning of the GC cycle.
- memstats.gc_trigger = uint64(float64(memstats.heap_marked) * (1 + gcController.triggerRatio))
- if memstats.gc_trigger < heapminimum {
- memstats.gc_trigger = heapminimum
- }
- if int64(memstats.gc_trigger) < 0 {
- print("next_gc=", memstats.next_gc, " bytesMarked=", work.bytesMarked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "\n")
- throw("gc_trigger underflow")
- }
-
// Update other GC heap size stats. This must happen after
// cachestats (which flushes local statistics to these) and
// flushallmcaches (which modifies heap_live).
memstats.heap_live = work.bytesMarked
memstats.heap_scan = uint64(gcController.scanWork)
- minTrigger := memstats.heap_live + sweepMinHeapDistance*uint64(gcpercent)/100
- if memstats.gc_trigger < minTrigger {
- // The allocated heap is already past the trigger.
- // This can happen if the triggerRatio is very low and
- // the marked heap is less than the live heap size.
- //
- // Concurrent sweep happens in the heap growth from
- // heap_live to gc_trigger, so bump gc_trigger up to ensure
- // that concurrent sweep has some heap growth in which
- // to perform sweeping before we start the next GC
- // cycle.
- memstats.gc_trigger = minTrigger
- }
-
- // The next GC cycle should finish before the allocated heap
- // has grown by GOGC/100.
- memstats.next_gc = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
- if gcpercent < 0 {
- memstats.next_gc = ^uint64(0)
- }
- if memstats.next_gc < memstats.gc_trigger {
- memstats.next_gc = memstats.gc_trigger
- }
-
if trace.enabled {
traceHeapAlloc()
- traceNextGC()
}
}
@@ -1767,6 +1996,7 @@ func gcSweep(mode gcMode) {
// with an empty swept list.
throw("non-empty swept list")
}
+ mheap_.pagesSwept = 0
unlock(&mheap_.lock)
if !_ConcurrentSweep || mode == gcForceBlockMode {
@@ -1774,35 +2004,23 @@ func gcSweep(mode gcMode) {
// Record that no proportional sweeping has to happen.
lock(&mheap_.lock)
mheap_.sweepPagesPerByte = 0
- mheap_.pagesSwept = 0
unlock(&mheap_.lock)
// Sweep all spans eagerly.
for sweepone() != ^uintptr(0) {
sweep.npausesweep++
}
- // Do an additional mProf_GC, because all 'free' events are now real as well.
- mProf_GC()
- mProf_GC()
+ // Free workbufs eagerly.
+ prepareFreeWorkbufs()
+ for freeSomeWbufs(false) {
+ }
+ // All "free" events for this mark/sweep cycle have
+ // now happened, so we can make this profile cycle
+ // available immediately.
+ mProf_NextCycle()
+ mProf_Flush()
return
}
- // Concurrent sweep needs to sweep all of the in-use pages by
- // the time the allocated heap reaches the GC trigger. Compute
- // the ratio of in-use pages to sweep per byte allocated.
- heapDistance := int64(memstats.gc_trigger) - int64(memstats.heap_live)
- // Add a little margin so rounding errors and concurrent
- // sweep are less likely to leave pages unswept when GC starts.
- heapDistance -= 1024 * 1024
- if heapDistance < _PageSize {
- // Avoid setting the sweep ratio extremely high
- heapDistance = _PageSize
- }
- lock(&mheap_.lock)
- mheap_.sweepPagesPerByte = float64(mheap_.pagesInUse) / float64(heapDistance)
- mheap_.pagesSwept = 0
- mheap_.spanBytesAlloc = 0
- unlock(&mheap_.lock)
-
// Background sweep.
lock(&sweep.lock)
if sweep.parked {
@@ -1820,24 +2038,16 @@ func gcSweep(mode gcMode) {
func gcResetMarkState() {
// This may be called during a concurrent phase, so make sure
// allgs doesn't change.
- if !(gcphase == _GCoff || gcphase == _GCmarktermination) {
- // Accessing gcRescan is unsafe.
- throw("bad GC phase")
- }
lock(&allglock)
for _, gp := range allgs {
gp.gcscandone = false // set to true in gcphasework
gp.gcscanvalid = false // stack has not been scanned
- gp.gcRescan = -1
gp.gcAssistBytes = 0
}
unlock(&allglock)
- // Clear rescan list.
- work.rescan.list = work.rescan.list[:0]
-
work.bytesMarked = 0
- work.initialHeapLive = memstats.heap_live
+ work.initialHeapLive = atomic.Load64(&memstats.heap_live)
work.markrootDone = false
}
diff --git a/libgo/go/runtime/mgclarge.go b/libgo/go/runtime/mgclarge.go
new file mode 100644
index 0000000..757e88d
--- /dev/null
+++ b/libgo/go/runtime/mgclarge.go
@@ -0,0 +1,326 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Page heap.
+//
+// See malloc.go for the general overview.
+//
+// Large spans are the subject of this file. Spans of fewer than
+// _MaxMHeapLists pages are held in lists of like-sized spans. Larger spans
+// are held in a treap. See https://en.wikipedia.org/wiki/Treap or
+// http://faculty.washington.edu/aragon/pubs/rst89.pdf for an overview.
+// sema.go also holds an implementation of a treap.
+//
+// Each treapNode holds a single span. The treap is sorted primarily by page
+// size and, for spans of the same size, secondarily by start address.
+// Spans are returned with a best-fit algorithm; among spans of the same
+// size, the one at the lowest address is selected.
+//
+// The primary routines are
+// insert: adds a span to the treap
+// remove: removes the span from the treap that best fits the required size
+// removeSpan: removes a specific span from the treap
+//
+// _mheap.lock must be held when manipulating this data structure.
+
+package runtime
+
+import (
+ "unsafe"
+)
+
+//go:notinheap
+type mTreap struct {
+ treap *treapNode
+}
+
+//go:notinheap
+type treapNode struct {
+ right *treapNode // all treapNodes > this treap node
+ left *treapNode // all treapNodes < this treap node
+ parent *treapNode // direct parent of this node, nil if root
+ npagesKey uintptr // number of pages in spanKey, used as primary sort key
+ spanKey *mspan // span of size npagesKey, used as secondary sort key
+ priority uint32 // random number used by treap algorithm to keep tree probabilistically balanced
+}
+
+func (t *treapNode) init() {
+ t.right = nil
+ t.left = nil
+ t.parent = nil
+ t.spanKey = nil
+ t.npagesKey = 0
+ t.priority = 0
+}
+
+// isSpanInTreap is handy for debugging. One should hold the heap lock,
+// mheap_.lock, when calling it.
+func (t *treapNode) isSpanInTreap(s *mspan) bool {
+ if t == nil {
+ return false
+ }
+ return t.spanKey == s || t.left.isSpanInTreap(s) || t.right.isSpanInTreap(s)
+}
+
+// walkTreap is handy for debugging.
+// Starting at some treapNode t, for example the root, do a depth-first preorder walk of
+// the tree, executing fn at each treap node. One should hold the heap lock,
+// mheap_.lock, when calling it.
+func (t *treapNode) walkTreap(fn func(tn *treapNode)) {
+ if t == nil {
+ return
+ }
+ fn(t)
+ t.left.walkTreap(fn)
+ t.right.walkTreap(fn)
+}
+
+// checkTreapNode when used in conjunction with walkTreap can usually detect a
+// poorly formed treap.
+func checkTreapNode(t *treapNode) {
+ // lessThan is used to order the treap.
+ // npagesKey and npages are the primary keys.
+ // spanKey and span are the secondary keys.
+ // span == nil (0) will always be lessThan all
+ // spans of the same size.
+ lessThan := func(npages uintptr, s *mspan) bool {
+ if t.npagesKey != npages {
+ return t.npagesKey < npages
+ }
+ // t.npagesKey == npages
+ return uintptr(unsafe.Pointer(t.spanKey)) < uintptr(unsafe.Pointer(s))
+ }
+
+ if t == nil {
+ return
+ }
+ if t.spanKey.npages != t.npagesKey || t.spanKey.next != nil {
+ println("runtime: checkTreapNode treapNode t=", t, " t.npagesKey=", t.npagesKey,
+ "t.spanKey.npages=", t.spanKey.npages)
+ throw("why does span.npages and treap.ngagesKey do not match?")
+ }
+ if t.left != nil && lessThan(t.left.npagesKey, t.left.spanKey) {
+ throw("t.lessThan(t.left.npagesKey, t.left.spanKey) is not false")
+ }
+ if t.right != nil && !lessThan(t.right.npagesKey, t.right.spanKey) {
+ throw("!t.lessThan(t.left.npagesKey, t.left.spanKey) is not false")
+ }
+}
+
+// insert adds span to the large span treap.
+func (root *mTreap) insert(span *mspan) {
+ npages := span.npages
+ var last *treapNode
+ pt := &root.treap
+ for t := *pt; t != nil; t = *pt {
+ last = t
+ if t.npagesKey < npages {
+ pt = &t.right
+ } else if t.npagesKey > npages {
+ pt = &t.left
+ } else if uintptr(unsafe.Pointer(t.spanKey)) < uintptr(unsafe.Pointer(span)) {
+ // t.npagesKey == npages, so sort on span addresses.
+ pt = &t.right
+ } else if uintptr(unsafe.Pointer(t.spanKey)) > uintptr(unsafe.Pointer(span)) {
+ pt = &t.left
+ } else {
+ throw("inserting span already in treap")
+ }
+ }
+
+ // Add t as new leaf in tree of span size and unique addrs.
+ // The balanced tree is a treap using priority as the random heap priority.
+ // That is, it is a binary tree ordered according to the npagesKey,
+ // but then among the space of possible binary trees respecting those
+ // npagesKeys, it is kept balanced on average by maintaining a heap ordering
+ // on the priority: s.priority <= both s.left.priority and s.right.priority.
+ // https://en.wikipedia.org/wiki/Treap
+ // http://faculty.washington.edu/aragon/pubs/rst89.pdf
+
+ t := (*treapNode)(mheap_.treapalloc.alloc())
+ t.init()
+ t.npagesKey = span.npages
+ t.priority = fastrand()
+ t.spanKey = span
+ t.parent = last
+ *pt = t // t now at a leaf.
+ // Rotate up into tree according to priority.
+ for t.parent != nil && t.parent.priority > t.priority {
+ if t != nil && t.spanKey.npages != t.npagesKey {
+ println("runtime: insert t=", t, "t.npagesKey=", t.npagesKey)
+ println("runtime: t.spanKey=", t.spanKey, "t.spanKey.npages=", t.spanKey.npages)
+ throw("span and treap sizes do not match?")
+ }
+ if t.parent.left == t {
+ root.rotateRight(t.parent)
+ } else {
+ if t.parent.right != t {
+ throw("treap insert finds a broken treap")
+ }
+ root.rotateLeft(t.parent)
+ }
+ }
+}
+
+func (root *mTreap) removeNode(t *treapNode) *mspan {
+ if t.spanKey.npages != t.npagesKey {
+ throw("span and treap node npages do not match")
+ }
+ result := t.spanKey
+
+ // Rotate t down to be leaf of tree for removal, respecting priorities.
+ for t.right != nil || t.left != nil {
+ if t.right == nil || t.left != nil && t.left.priority < t.right.priority {
+ root.rotateRight(t)
+ } else {
+ root.rotateLeft(t)
+ }
+ }
+ // Remove t, now a leaf.
+ if t.parent != nil {
+ if t.parent.left == t {
+ t.parent.left = nil
+ } else {
+ t.parent.right = nil
+ }
+ } else {
+ root.treap = nil
+ }
+ // Return the found treapNode's span after freeing the treapNode.
+ t.spanKey = nil
+ t.npagesKey = 0
+ mheap_.treapalloc.free(unsafe.Pointer(t))
+ return result
+}
+
+// remove searches for, finds, removes from the treap, and returns the smallest
+// span that can hold npages. If no span has at least npages it returns nil.
+// This is slightly more complicated than a simple binary tree search
+// since if an exact match is not found the next larger node is
+// returned.
+// The last node inspected whose npagesKey >= npages and whose left child
+// cannot also hold npages is the "best fit" node.
+func (root *mTreap) remove(npages uintptr) *mspan {
+ t := root.treap
+ for t != nil {
+ if t.spanKey == nil {
+ throw("treap node with nil spanKey found")
+ }
+ if t.npagesKey < npages {
+ t = t.right
+ } else if t.left != nil && t.left.npagesKey >= npages {
+ t = t.left
+ } else {
+ result := t.spanKey
+ root.removeNode(t)
+ return result
+ }
+ }
+ return nil
+}
+
+// removeSpan searches for, finds, and deletes span along with
+// the associated treap node. If the span is not in the treap,
+// t eventually becomes nil and the t.spanKey dereference
+// below will fault.
+func (root *mTreap) removeSpan(span *mspan) {
+ npages := span.npages
+ t := root.treap
+ for t.spanKey != span {
+ if t.npagesKey < npages {
+ t = t.right
+ } else if t.npagesKey > npages {
+ t = t.left
+ } else if uintptr(unsafe.Pointer(t.spanKey)) < uintptr(unsafe.Pointer(span)) {
+ t = t.right
+ } else if uintptr(unsafe.Pointer(t.spanKey)) > uintptr(unsafe.Pointer(span)) {
+ t = t.left
+ }
+ }
+ root.removeNode(t)
+}
+
+// scavengetreap visits each node in the treap and scavenges the
+// treapNode's span.
+func scavengetreap(treap *treapNode, now, limit uint64) uintptr {
+ if treap == nil {
+ return 0
+ }
+ return scavengeTreapNode(treap, now, limit) +
+ scavengetreap(treap.left, now, limit) +
+ scavengetreap(treap.right, now, limit)
+}
+
+// rotateLeft rotates the tree rooted at node x,
+// turning (x a (y b c)) into (y (x a b) c).
+func (root *mTreap) rotateLeft(x *treapNode) {
+ // p -> (x a (y b c))
+ p := x.parent
+ a, y := x.left, x.right
+ b, c := y.left, y.right
+
+ y.left = x
+ x.parent = y
+ y.right = c
+ if c != nil {
+ c.parent = y
+ }
+ x.left = a
+ if a != nil {
+ a.parent = x
+ }
+ x.right = b
+ if b != nil {
+ b.parent = x
+ }
+
+ y.parent = p
+ if p == nil {
+ root.treap = y
+ } else if p.left == x {
+ p.left = y
+ } else {
+ if p.right != x {
+ throw("large span treap rotateLeft")
+ }
+ p.right = y
+ }
+}
+
+// rotateRight rotates the tree rooted at node y,
+// turning (y (x a b) c) into (x a (y b c)).
+func (root *mTreap) rotateRight(y *treapNode) {
+ // p -> (y (x a b) c)
+ p := y.parent
+ x, c := y.left, y.right
+ a, b := x.left, x.right
+
+ x.left = a
+ if a != nil {
+ a.parent = x
+ }
+ x.right = y
+ y.parent = x
+ y.left = b
+ if b != nil {
+ b.parent = y
+ }
+ y.right = c
+ if c != nil {
+ c.parent = y
+ }
+
+ x.parent = p
+ if p == nil {
+ root.treap = x
+ } else if p.left == y {
+ p.left = x
+ } else {
+ if p.right != y {
+ throw("large span treap rotateRight")
+ }
+ p.right = x
+ }
+}
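A minimal standalone sketch of the best-fit descent that mTreap.remove performs above, written against a plain node type so it compiles outside the runtime. The node type and the page counts in main are invented for illustration; only the descent rule (go right when the current node is too small, prefer a left child that can also satisfy the request, otherwise stop) is taken from the patch.

package main

import "fmt"

type node struct {
	npages      uintptr
	left, right *node
}

// bestFit returns the smallest node whose npages >= want, or nil.
func bestFit(t *node, want uintptr) *node {
	for t != nil {
		switch {
		case t.npages < want:
			t = t.right
		case t.left != nil && t.left.npages >= want:
			t = t.left
		default:
			return t
		}
	}
	return nil
}

func main() {
	//        8
	//      /   \
	//     4     16
	root := &node{npages: 8,
		left:  &node{npages: 4},
		right: &node{npages: 16},
	}
	fmt.Println(bestFit(root, 5).npages) // 8
	fmt.Println(bestFit(root, 9).npages) // 16
	fmt.Println(bestFit(root, 17))       // <nil>
}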
diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go
index 93252ba..998a830 100644
--- a/libgo/go/runtime/mgcmark.go
+++ b/libgo/go/runtime/mgcmark.go
@@ -93,21 +93,24 @@ func gcMarkRootPrepare() {
// termination, allglen isn't changing, so we'll scan
// all Gs.
work.nStackRoots = int(atomic.Loaduintptr(&allglen))
- work.nRescanRoots = 0
} else {
// We've already scanned span roots and kept the scan
// up-to-date during concurrent mark.
work.nSpanRoots = 0
- // On the second pass of markroot, we're just scanning
- // dirty stacks. It's safe to access rescan since the
- // world is stopped.
+ // The hybrid barrier ensures that stacks can't
+ // contain pointers to unmarked objects, so on the
+ // second markroot, there's no need to scan stacks.
work.nStackRoots = 0
- work.nRescanRoots = len(work.rescan.list)
+
+ if debug.gcrescanstacks > 0 {
+ // Scan stacks anyway for debugging.
+ work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+ }
}
work.markrootNext = 0
- work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
+ work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nSpanRoots + work.nStackRoots)
}
// gcMarkRootCheck checks that all roots have been scanned. It is
@@ -165,8 +168,7 @@ func markroot(gcw *gcWork, i uint32) {
baseData := baseFlushCache + uint32(work.nFlushCacheRoots)
baseSpans := baseData + uint32(work.nDataRoots)
baseStacks := baseSpans + uint32(work.nSpanRoots)
- baseRescan := baseStacks + uint32(work.nStackRoots)
- end := baseRescan + uint32(work.nRescanRoots)
+ end := baseStacks + uint32(work.nStackRoots)
// Note: if you add a case here, please also update heapdump.go:dumproots.
switch {
@@ -186,6 +188,11 @@ func markroot(gcw *gcWork, i uint32) {
}
case i == fixedRootFinalizers:
+ // Only do this once per GC cycle since we don't call
+ // queuefinalizer during marking.
+ if work.markrootDone {
+ break
+ }
for fb := allfin; fb != nil; fb = fb.alllink {
cnt := uintptr(atomic.Load(&fb.cnt))
scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), cnt*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw)
@@ -201,15 +208,8 @@ func markroot(gcw *gcWork, i uint32) {
default:
// the rest is scanning goroutine stacks
var gp *g
- if baseStacks <= i && i < baseRescan {
+ if baseStacks <= i && i < end {
gp = allgs[i-baseStacks]
- } else if baseRescan <= i && i < end {
- gp = work.rescan.list[i-baseRescan].ptr()
- if gp.gcRescan != int32(i-baseRescan) {
- // Looking for issue #17099.
- println("runtime: gp", gp, "found at rescan index", i-baseRescan, "but should be at", gp.gcRescan)
- throw("bad g rescan index")
- }
} else {
throw("markroot: bad index")
}
@@ -352,6 +352,7 @@ func gcAssistAlloc(gp *g) {
return
}
+ traced := false
retry:
// Compute the amount of scan work we need to do to make the
// balance positive. When the required amount of work is low,
@@ -387,10 +388,18 @@ retry:
if scanWork == 0 {
// We were able to steal all of the credit we
// needed.
+ if traced {
+ traceGCMarkAssistDone()
+ }
return
}
}
+ if trace.enabled && !traced {
+ traced = true
+ traceGCMarkAssistStart()
+ }
+
// Perform assist work
systemstack(func() {
gcAssistAlloc1(gp, scanWork)
@@ -433,6 +442,9 @@ retry:
// At this point either background GC has satisfied
// this G's assist debt, or the GC cycle is over.
}
+ if traced {
+ traceGCMarkAssistDone()
+ }
}
// gcAssistAlloc1 is the part of gcAssistAlloc that runs on the system
@@ -656,10 +668,6 @@ func doscanstack(*g, *gcWork)
// scanstack scans gp's stack, greying all pointers found on the stack.
//
-// During mark phase, it also installs stack barriers while traversing
-// gp's stack. During mark termination, it stops scanning when it
-// reaches an unhit stack barrier.
-//
// scanstack is marked go:systemstack because it must not be preempted
// while using a workbuf.
//
@@ -699,84 +707,9 @@ func scanstack(gp *g, gcw *gcWork) {
scanstackblock(uintptr(unsafe.Pointer(&gp.gcregs)), unsafe.Sizeof(gp.gcregs), gcw)
scanstackblock(uintptr(unsafe.Pointer(&gp.context)), unsafe.Sizeof(gp.context), gcw)
- if gcphase == _GCmark {
- // gp may have added itself to the rescan list between
- // when GC started and now. It's clean now, so remove
- // it. This isn't safe during mark termination because
- // mark termination is consuming this list, but it's
- // also not necessary.
- dequeueRescan(gp)
- }
gp.gcscanvalid = true
}
-// queueRescan adds gp to the stack rescan list and clears
-// gp.gcscanvalid. The caller must own gp and ensure that gp isn't
-// already on the rescan list.
-func queueRescan(gp *g) {
- if debug.gcrescanstacks == 0 {
- // Clear gcscanvalid to keep assertions happy.
- //
- // TODO: Remove gcscanvalid entirely when we remove
- // stack rescanning.
- gp.gcscanvalid = false
- return
- }
-
- if gcphase == _GCoff {
- gp.gcscanvalid = false
- return
- }
- if gp.gcRescan != -1 {
- throw("g already on rescan list")
- }
-
- lock(&work.rescan.lock)
- gp.gcscanvalid = false
-
- // Recheck gcphase under the lock in case there was a phase change.
- if gcphase == _GCoff {
- unlock(&work.rescan.lock)
- return
- }
- if len(work.rescan.list) == cap(work.rescan.list) {
- throw("rescan list overflow")
- }
- n := len(work.rescan.list)
- gp.gcRescan = int32(n)
- work.rescan.list = work.rescan.list[:n+1]
- work.rescan.list[n].set(gp)
- unlock(&work.rescan.lock)
-}
-
-// dequeueRescan removes gp from the stack rescan list, if gp is on
-// the rescan list. The caller must own gp.
-func dequeueRescan(gp *g) {
- if debug.gcrescanstacks == 0 {
- return
- }
-
- if gp.gcRescan == -1 {
- return
- }
- if gcphase == _GCoff {
- gp.gcRescan = -1
- return
- }
-
- lock(&work.rescan.lock)
- if work.rescan.list[gp.gcRescan].ptr() != gp {
- throw("bad dequeueRescan")
- }
- // Careful: gp may itself be the last G on the list.
- last := work.rescan.list[len(work.rescan.list)-1]
- work.rescan.list[gp.gcRescan] = last
- last.ptr().gcRescan = gp.gcRescan
- gp.gcRescan = -1
- work.rescan.list = work.rescan.list[:len(work.rescan.list)-1]
- unlock(&work.rescan.lock)
-}
-
type gcDrainFlags int
const (
@@ -1052,7 +985,7 @@ func scanobject(b uintptr, gcw *gcWork) {
// paths), in which case we must *not* enqueue
// oblets since their bitmaps will be
// uninitialized.
- if !hbits.hasPointers(n) {
+ if s.spanclass.noscan() {
// Bypass the whole scan.
gcw.bytesMarked += uint64(n)
return
@@ -1185,6 +1118,7 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork
// Dump the object
gcDumpObject("obj", obj, ^uintptr(0))
+ getg().m.traceback = 2
throw("checkmark found unmarked object")
}
if hbits.isCheckmarked(span.elemsize) {
@@ -1206,6 +1140,7 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork
print("runtime: marking free object ", hex(obj), " found at *(", hex(base), "+", hex(off), ")\n")
gcDumpObject("base", base, off)
gcDumpObject("obj", obj, ^uintptr(0))
+ getg().m.traceback = 2
throw("marking free object")
}
@@ -1217,7 +1152,7 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork
atomic.Or8(mbits.bytep, mbits.mask)
// If this is a noscan object, fast-track it to black
// instead of greying it.
- if !hbits.hasPointers(span.elemsize) {
+ if span.spanclass.noscan() {
gcw.bytesMarked += uint64(span.elemsize)
return
}
@@ -1250,7 +1185,7 @@ func gcDumpObject(label string, obj, off uintptr) {
print(" s=nil\n")
return
}
- print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, " s.state=")
+ print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.spanclass=", s.spanclass, " s.elemsize=", s.elemsize, " s.state=")
if 0 <= s.state && int(s.state) < len(mSpanStateNames) {
print(mSpanStateNames[s.state], "\n")
} else {
@@ -1259,7 +1194,7 @@ func gcDumpObject(label string, obj, off uintptr) {
skipped := false
size := s.elemsize
- if s.state == _MSpanStack && size == 0 {
+ if s.state == _MSpanManual && size == 0 {
// We're printing something from a stack frame. We
// don't know how big it is, so just show up to an
// including off.
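To make the new flat numbering of markroot jobs concrete (fixed roots, then flush-cache, data, span, and stack roots, with the rescan range removed), here is a hedged standalone sketch. The fixedRootCount value and the root counts passed in main are made up; only the base/end arithmetic mirrors gcMarkRootPrepare and markroot above.

package main

import "fmt"

const fixedRootCount = 2 // illustrative; the runtime defines its own value

func classify(i, nFlushCache, nData, nSpan, nStack uint32) string {
	baseFlushCache := uint32(fixedRootCount)
	baseData := baseFlushCache + nFlushCache
	baseSpans := baseData + nData
	baseStacks := baseSpans + nSpan
	end := baseStacks + nStack
	switch {
	case i < baseFlushCache:
		return "fixed root"
	case i < baseData:
		return "flush mcache"
	case i < baseSpans:
		return "data root"
	case i < baseStacks:
		return "span root"
	case i < end:
		return fmt.Sprintf("stack of goroutine %d", i-baseStacks)
	default:
		return "bad index"
	}
}

func main() {
	// 2 fixed + 1 flush + 2 data + 3 span + 4 stack roots = 12 jobs.
	for i := uint32(0); i < 13; i++ {
		fmt.Println(i, classify(i, 1, 2, 3, 4))
	}
}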
diff --git a/libgo/go/runtime/mgcsweep.go b/libgo/go/runtime/mgcsweep.go
index 2b698bf..c60214c 100644
--- a/libgo/go/runtime/mgcsweep.go
+++ b/libgo/go/runtime/mgcsweep.go
@@ -22,10 +22,6 @@ type sweepdata struct {
nbgsweep uint32
npausesweep uint32
-
- // pacertracegen is the sweepgen at which the last pacer trace
- // "sweep finished" message was printed.
- pacertracegen uint32
}
// finishsweep_m ensures that all spans are swept.
@@ -62,6 +58,9 @@ func bgsweep(c chan int) {
sweep.nbgsweep++
Gosched()
}
+ for freeSomeWbufs(true) {
+ Gosched()
+ }
lock(&sweep.lock)
if !gosweepdone() {
// This can happen if a GC runs between
@@ -80,20 +79,24 @@ func bgsweep(c chan int) {
//go:nowritebarrier
func sweepone() uintptr {
_g_ := getg()
+ sweepRatio := mheap_.sweepPagesPerByte // For debugging
// increment locks to ensure that the goroutine is not preempted
// in the middle of sweep thus leaving the span in an inconsistent state for next GC
_g_.m.locks++
+ if atomic.Load(&mheap_.sweepdone) != 0 {
+ _g_.m.locks--
+ return ^uintptr(0)
+ }
+ atomic.Xadd(&mheap_.sweepers, +1)
+
+ npages := ^uintptr(0)
sg := mheap_.sweepgen
for {
s := mheap_.sweepSpans[1-sg/2%2].pop()
if s == nil {
- mheap_.sweepdone = 1
- _g_.m.locks--
- if debug.gcpacertrace > 0 && atomic.Cas(&sweep.pacertracegen, sg-2, sg) {
- print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", mheap_.spanBytesAlloc>>20, "MB of spans; swept ", mheap_.pagesSwept, " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
- }
- return ^uintptr(0)
+ atomic.Store(&mheap_.sweepdone, 1)
+ break
}
if s.state != mSpanInUse {
// This can happen if direct sweeping already
@@ -108,16 +111,25 @@ func sweepone() uintptr {
if s.sweepgen != sg-2 || !atomic.Cas(&s.sweepgen, sg-2, sg-1) {
continue
}
- npages := s.npages
+ npages = s.npages
if !s.sweep(false) {
// Span is still in-use, so this returned no
// pages to the heap and the span needs to
// move to the swept in-use list.
npages = 0
}
- _g_.m.locks--
- return npages
+ break
+ }
+
+ // Decrement the number of active sweepers and, if this is the
+ // last one, print trace information.
+ if atomic.Xadd(&mheap_.sweepers, -1) == 0 && atomic.Load(&mheap_.sweepdone) != 0 {
+ if debug.gcpacertrace > 0 {
+ print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", (memstats.heap_live-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept, " pages at ", sweepRatio, " pages/byte\n")
+ }
}
+ _g_.m.locks--
+ return npages
}
//go:nowritebarrier
@@ -180,15 +192,14 @@ func (s *mspan) sweep(preserve bool) bool {
}
if trace.enabled {
- traceGCSweepStart()
+ traceGCSweepSpan(s.npages * _PageSize)
}
atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
- cl := s.sizeclass
+ spc := s.spanclass
size := s.elemsize
res := false
- nfree := 0
c := _g_.m.mcache
freeToHeap := false
@@ -278,12 +289,11 @@ func (s *mspan) sweep(preserve bool) bool {
}
// Count the number of free objects in this span.
- nfree = s.countFree()
- if cl == 0 && nfree != 0 {
+ nalloc := uint16(s.countAlloc())
+ if spc.sizeclass() == 0 && nalloc == 0 {
s.needzero = 1
freeToHeap = true
}
- nalloc := uint16(s.nelems) - uint16(nfree)
nfreed := s.allocCount - nalloc
// This test is not reliable with gccgo, because of
@@ -300,13 +310,16 @@ func (s *mspan) sweep(preserve bool) bool {
// unnecessarily, but provided the pointer is not really live
// it is not otherwise a problem. So we disable the test for gccgo.
if false && nalloc > s.allocCount {
- print("runtime: nelems=", s.nelems, " nfree=", nfree, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
+ print("runtime: nelems=", s.nelems, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
throw("sweep increased allocation count")
}
s.allocCount = nalloc
wasempty := s.nextFreeIndex() == s.nelems
s.freeindex = 0 // reset allocation index to start of span.
+ if trace.enabled {
+ getg().m.p.ptr().traceReclaimed += uintptr(nfreed) * s.elemsize
+ }
// gcmarkBits becomes the allocBits.
// get a fresh cleared gcmarkBits in preparation for next GC
@@ -334,9 +347,9 @@ func (s *mspan) sweep(preserve bool) bool {
atomic.Store(&s.sweepgen, sweepgen)
}
- if nfreed > 0 && cl != 0 {
- c.local_nsmallfree[cl] += uintptr(nfreed)
- res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty)
+ if nfreed > 0 && spc.sizeclass() != 0 {
+ c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
+ res = mheap_.central[spc].mcentral.freeSpan(s, preserve, wasempty)
// MCentral_FreeSpan updates sweepgen
} else if freeToHeap {
// Free large span to heap
@@ -370,9 +383,6 @@ func (s *mspan) sweep(preserve bool) bool {
// it on the swept in-use list.
mheap_.sweepSpans[sweepgen/2%2].push(s)
}
- if trace.enabled {
- traceGCSweepDone()
- }
return res
}
@@ -385,8 +395,7 @@ func (s *mspan) sweep(preserve bool) bool {
//
// deductSweepCredit makes a worst-case assumption that all spanBytes
// bytes of the ultimately allocated span will be available for object
-// allocation. The caller should call reimburseSweepCredit if that
-// turns out not to be the case once the span is allocated.
+// allocation.
//
// deductSweepCredit is the core of the "proportional sweep" system.
// It uses statistics gathered by the garbage collector to perform
@@ -400,31 +409,28 @@ func deductSweepCredit(spanBytes uintptr, callerSweepPages uintptr) {
return
}
- // Account for this span allocation.
- spanBytesAlloc := atomic.Xadd64(&mheap_.spanBytesAlloc, int64(spanBytes))
+ if trace.enabled {
+ traceGCSweepStart()
+ }
+
+retry:
+ sweptBasis := atomic.Load64(&mheap_.pagesSweptBasis)
// Fix debt if necessary.
- pagesOwed := int64(mheap_.sweepPagesPerByte * float64(spanBytesAlloc))
- for pagesOwed-int64(atomic.Load64(&mheap_.pagesSwept)) > int64(callerSweepPages) {
+ newHeapLive := uintptr(atomic.Load64(&memstats.heap_live)-mheap_.sweepHeapLiveBasis) + spanBytes
+ pagesTarget := int64(mheap_.sweepPagesPerByte*float64(newHeapLive)) - int64(callerSweepPages)
+ for pagesTarget > int64(atomic.Load64(&mheap_.pagesSwept)-sweptBasis) {
if gosweepone() == ^uintptr(0) {
mheap_.sweepPagesPerByte = 0
break
}
+ if atomic.Load64(&mheap_.pagesSweptBasis) != sweptBasis {
+ // Sweep pacing changed. Recompute debt.
+ goto retry
+ }
}
-}
-// reimburseSweepCredit records that unusableBytes bytes of a
-// just-allocated span are not available for object allocation. This
-// offsets the worst-case charge performed by deductSweepCredit.
-func reimburseSweepCredit(unusableBytes uintptr) {
- if mheap_.sweepPagesPerByte == 0 {
- // Nobody cares about the credit. Avoid the atomic.
- return
- }
- nval := atomic.Xadd64(&mheap_.spanBytesAlloc, -int64(unusableBytes))
- if int64(nval) < 0 {
- // Debugging for #18043.
- print("runtime: bad spanBytesAlloc=", nval, " (was ", nval+uint64(unusableBytes), ") unusableBytes=", unusableBytes, " sweepPagesPerByte=", mheap_.sweepPagesPerByte, "\n")
- throw("spanBytesAlloc underflow")
+ if trace.enabled {
+ traceGCSweepDone()
}
}
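The reworked deductSweepCredit compares a pacing target against sweep progress measured from a basis point rather than from zero. The following standalone sketch reproduces that arithmetic with invented numbers (heap sizes, ratio, page counts); the parameter names only echo the fields in the patch and are not the runtime's actual state.

package main

import "fmt"

// pagesOwed reports how many more pages must be swept before an allocation of
// spanBytes may proceed, given the current pacing state.
func pagesOwed(heapLive, heapLiveBasis, spanBytes uint64,
	pagesSwept, pagesSweptBasis uint64,
	sweepPagesPerByte float64, callerSweepPages int64) int64 {

	newHeapLive := heapLive - heapLiveBasis + spanBytes
	pagesTarget := int64(sweepPagesPerByte*float64(newHeapLive)) - callerSweepPages
	sweptSinceBasis := int64(pagesSwept - pagesSweptBasis)
	if owed := pagesTarget - sweptSinceBasis; owed > 0 {
		return owed
	}
	return 0
}

func main() {
	// 1 MiB allocated since the basis, pacing at 0.001 pages/byte,
	// caller will sweep 2 pages itself, 500 pages already swept.
	fmt.Println(pagesOwed(
		64<<20, 63<<20, 1<<20, // heap_live, basis, spanBytes
		500, 0, // pagesSwept, pagesSweptBasis
		0.001, 2, // sweepPagesPerByte, callerSweepPages
	)) // (2 MiB)*0.001 - 2 - 500 = 1595 pages still owed
}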
diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go
index 5eb05a7..461679b 100644
--- a/libgo/go/runtime/mgcwork.go
+++ b/libgo/go/runtime/mgcwork.go
@@ -12,8 +12,22 @@ import (
const (
_WorkbufSize = 2048 // in bytes; larger values result in less contention
+
+ // workbufAlloc is the number of bytes to allocate at a time
+ // for new workbufs. This must be a multiple of pageSize and
+ // should be a multiple of _WorkbufSize.
+ //
+ // Larger values reduce workbuf allocation overhead. Smaller
+ // values reduce heap fragmentation.
+ workbufAlloc = 32 << 10
)
+func init() {
+ if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 {
+ throw("bad workbufAlloc")
+ }
+}
+
// Garbage collector work pool abstraction.
//
// This implements a producer/consumer model for pointers to grey
@@ -25,21 +39,6 @@ const (
// grey objects, thus blackening them, and then scans them,
// potentially producing new pointers to grey objects.
-// A wbufptr holds a workbuf*, but protects it from write barriers.
-// workbufs never live on the heap, so write barriers are unnecessary.
-// Write barriers on workbuf pointers may also be dangerous in the GC.
-//
-// TODO: Since workbuf is now go:notinheap, this isn't necessary.
-type wbufptr uintptr
-
-func wbufptrOf(w *workbuf) wbufptr {
- return wbufptr(unsafe.Pointer(w))
-}
-
-func (wp wbufptr) ptr() *workbuf {
- return (*workbuf)(unsafe.Pointer(wp))
-}
-
// A gcWork provides the interface to produce and consume work for the
// garbage collector.
//
@@ -75,7 +74,7 @@ type gcWork struct {
// next.
//
// Invariant: Both wbuf1 and wbuf2 are nil or neither are.
- wbuf1, wbuf2 wbufptr
+ wbuf1, wbuf2 *workbuf
// Bytes marked (blackened) on this gcWork. This is aggregated
// into work.bytesMarked by dispose.
@@ -87,12 +86,12 @@ type gcWork struct {
}
func (w *gcWork) init() {
- w.wbuf1 = wbufptrOf(getempty())
+ w.wbuf1 = getempty()
wbuf2 := trygetfull()
if wbuf2 == nil {
wbuf2 = getempty()
}
- w.wbuf2 = wbufptrOf(wbuf2)
+ w.wbuf2 = wbuf2
}
// put enqueues a pointer for the garbage collector to trace.
@@ -100,18 +99,18 @@ func (w *gcWork) init() {
//go:nowritebarrier
func (w *gcWork) put(obj uintptr) {
flushed := false
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
w.init()
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
// wbuf is empty at this point.
} else if wbuf.nobj == len(wbuf.obj) {
w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
if wbuf.nobj == len(wbuf.obj) {
putfull(wbuf)
wbuf = getempty()
- w.wbuf1 = wbufptrOf(wbuf)
+ w.wbuf1 = wbuf
flushed = true
}
}
@@ -132,7 +131,7 @@ func (w *gcWork) put(obj uintptr) {
// otherwise it returns false and the caller needs to call put.
//go:nowritebarrier
func (w *gcWork) putFast(obj uintptr) bool {
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
return false
} else if wbuf.nobj == len(wbuf.obj) {
@@ -151,15 +150,15 @@ func (w *gcWork) putFast(obj uintptr) bool {
// other gcWork instances or other caches.
//go:nowritebarrier
func (w *gcWork) tryGet() uintptr {
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
w.init()
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
// wbuf is empty at this point.
}
if wbuf.nobj == 0 {
w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
if wbuf.nobj == 0 {
owbuf := wbuf
wbuf = trygetfull()
@@ -167,7 +166,7 @@ func (w *gcWork) tryGet() uintptr {
return 0
}
putempty(owbuf)
- w.wbuf1 = wbufptrOf(wbuf)
+ w.wbuf1 = wbuf
}
}
@@ -180,7 +179,7 @@ func (w *gcWork) tryGet() uintptr {
// the caller is expected to call tryGet().
//go:nowritebarrier
func (w *gcWork) tryGetFast() uintptr {
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
return 0
}
@@ -197,15 +196,15 @@ func (w *gcWork) tryGetFast() uintptr {
// been retrieved. get returns 0 if there are no pointers remaining.
//go:nowritebarrier
func (w *gcWork) get() uintptr {
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
w.init()
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
// wbuf is empty at this point.
}
if wbuf.nobj == 0 {
w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
if wbuf.nobj == 0 {
owbuf := wbuf
wbuf = getfull()
@@ -213,7 +212,7 @@ func (w *gcWork) get() uintptr {
return 0
}
putempty(owbuf)
- w.wbuf1 = wbufptrOf(wbuf)
+ w.wbuf1 = wbuf
}
}
@@ -231,21 +230,21 @@ func (w *gcWork) get() uintptr {
//
//go:nowritebarrier
func (w *gcWork) dispose() {
- if wbuf := w.wbuf1.ptr(); wbuf != nil {
+ if wbuf := w.wbuf1; wbuf != nil {
if wbuf.nobj == 0 {
putempty(wbuf)
} else {
putfull(wbuf)
}
- w.wbuf1 = 0
+ w.wbuf1 = nil
- wbuf = w.wbuf2.ptr()
+ wbuf = w.wbuf2
if wbuf.nobj == 0 {
putempty(wbuf)
} else {
putfull(wbuf)
}
- w.wbuf2 = 0
+ w.wbuf2 = nil
}
if w.bytesMarked != 0 {
// dispose happens relatively infrequently. If this
@@ -265,14 +264,14 @@ func (w *gcWork) dispose() {
// global queue.
//go:nowritebarrier
func (w *gcWork) balance() {
- if w.wbuf1 == 0 {
+ if w.wbuf1 == nil {
return
}
- if wbuf := w.wbuf2.ptr(); wbuf.nobj != 0 {
+ if wbuf := w.wbuf2; wbuf.nobj != 0 {
putfull(wbuf)
- w.wbuf2 = wbufptrOf(getempty())
- } else if wbuf := w.wbuf1.ptr(); wbuf.nobj > 4 {
- w.wbuf1 = wbufptrOf(handoff(wbuf))
+ w.wbuf2 = getempty()
+ } else if wbuf := w.wbuf1; wbuf.nobj > 4 {
+ w.wbuf1 = handoff(wbuf)
} else {
return
}
@@ -285,7 +284,7 @@ func (w *gcWork) balance() {
// empty returns true if w has no mark work available.
//go:nowritebarrier
func (w *gcWork) empty() bool {
- return w.wbuf1 == 0 || (w.wbuf1.ptr().nobj == 0 && w.wbuf2.ptr().nobj == 0)
+ return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0)
}
// Internally, the GC work pool is kept in arrays in work buffers.
@@ -327,23 +326,56 @@ func (b *workbuf) checkempty() {
func getempty() *workbuf {
var b *workbuf
if work.empty != 0 {
- b = (*workbuf)(lfstackpop(&work.empty))
+ b = (*workbuf)(work.empty.pop())
if b != nil {
b.checkempty()
}
}
if b == nil {
- b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), sys.CacheLineSize, &memstats.gc_sys))
+ // Allocate more workbufs.
+ var s *mspan
+ if work.wbufSpans.free.first != nil {
+ lock(&work.wbufSpans.lock)
+ s = work.wbufSpans.free.first
+ if s != nil {
+ work.wbufSpans.free.remove(s)
+ work.wbufSpans.busy.insert(s)
+ }
+ unlock(&work.wbufSpans.lock)
+ }
+ if s == nil {
+ systemstack(func() {
+ s = mheap_.allocManual(workbufAlloc/pageSize, &memstats.gc_sys)
+ })
+ if s == nil {
+ throw("out of memory")
+ }
+ // Record the new span in the busy list.
+ lock(&work.wbufSpans.lock)
+ work.wbufSpans.busy.insert(s)
+ unlock(&work.wbufSpans.lock)
+ }
+ // Slice up the span into new workbufs. Return one and
+ // put the rest on the empty list.
+ for i := uintptr(0); i+_WorkbufSize <= workbufAlloc; i += _WorkbufSize {
+ newb := (*workbuf)(unsafe.Pointer(s.base() + i))
+ newb.nobj = 0
+ if i == 0 {
+ b = newb
+ } else {
+ putempty(newb)
+ }
+ }
}
return b
}
// putempty puts a workbuf onto the work.empty list.
-// Upon entry this go routine owns b. The lfstackpush relinquishes ownership.
+// Upon entry this goroutine owns b. The lfstack.push relinquishes ownership.
//go:nowritebarrier
func putempty(b *workbuf) {
b.checkempty()
- lfstackpush(&work.empty, &b.node)
+ work.empty.push(&b.node)
}
// putfull puts the workbuf on the work.full list for the GC.
@@ -352,14 +384,14 @@ func putempty(b *workbuf) {
//go:nowritebarrier
func putfull(b *workbuf) {
b.checknonempty()
- lfstackpush(&work.full, &b.node)
+ work.full.push(&b.node)
}
// trygetfull tries to get a full or partially empty workbuffer.
// If one is not immediately available return nil
//go:nowritebarrier
func trygetfull() *workbuf {
- b := (*workbuf)(lfstackpop(&work.full))
+ b := (*workbuf)(work.full.pop())
if b != nil {
b.checknonempty()
return b
@@ -380,7 +412,7 @@ func trygetfull() *workbuf {
// phase.
//go:nowritebarrier
func getfull() *workbuf {
- b := (*workbuf)(lfstackpop(&work.full))
+ b := (*workbuf)(work.full.pop())
if b != nil {
b.checknonempty()
return b
@@ -398,7 +430,7 @@ func getfull() *workbuf {
println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
throw("work.nwait > work.nproc")
}
- b = (*workbuf)(lfstackpop(&work.full))
+ b = (*workbuf)(work.full.pop())
if b != nil {
b.checknonempty()
return b
@@ -412,15 +444,11 @@ func getfull() *workbuf {
if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs {
return nil
}
- _g_ := getg()
if i < 10 {
- _g_.m.gcstats.nprocyield++
procyield(20)
} else if i < 20 {
- _g_.m.gcstats.nosyield++
osyield()
} else {
- _g_.m.gcstats.nsleep++
usleep(100)
}
}
@@ -434,11 +462,49 @@ func handoff(b *workbuf) *workbuf {
b.nobj -= n
b1.nobj = n
memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), uintptr(n)*unsafe.Sizeof(b1.obj[0]))
- _g_ := getg()
- _g_.m.gcstats.nhandoff++
- _g_.m.gcstats.nhandoffcnt += uint64(n)
// Put b on full list - let first half of b get stolen.
putfull(b)
return b1
}
+
+// prepareFreeWorkbufs moves busy workbuf spans to the free list so they
+// can be freed to the heap. This must only be called when all
+// workbufs are on the empty list.
+func prepareFreeWorkbufs() {
+ lock(&work.wbufSpans.lock)
+ if work.full != 0 {
+ throw("cannot free workbufs when work.full != 0")
+ }
+ // Since all workbufs are on the empty list, we don't care
+ // which ones are in which spans. We can wipe the entire empty
+ // list and move all workbuf spans to the free list.
+ work.empty = 0
+ work.wbufSpans.free.takeAll(&work.wbufSpans.busy)
+ unlock(&work.wbufSpans.lock)
+}
+
+// freeSomeWbufs frees some workbufs back to the heap and returns
+// true if it should be called again to free more.
+func freeSomeWbufs(preemptible bool) bool {
+ const batchSize = 64 // ~1–2 µs per span.
+ lock(&work.wbufSpans.lock)
+ if gcphase != _GCoff || work.wbufSpans.free.isEmpty() {
+ unlock(&work.wbufSpans.lock)
+ return false
+ }
+ systemstack(func() {
+ gp := getg().m.curg
+ for i := 0; i < batchSize && !(preemptible && gp.preempt); i++ {
+ span := work.wbufSpans.free.first
+ if span == nil {
+ break
+ }
+ work.wbufSpans.free.remove(span)
+ mheap_.freeManual(span, &memstats.gc_sys)
+ }
+ })
+ more := !work.wbufSpans.free.isEmpty()
+ unlock(&work.wbufSpans.lock)
+ return more
+}
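A hedged sketch of the span-carving arithmetic in getempty above: one workbufAlloc-sized manually-managed span is sliced into _WorkbufSize chunks, the first returned and the rest pushed on the empty list. The sizes are copied from the constants in this file; the pageSize and base address below are stand-ins.

package main

import "fmt"

const (
	workbufSize  = 2048
	workbufAlloc = 32 << 10
	pageSize     = 8192 // illustrative; must divide workbufAlloc
)

func main() {
	if workbufAlloc%pageSize != 0 || workbufAlloc%workbufSize != 0 {
		panic("bad workbufAlloc") // same sanity check as the init func above
	}
	base := uintptr(0x1000000) // pretend span base address
	var offsets []uintptr
	for i := uintptr(0); i+workbufSize <= workbufAlloc; i += workbufSize {
		offsets = append(offsets, base+i)
	}
	fmt.Println(len(offsets), "workbufs per span") // 16 workbufs per span
	fmt.Printf("first=%#x last=%#x\n", offsets[0], offsets[len(offsets)-1])
}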
diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go
index 7262748..8749f97 100644
--- a/libgo/go/runtime/mheap.go
+++ b/libgo/go/runtime/mheap.go
@@ -29,12 +29,13 @@ const minPhysPageSize = 4096
//go:notinheap
type mheap struct {
lock mutex
- free [_MaxMHeapList]mSpanList // free lists of given length
- freelarge mSpanList // free lists length >= _MaxMHeapList
- busy [_MaxMHeapList]mSpanList // busy lists of large objects of given length
- busylarge mSpanList // busy lists of large objects length >= _MaxMHeapList
+ free [_MaxMHeapList]mSpanList // free lists of given length up to _MaxMHeapList
+ freelarge mTreap // free treap of length >= _MaxMHeapList
+ busy [_MaxMHeapList]mSpanList // busy lists of large spans of given length
+ busylarge mSpanList // busy lists of large spans length >= _MaxMHeapList
sweepgen uint32 // sweep generation, see comment in mspan
sweepdone uint32 // all spans are swept
+ sweepers uint32 // number of active sweepone calls
// allspans is a slice of all mspans ever created. Each mspan
// appears exactly once.
@@ -74,37 +75,82 @@ type mheap struct {
_ uint32 // align uint64 fields on 32-bit for atomics
// Proportional sweep
- pagesInUse uint64 // pages of spans in stats _MSpanInUse; R/W with mheap.lock
- spanBytesAlloc uint64 // bytes of spans allocated this cycle; updated atomically
- pagesSwept uint64 // pages swept this cycle; updated atomically
- sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
+ //
+ // These parameters represent a linear function from heap_live
+ // to page sweep count. The proportional sweep system works to
+ // stay in the black by keeping the current page sweep count
+ // above this line at the current heap_live.
+ //
+ // The line has slope sweepPagesPerByte and passes through a
+ // basis point at (sweepHeapLiveBasis, pagesSweptBasis). At
+ // any given time, the system is at (memstats.heap_live,
+ // pagesSwept) in this space.
+ //
+ // It's important that the line pass through a point we
+ // control rather than simply starting at a (0,0) origin
+ // because that lets us adjust sweep pacing at any time while
+ // accounting for current progress. If we could only adjust
+ // the slope, it would create a discontinuity in debt if any
+ // progress has already been made.
+ pagesInUse uint64 // pages of spans in stats _MSpanInUse; R/W with mheap.lock
+ pagesSwept uint64 // pages swept this cycle; updated atomically
+ pagesSweptBasis uint64 // pagesSwept to use as the origin of the sweep ratio; updated atomically
+ sweepHeapLiveBasis uint64 // value of heap_live to use as the origin of sweep ratio; written with lock, read without
+ sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
// TODO(austin): pagesInUse should be a uintptr, but the 386
// compiler can't 8-byte align fields.
// Malloc stats.
- largefree uint64 // bytes freed for large objects (>maxsmallsize)
- nlargefree uint64 // number of frees for large objects (>maxsmallsize)
- nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
+ largealloc uint64 // bytes allocated for large objects
+ nlargealloc uint64 // number of large object allocations
+ largefree uint64 // bytes freed for large objects (>maxsmallsize)
+ nlargefree uint64 // number of frees for large objects (>maxsmallsize)
+ nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
// range of addresses we might see in the heap
- bitmap uintptr // Points to one byte past the end of the bitmap
- bitmap_mapped uintptr
- arena_start uintptr
- arena_used uintptr // always mHeap_Map{Bits,Spans} before updating
- arena_end uintptr
+ bitmap uintptr // Points to one byte past the end of the bitmap
+ bitmap_mapped uintptr
+
+ // The arena_* fields indicate the addresses of the Go heap.
+ //
+ // The maximum range of the Go heap is
+ // [arena_start, arena_start+_MaxMem+1).
+ //
+ // The range of the current Go heap is
+ // [arena_start, arena_used). Parts of this range may not be
+ // mapped, but the metadata structures are always mapped for
+ // the full range.
+ arena_start uintptr
+ arena_used uintptr // Set with setArenaUsed.
+
+ // The heap is grown using a linear allocator that allocates
+ // from the block [arena_alloc, arena_end). arena_alloc is
+ // often, but *not always* equal to arena_used.
+ arena_alloc uintptr
+ arena_end uintptr
+
+ // arena_reserved indicates that the memory [arena_alloc,
+ // arena_end) is reserved (e.g., mapped PROT_NONE). If this is
+ // false, we have to be careful not to clobber existing
+ // mappings here. If this is true, then we own the mapping
+ // here and *must* clobber it to use it.
arena_reserved bool
+ _ uint32 // ensure 64-bit alignment
+
// central free lists for small size classes.
// the padding makes sure that the MCentrals are
// spaced CacheLineSize bytes apart, so that each MCentral.lock
// gets its own cache line.
- central [_NumSizeClasses]struct {
+ // central is indexed by spanClass.
+ central [numSpanClasses]struct {
mcentral mcentral
- pad [sys.CacheLineSize]byte
+ pad [sys.CacheLineSize - unsafe.Sizeof(mcentral{})%sys.CacheLineSize]byte
}
spanalloc fixalloc // allocator for span*
cachealloc fixalloc // allocator for mcache*
+ treapalloc fixalloc // allocator for treapNodes* used by large objects
specialfinalizeralloc fixalloc // allocator for specialfinalizer*
specialprofilealloc fixalloc // allocator for specialprofile*
speciallock mutex // lock for special record allocators.
@@ -117,7 +163,7 @@ var mheap_ mheap
// When a MSpan is in the heap free list, state == MSpanFree
// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span.
//
-// When a MSpan is allocated, state == MSpanInUse or MSpanStack
+// When a MSpan is allocated, state == MSpanInUse or MSpanManual
// and heapmap(i) == span for all s->start <= i < s->start+s->npages.
// Every MSpan is in one doubly-linked list,
@@ -125,25 +171,25 @@ var mheap_ mheap
// MCentral's span lists.
// An MSpan representing actual memory has state _MSpanInUse,
-// _MSpanStack, or _MSpanFree. Transitions between these states are
+// _MSpanManual, or _MSpanFree. Transitions between these states are
// constrained as follows:
//
-// * A span may transition from free to in-use or stack during any GC
+// * A span may transition from free to in-use or manual during any GC
// phase.
//
// * During sweeping (gcphase == _GCoff), a span may transition from
-// in-use to free (as a result of sweeping) or stack to free (as a
+// in-use to free (as a result of sweeping) or manual to free (as a
// result of stacks being freed).
//
// * During GC (gcphase != _GCoff), a span *must not* transition from
-// stack or in-use to free. Because concurrent GC may read a pointer
+// manual or in-use to free. Because concurrent GC may read a pointer
// and then look up its span, the span state must be monotonic.
type mSpanState uint8
const (
- _MSpanDead mSpanState = iota
- _MSpanInUse // allocated for garbage collected heap
- _MSpanStack // allocated for use by stack allocator
+ _MSpanDead mSpanState = iota
+ _MSpanInUse // allocated for garbage collected heap
+ _MSpanManual // allocated for manual management (e.g., stack allocator)
_MSpanFree
)
@@ -152,7 +198,7 @@ const (
var mSpanStateNames = []string{
"_MSpanDead",
"_MSpanInUse",
- "_MSpanStack",
+ "_MSpanManual",
"_MSpanFree",
}
@@ -170,15 +216,16 @@ type mspan struct {
prev *mspan // previous span in list, or nil if none
list *mSpanList // For debugging. TODO: Remove.
- startAddr uintptr // address of first byte of span aka s.base()
- npages uintptr // number of pages in span
- stackfreelist gclinkptr // list of free stacks, avoids overloading freelist
+ startAddr uintptr // address of first byte of span aka s.base()
+ npages uintptr // number of pages in span
+
+ manualFreeList gclinkptr // list of free objects in _MSpanManual spans
// freeindex is the slot index between 0 and nelems at which to begin scanning
// for the next free object in this span.
// Each allocation scans allocBits starting at freeindex until it encounters a 0
// indicating a free object. freeindex is then adjusted so that subsequent scans begin
- // just past the the newly discovered free object.
+ // just past the newly discovered free object.
//
// If freeindex == nelem, this span has no free objects.
//
@@ -224,8 +271,8 @@ type mspan struct {
// The sweep will free the old allocBits and set allocBits to the
// gcmarkBits. The gcmarkBits are replaced with a fresh zeroed
// out memory.
- allocBits *uint8
- gcmarkBits *uint8
+ allocBits *gcBits
+ gcmarkBits *gcBits
// sweep generation:
// if sweepgen == h->sweepgen - 2, the span needs sweeping
@@ -236,8 +283,8 @@ type mspan struct {
sweepgen uint32
divMul uint16 // for divide by elemsize - divMagic.mul
baseMask uint16 // if non-0, elemsize is a power of 2, & this will get object allocation base
- allocCount uint16 // capacity - number of objects in freelist
- sizeclass uint8 // size class
+ allocCount uint16 // number of allocated objects
+ spanclass spanClass // size class and noscan (uint8)
incache bool // being used by an mcache
state mSpanState // mspaninuse etc
needzero uint8 // needs to be zeroed before allocation
@@ -292,8 +339,33 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
h.allspans = append(h.allspans, s)
}
+// A spanClass represents the size class and noscan-ness of a span.
+//
+// Each size class has a noscan spanClass and a scan spanClass. The
+// noscan spanClass contains only noscan objects, which do not contain
+// pointers and thus do not need to be scanned by the garbage
+// collector.
+type spanClass uint8
+
+const (
+ numSpanClasses = _NumSizeClasses << 1
+ tinySpanClass = spanClass(tinySizeClass<<1 | 1)
+)
+
+func makeSpanClass(sizeclass uint8, noscan bool) spanClass {
+ return spanClass(sizeclass<<1) | spanClass(bool2int(noscan))
+}
+
+func (sc spanClass) sizeclass() int8 {
+ return int8(sc >> 1)
+}
+
+func (sc spanClass) noscan() bool {
+ return sc&1 != 0
+}
+
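The spanClass encoding above packs the size class and a noscan bit into one byte. This standalone sketch round-trips the encoding; bool2int is spelled out here because the runtime's helper isn't importable, and size class 5 is just an example.

package main

import "fmt"

type spanClass uint8

func bool2int(b bool) int {
	if b {
		return 1
	}
	return 0
}

func makeSpanClass(sizeclass uint8, noscan bool) spanClass {
	return spanClass(sizeclass<<1) | spanClass(bool2int(noscan))
}

func (sc spanClass) sizeclass() int8 { return int8(sc >> 1) }
func (sc spanClass) noscan() bool    { return sc&1 != 0 }

func main() {
	sc := makeSpanClass(5, true)
	fmt.Println(uint8(sc), sc.sizeclass(), sc.noscan()) // 11 5 true
	sc = makeSpanClass(5, false)
	fmt.Println(uint8(sc), sc.sizeclass(), sc.noscan()) // 10 5 false
}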
// inheap reports whether b is a pointer into a (potentially dead) heap object.
-// It returns false for pointers into stack spans.
+// It returns false for pointers into _MSpanManual spans.
// Non-preemptible because it is used by write barriers.
//go:nowritebarrier
//go:nosplit
@@ -309,7 +381,9 @@ func inheap(b uintptr) bool {
return true
}
-// inHeapOrStack is a variant of inheap that returns true for pointers into stack spans.
+// inHeapOrStack is a variant of inheap that returns true for pointers
+// into any allocated heap span.
+//
//go:nowritebarrier
//go:nosplit
func inHeapOrStack(b uintptr) bool {
@@ -322,10 +396,8 @@ func inHeapOrStack(b uintptr) bool {
return false
}
switch s.state {
- case mSpanInUse:
+ case mSpanInUse, _MSpanManual:
return b < s.limit
- case _MSpanStack:
- return b < s.base()+s.npages<<_PageShift
default:
return false
}
@@ -376,7 +448,7 @@ func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 {
}
p := s.base()
- if s.sizeclass == 0 {
+ if s.spanclass.sizeclass() == 0 {
// Large object.
if base != nil {
*base = p
@@ -401,6 +473,7 @@ func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 {
// Initialize the heap.
func (h *mheap) init(spansStart, spansBytes uintptr) {
+ h.treapalloc.init(unsafe.Sizeof(treapNode{}), nil, nil, &memstats.other_sys)
h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
@@ -421,26 +494,51 @@ func (h *mheap) init(spansStart, spansBytes uintptr) {
h.busy[i].init()
}
- h.freelarge.init()
h.busylarge.init()
for i := range h.central {
- h.central[i].mcentral.init(int32(i))
+ h.central[i].mcentral.init(spanClass(i))
}
sp := (*slice)(unsafe.Pointer(&h.spans))
sp.array = unsafe.Pointer(spansStart)
sp.len = 0
sp.cap = int(spansBytes / sys.PtrSize)
+
+ // Map metadata structures. But don't map race detector memory
+ // since we're not actually growing the arena here (and TSAN
+ // gets mad if you map 0 bytes).
+ h.setArenaUsed(h.arena_used, false)
}
-// mHeap_MapSpans makes sure that the spans are mapped
+// setArenaUsed extends the usable arena to address arena_used and
+// maps auxiliary VM regions for any newly usable arena space.
+//
+// racemap indicates that this memory should be managed by the race
+// detector. racemap should be true unless this is covering a VM hole.
+func (h *mheap) setArenaUsed(arena_used uintptr, racemap bool) {
+ // Map auxiliary structures *before* h.arena_used is updated.
+ // Waiting to update arena_used until after the memory has been mapped
+ // avoids faults when other threads try to access these regions immediately
+ // after observing the change to arena_used.
+
+ // Map the bitmap.
+ h.mapBits(arena_used)
+
+ // Map spans array.
+ h.mapSpans(arena_used)
+
+ // Tell the race detector about the new heap memory.
+ if racemap && raceenabled {
+ racemapshadow(unsafe.Pointer(h.arena_used), arena_used-h.arena_used)
+ }
+
+ h.arena_used = arena_used
+}
+
+// mapSpans makes sure that the spans are mapped
// up to the new value of arena_used.
//
-// It must be called with the expected new value of arena_used,
-// *before* h.arena_used has been updated.
-// Waiting to update arena_used until after the memory has been mapped
-// avoids faults when other threads try access the bitmap immediately
-// after observing the change to arena_used.
+// Don't call this directly. Call mheap.setArenaUsed.
func (h *mheap) mapSpans(arena_used uintptr) {
// Map spans array, PageSize at a time.
n := arena_used
@@ -466,7 +564,7 @@ retry:
if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
list.remove(s)
// swept spans are at the end of the list
- list.insertBack(s)
+ list.insertBack(s) // Puts it back on a busy list. s is not in the treap at this point.
unlock(&h.lock)
snpages := s.npages
if s.sweep(false) {
@@ -533,7 +631,7 @@ func (h *mheap) reclaim(npage uintptr) {
// Allocate a new span of npage pages from the heap for GC'd memory
// and record its size class in the HeapMap and HeapMapCache.
-func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
+func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan {
_g_ := getg()
lock(&h.lock)
@@ -547,7 +645,13 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
// If GC kept a bit for whether there were any marks
// in a span, we could release these free spans
// at the end of GC and eliminate this entirely.
+ if trace.enabled {
+ traceGCSweepStart()
+ }
h.reclaim(npage)
+ if trace.enabled {
+ traceGCSweepDone()
+ }
}
// transfer stats from cache to global
@@ -556,7 +660,7 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
memstats.tinyallocs += uint64(_g_.m.mcache.local_tinyallocs)
_g_.m.mcache.local_tinyallocs = 0
- s := h.allocSpanLocked(npage)
+ s := h.allocSpanLocked(npage, &memstats.heap_inuse)
if s != nil {
// Record span info, because gc needs to be
// able to map interior pointer to containing span.
@@ -564,8 +668,8 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
h.sweepSpans[h.sweepgen/2%2].push(s) // Add to swept in-use list.
s.state = _MSpanInUse
s.allocCount = 0
- s.sizeclass = uint8(sizeclass)
- if sizeclass == 0 {
+ s.spanclass = spanclass
+ if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
s.elemsize = s.npages << _PageShift
s.divShift = 0
s.divMul = 0
@@ -584,9 +688,11 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
h.pagesInUse += uint64(npage)
if large {
memstats.heap_objects++
+ mheap_.largealloc += uint64(s.elemsize)
+ mheap_.nlargealloc++
atomic.Xadd64(&memstats.heap_live, int64(npage<<_PageShift))
// Swept spans are at the end of lists.
- if s.npages < uintptr(len(h.free)) {
+ if s.npages < uintptr(len(h.busy)) {
h.busy[s.npages].insertBack(s)
} else {
h.busylarge.insertBack(s)
@@ -615,13 +721,13 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
return s
}
-func (h *mheap) alloc(npage uintptr, sizeclass int32, large bool, needzero bool) *mspan {
+func (h *mheap) alloc(npage uintptr, spanclass spanClass, large bool, needzero bool) *mspan {
// Don't do any operations that lock the heap on the G stack.
// It might trigger stack growth, and the stack growth code needs
// to be able to allocate heap.
var s *mspan
systemstack(func() {
- s = h.alloc_m(npage, sizeclass, large)
+ s = h.alloc_m(npage, spanclass, large)
})
if s != nil {
@@ -633,29 +739,46 @@ func (h *mheap) alloc(npage uintptr, sizeclass int32, large bool, needzero bool)
return s
}
-func (h *mheap) allocStack(npage uintptr) *mspan {
- _g_ := getg()
- if _g_ != _g_.m.g0 {
- throw("mheap_allocstack not on g0 stack")
- }
+// allocManual allocates a manually-managed span of npage pages.
+// allocManual returns nil if allocation fails.
+//
+// allocManual adds the bytes used to *stat, which should be a
+// memstats in-use field. Unlike allocations in the GC'd heap, the
+// allocation does *not* count toward heap_inuse or heap_sys.
+//
+// The memory backing the returned span may not be zeroed if
+// span.needzero is set.
+//
+// allocManual must be called on the system stack to prevent stack
+// growth. Since this is used by the stack allocator, stack growth
+// during allocManual would self-deadlock.
+//
+//go:systemstack
+func (h *mheap) allocManual(npage uintptr, stat *uint64) *mspan {
lock(&h.lock)
- s := h.allocSpanLocked(npage)
+ s := h.allocSpanLocked(npage, stat)
if s != nil {
- s.state = _MSpanStack
- s.stackfreelist = 0
+ s.state = _MSpanManual
+ s.manualFreeList = 0
s.allocCount = 0
- memstats.stacks_inuse += uint64(s.npages << _PageShift)
+ s.spanclass = 0
+ s.nelems = 0
+ s.elemsize = 0
+ s.limit = s.base() + s.npages<<_PageShift
+ // Manually managed memory doesn't count toward heap_sys.
+ memstats.heap_sys -= uint64(s.npages << _PageShift)
}
- // This unlock acts as a release barrier. See mHeap_Alloc_m.
+ // This unlock acts as a release barrier. See mheap.alloc_m.
unlock(&h.lock)
+
return s
}
// Allocates a span of the given size. h must be locked.
// The returned span has been removed from the
// free list, but its state is still MSpanFree.
-func (h *mheap) allocSpanLocked(npage uintptr) *mspan {
+func (h *mheap) allocSpanLocked(npage uintptr, stat *uint64) *mspan {
var list *mSpanList
var s *mspan
@@ -664,13 +787,12 @@ func (h *mheap) allocSpanLocked(npage uintptr) *mspan {
list = &h.free[i]
if !list.isEmpty() {
s = list.first
+ list.remove(s)
goto HaveSpan
}
}
-
// Best fit in list of large spans.
- list = &h.freelarge
- s = h.allocLarge(npage)
+ s = h.allocLarge(npage) // allocLarge removed s from h.freelarge for us
if s == nil {
if !h.grow(npage) {
return nil
@@ -689,10 +811,6 @@ HaveSpan:
if s.npages < npage {
throw("MHeap_AllocLocked - bad npages")
}
- list.remove(s)
- if s.inList() {
- throw("still in list")
- }
if s.npreleased > 0 {
sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift)
memstats.heap_released -= uint64(s.npreleased << _PageShift)
@@ -711,8 +829,8 @@ HaveSpan:
h.spans[p] = t
h.spans[p+t.npages-1] = t
t.needzero = s.needzero
- s.state = _MSpanStack // prevent coalescing with s
- t.state = _MSpanStack
+ s.state = _MSpanManual // prevent coalescing with s
+ t.state = _MSpanManual
h.freeSpanLocked(t, false, false, s.unusedsince)
s.state = _MSpanFree
}
@@ -723,7 +841,7 @@ HaveSpan:
h.spans[p+n] = s
}
- memstats.heap_inuse += uint64(npage << _PageShift)
+ *stat += uint64(npage << _PageShift)
memstats.heap_idle -= uint64(npage << _PageShift)
//println("spanalloc", hex(s.start<<_PageShift))
@@ -733,24 +851,19 @@ HaveSpan:
return s
}
-// Allocate a span of exactly npage pages from the list of large spans.
-func (h *mheap) allocLarge(npage uintptr) *mspan {
- return bestFit(&h.freelarge, npage, nil)
+// Large spans have a minimum size of 1MByte. The maximum number of large spans needed to
+// cover 1TByte is therefore 1 million. Experimentation using random sizes indicates that
+// the depth of the treap is less than 2x that of a perfectly balanced tree. 1 million spans
+// can be referenced by a perfectly balanced tree with a depth of 20, so twice that is an
+// acceptable 40.
+func (h *mheap) isLargeSpan(npages uintptr) bool {
+ return npages >= uintptr(len(h.free))
}
-// Search list for smallest span with >= npage pages.
-// If there are multiple smallest spans, take the one
-// with the earliest starting address.
-func bestFit(list *mSpanList, npage uintptr, best *mspan) *mspan {
- for s := list.first; s != nil; s = s.next {
- if s.npages < npage {
- continue
- }
- if best == nil || s.npages < best.npages || (s.npages == best.npages && s.base() < best.base()) {
- best = s
- }
- }
- return best
+// allocLarge allocates a span of at least npage pages from the treap of large spans.
+// Returns nil if no such span currently exists.
+func (h *mheap) allocLarge(npage uintptr) *mspan {
+ // Search treap for smallest span with >= npage pages.
+ return h.freelarge.remove(npage)
}
// Try to add at least npage pages of memory to the heap,
@@ -849,22 +962,30 @@ func (h *mheap) freeSpan(s *mspan, acct int32) {
})
}
-func (h *mheap) freeStack(s *mspan) {
- _g_ := getg()
- if _g_ != _g_.m.g0 {
- throw("mheap_freestack not on g0 stack")
- }
+// freeManual frees a manually-managed span returned by allocManual.
+// stat must be the same as the stat passed to the allocManual that
+// allocated s.
+//
+// This must only be called when gcphase == _GCoff. See mSpanState for
+// an explanation.
+//
+// freeManual must be called on the system stack to prevent stack
+// growth, just like allocManual.
+//
+//go:systemstack
+func (h *mheap) freeManual(s *mspan, stat *uint64) {
s.needzero = 1
lock(&h.lock)
- memstats.stacks_inuse -= uint64(s.npages << _PageShift)
- h.freeSpanLocked(s, true, true, 0)
+ *stat -= uint64(s.npages << _PageShift)
+ memstats.heap_sys += uint64(s.npages << _PageShift)
+ h.freeSpanLocked(s, false, true, 0)
unlock(&h.lock)
}
// s must be on a busy list (h.busy or h.busylarge) or unlinked.
func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince int64) {
switch s.state {
- case _MSpanStack:
+ case _MSpanManual:
if s.allocCount != 0 {
throw("MHeap_FreeSpanLocked - invalid stack free")
}
@@ -900,50 +1021,98 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince i
// Coalesce with earlier, later spans.
p := (s.base() - h.arena_start) >> _PageShift
if p > 0 {
- t := h.spans[p-1]
- if t != nil && t.state == _MSpanFree {
- s.startAddr = t.startAddr
- s.npages += t.npages
- s.npreleased = t.npreleased // absorb released pages
- s.needzero |= t.needzero
- p -= t.npages
+ before := h.spans[p-1]
+ if before != nil && before.state == _MSpanFree {
+ // Now adjust s.
+ s.startAddr = before.startAddr
+ s.npages += before.npages
+ s.npreleased = before.npreleased // absorb released pages
+ s.needzero |= before.needzero
+ p -= before.npages
h.spans[p] = s
- h.freeList(t.npages).remove(t)
- t.state = _MSpanDead
- h.spanalloc.free(unsafe.Pointer(t))
+ // The size is potentially changing so the treap needs to delete adjacent nodes and
+ // insert back as a combined node.
+ if h.isLargeSpan(before.npages) {
+ // before is a large span, so it must be in the treap, from which we can remove it.
+ h.freelarge.removeSpan(before)
+ } else {
+ h.freeList(before.npages).remove(before)
+ }
+ before.state = _MSpanDead
+ h.spanalloc.free(unsafe.Pointer(before))
}
}
+
+ // Now check to see if next (greater addresses) span is free and can be coalesced.
if (p + s.npages) < uintptr(len(h.spans)) {
- t := h.spans[p+s.npages]
- if t != nil && t.state == _MSpanFree {
- s.npages += t.npages
- s.npreleased += t.npreleased
- s.needzero |= t.needzero
+ after := h.spans[p+s.npages]
+ if after != nil && after.state == _MSpanFree {
+ s.npages += after.npages
+ s.npreleased += after.npreleased
+ s.needzero |= after.needzero
h.spans[p+s.npages-1] = s
- h.freeList(t.npages).remove(t)
- t.state = _MSpanDead
- h.spanalloc.free(unsafe.Pointer(t))
+ if h.isLargeSpan(after.npages) {
+ h.freelarge.removeSpan(after)
+ } else {
+ h.freeList(after.npages).remove(after)
+ }
+ after.state = _MSpanDead
+ h.spanalloc.free(unsafe.Pointer(after))
}
}
- // Insert s into appropriate list.
- h.freeList(s.npages).insert(s)
+ // Insert s into appropriate list or treap.
+ if h.isLargeSpan(s.npages) {
+ h.freelarge.insert(s)
+ } else {
+ h.freeList(s.npages).insert(s)
+ }
}
func (h *mheap) freeList(npages uintptr) *mSpanList {
- if npages < uintptr(len(h.free)) {
- return &h.free[npages]
- }
- return &h.freelarge
+ return &h.free[npages]
}
func (h *mheap) busyList(npages uintptr) *mSpanList {
- if npages < uintptr(len(h.free)) {
+ if npages < uintptr(len(h.busy)) {
return &h.busy[npages]
}
return &h.busylarge
}
+func scavengeTreapNode(t *treapNode, now, limit uint64) uintptr {
+ s := t.spanKey
+ var sumreleased uintptr
+ if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages {
+ start := s.base()
+ end := start + s.npages<<_PageShift
+ if physPageSize > _PageSize {
+ // We can only release pages in
+ // physPageSize blocks, so round start
+ // and end in. (Otherwise, madvise
+ // will round them *out* and release
+ // more memory than we want.)
+ start = (start + physPageSize - 1) &^ (physPageSize - 1)
+ end &^= physPageSize - 1
+ if end <= start {
+ // start and end don't span a
+ // whole physical page.
+ return sumreleased
+ }
+ }
+ len := end - start
+ released := len - (s.npreleased << _PageShift)
+ if physPageSize > _PageSize && released == 0 {
+ return sumreleased
+ }
+ memstats.heap_released += uint64(released)
+ sumreleased += released
+ s.npreleased = len >> _PageShift
+ sysUnused(unsafe.Pointer(start), len)
+ }
+ return sumreleased
+}
+
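scavengeTreapNode's physical-page rounding is easy to get backwards, so here is a hedged standalone sketch of just that step: start is rounded up and end rounded down to physPageSize so madvise never covers a partially-used physical page. The page sizes and addresses in main are invented.

package main

import "fmt"

func roundInward(start, end, physPageSize uintptr) (uintptr, uintptr, bool) {
	start = (start + physPageSize - 1) &^ (physPageSize - 1) // round start up
	end &^= physPageSize - 1                                 // round end down
	if end <= start {
		return 0, 0, false // range doesn't cover a whole physical page
	}
	return start, end, true
}

func main() {
	// 8 KiB runtime pages on a system with 64 KiB physical pages.
	start, end, ok := roundInward(0x10000+8192, 0x10000+7*8192, 64<<10)
	fmt.Println(ok, start, end) // false 0 0: span too small to release anything
	start, end, ok = roundInward(0x10000, 0x10000+24*8192, 64<<10)
	fmt.Printf("%v %#x %#x\n", ok, start, end) // true 0x10000 0x40000
}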
func scavengelist(list *mSpanList, now, limit uint64) uintptr {
if list.isEmpty() {
return 0
@@ -984,27 +1153,31 @@ func scavengelist(list *mSpanList, now, limit uint64) uintptr {
}
func (h *mheap) scavenge(k int32, now, limit uint64) {
+ // Disallow malloc or panic while holding the heap lock. We do
+ // this here because this is a non-mallocgc entry-point to
+ // the mheap API.
+ gp := getg()
+ gp.m.mallocing++
lock(&h.lock)
var sumreleased uintptr
for i := 0; i < len(h.free); i++ {
sumreleased += scavengelist(&h.free[i], now, limit)
}
- sumreleased += scavengelist(&h.freelarge, now, limit)
+ sumreleased += scavengetreap(h.freelarge.treap, now, limit)
unlock(&h.lock)
+ gp.m.mallocing--
if debug.gctrace > 0 {
if sumreleased > 0 {
print("scvg", k, ": ", sumreleased>>20, " MB released\n")
}
- // TODO(dvyukov): these stats are incorrect as we don't subtract stack usage from heap.
- // But we can't call ReadMemStats on g0 holding locks.
print("scvg", k, ": inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
}
}
//go:linkname runtime_debug_freeOSMemory runtime_debug.freeOSMemory
func runtime_debug_freeOSMemory() {
- gcStart(gcForceBlockMode, false)
+ GC()
systemstack(func() { mheap_.scavenge(-1, ^uint64(0), 0) })
}
@@ -1017,7 +1190,7 @@ func (span *mspan) init(base uintptr, npages uintptr) {
span.startAddr = base
span.npages = npages
span.allocCount = 0
- span.sizeclass = 0
+ span.spanclass = 0
span.incache = false
span.elemsize = 0
span.state = _MSpanDead
@@ -1043,7 +1216,8 @@ func (list *mSpanList) init() {
func (list *mSpanList) remove(span *mspan) {
if span.list != list {
- println("runtime: failed MSpanList_Remove", span, span.prev, span.list, list)
+ print("runtime: failed MSpanList_Remove span.npages=", span.npages,
+ " span=", span, " prev=", span.prev, " span.list=", span.list, " list=", list, "\n")
throw("MSpanList_Remove")
}
if list.first == span {
@@ -1085,7 +1259,7 @@ func (list *mSpanList) insert(span *mspan) {
func (list *mSpanList) insertBack(span *mspan) {
if span.next != nil || span.prev != nil || span.list != nil {
- println("failed MSpanList_InsertBack", span, span.next, span.prev, span.list)
+ println("runtime: failed MSpanList_InsertBack", span, span.next, span.prev, span.list)
throw("MSpanList_InsertBack")
}
span.prev = list.last
@@ -1100,6 +1274,31 @@ func (list *mSpanList) insertBack(span *mspan) {
span.list = list
}
+// takeAll removes all spans from other and inserts them at the front
+// of list.
+func (list *mSpanList) takeAll(other *mSpanList) {
+ if other.isEmpty() {
+ return
+ }
+
+ // Reparent everything in other to list.
+ for s := other.first; s != nil; s = s.next {
+ s.list = list
+ }
+
+ // Concatenate the lists.
+ if list.isEmpty() {
+ *list = *other
+ } else {
+ // Neither list is empty. Put other before list.
+ other.last.next = list.first
+ list.first.prev = other.last
+ list.first = other.first
+ }
+
+ other.first, other.last = nil, nil
+}
+
const (
_KindSpecialFinalizer = 1
_KindSpecialProfile = 2
@@ -1311,6 +1510,22 @@ func freespecial(s *special, p unsafe.Pointer, size uintptr) {
}
}
+// gcBits is an alloc/mark bitmap. This is always used as *gcBits.
+//
+//go:notinheap
+type gcBits uint8
+
+// bytep returns a pointer to the n'th byte of b.
+func (b *gcBits) bytep(n uintptr) *uint8 {
+ return addb((*uint8)(b), n)
+}
+
+// bitp returns a pointer to the byte containing bit n and a mask for
+// selecting that bit from *bytep.
+func (b *gcBits) bitp(n uintptr) (bytep *uint8, mask uint8) {
+ return b.bytep(n / 8), 1 << (n % 8)
+}
+
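bytep and bitp are ordinary bit-array indexing: bit n lives in byte n/8 under mask 1<<(n%8). A tiny standalone illustration over a plain byte slice (not the runtime's gcBits type):

package main

import "fmt"

// bitp returns the byte containing bit n and the mask selecting it,
// mirroring the indexing used by gcBits.bitp above.
func bitp(bits []byte, n uint) (bytep *byte, mask byte) {
	return &bits[n/8], 1 << (n % 8)
}

func main() {
	bits := make([]byte, 4) // room for 32 mark/alloc bits
	p, m := bitp(bits, 13)
	*p |= m                         // mark object 13
	fmt.Println(bits[1], *p&m != 0) // 32 true
}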
const gcBitsChunkBytes = uintptr(64 << 10)
const gcBitsHeaderBytes = unsafe.Sizeof(gcBitsHeader{})
@@ -1320,42 +1535,87 @@ type gcBitsHeader struct {
}
//go:notinheap
-type gcBits struct {
+type gcBitsArena struct {
// gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand.
- free uintptr // free is the index into bits of the next free byte.
- next *gcBits
- bits [gcBitsChunkBytes - gcBitsHeaderBytes]uint8
+ free uintptr // free is the index into bits of the next free byte; read/write atomically
+ next *gcBitsArena
+ bits [gcBitsChunkBytes - gcBitsHeaderBytes]gcBits
}
var gcBitsArenas struct {
lock mutex
- free *gcBits
- next *gcBits
- current *gcBits
- previous *gcBits
+ free *gcBitsArena
+ next *gcBitsArena // Read atomically. Write atomically under lock.
+ current *gcBitsArena
+ previous *gcBitsArena
+}
+
+// tryAlloc allocates from b or returns nil if b does not have enough room.
+// This is safe to call concurrently.
+func (b *gcBitsArena) tryAlloc(bytes uintptr) *gcBits {
+ if b == nil || atomic.Loaduintptr(&b.free)+bytes > uintptr(len(b.bits)) {
+ return nil
+ }
+ // Try to allocate from this block.
+ end := atomic.Xadduintptr(&b.free, bytes)
+ if end > uintptr(len(b.bits)) {
+ return nil
+ }
+ // There was enough room.
+ start := end - bytes
+ return &b.bits[start]
}
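tryAlloc above is a lock-free bump allocator: atomically advance free and treat any reservation that runs past the end as a failure (the over-reserved bytes are simply wasted, which is fine because arenas are recycled wholesale). A minimal sketch of the same pattern with sync/atomic; bumpArena is a hypothetical stand-in, not the runtime's gcBitsArena:

package main

import (
	"fmt"
	"sync/atomic"
)

type bumpArena struct {
	free uint64 // next free byte index, advanced atomically
	bits [1024]byte
}

// tryAlloc returns n bytes from a, or nil if a does not have enough room.
func (a *bumpArena) tryAlloc(n uint64) []byte {
	if a == nil || atomic.LoadUint64(&a.free)+n > uint64(len(a.bits)) {
		return nil
	}
	end := atomic.AddUint64(&a.free, n)
	if end > uint64(len(a.bits)) {
		return nil // lost the race; the reserved bytes are simply wasted
	}
	return a.bits[end-n : end]
}

func main() {
	var a bumpArena
	fmt.Println(len(a.tryAlloc(100)), a.tryAlloc(2000) == nil) // 100 true
}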
// newMarkBits returns a pointer to 8 byte aligned bytes
// to be used for a span's mark bits.
-func newMarkBits(nelems uintptr) *uint8 {
- lock(&gcBitsArenas.lock)
+func newMarkBits(nelems uintptr) *gcBits {
blocksNeeded := uintptr((nelems + 63) / 64)
bytesNeeded := blocksNeeded * 8
- if gcBitsArenas.next == nil ||
- gcBitsArenas.next.free+bytesNeeded > uintptr(len(gcBits{}.bits)) {
- // Allocate a new arena.
- fresh := newArena()
- fresh.next = gcBitsArenas.next
- gcBitsArenas.next = fresh
- }
- if gcBitsArenas.next.free >= gcBitsChunkBytes {
- println("runtime: gcBitsArenas.next.free=", gcBitsArenas.next.free, gcBitsChunkBytes)
+
+ // Try directly allocating from the current head arena.
+ head := (*gcBitsArena)(atomic.Loadp(unsafe.Pointer(&gcBitsArenas.next)))
+ if p := head.tryAlloc(bytesNeeded); p != nil {
+ return p
+ }
+
+ // There's not enough room in the head arena. We may need to
+ // allocate a new arena.
+ lock(&gcBitsArenas.lock)
+ // Try the head arena again, since it may have changed. Now
+ // that we hold the lock, the list head can't change, but its
+ // free position still can.
+ if p := gcBitsArenas.next.tryAlloc(bytesNeeded); p != nil {
+ unlock(&gcBitsArenas.lock)
+ return p
+ }
+
+ // Allocate a new arena. This may temporarily drop the lock.
+ fresh := newArenaMayUnlock()
+ // If newArenaMayUnlock dropped the lock, another thread may
+ // have put a fresh arena on the "next" list. Try allocating
+ // from next again.
+ if p := gcBitsArenas.next.tryAlloc(bytesNeeded); p != nil {
+ // Put fresh back on the free list.
+ // TODO: Mark it "already zeroed"
+ fresh.next = gcBitsArenas.free
+ gcBitsArenas.free = fresh
+ unlock(&gcBitsArenas.lock)
+ return p
+ }
+
+ // Allocate from the fresh arena. We haven't linked it in yet, so
+ // this cannot race and is guaranteed to succeed.
+ p := fresh.tryAlloc(bytesNeeded)
+ if p == nil {
throw("markBits overflow")
}
- result := &gcBitsArenas.next.bits[gcBitsArenas.next.free]
- gcBitsArenas.next.free += bytesNeeded
+
+ // Add the fresh arena to the "next" list.
+ fresh.next = gcBitsArenas.next
+ atomic.StorepNoWB(unsafe.Pointer(&gcBitsArenas.next), unsafe.Pointer(fresh))
+
unlock(&gcBitsArenas.lock)
- return result
+ return p
}
// newAllocBits returns a pointer to 8 byte aligned bytes
@@ -1364,7 +1624,7 @@ func newMarkBits(nelems uintptr) *uint8 {
// allocation bits. For spans not being initialized, the
// mark bits are repurposed as allocation bits when
// the span is swept.
-func newAllocBits(nelems uintptr) *uint8 {
+func newAllocBits(nelems uintptr) *gcBits {
return newMarkBits(nelems)
}
@@ -1398,18 +1658,21 @@ func nextMarkBitArenaEpoch() {
}
gcBitsArenas.previous = gcBitsArenas.current
gcBitsArenas.current = gcBitsArenas.next
- gcBitsArenas.next = nil // newMarkBits calls newArena when needed
+ atomic.StorepNoWB(unsafe.Pointer(&gcBitsArenas.next), nil) // newMarkBits calls newArena when needed
unlock(&gcBitsArenas.lock)
}
-// newArena allocates and zeroes a gcBits arena.
-func newArena() *gcBits {
- var result *gcBits
+// newArenaMayUnlock allocates and zeroes a gcBits arena.
+// The caller must hold gcBitsArenas.lock. This may temporarily release it.
+func newArenaMayUnlock() *gcBitsArena {
+ var result *gcBitsArena
if gcBitsArenas.free == nil {
- result = (*gcBits)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
+ unlock(&gcBitsArenas.lock)
+ result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
if result == nil {
throw("runtime: cannot allocate memory")
}
+ lock(&gcBitsArenas.lock)
} else {
result = gcBitsArenas.free
gcBitsArenas.free = gcBitsArenas.free.next
@@ -1418,7 +1681,7 @@ func newArena() *gcBits {
result.next = nil
// If result.bits is not 8 byte aligned adjust index so
// that &result.bits[result.free] is 8 byte aligned.
- if uintptr(unsafe.Offsetof(gcBits{}.bits))&7 == 0 {
+ if uintptr(unsafe.Offsetof(gcBitsArena{}.bits))&7 == 0 {
result.free = 0
} else {
result.free = 8 - (uintptr(unsafe.Pointer(&result.bits[0])) & 7)
diff --git a/libgo/go/runtime/mksizeclasses.go b/libgo/go/runtime/mksizeclasses.go
index 0f897ba..0cb2b33 100644
--- a/libgo/go/runtime/mksizeclasses.go
+++ b/libgo/go/runtime/mksizeclasses.go
@@ -48,7 +48,7 @@ func main() {
flag.Parse()
var b bytes.Buffer
- fmt.Fprintln(&b, "// AUTO-GENERATED by mksizeclasses.go; DO NOT EDIT")
+ fmt.Fprintln(&b, "// Code generated by mksizeclasses.go; DO NOT EDIT.")
fmt.Fprintln(&b, "//go:generate go run mksizeclasses.go")
fmt.Fprintln(&b)
fmt.Fprintln(&b, "package runtime")
diff --git a/libgo/go/runtime/mprof.go b/libgo/go/runtime/mprof.go
index 87f84a7..f31c88c 100644
--- a/libgo/go/runtime/mprof.go
+++ b/libgo/go/runtime/mprof.go
@@ -64,27 +64,70 @@ type memRecord struct {
// come only after a GC during concurrent sweeping. So if we would
// naively count them, we would get a skew toward mallocs.
//
- // Mallocs are accounted in recent stats.
- // Explicit frees are accounted in recent stats.
- // GC frees are accounted in prev stats.
- // After GC prev stats are added to final stats and
- // recent stats are moved into prev stats.
- allocs uintptr
- frees uintptr
- alloc_bytes uintptr
- free_bytes uintptr
-
- // changes between next-to-last GC and last GC
- prev_allocs uintptr
- prev_frees uintptr
- prev_alloc_bytes uintptr
- prev_free_bytes uintptr
-
- // changes since last GC
- recent_allocs uintptr
- recent_frees uintptr
- recent_alloc_bytes uintptr
- recent_free_bytes uintptr
+ // Hence, we delay information to get consistent snapshots as
+ // of mark termination. Allocations count toward the next mark
+ // termination's snapshot, while sweep frees count toward the
+ // previous mark termination's snapshot:
+ //
+ //               MT          MT          MT          MT
+ //             .·|         .·|         .·|         .·|
+ //          .·˙  |      .·˙  |      .·˙  |      .·˙  |
+ //       .·˙     |   .·˙     |   .·˙     |   .·˙     |
+ //    .·˙        |.·˙        |.·˙        |.·˙        |
+ //
+ //       alloc → ▲ ← free
+ //               ┠┅┅┅┅┅┅┅┅┅┅┅P
+ //       C+2            →    C+1    →  C
+ //
+ //                   alloc → ▲ ← free
+ //                           ┠┅┅┅┅┅┅┅┅┅┅┅P
+ //                   C+2        →     C+1   →  C
+ //
+ // Since we can't publish a consistent snapshot until all of
+ // the sweep frees are accounted for, we wait until the next
+ // mark termination ("MT" above) to publish the previous mark
+ // termination's snapshot ("P" above). To do this, allocation
+ // and free events are accounted to *future* heap profile
+ // cycles ("C+n" above) and we only publish a cycle once all
+ // of the events from that cycle are done. Specifically:
+ //
+ // Mallocs are accounted to cycle C+2.
+ // Explicit frees are accounted to cycle C+2.
+ // GC frees (done during sweeping) are accounted to cycle C+1.
+ //
+ // After mark termination, we increment the global heap
+ // profile cycle counter and accumulate the stats from cycle C
+ // into the active profile.
+
+ // active is the currently published profile. A profiling
+ // cycle can be accumulated into active once it's complete.
+ active memRecordCycle
+
+ // future records the profile events we're counting for cycles
+ // that have not yet been published. This is a ring buffer
+ // indexed by the global heap profile cycle C and stores
+ // cycles C, C+1, and C+2. Unlike active, these counts are
+ // only for a single cycle; they are not cumulative across
+ // cycles.
+ //
+ // We store cycle C here because there's a window between when
+ // C becomes the active cycle and when we've flushed it to
+ // active.
+ future [3]memRecordCycle
+}
+
+// memRecordCycle is the heap profile data for a single profiling cycle.
+type memRecordCycle struct {
+ allocs, frees uintptr
+ alloc_bytes, free_bytes uintptr
+}
+
+// add accumulates b into a. It does not zero b.
+func (a *memRecordCycle) add(b *memRecordCycle) {
+ a.allocs += b.allocs
+ a.frees += b.frees
+ a.alloc_bytes += b.alloc_bytes
+ a.free_bytes += b.free_bytes
}
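A compact standalone model of the three-slot future ring described above: mallocs are charged to slot (c+2)%3, sweep frees to slot (c+1)%3, and a flush folds slot c%3 into the published totals, so a malloc and its eventual free become visible in the same published snapshot. cycleRing below is hypothetical and omits proflock:

package main

import "fmt"

type counts struct{ allocs, frees int }

type cycleRing struct {
	cycle  int       // the global heap-profile cycle C
	active counts    // published totals
	future [3]counts // pending events for cycles C, C+1, C+2
}

func (r *cycleRing) malloc()    { r.future[(r.cycle+2)%3].allocs++ } // like mProf_Malloc
func (r *cycleRing) sweepFree() { r.future[(r.cycle+1)%3].frees++ }  // like mProf_Free

// flush publishes cycle C, like mProf_FlushLocked; nextCycle then advances C.
func (r *cycleRing) flush() {
	c := &r.future[r.cycle%3]
	r.active.allocs += c.allocs
	r.active.frees += c.frees
	*c = counts{}
}

func (r *cycleRing) nextCycle() { r.cycle++ }

func main() {
	var r cycleRing
	r.malloc()    // charged to C+2
	r.nextCycle() // first mark termination after the malloc
	r.sweepFree() // the object dies; charged to what is now C+1
	r.nextCycle()
	r.flush()             // both events publish in the same snapshot
	fmt.Println(r.active) // {1 1}
}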
// A blockRecord is the bucket data for a bucket of type blockProfile,
@@ -100,8 +143,21 @@ var (
xbuckets *bucket // mutex profile buckets
buckhash *[179999]*bucket
bucketmem uintptr
+
+ mProf struct {
+ // All fields in mProf are protected by proflock.
+
+ // cycle is the global heap profile cycle. This wraps
+ // at mProfCycleWrap.
+ cycle uint32
+ // flushed indicates that future[cycle] in all buckets
+ // has been flushed to the active profile.
+ flushed bool
+ }
)
+const mProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24)
+
// newBucket allocates a bucket with the given type and number of stack entries.
func newBucket(typ bucketType, nstk int) *bucket {
size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(location{})
@@ -212,30 +268,71 @@ func eqslice(x, y []location) bool {
return true
}
-func mprof_GC() {
+// mProf_NextCycle publishes the next heap profile cycle and creates a
+// fresh heap profile cycle. This operation is fast and can be done
+// during STW. The caller must call mProf_Flush before calling
+// mProf_NextCycle again.
+//
+// This is called by mark termination during STW so allocations and
+// frees after the world is started again count towards a new heap
+// profiling cycle.
+func mProf_NextCycle() {
+ lock(&proflock)
+ // We explicitly wrap mProf.cycle rather than depending on
+ // uint wraparound because the memRecord.future ring does not
+ // itself wrap at a power of two.
+ mProf.cycle = (mProf.cycle + 1) % mProfCycleWrap
+ mProf.flushed = false
+ unlock(&proflock)
+}
+
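The explicit wrap above matters because future[c%3] indexing is only continuous across a wrap if the wrap point is a multiple of len(future); the natural uint32 wrap at 2^32 is not divisible by 3. A quick check of the constant's form, assuming len(memRecord{}.future) == 3:

package main

import "fmt"

func main() {
	const futureLen = 3
	const wrap = uint32(futureLen) * (2 << 24) // same form as mProfCycleWrap
	// Because wrap is a multiple of futureLen, the slot sequence
	// ..., (wrap-1)%3, 0%3, ... continues exactly as if no wrap had happened.
	fmt.Println(wrap%futureLen == 0, (wrap-1)%futureLen, wrap%futureLen) // true 2 0
}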
+// mProf_Flush flushes the events from the current heap profiling
+// cycle into the active profile. After this it is safe to start a new
+// heap profiling cycle with mProf_NextCycle.
+//
+// This is called by GC after mark termination starts the world. In
+// contrast with mProf_NextCycle, this is somewhat expensive, but safe
+// to do concurrently.
+func mProf_Flush() {
+ lock(&proflock)
+ if !mProf.flushed {
+ mProf_FlushLocked()
+ mProf.flushed = true
+ }
+ unlock(&proflock)
+}
+
+func mProf_FlushLocked() {
+ c := mProf.cycle
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- mp.allocs += mp.prev_allocs
- mp.frees += mp.prev_frees
- mp.alloc_bytes += mp.prev_alloc_bytes
- mp.free_bytes += mp.prev_free_bytes
- mp.prev_allocs = mp.recent_allocs
- mp.prev_frees = mp.recent_frees
- mp.prev_alloc_bytes = mp.recent_alloc_bytes
- mp.prev_free_bytes = mp.recent_free_bytes
-
- mp.recent_allocs = 0
- mp.recent_frees = 0
- mp.recent_alloc_bytes = 0
- mp.recent_free_bytes = 0
+ // Flush cycle C into the published profile and clear
+ // it for reuse.
+ mpc := &mp.future[c%uint32(len(mp.future))]
+ mp.active.add(mpc)
+ *mpc = memRecordCycle{}
}
}
-// Record that a gc just happened: all the 'recent' statistics are now real.
-func mProf_GC() {
+// mProf_PostSweep records that all sweep frees for this GC cycle have
+// completed. This has the effect of publishing the heap profile
+// snapshot as of the last mark termination without advancing the heap
+// profile cycle.
+func mProf_PostSweep() {
lock(&proflock)
- mprof_GC()
+ // Flush cycle C+1 to the active profile so everything as of
+ // the last mark termination becomes visible. *Don't* advance
+ // the cycle, since we're still accumulating allocs in cycle
+ // C+2, which have to become C+1 in the next mark termination
+ // and so on.
+ c := mProf.cycle
+ for b := mbuckets; b != nil; b = b.allnext {
+ mp := b.mp()
+ mpc := &mp.future[(c+1)%uint32(len(mp.future))]
+ mp.active.add(mpc)
+ *mpc = memRecordCycle{}
+ }
unlock(&proflock)
}
@@ -245,9 +342,11 @@ func mProf_Malloc(p unsafe.Pointer, size uintptr) {
nstk := callers(4, stk[:])
lock(&proflock)
b := stkbucket(memProfile, size, stk[:nstk], true)
+ c := mProf.cycle
mp := b.mp()
- mp.recent_allocs++
- mp.recent_alloc_bytes += size
+ mpc := &mp.future[(c+2)%uint32(len(mp.future))]
+ mpc.allocs++
+ mpc.alloc_bytes += size
unlock(&proflock)
// Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock.
@@ -262,9 +361,11 @@ func mProf_Malloc(p unsafe.Pointer, size uintptr) {
// Called when freeing a profiled block.
func mProf_Free(b *bucket, size uintptr) {
lock(&proflock)
+ c := mProf.cycle
mp := b.mp()
- mp.prev_frees++
- mp.prev_free_bytes += size
+ mpc := &mp.future[(c+1)%uint32(len(mp.future))]
+ mpc.frees++
+ mpc.free_bytes += size
unlock(&proflock)
}
@@ -298,7 +399,7 @@ func blockevent(cycles int64, skip int) {
cycles = 1
}
if blocksampled(cycles) {
- saveblockevent(cycles, skip+1, blockProfile, &blockprofilerate)
+ saveblockevent(cycles, skip+1, blockProfile)
}
}
@@ -310,7 +411,7 @@ func blocksampled(cycles int64) bool {
return true
}
-func saveblockevent(cycles int64, skip int, which bucketType, ratep *uint64) {
+func saveblockevent(cycles int64, skip int, which bucketType) {
gp := getg()
var nstk int
var stk [maxStack]location
@@ -355,7 +456,7 @@ func mutexevent(cycles int64, skip int) {
// TODO(pjw): measure impact of always calling fastrand vs using something
// like malloc.go:nextSample()
if rate > 0 && int64(fastrand())%rate == 0 {
- saveblockevent(cycles, skip+1, mutexProfile, &mutexprofilerate)
+ saveblockevent(cycles, skip+1, mutexProfile)
}
}
@@ -443,13 +544,17 @@ func (r *MemProfileRecord) Stack() []uintptr {
// of calling MemProfile directly.
func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
lock(&proflock)
+ // If we're between mProf_NextCycle and mProf_Flush, take care
+ // of flushing to the active profile so we only have to look
+ // at the active profile below.
+ mProf_FlushLocked()
clear := true
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- if inuseZero || mp.alloc_bytes != mp.free_bytes {
+ if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
n++
}
- if mp.allocs != 0 || mp.frees != 0 {
+ if mp.active.allocs != 0 || mp.active.frees != 0 {
clear = false
}
}
@@ -457,13 +562,15 @@ func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
// Absolutely no data, suggesting that a garbage collection
// has not yet happened. In order to allow profiling when
// garbage collection is disabled from the beginning of execution,
- // accumulate stats as if a GC just happened, and recount buckets.
- mprof_GC()
- mprof_GC()
+ // accumulate all of the cycles, and recount buckets.
n = 0
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- if inuseZero || mp.alloc_bytes != mp.free_bytes {
+ for c := range mp.future {
+ mp.active.add(&mp.future[c])
+ mp.future[c] = memRecordCycle{}
+ }
+ if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
n++
}
}
@@ -473,7 +580,7 @@ func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
idx := 0
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- if inuseZero || mp.alloc_bytes != mp.free_bytes {
+ if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
record(&p[idx], b)
idx++
}
@@ -486,10 +593,10 @@ func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
// Write b's data to r.
func record(r *MemProfileRecord, b *bucket) {
mp := b.mp()
- r.AllocBytes = int64(mp.alloc_bytes)
- r.FreeBytes = int64(mp.free_bytes)
- r.AllocObjects = int64(mp.allocs)
- r.FreeObjects = int64(mp.frees)
+ r.AllocBytes = int64(mp.active.alloc_bytes)
+ r.FreeBytes = int64(mp.active.free_bytes)
+ r.AllocObjects = int64(mp.active.allocs)
+ r.FreeObjects = int64(mp.active.frees)
for i, loc := range b.stk() {
if i >= len(r.Stack0) {
break
@@ -505,7 +612,7 @@ func iterate_memprof(fn func(*bucket, uintptr, *location, uintptr, uintptr, uint
lock(&proflock)
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- fn(b, b.nstk, &b.stk()[0], b.size, mp.allocs, mp.frees)
+ fn(b, b.nstk, &b.stk()[0], b.size, mp.active.allocs, mp.active.frees)
}
unlock(&proflock)
}
diff --git a/libgo/go/runtime/msize.go b/libgo/go/runtime/msize.go
index 438c987..0accb83 100644
--- a/libgo/go/runtime/msize.go
+++ b/libgo/go/runtime/msize.go
@@ -9,28 +9,6 @@
package runtime
-// sizeToClass(0 <= n <= MaxSmallSize) returns the size class,
-// 1 <= sizeclass < NumSizeClasses, for n.
-// Size class 0 is reserved to mean "not small".
-//
-// The sizeToClass lookup is implemented using two arrays,
-// one mapping sizes <= 1024 to their class and one mapping
-// sizes >= 1024 and <= MaxSmallSize to their class.
-// All objects are 8-aligned, so the first array is indexed by
-// the size divided by 8 (rounded up). Objects >= 1024 bytes
-// are 128-aligned, so the second array is indexed by the
-// size divided by 128 (rounded up). The arrays are constants
-// in sizeclass.go generated by mksizeclass.go.
-func sizeToClass(size uint32) uint32 {
- if size > _MaxSmallSize {
- throw("invalid size")
- }
- if size > smallSizeMax-8 {
- return uint32(size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv])
- }
- return uint32(size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv])
-}
-
// Returns size of the memory block that mallocgc will allocate if you ask for the size.
func roundupsize(size uintptr) uintptr {
if size < _MaxSmallSize {
diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go
index aa3cfef..71dc223 100644
--- a/libgo/go/runtime/mstats.go
+++ b/libgo/go/runtime/mstats.go
@@ -33,13 +33,12 @@ type mstats struct {
// Statistics about malloc heap.
// Protected by mheap.lock
//
- // In mstats, heap_sys and heap_inuse includes stack memory,
- // while in MemStats stack memory is separated out from the
- // heap stats.
+ // Like MemStats, heap_sys and heap_inuse do not count memory
+ // in manually-managed spans.
heap_alloc uint64 // bytes allocated and not yet freed (same as alloc above)
- heap_sys uint64 // virtual address space obtained from system
+ heap_sys uint64 // virtual address space obtained from system for GC'd heap
heap_idle uint64 // bytes in idle spans
- heap_inuse uint64 // bytes in non-idle spans
+ heap_inuse uint64 // bytes in _MSpanInUse spans
heap_released uint64 // bytes released to the os
heap_objects uint64 // total number of allocated objects
@@ -59,7 +58,7 @@ type mstats struct {
// Statistics about allocation of low-level fixed-size structures.
// Protected by FixAlloc locks.
- stacks_inuse uint64 // this number is included in heap_inuse above; differs from MemStats.StackInuse
+ stacks_inuse uint64 // bytes in manually-managed stack spans
stacks_sys uint64 // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
mspan_inuse uint64 // mspan structures
mspan_sys uint64
@@ -72,7 +71,7 @@ type mstats struct {
// Statistics about garbage collector.
// Protected by mheap or stopping the world during GC.
next_gc uint64 // goal heap_live for when next GC ends; ^0 if disabled
- last_gc uint64 // last gc (in absolute time)
+ last_gc_unix uint64 // last gc (in unix time)
pause_total_ns uint64
pause_ns [256]uint64 // circular buffer of recent gc pause lengths
pause_end [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970)
@@ -92,13 +91,26 @@ type mstats struct {
// Statistics below here are not exported to MemStats directly.
- tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
+ last_gc_nanotime uint64 // last gc (monotonic time)
+ tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
+
+ // triggerRatio is the heap growth ratio that triggers marking.
+ //
+ // E.g., if this is 0.6, then GC should start when the live
+ // heap has reached 1.6 times the heap size marked by the
+ // previous cycle. This should be ≤ GOGC/100 so the trigger
+ // heap size is less than the goal heap size. This is set
+ // during mark termination for the next cycle's trigger.
+ triggerRatio float64
// gc_trigger is the heap size that triggers marking.
//
// When heap_live ≥ gc_trigger, the mark phase will start.
// This is also the heap size by which proportional sweeping
// must be complete.
+ //
+ // This is computed from triggerRatio during mark termination
+ // for the next cycle's trigger.
gc_trigger uint64
// heap_live is the number of bytes considered live by the GC.
@@ -121,6 +133,8 @@ type mstats struct {
// leads to a conservative GC rate rather than a GC rate that
// is potentially too low.
//
+ // Reads should likewise be atomic (or during STW).
+ //
// Whenever this is updated, call traceHeapAlloc() and
// gcController.revise().
heap_live uint64
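As a worked example of the relationship described in the triggerRatio and gc_trigger comments above (a sketch of the arithmetic only, not the pacer's actual code): with 100 MB marked live by the previous cycle, triggerRatio 0.6 and GOGC=100, marking starts at 160 MB, below the 200 MB goal.

package main

import "fmt"

func main() {
	const markedMB = 100.0      // live heap marked by the previous cycle
	const triggerRatio = 0.6    // heap growth ratio that triggers marking
	const gogc = 100.0          // GOGC

	trigger := markedMB * (1 + triggerRatio) // heap_live at which marking starts
	goal := markedMB * (1 + gogc/100)        // heap size the cycle aims to stay under
	fmt.Printf("trigger=%.0f MB goal=%.0f MB\n", trigger, goal) // trigger=160 MB goal=200 MB
}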
@@ -451,22 +465,18 @@ func ReadMemStats(m *MemStats) {
}
func readmemstats_m(stats *MemStats) {
- updatememstats(nil)
+ updatememstats()
// The size of the trailing by_size array differs between
// mstats and MemStats. NumSizeClasses was changed, but we
// cannot change MemStats because of backward compatibility.
memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats)
- // Stack numbers are part of the heap numbers, separate those out for user consumption
+ // memstats.stacks_sys is only memory mapped directly for OS stacks.
+ // Add in heap-allocated stack memory for user consumption.
stats.StackSys += stats.StackInuse
- stats.HeapInuse -= stats.StackInuse
- stats.HeapSys -= stats.StackInuse
}
-// For gccgo this is in runtime/mgc0.c.
-func updatememstats(stats *gcstats)
-
//go:linkname readGCStats runtime_debug.readGCStats
func readGCStats(pauses *[]uint64) {
systemstack(func() {
@@ -500,7 +510,7 @@ func readGCStats_m(pauses *[]uint64) {
p[n+i] = memstats.pause_end[j]
}
- p[n+n] = memstats.last_gc
+ p[n+n] = memstats.last_gc_unix
p[n+n+1] = uint64(memstats.numgc)
p[n+n+2] = memstats.pause_total_ns
unlock(&mheap_.lock)
@@ -508,26 +518,15 @@ func readGCStats_m(pauses *[]uint64) {
}
//go:nowritebarrier
-func updatememstats(stats *gcstats) {
- if stats != nil {
- *stats = gcstats{}
- }
- for mp := allm; mp != nil; mp = mp.alllink {
- if stats != nil {
- src := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(&mp.gcstats))
- dst := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(stats))
- for i, v := range src {
- dst[i] += v
- }
- mp.gcstats = gcstats{}
- }
- }
-
+func updatememstats() {
memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys
+ // We also count stacks_inuse as sys memory.
+ memstats.sys += memstats.stacks_inuse
+
// Calculate memory allocator stats.
// During program execution we only count number of frees and amount of freed memory.
// Current number of alive objects in the heap and amount of alive heap memory
@@ -550,45 +549,49 @@ func updatememstats(stats *gcstats) {
// Aggregate local stats.
cachestats()
- // Scan all spans and count number of alive objects.
- lock(&mheap_.lock)
- for _, s := range mheap_.allspans {
- if s.state != mSpanInUse {
+ // Collect allocation stats. This is safe and consistent
+ // because the world is stopped.
+ var smallFree, totalAlloc, totalFree uint64
+ // Collect per-spanclass stats.
+ for spc := range mheap_.central {
+ // The mcaches are now empty, so mcentral stats are
+ // up-to-date.
+ c := &mheap_.central[spc].mcentral
+ memstats.nmalloc += c.nmalloc
+ i := spanClass(spc).sizeclass()
+ memstats.by_size[i].nmalloc += c.nmalloc
+ totalAlloc += c.nmalloc * uint64(class_to_size[i])
+ }
+ // Collect per-sizeclass stats.
+ for i := 0; i < _NumSizeClasses; i++ {
+ if i == 0 {
+ memstats.nmalloc += mheap_.nlargealloc
+ totalAlloc += mheap_.largealloc
+ totalFree += mheap_.largefree
+ memstats.nfree += mheap_.nlargefree
continue
}
- if s.sizeclass == 0 {
- memstats.nmalloc++
- memstats.alloc += uint64(s.elemsize)
- } else {
- memstats.nmalloc += uint64(s.allocCount)
- memstats.by_size[s.sizeclass].nmalloc += uint64(s.allocCount)
- memstats.alloc += uint64(s.allocCount) * uint64(s.elemsize)
- }
- }
- unlock(&mheap_.lock)
- // Aggregate by size class.
- smallfree := uint64(0)
- memstats.nfree = mheap_.nlargefree
- for i := 0; i < len(memstats.by_size); i++ {
+ // The mcache stats have been flushed to mheap_.
memstats.nfree += mheap_.nsmallfree[i]
memstats.by_size[i].nfree = mheap_.nsmallfree[i]
- memstats.by_size[i].nmalloc += mheap_.nsmallfree[i]
- smallfree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
+ smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
}
+ totalFree += smallFree
+
memstats.nfree += memstats.tinyallocs
- memstats.nmalloc += memstats.nfree
+ memstats.nmalloc += memstats.tinyallocs
// Calculate derived stats.
- memstats.total_alloc = memstats.alloc + mheap_.largefree + smallfree
+ memstats.total_alloc = totalAlloc
+ memstats.alloc = totalAlloc - totalFree
memstats.heap_alloc = memstats.alloc
memstats.heap_objects = memstats.nmalloc - memstats.nfree
}
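The derived stats computed at the end reduce to two bookkeeping identities: live bytes are cumulative bytes allocated minus cumulative bytes freed, and live objects are cumulative mallocs minus cumulative frees. A toy check with made-up numbers:

package main

import "fmt"

func main() {
	var (
		totalAlloc uint64 = 10 << 20 // bytes ever allocated
		totalFree  uint64 = 7 << 20  // bytes ever freed
		nmalloc    uint64 = 5000     // objects ever allocated (incl. tiny allocs)
		nfree      uint64 = 4200     // objects ever freed
	)
	alloc := totalAlloc - totalFree // live bytes (memstats.alloc)
	objects := nmalloc - nfree      // live objects (memstats.heap_objects)
	fmt.Println(alloc>>20, objects) // 3 800
}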
//go:nowritebarrier
func cachestats() {
- for i := 0; ; i++ {
- p := allp[i]
+ for _, p := range &allp {
if p == nil {
break
}
diff --git a/libgo/go/runtime/mstkbar.go b/libgo/go/runtime/mstkbar.go
deleted file mode 100644
index 616c220..0000000
--- a/libgo/go/runtime/mstkbar.go
+++ /dev/null
@@ -1,395 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ignore
-
-// Garbage collector: stack barriers
-//
-// Stack barriers enable the garbage collector to determine how much
- // of a goroutine stack has changed between when a stack is scanned
-// during the concurrent scan phase and when it is re-scanned during
-// the stop-the-world mark termination phase. Mark termination only
-// needs to re-scan the changed part, so for deep stacks this can
-// significantly reduce GC pause time compared to the alternative of
-// re-scanning whole stacks. The deeper the stacks, the more stack
-// barriers help.
-//
-// When stacks are scanned during the concurrent scan phase, the stack
-// scan installs stack barriers by selecting stack frames and
-// overwriting the saved return PCs (or link registers) of these
-// frames with the PC of a "stack barrier trampoline". Later, when a
-// selected frame returns, it "returns" to this trampoline instead of
-// returning to its actual caller. The trampoline records that the
-// stack has unwound past this frame and jumps to the original return
-// PC recorded when the stack barrier was installed. Mark termination
-// re-scans only as far as the first frame that hasn't hit a stack
- // barrier and then removes any un-hit stack barriers.
-//
-// This scheme is very lightweight. No special code is required in the
-// mutator to record stack unwinding and the trampoline is only a few
-// assembly instructions.
-//
-// Book-keeping
-// ------------
-//
-// The primary cost of stack barriers is book-keeping: the runtime has
-// to record the locations of all stack barriers and the original
-// return PCs in order to return to the correct caller when a stack
-// barrier is hit and so it can remove un-hit stack barriers. In order
-// to minimize this cost, the Go runtime places stack barriers in
-// exponentially-spaced frames, starting 1K past the current frame.
-// The book-keeping structure hence grows logarithmically with the
-// size of the stack and mark termination re-scans at most twice as
-// much stack as necessary.
-//
-// The runtime reserves space for this book-keeping structure at the
-// top of the stack allocation itself (just above the outermost
-// frame). This is necessary because the regular memory allocator can
-// itself grow the stack, and hence can't be used when allocating
-// stack-related structures.
-//
-// For debugging, the runtime also supports installing stack barriers
-// at every frame. However, this requires significantly more
-// book-keeping space.
-//
-// Correctness
-// -----------
-//
-// The runtime and the compiler cooperate to ensure that all objects
-// reachable from the stack as of mark termination are marked.
-// Anything unchanged since the concurrent scan phase will be marked
-// because it is marked by the concurrent scan. After the concurrent
-// scan, there are three possible classes of stack modifications that
-// must be tracked:
-//
-// 1) Mutator writes below the lowest un-hit stack barrier. This
-// includes all writes performed by an executing function to its own
-// stack frame. This part of the stack will be re-scanned by mark
-// termination, which will mark any objects made reachable from
-// modifications to this part of the stack.
-//
-// 2) Mutator writes above the lowest un-hit stack barrier. It's
-// possible for a mutator to modify the stack above the lowest un-hit
-// stack barrier if a higher frame has passed down a pointer to a
-// stack variable in its frame. This is called an "up-pointer". The
-// compiler ensures that writes through up-pointers have an
-// accompanying write barrier (it simply doesn't distinguish between
-// writes through up-pointers and writes through heap pointers). This
-// write barrier marks any object made reachable from modifications to
-// this part of the stack.
-//
-// 3) Runtime writes to the stack. Various runtime operations such as
-// sends to unbuffered channels can write to arbitrary parts of the
-// stack, including above the lowest un-hit stack barrier. We solve
-// this in two ways. In many cases, the runtime can perform an
-// explicit write barrier operation like in case 2. However, in the
-// case of bulk memory move (typedmemmove), the runtime doesn't
- // necessarily have ready access to a pointer bitmap for the memory
-// being copied, so it simply unwinds any stack barriers below the
-// destination.
-//
-// Gotchas
-// -------
-//
-// Anything that inspects or manipulates the stack potentially needs
-// to understand stack barriers. The most obvious case is that
-// gentraceback needs to use the original return PC when it encounters
-// the stack barrier trampoline. Anything that unwinds the stack such
-// as panic/recover must unwind stack barriers in tandem with
-// unwinding the stack.
-//
-// Stack barriers require that any goroutine whose stack has been
-// scanned must execute write barriers. Go solves this by simply
-// enabling write barriers globally during the concurrent scan phase.
-// However, traditionally, write barriers are not enabled during this
-// phase.
-//
-// Synchronization
-// ---------------
-//
-// For the most part, accessing and modifying stack barriers is
-// synchronized around GC safe points. Installing stack barriers
-// forces the G to a safe point, while all other operations that
-// modify stack barriers run on the G and prevent it from reaching a
-// safe point.
-//
-// Subtlety arises when a G may be tracebacked when *not* at a safe
-// point. This happens during sigprof. For this, each G has a "stack
-// barrier lock" (see gcLockStackBarriers, gcUnlockStackBarriers).
-// Operations that manipulate stack barriers acquire this lock, while
-// sigprof tries to acquire it and simply skips the traceback if it
-// can't acquire it. There is one exception for performance and
-// complexity reasons: hitting a stack barrier manipulates the stack
-// barrier list without acquiring the stack barrier lock. For this,
-// gentraceback performs a special fix up if the traceback starts in
-// the stack barrier function.
-
-package runtime
-
-import (
- "runtime/internal/atomic"
- "runtime/internal/sys"
- "unsafe"
-)
-
-const debugStackBarrier = false
-
-// firstStackBarrierOffset is the approximate byte offset at
-// which to place the first stack barrier from the current SP.
-// This is a lower bound on how much stack will have to be
-// re-scanned during mark termination. Subsequent barriers are
-// placed at firstStackBarrierOffset * 2^n offsets.
-//
-// For debugging, this can be set to 0, which will install a
-// stack barrier at every frame. If you do this, you may also
-// have to raise _StackMin, since the stack barrier
-// bookkeeping will use a large amount of each stack.
-var firstStackBarrierOffset = 1024
-
-// gcMaxStackBarriers returns the maximum number of stack barriers
-// that can be installed in a stack of stackSize bytes.
-func gcMaxStackBarriers(stackSize int) (n int) {
- if debug.gcstackbarrieroff > 0 {
- return 0
- }
-
- if firstStackBarrierOffset == 0 {
- // Special debugging case for inserting stack barriers
- // at every frame. Steal half of the stack for the
- // []stkbar. Technically, if the stack were to consist
- // solely of return PCs we would need two thirds of
- // the stack, but stealing that much breaks things and
- // this doesn't happen in practice.
- return stackSize / 2 / int(unsafe.Sizeof(stkbar{}))
- }
-
- offset := firstStackBarrierOffset
- for offset < stackSize {
- n++
- offset *= 2
- }
- return n + 1
-}
-
-// gcInstallStackBarrier installs a stack barrier over the return PC of frame.
-//go:nowritebarrier
-func gcInstallStackBarrier(gp *g, frame *stkframe) bool {
- if frame.lr == 0 {
- if debugStackBarrier {
- print("not installing stack barrier with no LR, goid=", gp.goid, "\n")
- }
- return false
- }
-
- if frame.fn.entry == cgocallback_gofuncPC {
- // cgocallback_gofunc doesn't return to its LR;
- // instead, its return path puts LR in g.sched.pc and
- // switches back to the system stack on which
- // cgocallback_gofunc was originally called. We can't
- // have a stack barrier in g.sched.pc, so don't
- // install one in this frame.
- if debugStackBarrier {
- print("not installing stack barrier over LR of cgocallback_gofunc, goid=", gp.goid, "\n")
- }
- return false
- }
-
- // Save the return PC and overwrite it with stackBarrier.
- var lrUintptr uintptr
- if usesLR {
- lrUintptr = frame.sp
- } else {
- lrUintptr = frame.fp - sys.RegSize
- }
- lrPtr := (*sys.Uintreg)(unsafe.Pointer(lrUintptr))
- if debugStackBarrier {
- print("install stack barrier at ", hex(lrUintptr), " over ", hex(*lrPtr), ", goid=", gp.goid, "\n")
- if uintptr(*lrPtr) != frame.lr {
- print("frame.lr=", hex(frame.lr))
- throw("frame.lr differs from stack LR")
- }
- }
-
- gp.stkbar = gp.stkbar[:len(gp.stkbar)+1]
- stkbar := &gp.stkbar[len(gp.stkbar)-1]
- stkbar.savedLRPtr = lrUintptr
- stkbar.savedLRVal = uintptr(*lrPtr)
- *lrPtr = sys.Uintreg(stackBarrierPC)
- return true
-}
-
-// gcRemoveStackBarriers removes all stack barriers installed in gp's stack.
-//
-// gp's stack barriers must be locked.
-//
-//go:nowritebarrier
-func gcRemoveStackBarriers(gp *g) {
- if debugStackBarrier && gp.stkbarPos != 0 {
- print("hit ", gp.stkbarPos, " stack barriers, goid=", gp.goid, "\n")
- }
-
- // Remove stack barriers that we didn't hit.
- for _, stkbar := range gp.stkbar[gp.stkbarPos:] {
- gcRemoveStackBarrier(gp, stkbar)
- }
-
- // Clear recorded stack barriers so copystack doesn't try to
- // adjust them.
- gp.stkbarPos = 0
- gp.stkbar = gp.stkbar[:0]
-}
-
-// gcRemoveStackBarrier removes a single stack barrier. It is the
-// inverse operation of gcInstallStackBarrier.
-//
-// This is nosplit to ensure gp's stack does not move.
-//
-//go:nowritebarrier
-//go:nosplit
-func gcRemoveStackBarrier(gp *g, stkbar stkbar) {
- if debugStackBarrier {
- print("remove stack barrier at ", hex(stkbar.savedLRPtr), " with ", hex(stkbar.savedLRVal), ", goid=", gp.goid, "\n")
- }
- lrPtr := (*sys.Uintreg)(unsafe.Pointer(stkbar.savedLRPtr))
- if val := *lrPtr; val != sys.Uintreg(stackBarrierPC) {
- printlock()
- print("at *", hex(stkbar.savedLRPtr), " expected stack barrier PC ", hex(stackBarrierPC), ", found ", hex(val), ", goid=", gp.goid, "\n")
- print("gp.stkbar=")
- gcPrintStkbars(gp, -1)
- print(", gp.stack=[", hex(gp.stack.lo), ",", hex(gp.stack.hi), ")\n")
- throw("stack barrier lost")
- }
- *lrPtr = sys.Uintreg(stkbar.savedLRVal)
-}
-
-// gcTryRemoveAllStackBarriers tries to remove stack barriers from all
-// Gs in gps. It is best-effort and efficient. If it can't remove
-// barriers from a G immediately, it will simply skip it.
-func gcTryRemoveAllStackBarriers(gps []*g) {
- for _, gp := range gps {
- retry:
- for {
- switch s := readgstatus(gp); s {
- default:
- break retry
-
- case _Grunnable, _Gsyscall, _Gwaiting:
- if !castogscanstatus(gp, s, s|_Gscan) {
- continue
- }
- gcLockStackBarriers(gp)
- gcRemoveStackBarriers(gp)
- gcUnlockStackBarriers(gp)
- restartg(gp)
- break retry
- }
- }
- }
-}
-
-// gcPrintStkbars prints the stack barriers of gp for debugging. It
-// places a "@@@" marker at gp.stkbarPos. If marker >= 0, it will also
-// place a "==>" marker before the marker'th entry.
-func gcPrintStkbars(gp *g, marker int) {
- print("[")
- for i, s := range gp.stkbar {
- if i > 0 {
- print(" ")
- }
- if i == int(gp.stkbarPos) {
- print("@@@ ")
- }
- if i == marker {
- print("==> ")
- }
- print("*", hex(s.savedLRPtr), "=", hex(s.savedLRVal))
- }
- if int(gp.stkbarPos) == len(gp.stkbar) {
- print(" @@@")
- }
- if marker == len(gp.stkbar) {
- print(" ==>")
- }
- print("]")
-}
-
-// gcUnwindBarriers marks all stack barriers up the frame containing
-// sp as hit and removes them. This is used during stack unwinding for
-// panic/recover and by heapBitsBulkBarrier to force stack re-scanning
-// when its destination is on the stack.
-//
-// This is nosplit to ensure gp's stack does not move.
-//
-//go:nosplit
-func gcUnwindBarriers(gp *g, sp uintptr) {
- gcLockStackBarriers(gp)
- // On LR machines, if there is a stack barrier on the return
- // from the frame containing sp, this will mark it as hit even
- // though it isn't, but it's okay to be conservative.
- before := gp.stkbarPos
- for int(gp.stkbarPos) < len(gp.stkbar) && gp.stkbar[gp.stkbarPos].savedLRPtr < sp {
- gcRemoveStackBarrier(gp, gp.stkbar[gp.stkbarPos])
- gp.stkbarPos++
- }
- gcUnlockStackBarriers(gp)
- if debugStackBarrier && gp.stkbarPos != before {
- print("skip barriers below ", hex(sp), " in goid=", gp.goid, ": ")
- // We skipped barriers between the "==>" marker
- // (before) and the "@@@" marker (gp.stkbarPos).
- gcPrintStkbars(gp, int(before))
- print("\n")
- }
-}
-
-// nextBarrierPC returns the original return PC of the next stack barrier.
-// Used by getcallerpc, so it must be nosplit.
-//go:nosplit
-func nextBarrierPC() uintptr {
- gp := getg()
- return gp.stkbar[gp.stkbarPos].savedLRVal
-}
-
-// setNextBarrierPC sets the return PC of the next stack barrier.
-// Used by setcallerpc, so it must be nosplit.
-//go:nosplit
-func setNextBarrierPC(pc uintptr) {
- gp := getg()
- gcLockStackBarriers(gp)
- gp.stkbar[gp.stkbarPos].savedLRVal = pc
- gcUnlockStackBarriers(gp)
-}
-
-// gcLockStackBarriers synchronizes with tracebacks of gp's stack
-// during sigprof for installation or removal of stack barriers. It
-// blocks until any current sigprof is done tracebacking gp's stack
-// and then disallows profiling tracebacks of gp's stack.
-//
-// This is necessary because a sigprof during barrier installation or
-// removal could observe inconsistencies between the stkbar array and
-// the stack itself and crash.
-//
-//go:nosplit
-func gcLockStackBarriers(gp *g) {
- // Disable preemption so scanstack cannot run while the caller
- // is manipulating the stack barriers.
- acquirem()
- for !atomic.Cas(&gp.stackLock, 0, 1) {
- osyield()
- }
-}
-
-//go:nosplit
-func gcTryLockStackBarriers(gp *g) bool {
- mp := acquirem()
- result := atomic.Cas(&gp.stackLock, 0, 1)
- if !result {
- releasem(mp)
- }
- return result
-}
-
-func gcUnlockStackBarriers(gp *g) {
- atomic.Store(&gp.stackLock, 0)
- releasem(getg().m)
-}
diff --git a/libgo/go/runtime/net_plan9.go b/libgo/go/runtime/net_plan9.go
index 10fd089..77ae8c6 100644
--- a/libgo/go/runtime/net_plan9.go
+++ b/libgo/go/runtime/net_plan9.go
@@ -8,12 +8,12 @@ import (
_ "unsafe"
)
-//go:linkname runtime_ignoreHangup net.runtime_ignoreHangup
+//go:linkname runtime_ignoreHangup internal_poll.runtime_ignoreHangup
func runtime_ignoreHangup() {
getg().m.ignoreHangup = true
}
-//go:linkname runtime_unignoreHangup net.runtime_unignoreHangup
+//go:linkname runtime_unignoreHangup internal_poll.runtime_unignoreHangup
func runtime_unignoreHangup(sig string) {
getg().m.ignoreHangup = false
}
diff --git a/libgo/go/runtime/netpoll.go b/libgo/go/runtime/netpoll.go
index 8932455..e9bbfecb 100644
--- a/libgo/go/runtime/netpoll.go
+++ b/libgo/go/runtime/netpoll.go
@@ -80,12 +80,13 @@ type pollCache struct {
}
var (
- netpollInited uint32
- pollcache pollCache
+ netpollInited uint32
+ pollcache pollCache
+ netpollWaiters uint32
)
-//go:linkname net_runtime_pollServerInit net.runtime_pollServerInit
-func net_runtime_pollServerInit() {
+//go:linkname poll_runtime_pollServerInit internal_poll.runtime_pollServerInit
+func poll_runtime_pollServerInit() {
netpollinit()
atomic.Store(&netpollInited, 1)
}
@@ -94,15 +95,23 @@ func netpollinited() bool {
return atomic.Load(&netpollInited) != 0
}
-//go:linkname net_runtime_pollOpen net.runtime_pollOpen
-func net_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
+//go:linkname poll_runtime_pollServerDescriptor internal_poll.runtime_pollServerDescriptor
+
+// poll_runtime_pollServerDescriptor returns the descriptor being used,
+// or ^uintptr(0) if the system does not use a poll descriptor.
+func poll_runtime_pollServerDescriptor() uintptr {
+ return netpolldescriptor()
+}
+
+//go:linkname poll_runtime_pollOpen internal_poll.runtime_pollOpen
+func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
pd := pollcache.alloc()
lock(&pd.lock)
if pd.wg != 0 && pd.wg != pdReady {
- throw("netpollOpen: blocked write on free descriptor")
+ throw("runtime: blocked write on free polldesc")
}
if pd.rg != 0 && pd.rg != pdReady {
- throw("netpollOpen: blocked read on free descriptor")
+ throw("runtime: blocked read on free polldesc")
}
pd.fd = fd
pd.closing = false
@@ -118,16 +127,16 @@ func net_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
return pd, int(errno)
}
-//go:linkname net_runtime_pollClose net.runtime_pollClose
-func net_runtime_pollClose(pd *pollDesc) {
+//go:linkname poll_runtime_pollClose internal_poll.runtime_pollClose
+func poll_runtime_pollClose(pd *pollDesc) {
if !pd.closing {
- throw("netpollClose: close w/o unblock")
+ throw("runtime: close polldesc w/o unblock")
}
if pd.wg != 0 && pd.wg != pdReady {
- throw("netpollClose: blocked write on closing descriptor")
+ throw("runtime: blocked write on closing polldesc")
}
if pd.rg != 0 && pd.rg != pdReady {
- throw("netpollClose: blocked read on closing descriptor")
+ throw("runtime: blocked read on closing polldesc")
}
netpollclose(pd.fd)
pollcache.free(pd)
@@ -140,8 +149,8 @@ func (c *pollCache) free(pd *pollDesc) {
unlock(&c.lock)
}
-//go:linkname net_runtime_pollReset net.runtime_pollReset
-func net_runtime_pollReset(pd *pollDesc, mode int) int {
+//go:linkname poll_runtime_pollReset internal_poll.runtime_pollReset
+func poll_runtime_pollReset(pd *pollDesc, mode int) int {
err := netpollcheckerr(pd, int32(mode))
if err != 0 {
return err
@@ -154,8 +163,8 @@ func net_runtime_pollReset(pd *pollDesc, mode int) int {
return 0
}
-//go:linkname net_runtime_pollWait net.runtime_pollWait
-func net_runtime_pollWait(pd *pollDesc, mode int) int {
+//go:linkname poll_runtime_pollWait internal_poll.runtime_pollWait
+func poll_runtime_pollWait(pd *pollDesc, mode int) int {
err := netpollcheckerr(pd, int32(mode))
if err != 0 {
return err
@@ -176,16 +185,16 @@ func net_runtime_pollWait(pd *pollDesc, mode int) int {
return 0
}
-//go:linkname net_runtime_pollWaitCanceled net.runtime_pollWaitCanceled
-func net_runtime_pollWaitCanceled(pd *pollDesc, mode int) {
+//go:linkname poll_runtime_pollWaitCanceled internal_poll.runtime_pollWaitCanceled
+func poll_runtime_pollWaitCanceled(pd *pollDesc, mode int) {
// This function is used only on windows after a failed attempt to cancel
// a pending async IO operation. Wait for ioready, ignore closing or timeouts.
for !netpollblock(pd, int32(mode), true) {
}
}
-//go:linkname net_runtime_pollSetDeadline net.runtime_pollSetDeadline
-func net_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) {
+//go:linkname poll_runtime_pollSetDeadline internal_poll.runtime_pollSetDeadline
+func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) {
lock(&pd.lock)
if pd.closing {
unlock(&pd.lock)
@@ -247,18 +256,18 @@ func net_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) {
}
unlock(&pd.lock)
if rg != nil {
- goready(rg, 3)
+ netpollgoready(rg, 3)
}
if wg != nil {
- goready(wg, 3)
+ netpollgoready(wg, 3)
}
}
-//go:linkname net_runtime_pollUnblock net.runtime_pollUnblock
-func net_runtime_pollUnblock(pd *pollDesc) {
+//go:linkname poll_runtime_pollUnblock internal_poll.runtime_pollUnblock
+func poll_runtime_pollUnblock(pd *pollDesc) {
lock(&pd.lock)
if pd.closing {
- throw("netpollUnblock: already closing")
+ throw("runtime: unblock on closing polldesc")
}
pd.closing = true
pd.seq++
@@ -276,10 +285,10 @@ func net_runtime_pollUnblock(pd *pollDesc) {
}
unlock(&pd.lock)
if rg != nil {
- goready(rg, 3)
+ netpollgoready(rg, 3)
}
if wg != nil {
- goready(wg, 3)
+ netpollgoready(wg, 3)
}
}
@@ -315,7 +324,19 @@ func netpollcheckerr(pd *pollDesc, mode int32) int {
}
func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool {
- return atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
+ r := atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
+ if r {
+ // Bump the count of goroutines waiting for the poller.
+ // The scheduler uses this to decide whether to block
+ // waiting for the poller if there is nothing else to do.
+ atomic.Xadd(&netpollWaiters, 1)
+ }
+ return r
+}
+
+func netpollgoready(gp *g, traceskip int) {
+ atomic.Xadd(&netpollWaiters, -1)
+ goready(gp, traceskip+1)
}
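netpollWaiters is simply an atomic count of goroutines parked in the poller: incremented when a block commits, decremented just before the goroutine is made runnable, and read by the scheduler to decide whether blocking in netpoll could produce runnable work. A minimal standalone sketch of that counting pattern (the names below are hypothetical stand-ins):

package main

import (
	"fmt"
	"sync/atomic"
)

var waiters int32 // number of goroutines waiting on the poller

func parkCommitted() { atomic.AddInt32(&waiters, 1) }  // like netpollblockcommit
func madeReady()     { atomic.AddInt32(&waiters, -1) } // like netpollgoready

// pollerHasWaiters is what a scheduler-style caller would consult.
func pollerHasWaiters() bool { return atomic.LoadInt32(&waiters) > 0 }

func main() {
	parkCommitted()
	fmt.Println(pollerHasWaiters()) // true
	madeReady()
	fmt.Println(pollerHasWaiters()) // false
}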
// returns true if IO is ready, or false if timedout or closed
@@ -334,7 +355,7 @@ func netpollblock(pd *pollDesc, mode int32, waitio bool) bool {
return true
}
if old != 0 {
- throw("netpollblock: double wait")
+ throw("runtime: double wait")
}
if atomic.Casuintptr(gpp, 0, pdWait) {
break
@@ -350,7 +371,7 @@ func netpollblock(pd *pollDesc, mode int32, waitio bool) bool {
// be careful to not lose concurrent READY notification
old := atomic.Xchguintptr(gpp, 0)
if old > pdWait {
- throw("netpollblock: corrupted state")
+ throw("runtime: corrupted polldesc")
}
return old == pdReady
}
@@ -396,7 +417,7 @@ func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) {
var rg *g
if read {
if pd.rd <= 0 || pd.rt.f == nil {
- throw("netpolldeadlineimpl: inconsistent read deadline")
+ throw("runtime: inconsistent read deadline")
}
pd.rd = -1
atomicstorep(unsafe.Pointer(&pd.rt.f), nil) // full memory barrier between store to rd and load of rg in netpollunblock
@@ -405,7 +426,7 @@ func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) {
var wg *g
if write {
if pd.wd <= 0 || pd.wt.f == nil && !read {
- throw("netpolldeadlineimpl: inconsistent write deadline")
+ throw("runtime: inconsistent write deadline")
}
pd.wd = -1
atomicstorep(unsafe.Pointer(&pd.wt.f), nil) // full memory barrier between store to wd and load of wg in netpollunblock
@@ -413,10 +434,10 @@ func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) {
}
unlock(&pd.lock)
if rg != nil {
- goready(rg, 0)
+ netpollgoready(rg, 0)
}
if wg != nil {
- goready(wg, 0)
+ netpollgoready(wg, 0)
}
}
diff --git a/libgo/go/runtime/netpoll_epoll.go b/libgo/go/runtime/netpoll_epoll.go
index 247692e..ced399d 100644
--- a/libgo/go/runtime/netpoll_epoll.go
+++ b/libgo/go/runtime/netpoll_epoll.go
@@ -47,6 +47,10 @@ func netpollinit() {
throw("netpollinit: failed to create descriptor")
}
+func netpolldescriptor() uintptr {
+ return uintptr(epfd)
+}
+
func netpollopen(fd uintptr, pd *pollDesc) int32 {
var ev epollevent
ev.events = _EPOLLIN | _EPOLLOUT | _EPOLLRDHUP | _EPOLLETpos
@@ -66,7 +70,7 @@ func netpollclose(fd uintptr) int32 {
}
func netpollarm(pd *pollDesc, mode int) {
- throw("unused")
+ throw("runtime: unused")
}
// polls for ready network connections
@@ -86,7 +90,7 @@ retry:
e := errno()
if e != _EINTR {
println("runtime: epollwait on fd", epfd, "failed with", e)
- throw("epollwait failed")
+ throw("runtime: netpoll failed")
}
goto retry
}
diff --git a/libgo/go/runtime/netpoll_kqueue.go b/libgo/go/runtime/netpoll_kqueue.go
index eae4f21..47927fe 100644
--- a/libgo/go/runtime/netpoll_kqueue.go
+++ b/libgo/go/runtime/netpoll_kqueue.go
@@ -32,11 +32,15 @@ func netpollinit() {
kq = kqueue()
if kq < 0 {
println("netpollinit: kqueue failed with", errno())
- throw("netpollinit: kqueue failed")
+ throw("runtime: netpollinit failed")
}
closeonexec(kq)
}
+func netpolldescriptor() uintptr {
+ return uintptr(kq)
+}
+
func netpollopen(fd uintptr, pd *pollDesc) int32 {
// Arm both EVFILT_READ and EVFILT_WRITE in edge-triggered mode (EV_CLEAR)
// for the whole fd lifetime. The notifications are automatically unregistered
@@ -64,7 +68,7 @@ func netpollclose(fd uintptr) int32 {
}
func netpollarm(pd *pollDesc, mode int) {
- throw("unused")
+ throw("runtime: unused")
}
// Polls for ready network connections.
@@ -85,7 +89,7 @@ retry:
e := errno()
if e != _EINTR {
println("runtime: kevent on fd", kq, "failed with", e)
- throw("kevent failed")
+ throw("runtime: netpoll failed")
}
goto retry
}
diff --git a/libgo/go/runtime/netpoll_nacl.go b/libgo/go/runtime/netpoll_nacl.go
index 5cbc300..dc5a55e 100644
--- a/libgo/go/runtime/netpoll_nacl.go
+++ b/libgo/go/runtime/netpoll_nacl.go
@@ -10,6 +10,10 @@ package runtime
func netpollinit() {
}
+func netpolldescriptor() uintptr {
+ return ^uintptr(0)
+}
+
func netpollopen(fd uintptr, pd *pollDesc) int32 {
return 0
}
diff --git a/libgo/go/runtime/netpoll_solaris.go b/libgo/go/runtime/netpoll_solaris.go
index cc6754c..e1e7385 100644
--- a/libgo/go/runtime/netpoll_solaris.go
+++ b/libgo/go/runtime/netpoll_solaris.go
@@ -96,8 +96,12 @@ func netpollinit() {
return
}
- print("netpollinit: failed to create port (", errno(), ")\n")
- throw("netpollinit: failed to create port")
+ print("runtime: port_create failed (errno=", errno(), ")\n")
+ throw("runtime: netpollinit failed")
+}
+
+func netpolldescriptor() uintptr {
+ return uintptr(portfd)
}
func netpollopen(fd uintptr, pd *pollDesc) int32 {
@@ -139,8 +143,8 @@ func netpollupdate(pd *pollDesc, set, clear uint32) {
}
if events != 0 && port_associate(portfd, _PORT_SOURCE_FD, pd.fd, events, uintptr(unsafe.Pointer(pd))) != 0 {
- print("netpollupdate: failed to associate (", errno(), ")\n")
- throw("netpollupdate: failed to associate")
+ print("runtime: port_associate failed (errno=", errno(), ")\n")
+ throw("runtime: netpollupdate failed")
}
pd.user = events
}
@@ -154,7 +158,7 @@ func netpollarm(pd *pollDesc, mode int) {
case 'w':
netpollupdate(pd, _POLLOUT, 0)
default:
- throw("netpollarm: bad mode")
+ throw("runtime: bad mode")
}
unlock(&pd.lock)
}
@@ -177,8 +181,8 @@ retry:
var n uint32 = 1
if port_getn(portfd, &events[0], uint32(len(events)), &n, wait) < 0 {
if e := errno(); e != _EINTR {
- print("runtime: port_getn on fd ", portfd, " failed with ", e, "\n")
- throw("port_getn failed")
+ print("runtime: port_getn on fd ", portfd, " failed (errno=", e, ")\n")
+ throw("runtime: netpoll failed")
}
goto retry
}
diff --git a/libgo/go/runtime/netpoll_stub.go b/libgo/go/runtime/netpoll_stub.go
index 09f64ad..a4d6b46 100644
--- a/libgo/go/runtime/netpoll_stub.go
+++ b/libgo/go/runtime/netpoll_stub.go
@@ -6,6 +6,8 @@
package runtime
+var netpollWaiters uint32
+
// Polls for ready network connections.
// Returns list of goroutines that become runnable.
func netpoll(block bool) (gp *g) {
diff --git a/libgo/go/runtime/netpoll_windows.go b/libgo/go/runtime/netpoll_windows.go
index 7ad1158..79dafb0 100644
--- a/libgo/go/runtime/netpoll_windows.go
+++ b/libgo/go/runtime/netpoll_windows.go
@@ -12,7 +12,8 @@ const _DWORD_MAX = 0xffffffff
const _INVALID_HANDLE_VALUE = ^uintptr(0)
-// net_op must be the same as beginning of net.operation. Keep these in sync.
+// net_op must be the same as beginning of internal/poll.operation.
+// Keep these in sync.
type net_op struct {
// used by windows
o overlapped
@@ -35,11 +36,15 @@ var iocphandle uintptr = _INVALID_HANDLE_VALUE // completion port io handle
func netpollinit() {
iocphandle = stdcall4(_CreateIoCompletionPort, _INVALID_HANDLE_VALUE, 0, 0, _DWORD_MAX)
if iocphandle == 0 {
- println("netpoll: failed to create iocp handle (errno=", getlasterror(), ")")
- throw("netpoll: failed to create iocp handle")
+ println("runtime: CreateIoCompletionPort failed (errno=", getlasterror(), ")")
+ throw("runtime: netpollinit failed")
}
}
+func netpolldescriptor() uintptr {
+ return iocphandle
+}
+
func netpollopen(fd uintptr, pd *pollDesc) int32 {
if stdcall4(_CreateIoCompletionPort, fd, iocphandle, 0, 0) == 0 {
return -int32(getlasterror())
@@ -53,7 +58,7 @@ func netpollclose(fd uintptr) int32 {
}
func netpollarm(pd *pollDesc, mode int) {
- throw("unused")
+ throw("runtime: unused")
}
// Polls for completed network IO.
@@ -89,8 +94,8 @@ retry:
if !block && errno == _WAIT_TIMEOUT {
return nil
}
- println("netpoll: GetQueuedCompletionStatusEx failed (errno=", errno, ")")
- throw("netpoll: GetQueuedCompletionStatusEx failed")
+ println("runtime: GetQueuedCompletionStatusEx failed (errno=", errno, ")")
+ throw("runtime: netpoll failed")
}
mp.blocked = false
for i = 0; i < n; i++ {
@@ -116,8 +121,8 @@ retry:
return nil
}
if op == nil {
- println("netpoll: GetQueuedCompletionStatus failed (errno=", errno, ")")
- throw("netpoll: GetQueuedCompletionStatus failed")
+ println("runtime: GetQueuedCompletionStatus failed (errno=", errno, ")")
+ throw("runtime: netpoll failed")
}
// dequeued failed IO packet, so report that
}
@@ -132,12 +137,13 @@ retry:
func handlecompletion(gpp *guintptr, op *net_op, errno int32, qty uint32) {
if op == nil {
- throw("netpoll: GetQueuedCompletionStatus returned op == nil")
+ println("runtime: GetQueuedCompletionStatus returned op == nil")
+ throw("runtime: netpoll failed")
}
mode := op.mode
if mode != 'r' && mode != 'w' {
- println("netpoll: GetQueuedCompletionStatus returned invalid mode=", mode)
- throw("netpoll: GetQueuedCompletionStatus returned invalid mode")
+ println("runtime: GetQueuedCompletionStatus returned invalid mode=", mode)
+ throw("runtime: netpoll failed")
}
op.errno = errno
op.qty = qty
diff --git a/libgo/go/runtime/numcpu_freebsd_test.go b/libgo/go/runtime/numcpu_freebsd_test.go
new file mode 100644
index 0000000..e78890a
--- /dev/null
+++ b/libgo/go/runtime/numcpu_freebsd_test.go
@@ -0,0 +1,15 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import "testing"
+
+func TestFreeBSDNumCPU(t *testing.T) {
+ got := runTestProg(t, "testprog", "FreeBSDNumCPU")
+ want := "OK\n"
+ if got != want {
+ t.Fatalf("expected %q, but got:\n%s", want, got)
+ }
+}
diff --git a/libgo/go/runtime/os_gccgo.go b/libgo/go/runtime/os_gccgo.go
index db3ea48..5709555 100644
--- a/libgo/go/runtime/os_gccgo.go
+++ b/libgo/go/runtime/os_gccgo.go
@@ -8,7 +8,7 @@ import (
"unsafe"
)
-// Temporary for C code to call:
+// For C code to call:
//go:linkname minit runtime.minit
func goenvs() {
diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go
index 43d595f..2f65603 100644
--- a/libgo/go/runtime/panic.go
+++ b/libgo/go/runtime/panic.go
@@ -450,6 +450,8 @@ func gopanic(e interface{}) {
}
gp._panic = p
+ atomic.Xadd(&runningPanicDefers, 1)
+
for {
d := gp._defer
if d == nil {
@@ -486,8 +488,8 @@ func gopanic(e interface{}) {
d._panic = nil
if p.recovered {
- // Some deferred function called recover.
- // Stop running this panic.
+ atomic.Xadd(&runningPanicDefers, -1)
+
gp._panic = p.link
// Aborted panics are marked but remain on the g.panic list.
@@ -531,6 +533,11 @@ func gopanic(e interface{}) {
// and String methods to prepare the panic strings before startpanic.
preprintpanics(gp._panic)
startpanic()
+
+ // startpanic set panicking, which will block main from exiting,
+ // so now OK to decrement runningPanicDefers.
+ atomic.Xadd(&runningPanicDefers, -1)
+
printpanics(gp._panic)
dopanic(0) // should not return
*(*int)(nil) = 0 // not reached
@@ -801,7 +808,17 @@ func throw(s string) {
*(*int)(nil) = 0 // not reached
}
-//uint32 runtime·panicking;
+// runningPanicDefers is non-zero while running deferred functions for panic.
+// runningPanicDefers is incremented and decremented atomically.
+// This is used to try hard to get a panic stack trace out when exiting.
+var runningPanicDefers uint32
+
+// panicking is non-zero when crashing the program for an unrecovered panic.
+// panicking is incremented and decremented atomically.
+var panicking uint32
+
+// paniclk is held while printing the panic information and stack trace,
+// so that two concurrent panics don't overlap their output.
var paniclk mutex
func startpanic() {
diff --git a/libgo/go/runtime/pprof/elf.go b/libgo/go/runtime/pprof/elf.go
new file mode 100644
index 0000000..a8b5ea6
--- /dev/null
+++ b/libgo/go/runtime/pprof/elf.go
@@ -0,0 +1,109 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "os"
+)
+
+var (
+ errBadELF = errors.New("malformed ELF binary")
+ errNoBuildID = errors.New("no NT_GNU_BUILD_ID found in ELF binary")
+)
+
+// elfBuildID returns the GNU build ID of the named ELF binary,
+// without introducing a dependency on debug/elf and its dependencies.
+func elfBuildID(file string) (string, error) {
+ buf := make([]byte, 256)
+ f, err := os.Open(file)
+ if err != nil {
+ return "", err
+ }
+ defer f.Close()
+
+ if _, err := f.ReadAt(buf[:64], 0); err != nil {
+ return "", err
+ }
+
+ // ELF file begins with \x7F E L F.
+ if buf[0] != 0x7F || buf[1] != 'E' || buf[2] != 'L' || buf[3] != 'F' {
+ return "", errBadELF
+ }
+
+ var byteOrder binary.ByteOrder
+ switch buf[5] {
+ default:
+ return "", errBadELF
+ case 1: // little-endian
+ byteOrder = binary.LittleEndian
+ case 2: // big-endian
+ byteOrder = binary.BigEndian
+ }
+
+ var shnum int
+ var shoff, shentsize int64
+ switch buf[4] {
+ default:
+ return "", errBadELF
+ case 1: // 32-bit file header
+ shoff = int64(byteOrder.Uint32(buf[32:]))
+ shentsize = int64(byteOrder.Uint16(buf[46:]))
+ if shentsize != 40 {
+ return "", errBadELF
+ }
+ shnum = int(byteOrder.Uint16(buf[48:]))
+ case 2: // 64-bit file header
+ shoff = int64(byteOrder.Uint64(buf[40:]))
+ shentsize = int64(byteOrder.Uint16(buf[58:]))
+ if shentsize != 64 {
+ return "", errBadELF
+ }
+ shnum = int(byteOrder.Uint16(buf[60:]))
+ }
+
+ for i := 0; i < shnum; i++ {
+ if _, err := f.ReadAt(buf[:shentsize], shoff+int64(i)*shentsize); err != nil {
+ return "", err
+ }
+ if typ := byteOrder.Uint32(buf[4:]); typ != 7 { // SHT_NOTE
+ continue
+ }
+ var off, size int64
+ if shentsize == 40 {
+ // 32-bit section header
+ off = int64(byteOrder.Uint32(buf[16:]))
+ size = int64(byteOrder.Uint32(buf[20:]))
+ } else {
+ // 64-bit section header
+ off = int64(byteOrder.Uint64(buf[24:]))
+ size = int64(byteOrder.Uint64(buf[32:]))
+ }
+ size += off
+ for off < size {
+ if _, err := f.ReadAt(buf[:16], off); err != nil { // room for header + name GNU\x00
+ return "", err
+ }
+ nameSize := int(byteOrder.Uint32(buf[0:]))
+ descSize := int(byteOrder.Uint32(buf[4:]))
+ noteType := int(byteOrder.Uint32(buf[8:]))
+ descOff := off + int64(12+(nameSize+3)&^3)
+ off = descOff + int64((descSize+3)&^3)
+ if nameSize != 4 || noteType != 3 || buf[12] != 'G' || buf[13] != 'N' || buf[14] != 'U' || buf[15] != '\x00' { // want name GNU\x00 type 3 (NT_GNU_BUILD_ID)
+ continue
+ }
+ if descSize > len(buf) {
+ return "", errBadELF
+ }
+ if _, err := f.ReadAt(buf[:descSize], descOff); err != nil {
+ return "", err
+ }
+ return fmt.Sprintf("%x", buf[:descSize]), nil
+ }
+ }
+ return "", errNoBuildID
+}
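The note-section walk above treats each ELF note record as a 12-byte header (namesz, descsz, type) followed by the name and the descriptor, each padded to a 4-byte boundary; that is what the (x+3)&^3 expressions compute. A minimal standalone sketch of that arithmetic, with an illustrative 20-byte build-ID descriptor:

package main

import "fmt"

// align4 rounds n up to the next multiple of 4, the same padding rule
// applied to note names and descriptors in elfBuildID above.
func align4(n int) int { return (n + 3) &^ 3 }

func main() {
	nameSize, descSize := 4, 20        // name "GNU\x00" and a 20-byte build ID
	descOff := 12 + align4(nameSize)   // descriptor starts after header and padded name
	next := descOff + align4(descSize) // offset of the next note record
	fmt.Println(descOff, next)         // 16 36
}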
diff --git a/libgo/go/runtime/pprof/internal/profile/encode.go b/libgo/go/runtime/pprof/internal/profile/encode.go
new file mode 100644
index 0000000..6b879a8
--- /dev/null
+++ b/libgo/go/runtime/pprof/internal/profile/encode.go
@@ -0,0 +1,470 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package profile
+
+import (
+ "errors"
+ "fmt"
+ "sort"
+)
+
+func (p *Profile) decoder() []decoder {
+ return profileDecoder
+}
+
+// preEncode populates the unexported fields to be used by encode
+// (with suffix X) from the corresponding exported fields. The
+// exported fields are cleared to facilitate testing.
+func (p *Profile) preEncode() {
+ strings := make(map[string]int)
+ addString(strings, "")
+
+ for _, st := range p.SampleType {
+ st.typeX = addString(strings, st.Type)
+ st.unitX = addString(strings, st.Unit)
+ }
+
+ for _, s := range p.Sample {
+ s.labelX = nil
+ var keys []string
+ for k := range s.Label {
+ keys = append(keys, k)
+ }
+ sort.Strings(keys)
+ for _, k := range keys {
+ vs := s.Label[k]
+ for _, v := range vs {
+ s.labelX = append(s.labelX,
+ Label{
+ keyX: addString(strings, k),
+ strX: addString(strings, v),
+ },
+ )
+ }
+ }
+ var numKeys []string
+ for k := range s.NumLabel {
+ numKeys = append(numKeys, k)
+ }
+ sort.Strings(numKeys)
+ for _, k := range numKeys {
+ vs := s.NumLabel[k]
+ for _, v := range vs {
+ s.labelX = append(s.labelX,
+ Label{
+ keyX: addString(strings, k),
+ numX: v,
+ },
+ )
+ }
+ }
+ s.locationIDX = nil
+ for _, l := range s.Location {
+ s.locationIDX = append(s.locationIDX, l.ID)
+ }
+ }
+
+ for _, m := range p.Mapping {
+ m.fileX = addString(strings, m.File)
+ m.buildIDX = addString(strings, m.BuildID)
+ }
+
+ for _, l := range p.Location {
+ for i, ln := range l.Line {
+ if ln.Function != nil {
+ l.Line[i].functionIDX = ln.Function.ID
+ } else {
+ l.Line[i].functionIDX = 0
+ }
+ }
+ if l.Mapping != nil {
+ l.mappingIDX = l.Mapping.ID
+ } else {
+ l.mappingIDX = 0
+ }
+ }
+ for _, f := range p.Function {
+ f.nameX = addString(strings, f.Name)
+ f.systemNameX = addString(strings, f.SystemName)
+ f.filenameX = addString(strings, f.Filename)
+ }
+
+ p.dropFramesX = addString(strings, p.DropFrames)
+ p.keepFramesX = addString(strings, p.KeepFrames)
+
+ if pt := p.PeriodType; pt != nil {
+ pt.typeX = addString(strings, pt.Type)
+ pt.unitX = addString(strings, pt.Unit)
+ }
+
+ p.stringTable = make([]string, len(strings))
+ for s, i := range strings {
+ p.stringTable[i] = s
+ }
+}
+
+func (p *Profile) encode(b *buffer) {
+ for _, x := range p.SampleType {
+ encodeMessage(b, 1, x)
+ }
+ for _, x := range p.Sample {
+ encodeMessage(b, 2, x)
+ }
+ for _, x := range p.Mapping {
+ encodeMessage(b, 3, x)
+ }
+ for _, x := range p.Location {
+ encodeMessage(b, 4, x)
+ }
+ for _, x := range p.Function {
+ encodeMessage(b, 5, x)
+ }
+ encodeStrings(b, 6, p.stringTable)
+ encodeInt64Opt(b, 7, p.dropFramesX)
+ encodeInt64Opt(b, 8, p.keepFramesX)
+ encodeInt64Opt(b, 9, p.TimeNanos)
+ encodeInt64Opt(b, 10, p.DurationNanos)
+ if pt := p.PeriodType; pt != nil && (pt.typeX != 0 || pt.unitX != 0) {
+ encodeMessage(b, 11, p.PeriodType)
+ }
+ encodeInt64Opt(b, 12, p.Period)
+}
+
+var profileDecoder = []decoder{
+ nil, // 0
+ // repeated ValueType sample_type = 1
+ func(b *buffer, m message) error {
+ x := new(ValueType)
+ pp := m.(*Profile)
+ pp.SampleType = append(pp.SampleType, x)
+ return decodeMessage(b, x)
+ },
+ // repeated Sample sample = 2
+ func(b *buffer, m message) error {
+ x := new(Sample)
+ pp := m.(*Profile)
+ pp.Sample = append(pp.Sample, x)
+ return decodeMessage(b, x)
+ },
+ // repeated Mapping mapping = 3
+ func(b *buffer, m message) error {
+ x := new(Mapping)
+ pp := m.(*Profile)
+ pp.Mapping = append(pp.Mapping, x)
+ return decodeMessage(b, x)
+ },
+ // repeated Location location = 4
+ func(b *buffer, m message) error {
+ x := new(Location)
+ pp := m.(*Profile)
+ pp.Location = append(pp.Location, x)
+ return decodeMessage(b, x)
+ },
+ // repeated Function function = 5
+ func(b *buffer, m message) error {
+ x := new(Function)
+ pp := m.(*Profile)
+ pp.Function = append(pp.Function, x)
+ return decodeMessage(b, x)
+ },
+ // repeated string string_table = 6
+ func(b *buffer, m message) error {
+ err := decodeStrings(b, &m.(*Profile).stringTable)
+ if err != nil {
+ return err
+ }
+ if m.(*Profile).stringTable[0] != "" {
+ return errors.New("string_table[0] must be ''")
+ }
+ return nil
+ },
+ // repeated int64 drop_frames = 7
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).dropFramesX) },
+ // repeated int64 keep_frames = 8
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).keepFramesX) },
+ // repeated int64 time_nanos = 9
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).TimeNanos) },
+ // repeated int64 duration_nanos = 10
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).DurationNanos) },
+ // optional ValueType period_type = 11
+ func(b *buffer, m message) error {
+ x := new(ValueType)
+ pp := m.(*Profile)
+ pp.PeriodType = x
+ return decodeMessage(b, x)
+ },
+ // repeated int64 period = 12
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).Period) },
+}
+
+// postDecode takes the unexported fields populated by decode (with
+// suffix X) and populates the corresponding exported fields.
+// The unexported fields are cleared to facilitate testing.
+func (p *Profile) postDecode() error {
+ var err error
+
+ mappings := make(map[uint64]*Mapping)
+ for _, m := range p.Mapping {
+ m.File, err = getString(p.stringTable, &m.fileX, err)
+ m.BuildID, err = getString(p.stringTable, &m.buildIDX, err)
+ mappings[m.ID] = m
+ }
+
+ functions := make(map[uint64]*Function)
+ for _, f := range p.Function {
+ f.Name, err = getString(p.stringTable, &f.nameX, err)
+ f.SystemName, err = getString(p.stringTable, &f.systemNameX, err)
+ f.Filename, err = getString(p.stringTable, &f.filenameX, err)
+ functions[f.ID] = f
+ }
+
+ locations := make(map[uint64]*Location)
+ for _, l := range p.Location {
+ l.Mapping = mappings[l.mappingIDX]
+ l.mappingIDX = 0
+ for i, ln := range l.Line {
+ if id := ln.functionIDX; id != 0 {
+ l.Line[i].Function = functions[id]
+ if l.Line[i].Function == nil {
+ return fmt.Errorf("Function ID %d not found", id)
+ }
+ l.Line[i].functionIDX = 0
+ }
+ }
+ locations[l.ID] = l
+ }
+
+ for _, st := range p.SampleType {
+ st.Type, err = getString(p.stringTable, &st.typeX, err)
+ st.Unit, err = getString(p.stringTable, &st.unitX, err)
+ }
+
+ for _, s := range p.Sample {
+ labels := make(map[string][]string)
+ numLabels := make(map[string][]int64)
+ for _, l := range s.labelX {
+ var key, value string
+ key, err = getString(p.stringTable, &l.keyX, err)
+ if l.strX != 0 {
+ value, err = getString(p.stringTable, &l.strX, err)
+ labels[key] = append(labels[key], value)
+ } else {
+ numLabels[key] = append(numLabels[key], l.numX)
+ }
+ }
+ if len(labels) > 0 {
+ s.Label = labels
+ }
+ if len(numLabels) > 0 {
+ s.NumLabel = numLabels
+ }
+ s.Location = nil
+ for _, lid := range s.locationIDX {
+ s.Location = append(s.Location, locations[lid])
+ }
+ s.locationIDX = nil
+ }
+
+ p.DropFrames, err = getString(p.stringTable, &p.dropFramesX, err)
+ p.KeepFrames, err = getString(p.stringTable, &p.keepFramesX, err)
+
+ if pt := p.PeriodType; pt == nil {
+ p.PeriodType = &ValueType{}
+ }
+
+ if pt := p.PeriodType; pt != nil {
+ pt.Type, err = getString(p.stringTable, &pt.typeX, err)
+ pt.Unit, err = getString(p.stringTable, &pt.unitX, err)
+ }
+ p.stringTable = nil
+ return nil
+}
+
+func (p *ValueType) decoder() []decoder {
+ return valueTypeDecoder
+}
+
+func (p *ValueType) encode(b *buffer) {
+ encodeInt64Opt(b, 1, p.typeX)
+ encodeInt64Opt(b, 2, p.unitX)
+}
+
+var valueTypeDecoder = []decoder{
+ nil, // 0
+ // optional int64 type = 1
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*ValueType).typeX) },
+ // optional int64 unit = 2
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*ValueType).unitX) },
+}
+
+func (p *Sample) decoder() []decoder {
+ return sampleDecoder
+}
+
+func (p *Sample) encode(b *buffer) {
+ encodeUint64s(b, 1, p.locationIDX)
+ for _, x := range p.Value {
+ encodeInt64(b, 2, x)
+ }
+ for _, x := range p.labelX {
+ encodeMessage(b, 3, x)
+ }
+}
+
+var sampleDecoder = []decoder{
+ nil, // 0
+ // repeated uint64 location = 1
+ func(b *buffer, m message) error { return decodeUint64s(b, &m.(*Sample).locationIDX) },
+ // repeated int64 value = 2
+ func(b *buffer, m message) error { return decodeInt64s(b, &m.(*Sample).Value) },
+ // repeated Label label = 3
+ func(b *buffer, m message) error {
+ s := m.(*Sample)
+ n := len(s.labelX)
+ s.labelX = append(s.labelX, Label{})
+ return decodeMessage(b, &s.labelX[n])
+ },
+}
+
+func (p Label) decoder() []decoder {
+ return labelDecoder
+}
+
+func (p Label) encode(b *buffer) {
+ encodeInt64Opt(b, 1, p.keyX)
+ encodeInt64Opt(b, 2, p.strX)
+ encodeInt64Opt(b, 3, p.numX)
+}
+
+var labelDecoder = []decoder{
+ nil, // 0
+ // optional int64 key = 1
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).keyX) },
+ // optional int64 str = 2
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).strX) },
+ // optional int64 num = 3
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).numX) },
+}
+
+func (p *Mapping) decoder() []decoder {
+ return mappingDecoder
+}
+
+func (p *Mapping) encode(b *buffer) {
+ encodeUint64Opt(b, 1, p.ID)
+ encodeUint64Opt(b, 2, p.Start)
+ encodeUint64Opt(b, 3, p.Limit)
+ encodeUint64Opt(b, 4, p.Offset)
+ encodeInt64Opt(b, 5, p.fileX)
+ encodeInt64Opt(b, 6, p.buildIDX)
+ encodeBoolOpt(b, 7, p.HasFunctions)
+ encodeBoolOpt(b, 8, p.HasFilenames)
+ encodeBoolOpt(b, 9, p.HasLineNumbers)
+ encodeBoolOpt(b, 10, p.HasInlineFrames)
+}
+
+var mappingDecoder = []decoder{
+ nil, // 0
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).ID) }, // optional uint64 id = 1
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Start) }, // optional uint64 memory_start = 2
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Limit) }, // optional uint64 memory_limit = 3
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Offset) }, // optional uint64 file_offset = 4
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Mapping).fileX) }, // optional int64 filename = 5
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Mapping).buildIDX) }, // optional int64 build_id = 6
+ func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasFunctions) }, // optional bool has_functions = 7
+ func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasFilenames) }, // optional bool has_filenames = 8
+ func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasLineNumbers) }, // optional bool has_line_numbers = 9
+ func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasInlineFrames) }, // optional bool has_inline_frames = 10
+}
+
+func (p *Location) decoder() []decoder {
+ return locationDecoder
+}
+
+func (p *Location) encode(b *buffer) {
+ encodeUint64Opt(b, 1, p.ID)
+ encodeUint64Opt(b, 2, p.mappingIDX)
+ encodeUint64Opt(b, 3, p.Address)
+ for i := range p.Line {
+ encodeMessage(b, 4, &p.Line[i])
+ }
+}
+
+var locationDecoder = []decoder{
+ nil, // 0
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).ID) }, // optional uint64 id = 1;
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).mappingIDX) }, // optional uint64 mapping_id = 2;
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).Address) }, // optional uint64 address = 3;
+ func(b *buffer, m message) error { // repeated Line line = 4
+ pp := m.(*Location)
+ n := len(pp.Line)
+ pp.Line = append(pp.Line, Line{})
+ return decodeMessage(b, &pp.Line[n])
+ },
+}
+
+func (p *Line) decoder() []decoder {
+ return lineDecoder
+}
+
+func (p *Line) encode(b *buffer) {
+ encodeUint64Opt(b, 1, p.functionIDX)
+ encodeInt64Opt(b, 2, p.Line)
+}
+
+var lineDecoder = []decoder{
+ nil, // 0
+ // optional uint64 function_id = 1
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Line).functionIDX) },
+ // optional int64 line = 2
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Line).Line) },
+}
+
+func (p *Function) decoder() []decoder {
+ return functionDecoder
+}
+
+func (p *Function) encode(b *buffer) {
+ encodeUint64Opt(b, 1, p.ID)
+ encodeInt64Opt(b, 2, p.nameX)
+ encodeInt64Opt(b, 3, p.systemNameX)
+ encodeInt64Opt(b, 4, p.filenameX)
+ encodeInt64Opt(b, 5, p.StartLine)
+}
+
+var functionDecoder = []decoder{
+ nil, // 0
+ // optional uint64 id = 1
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Function).ID) },
+ // optional int64 function_name = 2
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).nameX) },
+ // optional int64 function_system_name = 3
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).systemNameX) },
+ // optional int64 filename = 4
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).filenameX) },
+ // optional int64 start_line = 5
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).StartLine) },
+}
+
+func addString(strings map[string]int, s string) int64 {
+ i, ok := strings[s]
+ if !ok {
+ i = len(strings)
+ strings[s] = i
+ }
+ return int64(i)
+}
+
+func getString(strings []string, strng *int64, err error) (string, error) {
+ if err != nil {
+ return "", err
+ }
+ s := int(*strng)
+ if s < 0 || s >= len(strings) {
+ return "", errMalformed
+ }
+ *strng = 0
+ return strings[s], nil
+}
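addString and getString above implement a small string-interning scheme: the map hands each distinct string the next free index (the empty string is added first so that index 0 stays reserved for it), and decoding resolves indices through the flattened table. A standalone sketch of the same round trip:

package main

import "fmt"

func main() {
	// Intern strings the way preEncode/addString do.
	strs := map[string]int{}
	intern := func(s string) int {
		i, ok := strs[s]
		if !ok {
			i = len(strs)
			strs[s] = i
		}
		return i
	}
	intern("") // reserve index 0 for the empty string
	a, b, c := intern("cpu"), intern("nanoseconds"), intern("cpu")

	// Flatten into the slice that would be written out as string_table.
	table := make([]string, len(strs))
	for s, i := range strs {
		table[i] = s
	}
	fmt.Println(a, b, c)            // 1 2 1 -- repeated strings share an index
	fmt.Println(table[a], table[b]) // cpu nanoseconds
}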
diff --git a/libgo/go/runtime/pprof/internal/profile/filter.go b/libgo/go/runtime/pprof/internal/profile/filter.go
new file mode 100644
index 0000000..1baa096
--- /dev/null
+++ b/libgo/go/runtime/pprof/internal/profile/filter.go
@@ -0,0 +1,158 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Implements methods to filter samples from profiles.
+
+package profile
+
+import "regexp"
+
+// FilterSamplesByName filters the samples in a profile and only keeps
+// samples where at least one frame matches focus but none match ignore.
+// Returns true if the corresponding regexp matched at least one sample.
+func (p *Profile) FilterSamplesByName(focus, ignore, hide *regexp.Regexp) (fm, im, hm bool) {
+ focusOrIgnore := make(map[uint64]bool)
+ hidden := make(map[uint64]bool)
+ for _, l := range p.Location {
+ if ignore != nil && l.matchesName(ignore) {
+ im = true
+ focusOrIgnore[l.ID] = false
+ } else if focus == nil || l.matchesName(focus) {
+ fm = true
+ focusOrIgnore[l.ID] = true
+ }
+ if hide != nil && l.matchesName(hide) {
+ hm = true
+ l.Line = l.unmatchedLines(hide)
+ if len(l.Line) == 0 {
+ hidden[l.ID] = true
+ }
+ }
+ }
+
+ s := make([]*Sample, 0, len(p.Sample))
+ for _, sample := range p.Sample {
+ if focusedAndNotIgnored(sample.Location, focusOrIgnore) {
+ if len(hidden) > 0 {
+ var locs []*Location
+ for _, loc := range sample.Location {
+ if !hidden[loc.ID] {
+ locs = append(locs, loc)
+ }
+ }
+ if len(locs) == 0 {
+ // Remove sample with no locations (by not adding it to s).
+ continue
+ }
+ sample.Location = locs
+ }
+ s = append(s, sample)
+ }
+ }
+ p.Sample = s
+
+ return
+}
+
+// matchesName returns whether the function name or file in the
+// location matches the regular expression.
+func (loc *Location) matchesName(re *regexp.Regexp) bool {
+ for _, ln := range loc.Line {
+ if fn := ln.Function; fn != nil {
+ if re.MatchString(fn.Name) {
+ return true
+ }
+ if re.MatchString(fn.Filename) {
+ return true
+ }
+ }
+ }
+ return false
+}
+
+// unmatchedLines returns the lines in the location that do not match
+// the regular expression.
+func (loc *Location) unmatchedLines(re *regexp.Regexp) []Line {
+ var lines []Line
+ for _, ln := range loc.Line {
+ if fn := ln.Function; fn != nil {
+ if re.MatchString(fn.Name) {
+ continue
+ }
+ if re.MatchString(fn.Filename) {
+ continue
+ }
+ }
+ lines = append(lines, ln)
+ }
+ return lines
+}
+
+// focusedAndNotIgnored looks up a slice of ids against a map of
+// focused/ignored locations. The map only contains locations that are
+// explicitly focused or ignored. Returns whether there is at least
+// one focused location but no ignored locations.
+func focusedAndNotIgnored(locs []*Location, m map[uint64]bool) bool {
+ var f bool
+ for _, loc := range locs {
+ if focus, focusOrIgnore := m[loc.ID]; focusOrIgnore {
+ if focus {
+ // Found focused location. Must keep searching in case there
+ // is an ignored one as well.
+ f = true
+ } else {
+ // Found ignored location. Can return false right away.
+ return false
+ }
+ }
+ }
+ return f
+}
+
+// TagMatch selects tags for filtering
+type TagMatch func(key, val string, nval int64) bool
+
+// FilterSamplesByTag removes all samples from the profile except
+// those that match the focus matcher and do not match the ignore
+// matcher.
+func (p *Profile) FilterSamplesByTag(focus, ignore TagMatch) (fm, im bool) {
+ samples := make([]*Sample, 0, len(p.Sample))
+ for _, s := range p.Sample {
+ focused, ignored := focusedSample(s, focus, ignore)
+ fm = fm || focused
+ im = im || ignored
+ if focused && !ignored {
+ samples = append(samples, s)
+ }
+ }
+ p.Sample = samples
+ return
+}
+
+// focusedSample checks a sample against the focus and ignore matchers.
+// Returns whether the focus and ignore matchers matched any tags.
+func focusedSample(s *Sample, focus, ignore TagMatch) (fm, im bool) {
+ fm = focus == nil
+ for key, vals := range s.Label {
+ for _, val := range vals {
+ if ignore != nil && ignore(key, val, 0) {
+ im = true
+ }
+ if !fm && focus(key, val, 0) {
+ fm = true
+ }
+ }
+ }
+ for key, vals := range s.NumLabel {
+ for _, val := range vals {
+ if ignore != nil && ignore(key, "", val) {
+ im = true
+ }
+ if !fm && focus(key, "", val) {
+ fm = true
+ }
+ }
+ }
+ return fm, im
+}
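FilterSamplesByTag takes TagMatch callbacks rather than regexps, so callers express focus and ignore as closures over a label key and its string or numeric value. A hypothetical standalone sketch of such matchers (the "bytes" and "source" keys follow labels used elsewhere in this package; the threshold is made up):

package main

import "fmt"

// tagMatch mirrors the TagMatch signature above: string labels arrive as
// (key, val, 0) and numeric labels as (key, "", nval).
type tagMatch func(key, val string, nval int64) bool

func main() {
	// Focus on samples whose "bytes" label is at least 1 MiB.
	focus := tagMatch(func(key, _ string, nval int64) bool {
		return key == "bytes" && nval >= 1<<20
	})
	// Ignore samples whose "source" label is "test".
	ignore := tagMatch(func(key, val string, _ int64) bool {
		return key == "source" && val == "test"
	})
	fmt.Println(focus("bytes", "", 2<<20))   // true
	fmt.Println(ignore("source", "prod", 0)) // false
}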
diff --git a/libgo/go/runtime/pprof/internal/profile/legacy_profile.go b/libgo/go/runtime/pprof/internal/profile/legacy_profile.go
new file mode 100644
index 0000000..d69f8de
--- /dev/null
+++ b/libgo/go/runtime/pprof/internal/profile/legacy_profile.go
@@ -0,0 +1,1266 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file implements parsers to convert legacy profiles into the
+// profile.proto format.
+
+package profile
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "io"
+ "math"
+ "regexp"
+ "strconv"
+ "strings"
+)
+
+var (
+ countStartRE = regexp.MustCompile(`\A(\w+) profile: total \d+\n\z`)
+ countRE = regexp.MustCompile(`\A(\d+) @(( 0x[0-9a-f]+)+)\n\z`)
+
+ heapHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] *@ *(heap[_a-z0-9]*)/?(\d*)`)
+ heapSampleRE = regexp.MustCompile(`(-?\d+): *(-?\d+) *\[ *(\d+): *(\d+) *] @([ x0-9a-f]*)`)
+
+ contentionSampleRE = regexp.MustCompile(`(\d+) *(\d+) @([ x0-9a-f]*)`)
+
+ hexNumberRE = regexp.MustCompile(`0x[0-9a-f]+`)
+
+ growthHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] @ growthz`)
+
+ fragmentationHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] @ fragmentationz`)
+
+ threadzStartRE = regexp.MustCompile(`--- threadz \d+ ---`)
+ threadStartRE = regexp.MustCompile(`--- Thread ([[:xdigit:]]+) \(name: (.*)/(\d+)\) stack: ---`)
+
+ procMapsRE = regexp.MustCompile(`([[:xdigit:]]+)-([[:xdigit:]]+)\s+([-rwxp]+)\s+([[:xdigit:]]+)\s+([[:xdigit:]]+):([[:xdigit:]]+)\s+([[:digit:]]+)\s*(\S+)?`)
+
+ briefMapsRE = regexp.MustCompile(`\s*([[:xdigit:]]+)-([[:xdigit:]]+):\s*(\S+)(\s.*@)?([[:xdigit:]]+)?`)
+
+ // LegacyHeapAllocated instructs the heapz parsers to use the
+ // allocated memory stats instead of the default in-use memory. Note
+ // that tcmalloc doesn't provide all allocated memory, only in-use
+ // stats.
+ LegacyHeapAllocated bool
+)
+
+func isSpaceOrComment(line string) bool {
+ trimmed := strings.TrimSpace(line)
+ return len(trimmed) == 0 || trimmed[0] == '#'
+}
+
+// parseGoCount parses a Go count profile (e.g., threadcreate or
+// goroutine) and returns a new Profile.
+func parseGoCount(b []byte) (*Profile, error) {
+ r := bytes.NewBuffer(b)
+
+ var line string
+ var err error
+ for {
+ // Skip past comments and empty lines seeking a real header.
+ line, err = r.ReadString('\n')
+ if err != nil {
+ return nil, err
+ }
+ if !isSpaceOrComment(line) {
+ break
+ }
+ }
+
+ m := countStartRE.FindStringSubmatch(line)
+ if m == nil {
+ return nil, errUnrecognized
+ }
+ profileType := m[1]
+ p := &Profile{
+ PeriodType: &ValueType{Type: profileType, Unit: "count"},
+ Period: 1,
+ SampleType: []*ValueType{{Type: profileType, Unit: "count"}},
+ }
+ locations := make(map[uint64]*Location)
+ for {
+ line, err = r.ReadString('\n')
+ if err != nil {
+ if err == io.EOF {
+ break
+ }
+ return nil, err
+ }
+ if isSpaceOrComment(line) {
+ continue
+ }
+ if strings.HasPrefix(line, "---") {
+ break
+ }
+ m := countRE.FindStringSubmatch(line)
+ if m == nil {
+ return nil, errMalformed
+ }
+ n, err := strconv.ParseInt(m[1], 0, 64)
+ if err != nil {
+ return nil, errMalformed
+ }
+ fields := strings.Fields(m[2])
+ locs := make([]*Location, 0, len(fields))
+ for _, stk := range fields {
+ addr, err := strconv.ParseUint(stk, 0, 64)
+ if err != nil {
+ return nil, errMalformed
+ }
+ // Adjust all frames by -1 to land on the call instruction.
+ addr--
+ loc := locations[addr]
+ if loc == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ locations[addr] = loc
+ p.Location = append(p.Location, loc)
+ }
+ locs = append(locs, loc)
+ }
+ p.Sample = append(p.Sample, &Sample{
+ Location: locs,
+ Value: []int64{n},
+ })
+ }
+
+ if err = parseAdditionalSections(strings.TrimSpace(line), r, p); err != nil {
+ return nil, err
+ }
+ return p, nil
+}
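+
+// For illustration only: a minimal goroutine count profile in the text
+// form handled above (the layout follows countStartRE and countRE; the
+// addresses are made up).
+func exampleGoCountProfile() (*Profile, error) {
+ src := "goroutine profile: total 2\n" +
+ "1 @ 0x42e15a 0x42e061\n" +
+ "1 @ 0x42e15a\n"
+ return parseGoCount([]byte(src))
+}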
+
+// remapLocationIDs ensures there is a location for each address
+// referenced by a sample, and remaps the samples to point to the new
+// location ids.
+func (p *Profile) remapLocationIDs() {
+ seen := make(map[*Location]bool, len(p.Location))
+ var locs []*Location
+
+ for _, s := range p.Sample {
+ for _, l := range s.Location {
+ if seen[l] {
+ continue
+ }
+ l.ID = uint64(len(locs) + 1)
+ locs = append(locs, l)
+ seen[l] = true
+ }
+ }
+ p.Location = locs
+}
+
+func (p *Profile) remapFunctionIDs() {
+ seen := make(map[*Function]bool, len(p.Function))
+ var fns []*Function
+
+ for _, l := range p.Location {
+ for _, ln := range l.Line {
+ fn := ln.Function
+ if fn == nil || seen[fn] {
+ continue
+ }
+ fn.ID = uint64(len(fns) + 1)
+ fns = append(fns, fn)
+ seen[fn] = true
+ }
+ }
+ p.Function = fns
+}
+
+// remapMappingIDs matches location addresses with existing mappings
+// and updates them appropriately. This is O(N*M); if it ever shows
+// up as a bottleneck, consider sorting the mappings and doing a
+// binary search, which would make it O(N*log(M)).
+func (p *Profile) remapMappingIDs() {
+ if len(p.Mapping) == 0 {
+ return
+ }
+
+ // Some profile handlers will incorrectly set regions for the main
+ // executable if its section is remapped. Fix them through heuristics.
+
+ // Remove the initial mapping if named '/anon_hugepage' and has a
+ // consecutive adjacent mapping.
+ if m := p.Mapping[0]; strings.HasPrefix(m.File, "/anon_hugepage") {
+ if len(p.Mapping) > 1 && m.Limit == p.Mapping[1].Start {
+ p.Mapping = p.Mapping[1:]
+ }
+ }
+
+ // Subtract the offset from the start of the main mapping if it
+ // ends up at a recognizable start address.
+ const expectedStart = 0x400000
+ if m := p.Mapping[0]; m.Start-m.Offset == expectedStart {
+ m.Start = expectedStart
+ m.Offset = 0
+ }
+
+ for _, l := range p.Location {
+ if a := l.Address; a != 0 {
+ for _, m := range p.Mapping {
+ if m.Start <= a && a < m.Limit {
+ l.Mapping = m
+ break
+ }
+ }
+ }
+ }
+
+ // Reset all mapping IDs.
+ for i, m := range p.Mapping {
+ m.ID = uint64(i + 1)
+ }
+}
+
+var cpuInts = []func([]byte) (uint64, []byte){
+ get32l,
+ get32b,
+ get64l,
+ get64b,
+}
+
+func get32l(b []byte) (uint64, []byte) {
+ if len(b) < 4 {
+ return 0, nil
+ }
+ return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24, b[4:]
+}
+
+func get32b(b []byte) (uint64, []byte) {
+ if len(b) < 4 {
+ return 0, nil
+ }
+ return uint64(b[3]) | uint64(b[2])<<8 | uint64(b[1])<<16 | uint64(b[0])<<24, b[4:]
+}
+
+func get64l(b []byte) (uint64, []byte) {
+ if len(b) < 8 {
+ return 0, nil
+ }
+ return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56, b[8:]
+}
+
+func get64b(b []byte) (uint64, []byte) {
+ if len(b) < 8 {
+ return 0, nil
+ }
+ return uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 | uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56, b[8:]
+}
+
+// ParseTracebacks parses a set of tracebacks and returns a newly
+// populated profile. It will accept any text file and generate a
+// Profile out of it with any hex addresses it can identify, including
+// a process map if it can recognize one. Each sample will include a
+// tag "source" with the addresses recognized in string format.
+func ParseTracebacks(b []byte) (*Profile, error) {
+ r := bytes.NewBuffer(b)
+
+ p := &Profile{
+ PeriodType: &ValueType{Type: "trace", Unit: "count"},
+ Period: 1,
+ SampleType: []*ValueType{
+ {Type: "trace", Unit: "count"},
+ },
+ }
+
+ var sources []string
+ var sloc []*Location
+
+ locs := make(map[uint64]*Location)
+ for {
+ l, err := r.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+ if l == "" {
+ break
+ }
+ }
+ if sectionTrigger(l) == memoryMapSection {
+ break
+ }
+ if s, addrs := extractHexAddresses(l); len(s) > 0 {
+ for _, addr := range addrs {
+ // Addresses from stack traces point to the next instruction after
+ // each call. Adjust by -1 to land somewhere on the actual call.
+ addr--
+ loc := locs[addr]
+ if locs[addr] == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ p.Location = append(p.Location, loc)
+ locs[addr] = loc
+ }
+ sloc = append(sloc, loc)
+ }
+
+ sources = append(sources, s...)
+ } else {
+ if len(sources) > 0 || len(sloc) > 0 {
+ addTracebackSample(sloc, sources, p)
+ sloc, sources = nil, nil
+ }
+ }
+ }
+
+ // Add final sample to save any leftover data.
+ if len(sources) > 0 || len(sloc) > 0 {
+ addTracebackSample(sloc, sources, p)
+ }
+
+ if err := p.ParseMemoryMap(r); err != nil {
+ return nil, err
+ }
+ return p, nil
+}
+
+func addTracebackSample(l []*Location, s []string, p *Profile) {
+ p.Sample = append(p.Sample,
+ &Sample{
+ Value: []int64{1},
+ Location: l,
+ Label: map[string][]string{"source": s},
+ })
+}
+
+// parseCPU parses a profilez legacy profile and returns a newly
+// populated Profile.
+//
+// The general format for profilez samples is a sequence of words in
+// binary format. The first words are a header with the following data:
+// 1st word -- 0
+// 2nd word -- 3
+// 3rd word -- 0 if a C++ application, 1 if a Java application.
+// 4th word -- Sampling period (in microseconds).
+// 5th word -- Padding.
+func parseCPU(b []byte) (*Profile, error) {
+ var parse func([]byte) (uint64, []byte)
+ var n1, n2, n3, n4, n5 uint64
+ for _, parse = range cpuInts {
+ var tmp []byte
+ n1, tmp = parse(b)
+ n2, tmp = parse(tmp)
+ n3, tmp = parse(tmp)
+ n4, tmp = parse(tmp)
+ n5, tmp = parse(tmp)
+
+ if tmp != nil && n1 == 0 && n2 == 3 && n3 == 0 && n4 > 0 && n5 == 0 {
+ b = tmp
+ return cpuProfile(b, int64(n4), parse)
+ }
+ }
+ return nil, errUnrecognized
+}
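+
+// exampleCPUProfile builds a minimal profilez stream in the binary layout
+// described above -- the five header words, one stack sample, and the
+// end-of-data marker -- encoded as 64-bit little-endian words so that the
+// get64l probe recognizes it. The addresses and the 100us period are
+// illustrative.
+func exampleCPUProfile() (*Profile, error) {
+ words := []uint64{
+ 0, 3, 0, 100, 0, // header words: 0, 3, 0, sampling period (us), padding
+ 1, 2, 0x42e15a, 0x42e061, // one sample: count 1, stack of two addresses
+ 0, 1, 0, // end-of-data marker
+ }
+ b := make([]byte, 0, 8*len(words))
+ for _, w := range words {
+ for shift := uint(0); shift < 64; shift += 8 {
+ b = append(b, byte(w>>shift))
+ }
+ }
+ return parseCPU(b)
+}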
+
+// cpuProfile returns a new Profile from C++ profilez data.
+// b is the profile bytes after the header, period is the profiling
+// period, and parse is a function to parse 8-byte chunks from the
+// profile in its native endianness.
+func cpuProfile(b []byte, period int64, parse func(b []byte) (uint64, []byte)) (*Profile, error) {
+ p := &Profile{
+ Period: period * 1000,
+ PeriodType: &ValueType{Type: "cpu", Unit: "nanoseconds"},
+ SampleType: []*ValueType{
+ {Type: "samples", Unit: "count"},
+ {Type: "cpu", Unit: "nanoseconds"},
+ },
+ }
+ var err error
+ if b, _, err = parseCPUSamples(b, parse, true, p); err != nil {
+ return nil, err
+ }
+
+ // If all samples have the same second-to-the-bottom frame, it
+ // strongly suggests that it is an uninteresting artifact of
+ // measurement -- a stack frame pushed by the signal handler. The
+ // bottom frame is always correct as it is picked up from the signal
+ // structure, not the stack. Check if this is the case and if so,
+ // remove.
+ if len(p.Sample) > 1 && len(p.Sample[0].Location) > 1 {
+ allSame := true
+ id1 := p.Sample[0].Location[1].Address
+ for _, s := range p.Sample {
+ if len(s.Location) < 2 || id1 != s.Location[1].Address {
+ allSame = false
+ break
+ }
+ }
+ if allSame {
+ for _, s := range p.Sample {
+ s.Location = append(s.Location[:1], s.Location[2:]...)
+ }
+ }
+ }
+
+ if err := p.ParseMemoryMap(bytes.NewBuffer(b)); err != nil {
+ return nil, err
+ }
+ return p, nil
+}
+
+// parseCPUSamples parses a collection of profilez samples from a
+// profile.
+//
+// profilez samples are a repeated sequence of stack frames of the
+// form:
+// 1st word -- The number of times this stack was encountered.
+// 2nd word -- The size of the stack (StackSize).
+// 3rd word -- The first address on the stack.
+// ...
+// StackSize + 2 -- The last address on the stack
+// The last stack trace is of the form:
+// 1st word -- 0
+// 2nd word -- 1
+// 3rd word -- 0
+//
+// Addresses from stack traces may point to the next instruction after
+// each call. Optionally adjust by -1 to land somewhere on the actual
+// call (except for the leaf, which is not a call).
+func parseCPUSamples(b []byte, parse func(b []byte) (uint64, []byte), adjust bool, p *Profile) ([]byte, map[uint64]*Location, error) {
+ locs := make(map[uint64]*Location)
+ for len(b) > 0 {
+ var count, nstk uint64
+ count, b = parse(b)
+ nstk, b = parse(b)
+ if b == nil || nstk > uint64(len(b)/4) {
+ return nil, nil, errUnrecognized
+ }
+ var sloc []*Location
+ addrs := make([]uint64, nstk)
+ for i := 0; i < int(nstk); i++ {
+ addrs[i], b = parse(b)
+ }
+
+ if count == 0 && nstk == 1 && addrs[0] == 0 {
+ // End of data marker
+ break
+ }
+ for i, addr := range addrs {
+ if adjust && i > 0 {
+ addr--
+ }
+ loc := locs[addr]
+ if loc == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ locs[addr] = loc
+ p.Location = append(p.Location, loc)
+ }
+ sloc = append(sloc, loc)
+ }
+ p.Sample = append(p.Sample,
+ &Sample{
+ Value: []int64{int64(count), int64(count) * p.Period},
+ Location: sloc,
+ })
+ }
+ // Reached the end without finding the EOD marker.
+ return b, locs, nil
+}
+
+// parseHeap parses a heapz legacy or a growthz profile and
+// returns a newly populated Profile.
+func parseHeap(b []byte) (p *Profile, err error) {
+ r := bytes.NewBuffer(b)
+ l, err := r.ReadString('\n')
+ if err != nil {
+ return nil, errUnrecognized
+ }
+
+ sampling := ""
+
+ if header := heapHeaderRE.FindStringSubmatch(l); header != nil {
+ p = &Profile{
+ SampleType: []*ValueType{
+ {Type: "objects", Unit: "count"},
+ {Type: "space", Unit: "bytes"},
+ },
+ PeriodType: &ValueType{Type: "objects", Unit: "bytes"},
+ }
+
+ var period int64
+ if len(header[6]) > 0 {
+ if period, err = strconv.ParseInt(header[6], 10, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ }
+
+ switch header[5] {
+ case "heapz_v2", "heap_v2":
+ sampling, p.Period = "v2", period
+ case "heapprofile":
+ sampling, p.Period = "", 1
+ case "heap":
+ sampling, p.Period = "v2", period/2
+ default:
+ return nil, errUnrecognized
+ }
+ } else if header = growthHeaderRE.FindStringSubmatch(l); header != nil {
+ p = &Profile{
+ SampleType: []*ValueType{
+ {Type: "objects", Unit: "count"},
+ {Type: "space", Unit: "bytes"},
+ },
+ PeriodType: &ValueType{Type: "heapgrowth", Unit: "count"},
+ Period: 1,
+ }
+ } else if header = fragmentationHeaderRE.FindStringSubmatch(l); header != nil {
+ p = &Profile{
+ SampleType: []*ValueType{
+ {Type: "objects", Unit: "count"},
+ {Type: "space", Unit: "bytes"},
+ },
+ PeriodType: &ValueType{Type: "allocations", Unit: "count"},
+ Period: 1,
+ }
+ } else {
+ return nil, errUnrecognized
+ }
+
+ if LegacyHeapAllocated {
+ for _, st := range p.SampleType {
+ st.Type = "alloc_" + st.Type
+ }
+ } else {
+ for _, st := range p.SampleType {
+ st.Type = "inuse_" + st.Type
+ }
+ }
+
+ locs := make(map[uint64]*Location)
+ for {
+ l, err = r.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+
+ if l == "" {
+ break
+ }
+ }
+
+ if isSpaceOrComment(l) {
+ continue
+ }
+ l = strings.TrimSpace(l)
+
+ if sectionTrigger(l) != unrecognizedSection {
+ break
+ }
+
+ value, blocksize, addrs, err := parseHeapSample(l, p.Period, sampling)
+ if err != nil {
+ return nil, err
+ }
+ var sloc []*Location
+ for _, addr := range addrs {
+ // Addresses from stack traces point to the next instruction after
+ // each call. Adjust by -1 to land somewhere on the actual call.
+ addr--
+ loc := locs[addr]
+ if locs[addr] == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ p.Location = append(p.Location, loc)
+ locs[addr] = loc
+ }
+ sloc = append(sloc, loc)
+ }
+
+ p.Sample = append(p.Sample, &Sample{
+ Value: value,
+ Location: sloc,
+ NumLabel: map[string][]int64{"bytes": {blocksize}},
+ })
+ }
+
+ if err = parseAdditionalSections(l, r, p); err != nil {
+ return nil, err
+ }
+ return p, nil
+}
+
+// parseHeapSample parses a single row from a heap profile into a new Sample.
+func parseHeapSample(line string, rate int64, sampling string) (value []int64, blocksize int64, addrs []uint64, err error) {
+ sampleData := heapSampleRE.FindStringSubmatch(line)
+ if len(sampleData) != 6 {
+ return value, blocksize, addrs, fmt.Errorf("unexpected number of sample values: got %d, want 6", len(sampleData))
+ }
+
+ // Use the first two values by default; tcmalloc sampling generates the
+ // same value for both, and only the older heap-profile format collects
+ // separate stats for in-use and allocated objects.
+ valueIndex := 1
+ if LegacyHeapAllocated {
+ valueIndex = 3
+ }
+
+ var v1, v2 int64
+ if v1, err = strconv.ParseInt(sampleData[valueIndex], 10, 64); err != nil {
+ return value, blocksize, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
+ }
+ if v2, err = strconv.ParseInt(sampleData[valueIndex+1], 10, 64); err != nil {
+ return value, blocksize, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
+ }
+
+ if v1 == 0 {
+ if v2 != 0 {
+ return value, blocksize, addrs, fmt.Errorf("allocation count was 0 but allocation bytes was %d", v2)
+ }
+ } else {
+ blocksize = v2 / v1
+ if sampling == "v2" {
+ v1, v2 = scaleHeapSample(v1, v2, rate)
+ }
+ }
+
+ value = []int64{v1, v2}
+ addrs = parseHexAddresses(sampleData[5])
+
+ return value, blocksize, addrs, nil
+}
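+
+// exampleHeapSample parses one heapz v2 row in the form matched by
+// heapSampleRE above, using a 524288-byte sampling rate as the period
+// (the counts and addresses are illustrative). With LegacyHeapAllocated
+// left false, the first pair of columns (the in-use stats) is used.
+func exampleHeapSample() (value []int64, blocksize int64, addrs []uint64, err error) {
+ return parseHeapSample("1: 262144 [4: 376832] @ 0x42e15a 0x42e061", 524288, "v2")
+}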
+
+// extractHexAddresses extracts hex numbers from a string and returns
+// them both as the matched strings and as their parsed numeric values.
+func extractHexAddresses(s string) ([]string, []uint64) {
+ hexStrings := hexNumberRE.FindAllString(s, -1)
+ var ids []uint64
+ for _, s := range hexStrings {
+ if id, err := strconv.ParseUint(s, 0, 64); err == nil {
+ ids = append(ids, id)
+ } else {
+ // Do not expect any parsing failures due to the regexp matching.
+ panic("failed to parse hex value:" + s)
+ }
+ }
+ return hexStrings, ids
+}
+
+// parseHexAddresses parses hex numbers from a string and returns them
+// in a slice.
+func parseHexAddresses(s string) []uint64 {
+ _, ids := extractHexAddresses(s)
+ return ids
+}
+
+// scaleHeapSample adjusts the data from a heapz Sample to
+// account for its probability of appearing in the collected
+// data. heapz profiles are a sampling of the memory allocation
+// requests in a program. We estimate the unsampled value by dividing
+// each collected sample by its probability of appearing in the
+// profile. heapz v2 profiles rely on a Poisson process to determine
+// which samples to collect, based on the desired average collection
+// rate R. The probability of a sample of size S to appear in that
+// profile is 1-exp(-S/R).
+func scaleHeapSample(count, size, rate int64) (int64, int64) {
+ if count == 0 || size == 0 {
+ return 0, 0
+ }
+
+ if rate <= 1 {
+ // if rate==1 all samples were collected so no adjustment is needed.
+ // if rate<1 treat as unknown and skip scaling.
+ return count, size
+ }
+
+ avgSize := float64(size) / float64(count)
+ scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
+
+ return int64(float64(count) * scale), int64(float64(size) * scale)
+}
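+
+// As a concrete instance of the scaling above: with an average sampling
+// rate of 524288 bytes and a single sampled allocation of 262144 bytes,
+// avgSize/rate is 0.5, so the scale factor is 1/(1-exp(-0.5)), roughly
+// 2.54, and the sample is reported as 2 objects and roughly 666 kB once
+// truncated to int64.
+func exampleScaleHeapSample() (int64, int64) {
+ return scaleHeapSample(1, 262144, 524288)
+}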
+
+// parseContention parses a mutex or contention profile. There are 2 cases:
+// "--- contentionz " for legacy C++ profiles (and backwards compatibility)
+// "--- mutex:" or "--- contention:" for profiles generated by the Go runtime.
+// This code converts the text output from runtime into a *Profile. (In the future
+// the runtime might write a serialized Profile directly, making this unnecessary.)
+func parseContention(b []byte) (*Profile, error) {
+ r := bytes.NewBuffer(b)
+ var l string
+ var err error
+ for {
+ // Skip past comments and empty lines seeking a real header.
+ l, err = r.ReadString('\n')
+ if err != nil {
+ return nil, err
+ }
+ if !isSpaceOrComment(l) {
+ break
+ }
+ }
+
+ if strings.HasPrefix(l, "--- contentionz ") {
+ return parseCppContention(r)
+ } else if strings.HasPrefix(l, "--- mutex:") {
+ return parseCppContention(r)
+ } else if strings.HasPrefix(l, "--- contention:") {
+ return parseCppContention(r)
+ }
+ return nil, errUnrecognized
+}
+
+// parseCppContention parses the output from synchronization_profiling.cc
+// for backward compatibility, and the compatible (non-debug) block profile
+// output from the Go runtime.
+func parseCppContention(r *bytes.Buffer) (*Profile, error) {
+ p := &Profile{
+ PeriodType: &ValueType{Type: "contentions", Unit: "count"},
+ Period: 1,
+ SampleType: []*ValueType{
+ {Type: "contentions", Unit: "count"},
+ {Type: "delay", Unit: "nanoseconds"},
+ },
+ }
+
+ var cpuHz int64
+ var l string
+ var err error
+ // Parse text of the form "attribute = value" before the samples.
+ const delimiter = "="
+ for {
+ l, err = r.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+
+ if l == "" {
+ break
+ }
+ }
+ if isSpaceOrComment(l) {
+ continue
+ }
+
+ if l = strings.TrimSpace(l); l == "" {
+ continue
+ }
+
+ if strings.HasPrefix(l, "---") {
+ break
+ }
+
+ attr := strings.SplitN(l, delimiter, 2)
+ if len(attr) != 2 {
+ break
+ }
+ key, val := strings.TrimSpace(attr[0]), strings.TrimSpace(attr[1])
+ var err error
+ switch key {
+ case "cycles/second":
+ if cpuHz, err = strconv.ParseInt(val, 0, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ case "sampling period":
+ if p.Period, err = strconv.ParseInt(val, 0, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ case "ms since reset":
+ ms, err := strconv.ParseInt(val, 0, 64)
+ if err != nil {
+ return nil, errUnrecognized
+ }
+ p.DurationNanos = ms * 1000 * 1000
+ case "format":
+ // CPP contentionz profiles don't have format.
+ return nil, errUnrecognized
+ case "resolution":
+ // CPP contentionz profiles don't have resolution.
+ return nil, errUnrecognized
+ case "discarded samples":
+ default:
+ return nil, errUnrecognized
+ }
+ }
+
+ locs := make(map[uint64]*Location)
+ for {
+ if !isSpaceOrComment(l) {
+ if l = strings.TrimSpace(l); strings.HasPrefix(l, "---") {
+ break
+ }
+ value, addrs, err := parseContentionSample(l, p.Period, cpuHz)
+ if err != nil {
+ return nil, err
+ }
+ var sloc []*Location
+ for _, addr := range addrs {
+ // Addresses from stack traces point to the next instruction after
+ // each call. Adjust by -1 to land somewhere on the actual call.
+ addr--
+ loc := locs[addr]
+ if locs[addr] == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ p.Location = append(p.Location, loc)
+ locs[addr] = loc
+ }
+ sloc = append(sloc, loc)
+ }
+ p.Sample = append(p.Sample, &Sample{
+ Value: value,
+ Location: sloc,
+ })
+ }
+
+ if l, err = r.ReadString('\n'); err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+ if l == "" {
+ break
+ }
+ }
+ }
+
+ if err = parseAdditionalSections(l, r, p); err != nil {
+ return nil, err
+ }
+
+ return p, nil
+}
+
+// parseContentionSample parses a single row from a contention profile
+// into a new Sample.
+func parseContentionSample(line string, period, cpuHz int64) (value []int64, addrs []uint64, err error) {
+ sampleData := contentionSampleRE.FindStringSubmatch(line)
+ if sampleData == nil {
+ return value, addrs, errUnrecognized
+ }
+
+ v1, err := strconv.ParseInt(sampleData[1], 10, 64)
+ if err != nil {
+ return value, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
+ }
+ v2, err := strconv.ParseInt(sampleData[2], 10, 64)
+ if err != nil {
+ return value, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
+ }
+
+ // Unsample values if period and cpuHz are available.
+ // - Delays are scaled to cycles and then to nanoseconds.
+ // - Contentions are scaled to cycles.
+ if period > 0 {
+ if cpuHz > 0 {
+ cpuGHz := float64(cpuHz) / 1e9
+ v1 = int64(float64(v1) * float64(period) / cpuGHz)
+ }
+ v2 = v2 * period
+ }
+
+ value = []int64{v2, v1}
+ addrs = parseHexAddresses(sampleData[3])
+
+ return value, addrs, nil
+}
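+
+// A worked instance of the unsampling above: with cycles/second = 2e9
+// (cpuGHz = 2.0) and a sampling period of 100, the row
+// "1000 5 @ 0x42e15a" (illustrative address) unsamples to a delay of
+// 1000*100/2.0 = 50000 nanoseconds and 5*100 = 500 contentions, returned
+// as []int64{500, 50000} to line up with the contentions/delay sample types.
+func exampleContentionSample() ([]int64, []uint64, error) {
+ return parseContentionSample("1000 5 @ 0x42e15a", 100, 2000000000)
+}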
+
+// parseThread parses a Threadz profile and returns a new Profile.
+func parseThread(b []byte) (*Profile, error) {
+ r := bytes.NewBuffer(b)
+
+ var line string
+ var err error
+ for {
+ // Skip past comments and empty lines seeking a real header.
+ line, err = r.ReadString('\n')
+ if err != nil {
+ return nil, err
+ }
+ if !isSpaceOrComment(line) {
+ break
+ }
+ }
+
+ if m := threadzStartRE.FindStringSubmatch(line); m != nil {
+ // Advance over initial comments until first stack trace.
+ for {
+ line, err = r.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+
+ if line == "" {
+ break
+ }
+ }
+ if sectionTrigger(line) != unrecognizedSection || line[0] == '-' {
+ break
+ }
+ }
+ } else if t := threadStartRE.FindStringSubmatch(line); len(t) != 4 {
+ return nil, errUnrecognized
+ }
+
+ p := &Profile{
+ SampleType: []*ValueType{{Type: "thread", Unit: "count"}},
+ PeriodType: &ValueType{Type: "thread", Unit: "count"},
+ Period: 1,
+ }
+
+ locs := make(map[uint64]*Location)
+ // Recognize each thread and populate profile samples.
+ for sectionTrigger(line) == unrecognizedSection {
+ if strings.HasPrefix(line, "---- no stack trace for") {
+ line = ""
+ break
+ }
+ if t := threadStartRE.FindStringSubmatch(line); len(t) != 4 {
+ return nil, errUnrecognized
+ }
+
+ var addrs []uint64
+ line, addrs, err = parseThreadSample(r)
+ if err != nil {
+ return nil, errUnrecognized
+ }
+ if len(addrs) == 0 {
+ // We got a --same as previous threads--. Bump counters.
+ if len(p.Sample) > 0 {
+ s := p.Sample[len(p.Sample)-1]
+ s.Value[0]++
+ }
+ continue
+ }
+
+ var sloc []*Location
+ for _, addr := range addrs {
+ // Addresses from stack traces point to the next instruction after
+ // each call. Adjust by -1 to land somewhere on the actual call.
+ addr--
+ loc := locs[addr]
+ if locs[addr] == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ p.Location = append(p.Location, loc)
+ locs[addr] = loc
+ }
+ sloc = append(sloc, loc)
+ }
+
+ p.Sample = append(p.Sample, &Sample{
+ Value: []int64{1},
+ Location: sloc,
+ })
+ }
+
+ if err = parseAdditionalSections(line, r, p); err != nil {
+ return nil, err
+ }
+
+ return p, nil
+}
+
+// parseThreadSample parses a symbolized or unsymbolized stack trace.
+// Returns the first line after the traceback, the sample (or nil if
+// it hits a 'same-as-previous' marker) and an error.
+func parseThreadSample(b *bytes.Buffer) (nextl string, addrs []uint64, err error) {
+ var l string
+ sameAsPrevious := false
+ for {
+ if l, err = b.ReadString('\n'); err != nil {
+ if err != io.EOF {
+ return "", nil, err
+ }
+ if l == "" {
+ break
+ }
+ }
+ if l = strings.TrimSpace(l); l == "" {
+ continue
+ }
+
+ if strings.HasPrefix(l, "---") {
+ break
+ }
+ if strings.Contains(l, "same as previous thread") {
+ sameAsPrevious = true
+ continue
+ }
+
+ addrs = append(addrs, parseHexAddresses(l)...)
+ }
+
+ if sameAsPrevious {
+ return l, nil, nil
+ }
+ return l, addrs, nil
+}
+
+// parseAdditionalSections parses any additional sections in the
+// profile, ignoring any unrecognized sections.
+func parseAdditionalSections(l string, b *bytes.Buffer, p *Profile) (err error) {
+ for {
+ if sectionTrigger(l) == memoryMapSection {
+ break
+ }
+ // Ignore any unrecognized sections.
+ if l, err = b.ReadString('\n'); err != nil {
+ if err != io.EOF {
+ return err
+ }
+ if l == "" {
+ break
+ }
+ }
+ }
+ return p.ParseMemoryMap(b)
+}
+
+// ParseMemoryMap parses a memory map in the format of
+// /proc/self/maps, and overrides the mappings in the current profile.
+// It renumbers the samples and locations in the profile correspondingly.
+func (p *Profile) ParseMemoryMap(rd io.Reader) error {
+ b := bufio.NewReader(rd)
+
+ var attrs []string
+ var r *strings.Replacer
+ const delimiter = "="
+ for {
+ l, err := b.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return err
+ }
+ if l == "" {
+ break
+ }
+ }
+ if l = strings.TrimSpace(l); l == "" {
+ continue
+ }
+
+ if r != nil {
+ l = r.Replace(l)
+ }
+ m, err := parseMappingEntry(l)
+ if err != nil {
+ if err == errUnrecognized {
+ // Recognize assignments of the form: attr=value, and replace
+ // $attr with value on subsequent mappings.
+ if attr := strings.SplitN(l, delimiter, 2); len(attr) == 2 {
+ attrs = append(attrs, "$"+strings.TrimSpace(attr[0]), strings.TrimSpace(attr[1]))
+ r = strings.NewReplacer(attrs...)
+ }
+ // Ignore any unrecognized entries
+ continue
+ }
+ return err
+ }
+ if m == nil || (m.File == "" && len(p.Mapping) != 0) {
+ // In some cases the first entry may include the address range
+ // but not the name of the file. It should be followed by
+ // another entry with the name.
+ continue
+ }
+ if len(p.Mapping) == 1 && p.Mapping[0].File == "" {
+ // Update the name if this is the entry following that empty one.
+ p.Mapping[0].File = m.File
+ continue
+ }
+ p.Mapping = append(p.Mapping, m)
+ }
+ p.remapLocationIDs()
+ p.remapFunctionIDs()
+ p.remapMappingIDs()
+ return nil
+}
+
+func parseMappingEntry(l string) (*Mapping, error) {
+ mapping := &Mapping{}
+ var err error
+ if me := procMapsRE.FindStringSubmatch(l); len(me) == 9 {
+ if !strings.Contains(me[3], "x") {
+ // Skip non-executable entries.
+ return nil, nil
+ }
+ if mapping.Start, err = strconv.ParseUint(me[1], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ if mapping.Limit, err = strconv.ParseUint(me[2], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ if me[4] != "" {
+ if mapping.Offset, err = strconv.ParseUint(me[4], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ }
+ mapping.File = me[8]
+ return mapping, nil
+ }
+
+ if me := briefMapsRE.FindStringSubmatch(l); len(me) == 6 {
+ if mapping.Start, err = strconv.ParseUint(me[1], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ if mapping.Limit, err = strconv.ParseUint(me[2], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ mapping.File = me[3]
+ if me[5] != "" {
+ if mapping.Offset, err = strconv.ParseUint(me[5], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ }
+ return mapping, nil
+ }
+
+ return nil, errUnrecognized
+}
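+
+// exampleMappingEntry parses an executable /proc/self/maps row in the long
+// form matched by procMapsRE (the addresses and path are illustrative).
+// Rows without the "x" permission are skipped above by returning a nil
+// mapping and nil error.
+func exampleMappingEntry() (*Mapping, error) {
+ return parseMappingEntry("00400000-00463000 r-xp 00000000 fd:01 123 /usr/local/bin/app")
+}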
+
+type sectionType int
+
+const (
+ unrecognizedSection sectionType = iota
+ memoryMapSection
+)
+
+var memoryMapTriggers = []string{
+ "--- Memory map: ---",
+ "MAPPED_LIBRARIES:",
+}
+
+func sectionTrigger(line string) sectionType {
+ for _, trigger := range memoryMapTriggers {
+ if strings.Contains(line, trigger) {
+ return memoryMapSection
+ }
+ }
+ return unrecognizedSection
+}
+
+func (p *Profile) addLegacyFrameInfo() {
+ switch {
+ case isProfileType(p, heapzSampleTypes) ||
+ isProfileType(p, heapzInUseSampleTypes) ||
+ isProfileType(p, heapzAllocSampleTypes):
+ p.DropFrames, p.KeepFrames = allocRxStr, allocSkipRxStr
+ case isProfileType(p, contentionzSampleTypes):
+ p.DropFrames, p.KeepFrames = lockRxStr, ""
+ default:
+ p.DropFrames, p.KeepFrames = cpuProfilerRxStr, ""
+ }
+}
+
+var heapzSampleTypes = []string{"allocations", "size"} // early Go pprof profiles
+var heapzInUseSampleTypes = []string{"inuse_objects", "inuse_space"}
+var heapzAllocSampleTypes = []string{"alloc_objects", "alloc_space"}
+var contentionzSampleTypes = []string{"contentions", "delay"}
+
+func isProfileType(p *Profile, t []string) bool {
+ st := p.SampleType
+ if len(st) != len(t) {
+ return false
+ }
+
+ for i := range st {
+ if st[i].Type != t[i] {
+ return false
+ }
+ }
+ return true
+}
+
+var allocRxStr = strings.Join([]string{
+ // POSIX entry points.
+ `calloc`,
+ `cfree`,
+ `malloc`,
+ `free`,
+ `memalign`,
+ `do_memalign`,
+ `(__)?posix_memalign`,
+ `pvalloc`,
+ `valloc`,
+ `realloc`,
+
+ // TC malloc.
+ `tcmalloc::.*`,
+ `tc_calloc`,
+ `tc_cfree`,
+ `tc_malloc`,
+ `tc_free`,
+ `tc_memalign`,
+ `tc_posix_memalign`,
+ `tc_pvalloc`,
+ `tc_valloc`,
+ `tc_realloc`,
+ `tc_new`,
+ `tc_delete`,
+ `tc_newarray`,
+ `tc_deletearray`,
+ `tc_new_nothrow`,
+ `tc_newarray_nothrow`,
+
+ // Memory-allocation routines on OS X.
+ `malloc_zone_malloc`,
+ `malloc_zone_calloc`,
+ `malloc_zone_valloc`,
+ `malloc_zone_realloc`,
+ `malloc_zone_memalign`,
+ `malloc_zone_free`,
+
+ // Go runtime
+ `runtime\..*`,
+
+ // Other misc. memory allocation routines
+ `BaseArena::.*`,
+ `(::)?do_malloc_no_errno`,
+ `(::)?do_malloc_pages`,
+ `(::)?do_malloc`,
+ `DoSampledAllocation`,
+ `MallocedMemBlock::MallocedMemBlock`,
+ `_M_allocate`,
+ `__builtin_(vec_)?delete`,
+ `__builtin_(vec_)?new`,
+ `__gnu_cxx::new_allocator::allocate`,
+ `__libc_malloc`,
+ `__malloc_alloc_template::allocate`,
+ `allocate`,
+ `cpp_alloc`,
+ `operator new(\[\])?`,
+ `simple_alloc::allocate`,
+}, `|`)
+
+var allocSkipRxStr = strings.Join([]string{
+ // Preserve Go runtime frames that appear in the middle/bottom of
+ // the stack.
+ `runtime\.panic`,
+ `runtime\.reflectcall`,
+ `runtime\.call[0-9]*`,
+}, `|`)
+
+var cpuProfilerRxStr = strings.Join([]string{
+ `ProfileData::Add`,
+ `ProfileData::prof_handler`,
+ `CpuProfiler::prof_handler`,
+ `__pthread_sighandler`,
+ `__restore`,
+}, `|`)
+
+var lockRxStr = strings.Join([]string{
+ `RecordLockProfileData`,
+ `(base::)?RecordLockProfileData.*`,
+ `(base::)?SubmitMutexProfileData.*`,
+ `(base::)?SubmitSpinLockProfileData.*`,
+ `(Mutex::)?AwaitCommon.*`,
+ `(Mutex::)?Unlock.*`,
+ `(Mutex::)?UnlockSlow.*`,
+ `(Mutex::)?ReaderUnlock.*`,
+ `(MutexLock::)?~MutexLock.*`,
+ `(SpinLock::)?Unlock.*`,
+ `(SpinLock::)?SlowUnlock.*`,
+ `(SpinLockHolder::)?~SpinLockHolder.*`,
+}, `|`)
diff --git a/libgo/go/runtime/pprof/internal/profile/profile.go b/libgo/go/runtime/pprof/internal/profile/profile.go
new file mode 100644
index 0000000..9b6a6f9
--- /dev/null
+++ b/libgo/go/runtime/pprof/internal/profile/profile.go
@@ -0,0 +1,575 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package profile provides a representation of profile.proto and
+// methods to encode/decode profiles in this format.
+//
+// This package is only for testing runtime/pprof.
+// It is not used by production Go programs.
+package profile
+
+import (
+ "bytes"
+ "compress/gzip"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "regexp"
+ "strings"
+ "time"
+)
+
+// Profile is an in-memory representation of profile.proto.
+type Profile struct {
+ SampleType []*ValueType
+ Sample []*Sample
+ Mapping []*Mapping
+ Location []*Location
+ Function []*Function
+
+ DropFrames string
+ KeepFrames string
+
+ TimeNanos int64
+ DurationNanos int64
+ PeriodType *ValueType
+ Period int64
+
+ dropFramesX int64
+ keepFramesX int64
+ stringTable []string
+}
+
+// ValueType corresponds to Profile.ValueType
+type ValueType struct {
+ Type string // cpu, wall, inuse_space, etc
+ Unit string // seconds, nanoseconds, bytes, etc
+
+ typeX int64
+ unitX int64
+}
+
+// Sample corresponds to Profile.Sample
+type Sample struct {
+ Location []*Location
+ Value []int64
+ Label map[string][]string
+ NumLabel map[string][]int64
+
+ locationIDX []uint64
+ labelX []Label
+}
+
+// Label corresponds to Profile.Label
+type Label struct {
+ keyX int64
+ // Exactly one of the two following values must be set
+ strX int64
+ numX int64 // Integer value for this label
+}
+
+// Mapping corresponds to Profile.Mapping
+type Mapping struct {
+ ID uint64
+ Start uint64
+ Limit uint64
+ Offset uint64
+ File string
+ BuildID string
+ HasFunctions bool
+ HasFilenames bool
+ HasLineNumbers bool
+ HasInlineFrames bool
+
+ fileX int64
+ buildIDX int64
+}
+
+// Location corresponds to Profile.Location
+type Location struct {
+ ID uint64
+ Mapping *Mapping
+ Address uint64
+ Line []Line
+
+ mappingIDX uint64
+}
+
+// Line corresponds to Profile.Line
+type Line struct {
+ Function *Function
+ Line int64
+
+ functionIDX uint64
+}
+
+// Function corresponds to Profile.Function
+type Function struct {
+ ID uint64
+ Name string
+ SystemName string
+ Filename string
+ StartLine int64
+
+ nameX int64
+ systemNameX int64
+ filenameX int64
+}
+
+// Parse parses a profile and checks for its validity. The input
+// may be a gzip-compressed encoded protobuf or one of many legacy
+// profile formats which may be unsupported in the future.
+func Parse(r io.Reader) (*Profile, error) {
+ orig, err := ioutil.ReadAll(r)
+ if err != nil {
+ return nil, err
+ }
+
+ var p *Profile
+ if len(orig) >= 2 && orig[0] == 0x1f && orig[1] == 0x8b {
+ gz, err := gzip.NewReader(bytes.NewBuffer(orig))
+ if err != nil {
+ return nil, fmt.Errorf("decompressing profile: %v", err)
+ }
+ data, err := ioutil.ReadAll(gz)
+ if err != nil {
+ return nil, fmt.Errorf("decompressing profile: %v", err)
+ }
+ orig = data
+ }
+ if p, err = parseUncompressed(orig); err != nil {
+ if p, err = parseLegacy(orig); err != nil {
+ return nil, fmt.Errorf("parsing profile: %v", err)
+ }
+ }
+
+ if err := p.CheckValid(); err != nil {
+ return nil, fmt.Errorf("malformed profile: %v", err)
+ }
+ return p, nil
+}
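+// An illustrative use of Parse (imports of os and fmt elided; the file name
+// is hypothetical):
+//
+//	f, err := os.Open("cpu.pprof")
+//	if err != nil {
+//		// handle error
+//	}
+//	defer f.Close()
+//	p, err := Parse(f)
+//	if err != nil {
+//		// handle error
+//	}
+//	fmt.Print(p.String())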
+
+var errUnrecognized = fmt.Errorf("unrecognized profile format")
+var errMalformed = fmt.Errorf("malformed profile format")
+
+func parseLegacy(data []byte) (*Profile, error) {
+ parsers := []func([]byte) (*Profile, error){
+ parseCPU,
+ parseHeap,
+ parseGoCount, // goroutine, threadcreate
+ parseThread,
+ parseContention,
+ }
+
+ for _, parser := range parsers {
+ p, err := parser(data)
+ if err == nil {
+ p.setMain()
+ p.addLegacyFrameInfo()
+ return p, nil
+ }
+ if err != errUnrecognized {
+ return nil, err
+ }
+ }
+ return nil, errUnrecognized
+}
+
+func parseUncompressed(data []byte) (*Profile, error) {
+ p := &Profile{}
+ if err := unmarshal(data, p); err != nil {
+ return nil, err
+ }
+
+ if err := p.postDecode(); err != nil {
+ return nil, err
+ }
+
+ return p, nil
+}
+
+var libRx = regexp.MustCompile(`([.]so$|[.]so[._][0-9]+)`)
+
+// setMain scans Mapping entries and guesses which entry is main
+// because legacy profiles don't obey the convention of putting main
+// first.
+func (p *Profile) setMain() {
+ for i := 0; i < len(p.Mapping); i++ {
+ file := strings.TrimSpace(strings.Replace(p.Mapping[i].File, "(deleted)", "", -1))
+ if len(file) == 0 {
+ continue
+ }
+ if len(libRx.FindStringSubmatch(file)) > 0 {
+ continue
+ }
+ if strings.HasPrefix(file, "[") {
+ continue
+ }
+ // Swap what we guess is main to position 0.
+ tmp := p.Mapping[i]
+ p.Mapping[i] = p.Mapping[0]
+ p.Mapping[0] = tmp
+ break
+ }
+}
+
+// Write writes the profile as a gzip-compressed marshaled protobuf.
+func (p *Profile) Write(w io.Writer) error {
+ p.preEncode()
+ b := marshal(p)
+ zw := gzip.NewWriter(w)
+ defer zw.Close()
+ _, err := zw.Write(b)
+ return err
+}
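+// A profile written by Write can be read back with Parse, which transparently
+// handles the gzip layer, so Write/Parse round-trip an in-memory Profile.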
+
+// CheckValid tests whether the profile is valid. Checks include, but are
+// not limited to:
+// - len(Profile.Sample[n].value) == len(Profile.value_unit)
+// - Sample.id has a corresponding Profile.Location
+func (p *Profile) CheckValid() error {
+ // Check that sample values are consistent
+ sampleLen := len(p.SampleType)
+ if sampleLen == 0 && len(p.Sample) != 0 {
+ return fmt.Errorf("missing sample type information")
+ }
+ for _, s := range p.Sample {
+ if len(s.Value) != sampleLen {
+ return fmt.Errorf("mismatch: sample has: %d values vs. %d types", len(s.Value), len(p.SampleType))
+ }
+ }
+
+ // Check that all mappings/locations/functions are in the tables
+ // Check that there are no duplicate ids
+ mappings := make(map[uint64]*Mapping, len(p.Mapping))
+ for _, m := range p.Mapping {
+ if m.ID == 0 {
+ return fmt.Errorf("found mapping with reserved ID=0")
+ }
+ if mappings[m.ID] != nil {
+ return fmt.Errorf("multiple mappings with same id: %d", m.ID)
+ }
+ mappings[m.ID] = m
+ }
+ functions := make(map[uint64]*Function, len(p.Function))
+ for _, f := range p.Function {
+ if f.ID == 0 {
+ return fmt.Errorf("found function with reserved ID=0")
+ }
+ if functions[f.ID] != nil {
+ return fmt.Errorf("multiple functions with same id: %d", f.ID)
+ }
+ functions[f.ID] = f
+ }
+ locations := make(map[uint64]*Location, len(p.Location))
+ for _, l := range p.Location {
+ if l.ID == 0 {
+ return fmt.Errorf("found location with reserved id=0")
+ }
+ if locations[l.ID] != nil {
+ return fmt.Errorf("multiple locations with same id: %d", l.ID)
+ }
+ locations[l.ID] = l
+ if m := l.Mapping; m != nil {
+ if m.ID == 0 || mappings[m.ID] != m {
+ return fmt.Errorf("inconsistent mapping %p: %d", m, m.ID)
+ }
+ }
+ for _, ln := range l.Line {
+ if f := ln.Function; f != nil {
+ if f.ID == 0 || functions[f.ID] != f {
+ return fmt.Errorf("inconsistent function %p: %d", f, f.ID)
+ }
+ }
+ }
+ }
+ return nil
+}
+
+// Aggregate merges the locations in the profile into equivalence
+// classes preserving the request attributes. It also updates the
+// samples to point to the merged locations.
+func (p *Profile) Aggregate(inlineFrame, function, filename, linenumber, address bool) error {
+ for _, m := range p.Mapping {
+ m.HasInlineFrames = m.HasInlineFrames && inlineFrame
+ m.HasFunctions = m.HasFunctions && function
+ m.HasFilenames = m.HasFilenames && filename
+ m.HasLineNumbers = m.HasLineNumbers && linenumber
+ }
+
+ // Aggregate functions
+ if !function || !filename {
+ for _, f := range p.Function {
+ if !function {
+ f.Name = ""
+ f.SystemName = ""
+ }
+ if !filename {
+ f.Filename = ""
+ }
+ }
+ }
+
+ // Aggregate locations
+ if !inlineFrame || !address || !linenumber {
+ for _, l := range p.Location {
+ if !inlineFrame && len(l.Line) > 1 {
+ l.Line = l.Line[len(l.Line)-1:]
+ }
+ if !linenumber {
+ for i := range l.Line {
+ l.Line[i].Line = 0
+ }
+ }
+ if !address {
+ l.Address = 0
+ }
+ }
+ }
+
+ return p.CheckValid()
+}
+
+// String returns a text representation of a profile. Intended mainly
+// for debugging purposes.
+func (p *Profile) String() string {
+
+ ss := make([]string, 0, len(p.Sample)+len(p.Mapping)+len(p.Location))
+ if pt := p.PeriodType; pt != nil {
+ ss = append(ss, fmt.Sprintf("PeriodType: %s %s", pt.Type, pt.Unit))
+ }
+ ss = append(ss, fmt.Sprintf("Period: %d", p.Period))
+ if p.TimeNanos != 0 {
+ ss = append(ss, fmt.Sprintf("Time: %v", time.Unix(0, p.TimeNanos)))
+ }
+ if p.DurationNanos != 0 {
+ ss = append(ss, fmt.Sprintf("Duration: %v", time.Duration(p.DurationNanos)))
+ }
+
+ ss = append(ss, "Samples:")
+ var sh1 string
+ for _, s := range p.SampleType {
+ sh1 = sh1 + fmt.Sprintf("%s/%s ", s.Type, s.Unit)
+ }
+ ss = append(ss, strings.TrimSpace(sh1))
+ for _, s := range p.Sample {
+ var sv string
+ for _, v := range s.Value {
+ sv = fmt.Sprintf("%s %10d", sv, v)
+ }
+ sv = sv + ": "
+ for _, l := range s.Location {
+ sv = sv + fmt.Sprintf("%d ", l.ID)
+ }
+ ss = append(ss, sv)
+ const labelHeader = " "
+ if len(s.Label) > 0 {
+ ls := labelHeader
+ for k, v := range s.Label {
+ ls = ls + fmt.Sprintf("%s:%v ", k, v)
+ }
+ ss = append(ss, ls)
+ }
+ if len(s.NumLabel) > 0 {
+ ls := labelHeader
+ for k, v := range s.NumLabel {
+ ls = ls + fmt.Sprintf("%s:%v ", k, v)
+ }
+ ss = append(ss, ls)
+ }
+ }
+
+ ss = append(ss, "Locations")
+ for _, l := range p.Location {
+ locStr := fmt.Sprintf("%6d: %#x ", l.ID, l.Address)
+ if m := l.Mapping; m != nil {
+ locStr = locStr + fmt.Sprintf("M=%d ", m.ID)
+ }
+ if len(l.Line) == 0 {
+ ss = append(ss, locStr)
+ }
+ for li := range l.Line {
+ lnStr := "??"
+ if fn := l.Line[li].Function; fn != nil {
+ lnStr = fmt.Sprintf("%s %s:%d s=%d",
+ fn.Name,
+ fn.Filename,
+ l.Line[li].Line,
+ fn.StartLine)
+ if fn.Name != fn.SystemName {
+ lnStr = lnStr + "(" + fn.SystemName + ")"
+ }
+ }
+ ss = append(ss, locStr+lnStr)
+ // Do not print location details past the first line
+ locStr = " "
+ }
+ }
+
+ ss = append(ss, "Mappings")
+ for _, m := range p.Mapping {
+ bits := ""
+ if m.HasFunctions {
+ bits = bits + "[FN]"
+ }
+ if m.HasFilenames {
+ bits = bits + "[FL]"
+ }
+ if m.HasLineNumbers {
+ bits = bits + "[LN]"
+ }
+ if m.HasInlineFrames {
+ bits = bits + "[IN]"
+ }
+ ss = append(ss, fmt.Sprintf("%d: %#x/%#x/%#x %s %s %s",
+ m.ID,
+ m.Start, m.Limit, m.Offset,
+ m.File,
+ m.BuildID,
+ bits))
+ }
+
+ return strings.Join(ss, "\n") + "\n"
+}
+
+// Merge adds profile pb adjusted by ratio r into profile p. Profiles
+// must be compatible (same PeriodType and SampleType).
+// TODO(rsilvera): consider normalizing the profiles based on the
+// total samples collected.
+func (p *Profile) Merge(pb *Profile, r float64) error {
+ if err := p.Compatible(pb); err != nil {
+ return err
+ }
+
+ pb = pb.Copy()
+
+ // Keep the largest of the two periods.
+ if pb.Period > p.Period {
+ p.Period = pb.Period
+ }
+
+ p.DurationNanos += pb.DurationNanos
+
+ p.Mapping = append(p.Mapping, pb.Mapping...)
+ for i, m := range p.Mapping {
+ m.ID = uint64(i + 1)
+ }
+ p.Location = append(p.Location, pb.Location...)
+ for i, l := range p.Location {
+ l.ID = uint64(i + 1)
+ }
+ p.Function = append(p.Function, pb.Function...)
+ for i, f := range p.Function {
+ f.ID = uint64(i + 1)
+ }
+
+ if r != 1.0 {
+ for _, s := range pb.Sample {
+ for i, v := range s.Value {
+ s.Value[i] = int64((float64(v) * r))
+ }
+ }
+ }
+ p.Sample = append(p.Sample, pb.Sample...)
+ return p.CheckValid()
+}
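+// For example, merging a second profile at full weight (an illustrative
+// sketch; p1 and p2 are assumed to be compatible profiles):
+//
+//	if err := p1.Merge(p2, 1.0); err != nil {
+//		// the profiles were incompatible or the result failed CheckValid
+//	}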
+
+// Compatible determines if two profiles can be compared/merged.
+// It returns nil if the profiles are compatible; otherwise it returns
+// an error with details on the incompatibility.
+func (p *Profile) Compatible(pb *Profile) error {
+ if !compatibleValueTypes(p.PeriodType, pb.PeriodType) {
+ return fmt.Errorf("incompatible period types %v and %v", p.PeriodType, pb.PeriodType)
+ }
+
+ if len(p.SampleType) != len(pb.SampleType) {
+ return fmt.Errorf("incompatible sample types %v and %v", p.SampleType, pb.SampleType)
+ }
+
+ for i := range p.SampleType {
+ if !compatibleValueTypes(p.SampleType[i], pb.SampleType[i]) {
+ return fmt.Errorf("incompatible sample types %v and %v", p.SampleType, pb.SampleType)
+ }
+ }
+
+ return nil
+}
+
+// HasFunctions determines if all locations in this profile have
+// symbolized function information.
+func (p *Profile) HasFunctions() bool {
+ for _, l := range p.Location {
+ if l.Mapping == nil || !l.Mapping.HasFunctions {
+ return false
+ }
+ }
+ return true
+}
+
+// HasFileLines determines if all locations in this profile have
+// symbolized file and line number information.
+func (p *Profile) HasFileLines() bool {
+ for _, l := range p.Location {
+ if l.Mapping == nil || (!l.Mapping.HasFilenames || !l.Mapping.HasLineNumbers) {
+ return false
+ }
+ }
+ return true
+}
+
+func compatibleValueTypes(v1, v2 *ValueType) bool {
+ if v1 == nil || v2 == nil {
+ return true // No grounds to disqualify.
+ }
+ return v1.Type == v2.Type && v1.Unit == v2.Unit
+}
+
+// Copy makes a fully independent copy of a profile.
+func (p *Profile) Copy() *Profile {
+ p.preEncode()
+ b := marshal(p)
+
+ pp := &Profile{}
+ if err := unmarshal(b, pp); err != nil {
+ panic(err)
+ }
+ if err := pp.postDecode(); err != nil {
+ panic(err)
+ }
+
+ return pp
+}
+
+// Demangler maps symbol names to a human-readable form. This may
+// include C++ demangling and additional simplification. Names that
+// are not demangled may be missing from the resulting map.
+type Demangler func(name []string) (map[string]string, error)
+
+// Demangle attempts to demangle and optionally simplify any function
+// names referenced in the profile. It works on a best-effort basis:
+// it will silently preserve the original names in case of any errors.
+func (p *Profile) Demangle(d Demangler) error {
+ // Collect names to demangle.
+ var names []string
+ for _, fn := range p.Function {
+ names = append(names, fn.SystemName)
+ }
+
+ // Update profile with demangled names.
+ demangled, err := d(names)
+ if err != nil {
+ return err
+ }
+ for _, fn := range p.Function {
+ if dd, ok := demangled[fn.SystemName]; ok {
+ fn.Name = dd
+ }
+ }
+ return nil
+}
+
+// Empty returns true if the profile contains no samples.
+func (p *Profile) Empty() bool {
+ return len(p.Sample) == 0
+}
diff --git a/libgo/go/runtime/pprof/internal/profile/profile_test.go b/libgo/go/runtime/pprof/internal/profile/profile_test.go
new file mode 100644
index 0000000..e1963f3
--- /dev/null
+++ b/libgo/go/runtime/pprof/internal/profile/profile_test.go
@@ -0,0 +1,79 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package profile
+
+import (
+ "bytes"
+ "testing"
+)
+
+func TestEmptyProfile(t *testing.T) {
+ var buf bytes.Buffer
+ p, err := Parse(&buf)
+ if err != nil {
+ t.Error("Want no error, got", err)
+ }
+ if p == nil {
+ t.Fatal("Want a valid profile, got <nil>")
+ }
+ if !p.Empty() {
+ t.Errorf("Profile should be empty, got %#v", p)
+ }
+}
+
+func TestParseContention(t *testing.T) {
+ tests := []struct {
+ name string
+ in string
+ wantErr bool
+ }{
+ {
+ name: "valid",
+ in: `--- mutex:
+cycles/second=3491920901
+sampling period=1
+43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31
+34035731690 15760 @ 0x45e851 0x45f764 0x4a2b17 0x44ea31
+`,
+ },
+ {
+ name: "valid with comment",
+ in: `--- mutex:
+cycles/second=3491920901
+sampling period=1
+43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31
+# 0x45e850 sync.(*Mutex).Unlock+0x80 /go/src/sync/mutex.go:126
+# 0x45f763 sync.(*RWMutex).Unlock+0x83 /go/src/sync/rwmutex.go:125
+# 0x4a2be0 main.main.func3+0x70 /go/src/internal/pprof/profile/a_binary.go:58
+
+34035731690 15760 @ 0x45e851 0x45f764 0x4a2b17 0x44ea31
+# 0x45e850 sync.(*Mutex).Unlock+0x80 /go/src/sync/mutex.go:126
+# 0x45f763 sync.(*RWMutex).Unlock+0x83 /go/src/sync/rwmutex.go:125
+# 0x4a2b16 main.main.func2+0xd6 /go/src/internal/pprof/profile/a_binary.go:48
+`,
+ },
+ {
+ name: "empty",
+ in: `--- mutex:`,
+ wantErr: true,
+ },
+ {
+ name: "invalid header",
+ in: `--- channel:
+43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31`,
+ wantErr: true,
+ },
+ }
+ for _, tc := range tests {
+ _, err := parseContention([]byte(tc.in))
+ if tc.wantErr && err == nil {
+ t.Errorf("parseContention(%q) succeeded unexpectedly", tc.name)
+ }
+ if !tc.wantErr && err != nil {
+ t.Errorf("parseContention(%q) failed unexpectedly: %v", tc.name, err)
+ }
+ }
+
+}
diff --git a/libgo/go/runtime/pprof/internal/profile/proto.go b/libgo/go/runtime/pprof/internal/profile/proto.go
new file mode 100644
index 0000000..11d7f9f
--- /dev/null
+++ b/libgo/go/runtime/pprof/internal/profile/proto.go
@@ -0,0 +1,360 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file is a simple protocol buffer encoder and decoder.
+//
+// A protocol message must implement the message interface:
+// decoder() []decoder
+// encode(*buffer)
+//
+// The decoder method returns a slice indexed by field number that gives the
+// function to decode that field.
+// The encode method encodes its receiver into the given buffer.
+//
+// The two methods are simple enough to be implemented by hand rather than
+// by using a protocol compiler.
+//
+// See profile.go for examples of messages implementing this interface.
+//
+// There is no support for groups, message sets, or "has" bits.
+
+package profile
+
+import "errors"
+
+type buffer struct {
+ field int
+ typ int
+ u64 uint64
+ data []byte
+ tmp [16]byte
+}
+
+type decoder func(*buffer, message) error
+
+type message interface {
+ decoder() []decoder
+ encode(*buffer)
+}
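+// For a minimal example of a type implementing message, see packedInts in
+// proto_test.go, which encodes and decodes two repeated integer fields.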
+
+func marshal(m message) []byte {
+ var b buffer
+ m.encode(&b)
+ return b.data
+}
+
+func encodeVarint(b *buffer, x uint64) {
+ for x >= 128 {
+ b.data = append(b.data, byte(x)|0x80)
+ x >>= 7
+ }
+ b.data = append(b.data, byte(x))
+}
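+// As a worked example, encodeVarint emits the value 300 as the two bytes
+// 0xAC 0x02: the low seven bits (0x2C) with the continuation bit set,
+// followed by the remaining bits (0x02).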
+
+func encodeLength(b *buffer, tag int, len int) {
+ encodeVarint(b, uint64(tag)<<3|2)
+ encodeVarint(b, uint64(len))
+}
+
+func encodeUint64(b *buffer, tag int, x uint64) {
+ // Append the tag (field number and wire type 0) followed by the value, both as varints.
+ encodeVarint(b, uint64(tag)<<3|0)
+ encodeVarint(b, x)
+}
+
+func encodeUint64s(b *buffer, tag int, x []uint64) {
+ if len(x) > 2 {
+ // Use packed encoding
+ n1 := len(b.data)
+ for _, u := range x {
+ encodeVarint(b, u)
+ }
+ n2 := len(b.data)
+ encodeLength(b, tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ return
+ }
+ for _, u := range x {
+ encodeUint64(b, tag, u)
+ }
+}
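+// The packed case above encodes the values first and the length header
+// second, then rotates the header in front of the values using b.tmp as
+// scratch space. encodeInt64s and encodeMessage use the same trick to avoid
+// a separate length pre-pass.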
+
+func encodeUint64Opt(b *buffer, tag int, x uint64) {
+ if x == 0 {
+ return
+ }
+ encodeUint64(b, tag, x)
+}
+
+func encodeInt64(b *buffer, tag int, x int64) {
+ u := uint64(x)
+ encodeUint64(b, tag, u)
+}
+
+func encodeInt64Opt(b *buffer, tag int, x int64) {
+ if x == 0 {
+ return
+ }
+ encodeInt64(b, tag, x)
+}
+
+func encodeInt64s(b *buffer, tag int, x []int64) {
+ if len(x) > 2 {
+ // Use packed encoding
+ n1 := len(b.data)
+ for _, u := range x {
+ encodeVarint(b, uint64(u))
+ }
+ n2 := len(b.data)
+ encodeLength(b, tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ return
+ }
+ for _, u := range x {
+ encodeInt64(b, tag, u)
+ }
+}
+
+func encodeString(b *buffer, tag int, x string) {
+ encodeLength(b, tag, len(x))
+ b.data = append(b.data, x...)
+}
+
+func encodeStrings(b *buffer, tag int, x []string) {
+ for _, s := range x {
+ encodeString(b, tag, s)
+ }
+}
+
+func encodeStringOpt(b *buffer, tag int, x string) {
+ if x == "" {
+ return
+ }
+ encodeString(b, tag, x)
+}
+
+func encodeBool(b *buffer, tag int, x bool) {
+ if x {
+ encodeUint64(b, tag, 1)
+ } else {
+ encodeUint64(b, tag, 0)
+ }
+}
+
+func encodeBoolOpt(b *buffer, tag int, x bool) {
+ if !x {
+ return
+ }
+ encodeBool(b, tag, x)
+}
+
+func encodeMessage(b *buffer, tag int, m message) {
+ n1 := len(b.data)
+ m.encode(b)
+ n2 := len(b.data)
+ encodeLength(b, tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+}
+
+func unmarshal(data []byte, m message) (err error) {
+ b := buffer{data: data, typ: 2}
+ return decodeMessage(&b, m)
+}
+
+func le64(p []byte) uint64 {
+ return uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 | uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56
+}
+
+func le32(p []byte) uint32 {
+ return uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
+}
+
+func decodeVarint(data []byte) (uint64, []byte, error) {
+ var i int
+ var u uint64
+ for i = 0; ; i++ {
+ if i >= 10 || i >= len(data) {
+ return 0, nil, errors.New("bad varint")
+ }
+ u |= uint64(data[i]&0x7F) << uint(7*i)
+ if data[i]&0x80 == 0 {
+ return u, data[i+1:], nil
+ }
+ }
+}
+
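+// decodeField consumes one key/value pair from data into b. The wire types
+// follow the protocol buffer encoding: 0 is varint, 1 is 64-bit fixed,
+// 2 is length-delimited, and 5 is 32-bit fixed.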
+func decodeField(b *buffer, data []byte) ([]byte, error) {
+ x, data, err := decodeVarint(data)
+ if err != nil {
+ return nil, err
+ }
+ b.field = int(x >> 3)
+ b.typ = int(x & 7)
+ b.data = nil
+ b.u64 = 0
+ switch b.typ {
+ case 0:
+ b.u64, data, err = decodeVarint(data)
+ if err != nil {
+ return nil, err
+ }
+ case 1:
+ if len(data) < 8 {
+ return nil, errors.New("not enough data")
+ }
+ b.u64 = le64(data[:8])
+ data = data[8:]
+ case 2:
+ var n uint64
+ n, data, err = decodeVarint(data)
+ if err != nil {
+ return nil, err
+ }
+ if n > uint64(len(data)) {
+ return nil, errors.New("too much data")
+ }
+ b.data = data[:n]
+ data = data[n:]
+ case 5:
+ if len(data) < 4 {
+ return nil, errors.New("not enough data")
+ }
+ b.u64 = uint64(le32(data[:4]))
+ data = data[4:]
+ default:
+ return nil, errors.New("unknown type: " + string(b.typ))
+ }
+
+ return data, nil
+}
+
+func checkType(b *buffer, typ int) error {
+ if b.typ != typ {
+ return errors.New("type mismatch")
+ }
+ return nil
+}
+
+func decodeMessage(b *buffer, m message) error {
+ if err := checkType(b, 2); err != nil {
+ return err
+ }
+ dec := m.decoder()
+ data := b.data
+ for len(data) > 0 {
+ // pull varint field# + type
+ var err error
+ data, err = decodeField(b, data)
+ if err != nil {
+ return err
+ }
+ if b.field >= len(dec) || dec[b.field] == nil {
+ continue
+ }
+ if err := dec[b.field](b, m); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func decodeInt64(b *buffer, x *int64) error {
+ if err := checkType(b, 0); err != nil {
+ return err
+ }
+ *x = int64(b.u64)
+ return nil
+}
+
+func decodeInt64s(b *buffer, x *[]int64) error {
+ if b.typ == 2 {
+ // Packed encoding
+ data := b.data
+ for len(data) > 0 {
+ var u uint64
+ var err error
+
+ if u, data, err = decodeVarint(data); err != nil {
+ return err
+ }
+ *x = append(*x, int64(u))
+ }
+ return nil
+ }
+ var i int64
+ if err := decodeInt64(b, &i); err != nil {
+ return err
+ }
+ *x = append(*x, i)
+ return nil
+}
+
+func decodeUint64(b *buffer, x *uint64) error {
+ if err := checkType(b, 0); err != nil {
+ return err
+ }
+ *x = b.u64
+ return nil
+}
+
+func decodeUint64s(b *buffer, x *[]uint64) error {
+ if b.typ == 2 {
+ data := b.data
+ // Packed encoding
+ for len(data) > 0 {
+ var u uint64
+ var err error
+
+ if u, data, err = decodeVarint(data); err != nil {
+ return err
+ }
+ *x = append(*x, u)
+ }
+ return nil
+ }
+ var u uint64
+ if err := decodeUint64(b, &u); err != nil {
+ return err
+ }
+ *x = append(*x, u)
+ return nil
+}
+
+func decodeString(b *buffer, x *string) error {
+ if err := checkType(b, 2); err != nil {
+ return err
+ }
+ *x = string(b.data)
+ return nil
+}
+
+func decodeStrings(b *buffer, x *[]string) error {
+ var s string
+ if err := decodeString(b, &s); err != nil {
+ return err
+ }
+ *x = append(*x, s)
+ return nil
+}
+
+func decodeBool(b *buffer, x *bool) error {
+ if err := checkType(b, 0); err != nil {
+ return err
+ }
+ if int64(b.u64) == 0 {
+ *x = false
+ } else {
+ *x = true
+ }
+ return nil
+}
diff --git a/libgo/go/runtime/pprof/internal/profile/proto_test.go b/libgo/go/runtime/pprof/internal/profile/proto_test.go
new file mode 100644
index 0000000..c2613fc
--- /dev/null
+++ b/libgo/go/runtime/pprof/internal/profile/proto_test.go
@@ -0,0 +1,67 @@
+package profile
+
+import (
+ "reflect"
+ "testing"
+)
+
+func TestPackedEncoding(t *testing.T) {
+
+ type testcase struct {
+ uint64s []uint64
+ int64s []int64
+ encoded []byte
+ }
+ for i, tc := range []testcase{
+ {
+ []uint64{0, 1, 10, 100, 1000, 10000},
+ []int64{1000, 0, 1000},
+ []byte{10, 8, 0, 1, 10, 100, 232, 7, 144, 78, 18, 5, 232, 7, 0, 232, 7},
+ },
+ {
+ []uint64{10000},
+ nil,
+ []byte{8, 144, 78},
+ },
+ {
+ nil,
+ []int64{-10000},
+ []byte{16, 240, 177, 255, 255, 255, 255, 255, 255, 255, 1},
+ },
+ } {
+ source := &packedInts{tc.uint64s, tc.int64s}
+ if got, want := marshal(source), tc.encoded; !reflect.DeepEqual(got, want) {
+ t.Errorf("failed encode %d, got %v, want %v", i, got, want)
+ }
+
+ dest := new(packedInts)
+ if err := unmarshal(tc.encoded, dest); err != nil {
+ t.Errorf("failed decode %d: %v", i, err)
+ continue
+ }
+ if got, want := dest.uint64s, tc.uint64s; !reflect.DeepEqual(got, want) {
+ t.Errorf("failed decode uint64s %d, got %v, want %v", i, got, want)
+ }
+ if got, want := dest.int64s, tc.int64s; !reflect.DeepEqual(got, want) {
+ t.Errorf("failed decode int64s %d, got %v, want %v", i, got, want)
+ }
+ }
+}
+
+type packedInts struct {
+ uint64s []uint64
+ int64s []int64
+}
+
+func (u *packedInts) decoder() []decoder {
+ return []decoder{
+ nil,
+ func(b *buffer, m message) error { return decodeUint64s(b, &m.(*packedInts).uint64s) },
+ func(b *buffer, m message) error { return decodeInt64s(b, &m.(*packedInts).int64s) },
+ }
+}
+
+func (u *packedInts) encode(b *buffer) {
+ encodeUint64s(b, 1, u.uint64s)
+ encodeInt64s(b, 2, u.int64s)
+}
diff --git a/libgo/go/runtime/pprof/internal/profile/prune.go b/libgo/go/runtime/pprof/internal/profile/prune.go
new file mode 100644
index 0000000..1924fad
--- /dev/null
+++ b/libgo/go/runtime/pprof/internal/profile/prune.go
@@ -0,0 +1,97 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Implements methods to remove frames from profiles.
+
+package profile
+
+import (
+ "fmt"
+ "regexp"
+)
+
+// Prune removes all nodes beneath a node matching dropRx, and not
+// matching keepRx. If the root node of a Sample matches, the sample
+// will have an empty stack.
+func (p *Profile) Prune(dropRx, keepRx *regexp.Regexp) {
+ prune := make(map[uint64]bool)
+ pruneBeneath := make(map[uint64]bool)
+
+ for _, loc := range p.Location {
+ var i int
+ for i = len(loc.Line) - 1; i >= 0; i-- {
+ if fn := loc.Line[i].Function; fn != nil && fn.Name != "" {
+ funcName := fn.Name
+ // Account for leading '.' on the PPC ELF v1 ABI.
+ if funcName[0] == '.' {
+ funcName = funcName[1:]
+ }
+ if dropRx.MatchString(funcName) {
+ if keepRx == nil || !keepRx.MatchString(funcName) {
+ break
+ }
+ }
+ }
+ }
+
+ if i >= 0 {
+ // Found matching entry to prune.
+ pruneBeneath[loc.ID] = true
+
+ // Remove the matching location.
+ if i == len(loc.Line)-1 {
+ // Matched the top entry: prune the whole location.
+ prune[loc.ID] = true
+ } else {
+ loc.Line = loc.Line[i+1:]
+ }
+ }
+ }
+
+ // Prune locs from each Sample
+ for _, sample := range p.Sample {
+ // Scan from the root to the leaves to find the prune location.
+ // Do not prune frames before the first user frame, to avoid
+ // pruning everything.
+ foundUser := false
+ for i := len(sample.Location) - 1; i >= 0; i-- {
+ id := sample.Location[i].ID
+ if !prune[id] && !pruneBeneath[id] {
+ foundUser = true
+ continue
+ }
+ if !foundUser {
+ continue
+ }
+ if prune[id] {
+ sample.Location = sample.Location[i+1:]
+ break
+ }
+ if pruneBeneath[id] {
+ sample.Location = sample.Location[i:]
+ break
+ }
+ }
+ }
+}
+
+// RemoveUninteresting prunes and elides profiles using built-in
+// tables of uninteresting function names.
+func (p *Profile) RemoveUninteresting() error {
+ var keep, drop *regexp.Regexp
+ var err error
+
+ if p.DropFrames != "" {
+ if drop, err = regexp.Compile("^(" + p.DropFrames + ")$"); err != nil {
+ return fmt.Errorf("failed to compile regexp %s: %v", p.DropFrames, err)
+ }
+ if p.KeepFrames != "" {
+ if keep, err = regexp.Compile("^(" + p.KeepFrames + ")$"); err != nil {
+ return fmt.Errorf("failed to compile regexp %s: %v", p.KeepFrames, err)
+ }
+ }
+ p.Prune(drop, keep)
+ }
+ return nil
+}
diff --git a/libgo/go/runtime/pprof/internal/protopprof/protomemprofile.go b/libgo/go/runtime/pprof/internal/protopprof/protomemprofile.go
deleted file mode 100644
index c2ab5b5..0000000
--- a/libgo/go/runtime/pprof/internal/protopprof/protomemprofile.go
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package protopprof
-
-import (
- "internal/pprof/profile"
- "math"
- "runtime"
- "time"
-)
-
-// EncodeMemProfile converts MemProfileRecords to a Profile.
-func EncodeMemProfile(mr []runtime.MemProfileRecord, rate int64, t time.Time) *profile.Profile {
- p := &profile.Profile{
- Period: rate,
- PeriodType: &profile.ValueType{Type: "space", Unit: "bytes"},
- SampleType: []*profile.ValueType{
- {Type: "alloc_objects", Unit: "count"},
- {Type: "alloc_space", Unit: "bytes"},
- {Type: "inuse_objects", Unit: "count"},
- {Type: "inuse_space", Unit: "bytes"},
- },
- TimeNanos: int64(t.UnixNano()),
- }
-
- locs := make(map[uintptr]*profile.Location)
- for _, r := range mr {
- stack := r.Stack()
- sloc := make([]*profile.Location, len(stack))
- for i, addr := range stack {
- loc := locs[addr]
- if loc == nil {
- loc = &profile.Location{
- ID: uint64(len(p.Location) + 1),
- Address: uint64(addr),
- }
- locs[addr] = loc
- p.Location = append(p.Location, loc)
- }
- sloc[i] = loc
- }
-
- ao, ab := scaleHeapSample(r.AllocObjects, r.AllocBytes, rate)
- uo, ub := scaleHeapSample(r.InUseObjects(), r.InUseBytes(), rate)
-
- p.Sample = append(p.Sample, &profile.Sample{
- Value: []int64{ao, ab, uo, ub},
- Location: sloc,
- })
- }
- if runtime.GOOS == "linux" {
- addMappings(p)
- }
- return p
-}
-
-// scaleHeapSample adjusts the data from a heap Sample to
-// account for its probability of appearing in the collected
-// data. heap profiles are a sampling of the memory allocations
-// requests in a program. We estimate the unsampled value by dividing
-// each collected sample by its probability of appearing in the
-// profile. heap profiles rely on a poisson process to determine
-// which samples to collect, based on the desired average collection
-// rate R. The probability of a sample of size S to appear in that
-// profile is 1-exp(-S/R).
-func scaleHeapSample(count, size, rate int64) (int64, int64) {
- if count == 0 || size == 0 {
- return 0, 0
- }
-
- if rate <= 1 {
- // if rate==1 all samples were collected so no adjustment is needed.
- // if rate<1 treat as unknown and skip scaling.
- return count, size
- }
-
- avgSize := float64(size) / float64(count)
- scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
-
- return int64(float64(count) * scale), int64(float64(size) * scale)
-}
diff --git a/libgo/go/runtime/pprof/internal/protopprof/protomemprofile_test.go b/libgo/go/runtime/pprof/internal/protopprof/protomemprofile_test.go
deleted file mode 100644
index a10fe77..0000000
--- a/libgo/go/runtime/pprof/internal/protopprof/protomemprofile_test.go
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package protopprof
-
-import (
- "bytes"
- "internal/pprof/profile"
- "io/ioutil"
- "reflect"
- "runtime"
- "testing"
- "time"
-)
-
-// TestSampledHeapAllocProfile tests encoding of a memory profile from
-// runtime.MemProfileRecord data.
-func TestSampledHeapAllocProfile(t *testing.T) {
- if runtime.GOOS != "linux" {
- t.Skip("Test requires a system with /proc/self/maps")
- }
-
- // Figure out two addresses from /proc/self/maps.
- mmap, err := ioutil.ReadFile("/proc/self/maps")
- if err != nil {
- t.Fatal("Cannot read /proc/self/maps")
- }
- rd := bytes.NewReader(mmap)
- mprof := &profile.Profile{}
- if err = mprof.ParseMemoryMap(rd); err != nil {
- t.Fatalf("Cannot parse /proc/self/maps")
- }
- if len(mprof.Mapping) < 2 {
- // It is possible for a binary to only have 1 executable
- // region of memory.
- t.Skipf("need 2 or more mappings, got %v", len(mprof.Mapping))
- }
- address1 := mprof.Mapping[0].Start
- address2 := mprof.Mapping[1].Start
-
- var buf bytes.Buffer
-
- rec, rate := testMemRecords(address1, address2)
- p := EncodeMemProfile(rec, rate, time.Now())
- if err := p.Write(&buf); err != nil {
- t.Fatalf("Failed to write profile: %v", err)
- }
-
- p, err = profile.Parse(&buf)
- if err != nil {
- t.Fatalf("Could not parse Profile profile: %v", err)
- }
-
- // Expected PeriodType, SampleType and Sample.
- expectedPeriodType := &profile.ValueType{Type: "space", Unit: "bytes"}
- expectedSampleType := []*profile.ValueType{
- {Type: "alloc_objects", Unit: "count"},
- {Type: "alloc_space", Unit: "bytes"},
- {Type: "inuse_objects", Unit: "count"},
- {Type: "inuse_space", Unit: "bytes"},
- }
- // Expected samples, with values unsampled according to the profiling rate.
- expectedSample := []*profile.Sample{
- {Value: []int64{2050, 2099200, 1537, 1574400}, Location: []*profile.Location{
- {ID: 1, Mapping: mprof.Mapping[0], Address: address1},
- {ID: 2, Mapping: mprof.Mapping[1], Address: address2},
- }},
- {Value: []int64{1, 829411, 1, 829411}, Location: []*profile.Location{
- {ID: 3, Mapping: mprof.Mapping[1], Address: address2 + 1},
- {ID: 4, Mapping: mprof.Mapping[1], Address: address2 + 2},
- }},
- {Value: []int64{1, 829411, 0, 0}, Location: []*profile.Location{
- {ID: 5, Mapping: mprof.Mapping[0], Address: address1 + 1},
- {ID: 6, Mapping: mprof.Mapping[0], Address: address1 + 2},
- {ID: 7, Mapping: mprof.Mapping[1], Address: address2 + 3},
- }},
- }
-
- if p.Period != 512*1024 {
- t.Fatalf("Sampling periods do not match")
- }
- if !reflect.DeepEqual(p.PeriodType, expectedPeriodType) {
- t.Fatalf("Period types do not match")
- }
- if !reflect.DeepEqual(p.SampleType, expectedSampleType) {
- t.Fatalf("Sample types do not match")
- }
- if !reflect.DeepEqual(p.Sample, expectedSample) {
- t.Fatalf("Samples do not match: Expected: %v, Got:%v", getSampleAsString(expectedSample),
- getSampleAsString(p.Sample))
- }
-}
-
-func testMemRecords(a1, a2 uint64) ([]runtime.MemProfileRecord, int64) {
- addr1, addr2 := uintptr(a1), uintptr(a2)
- rate := int64(512 * 1024)
- rec := []runtime.MemProfileRecord{
- {4096, 1024, 4, 1, [32]uintptr{addr1, addr2}},
- {512 * 1024, 0, 1, 0, [32]uintptr{addr2 + 1, addr2 + 2}},
- {512 * 1024, 512 * 1024, 1, 1, [32]uintptr{addr1 + 1, addr1 + 2, addr2 + 3}},
- }
- return rec, rate
-}
diff --git a/libgo/go/runtime/pprof/internal/protopprof/protopprof.go b/libgo/go/runtime/pprof/internal/protopprof/protopprof.go
deleted file mode 100644
index 5d269c4..0000000
--- a/libgo/go/runtime/pprof/internal/protopprof/protopprof.go
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package protopprof converts the runtime's raw profile logs
-// to Profile structs containing a representation of the pprof
-// protocol buffer profile format.
-package protopprof
-
-import (
- "fmt"
- "os"
- "runtime"
- "time"
- "unsafe"
-
- "internal/pprof/profile"
-)
-
-// TranslateCPUProfile parses binary CPU profiling stack trace data
-// generated by runtime.CPUProfile() into a profile struct.
-func TranslateCPUProfile(b []byte, startTime time.Time) (*profile.Profile, error) {
- const wordSize = unsafe.Sizeof(uintptr(0))
- const minRawProfile = 5 * wordSize // Need a minimum of 5 words.
- if uintptr(len(b)) < minRawProfile {
- return nil, fmt.Errorf("truncated profile")
- }
- n := int(uintptr(len(b)) / wordSize)
- data := ((*[1 << 28]uintptr)(unsafe.Pointer(&b[0])))[:n:n]
- period := data[3]
- data = data[5:] // skip header
-
- // profile initialization taken from pprof tool
- p := &profile.Profile{
- Period: int64(period) * 1000,
- PeriodType: &profile.ValueType{Type: "cpu", Unit: "nanoseconds"},
- SampleType: []*profile.ValueType{
- {Type: "samples", Unit: "count"},
- {Type: "cpu", Unit: "nanoseconds"},
- },
- TimeNanos: int64(startTime.UnixNano()),
- DurationNanos: time.Since(startTime).Nanoseconds(),
- }
- // Parse CPU samples from the profile.
- locs := make(map[uint64]*profile.Location)
- for len(b) > 0 {
- if len(data) < 2 || uintptr(len(data)) < 2+data[1] {
- return nil, fmt.Errorf("truncated profile")
- }
- count := data[0]
- nstk := data[1]
- if uintptr(len(data)) < 2+nstk {
- return nil, fmt.Errorf("truncated profile")
- }
- stk := data[2 : 2+nstk]
- data = data[2+nstk:]
-
- if count == 0 && nstk == 1 && stk[0] == 0 {
- // end of data marker
- break
- }
-
- sloc := make([]*profile.Location, len(stk))
- for i, addr := range stk {
- addr := uint64(addr)
- // Addresses from stack traces point to the next instruction after
- // each call. Adjust by -1 to land somewhere on the actual call
- // (except for the leaf, which is not a call).
- if i > 0 {
- addr--
- }
- loc := locs[addr]
- if loc == nil {
- loc = &profile.Location{
- ID: uint64(len(p.Location) + 1),
- Address: addr,
- }
- locs[addr] = loc
- p.Location = append(p.Location, loc)
- }
- sloc[i] = loc
- }
- p.Sample = append(p.Sample, &profile.Sample{
- Value: []int64{int64(count), int64(count) * int64(p.Period)},
- Location: sloc,
- })
- }
-
- if runtime.GOOS == "linux" {
- if err := addMappings(p); err != nil {
- return nil, err
- }
- }
- return p, nil
-}
-
-func addMappings(p *profile.Profile) error {
- // Parse memory map from /proc/self/maps
- f, err := os.Open("/proc/self/maps")
- if err != nil {
- return err
- }
- defer f.Close()
- return p.ParseMemoryMap(f)
-}
diff --git a/libgo/go/runtime/pprof/internal/protopprof/protopprof_test.go b/libgo/go/runtime/pprof/internal/protopprof/protopprof_test.go
deleted file mode 100644
index f1937b5..0000000
--- a/libgo/go/runtime/pprof/internal/protopprof/protopprof_test.go
+++ /dev/null
@@ -1,171 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package protopprof
-
-import (
- "bytes"
- "fmt"
- "internal/pprof/profile"
- "io/ioutil"
- "reflect"
- "runtime"
- "testing"
- "time"
- "unsafe"
-)
-
-// Helper function to initialize empty cpu profile with sampling period provided.
-func createEmptyProfileWithPeriod(t *testing.T, periodMs uint64) bytes.Buffer {
- // Mock the sample header produced by cpu profiler. Write a sample
- // period of 2000 microseconds, followed by no samples.
- buf := new(bytes.Buffer)
- // Profile header is as follows:
- // The first, third and fifth words are 0. The second word is 3.
- // The fourth word is the period.
- // EOD marker:
- // The sixth word -- count is initialized to 0 above.
- // The code below sets the seventh word -- nstk to 1
- // The eighth word -- addr is initialized to 0 above.
- words := []int{0, 3, 0, int(periodMs), 0, 0, 1, 0}
- n := int(unsafe.Sizeof(0)) * len(words)
- data := ((*[1 << 29]byte)(unsafe.Pointer(&words[0])))[:n:n]
- if _, err := buf.Write(data); err != nil {
- t.Fatalf("createEmptyProfileWithPeriod failed: %v", err)
- }
- return *buf
-}
-
-// Helper function to initialize cpu profile with two sample values.
-func createProfileWithTwoSamples(t *testing.T, periodMs uintptr, count1 uintptr, count2 uintptr,
- address1 uintptr, address2 uintptr) bytes.Buffer {
- // Mock the sample header produced by cpu profiler. Write a sample
- // period of 2000 microseconds, followed by no samples.
- buf := new(bytes.Buffer)
- words := []uintptr{0, 3, 0, uintptr(periodMs), 0, uintptr(count1), 2,
- uintptr(address1), uintptr(address1 + 2),
- uintptr(count2), 2, uintptr(address2), uintptr(address2 + 2),
- 0, 1, 0}
- for _, n := range words {
- var err error
- switch unsafe.Sizeof(int(0)) {
- case 8:
- _, err = buf.Write((*[8]byte)(unsafe.Pointer(&n))[:8:8])
- case 4:
- _, err = buf.Write((*[4]byte)(unsafe.Pointer(&n))[:4:4])
- }
- if err != nil {
- t.Fatalf("createProfileWithTwoSamples failed: %v", err)
- }
- }
- return *buf
-}
-
-// Tests TranslateCPUProfile parses correct sampling period in an otherwise empty cpu profile.
-func TestTranlateCPUProfileSamplingPeriod(t *testing.T) {
- // A test server with mock cpu profile data.
- var buf bytes.Buffer
-
- startTime := time.Now()
- b := createEmptyProfileWithPeriod(t, 2000)
- p, err := TranslateCPUProfile(b.Bytes(), startTime)
- if err != nil {
- t.Fatalf("translate failed: %v", err)
- }
- if err := p.Write(&buf); err != nil {
- t.Fatalf("write failed: %v", err)
- }
-
- p, err = profile.Parse(&buf)
- if err != nil {
- t.Fatalf("Could not parse Profile profile: %v", err)
- }
-
- // Expected PeriodType and SampleType.
- expectedPeriodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
- expectedSampleType := []*profile.ValueType{
- {Type: "samples", Unit: "count"},
- {Type: "cpu", Unit: "nanoseconds"},
- }
- if p.Period != 2000*1000 || !reflect.DeepEqual(p.PeriodType, expectedPeriodType) ||
- !reflect.DeepEqual(p.SampleType, expectedSampleType) || p.Sample != nil {
- t.Fatalf("Unexpected Profile fields")
- }
-}
-
-func getSampleAsString(sample []*profile.Sample) string {
- var str string
- for _, x := range sample {
- for _, y := range x.Location {
- if y.Mapping != nil {
- str += fmt.Sprintf("Mapping:%v\n", *y.Mapping)
- }
- str += fmt.Sprintf("Location:%v\n", y)
- }
- str += fmt.Sprintf("Sample:%v\n", *x)
- }
- return str
-}
-
-// Tests TranslateCPUProfile parses a cpu profile with sample values present.
-func TestTranslateCPUProfileWithSamples(t *testing.T) {
- if runtime.GOOS != "linux" {
- t.Skip("test requires a system with /proc/self/maps")
- }
- // Figure out two addresses from /proc/self/maps.
- mmap, err := ioutil.ReadFile("/proc/self/maps")
- if err != nil {
- t.Fatal("Cannot read /proc/self/maps")
- }
- rd := bytes.NewReader(mmap)
- mprof := &profile.Profile{}
- if err = mprof.ParseMemoryMap(rd); err != nil {
- t.Fatalf("Cannot parse /proc/self/maps")
- }
- if len(mprof.Mapping) < 2 {
- // It is possible for a binary to only have 1 executable
- // region of memory.
- t.Skipf("need 2 or more mappings, got %v", len(mprof.Mapping))
- }
- address1 := mprof.Mapping[0].Start
- address2 := mprof.Mapping[1].Start
- // A test server with mock cpu profile data.
-
- startTime := time.Now()
- b := createProfileWithTwoSamples(t, 2000, 20, 40, uintptr(address1), uintptr(address2))
- p, err := TranslateCPUProfile(b.Bytes(), startTime)
-
- if err != nil {
- t.Fatalf("Could not parse Profile profile: %v", err)
- }
- // Expected PeriodType, SampleType and Sample.
- expectedPeriodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
- expectedSampleType := []*profile.ValueType{
- {Type: "samples", Unit: "count"},
- {Type: "cpu", Unit: "nanoseconds"},
- }
- expectedSample := []*profile.Sample{
- {Value: []int64{20, 20 * 2000 * 1000}, Location: []*profile.Location{
- {ID: 1, Mapping: mprof.Mapping[0], Address: address1},
- {ID: 2, Mapping: mprof.Mapping[0], Address: address1 + 1},
- }},
- {Value: []int64{40, 40 * 2000 * 1000}, Location: []*profile.Location{
- {ID: 3, Mapping: mprof.Mapping[1], Address: address2},
- {ID: 4, Mapping: mprof.Mapping[1], Address: address2 + 1},
- }},
- }
- if p.Period != 2000*1000 {
- t.Fatalf("Sampling periods do not match")
- }
- if !reflect.DeepEqual(p.PeriodType, expectedPeriodType) {
- t.Fatalf("Period types do not match")
- }
- if !reflect.DeepEqual(p.SampleType, expectedSampleType) {
- t.Fatalf("Sample types do not match")
- }
- if !reflect.DeepEqual(p.Sample, expectedSample) {
- t.Fatalf("Samples do not match: Expected: %v, Got:%v", getSampleAsString(expectedSample),
- getSampleAsString(p.Sample))
- }
-}
diff --git a/libgo/go/runtime/pprof/label.go b/libgo/go/runtime/pprof/label.go
new file mode 100644
index 0000000..35647ee
--- /dev/null
+++ b/libgo/go/runtime/pprof/label.go
@@ -0,0 +1,85 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "context"
+)
+
+type label struct {
+ key string
+ value string
+}
+
+// LabelSet is a set of labels.
+type LabelSet struct {
+ list []label
+}
+
+// labelContextKey is the type of contextKeys used for profiler labels.
+type labelContextKey struct{}
+
+func labelValue(ctx context.Context) labelMap {
+ labels, _ := ctx.Value(labelContextKey{}).(*labelMap)
+ if labels == nil {
+ return labelMap(nil)
+ }
+ return *labels
+}
+
+// labelMap is the representation of the label set held in the context type.
+// This is an initial implementation, but it will be replaced with something
+// that admits incremental immutable modification more efficiently.
+type labelMap map[string]string
+
+// WithLabels returns a new context.Context with the given labels added.
+// A label overwrites a prior label with the same key.
+func WithLabels(ctx context.Context, labels LabelSet) context.Context {
+ childLabels := make(labelMap)
+ parentLabels := labelValue(ctx)
+ // TODO(matloob): replace the map implementation with something
+ // more efficient so creating a child context WithLabels doesn't need
+ // to clone the map.
+ for k, v := range parentLabels {
+ childLabels[k] = v
+ }
+ for _, label := range labels.list {
+ childLabels[label.key] = label.value
+ }
+ return context.WithValue(ctx, labelContextKey{}, &childLabels)
+}
+
+// Labels takes an even number of strings representing key-value pairs
+// and makes a LabelSet containing them.
+// A label overwrites a prior label with the same key.
+func Labels(args ...string) LabelSet {
+ if len(args)%2 != 0 {
+ panic("uneven number of arguments to pprof.Labels")
+ }
+ labels := LabelSet{}
+ for i := 0; i+1 < len(args); i += 2 {
+ labels.list = append(labels.list, label{key: args[i], value: args[i+1]})
+ }
+ return labels
+}
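+// An illustrative use (the label keys and values are made up):
+//
+//	ctx = WithLabels(ctx, Labels("worker", "42", "request", "index"))
+//	if v, ok := Label(ctx, "worker"); ok {
+//		// v == "42"
+//	}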
+
+// Label returns the value of the label with the given key on ctx, and a boolean indicating
+// whether that label exists.
+func Label(ctx context.Context, key string) (string, bool) {
+ ctxLabels := labelValue(ctx)
+ v, ok := ctxLabels[key]
+ return v, ok
+}
+
+// ForLabels invokes f with each label set on the context.
+// The function f should return true to continue iteration or false to stop iteration early.
+func ForLabels(ctx context.Context, f func(key, value string) bool) {
+ ctxLabels := labelValue(ctx)
+ for k, v := range ctxLabels {
+ if !f(k, v) {
+ break
+ }
+ }
+}
diff --git a/libgo/go/runtime/pprof/label_test.go b/libgo/go/runtime/pprof/label_test.go
new file mode 100644
index 0000000..240445f
--- /dev/null
+++ b/libgo/go/runtime/pprof/label_test.go
@@ -0,0 +1,82 @@
+package pprof
+
+import (
+ "context"
+ "reflect"
+ "sort"
+ "testing"
+)
+
+func labelsSorted(ctx context.Context) []label {
+ ls := []label{}
+ ForLabels(ctx, func(key, value string) bool {
+ ls = append(ls, label{key, value})
+ return true
+ })
+ sort.Sort(labelSorter(ls))
+ return ls
+}
+
+type labelSorter []label
+
+func (s labelSorter) Len() int { return len(s) }
+func (s labelSorter) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+func (s labelSorter) Less(i, j int) bool { return s[i].key < s[j].key }
+
+func TestContextLabels(t *testing.T) {
+ // Background context starts with no labels.
+ ctx := context.Background()
+ labels := labelsSorted(ctx)
+ if len(labels) != 0 {
+ t.Errorf("labels on background context: want [], got %v ", labels)
+ }
+
+ // Add a single label.
+ ctx = WithLabels(ctx, Labels("key", "value"))
+ // Retrieve it with Label.
+ v, ok := Label(ctx, "key")
+ if !ok || v != "value" {
+ t.Errorf(`Label(ctx, "key"): got %v, %v; want "value", ok`, v, ok)
+ }
+ gotLabels := labelsSorted(ctx)
+ wantLabels := []label{{"key", "value"}}
+ if !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels)
+ }
+
+ // Add a label with a different key.
+ ctx = WithLabels(ctx, Labels("key2", "value2"))
+ v, ok = Label(ctx, "key2")
+ if !ok || v != "value2" {
+ t.Errorf(`Label(ctx, "key2"): got %v, %v; want "value2", ok`, v, ok)
+ }
+ gotLabels = labelsSorted(ctx)
+ wantLabels = []label{{"key", "value"}, {"key2", "value2"}}
+ if !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels)
+ }
+
+ // Add label with first key to test label replacement.
+ ctx = WithLabels(ctx, Labels("key", "value3"))
+ v, ok = Label(ctx, "key")
+ if !ok || v != "value3" {
+ t.Errorf(`Label(ctx, "key3"): got %v, %v; want "value3", ok`, v, ok)
+ }
+ gotLabels = labelsSorted(ctx)
+ wantLabels = []label{{"key", "value3"}, {"key2", "value2"}}
+ if !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels)
+ }
+
+ // Labels called with two labels with the same key should pick the second.
+ ctx = WithLabels(ctx, Labels("key4", "value4a", "key4", "value4b"))
+ v, ok = Label(ctx, "key4")
+ if !ok || v != "value4b" {
+ t.Errorf(`Label(ctx, "key4"): got %v, %v; want "value4b", ok`, v, ok)
+ }
+ gotLabels = labelsSorted(ctx)
+ wantLabels = []label{{"key", "value3"}, {"key2", "value2"}, {"key4", "value4b"}}
+ if !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels)
+ }
+}
diff --git a/libgo/go/runtime/pprof/map.go b/libgo/go/runtime/pprof/map.go
new file mode 100644
index 0000000..a271ad0
--- /dev/null
+++ b/libgo/go/runtime/pprof/map.go
@@ -0,0 +1,89 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import "unsafe"
+
+// A profMap is a map from (stack, tag) to profMapEntry.
+// It grows without bound, but that's assumed to be OK.
+type profMap struct {
+ hash map[uintptr]*profMapEntry
+ all *profMapEntry
+ last *profMapEntry
+ free []profMapEntry
+ freeStk []uintptr
+}
+
+// A profMapEntry is a single entry in the profMap.
+type profMapEntry struct {
+ nextHash *profMapEntry // next in hash list
+ nextAll *profMapEntry // next in list of all entries
+ stk []uintptr
+ tag unsafe.Pointer
+ count int64
+}
+
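+// lookup returns the entry for the given (stack, tag) pair, creating it if
+// necessary. Hash chains are maintained move-to-front so hot stacks are found
+// quickly, and new entries are carved out of the pre-allocated free and
+// freeStk blocks to limit allocation while profiling.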
+func (m *profMap) lookup(stk []uint64, tag unsafe.Pointer) *profMapEntry {
+ // Compute hash of (stk, tag).
+ h := uintptr(0)
+ for _, x := range stk {
+ h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1)))
+ h += uintptr(x) * 41
+ }
+ h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1)))
+ h += uintptr(tag) * 41
+
+ // Find entry if present.
+ var last *profMapEntry
+Search:
+ for e := m.hash[h]; e != nil; last, e = e, e.nextHash {
+ if len(e.stk) != len(stk) || e.tag != tag {
+ continue
+ }
+ for j := range stk {
+ if e.stk[j] != uintptr(stk[j]) {
+ continue Search
+ }
+ }
+ // Move to front.
+ if last != nil {
+ last.nextHash = e.nextHash
+ e.nextHash = m.hash[h]
+ m.hash[h] = e
+ }
+ return e
+ }
+
+ // Add new entry.
+ if len(m.free) < 1 {
+ m.free = make([]profMapEntry, 128)
+ }
+ e := &m.free[0]
+ m.free = m.free[1:]
+ e.nextHash = m.hash[h]
+ e.tag = tag
+
+ if len(m.freeStk) < len(stk) {
+ m.freeStk = make([]uintptr, 1024)
+ }
+ e.stk = m.freeStk[:len(stk)]
+ m.freeStk = m.freeStk[len(stk):]
+
+ for j := range stk {
+ e.stk[j] = uintptr(stk[j])
+ }
+ if m.hash == nil {
+ m.hash = make(map[uintptr]*profMapEntry)
+ }
+ m.hash[h] = e
+ if m.all == nil {
+ m.all = e
+ m.last = e
+ } else {
+ m.last.nextAll = e
+ m.last = e
+ }
+ return e
+}
diff --git a/libgo/go/runtime/pprof/mprof_test.go b/libgo/go/runtime/pprof/mprof_test.go
index 5ebd46b..5d77a1d 100644
--- a/libgo/go/runtime/pprof/mprof_test.go
+++ b/libgo/go/runtime/pprof/mprof_test.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-package pprof_test
+package pprof
import (
"bytes"
@@ -10,7 +10,6 @@ import (
"reflect"
"regexp"
"runtime"
- . "runtime/pprof"
"testing"
"unsafe"
)
@@ -87,26 +86,26 @@ func TestMemoryProfiler(t *testing.T) {
tests := []string{
fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f x]+
-# 0x[0-9,a-f]+ pprof_test\.allocatePersistent1K\+0x[0-9,a-f]+ .*/mprof_test\.go:41
-# 0x[0-9,a-f]+ runtime_pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test\.go:75
+# 0x[0-9,a-f]+ pprof\.allocatePersistent1K\+0x[0-9,a-f]+ .*/mprof_test\.go:40
+# 0x[0-9,a-f]+ runtime_pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test\.go:74
`, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun),
fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f x]+
-# 0x[0-9,a-f]+ pprof_test\.allocateTransient1M\+0x[0-9,a-f]+ .*/mprof_test.go:22
-# 0x[0-9,a-f]+ runtime_pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:73
+# 0x[0-9,a-f]+ pprof\.allocateTransient1M\+0x[0-9,a-f]+ .*/mprof_test.go:21
+# 0x[0-9,a-f]+ runtime_pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:72
`, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun),
// This should start with "0: 0" but gccgo's imprecise
// GC means that sometimes the value is not collected.
fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @ 0x[0-9,a-f x]+
-# 0x[0-9,a-f]+ pprof_test\.allocateTransient2M\+0x[0-9,a-f]+ .*/mprof_test.go:28
-# 0x[0-9,a-f]+ runtime_pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:74
+# 0x[0-9,a-f]+ pprof\.allocateTransient2M\+0x[0-9,a-f]+ .*/mprof_test.go:27
+# 0x[0-9,a-f]+ runtime_pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:73
`, memoryProfilerRun, (2<<20)*memoryProfilerRun, memoryProfilerRun, (2<<20)*memoryProfilerRun),
// This should start with "0: 0" but gccgo's imprecise
// GC means that sometimes the value is not collected.
fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @( 0x[0-9,a-f]+)+
-# 0x[0-9,a-f]+ pprof_test\.allocateReflectTransient\+0x[0-9,a-f]+ .*/mprof_test.go:49
+# 0x[0-9,a-f]+ pprof\.allocateReflectTransient\+0x[0-9,a-f]+ .*/mprof_test.go:48
`, memoryProfilerRun, (2<<20)*memoryProfilerRun, memoryProfilerRun, (2<<20)*memoryProfilerRun),
}
diff --git a/libgo/go/runtime/pprof/pprof.go b/libgo/go/runtime/pprof/pprof.go
index 0db1ded..a57b69d 100644
--- a/libgo/go/runtime/pprof/pprof.go
+++ b/libgo/go/runtime/pprof/pprof.go
@@ -33,7 +33,9 @@
// }
// defer pprof.StopCPUProfile()
// }
-// ...
+//
+// // ... rest of the program ...
+//
// if *memprofile != "" {
// f, err := os.Create(*memprofile)
// if err != nil {
@@ -73,15 +75,14 @@ import (
"bufio"
"bytes"
"fmt"
- "internal/pprof/profile"
"io"
"runtime"
- "runtime/pprof/internal/protopprof"
"sort"
"strings"
"sync"
"text/tabwriter"
"time"
+ "unsafe"
)
// BUG(rsc): Profiles are only as good as the kernel support used to generate them.
@@ -183,6 +184,8 @@ func unlockProfiles() {
// If a profile with that name already exists, NewProfile panics.
// The convention is to use a 'import/path.' prefix to create
// separate name spaces for each package.
+// For compatibility with various tools that read pprof data,
+// profile names should not contain spaces.
func NewProfile(name string) *Profile {
lockProfiles()
defer unlockProfiles()
@@ -264,13 +267,18 @@ func (p *Profile) Add(value interface{}, skip int) {
stk := make([]uintptr, 32)
n := runtime.Callers(skip+1, stk[:])
+ stk = stk[:n]
+ if len(stk) == 0 {
+ // The value for skip is too large, and there's no stack trace to record.
+ stk = []uintptr{funcPC(lostProfileEvent) + 1}
+ }
p.mu.Lock()
defer p.mu.Unlock()
if p.m[value] != nil {
panic("pprof: Profile.Add of duplicate value")
}
- p.m[value] = stk[:n]
+ p.m[value] = stk
}
// Remove removes the execution stack associated with value from the profile.
@@ -303,8 +311,8 @@ func (p *Profile) WriteTo(w io.Writer, debug int) error {
}
// Obtain consistent snapshot under lock; then process without lock.
- all := make([][]uintptr, 0, len(p.m))
p.mu.Lock()
+ all := make([][]uintptr, 0, len(p.m))
for _, stk := range p.m {
all = append(all, stk)
}
@@ -380,35 +388,29 @@ func printCountProfile(w io.Writer, debug int, name string, p countProfile) erro
}
// Output profile in protobuf form.
- prof := &profile.Profile{
- PeriodType: &profile.ValueType{Type: name, Unit: "count"},
- Period: 1,
- Sample: make([]*profile.Sample, 0, len(keys)),
- SampleType: []*profile.ValueType{{Type: name, Unit: "count"}},
- }
- locMap := make(map[uintptr]*profile.Location)
+ b := newProfileBuilder(w)
+ b.pbValueType(tagProfile_PeriodType, name, "count")
+ b.pb.int64Opt(tagProfile_Period, 1)
+ b.pbValueType(tagProfile_SampleType, name, "count")
+
+ values := []int64{0}
+ var locs []uint64
for _, k := range keys {
- stk := p.Stack(index[k])
- c := count[k]
- locs := make([]*profile.Location, len(stk))
- for i, addr := range stk {
- loc := locMap[addr]
- if loc == nil {
- loc = &profile.Location{
- ID: uint64(len(locMap) + 1),
- Address: uint64(addr - 1),
- }
- prof.Location = append(prof.Location, loc)
- locMap[addr] = loc
+ values[0] = int64(count[k])
+ locs = locs[:0]
+ for _, addr := range p.Stack(index[k]) {
+ // For count profiles, all stack addresses are
+ // return PCs, which is what locForPC expects.
+ l := b.locForPC(addr)
+ if l == 0 { // runtime.goexit
+ continue
}
- locs[i] = loc
+ locs = append(locs, l)
}
- prof.Sample = append(prof.Sample, &profile.Sample{
- Location: locs,
- Value: []int64{int64(c)},
- })
+ b.pbSample(values, locs, nil)
}
- return prof.Write(w)
+ b.build()
+ return nil
}
// keysByCount sorts keys with higher counts first, breaking ties by key string order.
@@ -510,8 +512,7 @@ func writeHeap(w io.Writer, debug int) error {
}
if debug == 0 {
- pp := protopprof.EncodeMemProfile(p, int64(runtime.MemProfileRate), time.Now())
- return pp.Write(w)
+ return writeHeapProto(w, p, int64(runtime.MemProfileRate))
}
sort.Slice(p, func(i, j int) bool { return p[i].InUseBytes() > p[j].InUseBytes() })
@@ -576,8 +577,12 @@ func writeHeap(w io.Writer, debug int) error {
fmt.Fprintf(w, "# OtherSys = %d\n", s.OtherSys)
fmt.Fprintf(w, "# NextGC = %d\n", s.NextGC)
+ fmt.Fprintf(w, "# LastGC = %d\n", s.LastGC)
fmt.Fprintf(w, "# PauseNs = %d\n", s.PauseNs)
+ fmt.Fprintf(w, "# PauseEnd = %d\n", s.PauseEnd)
fmt.Fprintf(w, "# NumGC = %d\n", s.NumGC)
+ fmt.Fprintf(w, "# NumForcedGC = %d\n", s.NumForcedGC)
+ fmt.Fprintf(w, "# GCCPUFraction = %v\n", s.GCCPUFraction)
fmt.Fprintf(w, "# DebugGC = %v\n", s.DebugGC)
tw.Flush()
@@ -703,30 +708,32 @@ func StartCPUProfile(w io.Writer) error {
return nil
}
+// readProfile, provided by the runtime, returns the next chunk of
+// binary CPU profiling stack trace data, blocking until data is available.
+// If profiling is turned off and all the profile data accumulated while it was
+// on has been returned, readProfile returns eof=true.
+// The caller must save the returned data and tags before calling readProfile again.
+func readProfile() (data []uint64, tags []unsafe.Pointer, eof bool)
+
func profileWriter(w io.Writer) {
- startTime := time.Now()
- // This will buffer the entire profile into buf and then
- // translate it into a profile.Profile structure. This will
- // create two copies of all the data in the profile in memory.
- // TODO(matloob): Convert each chunk of the proto output and
- // stream it out instead of converting the entire profile.
- var buf bytes.Buffer
+ b := newProfileBuilder(w)
+ var err error
for {
- data := runtime.CPUProfile()
- if data == nil {
+ time.Sleep(100 * time.Millisecond)
+ data, tags, eof := readProfile()
+ if e := b.addCPUData(data, tags); e != nil && err == nil {
+ err = e
+ }
+ if eof {
break
}
- buf.Write(data)
}
-
- profile, err := protopprof.TranslateCPUProfile(buf.Bytes(), startTime)
if err != nil {
// The runtime should never produce an invalid or truncated profile.
// It drops records that can't fit into its log buffers.
- panic(fmt.Errorf("could not translate binary profile to proto format: %v", err))
+ panic("runtime/pprof: converting profile: " + err.Error())
}
-
- profile.Write(w)
+ b.build()
cpu.done <- true
}
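
The doc-comment hunk above keeps the familiar flag-driven pattern for collecting CPU and heap profiles. For context, here is a minimal self-contained sketch of that pattern; the flag names, output files, and the runtime.GC call before the heap write are conventional choices for illustration, not something this patch prescribes.

package main

import (
	"flag"
	"log"
	"os"
	"runtime"
	"runtime/pprof"
)

var (
	cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
	memprofile = flag.String("memprofile", "", "write heap profile to file")
)

func main() {
	flag.Parse()
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer pprof.StopCPUProfile()
	}

	// ... rest of the program ...

	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		runtime.GC() // get up-to-date allocation statistics
		if err := pprof.WriteHeapProfile(f); err != nil {
			log.Fatal(err)
		}
		f.Close()
	}
}
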
diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go
index 6034058..9e5e403 100644
--- a/libgo/go/runtime/pprof/pprof_test.go
+++ b/libgo/go/runtime/pprof/pprof_test.go
@@ -4,13 +4,12 @@
// +build !nacl
-package pprof_test
+package pprof
import (
"bytes"
- "compress/gzip"
+ "context"
"fmt"
- "internal/pprof/profile"
"internal/testenv"
"io"
"io/ioutil"
@@ -19,14 +18,15 @@ import (
"os/exec"
"regexp"
"runtime"
- . "runtime/pprof"
+ "runtime/pprof/internal/profile"
"strings"
"sync"
+ "sync/atomic"
"testing"
"time"
)
-func cpuHogger(f func(), dur time.Duration) {
+func cpuHogger(f func() int, dur time.Duration) {
// We only need to get one 100 Hz clock tick, so we've got
// a large safety buffer.
// But do at least 500 iterations (which should take about 100ms),
@@ -46,7 +46,7 @@ var (
// The actual CPU hogging function.
// Must not call other functions nor access heap/globals in the loop,
// otherwise under race detector the samples will be in the race runtime.
-func cpuHog1() {
+func cpuHog1() int {
foo := salt1
for i := 0; i < 1e5; i++ {
if foo > 0 {
@@ -55,10 +55,10 @@ func cpuHog1() {
foo *= foo + 1
}
}
- salt1 = foo
+ return foo
}
-func cpuHog2() {
+func cpuHog2() int {
foo := salt2
for i := 0; i < 1e5; i++ {
if foo > 0 {
@@ -67,18 +67,18 @@ func cpuHog2() {
foo *= foo + 2
}
}
- salt2 = foo
+ return foo
}
func TestCPUProfile(t *testing.T) {
- testCPUProfile(t, []string{"pprof_test.cpuHog1"}, func(dur time.Duration) {
+ testCPUProfile(t, []string{"pprof.cpuHog1"}, func(dur time.Duration) {
cpuHogger(cpuHog1, dur)
})
}
func TestCPUProfileMultithreaded(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
- testCPUProfile(t, []string{"pprof_test.cpuHog1", "pprof_test.cpuHog2"}, func(dur time.Duration) {
+ testCPUProfile(t, []string{"pprof.cpuHog1", "pprof.cpuHog2"}, func(dur time.Duration) {
c := make(chan int)
go func() {
cpuHogger(cpuHog1, dur)
@@ -89,18 +89,42 @@ func TestCPUProfileMultithreaded(t *testing.T) {
})
}
-func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []uintptr)) {
+func TestCPUProfileInlining(t *testing.T) {
+ testCPUProfile(t, []string{"pprof.inlinedCallee", "pprof.inlinedCaller"}, func(dur time.Duration) {
+ cpuHogger(inlinedCaller, dur)
+ })
+}
+
+func inlinedCaller() int {
+ inlinedCallee()
+ return 0
+}
+
+func inlinedCallee() {
+ // We could just use cpuHog1, but for loops prevent inlining
+ // right now. :(
+ foo := salt1
+ i := 0
+loop:
+ if foo > 0 {
+ foo *= foo
+ } else {
+ foo *= foo + 1
+ }
+ if i++; i < 1e5 {
+ goto loop
+ }
+ salt1 = foo
+}
+
+func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) {
p, err := profile.Parse(bytes.NewReader(valBytes))
if err != nil {
t.Fatal(err)
}
for _, sample := range p.Sample {
count := uintptr(sample.Value[0])
- stk := make([]uintptr, len(sample.Location))
- for i := range sample.Location {
- stk[i] = uintptr(sample.Location[i].Address)
- }
- f(count, stk)
+ f(count, sample.Location, sample.Label)
}
}
@@ -124,8 +148,7 @@ func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) {
const maxDuration = 5 * time.Second
// If we're running a long test, start with a long duration
- // because some of the tests (e.g., TestStackBarrierProfiling)
- // are trying to make sure something *doesn't* happen.
+ // for tests that try to make sure something *doesn't* happen.
duration := 5 * time.Second
if testing.Short() {
duration = 200 * time.Millisecond
@@ -169,32 +192,45 @@ func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) {
t.FailNow()
}
+func contains(slice []string, s string) bool {
+ for i := range slice {
+ if slice[i] == s {
+ return true
+ }
+ }
+ return false
+}
+
func profileOk(t *testing.T, need []string, prof bytes.Buffer, duration time.Duration) (ok bool) {
ok = true
// Check that profile is well formed and contains need.
have := make([]uintptr, len(need))
var samples uintptr
- parseProfile(t, prof.Bytes(), func(count uintptr, stk []uintptr) {
+ var buf bytes.Buffer
+ parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) {
+ fmt.Fprintf(&buf, "%d:", count)
+ fprintStack(&buf, stk)
samples += count
- for _, pc := range stk {
- f := runtime.FuncForPC(pc)
- if f == nil {
- continue
- }
- t.Log(f.Name(), count)
- for i, name := range need {
- if strings.Contains(f.Name(), name) {
- have[i] += count
+ for i, name := range need {
+ if semi := strings.Index(name, ";"); semi > -1 {
+ kv := strings.SplitN(name[semi+1:], "=", 2)
+ if len(kv) != 2 || !contains(labels[kv[0]], kv[1]) {
+ continue
}
+ name = name[:semi]
}
- if strings.Contains(f.Name(), "stackBarrier") {
- // The runtime should have unwound this.
- t.Fatalf("profile includes stackBarrier")
+ for _, loc := range stk {
+ for _, line := range loc.Line {
+ if strings.Contains(line.Function.Name, name) {
+ have[i] += count
+ }
+ }
}
}
+ fmt.Fprintf(&buf, "\n")
})
- t.Logf("total %d CPU profile samples collected", samples)
+ t.Logf("total %d CPU profile samples collected:\n%s", samples, buf.String())
if samples < 10 && runtime.GOOS == "windows" {
// On some windows machines we end up with
@@ -301,36 +337,43 @@ func TestGoroutineSwitch(t *testing.T) {
// Read profile to look for entries for runtime.gogo with an attempt at a traceback.
// The special entry
- parseProfile(t, prof.Bytes(), func(count uintptr, stk []uintptr) {
+ parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, _ map[string][]string) {
// An entry with two frames with 'System' in its top frame
// exists to record a PC without a traceback. Those are okay.
if len(stk) == 2 {
- f := runtime.FuncForPC(stk[1])
- if f != nil && (f.Name() == "runtime._System" || f.Name() == "runtime._ExternalCode" || f.Name() == "runtime._GC") {
+ name := stk[1].Line[0].Function.Name
+ if name == "runtime._System" || name == "runtime._ExternalCode" || name == "runtime._GC" {
return
}
}
// Otherwise, should not see runtime.gogo.
// The place we'd see it would be the inner most frame.
- f := runtime.FuncForPC(stk[0])
- if f != nil && f.Name() == "runtime.gogo" {
+ name := stk[0].Line[0].Function.Name
+ if name == "runtime.gogo" {
var buf bytes.Buffer
- for _, pc := range stk {
- f := runtime.FuncForPC(pc)
- if f == nil {
- fmt.Fprintf(&buf, "%#x ?:0\n", pc)
- } else {
- file, line := f.FileLine(pc)
- fmt.Fprintf(&buf, "%#x %s:%d\n", pc, file, line)
- }
- }
+ fprintStack(&buf, stk)
t.Fatalf("found profile entry for runtime.gogo:\n%s", buf.String())
}
})
}
}
+func fprintStack(w io.Writer, stk []*profile.Location) {
+ for _, loc := range stk {
+ fmt.Fprintf(w, " %#x", loc.Address)
+ fmt.Fprintf(w, " (")
+ for i, line := range loc.Line {
+ if i > 0 {
+ fmt.Fprintf(w, " ")
+ }
+ fmt.Fprintf(w, "%s:%d", line.Function.Name, line.Line)
+ }
+ fmt.Fprintf(w, ")")
+ }
+ fmt.Fprintf(w, "\n")
+}
+
// Test that profiling of division operations is okay, especially on ARM. See issue 6681.
func TestMathBigDivide(t *testing.T) {
testCPUProfile(t, nil, func(duration time.Duration) {
@@ -351,111 +394,6 @@ func TestMathBigDivide(t *testing.T) {
})
}
-func slurpString(r io.Reader) string {
- slurp, _ := ioutil.ReadAll(r)
- return string(slurp)
-}
-
-func getLinuxKernelConfig() string {
- if f, err := os.Open("/proc/config"); err == nil {
- defer f.Close()
- return slurpString(f)
- }
- if f, err := os.Open("/proc/config.gz"); err == nil {
- defer f.Close()
- r, err := gzip.NewReader(f)
- if err != nil {
- return ""
- }
- return slurpString(r)
- }
- if f, err := os.Open("/boot/config"); err == nil {
- defer f.Close()
- return slurpString(f)
- }
- uname, _ := exec.Command("uname", "-r").Output()
- if len(uname) > 0 {
- if f, err := os.Open("/boot/config-" + strings.TrimSpace(string(uname))); err == nil {
- defer f.Close()
- return slurpString(f)
- }
- }
- return ""
-}
-
-func haveLinuxHiresTimers() bool {
- config := getLinuxKernelConfig()
- return strings.Contains(config, "CONFIG_HIGH_RES_TIMERS=y")
-}
-
-func TestStackBarrierProfiling(t *testing.T) {
- if (runtime.GOOS == "linux" && runtime.GOARCH == "arm") ||
- runtime.GOOS == "openbsd" ||
- runtime.GOOS == "solaris" ||
- runtime.GOOS == "dragonfly" ||
- runtime.GOOS == "freebsd" {
- // This test currently triggers a large number of
- // usleep(100)s. These kernels/arches have poor
- // resolution timers, so this gives up a whole
- // scheduling quantum. On Linux and the BSDs (and
- // probably Solaris), profiling signals are only
- // generated when a process completes a whole
- // scheduling quantum, so this test often gets zero
- // profiling signals and fails.
- t.Skipf("low resolution timers inhibit profiling signals (golang.org/issue/13405)")
- return
- }
-
- if runtime.GOOS == "linux" && strings.HasPrefix(runtime.GOARCH, "mips") {
- if !haveLinuxHiresTimers() {
- t.Skipf("low resolution timers inhibit profiling signals (golang.org/issue/13405, golang.org/issue/17936)")
- }
- }
-
- if !strings.Contains(os.Getenv("GODEBUG"), "gcstackbarrierall=1") {
- // Re-execute this test with constant GC and stack
- // barriers at every frame.
- testenv.MustHaveExec(t)
- if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" {
- t.Skip("gcstackbarrierall doesn't work on ppc64")
- }
- args := []string{"-test.run=TestStackBarrierProfiling"}
- if testing.Short() {
- args = append(args, "-test.short")
- }
- cmd := exec.Command(os.Args[0], args...)
- cmd.Env = append([]string{"GODEBUG=gcstackbarrierall=1", "GOGC=1", "GOTRACEBACK=system"}, os.Environ()...)
- if out, err := cmd.CombinedOutput(); err != nil {
- t.Fatalf("subprocess failed with %v:\n%s", err, out)
- }
- return
- }
-
- testCPUProfile(t, nil, func(duration time.Duration) {
- // In long mode, we're likely to get one or two
- // samples in stackBarrier.
- t := time.After(duration)
- for {
- deepStack(1000)
- select {
- case <-t:
- return
- default:
- }
- }
- })
-}
-
-var x []byte
-
-func deepStack(depth int) int {
- if depth == 0 {
- return 0
- }
- x = make([]byte, 1024)
- return deepStack(depth-1) + 1
-}
-
// Operating systems that are expected to fail the tests. See issue 13841.
var badOS = map[string]bool{
"darwin": true,
@@ -474,46 +412,46 @@ func TestBlockProfile(t *testing.T) {
}
tests := [...]TestCase{
{"chan recv", blockChanRecv, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.chanrecv1\+0x[0-9,a-f]+ .*/src/runtime/chan.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockChanRecv\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockChanRecv\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"chan send", blockChanSend, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.chansend1\+0x[0-9,a-f]+ .*/src/runtime/chan.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockChanSend\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.chansend1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockChanSend\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"chan close", blockChanClose, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.chanrecv1\+0x[0-9,a-f]+ .*/src/runtime/chan.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockChanClose\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockChanClose\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"select recv async", blockSelectRecvAsync, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.selectgo\+0x[0-9,a-f]+ .*/src/runtime/select.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockSelectRecvAsync\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockSelectRecvAsync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"select send sync", blockSelectSendSync, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.selectgo\+0x[0-9,a-f]+ .*/src/runtime/select.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockSelectSendSync\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockSelectSendSync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"mutex", blockMutex, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ sync\.\(\*Mutex\)\.Lock\+0x[0-9,a-f]+ .*/src/sync/mutex\.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockMutex\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ sync\.\(\*Mutex\)\.Lock\+0x[0-9a-f]+ .*/src/sync/mutex\.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockMutex\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"cond", blockCond, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ sync\.\(\*Cond\)\.Wait\+0x[0-9,a-f]+ .*/src/sync/cond\.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockCond\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ sync\.\(\*Cond\)\.Wait\+0x[0-9a-f]+ .*/src/sync/cond\.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockCond\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
}
@@ -608,6 +546,10 @@ func blockMutex() {
time.Sleep(blockDelay)
mu.Unlock()
}()
+ // Note: Unlock releases mu before recording the mutex event,
+ // so it's theoretically possible for this to proceed and
+ // capture the profile before the event is recorded. As long
+ // as this is blocked before the unlock happens, it's okay.
mu.Lock()
}
@@ -656,7 +598,7 @@ func TestMutexProfile(t *testing.T) {
if ok, err := regexp.MatchString(r2, lines[3]); err != nil || !ok {
t.Errorf("%q didn't match %q", lines[3], r2)
}
- r3 := "^#.*pprof_test.\\$nested.*$"
+ r3 := "^#.*pprof.\\$nested.*$"
match := false
for _, i := range []int{5, 6} {
if ok, _ := regexp.MatchString(r3, lines[i]); ok {
@@ -678,22 +620,26 @@ func TestGoroutineCounts(t *testing.T) {
if runtime.Compiler == "gccgo" {
t.Skip("goroutine stacks not supported on gccgo")
}
- if runtime.GOOS == "openbsd" {
- testenv.SkipFlaky(t, 15156)
- }
+
+ // Setting GOMAXPROCS to 1 ensures we can force all goroutines to the
+ // desired blocking point.
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
+
c := make(chan int)
for i := 0; i < 100; i++ {
- if i%10 == 0 {
+ switch {
+ case i%10 == 0:
go func1(c)
- continue
- }
- if i%2 == 0 {
+ case i%2 == 0:
go func2(c)
- continue
+ default:
+ go func3(c)
+ }
+ // Let goroutines block on channel
+ for j := 0; j < 5; j++ {
+ runtime.Gosched()
}
- go func3(c)
}
- time.Sleep(10 * time.Millisecond) // let goroutines block on channel
var w bytes.Buffer
goroutineProf := Lookup("goroutine")
@@ -756,3 +702,81 @@ func containsCounts(prof *profile.Profile, counts []int64) bool {
}
return true
}
+
+// Issue 18836.
+func TestEmptyCallStack(t *testing.T) {
+ t.Parallel()
+ var buf bytes.Buffer
+ p := NewProfile("test18836")
+ p.Add("foo", 47674)
+ p.WriteTo(&buf, 1)
+ p.Remove("foo")
+ got := buf.String()
+ prefix := "test18836 profile: total 1\n"
+ if !strings.HasPrefix(got, prefix) {
+ t.Fatalf("got:\n\t%q\nwant prefix:\n\t%q\n", got, prefix)
+ }
+ lostevent := "lostProfileEvent"
+ if !strings.Contains(got, lostevent) {
+ t.Fatalf("got:\n\t%q\ndoes not contain:\n\t%q\n", got, lostevent)
+ }
+}
+
+func TestCPUProfileLabel(t *testing.T) {
+ testCPUProfile(t, []string{"pprof.cpuHogger;key=value"}, func(dur time.Duration) {
+ Do(context.Background(), Labels("key", "value"), func(context.Context) {
+ cpuHogger(cpuHog1, dur)
+ })
+ })
+}
+
+func TestLabelRace(t *testing.T) {
+ // Test the race detector annotations for synchronization
+ // between settings labels and consuming them from the
+ // profile.
+ testCPUProfile(t, []string{"pprof.cpuHogger;key=value"}, func(dur time.Duration) {
+ start := time.Now()
+ var wg sync.WaitGroup
+ for time.Since(start) < dur {
+ for i := 0; i < 10; i++ {
+ wg.Add(1)
+ go func() {
+ Do(context.Background(), Labels("key", "value"), func(context.Context) {
+ cpuHogger(cpuHog1, time.Millisecond)
+ })
+ wg.Done()
+ }()
+ }
+ wg.Wait()
+ }
+ })
+}
+
+// Check that there is no deadlock when the program receives SIGPROF while in
+// 64bit atomics' critical section. Used to happen on mips{,le}. See #20146.
+func TestAtomicLoadStore64(t *testing.T) {
+ f, err := ioutil.TempFile("", "profatomic")
+ if err != nil {
+ t.Fatalf("TempFile: %v", err)
+ }
+ defer os.Remove(f.Name())
+ defer f.Close()
+
+ if err := StartCPUProfile(f); err != nil {
+ t.Fatal(err)
+ }
+ defer StopCPUProfile()
+
+ var flag uint64
+ done := make(chan bool, 1)
+
+ go func() {
+ for atomic.LoadUint64(&flag) == 0 {
+ runtime.Gosched()
+ }
+ done <- true
+ }()
+ time.Sleep(50 * time.Millisecond)
+ atomic.StoreUint64(&flag, 1)
+ <-done
+}
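
The profileOk changes above introduce a small convention for test expectations: a `need` entry may be written as "function;key=value" to additionally require a matching pprof label on the sample (see TestCPUProfileLabel's "pprof.cpuHogger;key=value"). The following standalone sketch mirrors that split; splitNeed is a hypothetical helper for illustration, not a function from the patch.

package main

import (
	"fmt"
	"strings"
)

// splitNeed separates a test expectation of the form
// "funcName;key=value" into its function-name part and an optional
// label key/value pair, mirroring the logic added to profileOk.
func splitNeed(entry string) (name, key, value string, hasLabel bool) {
	semi := strings.Index(entry, ";")
	if semi < 0 {
		return entry, "", "", false
	}
	kv := strings.SplitN(entry[semi+1:], "=", 2)
	if len(kv) != 2 {
		return entry[:semi], "", "", false
	}
	return entry[:semi], kv[0], kv[1], true
}

func main() {
	name, k, v, ok := splitNeed("pprof.cpuHogger;key=value")
	fmt.Println(name, k, v, ok) // pprof.cpuHogger key value true
}
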
diff --git a/libgo/go/runtime/pprof/proto.go b/libgo/go/runtime/pprof/proto.go
new file mode 100644
index 0000000..5e1d71c
--- /dev/null
+++ b/libgo/go/runtime/pprof/proto.go
@@ -0,0 +1,515 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "bytes"
+ "compress/gzip"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "runtime"
+ "sort"
+ "strconv"
+ "time"
+ "unsafe"
+)
+
+// lostProfileEvent is the function to which lost profiling
+// events are attributed.
+// (The name shows up in the pprof graphs.)
+func lostProfileEvent() { lostProfileEvent() }
+
+// funcPC returns the PC for the func value f.
+func funcPC(f interface{}) uintptr {
+ type iface struct {
+ tab unsafe.Pointer
+ data unsafe.Pointer
+ }
+ i := (*iface)(unsafe.Pointer(&f))
+ return **(**uintptr)(i.data)
+}
+
+// A profileBuilder writes a profile incrementally from a
+// stream of profile samples delivered by the runtime.
+type profileBuilder struct {
+ start time.Time
+ end time.Time
+ havePeriod bool
+ period int64
+ m profMap
+
+ // encoding state
+ w io.Writer
+ zw *gzip.Writer
+ pb protobuf
+ strings []string
+ stringMap map[string]int
+ locs map[uintptr]int
+ funcs map[string]int // Package path-qualified function name to Function.ID
+ mem []memMap
+}
+
+type memMap struct {
+ start uintptr
+ end uintptr
+}
+
+const (
+ // message Profile
+ tagProfile_SampleType = 1 // repeated ValueType
+ tagProfile_Sample = 2 // repeated Sample
+ tagProfile_Mapping = 3 // repeated Mapping
+ tagProfile_Location = 4 // repeated Location
+ tagProfile_Function = 5 // repeated Function
+ tagProfile_StringTable = 6 // repeated string
+ tagProfile_DropFrames = 7 // int64 (string table index)
+ tagProfile_KeepFrames = 8 // int64 (string table index)
+ tagProfile_TimeNanos = 9 // int64
+ tagProfile_DurationNanos = 10 // int64
+ tagProfile_PeriodType = 11 // ValueType (really optional string???)
+ tagProfile_Period = 12 // int64
+
+ // message ValueType
+ tagValueType_Type = 1 // int64 (string table index)
+ tagValueType_Unit = 2 // int64 (string table index)
+
+ // message Sample
+ tagSample_Location = 1 // repeated uint64
+ tagSample_Value = 2 // repeated int64
+ tagSample_Label = 3 // repeated Label
+
+ // message Label
+ tagLabel_Key = 1 // int64 (string table index)
+ tagLabel_Str = 2 // int64 (string table index)
+ tagLabel_Num = 3 // int64
+
+ // message Mapping
+ tagMapping_ID = 1 // uint64
+ tagMapping_Start = 2 // uint64
+ tagMapping_Limit = 3 // uint64
+ tagMapping_Offset = 4 // uint64
+ tagMapping_Filename = 5 // int64 (string table index)
+ tagMapping_BuildID = 6 // int64 (string table index)
+ tagMapping_HasFunctions = 7 // bool
+ tagMapping_HasFilenames = 8 // bool
+ tagMapping_HasLineNumbers = 9 // bool
+ tagMapping_HasInlineFrames = 10 // bool
+
+ // message Location
+ tagLocation_ID = 1 // uint64
+ tagLocation_MappingID = 2 // uint64
+ tagLocation_Address = 3 // uint64
+ tagLocation_Line = 4 // repeated Line
+
+ // message Line
+ tagLine_FunctionID = 1 // uint64
+ tagLine_Line = 2 // int64
+
+ // message Function
+ tagFunction_ID = 1 // uint64
+ tagFunction_Name = 2 // int64 (string table index)
+ tagFunction_SystemName = 3 // int64 (string table index)
+ tagFunction_Filename = 4 // int64 (string table index)
+ tagFunction_StartLine = 5 // int64
+)
+
+// stringIndex adds s to the string table if not already present
+// and returns the index of s in the string table.
+func (b *profileBuilder) stringIndex(s string) int64 {
+ id, ok := b.stringMap[s]
+ if !ok {
+ id = len(b.strings)
+ b.strings = append(b.strings, s)
+ b.stringMap[s] = id
+ }
+ return int64(id)
+}
+
+func (b *profileBuilder) flush() {
+ const dataFlush = 4096
+ if b.pb.nest == 0 && len(b.pb.data) > dataFlush {
+ b.zw.Write(b.pb.data)
+ b.pb.data = b.pb.data[:0]
+ }
+}
+
+// pbValueType encodes a ValueType message to b.pb.
+func (b *profileBuilder) pbValueType(tag int, typ, unit string) {
+ start := b.pb.startMessage()
+ b.pb.int64(tagValueType_Type, b.stringIndex(typ))
+ b.pb.int64(tagValueType_Unit, b.stringIndex(unit))
+ b.pb.endMessage(tag, start)
+}
+
+// pbSample encodes a Sample message to b.pb.
+func (b *profileBuilder) pbSample(values []int64, locs []uint64, labels func()) {
+ start := b.pb.startMessage()
+ b.pb.int64s(tagSample_Value, values)
+ b.pb.uint64s(tagSample_Location, locs)
+ if labels != nil {
+ labels()
+ }
+ b.pb.endMessage(tagProfile_Sample, start)
+ b.flush()
+}
+
+// pbLabel encodes a Label message to b.pb.
+func (b *profileBuilder) pbLabel(tag int, key, str string, num int64) {
+ start := b.pb.startMessage()
+ b.pb.int64Opt(tagLabel_Key, b.stringIndex(key))
+ b.pb.int64Opt(tagLabel_Str, b.stringIndex(str))
+ b.pb.int64Opt(tagLabel_Num, num)
+ b.pb.endMessage(tag, start)
+}
+
+// pbLine encodes a Line message to b.pb.
+func (b *profileBuilder) pbLine(tag int, funcID uint64, line int64) {
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagLine_FunctionID, funcID)
+ b.pb.int64Opt(tagLine_Line, line)
+ b.pb.endMessage(tag, start)
+}
+
+// pbMapping encodes a Mapping message to b.pb.
+func (b *profileBuilder) pbMapping(tag int, id, base, limit, offset uint64, file, buildID string) {
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagMapping_ID, id)
+ b.pb.uint64Opt(tagMapping_Start, base)
+ b.pb.uint64Opt(tagMapping_Limit, limit)
+ b.pb.uint64Opt(tagMapping_Offset, offset)
+ b.pb.int64Opt(tagMapping_Filename, b.stringIndex(file))
+ b.pb.int64Opt(tagMapping_BuildID, b.stringIndex(buildID))
+ // TODO: Set any of HasInlineFrames, HasFunctions, HasFilenames, HasLineNumbers?
+ // It seems like they should all be true, but they've never been set.
+ b.pb.endMessage(tag, start)
+}
+
+// locForPC returns the location ID for addr.
+// addr must be a return PC. This returns the location of the call.
+// It may emit to b.pb, so there must be no message encoding in progress.
+func (b *profileBuilder) locForPC(addr uintptr) uint64 {
+ id := uint64(b.locs[addr])
+ if id != 0 {
+ return id
+ }
+
+ // Expand this one address using CallersFrames so we can cache
+ // each expansion. In general, CallersFrames takes a whole
+ // stack, but in this case we know there will be no skips in
+ // the stack and we have return PCs anyway.
+ frames := runtime.CallersFrames([]uintptr{addr})
+ frame, more := frames.Next()
+ if frame.Function == "runtime.goexit" {
+ // Short-circuit if we see runtime.goexit so the loop
+ // below doesn't allocate a useless empty location.
+ return 0
+ }
+
+ if frame.PC == 0 {
+ // If we failed to resolve the frame, at least make up
+ // a reasonable call PC. This mostly happens in tests.
+ frame.PC = addr - 1
+ }
+
+ // We can't write out functions while in the middle of the
+ // Location message, so record new functions we encounter and
+ // write them out after the Location.
+ type newFunc struct {
+ id uint64
+ name, file string
+ }
+ newFuncs := make([]newFunc, 0, 8)
+
+ id = uint64(len(b.locs)) + 1
+ b.locs[addr] = int(id)
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagLocation_ID, id)
+ b.pb.uint64Opt(tagLocation_Address, uint64(frame.PC))
+ for frame.Function != "runtime.goexit" {
+ // Write out each line in frame expansion.
+ funcID := uint64(b.funcs[frame.Function])
+ if funcID == 0 {
+ funcID = uint64(len(b.funcs)) + 1
+ b.funcs[frame.Function] = int(funcID)
+ newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File})
+ }
+ b.pbLine(tagLocation_Line, funcID, int64(frame.Line))
+ if !more {
+ break
+ }
+ frame, more = frames.Next()
+ }
+ if len(b.mem) > 0 {
+ i := sort.Search(len(b.mem), func(i int) bool {
+ return b.mem[i].end > addr
+ })
+ if i < len(b.mem) && b.mem[i].start <= addr && addr < b.mem[i].end {
+ b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1))
+ }
+ }
+ b.pb.endMessage(tagProfile_Location, start)
+
+ // Write out functions we found during frame expansion.
+ for _, fn := range newFuncs {
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagFunction_ID, fn.id)
+ b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name))
+ b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name))
+ b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file))
+ b.pb.endMessage(tagProfile_Function, start)
+ }
+
+ b.flush()
+ return id
+}
+
+// newProfileBuilder returns a new profileBuilder.
+// CPU profiling data obtained from the runtime can be added
+// by calling b.addCPUData, and then the eventual profile
+// can be obtained by calling b.finish.
+func newProfileBuilder(w io.Writer) *profileBuilder {
+ zw, _ := gzip.NewWriterLevel(w, gzip.BestSpeed)
+ b := &profileBuilder{
+ w: w,
+ zw: zw,
+ start: time.Now(),
+ strings: []string{""},
+ stringMap: map[string]int{"": 0},
+ locs: map[uintptr]int{},
+ funcs: map[string]int{},
+ }
+ b.readMapping()
+ return b
+}
+
+// addCPUData adds the CPU profiling data to the profile.
+// The data must be a whole number of records,
+// as delivered by the runtime.
+func (b *profileBuilder) addCPUData(data []uint64, tags []unsafe.Pointer) error {
+ if !b.havePeriod {
+ // first record is period
+ if len(data) < 3 {
+ return fmt.Errorf("truncated profile")
+ }
+ if data[0] != 3 || data[2] == 0 {
+ return fmt.Errorf("malformed profile")
+ }
+ // data[2] is sampling rate in Hz. Convert to sampling
+ // period in nanoseconds.
+ b.period = 1e9 / int64(data[2])
+ b.havePeriod = true
+ data = data[3:]
+ }
+
+ // Parse CPU samples from the profile.
+ // Each sample is 3+n uint64s:
+ // data[0] = 3+n
+ // data[1] = time stamp (ignored)
+ // data[2] = count
+ // data[3:3+n] = stack
+ // If the count is 0 and the stack has length 1,
+ // that's an overflow record inserted by the runtime
+ // to indicate that stack[0] samples were lost.
+ // Otherwise the count is usually 1,
+ // but in a few special cases like lost non-Go samples
+ // there can be larger counts.
+ // Because many samples with the same stack arrive,
+ // we want to deduplicate immediately, which we do
+ // using the b.m profMap.
+ for len(data) > 0 {
+ if len(data) < 3 || data[0] > uint64(len(data)) {
+ return fmt.Errorf("truncated profile")
+ }
+ if data[0] < 3 || tags != nil && len(tags) < 1 {
+ return fmt.Errorf("malformed profile")
+ }
+ count := data[2]
+ stk := data[3:data[0]]
+ data = data[data[0]:]
+ var tag unsafe.Pointer
+ if tags != nil {
+ tag = tags[0]
+ tags = tags[1:]
+ }
+
+ if count == 0 && len(stk) == 1 {
+ // overflow record
+ count = uint64(stk[0])
+ stk = []uint64{
+ uint64(funcPC(lostProfileEvent)),
+ }
+ }
+ b.m.lookup(stk, tag).count += int64(count)
+ }
+ return nil
+}
+
+// build completes and returns the constructed profile.
+func (b *profileBuilder) build() error {
+ b.end = time.Now()
+
+ b.pb.int64Opt(tagProfile_TimeNanos, b.start.UnixNano())
+ if b.havePeriod { // must be CPU profile
+ b.pbValueType(tagProfile_SampleType, "samples", "count")
+ b.pbValueType(tagProfile_SampleType, "cpu", "nanoseconds")
+ b.pb.int64Opt(tagProfile_DurationNanos, b.end.Sub(b.start).Nanoseconds())
+ b.pbValueType(tagProfile_PeriodType, "cpu", "nanoseconds")
+ b.pb.int64Opt(tagProfile_Period, b.period)
+ }
+
+ values := []int64{0, 0}
+ var locs []uint64
+ for e := b.m.all; e != nil; e = e.nextAll {
+ values[0] = e.count
+ values[1] = e.count * b.period
+
+ var labels func()
+ if e.tag != nil {
+ labels = func() {
+ for k, v := range *(*labelMap)(e.tag) {
+ b.pbLabel(tagSample_Label, k, v, 0)
+ }
+ }
+ }
+
+ locs = locs[:0]
+ for i, addr := range e.stk {
+ // Addresses from stack traces point to the
+ // next instruction after each call, except
+ // for the leaf, which points to where the
+ // signal occurred. locForPC expects return
+ // PCs, so increment the leaf address to look
+ // like a return PC.
+ if i == 0 {
+ addr++
+ }
+ l := b.locForPC(addr)
+ if l == 0 { // runtime.goexit
+ continue
+ }
+ locs = append(locs, l)
+ }
+ b.pbSample(values, locs, labels)
+ }
+
+ // TODO: Anything for tagProfile_DropFrames?
+ // TODO: Anything for tagProfile_KeepFrames?
+
+ b.pb.strings(tagProfile_StringTable, b.strings)
+ b.zw.Write(b.pb.data)
+ b.zw.Close()
+ return nil
+}
+
+// readMapping reads /proc/self/maps and writes mappings to b.pb.
+// It saves the address ranges of the mappings in b.mem for use
+// when emitting locations.
+func (b *profileBuilder) readMapping() {
+ data, _ := ioutil.ReadFile("/proc/self/maps")
+ parseProcSelfMaps(data, b.addMapping)
+}
+
+func parseProcSelfMaps(data []byte, addMapping func(lo, hi, offset uint64, file, buildID string)) {
+ // $ cat /proc/self/maps
+ // 00400000-0040b000 r-xp 00000000 fc:01 787766 /bin/cat
+ // 0060a000-0060b000 r--p 0000a000 fc:01 787766 /bin/cat
+ // 0060b000-0060c000 rw-p 0000b000 fc:01 787766 /bin/cat
+ // 014ab000-014cc000 rw-p 00000000 00:00 0 [heap]
+ // 7f7d76af8000-7f7d7797c000 r--p 00000000 fc:01 1318064 /usr/lib/locale/locale-archive
+ // 7f7d7797c000-7f7d77b36000 r-xp 00000000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+ // 7f7d77b36000-7f7d77d36000 ---p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+ // 7f7d77d36000-7f7d77d3a000 r--p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+ // 7f7d77d3a000-7f7d77d3c000 rw-p 001be000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+ // 7f7d77d3c000-7f7d77d41000 rw-p 00000000 00:00 0
+ // 7f7d77d41000-7f7d77d64000 r-xp 00000000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+ // 7f7d77f3f000-7f7d77f42000 rw-p 00000000 00:00 0
+ // 7f7d77f61000-7f7d77f63000 rw-p 00000000 00:00 0
+ // 7f7d77f63000-7f7d77f64000 r--p 00022000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+ // 7f7d77f64000-7f7d77f65000 rw-p 00023000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+ // 7f7d77f65000-7f7d77f66000 rw-p 00000000 00:00 0
+ // 7ffc342a2000-7ffc342c3000 rw-p 00000000 00:00 0 [stack]
+ // 7ffc34343000-7ffc34345000 r-xp 00000000 00:00 0 [vdso]
+ // ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
+
+ var line []byte
+ // next removes and returns the next field in the line.
+ // It also removes from line any spaces following the field.
+ next := func() []byte {
+ j := bytes.IndexByte(line, ' ')
+ if j < 0 {
+ f := line
+ line = nil
+ return f
+ }
+ f := line[:j]
+ line = line[j+1:]
+ for len(line) > 0 && line[0] == ' ' {
+ line = line[1:]
+ }
+ return f
+ }
+
+ for len(data) > 0 {
+ i := bytes.IndexByte(data, '\n')
+ if i < 0 {
+ line, data = data, nil
+ } else {
+ line, data = data[:i], data[i+1:]
+ }
+ addr := next()
+ i = bytes.IndexByte(addr, '-')
+ if i < 0 {
+ continue
+ }
+ lo, err := strconv.ParseUint(string(addr[:i]), 16, 64)
+ if err != nil {
+ continue
+ }
+ hi, err := strconv.ParseUint(string(addr[i+1:]), 16, 64)
+ if err != nil {
+ continue
+ }
+ perm := next()
+ if len(perm) < 4 || perm[2] != 'x' {
+ // Only interested in executable mappings.
+ continue
+ }
+ offset, err := strconv.ParseUint(string(next()), 16, 64)
+ if err != nil {
+ continue
+ }
+ next() // dev
+ inode := next() // inode
+ if line == nil {
+ continue
+ }
+ file := string(line)
+ if len(inode) == 1 && inode[0] == '0' && file == "" {
+ // Huge-page text mappings list the initial fragment of
+ // mapped but unpopulated memory as being inode 0.
+ // Don't report that part.
+ // But [vdso] and [vsyscall] are inode 0, so let non-empty file names through.
+ continue
+ }
+
+ // TODO: pprof's remapMappingIDs makes two adjustments:
+ // 1. If there is an /anon_hugepage mapping first and it is
+ // consecutive to a next mapping, drop the /anon_hugepage.
+ // 2. If start-offset = 0x400000, change start to 0x400000 and offset to 0.
+ // There's no indication why either of these is needed.
+ // Let's try not doing these and see what breaks.
+ // If we do need them, they would go here, before we
+ // enter the mappings into b.mem in the first place.
+
+ buildID, _ := elfBuildID(file)
+ addMapping(lo, hi, offset, file, buildID)
+ }
+}
+
+func (b *profileBuilder) addMapping(lo, hi, offset uint64, file, buildID string) {
+ b.mem = append(b.mem, memMap{uintptr(lo), uintptr(hi)})
+ b.pbMapping(tagProfile_Mapping, uint64(len(b.mem)), lo, hi, offset, file, buildID)
+}
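
The comment inside addCPUData documents the runtime's binary record layout: each record is 3+n uint64s — length, timestamp, count, then the stack — and a record with count 0 and a one-entry stack is an overflow marker carrying the number of lost samples. The sketch below is a hypothetical standalone walker over that layout; it assumes the leading {3, 0, hz} period record has already been consumed, as in addCPUData, and all sample values are made up.

package main

import "fmt"

// walkCPURecords iterates over the runtime's binary CPU-profile
// records, as described in addCPUData: each record is
// [3+n, timestamp, count, stack...].
func walkCPURecords(data []uint64, f func(count uint64, stk []uint64)) error {
	for len(data) > 0 {
		if len(data) < 3 || data[0] > uint64(len(data)) || data[0] < 3 {
			return fmt.Errorf("truncated or malformed profile")
		}
		f(data[2], data[3:data[0]])
		data = data[data[0]:]
	}
	return nil
}

func main() {
	// One ordinary sample (count 1, two PCs) followed by an overflow
	// record (count 0, one-entry stack holding the lost-sample count).
	data := []uint64{
		5, 0, 1, 0x401000, 0x401200,
		4, 0, 0, 7,
	}
	walkCPURecords(data, func(count uint64, stk []uint64) {
		if count == 0 && len(stk) == 1 {
			fmt.Printf("overflow: %d samples lost\n", stk[0])
			return
		}
		fmt.Printf("count=%d stack=%#x\n", count, stk)
	})
}
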
diff --git a/libgo/go/runtime/pprof/proto_test.go b/libgo/go/runtime/pprof/proto_test.go
new file mode 100644
index 0000000..a268c3a
--- /dev/null
+++ b/libgo/go/runtime/pprof/proto_test.go
@@ -0,0 +1,224 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "reflect"
+ "runtime"
+ "runtime/pprof/internal/profile"
+ "strings"
+ "testing"
+)
+
+// translateCPUProfile parses binary CPU profiling stack trace data
+// generated by runtime.CPUProfile() into a profile struct.
+// This is only used for testing. Real conversions stream the
+// data into the profileBuilder as it becomes available.
+func translateCPUProfile(data []uint64) (*profile.Profile, error) {
+ var buf bytes.Buffer
+ b := newProfileBuilder(&buf)
+ if err := b.addCPUData(data, nil); err != nil {
+ return nil, err
+ }
+ b.build()
+ return profile.Parse(&buf)
+}
+
+// fmtJSON returns a pretty-printed JSON form for x.
+// It works reasonably well for printing protocol-buffer
+// data structures like profile.Profile.
+func fmtJSON(x interface{}) string {
+ js, _ := json.MarshalIndent(x, "", "\t")
+ return string(js)
+}
+
+func TestConvertCPUProfileEmpty(t *testing.T) {
+ // A test server with mock cpu profile data.
+ var buf bytes.Buffer
+
+ b := []uint64{3, 0, 500} // empty profile at 500 Hz (2ms sample period)
+ p, err := translateCPUProfile(b)
+ if err != nil {
+ t.Fatalf("translateCPUProfile: %v", err)
+ }
+ if err := p.Write(&buf); err != nil {
+ t.Fatalf("writing profile: %v", err)
+ }
+
+ p, err = profile.Parse(&buf)
+ if err != nil {
+ t.Fatalf("profile.Parse: %v", err)
+ }
+
+ // Expected PeriodType and SampleType.
+ periodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
+ sampleType := []*profile.ValueType{
+ {Type: "samples", Unit: "count"},
+ {Type: "cpu", Unit: "nanoseconds"},
+ }
+
+ checkProfile(t, p, 2000*1000, periodType, sampleType, nil)
+}
+
+// For gccgo, make these functions different so that the compiler
+// doesn't merge them with each other or with lostProfileEvent.
+func f1(i int) { f1(i + 1) }
+func f2(i int) { f2(i + 2) }
+
+// testPCs returns two PCs and two corresponding memory mappings
+// to use in test profiles.
+func testPCs(t *testing.T) (addr1, addr2 uint64, map1, map2 *profile.Mapping) {
+ switch runtime.GOOS {
+ case "linux", "android", "netbsd":
+ // Figure out two addresses from /proc/self/maps.
+ mmap, err := ioutil.ReadFile("/proc/self/maps")
+ if err != nil {
+ t.Fatal(err)
+ }
+ mprof := &profile.Profile{}
+ if err = mprof.ParseMemoryMap(bytes.NewReader(mmap)); err != nil {
+ t.Fatalf("parsing /proc/self/maps: %v", err)
+ }
+ if len(mprof.Mapping) < 2 {
+ // It is possible for a binary to only have 1 executable
+ // region of memory.
+ t.Skipf("need 2 or more mappings, got %v", len(mprof.Mapping))
+ }
+ addr1 = mprof.Mapping[0].Start
+ map1 = mprof.Mapping[0]
+ map1.BuildID, _ = elfBuildID(map1.File)
+ addr2 = mprof.Mapping[1].Start
+ map2 = mprof.Mapping[1]
+ map2.BuildID, _ = elfBuildID(map2.File)
+ default:
+ addr1 = uint64(funcPC(f1))
+ addr2 = uint64(funcPC(f2))
+ }
+ return
+}
+
+func TestConvertCPUProfile(t *testing.T) {
+ addr1, addr2, map1, map2 := testPCs(t)
+
+ b := []uint64{
+ 3, 0, 500, // hz = 500
+ 5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1
+ 5, 0, 40, uint64(addr2), uint64(addr2 + 2), // 40 samples in addr2
+ 5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1
+ }
+ p, err := translateCPUProfile(b)
+ if err != nil {
+ t.Fatalf("translating profile: %v", err)
+ }
+ period := int64(2000 * 1000)
+ periodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
+ sampleType := []*profile.ValueType{
+ {Type: "samples", Unit: "count"},
+ {Type: "cpu", Unit: "nanoseconds"},
+ }
+ samples := []*profile.Sample{
+ {Value: []int64{20, 20 * 2000 * 1000}, Location: []*profile.Location{
+ {ID: 1, Mapping: map1, Address: addr1},
+ {ID: 2, Mapping: map1, Address: addr1 + 1},
+ }},
+ {Value: []int64{40, 40 * 2000 * 1000}, Location: []*profile.Location{
+ {ID: 3, Mapping: map2, Address: addr2},
+ {ID: 4, Mapping: map2, Address: addr2 + 1},
+ }},
+ }
+ checkProfile(t, p, period, periodType, sampleType, samples)
+}
+
+func checkProfile(t *testing.T, p *profile.Profile, period int64, periodType *profile.ValueType, sampleType []*profile.ValueType, samples []*profile.Sample) {
+ if p.Period != period {
+ t.Fatalf("p.Period = %d, want %d", p.Period, period)
+ }
+ if !reflect.DeepEqual(p.PeriodType, periodType) {
+ t.Fatalf("p.PeriodType = %v\nwant = %v", fmtJSON(p.PeriodType), fmtJSON(periodType))
+ }
+ if !reflect.DeepEqual(p.SampleType, sampleType) {
+ t.Fatalf("p.SampleType = %v\nwant = %v", fmtJSON(p.SampleType), fmtJSON(sampleType))
+ }
+ // Clear line info since it is not in the expected samples.
+ // If we used f1 and f2 above, then the samples will have line info.
+ for _, s := range p.Sample {
+ for _, l := range s.Location {
+ l.Line = nil
+ }
+ }
+ if fmtJSON(p.Sample) != fmtJSON(samples) { // ignore unexported fields
+ if len(p.Sample) == len(samples) {
+ for i := range p.Sample {
+ if !reflect.DeepEqual(p.Sample[i], samples[i]) {
+ t.Errorf("sample %d = %v\nwant = %v\n", i, fmtJSON(p.Sample[i]), fmtJSON(samples[i]))
+ }
+ }
+ if t.Failed() {
+ t.FailNow()
+ }
+ }
+ t.Fatalf("p.Sample = %v\nwant = %v", fmtJSON(p.Sample), fmtJSON(samples))
+ }
+}
+
+var profSelfMapsTests = `
+00400000-0040b000 r-xp 00000000 fc:01 787766 /bin/cat
+0060a000-0060b000 r--p 0000a000 fc:01 787766 /bin/cat
+0060b000-0060c000 rw-p 0000b000 fc:01 787766 /bin/cat
+014ab000-014cc000 rw-p 00000000 00:00 0 [heap]
+7f7d76af8000-7f7d7797c000 r--p 00000000 fc:01 1318064 /usr/lib/locale/locale-archive
+7f7d7797c000-7f7d77b36000 r-xp 00000000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77b36000-7f7d77d36000 ---p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77d36000-7f7d77d3a000 r--p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77d3a000-7f7d77d3c000 rw-p 001be000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77d3c000-7f7d77d41000 rw-p 00000000 00:00 0
+7f7d77d41000-7f7d77d64000 r-xp 00000000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+7f7d77f3f000-7f7d77f42000 rw-p 00000000 00:00 0
+7f7d77f61000-7f7d77f63000 rw-p 00000000 00:00 0
+7f7d77f63000-7f7d77f64000 r--p 00022000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+7f7d77f64000-7f7d77f65000 rw-p 00023000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+7f7d77f65000-7f7d77f66000 rw-p 00000000 00:00 0
+7ffc342a2000-7ffc342c3000 rw-p 00000000 00:00 0 [stack]
+7ffc34343000-7ffc34345000 r-xp 00000000 00:00 0 [vdso]
+ffffffffff600000-ffffffffff601000 r-xp 00000090 00:00 0 [vsyscall]
+->
+00400000 0040b000 00000000 /bin/cat
+7f7d7797c000 7f7d77b36000 00000000 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77d41000 7f7d77d64000 00000000 /lib/x86_64-linux-gnu/ld-2.19.so
+7ffc34343000 7ffc34345000 00000000 [vdso]
+ffffffffff600000 ffffffffff601000 00000090 [vsyscall]
+
+00400000-07000000 r-xp 00000000 00:00 0
+07000000-07093000 r-xp 06c00000 00:2e 536754 /path/to/gobench_server_main
+07093000-0722d000 rw-p 06c92000 00:2e 536754 /path/to/gobench_server_main
+0722d000-07b21000 rw-p 00000000 00:00 0
+c000000000-c000036000 rw-p 00000000 00:00 0
+->
+07000000 07093000 06c00000 /path/to/gobench_server_main
+`
+
+func TestProcSelfMaps(t *testing.T) {
+ for tx, tt := range strings.Split(profSelfMapsTests, "\n\n") {
+ i := strings.Index(tt, "->\n")
+ if i < 0 {
+ t.Fatal("malformed test case")
+ }
+ in, out := tt[:i], tt[i+len("->\n"):]
+ if len(out) > 0 && out[len(out)-1] != '\n' {
+ out += "\n"
+ }
+ var buf bytes.Buffer
+ parseProcSelfMaps([]byte(in), func(lo, hi, offset uint64, file, buildID string) {
+ fmt.Fprintf(&buf, "%08x %08x %08x %s\n", lo, hi, offset, file)
+ })
+ if buf.String() != out {
+ t.Errorf("#%d: have:\n%s\nwant:\n%s\n%q\n%q", tx, buf.String(), out, buf.String(), out)
+ }
+ }
+}
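
TestConvertCPUProfileEmpty feeds the header record {3, 0, 500} and then expects a period of 2000*1000. That number falls straight out of the conversion in addCPUData, which sets period = 1e9 / hz nanoseconds. The snippet below is purely illustrative arithmetic confirming the expected values.

package main

import "fmt"

func main() {
	const hz = 500                   // sampling rate from the header record {3, 0, 500}
	period := int64(1e9) / int64(hz) // nanoseconds per sample, as computed in addCPUData
	fmt.Println(period)              // 2000000, i.e. the 2000*1000 the tests expect
	// A stack sampled 40 times therefore accounts for 40 * period ns of CPU:
	fmt.Println(40 * period) // 80000000, matching the {40, 40 * 2000 * 1000} expectation
}
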
diff --git a/libgo/go/runtime/pprof/protobuf.go b/libgo/go/runtime/pprof/protobuf.go
new file mode 100644
index 0000000..7b99095
--- /dev/null
+++ b/libgo/go/runtime/pprof/protobuf.go
@@ -0,0 +1,141 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+// A protobuf is a simple protocol buffer encoder.
+type protobuf struct {
+ data []byte
+ tmp [16]byte
+ nest int
+}
+
+func (b *protobuf) varint(x uint64) {
+ for x >= 128 {
+ b.data = append(b.data, byte(x)|0x80)
+ x >>= 7
+ }
+ b.data = append(b.data, byte(x))
+}
+
+func (b *protobuf) length(tag int, len int) {
+ b.varint(uint64(tag)<<3 | 2)
+ b.varint(uint64(len))
+}
+
+func (b *protobuf) uint64(tag int, x uint64) {
+ // append varint to b.data
+ b.varint(uint64(tag)<<3 | 0)
+ b.varint(x)
+}
+
+func (b *protobuf) uint64s(tag int, x []uint64) {
+ if len(x) > 2 {
+ // Use packed encoding
+ n1 := len(b.data)
+ for _, u := range x {
+ b.varint(u)
+ }
+ n2 := len(b.data)
+ b.length(tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ return
+ }
+ for _, u := range x {
+ b.uint64(tag, u)
+ }
+}
+
+func (b *protobuf) uint64Opt(tag int, x uint64) {
+ if x == 0 {
+ return
+ }
+ b.uint64(tag, x)
+}
+
+func (b *protobuf) int64(tag int, x int64) {
+ u := uint64(x)
+ b.uint64(tag, u)
+}
+
+func (b *protobuf) int64Opt(tag int, x int64) {
+ if x == 0 {
+ return
+ }
+ b.int64(tag, x)
+}
+
+func (b *protobuf) int64s(tag int, x []int64) {
+ if len(x) > 2 {
+ // Use packed encoding
+ n1 := len(b.data)
+ for _, u := range x {
+ b.varint(uint64(u))
+ }
+ n2 := len(b.data)
+ b.length(tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ return
+ }
+ for _, u := range x {
+ b.int64(tag, u)
+ }
+}
+
+func (b *protobuf) string(tag int, x string) {
+ b.length(tag, len(x))
+ b.data = append(b.data, x...)
+}
+
+func (b *protobuf) strings(tag int, x []string) {
+ for _, s := range x {
+ b.string(tag, s)
+ }
+}
+
+func (b *protobuf) stringOpt(tag int, x string) {
+ if x == "" {
+ return
+ }
+ b.string(tag, x)
+}
+
+func (b *protobuf) bool(tag int, x bool) {
+ if x {
+ b.uint64(tag, 1)
+ } else {
+ b.uint64(tag, 0)
+ }
+}
+
+func (b *protobuf) boolOpt(tag int, x bool) {
+	if !x {
+ return
+ }
+ b.bool(tag, x)
+}
+
+type msgOffset int
+
+func (b *protobuf) startMessage() msgOffset {
+ b.nest++
+ return msgOffset(len(b.data))
+}
+
+func (b *protobuf) endMessage(tag int, start msgOffset) {
+ n1 := int(start)
+ n2 := len(b.data)
+ b.length(tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ b.nest--
+}
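
The varint method above is the standard protocol-buffer base-128 encoding: seven payload bits per byte, with the high bit set on every byte except the last. The standalone copy below shows a few worked values; it is a re-statement of the same loop for illustration, not code from the patch.

package main

import "fmt"

// varint appends x to buf in protocol-buffer base-128 varint form,
// exactly as protobuf.varint does above.
func varint(buf []byte, x uint64) []byte {
	for x >= 128 {
		buf = append(buf, byte(x)|0x80)
		x >>= 7
	}
	return append(buf, byte(x))
}

func main() {
	fmt.Printf("% x\n", varint(nil, 1))   // 01
	fmt.Printf("% x\n", varint(nil, 300)) // ac 02  (300 = 0b10_0101100)
	// A field header is (tag<<3 | wireType); tag 12 (Period), varint wire type 0:
	fmt.Printf("% x\n", varint(nil, 12<<3|0)) // 60
}
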
diff --git a/libgo/go/runtime/pprof/protomem.go b/libgo/go/runtime/pprof/protomem.go
new file mode 100644
index 0000000..2756cfd
--- /dev/null
+++ b/libgo/go/runtime/pprof/protomem.go
@@ -0,0 +1,93 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "io"
+ "math"
+ "runtime"
+ "strings"
+)
+
+// writeHeapProto writes the current heap profile in protobuf format to w.
+func writeHeapProto(w io.Writer, p []runtime.MemProfileRecord, rate int64) error {
+ b := newProfileBuilder(w)
+ b.pbValueType(tagProfile_PeriodType, "space", "bytes")
+ b.pb.int64Opt(tagProfile_Period, rate)
+ b.pbValueType(tagProfile_SampleType, "alloc_objects", "count")
+ b.pbValueType(tagProfile_SampleType, "alloc_space", "bytes")
+ b.pbValueType(tagProfile_SampleType, "inuse_objects", "count")
+ b.pbValueType(tagProfile_SampleType, "inuse_space", "bytes")
+
+ values := []int64{0, 0, 0, 0}
+ var locs []uint64
+ for _, r := range p {
+ locs = locs[:0]
+ hideRuntime := true
+ for tries := 0; tries < 2; tries++ {
+ for _, addr := range r.Stack() {
+ // For heap profiles, all stack
+ // addresses are return PCs, which is
+ // what locForPC expects.
+ if hideRuntime {
+ if f := runtime.FuncForPC(addr); f != nil && strings.HasPrefix(f.Name(), "runtime.") {
+ continue
+ }
+ // Found non-runtime. Show any runtime uses above it.
+ hideRuntime = false
+ }
+ l := b.locForPC(addr)
+ if l == 0 { // runtime.goexit
+ continue
+ }
+ locs = append(locs, l)
+ }
+ if len(locs) > 0 {
+ break
+ }
+ hideRuntime = false // try again, and show all frames
+ }
+
+ values[0], values[1] = scaleHeapSample(r.AllocObjects, r.AllocBytes, rate)
+ values[2], values[3] = scaleHeapSample(r.InUseObjects(), r.InUseBytes(), rate)
+ var blockSize int64
+ if values[0] > 0 {
+ blockSize = values[1] / values[0]
+ }
+ b.pbSample(values, locs, func() {
+ if blockSize != 0 {
+ b.pbLabel(tagSample_Label, "bytes", "", blockSize)
+ }
+ })
+ }
+ b.build()
+ return nil
+}
+
+// scaleHeapSample adjusts the data from a heap Sample to
+// account for its probability of appearing in the collected
+// data. Heap profiles are a sampling of the memory allocation
+// requests in a program. We estimate the unsampled value by dividing
+// each collected sample by its probability of appearing in the
+// profile. Heap profiles rely on a Poisson process to determine
+// which samples to collect, based on the desired average collection
+// rate R. The probability of a sample of size S appearing in that
+// profile is 1-exp(-S/R).
+func scaleHeapSample(count, size, rate int64) (int64, int64) {
+ if count == 0 || size == 0 {
+ return 0, 0
+ }
+
+ if rate <= 1 {
+ // if rate==1 all samples were collected so no adjustment is needed.
+ // if rate<1 treat as unknown and skip scaling.
+ return count, size
+ }
+
+ avgSize := float64(size) / float64(count)
+ scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
+
+ return int64(float64(count) * scale), int64(float64(size) * scale)
+}
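
To make the Poisson scaling concrete, the expected values in TestConvertMemProfile (next file) can be reproduced by re-applying 1/(1-exp(-avgSize/rate)) to its two distinct records. The sketch below is a local re-statement of the formula for illustration, not the patch's scaleHeapSample itself.

package main

import (
	"fmt"
	"math"
)

// scale applies the heap-sample correction 1/(1-exp(-avgSize/rate)),
// the same formula as scaleHeapSample above.
func scale(count, size, rate int64) (int64, int64) {
	avgSize := float64(size) / float64(count)
	s := 1 / (1 - math.Exp(-avgSize/float64(rate)))
	return int64(float64(count) * s), int64(float64(size) * s)
}

func main() {
	const rate = 512 * 1024 // profiling rate used in TestConvertMemProfile

	// 4 allocations of 1 KiB each: scale ≈ 512.5, giving 2050 objects / 2099200 bytes.
	fmt.Println(scale(4, 4096, rate))

	// A single 512 KiB allocation: scale = 1/(1-exp(-1)) ≈ 1.582, giving 1 object / 829411 bytes.
	fmt.Println(scale(1, 512*1024, rate))
}
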
diff --git a/libgo/go/runtime/pprof/protomem_test.go b/libgo/go/runtime/pprof/protomem_test.go
new file mode 100644
index 0000000..1e30ed9
--- /dev/null
+++ b/libgo/go/runtime/pprof/protomem_test.go
@@ -0,0 +1,74 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "bytes"
+ "runtime"
+ "runtime/pprof/internal/profile"
+ "testing"
+)
+
+func TestConvertMemProfile(t *testing.T) {
+ addr1, addr2, map1, map2 := testPCs(t)
+
+ var buf bytes.Buffer
+ // MemProfileRecord stacks are return PCs, so add one to the
+ // addresses recorded in the "profile". The proto profile
+ // locations are call PCs, so conversion will subtract one
+ // from these and get back to addr1 and addr2.
+ a1, a2 := uintptr(addr1)+1, uintptr(addr2)+1
+ rate := int64(512 * 1024)
+ rec := []runtime.MemProfileRecord{
+ {AllocBytes: 4096, FreeBytes: 1024, AllocObjects: 4, FreeObjects: 1, Stack0: [32]uintptr{a1, a2}},
+ {AllocBytes: 512 * 1024, FreeBytes: 0, AllocObjects: 1, FreeObjects: 0, Stack0: [32]uintptr{a2 + 1, a2 + 2}},
+ {AllocBytes: 512 * 1024, FreeBytes: 512 * 1024, AllocObjects: 1, FreeObjects: 1, Stack0: [32]uintptr{a1 + 1, a1 + 2, a2 + 3}},
+ }
+
+ if err := writeHeapProto(&buf, rec, rate); err != nil {
+ t.Fatalf("writing profile: %v", err)
+ }
+
+ p, err := profile.Parse(&buf)
+ if err != nil {
+ t.Fatalf("profile.Parse: %v", err)
+ }
+
+ periodType := &profile.ValueType{Type: "space", Unit: "bytes"}
+ sampleType := []*profile.ValueType{
+ {Type: "alloc_objects", Unit: "count"},
+ {Type: "alloc_space", Unit: "bytes"},
+ {Type: "inuse_objects", Unit: "count"},
+ {Type: "inuse_space", Unit: "bytes"},
+ }
+ samples := []*profile.Sample{
+ {
+ Value: []int64{2050, 2099200, 1537, 1574400},
+ Location: []*profile.Location{
+ {ID: 1, Mapping: map1, Address: addr1},
+ {ID: 2, Mapping: map2, Address: addr2},
+ },
+ NumLabel: map[string][]int64{"bytes": {1024}},
+ },
+ {
+ Value: []int64{1, 829411, 1, 829411},
+ Location: []*profile.Location{
+ {ID: 3, Mapping: map2, Address: addr2 + 1},
+ {ID: 4, Mapping: map2, Address: addr2 + 2},
+ },
+ NumLabel: map[string][]int64{"bytes": {829411}},
+ },
+ {
+ Value: []int64{1, 829411, 0, 0},
+ Location: []*profile.Location{
+ {ID: 5, Mapping: map1, Address: addr1 + 1},
+ {ID: 6, Mapping: map1, Address: addr1 + 2},
+ {ID: 7, Mapping: map2, Address: addr2 + 3},
+ },
+ NumLabel: map[string][]int64{"bytes": {829411}},
+ },
+ }
+ checkProfile(t, p, rate, periodType, sampleType, samples)
+}
diff --git a/libgo/go/runtime/pprof/runtime.go b/libgo/go/runtime/pprof/runtime.go
new file mode 100644
index 0000000..e6aace8
--- /dev/null
+++ b/libgo/go/runtime/pprof/runtime.go
@@ -0,0 +1,36 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "context"
+ "unsafe"
+)
+
+// runtime_setProfLabel is defined in runtime/proflabel.go.
+func runtime_setProfLabel(labels unsafe.Pointer)
+
+// runtime_getProfLabel is defined in runtime/proflabel.go.
+func runtime_getProfLabel() unsafe.Pointer
+
+// SetGoroutineLabels sets the current goroutine's labels to match ctx.
+// This is a lower-level API than Do, which should be used instead when possible.
+func SetGoroutineLabels(ctx context.Context) {
+ ctxLabels, _ := ctx.Value(labelContextKey{}).(*labelMap)
+ runtime_setProfLabel(unsafe.Pointer(ctxLabels))
+}
+
+// Do calls f with a copy of the parent context with the
+// given labels added to the parent's label map.
+// Each key/value pair in labels is inserted into the label map in the
+// order provided, overriding any previous value for the same key.
+// The augmented label map will be set for the duration of the call to f
+// and restored once f returns.
+func Do(ctx context.Context, labels LabelSet, f func(context.Context)) {
+ defer SetGoroutineLabels(ctx)
+ ctx = WithLabels(ctx, labels)
+ SetGoroutineLabels(ctx)
+ f(ctx)
+}
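
Do and SetGoroutineLabels above are the public entry points for profiler labels, and newly created goroutines inherit the parent's labels (see the newproc change further down). A minimal usage sketch, with made-up label names and not part of the patch:

	package main

	import (
		"context"
		"runtime/pprof"
	)

	func work(ctx context.Context) {
		// If a CPU profile is being collected, samples taken while this runs
		// carry the labels set by Do, as do samples from goroutines started here.
		for i := 0; i < 1000000; i++ {
			_ = i * i
		}
	}

	func main() {
		pprof.Do(context.Background(), pprof.Labels("worker", "indexer", "shard", "7"), work)
	}
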
diff --git a/libgo/go/runtime/pprof/runtime_test.go b/libgo/go/runtime/pprof/runtime_test.go
new file mode 100644
index 0000000..0dd5324
--- /dev/null
+++ b/libgo/go/runtime/pprof/runtime_test.go
@@ -0,0 +1,96 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "context"
+ "fmt"
+ "reflect"
+ "testing"
+)
+
+func TestSetGoroutineLabels(t *testing.T) {
+ sync := make(chan struct{})
+
+ wantLabels := map[string]string{}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected parent goroutine's profile labels to be empty before test, got %v", gotLabels)
+ }
+ go func() {
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected child goroutine's profile labels to be empty before test, got %v", gotLabels)
+ }
+ sync <- struct{}{}
+ }()
+ <-sync
+
+ wantLabels = map[string]string{"key": "value"}
+ ctx := WithLabels(context.Background(), Labels("key", "value"))
+ SetGoroutineLabels(ctx)
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("parent goroutine's profile labels: got %v, want %v", gotLabels, wantLabels)
+ }
+ go func() {
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("child goroutine's profile labels: got %v, want %v", gotLabels, wantLabels)
+ }
+ sync <- struct{}{}
+ }()
+ <-sync
+
+ wantLabels = map[string]string{}
+ ctx = context.Background()
+ SetGoroutineLabels(ctx)
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected parent goroutine's profile labels to be empty, got %v", gotLabels)
+ }
+ go func() {
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected child goroutine's profile labels to be empty, got %v", gotLabels)
+ }
+ sync <- struct{}{}
+ }()
+ <-sync
+}
+
+func TestDo(t *testing.T) {
+ wantLabels := map[string]string{}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected parent goroutine's profile labels to be empty before Do, got %v", gotLabels)
+ }
+
+ Do(context.Background(), Labels("key1", "value1", "key2", "value2"), func(ctx context.Context) {
+ wantLabels := map[string]string{"key1": "value1", "key2": "value2"}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("parent goroutine's profile labels: got %v, want %v", gotLabels, wantLabels)
+ }
+
+ sync := make(chan struct{})
+ go func() {
+ wantLabels := map[string]string{"key1": "value1", "key2": "value2"}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("child goroutine's profile labels: got %v, want %v", gotLabels, wantLabels)
+ }
+ sync <- struct{}{}
+ }()
+ <-sync
+
+ })
+
+ wantLabels = map[string]string{}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ fmt.Printf("%#v", gotLabels)
+ fmt.Printf("%#v", wantLabels)
+ t.Errorf("Expected parent goroutine's profile labels to be empty after Do, got %v", gotLabels)
+ }
+}
+
+func getProfLabel() map[string]string {
+ l := (*labelMap)(runtime_getProfLabel())
+ if l == nil {
+ return map[string]string{}
+ }
+ return *l
+}
diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go
index 6b6f9c9..345f57b 100644
--- a/libgo/go/runtime/proc.go
+++ b/libgo/go/runtime/proc.go
@@ -214,8 +214,17 @@ func main() {
// Make racy client program work: if panicking on
// another goroutine at the same time as main returns,
// let the other goroutine finish printing the panic trace.
- // Once it does, it will exit. See issue 3934.
- if panicking != 0 {
+ // Once it does, it will exit. See issues 3934 and 20018.
+ if atomic.Load(&runningPanicDefers) != 0 {
+ // Running deferred functions should not take long.
+ for c := 0; c < 1000; c++ {
+ if atomic.Load(&runningPanicDefers) == 0 {
+ break
+ }
+ Gosched()
+ }
+ }
+ if atomic.Load(&panicking) != 0 {
gopark(nil, nil, "panicwait", traceEvGoStop, 1)
}
@@ -255,18 +264,25 @@ func forcegchelper() {
if debug.gctrace > 0 {
println("GC forced")
}
- gcStart(gcBackgroundMode, true)
+ // Time-triggered, fully concurrent.
+ gcStart(gcBackgroundMode, gcTrigger{kind: gcTriggerTime, now: nanotime()})
}
}
-//go:nosplit
-
// Gosched yields the processor, allowing other goroutines to run. It does not
// suspend the current goroutine, so execution resumes automatically.
+//go:nosplit
func Gosched() {
mcall(gosched_m)
}
+// goschedguarded yields the processor like gosched, but also checks
+// for forbidden states and opts out of the yield in those cases.
+//go:nosplit
+func goschedguarded() {
+ mcall(goschedguarded_m)
+}
+
// Puts the current goroutine into a waiting state and calls unlockf.
// If unlockf returns false, the goroutine is resumed.
// unlockf must not access this G's stack, as it may be moved between
@@ -419,16 +435,6 @@ func allgadd(gp *g) {
lock(&allglock)
allgs = append(allgs, gp)
allglen = uintptr(len(allgs))
-
- // Grow GC rescan list if necessary.
- if len(allgs) > cap(work.rescan.list) {
- lock(&work.rescan.lock)
- l := work.rescan.list
- // Let append do the heavy lifting, but keep the
- // length the same.
- work.rescan.list = append(l[:cap(l)], 0)[:len(l)]
- unlock(&work.rescan.lock)
- }
unlock(&allglock)
}
@@ -765,9 +771,8 @@ func casgstatus(gp *g, oldval, newval uint32) {
nextYield = nanotime() + yieldDelay/2
}
}
- if newval == _Grunning && gp.gcscanvalid {
- // Run queueRescan on the system stack so it has more space.
- systemstack(func() { queueRescan(gp) })
+ if newval == _Grunning {
+ gp.gcscanvalid = false
}
}
@@ -779,8 +784,6 @@ func scang(gp *g, gcw *gcWork) {
// Nothing is racing with us now, but gcscandone might be set to true left over
// from an earlier round of stack scanning (we scan twice per GC).
// We use gcscandone to record whether the scan has been done during this round.
- // It is important that the scan happens exactly once: if called twice,
- // the installation of stack barriers will detect the double scan and die.
gp.gcscandone = false
@@ -902,7 +905,7 @@ func restartg(gp *g) {
// in panic or being exited, this may not reliably stop all
// goroutines.
func stopTheWorld(reason string) {
- semacquire(&worldsema, 0)
+ semacquire(&worldsema)
getg().m.preemptoff = reason
systemstack(stopTheWorldWithSema)
}
@@ -1129,10 +1132,10 @@ func mstart1() {
}
asminit()
- minit()
// Install signal handlers; after minit so that minit can
// prepare the thread to be able to handle the signals.
+ // For gccgo minit was called by C code.
if _g_.m == &m0 {
// Create an extra M for callbacks on threads not created by Go.
if iscgo && !cgoHasExtraM {
@@ -1363,6 +1366,7 @@ func needm(x byte) {
// running at all (that is, there's no garbage collection
// running right now).
mp.needextram = mp.schedlink == 0
+ extraMCount--
unlockextra(mp.schedlink.ptr())
// Save and block signals before installing g.
@@ -1376,12 +1380,16 @@ func needm(x byte) {
// Install g (= m->curg).
setg(mp.curg)
- atomic.Store(&mp.curg.atomicstatus, _Gsyscall)
- setGContext()
// Initialize this thread to use the m.
asminit()
minit()
+
+ setGContext()
+
+ // mp.curg is now a real goroutine.
+ casgstatus(mp.curg, _Gdead, _Gsyscall)
+ atomic.Xadd(&sched.ngsys, -1)
}
var earlycgocallback = []byte("fatal error: cgo callback before cgo call\n")
@@ -1414,14 +1422,12 @@ func oneNewExtraM() {
// the goroutine stack ends.
mp, g0SP, g0SPSize := allocm(nil, nil, true)
gp := malg(true, false, nil, nil)
- gp.gcscanvalid = true // fresh G, so no dequeueRescan necessary
+ gp.gcscanvalid = true
gp.gcscandone = true
- gp.gcRescan = -1
-
- // malg returns status as Gidle, change to Gdead before adding to allg
- // where GC will see it.
- // gccgo uses Gdead here, not Gsyscall, because the split
- // stack context is not initialized.
+ // malg returns status as _Gidle. Change to _Gdead before
+ // adding to allg where GC can see it. We use _Gdead to hide
+ // this from tracebacks and stack scans since it isn't a
+ // "real" goroutine until needm grabs it.
casgstatus(gp, _Gidle, _Gdead)
gp.m = mp
mp.curg = gp
@@ -1436,9 +1442,16 @@ func oneNewExtraM() {
// Here we need to set the context for g0.
makeGContext(mp.g0, g0SP, g0SPSize)
+ // gp is now on the allg list, but we don't want it to be
+ // counted by gcount. It would be more "proper" to increment
+ // sched.ngfree, but that requires locking. Incrementing ngsys
+ // has the same effect.
+ atomic.Xadd(&sched.ngsys, +1)
+
// Add m to the extra list.
mnext := lockextra(true)
mp.schedlink.set(mnext)
+ extraMCount++
unlockextra(mp)
}
@@ -1474,6 +1487,10 @@ func dropm() {
// with no pointer manipulation.
mp := getg().m
+ // Return mp.curg to dead state.
+ casgstatus(mp.curg, _Gsyscall, _Gdead)
+ atomic.Xadd(&sched.ngsys, +1)
+
// Block signals before unminit.
// Unminit unregisters the signal handling stack (but needs g on some systems).
// Setg(nil) clears g, which is the signal handler's cue not to run Go handlers.
@@ -1489,6 +1506,7 @@ func dropm() {
mp.curg.gcnextsp = 0
mnext := lockextra(true)
+ extraMCount++
mp.schedlink.set(mnext)
setg(nil)
@@ -1505,6 +1523,7 @@ func getm() uintptr {
}
var extram uintptr
+var extraMCount uint32 // Protected by lockextra
var extraMWaiters uint32
// lockextra locks the extra list and returns the list head.
@@ -1551,6 +1570,10 @@ func unlockextra(mp *m) {
atomic.Storeuintptr(&extram, uintptr(unsafe.Pointer(mp)))
}
+// execLock serializes exec and clone to avoid bugs or unspecified behaviour
+// around exec'ing while creating/destroying threads. See issue #19546.
+var execLock rwmutex
+
// Create a new m. It will start off with a call to fn, or else the scheduler.
// fn needs to be static and not a heap allocated closure.
// May run with m.p==nil, so write barriers are not allowed.
@@ -1559,7 +1582,9 @@ func newm(fn func(), _p_ *p) {
mp, _, _ := allocm(_p_, fn, false)
mp.nextp.set(_p_)
mp.sigmask = initSigmask
+ execLock.rlock() // Prevent process clone.
newosproc(mp)
+ execLock.runlock()
}
// Stops execution of the current m until new work is available.
@@ -1812,7 +1837,7 @@ func execute(gp *g, inheritTime bool) {
// Check whether the profiler needs to be turned on or off.
hz := sched.profilehz
if _g_.m.profilehz != hz {
- resetcpuprofiler(hz)
+ setThreadCPUProfiler(hz)
}
if trace.enabled {
@@ -1850,6 +1875,9 @@ top:
ready(gp, 0, true)
}
}
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
// local runq
if gp, inheritTime := runqget(_p_); gp != nil {
@@ -2007,7 +2035,7 @@ stop:
}
// poll network
- if netpollinited() && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
+ if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
if _g_.m.p != 0 {
throw("findrunnable: netpoll with p")
}
@@ -2048,7 +2076,7 @@ func pollWork() bool {
if !runqempty(p) {
return true
}
- if netpollinited() && sched.lastpoll != 0 {
+ if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll != 0 {
if gp := netpoll(false); gp != nil {
injectglist(gp)
return true
@@ -2211,7 +2239,7 @@ func park_m(gp *g) {
_g_ := getg()
if trace.enabled {
- traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip, gp)
+ traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip)
}
casgstatus(gp, _Grunning, _Gwaiting)
@@ -2256,6 +2284,19 @@ func gosched_m(gp *g) {
goschedImpl(gp)
}
+// goschedguarded_m is a forbidden-states-avoided version of gosched_m.
+func goschedguarded_m(gp *g) {
+
+ if gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" || gp.m.p.ptr().status != _Prunning {
+ gogo(gp) // never return
+ }
+
+ if trace.enabled {
+ traceGoSched()
+ }
+ goschedImpl(gp)
+}
+
func gopreempt_m(gp *g) {
if trace.enabled {
traceGoPreempt()
@@ -2290,10 +2331,11 @@ func goexit0(gp *g) {
gp.writebuf = nil
gp.waitreason = ""
gp.param = nil
+ gp.labels = nil
+ gp.timer = nil
// Note that gp's stack scan is now "valid" because it has no
- // stack. We could dequeueRescan, but that takes a lock and
- // isn't really necessary.
+ // stack.
gp.gcscanvalid = true
dropg()
@@ -2641,12 +2683,12 @@ func syscall_exitsyscall() {
func beforefork() {
gp := getg().m.curg
- // Fork can hang if preempted with signals frequently enough (see issue 5517).
- // Ensure that we stay on the same M where we disable profiling.
+ // Block signals during a fork, so that the child does not run
+ // a signal handler before exec if a signal is sent to the process
+ // group. See issue #18600.
gp.m.locks++
- if gp.m.profilehz != 0 {
- resetcpuprofiler(0)
- }
+ msigsave(gp.m)
+ sigblock()
}
// Called from syscall package before fork.
@@ -2659,10 +2701,8 @@ func syscall_runtime_BeforeFork() {
func afterfork() {
gp := getg().m.curg
- hz := sched.profilehz
- if hz != 0 {
- resetcpuprofiler(hz)
- }
+ msigrestore(gp.m.sigmask)
+
gp.m.locks--
}
@@ -2673,6 +2713,50 @@ func syscall_runtime_AfterFork() {
systemstack(afterfork)
}
+// inForkedChild is true while manipulating signals in the child process.
+// This is used to avoid calling libc functions in case we are using vfork.
+var inForkedChild bool
+
+// Called from syscall package after fork in child.
+// It resets non-sigignored signals to the default handler, and
+// restores the signal mask in preparation for the exec.
+//
+// Because this might be called during a vfork, and therefore may be
+// temporarily sharing address space with the parent process, this must
+// not change any global variables or call into C code that may do so.
+//
+//go:linkname syscall_runtime_AfterForkInChild syscall.runtime_AfterForkInChild
+//go:nosplit
+//go:nowritebarrierrec
+func syscall_runtime_AfterForkInChild() {
+ // It's OK to change the global variable inForkedChild here
+ // because we are going to change it back. There is no race here,
+ // because if we are sharing address space with the parent process,
+ // then the parent process can not be running concurrently.
+ inForkedChild = true
+
+ clearSignalHandlers()
+
+ // When we are the child we are the only thread running,
+ // so we know that nothing else has changed gp.m.sigmask.
+ msigrestore(getg().m.sigmask)
+
+ inForkedChild = false
+}
+
+// Called from syscall package before Exec.
+//go:linkname syscall_runtime_BeforeExec syscall.runtime_BeforeExec
+func syscall_runtime_BeforeExec() {
+ // Prevent thread creation during exec.
+ execLock.lock()
+}
+
+// Called from syscall package after Exec.
+//go:linkname syscall_runtime_AfterExec syscall.runtime_AfterExec
+func syscall_runtime_AfterExec() {
+ execLock.unlock()
+}
+
// Create a new g running fn passing arg as the single argument.
// Put it on the queue of g's waiting to run.
// The compiler turns a go statement into a call to this.
@@ -2695,7 +2779,6 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g {
if newg == nil {
newg = malg(true, false, &sp, &spsize)
casgstatus(newg, _Gidle, _Gdead)
- newg.gcRescan = -1
allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
} else {
resetNewG(newg, &sp, &spsize)
@@ -2717,17 +2800,13 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g {
newg.param = arg
newg.gopc = getcallerpc(unsafe.Pointer(&fn))
newg.startpc = fn
- // The stack is dirty from the argument frame, so queue it for
- // scanning. Do this before setting it to runnable so we still
- // own the G. If we're recycling a G, it may already be on the
- // rescan list.
- if newg.gcRescan == -1 {
- queueRescan(newg)
- } else {
- // The recycled G is already on the rescan list. Just
- // mark the stack dirty.
- newg.gcscanvalid = false
+ if _g_.m.curg != nil {
+ newg.labels = _g_.m.curg.labels
+ }
+ if isSystemGoroutine(newg) {
+ atomic.Xadd(&sched.ngsys, +1)
}
+ newg.gcscanvalid = false
casgstatus(newg, _Gdead, _Grunnable)
if _p_.goidcache == _p_.goidcacheend {
@@ -2926,8 +3005,7 @@ func badunlockosthread() {
func gcount() int32 {
n := int32(allglen) - sched.ngfree - int32(atomic.Load(&sched.ngsys))
- for i := 0; ; i++ {
- _p_ := allp[i]
+ for _, _p_ := range &allp {
if _p_ == nil {
break
}
@@ -2947,13 +3025,18 @@ func mcount() int32 {
}
var prof struct {
- lock uint32
- hz int32
+ signalLock uint32
+ hz int32
}
-func _System() { _System() }
-func _ExternalCode() { _ExternalCode() }
-func _GC() { _GC() }
+func _System() { _System() }
+func _ExternalCode() { _ExternalCode() }
+func _LostExternalCode() { _LostExternalCode() }
+func _GC() { _GC() }
+func _LostSIGPROFDuringAtomic64() { _LostSIGPROFDuringAtomic64() }
+
+// Counts SIGPROFs received while in atomic64 critical section, on mips{,le}
+var lostAtomic64Count uint64
var _SystemPC = funcPC(_System)
var _ExternalCodePC = funcPC(_ExternalCode)
@@ -3009,14 +3092,11 @@ func sigprof(pc uintptr, gp *g, mp *m) {
}
if prof.hz != 0 {
- // Simple cas-lock to coordinate with setcpuprofilerate.
- for !atomic.Cas(&prof.lock, 0, 1) {
- osyield()
+ if (GOARCH == "mips" || GOARCH == "mipsle") && lostAtomic64Count > 0 {
+ cpuprof.addLostAtomic64(lostAtomic64Count)
+ lostAtomic64Count = 0
}
- if prof.hz != 0 {
- cpuprof.add(stk[:n])
- }
- atomic.Store(&prof.lock, 0)
+ cpuprof.add(gp, stk[:n])
}
getg().m.mallocing--
}
@@ -3047,19 +3127,28 @@ func sigprofNonGo(pc uintptr) {
nonprofGoStk[1] = _ExternalCodePC + sys.PCQuantum
}
- // Simple cas-lock to coordinate with setcpuprofilerate.
- for !atomic.Cas(&prof.lock, 0, 1) {
- osyield()
- }
- if prof.hz != 0 {
- cpuprof.addNonGo(nonprofGoStk[:n])
+ cpuprof.addNonGo(nonprofGoStk[:n])
+ }
+}
+
+// sigprofNonGoPC is called when a profiling signal arrived on a
+// non-Go thread and we have a single PC value, not a stack trace.
+// g is nil, and what we can do is very limited.
+//go:nosplit
+//go:nowritebarrierrec
+func sigprofNonGoPC(pc uintptr) {
+ if prof.hz != 0 {
+ stk := []uintptr{
+ pc,
+ funcPC(_ExternalCode) + sys.PCQuantum,
}
- atomic.Store(&prof.lock, 0)
+ cpuprof.addNonGo(stk)
}
}
-// Arrange to call fn with a traceback hz times a second.
-func setcpuprofilerate_m(hz int32) {
+// setcpuprofilerate sets the CPU profiling rate to hz times per second.
+// If hz <= 0, setcpuprofilerate turns off CPU profiling.
+func setcpuprofilerate(hz int32) {
// Force sane arguments.
if hz < 0 {
hz = 0
@@ -3073,20 +3162,23 @@ func setcpuprofilerate_m(hz int32) {
// Stop profiler on this thread so that it is safe to lock prof.
// if a profiling signal came in while we had prof locked,
// it would deadlock.
- resetcpuprofiler(0)
+ setThreadCPUProfiler(0)
- for !atomic.Cas(&prof.lock, 0, 1) {
+ for !atomic.Cas(&prof.signalLock, 0, 1) {
osyield()
}
- prof.hz = hz
- atomic.Store(&prof.lock, 0)
+ if prof.hz != hz {
+ setProcessCPUProfiler(hz)
+ prof.hz = hz
+ }
+ atomic.Store(&prof.signalLock, 0)
lock(&sched.lock)
sched.profilehz = hz
unlock(&sched.lock)
if hz != 0 {
- resetcpuprofiler(hz)
+ setThreadCPUProfiler(hz)
}
_g_.m.locks--
@@ -3424,7 +3516,25 @@ func sysmon() {
if scavengelimit < forcegcperiod {
maxsleep = scavengelimit / 2
}
+ shouldRelax := true
+ if osRelaxMinNS > 0 {
+ lock(&timers.lock)
+ if timers.sleeping {
+ now := nanotime()
+ next := timers.sleepUntil
+ if next-now < osRelaxMinNS {
+ shouldRelax = false
+ }
+ }
+ unlock(&timers.lock)
+ }
+ if shouldRelax {
+ osRelax(true)
+ }
notetsleep(&sched.sysmonnote, maxsleep)
+ if shouldRelax {
+ osRelax(false)
+ }
lock(&sched.lock)
atomic.Store(&sched.sysmonwait, 0)
noteclear(&sched.sysmonnote)
@@ -3433,10 +3543,13 @@ func sysmon() {
}
unlock(&sched.lock)
}
+ // trigger libc interceptors if needed
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
// poll network if not polled for more than 10ms
lastpoll := int64(atomic.Load64(&sched.lastpoll))
now := nanotime()
- unixnow := unixnanotime()
if lastpoll != 0 && lastpoll+10*1000*1000 < now {
atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
gp := netpoll(false) // non-blocking - returns list of goroutines
@@ -3461,8 +3574,7 @@ func sysmon() {
idle++
}
// check if we need to force a GC
- lastgc := int64(atomic.Load64(&memstats.last_gc))
- if gcphase == _GCoff && lastgc != 0 && unixnow-lastgc > forcegcperiod && atomic.Load(&forcegc.idle) != 0 {
+ if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && atomic.Load(&forcegc.idle) != 0 {
lock(&forcegc.lock)
forcegc.idle = 0
forcegc.g.schedlink = 0
@@ -3482,7 +3594,7 @@ func sysmon() {
}
}
-var pdesc [_MaxGomaxprocs]struct {
+type sysmontick struct {
schedtick uint32
schedwhen int64
syscalltick uint32
@@ -3500,7 +3612,7 @@ func retake(now int64) uint32 {
if _p_ == nil {
continue
}
- pd := &pdesc[i]
+ pd := &_p_.sysmontick
s := _p_.status
if s == _Psyscall {
// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
@@ -3905,7 +4017,7 @@ func runqputslow(_p_ *p, gp *g, h, t uint32) bool {
if randomizeScheduler {
for i := uint32(1); i <= n; i++ {
- j := fastrand() % (i + 1)
+ j := fastrandn(i + 1)
batch[i], batch[j] = batch[j], batch[i]
}
}
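
Among the proc.go changes above, execLock has the least local context: newm takes it as a read lock around newosproc, while the new BeforeExec/AfterExec hooks take it as a write lock, so thread creation and exec never overlap. A rough sketch of the same shape using the standard library's sync.RWMutex (the runtime uses its own internal rwmutex, and the function names here are illustrative only):

	package main

	import "sync"

	var execLock sync.RWMutex

	// startThread stands in for newm/newosproc: many threads may be created
	// concurrently, but none may overlap an exec.
	func startThread(fn func()) {
		execLock.RLock()
		go fn()
		execLock.RUnlock()
	}

	// execProgram stands in for the BeforeExec/AfterExec bracket: it excludes
	// all thread creation while the exec is in flight.
	func execProgram(run func()) {
		execLock.Lock()
		defer execLock.Unlock()
		run()
	}

	func main() {
		startThread(func() {})
		execProgram(func() {})
	}
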
diff --git a/libgo/go/runtime/proc_test.go b/libgo/go/runtime/proc_test.go
index 813c929..313a961 100644
--- a/libgo/go/runtime/proc_test.go
+++ b/libgo/go/runtime/proc_test.go
@@ -53,14 +53,14 @@ func TestStopTheWorldDeadlock(t *testing.T) {
}
func TestYieldProgress(t *testing.T) {
- testYieldProgress(t, false)
+ testYieldProgress(false)
}
func TestYieldLockedProgress(t *testing.T) {
- testYieldProgress(t, true)
+ testYieldProgress(true)
}
-func testYieldProgress(t *testing.T, locked bool) {
+func testYieldProgress(locked bool) {
c := make(chan bool)
cack := make(chan bool)
go func() {
@@ -430,10 +430,13 @@ func TestPingPongHog(t *testing.T) {
<-lightChan
// Check that hogCount and lightCount are within a factor of
- // 2, which indicates that both pairs of goroutines handed off
- // the P within a time-slice to their buddy.
- if hogCount > lightCount*2 || lightCount > hogCount*2 {
- t.Fatalf("want hogCount/lightCount in [0.5, 2]; got %d/%d = %g", hogCount, lightCount, float64(hogCount)/float64(lightCount))
+ // 5, which indicates that both pairs of goroutines handed off
+ // the P within a time-slice to their buddy. We can use a
+ // fairly large factor here to make this robust: if the
+ // scheduler isn't working right, the gap should be ~1000X.
+ const factor = 5
+ if hogCount > lightCount*factor || lightCount > hogCount*factor {
+ t.Fatalf("want hogCount/lightCount in [%v, %v]; got %d/%d = %g", 1.0/factor, factor, hogCount, lightCount, float64(hogCount)/float64(lightCount))
}
}
diff --git a/libgo/go/runtime/profbuf.go b/libgo/go/runtime/profbuf.go
new file mode 100644
index 0000000..f40881a
--- /dev/null
+++ b/libgo/go/runtime/profbuf.go
@@ -0,0 +1,561 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "runtime/internal/atomic"
+ "unsafe"
+)
+
+// A profBuf is a lock-free buffer for profiling events,
+// safe for concurrent use by one reader and one writer.
+// The writer may be a signal handler running without a user g.
+// The reader is assumed to be a user g.
+//
+// Each logged event corresponds to a fixed size header, a list of
+// uintptrs (typically a stack), and exactly one unsafe.Pointer tag.
+// The header and uintptrs are stored in the circular buffer data and the
+// tag is stored in a circular buffer tags, running in parallel.
+// In the circular buffer data, each event takes 2+hdrsize+len(stk)
+// words: the value 2+hdrsize+len(stk), then the time of the event, then
+// hdrsize words giving the fixed-size header, and then len(stk) words
+// for the stack.
+//
+// The current effective offsets into the tags and data circular buffers
+// for reading and writing are stored in the high 30 and low 32 bits of r and w.
+// The bottom bits of the high 32 are additional flag bits in w, unused in r.
+// "Effective" offsets means the total number of reads or writes, mod 2^length.
+// The offset in the buffer is the effective offset mod the length of the buffer.
+// To make wraparound mod 2^length match wraparound mod length of the buffer,
+// the length of the buffer must be a power of two.
+//
+// If the reader catches up to the writer, a flag passed to read controls
+// whether the read blocks until more data is available. A read returns a
+// pointer to the buffer data itself; the caller is assumed to be done with
+// that data at the next read. The read offset rNext tracks the next offset to
+// be returned by read. By definition, r ≤ rNext ≤ w (before wraparound),
+// and rNext is only used by the reader, so it can be accessed without atomics.
+//
+// If the writer gets ahead of the reader, so that the buffer fills,
+// future writes are discarded and replaced in the output stream by an
+// overflow entry, which has size 2+hdrsize+1, time set to the time of
+// the first discarded write, a header of all zeroed words, and a "stack"
+// containing one word, the number of discarded writes.
+//
+// Between the time the buffer fills and the buffer becomes empty enough
+// to hold more data, the overflow entry is stored as a pending overflow
+// entry in the fields overflow and overflowTime. The pending overflow
+// entry can be turned into a real record by either the writer or the
+// reader. If the writer is called to write a new record and finds that
+// the output buffer has room for both the pending overflow entry and the
+// new record, the writer emits the pending overflow entry and the new
+// record into the buffer. If the reader is called to read data and finds
+// that the output buffer is empty but that there is a pending overflow
+// entry, the reader will return a synthesized record for the pending
+// overflow entry.
+//
+// Only the writer can create or add to a pending overflow entry, but
+// either the reader or the writer can clear the pending overflow entry.
+// A pending overflow entry is indicated by the low 32 bits of 'overflow'
+// holding the number of discarded writes, and overflowTime holding the
+// time of the first discarded write. The high 32 bits of 'overflow'
+// increment each time the low 32 bits transition from zero to non-zero
+// or vice versa. This sequence number avoids ABA problems in the use of
+// compare-and-swap to coordinate between reader and writer.
+// The overflowTime is only written when the low 32 bits of overflow are
+// zero, that is, only when there is no pending overflow entry, in
+// preparation for creating a new one. The reader can therefore fetch and
+// clear the entry atomically using
+//
+// for {
+// overflow = load(&b.overflow)
+// if uint32(overflow) == 0 {
+// // no pending entry
+// break
+// }
+// time = load(&b.overflowTime)
+// if cas(&b.overflow, overflow, ((overflow>>32)+1)<<32) {
+// // pending entry cleared
+// break
+// }
+// }
+// if uint32(overflow) > 0 {
+// emit entry for uint32(overflow), time
+// }
+//
+type profBuf struct {
+ // accessed atomically
+ r, w profAtomic
+ overflow uint64
+ overflowTime uint64
+ eof uint32
+
+ // immutable (excluding slice content)
+ hdrsize uintptr
+ data []uint64
+ tags []unsafe.Pointer
+
+ // owned by reader
+ rNext profIndex
+ overflowBuf []uint64 // for use by reader to return overflow record
+ wait note
+}
+
+// A profAtomic is the atomically-accessed word holding a profIndex.
+type profAtomic uint64
+
+// A profIndex is the packet tag and data counts and flags bits, described above.
+type profIndex uint64
+
+const (
+ profReaderSleeping profIndex = 1 << 32 // reader is sleeping and must be woken up
+ profWriteExtra profIndex = 1 << 33 // overflow or eof waiting
+)
+
+func (x *profAtomic) load() profIndex {
+ return profIndex(atomic.Load64((*uint64)(x)))
+}
+
+func (x *profAtomic) store(new profIndex) {
+ atomic.Store64((*uint64)(x), uint64(new))
+}
+
+func (x *profAtomic) cas(old, new profIndex) bool {
+ return atomic.Cas64((*uint64)(x), uint64(old), uint64(new))
+}
+
+func (x profIndex) dataCount() uint32 {
+ return uint32(x)
+}
+
+func (x profIndex) tagCount() uint32 {
+ return uint32(x >> 34)
+}
+
+// countSub subtracts two counts obtained from profIndex.dataCount or profIndex.tagCount,
+// assuming that they are no more than 2^29 apart (guaranteed since they are never more than
+// len(data) or len(tags) apart, respectively).
+// tagCount wraps at 2^30, while dataCount wraps at 2^32.
+// This function works for both.
+func countSub(x, y uint32) int {
+ // x-y is 32-bit signed or 30-bit signed; sign-extend to 32 bits and convert to int.
+ return int(int32(x-y) << 2 >> 2)
+}
+
+// addCountsAndClearFlags returns the packed form of "x + (data, tag) - all flags".
+func (x profIndex) addCountsAndClearFlags(data, tag int) profIndex {
+ return profIndex((uint64(x)>>34+uint64(uint32(tag)<<2>>2))<<34 | uint64(uint32(x)+uint32(data)))
+}
+
+// hasOverflow reports whether b has any overflow records pending.
+func (b *profBuf) hasOverflow() bool {
+ return uint32(atomic.Load64(&b.overflow)) > 0
+}
+
+// takeOverflow consumes the pending overflow records, returning the overflow count
+// and the time of the first overflow.
+// When called by the reader, it is racing against incrementOverflow.
+func (b *profBuf) takeOverflow() (count uint32, time uint64) {
+ overflow := atomic.Load64(&b.overflow)
+ time = atomic.Load64(&b.overflowTime)
+ for {
+ count = uint32(overflow)
+ if count == 0 {
+ time = 0
+ break
+ }
+ // Increment generation, clear overflow count in low bits.
+ if atomic.Cas64(&b.overflow, overflow, ((overflow>>32)+1)<<32) {
+ break
+ }
+ overflow = atomic.Load64(&b.overflow)
+ time = atomic.Load64(&b.overflowTime)
+ }
+ return uint32(overflow), time
+}
+
+// incrementOverflow records a single overflow at time now.
+// It is racing against a possible takeOverflow in the reader.
+func (b *profBuf) incrementOverflow(now int64) {
+ for {
+ overflow := atomic.Load64(&b.overflow)
+
+ // Once we see b.overflow reach 0, it's stable: no one else is changing it underfoot.
+ // We need to set overflowTime if we're incrementing b.overflow from 0.
+ if uint32(overflow) == 0 {
+ // Store overflowTime first so it's always available when overflow != 0.
+ atomic.Store64(&b.overflowTime, uint64(now))
+ atomic.Store64(&b.overflow, (((overflow>>32)+1)<<32)+1)
+ break
+ }
+ // Otherwise we're racing to increment against reader
+ // who wants to set b.overflow to 0.
+ // Out of paranoia, leave 2³²-1 a sticky overflow value,
+ // to avoid wrapping around. Extremely unlikely.
+ if int32(overflow) == -1 {
+ break
+ }
+ if atomic.Cas64(&b.overflow, overflow, overflow+1) {
+ break
+ }
+ }
+}
+
+// newProfBuf returns a new profiling buffer with room for
+// a header of hdrsize words and a buffer of at least bufwords words.
+func newProfBuf(hdrsize, bufwords, tags int) *profBuf {
+ if min := 2 + hdrsize + 1; bufwords < min {
+ bufwords = min
+ }
+
+ // Buffer sizes must be power of two, so that we don't have to
+ // worry about uint32 wraparound changing the effective position
+ // within the buffers. We store 30 bits of count; limiting to 28
+ // gives us some room for intermediate calculations.
+ if bufwords >= 1<<28 || tags >= 1<<28 {
+ throw("newProfBuf: buffer too large")
+ }
+ var i int
+ for i = 1; i < bufwords; i <<= 1 {
+ }
+ bufwords = i
+ for i = 1; i < tags; i <<= 1 {
+ }
+ tags = i
+
+ b := new(profBuf)
+ b.hdrsize = uintptr(hdrsize)
+ b.data = make([]uint64, bufwords)
+ b.tags = make([]unsafe.Pointer, tags)
+ b.overflowBuf = make([]uint64, 2+b.hdrsize+1)
+ return b
+}
+
+// canWriteRecord reports whether the buffer has room
+// for a single contiguous record with a stack of length nstk.
+func (b *profBuf) canWriteRecord(nstk int) bool {
+ br := b.r.load()
+ bw := b.w.load()
+
+ // room for tag?
+ if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 1 {
+ return false
+ }
+
+ // room for data?
+ nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
+ want := 2 + int(b.hdrsize) + nstk
+ i := int(bw.dataCount() % uint32(len(b.data)))
+ if i+want > len(b.data) {
+ // Can't fit in trailing fragment of slice.
+ // Skip over that and start over at beginning of slice.
+ nd -= len(b.data) - i
+ }
+ return nd >= want
+}
+
+// canWriteTwoRecords reports whether the buffer has room
+// for two records with stack lengths nstk1, nstk2, in that order.
+// Each record must be contiguous on its own, but the two
+// records need not be contiguous (one can be at the end of the buffer
+// and the other can wrap around and start at the beginning of the buffer).
+func (b *profBuf) canWriteTwoRecords(nstk1, nstk2 int) bool {
+ br := b.r.load()
+ bw := b.w.load()
+
+ // room for tag?
+ if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 2 {
+ return false
+ }
+
+ // room for data?
+ nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
+
+ // first record
+ want := 2 + int(b.hdrsize) + nstk1
+ i := int(bw.dataCount() % uint32(len(b.data)))
+ if i+want > len(b.data) {
+ // Can't fit in trailing fragment of slice.
+ // Skip over that and start over at beginning of slice.
+ nd -= len(b.data) - i
+ i = 0
+ }
+ i += want
+ nd -= want
+
+ // second record
+ want = 2 + int(b.hdrsize) + nstk2
+ if i+want > len(b.data) {
+ // Can't fit in trailing fragment of slice.
+ // Skip over that and start over at beginning of slice.
+ nd -= len(b.data) - i
+ i = 0
+ }
+ return nd >= want
+}
+
+// write writes an entry to the profiling buffer b.
+// The entry begins with a fixed hdr, which must have
+// length b.hdrsize, followed by a variable-sized stack
+// and a single tag pointer *tagPtr (or nil if tagPtr is nil).
+// No write barriers allowed because this might be called from a signal handler.
+func (b *profBuf) write(tagPtr *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
+ if b == nil {
+ return
+ }
+ if len(hdr) > int(b.hdrsize) {
+ throw("misuse of profBuf.write")
+ }
+
+ if hasOverflow := b.hasOverflow(); hasOverflow && b.canWriteTwoRecords(1, len(stk)) {
+ // Room for both an overflow record and the one being written.
+ // Write the overflow record if the reader hasn't gotten to it yet.
+ // Only racing against reader, not other writers.
+ count, time := b.takeOverflow()
+ if count > 0 {
+ var stk [1]uintptr
+ stk[0] = uintptr(count)
+ b.write(nil, int64(time), nil, stk[:])
+ }
+ } else if hasOverflow || !b.canWriteRecord(len(stk)) {
+ // Pending overflow without room to write overflow and new records
+ // or no overflow but also no room for new record.
+ b.incrementOverflow(now)
+ b.wakeupExtra()
+ return
+ }
+
+ // There's room: write the record.
+ br := b.r.load()
+ bw := b.w.load()
+
+ // Profiling tag
+ //
+ // The tag is a pointer, but we can't run a write barrier here.
+ // We have interrupted the OS-level execution of gp, but the
+ // runtime still sees gp as executing. In effect, we are running
+ // in place of the real gp. Since gp is the only goroutine that
+ // can overwrite gp.labels, the value of gp.labels is stable during
+ // this signal handler: it will still be reachable from gp when
+ // we finish executing. If a GC is in progress right now, it must
+ // keep gp.labels alive, because gp.labels is reachable from gp.
+ // If gp were to overwrite gp.labels, the deletion barrier would
+ // still shade that pointer, which would preserve it for the
+ // in-progress GC, so all is well. Any future GC will see the
+ // value we copied when scanning b.tags (heap-allocated).
+ // We arrange that the store here is always overwriting a nil,
+ // so there is no need for a deletion barrier on b.tags[wt].
+ wt := int(bw.tagCount() % uint32(len(b.tags)))
+ if tagPtr != nil {
+ *(*uintptr)(unsafe.Pointer(&b.tags[wt])) = uintptr(unsafe.Pointer(*tagPtr))
+ }
+
+ // Main record.
+ // It has to fit in a contiguous section of the slice, so if it doesn't fit at the end,
+ // leave a rewind marker (0) and start over at the beginning of the slice.
+ wd := int(bw.dataCount() % uint32(len(b.data)))
+ nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
+ skip := 0
+ if wd+2+int(b.hdrsize)+len(stk) > len(b.data) {
+ b.data[wd] = 0
+ skip = len(b.data) - wd
+ nd -= skip
+ wd = 0
+ }
+ data := b.data[wd:]
+ data[0] = uint64(2 + b.hdrsize + uintptr(len(stk))) // length
+ data[1] = uint64(now) // time stamp
+ // header, zero-padded
+ i := uintptr(copy(data[2:2+b.hdrsize], hdr))
+ for ; i < b.hdrsize; i++ {
+ data[2+i] = 0
+ }
+ for i, pc := range stk {
+ data[2+b.hdrsize+uintptr(i)] = uint64(pc)
+ }
+
+ for {
+ // Commit write.
+ // Racing with reader setting flag bits in b.w, to avoid lost wakeups.
+ old := b.w.load()
+ new := old.addCountsAndClearFlags(skip+2+len(stk)+int(b.hdrsize), 1)
+ if !b.w.cas(old, new) {
+ continue
+ }
+ // If there was a reader, wake it up.
+ if old&profReaderSleeping != 0 {
+ notewakeup(&b.wait)
+ }
+ break
+ }
+}
+
+// close signals that there will be no more writes on the buffer.
+// Once all the data has been read from the buffer, reads will return eof=true.
+func (b *profBuf) close() {
+ if atomic.Load(&b.eof) > 0 {
+ throw("runtime: profBuf already closed")
+ }
+ atomic.Store(&b.eof, 1)
+ b.wakeupExtra()
+}
+
+// wakeupExtra must be called after setting one of the "extra"
+// atomic fields b.overflow or b.eof.
+// It records the change in b.w and wakes up the reader if needed.
+func (b *profBuf) wakeupExtra() {
+ for {
+ old := b.w.load()
+ new := old | profWriteExtra
+ if !b.w.cas(old, new) {
+ continue
+ }
+ if old&profReaderSleeping != 0 {
+ notewakeup(&b.wait)
+ }
+ break
+ }
+}
+
+// profBufReadMode specifies whether to block when no data is available to read.
+type profBufReadMode int
+
+const (
+ profBufBlocking profBufReadMode = iota
+ profBufNonBlocking
+)
+
+var overflowTag [1]unsafe.Pointer // always nil
+
+func (b *profBuf) read(mode profBufReadMode) (data []uint64, tags []unsafe.Pointer, eof bool) {
+ if b == nil {
+ return nil, nil, true
+ }
+
+ br := b.rNext
+
+ // Commit previous read, returning that part of the ring to the writer.
+ // First clear tags that have now been read, both to avoid holding
+ // up the memory they point at for longer than necessary
+ // and so that b.write can assume it is always overwriting
+ // nil tag entries (see comment in b.write).
+ rPrev := b.r.load()
+ if rPrev != br {
+ ntag := countSub(br.tagCount(), rPrev.tagCount())
+ ti := int(rPrev.tagCount() % uint32(len(b.tags)))
+ for i := 0; i < ntag; i++ {
+ b.tags[ti] = nil
+ if ti++; ti == len(b.tags) {
+ ti = 0
+ }
+ }
+ b.r.store(br)
+ }
+
+Read:
+ bw := b.w.load()
+ numData := countSub(bw.dataCount(), br.dataCount())
+ if numData == 0 {
+ if b.hasOverflow() {
+ // No data to read, but there is overflow to report.
+ // Racing with writer flushing b.overflow into a real record.
+ count, time := b.takeOverflow()
+ if count == 0 {
+ // Lost the race, go around again.
+ goto Read
+ }
+ // Won the race, report overflow.
+ dst := b.overflowBuf
+ dst[0] = uint64(2 + b.hdrsize + 1)
+ dst[1] = uint64(time)
+ for i := uintptr(0); i < b.hdrsize; i++ {
+ dst[2+i] = 0
+ }
+ dst[2+b.hdrsize] = uint64(count)
+ return dst[:2+b.hdrsize+1], overflowTag[:1], false
+ }
+ if atomic.Load(&b.eof) > 0 {
+ // No data, no overflow, EOF set: done.
+ return nil, nil, true
+ }
+ if bw&profWriteExtra != 0 {
+ // Writer claims to have published extra information (overflow or eof).
+ // Attempt to clear notification and then check again.
+ // If we fail to clear the notification it means b.w changed,
+ // so we still need to check again.
+ b.w.cas(bw, bw&^profWriteExtra)
+ goto Read
+ }
+
+ // Nothing to read right now.
+ // Return or sleep according to mode.
+ if mode == profBufNonBlocking {
+ return nil, nil, false
+ }
+ if !b.w.cas(bw, bw|profReaderSleeping) {
+ goto Read
+ }
+ // Committed to sleeping.
+ notetsleepg(&b.wait, -1)
+ noteclear(&b.wait)
+ goto Read
+ }
+ data = b.data[br.dataCount()%uint32(len(b.data)):]
+ if len(data) > numData {
+ data = data[:numData]
+ } else {
+ numData -= len(data) // available in case of wraparound
+ }
+ skip := 0
+ if data[0] == 0 {
+ // Wraparound record. Go back to the beginning of the ring.
+ skip = len(data)
+ data = b.data
+ if len(data) > numData {
+ data = data[:numData]
+ }
+ }
+
+ ntag := countSub(bw.tagCount(), br.tagCount())
+ if ntag == 0 {
+ throw("runtime: malformed profBuf buffer - tag and data out of sync")
+ }
+ tags = b.tags[br.tagCount()%uint32(len(b.tags)):]
+ if len(tags) > ntag {
+ tags = tags[:ntag]
+ }
+
+ // Count out whole data records until either data or tags is done.
+ // They are always in sync in the buffer, but due to an end-of-slice
+ // wraparound we might need to stop early and return the rest
+ // in the next call.
+ di := 0
+ ti := 0
+ for di < len(data) && data[di] != 0 && ti < len(tags) {
+ if uintptr(di)+uintptr(data[di]) > uintptr(len(data)) {
+ throw("runtime: malformed profBuf buffer - invalid size")
+ }
+ di += int(data[di])
+ ti++
+ }
+
+ // Remember how much we returned, to commit read on next call.
+ b.rNext = br.addCountsAndClearFlags(skip+di, ti)
+
+ if raceenabled {
+ // Match racereleasemerge in runtime_setProfLabel,
+ // so that the setting of the labels in runtime_setProfLabel
+ // is treated as happening before any use of the labels
+ // by our caller. The synchronization on labelSync itself is a fiction
+ // for the race detector. The actual synchronization is handled
+ // by the fact that the signal handler only reads from the current
+ // goroutine and uses atomics to write the updated queue indices,
+ // and then the read-out from the signal handler buffer uses
+ // atomics to read those queue indices.
+ raceacquire(unsafe.Pointer(&labelSync))
+ }
+
+ return data[:di], tags[:ti], false
+}
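
The shift-left/shift-right pair in countSub above sign-extends a 30-bit difference, so a tag count that has wrapped past 2^30 still compares correctly against one that has not. A self-contained check of that arithmetic (the counts here are made up):

	package main

	import "fmt"

	func countSub(x, y uint32) int {
		return int(int32(x-y) << 2 >> 2)
	}

	func main() {
		// The writer's effective tag count has wrapped past 2^30 to 3;
		// the reader is still two short of the wrap point.
		w := uint32(3)
		r := uint32(1<<30 - 2)
		fmt.Println(countSub(w, r)) // 5: the true distance despite the wraparound
	}
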
diff --git a/libgo/go/runtime/profbuf_test.go b/libgo/go/runtime/profbuf_test.go
new file mode 100644
index 0000000..d9c5264
--- /dev/null
+++ b/libgo/go/runtime/profbuf_test.go
@@ -0,0 +1,182 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ "reflect"
+ . "runtime"
+ "testing"
+ "time"
+ "unsafe"
+)
+
+func TestProfBuf(t *testing.T) {
+ const hdrSize = 2
+
+ write := func(t *testing.T, b *ProfBuf, tag unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
+ b.Write(&tag, now, hdr, stk)
+ }
+ read := func(t *testing.T, b *ProfBuf, data []uint64, tags []unsafe.Pointer) {
+ rdata, rtags, eof := b.Read(ProfBufNonBlocking)
+ if !reflect.DeepEqual(rdata, data) || !reflect.DeepEqual(rtags, tags) {
+ t.Fatalf("unexpected profile read:\nhave data %#x\nwant data %#x\nhave tags %#x\nwant tags %#x", rdata, data, rtags, tags)
+ }
+ if eof {
+ t.Fatalf("unexpected eof")
+ }
+ }
+ readBlock := func(t *testing.T, b *ProfBuf, data []uint64, tags []unsafe.Pointer) func() {
+ c := make(chan int)
+ go func() {
+ eof := data == nil
+ rdata, rtags, reof := b.Read(ProfBufBlocking)
+ if !reflect.DeepEqual(rdata, data) || !reflect.DeepEqual(rtags, tags) || reof != eof {
+ // Errorf, not Fatalf, because called in goroutine.
+ t.Errorf("unexpected profile read:\nhave data %#x\nwant data %#x\nhave tags %#x\nwant tags %#x\nhave eof=%v, want %v", rdata, data, rtags, tags, reof, eof)
+ }
+ c <- 1
+ }()
+ time.Sleep(10 * time.Millisecond) // let goroutine run and block
+ return func() {
+ select {
+ case <-c:
+ case <-time.After(1 * time.Second):
+ t.Fatalf("timeout waiting for blocked read")
+ }
+ }
+ }
+ readEOF := func(t *testing.T, b *ProfBuf) {
+ rdata, rtags, eof := b.Read(ProfBufBlocking)
+ if rdata != nil || rtags != nil || !eof {
+ t.Errorf("unexpected profile read: %#x, %#x, eof=%v; want nil, nil, eof=true", rdata, rtags, eof)
+ }
+ rdata, rtags, eof = b.Read(ProfBufNonBlocking)
+ if rdata != nil || rtags != nil || !eof {
+ t.Errorf("unexpected profile read (non-blocking): %#x, %#x, eof=%v; want nil, nil, eof=true", rdata, rtags, eof)
+ }
+ }
+
+ myTags := make([]byte, 100)
+ t.Logf("myTags is %p", &myTags[0])
+
+ t.Run("BasicWriteRead", func(t *testing.T) {
+ b := NewProfBuf(2, 11, 1)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, nil, nil) // release data returned by previous read
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ read(t, b, []uint64{8, 99, 101, 102, 201, 202, 203, 204}, []unsafe.Pointer{unsafe.Pointer(&myTags[2])})
+ })
+
+ t.Run("ReadMany", func(t *testing.T) {
+ b := NewProfBuf(2, 50, 50)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 504}, []uintptr{506})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 99, 101, 102, 201, 202, 203, 204, 5, 500, 502, 504, 506}, []unsafe.Pointer{unsafe.Pointer(&myTags[0]), unsafe.Pointer(&myTags[2]), unsafe.Pointer(&myTags[1])})
+ })
+
+ t.Run("ReadManyShortData", func(t *testing.T) {
+ b := NewProfBuf(2, 50, 50)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 99, 101, 102, 201, 202, 203, 204}, []unsafe.Pointer{unsafe.Pointer(&myTags[0]), unsafe.Pointer(&myTags[2])})
+ })
+
+ t.Run("ReadManyShortTags", func(t *testing.T) {
+ b := NewProfBuf(2, 50, 50)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 99, 101, 102, 201, 202, 203, 204}, []unsafe.Pointer{unsafe.Pointer(&myTags[0]), unsafe.Pointer(&myTags[2])})
+ })
+
+ t.Run("ReadAfterOverflow1", func(t *testing.T) {
+ // overflow record synthesized by write
+ b := NewProfBuf(2, 16, 5)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9}) // uses 10
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])}) // reads 10 but still in use until next read
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5}) // uses 6
+ read(t, b, []uint64{6, 1, 2, 3, 4, 5}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])}) // reads 6 but still in use until next read
+ // now 10 available
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204, 205, 206, 207, 208, 209}) // no room
+ for i := 0; i < 299; i++ {
+ write(t, b, unsafe.Pointer(&myTags[3]), int64(100+i), []uint64{101, 102}, []uintptr{201, 202, 203, 204}) // no room for overflow+this record
+ }
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 504}, []uintptr{506}) // room for overflow+this record
+ read(t, b, []uint64{5, 99, 0, 0, 300, 5, 500, 502, 504, 506}, []unsafe.Pointer{nil, unsafe.Pointer(&myTags[1])})
+ })
+
+ t.Run("ReadAfterOverflow2", func(t *testing.T) {
+ // overflow record synthesized by read
+ b := NewProfBuf(2, 16, 5)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213})
+ for i := 0; i < 299; i++ {
+ write(t, b, unsafe.Pointer(&myTags[3]), 100, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ }
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])}) // reads 10 but still in use until next read
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 504}, []uintptr{}) // still overflow
+ read(t, b, []uint64{5, 99, 0, 0, 301}, []unsafe.Pointer{nil}) // overflow synthesized by read
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 505}, []uintptr{506}) // written
+ read(t, b, []uint64{5, 500, 502, 505, 506}, []unsafe.Pointer{unsafe.Pointer(&myTags[1])})
+ })
+
+ t.Run("ReadAtEndAfterOverflow", func(t *testing.T) {
+ b := NewProfBuf(2, 12, 5)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ for i := 0; i < 299; i++ {
+ write(t, b, unsafe.Pointer(&myTags[3]), 100, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ }
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, []uint64{5, 99, 0, 0, 300}, []unsafe.Pointer{nil})
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 504}, []uintptr{506})
+ read(t, b, []uint64{5, 500, 502, 504, 506}, []unsafe.Pointer{unsafe.Pointer(&myTags[1])})
+ })
+
+ t.Run("BlockingWriteRead", func(t *testing.T) {
+ b := NewProfBuf(2, 11, 1)
+ wait := readBlock(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ wait()
+ wait = readBlock(t, b, []uint64{8, 99, 101, 102, 201, 202, 203, 204}, []unsafe.Pointer{unsafe.Pointer(&myTags[2])})
+ time.Sleep(10 * time.Millisecond)
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ wait()
+ wait = readBlock(t, b, nil, nil)
+ b.Close()
+ wait()
+ wait = readBlock(t, b, nil, nil)
+ wait()
+ readEOF(t, b)
+ })
+
+ t.Run("DataWraparound", func(t *testing.T) {
+ b := NewProfBuf(2, 16, 1024)
+ for i := 0; i < 10; i++ {
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, nil, nil) // release data returned by previous read
+ }
+ })
+
+ t.Run("TagWraparound", func(t *testing.T) {
+ b := NewProfBuf(2, 1024, 2)
+ for i := 0; i < 10; i++ {
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, nil, nil) // release data returned by previous read
+ }
+ })
+
+ t.Run("BothWraparound", func(t *testing.T) {
+ b := NewProfBuf(2, 16, 2)
+ for i := 0; i < 10; i++ {
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, nil, nil) // release data returned by previous read
+ }
+ })
+}
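
The flat []uint64 slices the tests compare against follow the record layout documented at the top of profbuf.go: length, time, hdrsize header words, then the stack. A hedged decoding sketch for that layout (hdrsize is 2 in these tests; the function is illustrative, not runtime code):

	package main

	import "fmt"

	// decode walks records of the form [length, time, hdr..., stack...],
	// where length counts every word of the record, including itself.
	// Wraparound markers (a length word of 0) are not handled in this sketch.
	func decode(data []uint64, hdrsize int) {
		for i := 0; i < len(data); {
			n := int(data[i])
			t := data[i+1]
			hdr := data[i+2 : i+2+hdrsize]
			stk := data[i+2+hdrsize : i+n]
			fmt.Println(t, hdr, stk)
			i += n
		}
	}

	func main() {
		decode([]uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, 2) // prints: 1 [2 3] [4 5 6 7 8 9]
	}
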
diff --git a/libgo/go/runtime/proflabel.go b/libgo/go/runtime/proflabel.go
new file mode 100644
index 0000000..ff73fe4
--- /dev/null
+++ b/libgo/go/runtime/proflabel.go
@@ -0,0 +1,40 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+var labelSync uintptr
+
+//go:linkname runtime_setProfLabel runtime_pprof.runtime_setProfLabel
+func runtime_setProfLabel(labels unsafe.Pointer) {
+ // Introduce race edge for read-back via profile.
+ // This would more properly use &getg().labels as the sync address,
+ // but we do the read in a signal handler and can't call the race runtime then.
+ //
+ // This uses racereleasemerge rather than just racerelease so
+ // the acquire in profBuf.read synchronizes with *all* prior
+ // setProfLabel operations, not just the most recent one. This
+ // is important because profBuf.read will observe different
+ // labels set by different setProfLabel operations on
+ // different goroutines, so it needs to synchronize with all
+ // of them (this wouldn't be an issue if we could synchronize
+ // on &getg().labels since we would synchronize with each
+ // most-recent labels write separately.)
+ //
+ // racereleasemerge is like a full read-modify-write on
+ // labelSync, rather than just a store-release, so it carries
+ // a dependency on the previous racereleasemerge, which
+ // ultimately carries forward to the acquire in profBuf.read.
+ if raceenabled {
+ racereleasemerge(unsafe.Pointer(&labelSync))
+ }
+ getg().labels = labels
+}
+
+//go:linkname runtime_getProfLabel runtime_pprof.runtime_getProfLabel
+func runtime_getProfLabel() unsafe.Pointer {
+ return getg().labels
+}
diff --git a/libgo/go/runtime/rand_test.go b/libgo/go/runtime/rand_test.go
new file mode 100644
index 0000000..f8831b0
--- /dev/null
+++ b/libgo/go/runtime/rand_test.go
@@ -0,0 +1,45 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ . "runtime"
+ "strconv"
+ "testing"
+)
+
+func BenchmarkFastrand(b *testing.B) {
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ Fastrand()
+ }
+ })
+}
+
+func BenchmarkFastrandHashiter(b *testing.B) {
+ var m = make(map[int]int, 10)
+ for i := 0; i < 10; i++ {
+ m[i] = i
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ for _ = range m {
+ break
+ }
+ }
+ })
+}
+
+var sink32 uint32
+
+func BenchmarkFastrandn(b *testing.B) {
+ for n := uint32(2); n <= 5; n++ {
+ b.Run(strconv.Itoa(int(n)), func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ sink32 = Fastrandn(n)
+ }
+ })
+ }
+}
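
BenchmarkFastrandn above covers the new fastrandn that the scheduler's runqputslow shuffle now uses in place of fastrand() % n. Assuming it relies on the common multiply-shift range reduction (a widening multiply whose high 32 bits land in [0, n)), a sketch of that technique looks like the following; this is not the runtime's code:

	package main

	import (
		"fmt"
		"math/rand"
	)

	// reduce maps a uniform 32-bit value into [0, n) without a modulo:
	// the high 32 bits of the 64-bit product r*n fall in [0, n).
	func reduce(r, n uint32) uint32 {
		return uint32(uint64(r) * uint64(n) >> 32)
	}

	func main() {
		for i := 0; i < 5; i++ {
			fmt.Println(reduce(rand.Uint32(), 10)) // values in [0, 10)
		}
	}
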
diff --git a/libgo/go/runtime/relax_stub.go b/libgo/go/runtime/relax_stub.go
new file mode 100644
index 0000000..81ed129
--- /dev/null
+++ b/libgo/go/runtime/relax_stub.go
@@ -0,0 +1,17 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !windows
+
+package runtime
+
+// osRelaxMinNS is the number of nanoseconds of idleness to tolerate
+// without performing an osRelax. Since osRelax may reduce the
+// precision of timers, this should be sufficiently larger than the relaxed
+// timer precision to keep the timer error acceptable.
+const osRelaxMinNS = 0
+
+// osRelax is called by the scheduler when transitioning to and from
+// all Ps being idle.
+func osRelax(relax bool) {}
diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go
index dd3f7b2..627adf7 100644
--- a/libgo/go/runtime/runtime1.go
+++ b/libgo/go/runtime/runtime1.go
@@ -47,15 +47,14 @@ var traceback_env uint32
//go:nosplit
func gotraceback() (level int32, all, crash bool) {
_g_ := getg()
- all = _g_.m.throwing > 0
+ t := atomic.Load(&traceback_cache)
+ crash = t&tracebackCrash != 0
+ all = _g_.m.throwing > 0 || t&tracebackAll != 0
if _g_.m.traceback != 0 {
level = int32(_g_.m.traceback)
- return
+ } else {
+ level = int32(t >> tracebackShift)
}
- t := atomic.Load(&traceback_cache)
- crash = t&tracebackCrash != 0
- all = all || t&tracebackAll != 0
- level = int32(t >> tracebackShift)
return
}
@@ -330,35 +329,23 @@ type dbgVar struct {
// except for "memprofilerate" since there is an
// existing int var for that value, which may
// already have an initial value.
-
-// For gccgo we use a named type so that the C code can see the
-// definition.
-type debugVars struct {
- allocfreetrace int32
- cgocheck int32
- efence int32
- gccheckmark int32
- gcpacertrace int32
- gcshrinkstackoff int32
- gcstackbarrieroff int32
- gcstackbarrierall int32
- gcrescanstacks int32
- gcstoptheworld int32
- gctrace int32
- invalidptr int32
- sbrk int32
- scavenge int32
- scheddetail int32
- schedtrace int32
- wbshadow int32
+var debug struct {
+ allocfreetrace int32
+ cgocheck int32
+ efence int32
+ gccheckmark int32
+ gcpacertrace int32
+ gcshrinkstackoff int32
+ gcrescanstacks int32
+ gcstoptheworld int32
+ gctrace int32
+ invalidptr int32
+ sbrk int32
+ scavenge int32
+ scheddetail int32
+ schedtrace int32
}
-var debug debugVars
-
-// For gccgo's C code.
-//extern runtime_setdebug
-func runtime_setdebug(*debugVars)
-
var dbgvars = []dbgVar{
{"allocfreetrace", &debug.allocfreetrace},
{"cgocheck", &debug.cgocheck},
@@ -366,8 +353,6 @@ var dbgvars = []dbgVar{
{"gccheckmark", &debug.gccheckmark},
{"gcpacertrace", &debug.gcpacertrace},
{"gcshrinkstackoff", &debug.gcshrinkstackoff},
- {"gcstackbarrieroff", &debug.gcstackbarrieroff},
- {"gcstackbarrierall", &debug.gcstackbarrierall},
{"gcrescanstacks", &debug.gcrescanstacks},
{"gcstoptheworld", &debug.gcstoptheworld},
{"gctrace", &debug.gctrace},
@@ -376,7 +361,6 @@ var dbgvars = []dbgVar{
{"scavenge", &debug.scavenge},
{"scheddetail", &debug.scheddetail},
{"schedtrace", &debug.schedtrace},
- {"wbshadow", &debug.wbshadow},
}
func parsedebugvars() {
@@ -430,26 +414,12 @@ func parsedebugvars() {
setTraceback(gogetenv("GOTRACEBACK"))
traceback_env = traceback_cache
- if debug.gcrescanstacks == 0 {
- // Without rescanning, there's no need for stack
- // barriers.
- debug.gcstackbarrieroff = 1
- debug.gcstackbarrierall = 0
- }
-
- // if debug.gcstackbarrierall > 0 {
- // firstStackBarrierOffset = 0
- // }
-
// For cgocheck > 1, we turn on the write barrier at all times
// and check all pointer writes.
if debug.cgocheck > 1 {
writeBarrier.cgo = true
writeBarrier.enabled = true
}
-
- // Tell the C code what the value is.
- runtime_setdebug(&debug)
}
//go:linkname setTraceback runtime_debug.SetTraceback
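
parsedebugvars (partly shown above) matches comma-separated name=value pairs from GODEBUG against the dbgvars table and stores each parsed value through the registered *int32. A simplified, self-contained sketch of that lookup loop, using the standard library in place of the runtime's internal string and atoi helpers:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// dbgVar mirrors the runtime's table entry: a name plus a pointer to the
// int32 that receives the parsed value.
type dbgVar struct {
	name  string
	value *int32
}

var debug struct {
	gctrace    int32
	schedtrace int32
}

var dbgvars = []dbgVar{
	{"gctrace", &debug.gctrace},
	{"schedtrace", &debug.schedtrace},
}

// parseDebugVars sketches parsedebugvars' main loop over name=value pairs.
func parseDebugVars(godebug string) {
	for _, field := range strings.Split(godebug, ",") {
		kv := strings.SplitN(field, "=", 2)
		if len(kv) != 2 {
			continue
		}
		for _, v := range dbgvars {
			if v.name == kv[0] {
				if n, err := strconv.Atoi(kv[1]); err == nil {
					*v.value = int32(n)
				}
			}
		}
	}
}

func main() {
	parseDebugVars("gctrace=1,schedtrace=1000")
	fmt.Println(debug.gctrace, debug.schedtrace) // 1 1000
}
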
diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go
index cdd3fcc..045e76f 100644
--- a/libgo/go/runtime/runtime2.go
+++ b/libgo/go/runtime/runtime2.go
@@ -254,7 +254,7 @@ func setMNoWB(mp **m, new *m) {
type sudog struct {
// The following fields are protected by the hchan.lock of the
// channel this sudog is blocking on. shrinkstack depends on
- // this.
+ // this for sudogs involved in channel ops.
g *g
selectdone *uint32 // CAS to 1 to win select race (may point to stack)
@@ -263,25 +263,19 @@ type sudog struct {
elem unsafe.Pointer // data element (may point to stack)
// The following fields are never accessed concurrently.
- // waitlink is only accessed by g.
+ // For channels, waitlink is only accessed by g.
+ // For semaphores, all fields (including the ones above)
+ // are only accessed when holding a semaRoot lock.
acquiretime int64
releasetime int64
ticket uint32
- waitlink *sudog // g.waiting list
+ parent *sudog // semaRoot binary tree
+ waitlink *sudog // g.waiting list or semaRoot
+ waittail *sudog // semaRoot
c *hchan // channel
}
-type gcstats struct {
- // the struct must consist of only uint64's,
- // because it is casted to uint64[].
- nhandoff uint64
- nhandoffcnt uint64
- nprocyield uint64
- nosyield uint64
- nsleep uint64
-}
-
/*
Not used by gccgo.
@@ -318,12 +312,6 @@ type stack struct {
lo uintptr
hi uintptr
}
-
-// stkbar records the state of a G's stack barrier.
-type stkbar struct {
- savedLRPtr uintptr // location overwritten by stack barrier PC
- savedLRVal uintptr // value overwritten at savedLRPtr
-}
*/
type g struct {
@@ -341,12 +329,9 @@ type g struct {
_panic *_panic // innermost panic - offset known to liblink
_defer *_defer // innermost defer
m *m // current m; offset known to arm liblink
- // Not for gccgo: stackAlloc uintptr // stack allocation is [stack.lo,stack.lo+stackAlloc)
// Not for gccgo: sched gobuf
syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
- // Not for gccgo: stkbar []stkbar // stack barriers, from low to high (see top of mstkbar.go)
- // Not for gccgo: stkbarPos uintptr // index of lowest stack barrier not hit
// Not for gccgo: stktopsp uintptr // expected sp at top of stack, to check in traceback
param unsafe.Pointer // passed parameter on wakeup
atomicstatus uint32
@@ -359,7 +344,7 @@ type g struct {
paniconfault bool // panic (instead of crash) on unexpected fault address
preemptscan bool // preempted g does scan for gc
gcscandone bool // g has scanned stack; protected by _Gscan bit in status
- gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan; transition from true to false by calling queueRescan and false to true by calling dequeueRescan
+ gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan; TODO: remove?
throwsplit bool // must not split stack
raceignore int8 // ignore race detection events
sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine
@@ -376,17 +361,12 @@ type g struct {
startpc uintptr // pc of goroutine function
// Not for gccgo: racectx uintptr
waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
- // Not for gccgo: cgoCtxt []uintptr // cgo traceback context
+ // Not for gccgo: cgoCtxt []uintptr // cgo traceback context
+ labels unsafe.Pointer // profiler labels
+ timer *timer // cached timer for time.Sleep
// Per-G GC state
- // gcRescan is this G's index in work.rescan.list. If this is
- // -1, this G is not on the rescan list.
- //
- // If gcphase != _GCoff and this G is visible to the garbage
- // collector, writes to this are protected by work.rescan.lock.
- gcRescan int32
-
// gcAssistBytes is this G's GC assist credit in terms of
// bytes allocated. If this is positive, then the G has credit
// to allocate gcAssistBytes bytes without assisting. If this
@@ -452,6 +432,7 @@ type m struct {
inwb bool // m is executing a write barrier
newSigstack bool // minit on C thread called sigaltstack
printlock int8
+ incgo bool // m is executing a cgo call
fastrand uint32
ncgocall uint64 // number of cgo calls in total
ncgo int32 // number of cgo calls currently in progress
@@ -468,7 +449,6 @@ type m struct {
// Not for gccgo: fflag uint32 // floating point compare flags
locked uint32 // tracking for lockosthread
nextwaitm uintptr // next m waiting for lock
- gcstats gcstats
needextram bool
traceback uint8
waitunlockf unsafe.Pointer // todo go func(*g, unsafe.pointer) bool
@@ -505,9 +485,10 @@ type p struct {
id int32
status uint32 // one of pidle/prunning/...
link puintptr
- schedtick uint32 // incremented on every scheduler call
- syscalltick uint32 // incremented on every system call
- m muintptr // back-link to associated m (nil if idle)
+ schedtick uint32 // incremented on every scheduler call
+ syscalltick uint32 // incremented on every system call
+ sysmontick sysmontick // last tick observed by sysmon
+ m muintptr // back-link to associated m (nil if idle)
mcache *mcache
// Not for gccgo: racectx uintptr
@@ -543,6 +524,14 @@ type p struct {
tracebuf traceBufPtr
+ // traceSweep indicates the sweep events should be traced.
+ // This is used to defer the sweep start event until a span
+ // has actually been swept.
+ traceSweep bool
+ // traceSwept and traceReclaimed track the number of bytes
+ // swept and reclaimed by sweeping in the current sweep loop.
+ traceSwept, traceReclaimed uintptr
+
palloc persistentAlloc // per-P to avoid mutex
// Per-P GC state
@@ -563,7 +552,7 @@ type p struct {
const (
// The max value of GOMAXPROCS.
// There are no fundamental restrictions on the value.
- _MaxGomaxprocs = 1 << 8
+ _MaxGomaxprocs = 1 << 10
)
type schedt struct {
@@ -639,7 +628,6 @@ const (
_SigThrow // if signal.Notify doesn't take it, exit loudly
_SigPanic // if the signal is from the kernel, panic
_SigDefault // if the signal isn't explicitly requested, don't monitor it
- _SigHandling // our signal handler is registered
_SigGoExit // cause all runtime procs to exit (only used on Plan 9).
_SigSetStack // add SA_ONSTACK to libc handler
_SigUnblock // unblocked in minit
@@ -753,13 +741,10 @@ const (
const _TracebackMaxFrames = 100
var (
- // emptystring string
-
allglen uintptr
allm *m
allp [_MaxGomaxprocs + 1]*p
gomaxprocs int32
- panicking uint32
ncpu int32
forcegc forcegcstate
sched schedt
@@ -767,6 +752,8 @@ var (
// Information about what cpu features are available.
// Set on startup in asm_{x86,amd64}.s.
+ // Packages outside the runtime should not use these
+ // as they are not an external API.
cpuid_ecx uint32
support_aes bool
diff --git a/libgo/go/runtime/runtime_test.go b/libgo/go/runtime/runtime_test.go
index 1f403a1..b8f6ac2 100644
--- a/libgo/go/runtime/runtime_test.go
+++ b/libgo/go/runtime/runtime_test.go
@@ -50,6 +50,23 @@ func BenchmarkIfaceCmpNil100(b *testing.B) {
}
}
+var efaceCmp1 interface{}
+var efaceCmp2 interface{}
+
+func BenchmarkEfaceCmpDiff(b *testing.B) {
+ x := 5
+ efaceCmp1 = &x
+ y := 6
+ efaceCmp2 = &y
+ for i := 0; i < b.N; i++ {
+ for j := 0; j < 100; j++ {
+ if efaceCmp1 == efaceCmp2 {
+ b.Fatal("bad comparison")
+ }
+ }
+ }
+}
+
func BenchmarkDefer(b *testing.B) {
for i := 0; i < b.N; i++ {
defer1()
@@ -62,7 +79,6 @@ func defer1() {
panic("bad recover")
}
}(1, 2, 3)
- return
}
func BenchmarkDefer10(b *testing.B) {
diff --git a/libgo/go/runtime/rwmutex.go b/libgo/go/runtime/rwmutex.go
new file mode 100644
index 0000000..7eeb559
--- /dev/null
+++ b/libgo/go/runtime/rwmutex.go
@@ -0,0 +1,125 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "runtime/internal/atomic"
+)
+
+// This is a copy of sync/rwmutex.go rewritten to work in the runtime.
+
+// An rwmutex is a reader/writer mutual exclusion lock.
+// The lock can be held by an arbitrary number of readers or a single writer.
+// This is a variant of sync.RWMutex, for the runtime package.
+// Like mutex, rwmutex blocks the calling M.
+// It does not interact with the goroutine scheduler.
+type rwmutex struct {
+ rLock mutex // protects readers, readerPass, writer
+ readers muintptr // list of pending readers
+ readerPass uint32 // number of pending readers that may skip the readers list
+
+ wLock mutex // serializes writers
+ writer muintptr // pending writer waiting for departing readers
+
+ readerCount uint32 // number of pending readers
+ readerWait uint32 // number of departing readers
+}
+
+const rwmutexMaxReaders = 1 << 30
+
+// rlock locks rw for reading.
+func (rw *rwmutex) rlock() {
+ // The reader must not be allowed to lose its P or else other
+ // things blocking on the lock may consume all of the Ps and
+ // deadlock (issue #20903). Alternatively, we could drop the P
+ // while sleeping.
+ acquirem()
+ if int32(atomic.Xadd(&rw.readerCount, 1)) < 0 {
+ // A writer is pending. Park on the reader queue.
+ systemstack(func() {
+ lock(&rw.rLock)
+ if rw.readerPass > 0 {
+ // Writer finished.
+ rw.readerPass -= 1
+ unlock(&rw.rLock)
+ } else {
+ // Queue this reader to be woken by
+ // the writer.
+ m := getg().m
+ m.schedlink = rw.readers
+ rw.readers.set(m)
+ unlock(&rw.rLock)
+ notesleep(&m.park)
+ noteclear(&m.park)
+ }
+ })
+ }
+}
+
+// runlock undoes a single rlock call on rw.
+func (rw *rwmutex) runlock() {
+ if r := int32(atomic.Xadd(&rw.readerCount, -1)); r < 0 {
+ if r+1 == 0 || r+1 == -rwmutexMaxReaders {
+ throw("runlock of unlocked rwmutex")
+ }
+ // A writer is pending.
+ if atomic.Xadd(&rw.readerWait, -1) == 0 {
+ // The last reader unblocks the writer.
+ lock(&rw.rLock)
+ w := rw.writer.ptr()
+ if w != nil {
+ notewakeup(&w.park)
+ }
+ unlock(&rw.rLock)
+ }
+ }
+ releasem(getg().m)
+}
+
+// lock locks rw for writing.
+func (rw *rwmutex) lock() {
+ // Resolve competition with other writers and stick to our P.
+ lock(&rw.wLock)
+ m := getg().m
+ // Announce that there is a pending writer.
+ r := int32(atomic.Xadd(&rw.readerCount, -rwmutexMaxReaders)) + rwmutexMaxReaders
+ // Wait for any active readers to complete.
+ lock(&rw.rLock)
+ if r != 0 && atomic.Xadd(&rw.readerWait, r) != 0 {
+ // Wait for reader to wake us up.
+ systemstack(func() {
+ rw.writer.set(m)
+ unlock(&rw.rLock)
+ notesleep(&m.park)
+ noteclear(&m.park)
+ })
+ } else {
+ unlock(&rw.rLock)
+ }
+}
+
+// unlock unlocks rw for writing.
+func (rw *rwmutex) unlock() {
+ // Announce to readers that there is no active writer.
+ r := int32(atomic.Xadd(&rw.readerCount, rwmutexMaxReaders))
+ if r >= rwmutexMaxReaders {
+ throw("unlock of unlocked rwmutex")
+ }
+ // Unblock blocked readers.
+ lock(&rw.rLock)
+ for rw.readers.ptr() != nil {
+ reader := rw.readers.ptr()
+ rw.readers = reader.schedlink
+ reader.schedlink.set(nil)
+ notewakeup(&reader.park)
+ r -= 1
+ }
+ // If r > 0, there are pending readers that aren't on the
+ // queue. Tell them to skip waiting.
+ rw.readerPass += uint32(r)
+ unlock(&rw.rLock)
+ // Allow other writers to proceed.
+ unlock(&rw.wLock)
+}
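
The new rwmutex above is, as its comment says, a runtime-internal variant of sync.RWMutex: any number of readers or a single writer, with blocked waiters parked as Ms rather than goroutines. For orientation, the user-level access pattern it serves looks like the following sync.RWMutex example (this is the sync package analogue, not the runtime code itself):

package main

import (
	"fmt"
	"sync"
)

// counterTable shows the access pattern a reader/writer lock is built for:
// many concurrent readers, occasional exclusive writers.
type counterTable struct {
	mu     sync.RWMutex
	counts map[string]int
}

func (t *counterTable) get(key string) int {
	t.mu.RLock() // analogous to rw.rlock(): shared access
	defer t.mu.RUnlock()
	return t.counts[key]
}

func (t *counterTable) bump(key string) {
	t.mu.Lock() // analogous to rw.lock(): exclusive access
	defer t.mu.Unlock()
	t.counts[key]++
}

func main() {
	t := &counterTable{counts: make(map[string]int)}
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			t.bump("hits")
			_ = t.get("hits")
		}()
	}
	wg.Wait()
	fmt.Println(t.get("hits")) // 4
}
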
diff --git a/libgo/go/runtime/rwmutex_test.go b/libgo/go/runtime/rwmutex_test.go
new file mode 100644
index 0000000..a69eca1
--- /dev/null
+++ b/libgo/go/runtime/rwmutex_test.go
@@ -0,0 +1,178 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GOMAXPROCS=10 go test
+
+// This is a copy of sync/rwmutex_test.go rewritten to test the
+// runtime rwmutex.
+
+package runtime_test
+
+import (
+ "fmt"
+ . "runtime"
+ "sync/atomic"
+ "testing"
+)
+
+func parallelReader(m *RWMutex, clocked chan bool, cunlock *uint32, cdone chan bool) {
+ m.RLock()
+ clocked <- true
+ for atomic.LoadUint32(cunlock) == 0 {
+ }
+ m.RUnlock()
+ cdone <- true
+}
+
+func doTestParallelReaders(numReaders int) {
+ GOMAXPROCS(numReaders + 1)
+ var m RWMutex
+ clocked := make(chan bool, numReaders)
+ var cunlock uint32
+ cdone := make(chan bool)
+ for i := 0; i < numReaders; i++ {
+ go parallelReader(&m, clocked, &cunlock, cdone)
+ }
+ // Wait for all parallel RLock()s to succeed.
+ for i := 0; i < numReaders; i++ {
+ <-clocked
+ }
+ atomic.StoreUint32(&cunlock, 1)
+ // Wait for the goroutines to finish.
+ for i := 0; i < numReaders; i++ {
+ <-cdone
+ }
+}
+
+func TestParallelRWMutexReaders(t *testing.T) {
+ defer GOMAXPROCS(GOMAXPROCS(-1))
+ doTestParallelReaders(1)
+ doTestParallelReaders(3)
+ doTestParallelReaders(4)
+}
+
+func reader(rwm *RWMutex, num_iterations int, activity *int32, cdone chan bool) {
+ for i := 0; i < num_iterations; i++ {
+ rwm.RLock()
+ n := atomic.AddInt32(activity, 1)
+ if n < 1 || n >= 10000 {
+ panic(fmt.Sprintf("wlock(%d)\n", n))
+ }
+ for i := 0; i < 100; i++ {
+ }
+ atomic.AddInt32(activity, -1)
+ rwm.RUnlock()
+ }
+ cdone <- true
+}
+
+func writer(rwm *RWMutex, num_iterations int, activity *int32, cdone chan bool) {
+ for i := 0; i < num_iterations; i++ {
+ rwm.Lock()
+ n := atomic.AddInt32(activity, 10000)
+ if n != 10000 {
+ panic(fmt.Sprintf("wlock(%d)\n", n))
+ }
+ for i := 0; i < 100; i++ {
+ }
+ atomic.AddInt32(activity, -10000)
+ rwm.Unlock()
+ }
+ cdone <- true
+}
+
+func HammerRWMutex(gomaxprocs, numReaders, num_iterations int) {
+ GOMAXPROCS(gomaxprocs)
+ // Number of active readers + 10000 * number of active writers.
+ var activity int32
+ var rwm RWMutex
+ cdone := make(chan bool)
+ go writer(&rwm, num_iterations, &activity, cdone)
+ var i int
+ for i = 0; i < numReaders/2; i++ {
+ go reader(&rwm, num_iterations, &activity, cdone)
+ }
+ go writer(&rwm, num_iterations, &activity, cdone)
+ for ; i < numReaders; i++ {
+ go reader(&rwm, num_iterations, &activity, cdone)
+ }
+ // Wait for the 2 writers and all readers to finish.
+ for i := 0; i < 2+numReaders; i++ {
+ <-cdone
+ }
+}
+
+func TestRWMutex(t *testing.T) {
+ defer GOMAXPROCS(GOMAXPROCS(-1))
+ n := 1000
+ if testing.Short() {
+ n = 5
+ }
+ HammerRWMutex(1, 1, n)
+ HammerRWMutex(1, 3, n)
+ HammerRWMutex(1, 10, n)
+ HammerRWMutex(4, 1, n)
+ HammerRWMutex(4, 3, n)
+ HammerRWMutex(4, 10, n)
+ HammerRWMutex(10, 1, n)
+ HammerRWMutex(10, 3, n)
+ HammerRWMutex(10, 10, n)
+ HammerRWMutex(10, 5, n)
+}
+
+func BenchmarkRWMutexUncontended(b *testing.B) {
+ type PaddedRWMutex struct {
+ RWMutex
+ pad [32]uint32
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ var rwm PaddedRWMutex
+ for pb.Next() {
+ rwm.RLock()
+ rwm.RLock()
+ rwm.RUnlock()
+ rwm.RUnlock()
+ rwm.Lock()
+ rwm.Unlock()
+ }
+ })
+}
+
+func benchmarkRWMutex(b *testing.B, localWork, writeRatio int) {
+ var rwm RWMutex
+ b.RunParallel(func(pb *testing.PB) {
+ foo := 0
+ for pb.Next() {
+ foo++
+ if foo%writeRatio == 0 {
+ rwm.Lock()
+ rwm.Unlock()
+ } else {
+ rwm.RLock()
+ for i := 0; i != localWork; i += 1 {
+ foo *= 2
+ foo /= 2
+ }
+ rwm.RUnlock()
+ }
+ }
+ _ = foo
+ })
+}
+
+func BenchmarkRWMutexWrite100(b *testing.B) {
+ benchmarkRWMutex(b, 0, 100)
+}
+
+func BenchmarkRWMutexWrite10(b *testing.B) {
+ benchmarkRWMutex(b, 0, 10)
+}
+
+func BenchmarkRWMutexWorkWrite100(b *testing.B) {
+ benchmarkRWMutex(b, 100, 100)
+}
+
+func BenchmarkRWMutexWorkWrite10(b *testing.B) {
+ benchmarkRWMutex(b, 100, 10)
+}
diff --git a/libgo/go/runtime/select.go b/libgo/go/runtime/select.go
index f0cad1b..9f8ac49 100644
--- a/libgo/go/runtime/select.go
+++ b/libgo/go/runtime/select.go
@@ -98,7 +98,6 @@ func selectsend(sel *hselect, c *hchan, elem unsafe.Pointer) {
return
}
cas := (*scase)(add(unsafe.Pointer(&sel.scase), uintptr(i)*unsafe.Sizeof(sel.scase[0])))
-
cas.pc = pc
cas.c = c
cas.kind = caseSend
@@ -246,7 +245,7 @@ func selectgo(sel *hselect) int {
pollslice := slice{unsafe.Pointer(sel.pollorder), int(sel.ncase), int(sel.ncase)}
pollorder := *(*[]uint16)(unsafe.Pointer(&pollslice))
for i := 1; i < int(sel.ncase); i++ {
- j := int(fastrand()) % (i + 1)
+ j := fastrandn(uint32(i + 1))
pollorder[i] = pollorder[j]
pollorder[j] = uint16(i)
}
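
The change above swaps fastrand()%(i+1) for fastrandn(uint32(i+1)) inside what is an inside-out Fisher–Yates shuffle of the poll order. A standalone sketch of the same shuffle over an ordinary slice, with math/rand standing in for the runtime's fastrandn:

package main

import (
	"fmt"
	"math/rand"
)

// shuffle builds a random permutation of 0..n-1 the same way selectgo
// builds pollorder: element i swaps with a uniformly chosen j in [0, i].
func shuffle(n int) []uint16 {
	order := make([]uint16, n)
	for i := 1; i < n; i++ {
		j := rand.Intn(i + 1) // stands in for fastrandn(uint32(i + 1))
		order[i] = order[j]
		order[j] = uint16(i)
	}
	return order
}

func main() {
	fmt.Println(shuffle(5)) // a permutation of 0..4, e.g. [3 0 4 1 2]
}
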
@@ -408,7 +407,7 @@ loop:
// wait for someone to wake us up
gp.param = nil
- gopark(selparkcommit, nil, "select", traceEvGoBlockSelect, 2)
+ gopark(selparkcommit, nil, "select", traceEvGoBlockSelect, 1)
// While we were asleep, some goroutine came along and completed
// one of the cases in the select and woke us up (called ready).
@@ -602,7 +601,7 @@ bufsend:
recv:
// can receive from sleeping sender (sg)
- recv(c, sg, cas.elem, func() { selunlock(scases, lockorder) })
+ recv(c, sg, cas.elem, func() { selunlock(scases, lockorder) }, 2)
if debugSelect {
print("syncrecv: sel=", sel, " c=", c, "\n")
}
@@ -633,7 +632,7 @@ send:
if msanenabled {
msanread(cas.elem, c.elemtype.size)
}
- send(c, sg, cas.elem, func() { selunlock(scases, lockorder) })
+ send(c, sg, cas.elem, func() { selunlock(scases, lockorder) }, 2)
if debugSelect {
print("syncsend: sel=", sel, " c=", c, "\n")
}
@@ -641,7 +640,7 @@ send:
retc:
if cas.releasetime > 0 {
- blockevent(cas.releasetime-t0, 2)
+ blockevent(cas.releasetime-t0, 1)
}
return casi
diff --git a/libgo/go/runtime/sema.go b/libgo/go/runtime/sema.go
index 37318ff..d04e6f5 100644
--- a/libgo/go/runtime/sema.go
+++ b/libgo/go/runtime/sema.go
@@ -27,10 +27,19 @@ import (
// Asynchronous semaphore for sync.Mutex.
+// A semaRoot holds a balanced tree of sudog with distinct addresses (s.elem).
+// Each of those sudog may in turn point (through s.waitlink) to a list
+// of other sudogs waiting on the same address.
+// The operations on the inner lists of sudogs with the same address
+// are all O(1). The scanning of the top-level semaRoot list is O(log n),
+// where n is the number of distinct addresses with goroutines blocked
+// on them that hash to the given semaRoot.
+// See golang.org/issue/17953 for a program that worked badly
+// before we introduced the second level of list, and test/locklinear.go
+// for a test that exercises this.
type semaRoot struct {
lock mutex
- head *sudog
- tail *sudog
+ treap *sudog // root of balanced tree of unique waiters.
nwait uint32 // Number of waiters. Read w/o the lock.
}
@@ -44,26 +53,26 @@ var semtable [semTabSize]struct {
//go:linkname sync_runtime_Semacquire sync.runtime_Semacquire
func sync_runtime_Semacquire(addr *uint32) {
- semacquire(addr, semaBlockProfile)
+ semacquire1(addr, false, semaBlockProfile)
}
-//go:linkname net_runtime_Semacquire net.runtime_Semacquire
-func net_runtime_Semacquire(addr *uint32) {
- semacquire(addr, semaBlockProfile)
+//go:linkname poll_runtime_Semacquire internal_poll.runtime_Semacquire
+func poll_runtime_Semacquire(addr *uint32) {
+ semacquire1(addr, false, semaBlockProfile)
}
//go:linkname sync_runtime_Semrelease sync.runtime_Semrelease
-func sync_runtime_Semrelease(addr *uint32) {
- semrelease(addr)
+func sync_runtime_Semrelease(addr *uint32, handoff bool) {
+ semrelease1(addr, handoff)
}
//go:linkname sync_runtime_SemacquireMutex sync.runtime_SemacquireMutex
-func sync_runtime_SemacquireMutex(addr *uint32) {
- semacquire(addr, semaBlockProfile|semaMutexProfile)
+func sync_runtime_SemacquireMutex(addr *uint32, lifo bool) {
+ semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile)
}
-//go:linkname net_runtime_Semrelease net.runtime_Semrelease
-func net_runtime_Semrelease(addr *uint32) {
+//go:linkname poll_runtime_Semrelease internal_poll.runtime_Semrelease
+func poll_runtime_Semrelease(addr *uint32) {
semrelease(addr)
}
@@ -82,7 +91,11 @@ const (
)
// Called from runtime.
-func semacquire(addr *uint32, profile semaProfileFlags) {
+func semacquire(addr *uint32) {
+ semacquire1(addr, false, 0)
+}
+
+func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags) {
gp := getg()
if gp != gp.m.curg {
throw("semacquire not on the G stack")
@@ -104,6 +117,7 @@ func semacquire(addr *uint32, profile semaProfileFlags) {
t0 := int64(0)
s.releasetime = 0
s.acquiretime = 0
+ s.ticket = 0
if profile&semaBlockProfile != 0 && blockprofilerate > 0 {
t0 = cputicks()
s.releasetime = -1
@@ -126,9 +140,9 @@ func semacquire(addr *uint32, profile semaProfileFlags) {
}
// Any semrelease after the cansemacquire knows we're waiting
// (we set nwait above), so go to sleep.
- root.queue(addr, s)
+ root.queue(addr, s, lifo)
goparkunlock(&root.lock, "semacquire", traceEvGoBlockSync, 4)
- if cansemacquire(addr) {
+ if s.ticket != 0 || cansemacquire(addr) {
break
}
}
@@ -139,6 +153,10 @@ func semacquire(addr *uint32, profile semaProfileFlags) {
}
func semrelease(addr *uint32) {
+ semrelease1(addr, false)
+}
+
+func semrelease1(addr *uint32, handoff bool) {
root := semroot(addr)
atomic.Xadd(addr, 1)
@@ -157,28 +175,22 @@ func semrelease(addr *uint32) {
unlock(&root.lock)
return
}
- s := root.head
- for ; s != nil; s = s.next {
- if s.elem == unsafe.Pointer(addr) {
- atomic.Xadd(&root.nwait, -1)
- root.dequeue(s)
- break
- }
- }
+ s, t0 := root.dequeue(addr)
if s != nil {
- if s.acquiretime != 0 {
- t0 := cputicks()
- for x := root.head; x != nil; x = x.next {
- if x.elem == unsafe.Pointer(addr) {
- x.acquiretime = t0
- break
- }
- }
- mutexevent(t0-s.acquiretime, 3)
- }
+ atomic.Xadd(&root.nwait, -1)
}
unlock(&root.lock)
if s != nil { // May be slow, so unlock first
+ acquiretime := s.acquiretime
+ if acquiretime != 0 {
+ mutexevent(t0-acquiretime, 3)
+ }
+ if s.ticket != 0 {
+ throw("corrupted semaphore ticket")
+ }
+ if handoff && cansemacquire(addr) {
+ s.ticket = 1
+ }
readyWithTime(s, 5)
}
}
@@ -199,33 +211,230 @@ func cansemacquire(addr *uint32) bool {
}
}
-func (root *semaRoot) queue(addr *uint32, s *sudog) {
+// queue adds s to the blocked goroutines in semaRoot.
+func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) {
s.g = getg()
s.elem = unsafe.Pointer(addr)
s.next = nil
- s.prev = root.tail
- if root.tail != nil {
- root.tail.next = s
- } else {
- root.head = s
+ s.prev = nil
+
+ var last *sudog
+ pt := &root.treap
+ for t := *pt; t != nil; t = *pt {
+ if t.elem == unsafe.Pointer(addr) {
+ // Already have addr in list.
+ if lifo {
+ // Substitute s in t's place in treap.
+ *pt = s
+ s.ticket = t.ticket
+ s.acquiretime = t.acquiretime
+ s.parent = t.parent
+ s.prev = t.prev
+ s.next = t.next
+ if s.prev != nil {
+ s.prev.parent = s
+ }
+ if s.next != nil {
+ s.next.parent = s
+ }
+ // Add t first in s's wait list.
+ s.waitlink = t
+ s.waittail = t.waittail
+ if s.waittail == nil {
+ s.waittail = t
+ }
+ t.parent = nil
+ t.prev = nil
+ t.next = nil
+ t.waittail = nil
+ } else {
+ // Add s to end of t's wait list.
+ if t.waittail == nil {
+ t.waitlink = s
+ } else {
+ t.waittail.waitlink = s
+ }
+ t.waittail = s
+ s.waitlink = nil
+ }
+ return
+ }
+ last = t
+ if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) {
+ pt = &t.prev
+ } else {
+ pt = &t.next
+ }
+ }
+
+ // Add s as new leaf in tree of unique addrs.
+ // The balanced tree is a treap using ticket as the random heap priority.
+ // That is, it is a binary tree ordered according to the elem addresses,
+ // but then among the space of possible binary trees respecting those
+ // addresses, it is kept balanced on average by maintaining a heap ordering
+ // on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket.
+ // https://en.wikipedia.org/wiki/Treap
+ // http://faculty.washington.edu/aragon/pubs/rst89.pdf
+ s.ticket = fastrand()
+ s.parent = last
+ *pt = s
+
+ // Rotate up into tree according to ticket (priority).
+ for s.parent != nil && s.parent.ticket > s.ticket {
+ if s.parent.prev == s {
+ root.rotateRight(s.parent)
+ } else {
+ if s.parent.next != s {
+ panic("semaRoot queue")
+ }
+ root.rotateLeft(s.parent)
+ }
}
- root.tail = s
}
-func (root *semaRoot) dequeue(s *sudog) {
- if s.next != nil {
- s.next.prev = s.prev
- } else {
- root.tail = s.prev
+// dequeue searches for and finds the first goroutine
+// in semaRoot blocked on addr.
+// If the sudog was being profiled, dequeue returns the time
+// at which it was woken up as now. Otherwise now is 0.
+func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now int64) {
+ ps := &root.treap
+ s := *ps
+ for ; s != nil; s = *ps {
+ if s.elem == unsafe.Pointer(addr) {
+ goto Found
+ }
+ if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) {
+ ps = &s.prev
+ } else {
+ ps = &s.next
+ }
}
- if s.prev != nil {
- s.prev.next = s.next
+ return nil, 0
+
+Found:
+ now = int64(0)
+ if s.acquiretime != 0 {
+ now = cputicks()
+ }
+ if t := s.waitlink; t != nil {
+ // Substitute t, also waiting on addr, for s in root tree of unique addrs.
+ *ps = t
+ t.ticket = s.ticket
+ t.parent = s.parent
+ t.prev = s.prev
+ if t.prev != nil {
+ t.prev.parent = t
+ }
+ t.next = s.next
+ if t.next != nil {
+ t.next.parent = t
+ }
+ if t.waitlink != nil {
+ t.waittail = s.waittail
+ } else {
+ t.waittail = nil
+ }
+ t.acquiretime = now
+ s.waitlink = nil
+ s.waittail = nil
} else {
- root.head = s.next
+ // Rotate s down to be leaf of tree for removal, respecting priorities.
+ for s.next != nil || s.prev != nil {
+ if s.next == nil || s.prev != nil && s.prev.ticket < s.next.ticket {
+ root.rotateRight(s)
+ } else {
+ root.rotateLeft(s)
+ }
+ }
+ // Remove s, now a leaf.
+ if s.parent != nil {
+ if s.parent.prev == s {
+ s.parent.prev = nil
+ } else {
+ s.parent.next = nil
+ }
+ } else {
+ root.treap = nil
+ }
}
+ s.parent = nil
s.elem = nil
s.next = nil
s.prev = nil
+ s.ticket = 0
+ return s, now
+}
+
+// rotateLeft rotates the tree rooted at node x,
+// turning (x a (y b c)) into (y (x a b) c).
+func (root *semaRoot) rotateLeft(x *sudog) {
+ // p -> (x a (y b c))
+ p := x.parent
+ a, y := x.prev, x.next
+ b, c := y.prev, y.next
+
+ y.prev = x
+ x.parent = y
+ y.next = c
+ if c != nil {
+ c.parent = y
+ }
+ x.prev = a
+ if a != nil {
+ a.parent = x
+ }
+ x.next = b
+ if b != nil {
+ b.parent = x
+ }
+
+ y.parent = p
+ if p == nil {
+ root.treap = y
+ } else if p.prev == x {
+ p.prev = y
+ } else {
+ if p.next != x {
+ throw("semaRoot rotateLeft")
+ }
+ p.next = y
+ }
+}
+
+// rotateRight rotates the tree rooted at node y,
+// turning (y (x a b) c) into (x a (y b c)).
+func (root *semaRoot) rotateRight(y *sudog) {
+ // p -> (y (x a b) c)
+ p := y.parent
+ x, c := y.prev, y.next
+ a, b := x.prev, x.next
+
+ x.prev = a
+ if a != nil {
+ a.parent = x
+ }
+ x.next = y
+ y.parent = x
+ y.prev = b
+ if b != nil {
+ b.parent = y
+ }
+ y.next = c
+ if c != nil {
+ c.parent = y
+ }
+
+ x.parent = p
+ if p == nil {
+ root.treap = x
+ } else if p.prev == y {
+ p.prev = x
+ } else {
+ if p.next != y {
+ throw("semaRoot rotateRight")
+ }
+ p.next = x
+ }
}
// notifyList is a ticket-based notification list used to implement sync.Cond.
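
As the comments in queue above spell out, the semaRoot tree is a treap: a binary search tree on the waiters' addresses (elem) combined with a min-heap on the random ticket values (s.ticket <= both children's tickets), with rotateLeft/rotateRight restoring the heap ordering. A small self-contained checker for those two invariants, written against a simplified stand-in for sudog, makes the shape explicit:

package main

import "fmt"

// node is a stripped-down stand-in for sudog: prev/next double as the
// treap's left/right children, elem is the address key, ticket the
// random heap priority.
type node struct {
	elem       uintptr
	ticket     uint32
	prev, next *node
}

// validTreap reports whether t satisfies semaRoot's two orderings:
// a search tree on elem within the open interval (lo, hi), and a
// min-heap on ticket (parent.ticket <= each child's ticket).
func validTreap(t *node, lo, hi uintptr) bool {
	if t == nil {
		return true
	}
	if t.elem <= lo || t.elem >= hi {
		return false
	}
	if t.prev != nil && t.prev.ticket < t.ticket {
		return false
	}
	if t.next != nil && t.next.ticket < t.ticket {
		return false
	}
	return validTreap(t.prev, lo, t.elem) && validTreap(t.next, t.elem, hi)
}

func main() {
	root := &node{
		elem:   100,
		ticket: 1,
		prev:   &node{elem: 50, ticket: 7},
		next:   &node{elem: 200, ticket: 3},
	}
	fmt.Println(validTreap(root, 0, ^uintptr(0))) // true
}
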
@@ -352,10 +561,22 @@ func notifyListNotifyOne(l *notifyList) {
return
}
- // Update the next notify ticket number, and try to find the G that
- // needs to be notified. If it hasn't made it to the list yet we won't
- // find it, but it won't park itself once it sees the new notify number.
+ // Update the next notify ticket number.
atomic.Store(&l.notify, t+1)
+
+ // Try to find the g that needs to be notified.
+ // If it hasn't made it to the list yet we won't find it,
+ // but it won't park itself once it sees the new notify number.
+ //
+ // This scan looks linear but essentially always stops quickly.
+ // Because g's queue themselves separately from taking numbers,
+ // there may be minor reorderings in the list, but we
+ // expect the g we're looking for to be near the front.
+ // The g has others in front of it on the list only to the
+ // extent that it lost the race, so the iteration will not
+ // be too long. This applies even when the g is missing:
+ // it hasn't yet gotten to sleep and has lost the race to
+ // the (few) other g's that we find on the list.
for p, s := (*sudog)(nil), l.head; s != nil; p, s = s, s.next {
if s.ticket == t {
n := s.next
@@ -383,3 +604,8 @@ func notifyListCheck(sz uintptr) {
throw("bad notifyList size")
}
}
+
+//go:linkname sync_nanotime sync.runtime_nanotime
+func sync_nanotime() int64 {
+ return nanotime()
+}
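
notifyListNotifyOne above is the runtime half of sync.Cond.Signal: it advances the notify ticket and wakes the single waiter holding that ticket. For orientation only, the same machinery seen from user code is ordinary sync.Cond usage:

package main

import (
	"fmt"
	"sync"
)

func main() {
	var mu sync.Mutex
	cond := sync.NewCond(&mu)
	ready := false
	done := make(chan struct{})

	go func() {
		mu.Lock()
		for !ready {
			cond.Wait() // parks through the runtime notifyList shown above
		}
		mu.Unlock()
		fmt.Println("woken")
		close(done)
	}()

	mu.Lock()
	ready = true
	mu.Unlock()
	cond.Signal() // wakes one waiter via notifyListNotifyOne
	<-done
}
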
diff --git a/libgo/go/runtime/signal_sighandler.go b/libgo/go/runtime/signal_sighandler.go
index b71b21e..378c68e 100644
--- a/libgo/go/runtime/signal_sighandler.go
+++ b/libgo/go/runtime/signal_sighandler.go
@@ -111,7 +111,7 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) {
if docrash {
crashing++
- if crashing < sched.mcount {
+ if crashing < sched.mcount-int32(extraMCount) {
// There are other m's that need to dump their stacks.
// Relay SIGQUIT to the next m by sending it to the current process.
// All m's that have already received SIGQUIT have signal masks blocking
diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go
index e2642ee..3237e18 100644
--- a/libgo/go/runtime/signal_unix.go
+++ b/libgo/go/runtime/signal_unix.go
@@ -7,14 +7,13 @@
package runtime
import (
+ "runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
)
// For gccgo's C code to call:
//go:linkname initsig runtime.initsig
-//go:linkname crash runtime.crash
-//go:linkname resetcpuprofiler runtime.resetcpuprofiler
//go:linkname sigtrampgo runtime.sigtrampgo
//go:linkname os_sigpipe os.sigpipe
@@ -37,11 +36,18 @@ const (
// Stores the signal handlers registered before Go installed its own.
// These signal handlers will be invoked in cases where Go doesn't want to
// handle a particular signal (e.g., signal occurred on a non-Go thread).
-// See sigfwdgo() for more information on when the signals are forwarded.
+// See sigfwdgo for more information on when the signals are forwarded.
//
-// Signal forwarding is currently available only on Darwin and Linux.
+// This is read by the signal handler; accesses should use
+// atomic.Loaduintptr and atomic.Storeuintptr.
var fwdSig [_NSIG]uintptr
+// handlingSig is indexed by signal number and is non-zero if we are
+// currently handling the signal. Or, to put it another way, whether
+// the signal handler is currently set to the Go signal handler or not.
+// This is uint32 rather than bool so that we can use atomic instructions.
+var handlingSig [_NSIG]uint32
+
// channels for synchronizing signal mask updates with the signal mask
// thread
var (
@@ -87,6 +93,9 @@ func initsig(preinit bool) {
if t.flags == 0 || t.flags&_SigDefault != 0 {
continue
}
+
+ // We don't need to use atomic operations here because
+ // there shouldn't be any other goroutines running yet.
fwdSig[i] = getsig(i)
if !sigInstallGoHandler(i) {
@@ -98,7 +107,7 @@ func initsig(preinit bool) {
continue
}
- t.flags |= _SigHandling
+ handlingSig[i] = 1
setsig(i, getSigtramp())
}
}
@@ -111,7 +120,7 @@ func sigInstallGoHandler(sig uint32) bool {
// Even these signals can be fetched using the os/signal package.
switch sig {
case _SIGHUP, _SIGINT:
- if fwdSig[sig] == _SIG_IGN {
+ if atomic.Loaduintptr(&fwdSig[sig]) == _SIG_IGN {
return false
}
}
@@ -122,37 +131,52 @@ func sigInstallGoHandler(sig uint32) bool {
}
// When built using c-archive or c-shared, only install signal
- // handlers for synchronous signals.
- if (isarchive || islibrary) && t.flags&_SigPanic == 0 {
+ // handlers for synchronous signals and SIGPIPE.
+ if (isarchive || islibrary) && t.flags&_SigPanic == 0 && sig != _SIGPIPE {
return false
}
return true
}
+// sigenable enables the Go signal handler to catch the signal sig.
+// It is only called while holding the os/signal.handlers lock,
+// via os/signal.enableSignal and signal_enable.
func sigenable(sig uint32) {
if sig >= uint32(len(sigtable)) {
return
}
+ // SIGPROF is handled specially for profiling.
+ if sig == _SIGPROF {
+ return
+ }
+
t := &sigtable[sig]
if t.flags&_SigNotify != 0 {
ensureSigM()
enableSigChan <- sig
<-maskUpdatedChan
- if t.flags&_SigHandling == 0 {
- t.flags |= _SigHandling
- fwdSig[sig] = getsig(sig)
+ if atomic.Cas(&handlingSig[sig], 0, 1) {
+ atomic.Storeuintptr(&fwdSig[sig], getsig(sig))
setsig(sig, getSigtramp())
}
}
}
+// sigdisable disables the Go signal handler for the signal sig.
+// It is only called while holding the os/signal.handlers lock,
+// via os/signal.disableSignal and signal_disable.
func sigdisable(sig uint32) {
if sig >= uint32(len(sigtable)) {
return
}
+ // SIGPROF is handled specially for profiling.
+ if sig == _SIGPROF {
+ return
+ }
+
t := &sigtable[sig]
if t.flags&_SigNotify != 0 {
ensureSigM()
@@ -163,25 +187,71 @@ func sigdisable(sig uint32) {
// signal, then to go back to the state before Notify
// we should remove the one we installed.
if !sigInstallGoHandler(sig) {
- t.flags &^= _SigHandling
- setsig(sig, fwdSig[sig])
+ atomic.Store(&handlingSig[sig], 0)
+ setsig(sig, atomic.Loaduintptr(&fwdSig[sig]))
}
}
}
+// sigignore ignores the signal sig.
+// It is only called while holding the os/signal.handlers lock,
+// via os/signal.ignoreSignal and signal_ignore.
func sigignore(sig uint32) {
if sig >= uint32(len(sigtable)) {
return
}
+ // SIGPROF is handled specially for profiling.
+ if sig == _SIGPROF {
+ return
+ }
+
t := &sigtable[sig]
if t.flags&_SigNotify != 0 {
- t.flags &^= _SigHandling
+ atomic.Store(&handlingSig[sig], 0)
setsig(sig, _SIG_IGN)
}
}
-func resetcpuprofiler(hz int32) {
+// clearSignalHandlers clears all signal handlers that are not ignored
+// back to the default. This is called by the child after a fork, so that
+// we can enable the signal mask for the exec without worrying about
+// running a signal handler in the child.
+//go:nosplit
+//go:nowritebarrierrec
+func clearSignalHandlers() {
+ for i := uint32(0); i < _NSIG; i++ {
+ if atomic.Load(&handlingSig[i]) != 0 {
+ setsig(i, _SIG_DFL)
+ }
+ }
+}
+
+// setProcessCPUProfiler is called when the profiling timer changes.
+// It is called with prof.lock held. hz is the new timer, and is 0 if
+// profiling is being disabled. Enable or disable the signal as
+// required for -buildmode=c-archive.
+func setProcessCPUProfiler(hz int32) {
+ if hz != 0 {
+ // Enable the Go signal handler if not enabled.
+ if atomic.Cas(&handlingSig[_SIGPROF], 0, 1) {
+ atomic.Storeuintptr(&fwdSig[_SIGPROF], getsig(_SIGPROF))
+ setsig(_SIGPROF, funcPC(sighandler))
+ }
+ } else {
+ // If the Go signal handler should be disabled by default,
+ // disable it if it is enabled.
+ if !sigInstallGoHandler(_SIGPROF) {
+ if atomic.Cas(&handlingSig[_SIGPROF], 1, 0) {
+ setsig(_SIGPROF, atomic.Loaduintptr(&fwdSig[_SIGPROF]))
+ }
+ }
+ }
+}
+
+// setThreadCPUProfiler makes any thread-specific changes required to
+// implement profiling at a rate of hz.
+func setThreadCPUProfiler(hz int32) {
var it _itimerval
if hz == 0 {
setitimer(_ITIMER_PROF, &it, nil)
@@ -315,7 +385,7 @@ func raisebadsignal(sig uint32, c *sigctxt) {
if sig >= _NSIG {
handler = _SIG_DFL
} else {
- handler = fwdSig[sig]
+ handler = atomic.Loaduintptr(&fwdSig[sig])
}
// Reset the signal handler and raise the signal.
@@ -431,6 +501,16 @@ func sigNotOnStack(sig uint32) {
throw("non-Go code set up signal handler without SA_ONSTACK flag")
}
+// signalDuringFork is called if we receive a signal while doing a fork.
+// We do not want signals at that time, as a signal sent to the process
+// group may be delivered to the child process, causing confusion.
+// This should never be called, because we block signals across the fork;
+// this function is just a safety check. See issue 18600 for background.
+func signalDuringFork(sig uint32) {
+ println("signal", sig, "received during fork")
+ throw("signal received during fork")
+}
+
// This runs on a foreign stack, without an m or a g. No stack split.
//go:nosplit
//go:norace
@@ -455,7 +535,7 @@ func sigfwdgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) bool {
if sig >= uint32(len(sigtable)) {
return false
}
- fwdFn := fwdSig[sig]
+ fwdFn := atomic.Loaduintptr(&fwdSig[sig])
if !signalsOK {
// The only way we can get here is if we are in a
@@ -470,35 +550,44 @@ func sigfwdgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) bool {
return true
}
- flags := sigtable[sig].flags
-
// If there is no handler to forward to, no need to forward.
if fwdFn == _SIG_DFL {
return false
}
// If we aren't handling the signal, forward it.
- if flags&_SigHandling == 0 {
+ // Really if we aren't handling the signal, we shouldn't get here,
+ // but on Darwin setsigstack can lead us here because it sets
+ // the sa_tramp field. The sa_tramp field is not returned by
+ // sigaction, so the fix for that is non-obvious.
+ if atomic.Load(&handlingSig[sig]) == 0 {
sigfwd(fwdFn, sig, info, ctx)
return true
}
- // Only forward synchronous signals.
+ flags := sigtable[sig].flags
+
c := sigctxt{info, ctx}
- if c.sigcode() == _SI_USER || flags&_SigPanic == 0 {
+ // Only forward synchronous signals and SIGPIPE.
+ // Unfortunately, user generated SIGPIPEs will also be forwarded, because si_code
+ // is set to _SI_USER even for a SIGPIPE raised from a write to a closed socket
+ // or pipe.
+ if (c.sigcode() == _SI_USER || flags&_SigPanic == 0) && sig != _SIGPIPE {
return false
}
// Determine if the signal occurred inside Go code. We test that:
// (1) we were in a goroutine (i.e., m.curg != nil), and
- // (2) we weren't in CGO (i.e., m.curg.syscallsp == 0).
+ // (2) we weren't in CGO.
g := getg()
- if g != nil && g.m != nil && g.m.curg != nil && g.m.curg.syscallsp == 0 {
+ if g != nil && g.m != nil && g.m.curg != nil && !g.m.incgo {
return false
}
+
// Signal not handled by Go, forward it.
if fwdFn != _SIG_IGN {
sigfwd(fwdFn, sig, info, ctx)
}
+
return true
}
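
setProcessCPUProfiler and setThreadCPUProfiler above split SIGPROF-based profiling into installing the signal handler once per process and arming the interval timer per thread. From user code this is driven through runtime/pprof; a minimal example (the output file name is just an illustration):

package main

import (
	"log"
	"os"
	"runtime/pprof"
)

func busyWork() {
	x := 0
	for i := 0; i < 1e7; i++ {
		x += i
	}
	_ = x
}

func main() {
	f, err := os.Create("cpu.pprof") // illustrative output path
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// StartCPUProfile ultimately enables SIGPROF delivery and the
	// profiling timer via runtime hooks like the ones in this patch.
	if err := pprof.StartCPUProfile(f); err != nil {
		log.Fatal(err)
	}
	defer pprof.StopCPUProfile()

	busyWork()
}
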
diff --git a/libgo/go/runtime/sigqueue.go b/libgo/go/runtime/sigqueue.go
index a6d498f..cd036ce 100644
--- a/libgo/go/runtime/sigqueue.go
+++ b/libgo/go/runtime/sigqueue.go
@@ -33,6 +33,17 @@ import (
_ "unsafe" // for go:linkname
)
+// sig handles communication between the signal handler and os/signal.
+// Other than the inuse and recv fields, the fields are accessed atomically.
+//
+// The wanted and ignored fields are only written by one goroutine at
+// a time; access is controlled by the handlers Mutex in os/signal.
+// The fields are only read by that one goroutine and by the signal handler.
+// We access them atomically to minimize the race between setting them
+// in the goroutine calling os/signal and the signal handler,
+// which may be running in a different thread. That race is unavoidable,
+// as there is no connection between handling a signal and receiving one,
+// but atomic instructions should minimize it.
var sig struct {
note note
mask [(_NSIG + 31) / 32]uint32
@@ -53,7 +64,11 @@ const (
// Reports whether the signal was sent. If not, the caller typically crashes the program.
func sigsend(s uint32) bool {
bit := uint32(1) << uint(s&31)
- if !sig.inuse || s >= uint32(32*len(sig.wanted)) || sig.wanted[s/32]&bit == 0 {
+ if !sig.inuse || s >= uint32(32*len(sig.wanted)) {
+ return false
+ }
+
+ if w := atomic.Load(&sig.wanted[s/32]); w&bit == 0 {
return false
}
@@ -131,6 +146,23 @@ func signal_recv() uint32 {
}
}
+// signalWaitUntilIdle waits until the signal delivery mechanism is idle.
+// This is used to ensure that we do not drop a signal notification due
+// to a race between disabling a signal and receiving a signal.
+// This assumes that signal delivery has already been disabled for
+// the signal(s) in question, and here we are just waiting to make sure
+// that all the signals have been delivered to the user channels
+// by the os/signal package.
+//go:linkname signalWaitUntilIdle os_signal.signalWaitUntilIdle
+func signalWaitUntilIdle() {
+ // Although WaitUntilIdle seems like the right name for this
+ // function, the state we are looking for is sigReceiving, not
+ // sigIdle. The sigIdle state is really more like sigProcessing.
+ for atomic.Load(&sig.state) != sigReceiving {
+ Gosched()
+ }
+}
+
// Must only be called from a single goroutine at a time.
//go:linkname signal_enable os_signal.signal_enable
func signal_enable(s uint32) {
@@ -146,8 +178,15 @@ func signal_enable(s uint32) {
if s >= uint32(len(sig.wanted)*32) {
return
}
- sig.wanted[s/32] |= 1 << (s & 31)
- sig.ignored[s/32] &^= 1 << (s & 31)
+
+ w := sig.wanted[s/32]
+ w |= 1 << (s & 31)
+ atomic.Store(&sig.wanted[s/32], w)
+
+ i := sig.ignored[s/32]
+ i &^= 1 << (s & 31)
+ atomic.Store(&sig.ignored[s/32], i)
+
sigenable(s)
}
@@ -157,8 +196,11 @@ func signal_disable(s uint32) {
if s >= uint32(len(sig.wanted)*32) {
return
}
- sig.wanted[s/32] &^= 1 << (s & 31)
sigdisable(s)
+
+ w := sig.wanted[s/32]
+ w &^= 1 << (s & 31)
+ atomic.Store(&sig.wanted[s/32], w)
}
// Must only be called from a single goroutine at a time.
@@ -167,12 +209,19 @@ func signal_ignore(s uint32) {
if s >= uint32(len(sig.wanted)*32) {
return
}
- sig.wanted[s/32] &^= 1 << (s & 31)
- sig.ignored[s/32] |= 1 << (s & 31)
sigignore(s)
+
+ w := sig.wanted[s/32]
+ w &^= 1 << (s & 31)
+ atomic.Store(&sig.wanted[s/32], w)
+
+ i := sig.ignored[s/32]
+ i |= 1 << (s & 31)
+ atomic.Store(&sig.ignored[s/32], i)
}
// Checked by signal handlers.
func signal_ignored(s uint32) bool {
- return sig.ignored[s/32]&(1<<(s&31)) != 0
+ i := atomic.Load(&sig.ignored[s/32])
+ return i&(1<<(s&31)) != 0
}
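
signal_enable, signal_disable and signal_ignore above now all follow the same pattern: writers, serialized by the os/signal handlers lock, read the 32-bit word covering the signal, flip one bit, and publish the whole word with an atomic store so the signal handler's atomic load always sees a consistent value. A self-contained sketch of that word-per-32-signals bitmask, with sync/atomic standing in for runtime/internal/atomic (the sketch also loads atomically, which the runtime code does not need to because of the lock):

package main

import (
	"fmt"
	"sync/atomic"
)

const nsig = 65 // illustrative; the runtime uses _NSIG

var wanted [(nsig + 31) / 32]uint32

// setWanted publishes "signal s is wanted" the way signal_enable does:
// read-modify-write of one word, stored atomically so a concurrent
// atomic load (as in sigsend) sees either the old or the new full word.
func setWanted(s uint32, on bool) {
	w := atomic.LoadUint32(&wanted[s/32])
	if on {
		w |= 1 << (s & 31)
	} else {
		w &^= 1 << (s & 31)
	}
	atomic.StoreUint32(&wanted[s/32], w)
}

// isWanted is the reader-side check, matching the bit test in sigsend.
func isWanted(s uint32) bool {
	return atomic.LoadUint32(&wanted[s/32])&(1<<(s&31)) != 0
}

func main() {
	setWanted(34, true)
	fmt.Println(isWanted(34), isWanted(35)) // true false
}
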
diff --git a/libgo/go/runtime/sizeclasses.go b/libgo/go/runtime/sizeclasses.go
index e616e951..5366564 100644
--- a/libgo/go/runtime/sizeclasses.go
+++ b/libgo/go/runtime/sizeclasses.go
@@ -1,4 +1,4 @@
-// AUTO-GENERATED by mksizeclasses.go; DO NOT EDIT
+// Code generated by mksizeclasses.go; DO NOT EDIT.
//go:generate go run mksizeclasses.go
package runtime
diff --git a/libgo/go/runtime/string.go b/libgo/go/runtime/string.go
index 9cc2873..7436ddf 100644
--- a/libgo/go/runtime/string.go
+++ b/libgo/go/runtime/string.go
@@ -88,7 +88,7 @@ func concatstring5(buf *tmpBuf, a [5]string) string {
// Buf is a fixed-size buffer for the result,
// it is not nil if the result does not escape.
-func slicebytetostring(buf *tmpBuf, b []byte) string {
+func slicebytetostring(buf *tmpBuf, b []byte) (str string) {
l := len(b)
if l == 0 {
// Turns out to be a relatively common case.
@@ -96,18 +96,26 @@ func slicebytetostring(buf *tmpBuf, b []byte) string {
// you find the indices and convert the subslice to string.
return ""
}
- if raceenabled && l > 0 {
+ if raceenabled {
racereadrangepc(unsafe.Pointer(&b[0]),
uintptr(l),
getcallerpc(unsafe.Pointer(&buf)),
funcPC(slicebytetostring))
}
- if msanenabled && l > 0 {
+ if msanenabled {
msanread(unsafe.Pointer(&b[0]), uintptr(l))
}
- s, c := rawstringtmp(buf, l)
- copy(c, b)
- return s
+
+ var p unsafe.Pointer
+ if buf != nil && len(b) <= len(buf) {
+ p = unsafe.Pointer(buf)
+ } else {
+ p = mallocgc(uintptr(len(b)), nil, false)
+ }
+ stringStructOf(&str).str = p
+ stringStructOf(&str).len = len(b)
+ memmove(p, (*(*slice)(unsafe.Pointer(&b))).array, uintptr(len(b)))
+ return
}
func rawstringtmp(buf *tmpBuf, l int) (s string, b []byte) {
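
Setting aside the unsafe string-header surgery, the new slicebytetostring body is an instance of a common pattern: build the result in a caller-provided fixed buffer when it fits, and fall back to an allocation otherwise. A hedged, user-level sketch of that split (tmpBufSize and copyWithBuf are invented names for illustration):

package main

import "fmt"

const tmpBufSize = 32 // illustrative; the runtime's tmpBuf is an array type

// copyWithBuf copies b into buf when it fits, mirroring the
// "use the temporary buffer, else allocate" split in slicebytetostring.
func copyWithBuf(buf *[tmpBufSize]byte, b []byte) []byte {
	if buf != nil && len(b) <= len(buf) {
		return append(buf[:0], b...) // no allocation: reuse the fixed buffer
	}
	out := make([]byte, len(b)) // too big: allocate, like mallocgc in the patch
	copy(out, b)
	return out
}

func main() {
	var buf [tmpBufSize]byte
	small := copyWithBuf(&buf, []byte("hi"))
	large := copyWithBuf(&buf, make([]byte, 100))
	fmt.Println(len(small), len(large)) // 2 100
}
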
diff --git a/libgo/go/runtime/string_test.go b/libgo/go/runtime/string_test.go
index ee30699..555a7fc 100644
--- a/libgo/go/runtime/string_test.go
+++ b/libgo/go/runtime/string_test.go
@@ -6,6 +6,7 @@ package runtime_test
import (
"runtime"
+ "strconv"
"strings"
"testing"
)
@@ -89,6 +90,20 @@ func BenchmarkConcatStringAndBytes(b *testing.B) {
}
}
+var escapeString string
+
+func BenchmarkSliceByteToString(b *testing.B) {
+ buf := []byte{'!'}
+ for n := 0; n < 8; n++ {
+ b.Run(strconv.Itoa(len(buf)), func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ escapeString = string(buf)
+ }
+ })
+ buf = append(buf, buf...)
+ }
+}
+
var stringdata = []struct{ name, data string }{
{"ASCII", "01234567890"},
{"Japanese", "日本語日本語日本語"},
diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go
index 30d87c4..84fa1c7 100644
--- a/libgo/go/runtime/stubs.go
+++ b/libgo/go/runtime/stubs.go
@@ -109,8 +109,22 @@ func memcmp(a, b unsafe.Pointer, size uintptr) int32
// exported value for testing
var hashLoad = loadFactor
-// in asm_*.s
-func fastrand() uint32
+//go:nosplit
+func fastrand() uint32 {
+ mp := getg().m
+ fr := mp.fastrand
+ mx := uint32(int32(fr)>>31) & 0xa8888eef
+ fr = fr<<1 ^ mx
+ mp.fastrand = fr
+ return fr
+}
+
+//go:nosplit
+func fastrandn(n uint32) uint32 {
+ // This is similar to fastrand() % n, but faster.
+ // See http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+ return uint32(uint64(fastrand()) * uint64(n) >> 32)
+}
//go:linkname sync_fastrand sync.fastrand
func sync_fastrand() uint32 { return fastrand() }
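
fastrandn above maps a 32-bit random value into [0, n) with one multiply and a shift, per the linked Lemire post: for r uniform over [0, 2^32), the high 32 bits of the 64-bit product r*n are (nearly) uniform over [0, n). A standalone check of the arithmetic, using math/rand as the random source:

package main

import (
	"fmt"
	"math/rand"
)

// reduce maps a 32-bit value r into [0, n) the same way fastrandn does:
// take the high 32 bits of the 64-bit product r*n.
func reduce(r, n uint32) uint32 {
	return uint32(uint64(r) * uint64(n) >> 32)
}

func main() {
	const n = 5
	counts := make([]int, n)
	for i := 0; i < 100000; i++ {
		counts[reduce(rand.Uint32(), n)]++
	}
	fmt.Println(counts) // five buckets of roughly 20000 each
}
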
@@ -156,7 +170,7 @@ type neverCallThisFunction struct{}
// This function must never be called directly. Call goexit1 instead.
// gentraceback assumes that goexit terminates the stack. A direct
// call on the stack will cause gentraceback to stop walking the stack
-// prematurely and if there are leftover stack barriers it may panic.
+// prematurely and if there is leftover state it may panic.
func goexit(neverCallThisFunction)
// publicationBarrier performs a store/store barrier (a "publication"
@@ -176,9 +190,6 @@ func goexit(neverCallThisFunction)
// data dependency ordering.
func publicationBarrier()
-//go:noescape
-func setcallerpc(argp unsafe.Pointer, pc uintptr)
-
// getcallerpc returns the program counter (PC) of its caller's caller.
// getcallersp returns the stack pointer (SP) of its caller's caller.
// For both, the argp must be a pointer to the caller's first function argument.
@@ -213,12 +224,14 @@ func getcallerpc(argp unsafe.Pointer) uintptr
//go:noescape
func getcallersp(argp unsafe.Pointer) uintptr
+func asmcgocall(fn, arg unsafe.Pointer) int32 {
+ throw("asmcgocall")
+ return 0
+}
+
// argp used in Defer structs when there is no argp.
const _NoArgs = ^uintptr(0)
-//go:linkname time_now time.now
-func time_now() (sec int64, nsec int32)
-
//extern __builtin_prefetch
func prefetch(addr unsafe.Pointer, rw int32, locality int32)
@@ -238,13 +251,6 @@ func prefetchnta(addr uintptr) {
prefetch(unsafe.Pointer(addr), 0, 0)
}
-// For gccgo, expose this for C callers.
-//go:linkname unixnanotime runtime.unixnanotime
-func unixnanotime() int64 {
- sec, nsec := time_now()
- return sec*1e9 + int64(nsec)
-}
-
// round n up to a multiple of a. a must be a power of 2.
func round(n, a uintptr) uintptr {
return (n + a - 1) &^ (a - 1)
@@ -315,18 +321,6 @@ func entersyscallblock(int32)
// Here for gccgo until we port mgc.go.
func GC()
-// For gccgo to call from C code.
-//go:linkname acquireWorldsema runtime.acquireWorldsema
-func acquireWorldsema() {
- semacquire(&worldsema, 0)
-}
-
-// For gccgo to call from C code.
-//go:linkname releaseWorldsema runtime.releaseWorldsema
-func releaseWorldsema() {
- semrelease(&worldsema)
-}
-
// For gccgo to call from C code, so that the C code and the Go code
// can share the memstats variable for now.
//go:linkname getMstats runtime.getMstats
@@ -436,10 +430,6 @@ func setpagesize(s uintptr) {
}
}
-// Temporary for gccgo until we port more of proc.go.
-func sigprofNonGoPC(pc uintptr) {
-}
-
// Temporary for gccgo until we port mgc.go.
//go:linkname runtime_m0 runtime.runtime_m0
func runtime_m0() *m {
@@ -458,3 +448,11 @@ type bitvector struct {
n int32 // # of bits
bytedata *uint8
}
+
+// bool2int returns 0 if x is false or 1 if x is true.
+func bool2int(x bool) int {
+ if x {
+ return 1
+ }
+ return 0
+}
diff --git a/libgo/go/runtime/stubs_linux.go b/libgo/go/runtime/stubs_linux.go
new file mode 100644
index 0000000..d10f657
--- /dev/null
+++ b/libgo/go/runtime/stubs_linux.go
@@ -0,0 +1,9 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+
+package runtime
+
+func sbrk0() uintptr
diff --git a/libgo/go/runtime/stubs_nonlinux.go b/libgo/go/runtime/stubs_nonlinux.go
new file mode 100644
index 0000000..e1ea05c
--- /dev/null
+++ b/libgo/go/runtime/stubs_nonlinux.go
@@ -0,0 +1,12 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !linux
+
+package runtime
+
+// sbrk0 returns the current process brk, or 0 if not implemented.
+func sbrk0() uintptr {
+ return 0
+}
diff --git a/libgo/go/runtime/symtab.go b/libgo/go/runtime/symtab.go
index bad0347..3d15fc8 100644
--- a/libgo/go/runtime/symtab.go
+++ b/libgo/go/runtime/symtab.go
@@ -7,6 +7,7 @@ package runtime
// Frames may be used to get function/file/line information for a
// slice of PC values returned by Callers.
type Frames struct {
+ // callers is a slice of PCs that have not yet been expanded.
callers []uintptr
// The last PC we saw.
@@ -18,23 +19,34 @@ type Frames struct {
// Frame is the information returned by Frames for each call frame.
type Frame struct {
- // Program counter for this frame; multiple frames may have
- // the same PC value.
+ // PC is the program counter for the location in this frame.
+ // For a frame that calls another frame, this will be the
+ // program counter of a call instruction. Because of inlining,
+ // multiple frames may have the same PC value, but different
+ // symbolic information.
PC uintptr
- // Func for this frame; may be nil for non-Go code or fully
- // inlined functions.
+ // Func is the Func value of this call frame. This may be nil
+ // for non-Go code or fully inlined functions.
Func *Func
- // Function name, file name, and line number for this call frame.
- // May be the empty string or zero if not known.
+ // Function is the package path-qualified function name of
+ // this call frame. If non-empty, this string uniquely
+ // identifies a single function in the program.
+ // This may be the empty string if not known.
// If Func is not nil then Function == Func.Name().
Function string
- File string
- Line int
- // Entry point for the function; may be zero if not known.
- // If Func is not nil then Entry == Func.Entry().
+ // File and Line are the file name and line number of the
+ // location in this frame. For non-leaf frames, this will be
+ // the location of a call. These may be the empty string and
+ // zero, respectively, if not known.
+ File string
+ Line int
+
+ // Entry point program counter for the function; may be zero
+ // if not known. If Func is not nil then Entry ==
+ // Func.Entry().
Entry uintptr
}
@@ -94,7 +106,8 @@ func (ci *Frames) Next() (frame Frame, more bool) {
// NOTE: Func does not expose the actual unexported fields, because we return *Func
// values to users, and we want to keep them from being able to overwrite the data
// with (say) *f = Func{}.
-// All code operating on a *Func must call raw to get the *_func instead.
+// All code operating on a *Func must call raw() to get the *_func
+// or funcInfo() to get the funcInfo instead.
// A Func represents a Go function in the running binary.
type Func struct {
@@ -104,6 +117,9 @@ type Func struct {
// FuncForPC returns a *Func describing the function that contains the
// given program counter address, or else nil.
+//
+// If pc represents multiple functions because of inlining, it returns
+// the *Func describing the outermost function.
func FuncForPC(pc uintptr) *Func {
name, _, _ := funcfileline(pc, -1)
if name == "" {
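
The expanded Frame documentation above is easiest to read alongside how Frames is consumed: collect PCs with runtime.Callers, then expand them with runtime.CallersFrames and iterate with Next. A minimal example:

package main

import (
	"fmt"
	"runtime"
)

func logCallers() {
	pc := make([]uintptr, 8)
	// Skip runtime.Callers and logCallers itself.
	n := runtime.Callers(2, pc)
	frames := runtime.CallersFrames(pc[:n])
	for {
		frame, more := frames.Next()
		fmt.Printf("%s\n\t%s:%d\n", frame.Function, frame.File, frame.Line)
		if !more {
			break
		}
	}
}

func middle() { logCallers() }

func main() { middle() }
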
diff --git a/libgo/go/runtime/symtab_test.go b/libgo/go/runtime/symtab_test.go
index 8c8281f..807b50d 100644
--- a/libgo/go/runtime/symtab_test.go
+++ b/libgo/go/runtime/symtab_test.go
@@ -31,10 +31,14 @@ func TestCaller(t *testing.T) {
}
}
+// These are marked noinline so that we can use FuncForPC
+// in testCallerBar.
+//go:noinline
func testCallerFoo(t *testing.T) {
testCallerBar(t)
}
+//go:noinline
func testCallerBar(t *testing.T) {
for i := 0; i < 2; i++ {
pc, file, line, ok := runtime.Caller(i)
@@ -94,7 +98,7 @@ var mapLit = map[int]int{ // 28
} // 33
var intLit = lineNumber() + // 34
lineNumber() + // 35
- lineNumber() // 36
+ lineNumber() // 36
func trythis() { // 37
recordLines(lineNumber(), // 38
lineNumber(), // 39
@@ -156,3 +160,14 @@ func TestLineNumber(t *testing.T) {
}
}
}
+
+func TestNilName(t *testing.T) {
+ defer func() {
+ if ex := recover(); ex != nil {
+ t.Fatalf("expected no nil panic, got=%v", ex)
+ }
+ }()
+ if got := (*runtime.Func)(nil).Name(); got != "" {
+ t.Errorf("Name() = %q, want %q", got, "")
+ }
+}
diff --git a/libgo/go/runtime/testdata/testprog/numcpu_freebsd.go b/libgo/go/runtime/testdata/testprog/numcpu_freebsd.go
new file mode 100644
index 0000000..035c534
--- /dev/null
+++ b/libgo/go/runtime/testdata/testprog/numcpu_freebsd.go
@@ -0,0 +1,126 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "os"
+ "os/exec"
+ "runtime"
+ "strconv"
+ "strings"
+ "syscall"
+)
+
+func init() {
+ register("FreeBSDNumCPU", FreeBSDNumCPU)
+ register("FreeBSDNumCPUHelper", FreeBSDNumCPUHelper)
+}
+
+func FreeBSDNumCPUHelper() {
+ fmt.Printf("%d\n", runtime.NumCPU())
+}
+
+func FreeBSDNumCPU() {
+ _, err := exec.LookPath("cpuset")
+ if err != nil {
+ // Cannot test without the cpuset command.
+ fmt.Println("OK")
+ return
+ }
+ _, err = exec.LookPath("sysctl")
+ if err != nil {
+ // Cannot test without the sysctl command.
+ fmt.Println("OK")
+ return
+ }
+ cmd := exec.Command("sysctl", "-n", "kern.smp.active")
+ output, err := cmd.CombinedOutput()
+ if err != nil {
+ fmt.Printf("fail to launch '%s', error: %s, output: %s\n", strings.Join(cmd.Args, " "), err, output)
+ return
+ }
+ if !bytes.Equal(output, []byte("1\n")) {
+ // SMP mode deactivated in kernel.
+ fmt.Println("OK")
+ return
+ }
+
+ list, err := getList()
+ if err != nil {
+ fmt.Printf("%s\n", err)
+ return
+ }
+ err = checkNCPU(list)
+ if err != nil {
+ fmt.Printf("%s\n", err)
+ return
+ }
+ if len(list) >= 2 {
+ err = checkNCPU(list[:len(list)-1])
+ if err != nil {
+ fmt.Printf("%s\n", err)
+ return
+ }
+ }
+ fmt.Println("OK")
+ return
+}
+
+func getList() ([]string, error) {
+ pid := syscall.Getpid()
+
+ // Launch cpuset to print a list of available CPUs: pid <PID> mask: 0, 1, 2, 3.
+ cmd := exec.Command("cpuset", "-g", "-p", strconv.Itoa(pid))
+ cmdline := strings.Join(cmd.Args, " ")
+ output, err := cmd.CombinedOutput()
+ if err != nil {
+ return nil, fmt.Errorf("fail to execute '%s': %s", cmdline, err)
+ }
+ pos := bytes.IndexRune(output, ':')
+ if pos == -1 {
+ return nil, fmt.Errorf("invalid output from '%s', ':' not found: %s", cmdline, output)
+ }
+
+ var list []string
+ for _, val := range bytes.Split(output[pos+1:], []byte(",")) {
+ index := string(bytes.TrimSpace(val))
+ if len(index) == 0 {
+ continue
+ }
+ list = append(list, index)
+ }
+ if len(list) == 0 {
+ return nil, fmt.Errorf("empty CPU list from '%s': %s", cmdline, output)
+ }
+ return list, nil
+}
+
+func checkNCPU(list []string) error {
+ listString := strings.Join(list, ",")
+ if len(listString) == 0 {
+ return fmt.Errorf("could not check against an empty CPU list")
+ }
+
+ // Launch FreeBSDNumCPUHelper() with specified CPUs list.
+ cmd := exec.Command("cpuset", "-l", listString, os.Args[0], "FreeBSDNumCPUHelper")
+ cmdline := strings.Join(cmd.Args, " ")
+ output, err := cmd.CombinedOutput()
+ if err != nil {
+ return fmt.Errorf("fail to launch child '%s', error: %s, output: %s", cmdline, err, output)
+ }
+
+ // The NumCPU value from FreeBSDNumCPUHelper comes with a trailing '\n'.
+ output = bytes.TrimSpace(output)
+ n, err := strconv.Atoi(string(output))
+ if err != nil {
+ return fmt.Errorf("fail to parse output from child '%s', error: %s, output: %s", cmdline, err, output)
+ }
+ if n != len(list) {
+ return fmt.Errorf("runtime.NumCPU() expected to %d, got %d when run with CPU list %s", len(list), n, listString)
+ }
+ return nil
+}
diff --git a/libgo/go/runtime/testdata/testprog/panicrace.go b/libgo/go/runtime/testdata/testprog/panicrace.go
new file mode 100644
index 0000000..f058994
--- /dev/null
+++ b/libgo/go/runtime/testdata/testprog/panicrace.go
@@ -0,0 +1,27 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "runtime"
+ "sync"
+)
+
+func init() {
+ register("PanicRace", PanicRace)
+}
+
+func PanicRace() {
+ var wg sync.WaitGroup
+ wg.Add(1)
+ go func() {
+ defer func() {
+ wg.Done()
+ runtime.Gosched()
+ }()
+ panic("crash")
+ }()
+ wg.Wait()
+}
diff --git a/libgo/go/runtime/testdata/testprogcgo/cgo.go b/libgo/go/runtime/testdata/testprogcgo/cgo.go
index 870d4ef..209524a 100644
--- a/libgo/go/runtime/testdata/testprogcgo/cgo.go
+++ b/libgo/go/runtime/testdata/testprogcgo/cgo.go
@@ -45,10 +45,13 @@ func CgoSignalDeadlock() {
}()
var s *string
*s = ""
+ fmt.Printf("continued after expected panic\n")
}()
}
}()
time.Sleep(time.Millisecond)
+ start := time.Now()
+ var times []time.Duration
for i := 0; i < 64; i++ {
go func() {
runtime.LockOSThread()
@@ -62,8 +65,9 @@ func CgoSignalDeadlock() {
ping <- false
select {
case <-ping:
+ times = append(times, time.Since(start))
case <-time.After(time.Second):
- fmt.Printf("HANG\n")
+ fmt.Printf("HANG 1 %v\n", times)
return
}
}
@@ -71,7 +75,7 @@ func CgoSignalDeadlock() {
select {
case <-ping:
case <-time.After(time.Second):
- fmt.Printf("HANG\n")
+ fmt.Printf("HANG 2 %v\n", times)
return
}
fmt.Printf("OK\n")
diff --git a/libgo/go/runtime/testdata/testprogcgo/numgoroutine.go b/libgo/go/runtime/testdata/testprogcgo/numgoroutine.go
new file mode 100644
index 0000000..12fda49
--- /dev/null
+++ b/libgo/go/runtime/testdata/testprogcgo/numgoroutine.go
@@ -0,0 +1,99 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+package main
+
+/*
+#include <stddef.h>
+#include <pthread.h>
+
+extern void CallbackNumGoroutine();
+
+static void* thread2(void* arg __attribute__ ((unused))) {
+ CallbackNumGoroutine();
+ return NULL;
+}
+
+static void CheckNumGoroutine() {
+ pthread_t tid;
+ pthread_create(&tid, NULL, thread2, NULL);
+ pthread_join(tid, NULL);
+}
+*/
+import "C"
+
+import (
+ "fmt"
+ "runtime"
+ "strings"
+)
+
+var baseGoroutines int
+
+func init() {
+ register("NumGoroutine", NumGoroutine)
+}
+
+func NumGoroutine() {
+ // Test that there are just the expected number of goroutines
+ // running. Specifically, test that the spare M's goroutine
+ // doesn't show up.
+ //
+ // On non-Windows platforms there's a signal handling thread
+ // started by os/signal.init in addition to the main
+ // goroutine.
+ if runtime.GOOS != "windows" {
+ baseGoroutines = 1
+ }
+ if _, ok := checkNumGoroutine("first", 1+baseGoroutines); !ok {
+ return
+ }
+
+ // Test that the goroutine for a callback from C appears.
+ if C.CheckNumGoroutine(); !callbackok {
+ return
+ }
+
+ // Make sure we're back to the initial goroutines.
+ if _, ok := checkNumGoroutine("third", 1+baseGoroutines); !ok {
+ return
+ }
+
+ fmt.Println("OK")
+}
+
+func checkNumGoroutine(label string, want int) (string, bool) {
+ n := runtime.NumGoroutine()
+ if n != want {
+ fmt.Printf("%s NumGoroutine: want %d; got %d\n", label, want, n)
+ return "", false
+ }
+
+ sbuf := make([]byte, 32<<10)
+ sbuf = sbuf[:runtime.Stack(sbuf, true)]
+ n = strings.Count(string(sbuf), "goroutine ")
+ if n != want {
+ fmt.Printf("%s Stack: want %d; got %d:\n%s\n", label, want, n, string(sbuf))
+ return "", false
+ }
+ return string(sbuf), true
+}
+
+var callbackok bool
+
+//export CallbackNumGoroutine
+func CallbackNumGoroutine() {
+ stk, ok := checkNumGoroutine("second", 2+baseGoroutines)
+ if !ok {
+ return
+ }
+ if !strings.Contains(stk, "CallbackNumGoroutine") {
+ fmt.Printf("missing CallbackNumGoroutine from stack:\n%s\n", stk)
+ return
+ }
+
+ callbackok = true
+}
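
checkNumGoroutine above cross-checks runtime.NumGoroutine against the number of "goroutine " headers in a full runtime.Stack dump. The same cross-check in a self-contained program, without the C callback, might look like this (illustrative only):

    package main

    import (
        "fmt"
        "runtime"
        "strings"
    )

    func main() {
        n := runtime.NumGoroutine()

        // Ask for the stacks of all goroutines and count the headers.
        buf := make([]byte, 32<<10)
        buf = buf[:runtime.Stack(buf, true)]
        m := strings.Count(string(buf), "goroutine ")

        fmt.Printf("NumGoroutine=%d, stack headers=%d\n", n, m)
    }
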
diff --git a/libgo/go/runtime/testdata/testprognet/signalexec.go b/libgo/go/runtime/testdata/testprognet/signalexec.go
new file mode 100644
index 0000000..4a988ef
--- /dev/null
+++ b/libgo/go/runtime/testdata/testprognet/signalexec.go
@@ -0,0 +1,70 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build darwin dragonfly freebsd linux netbsd openbsd
+
+// This is in testprognet instead of testprog because testprog
+// must not import anything (like net, but also like os/signal)
+// that kicks off background goroutines during init.
+
+package main
+
+import (
+ "fmt"
+ "os"
+ "os/exec"
+ "os/signal"
+ "sync"
+ "syscall"
+ "time"
+)
+
+func init() {
+ register("SignalDuringExec", SignalDuringExec)
+ register("Nop", Nop)
+}
+
+func SignalDuringExec() {
+ pgrp := syscall.Getpgrp()
+
+ const tries = 10
+
+ var wg sync.WaitGroup
+ c := make(chan os.Signal, tries)
+ signal.Notify(c, syscall.SIGWINCH)
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ for range c {
+ }
+ }()
+
+ for i := 0; i < tries; i++ {
+ time.Sleep(time.Microsecond)
+ wg.Add(2)
+ go func() {
+ defer wg.Done()
+ cmd := exec.Command(os.Args[0], "Nop")
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ if err := cmd.Run(); err != nil {
+ fmt.Printf("Start failed: %v", err)
+ }
+ }()
+ go func() {
+ defer wg.Done()
+ syscall.Kill(-pgrp, syscall.SIGWINCH)
+ }()
+ }
+
+ signal.Stop(c)
+ close(c)
+ wg.Wait()
+
+ fmt.Println("OK")
+}
+
+func Nop() {
+ // This is just for SignalDuringExec.
+}
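
SignalDuringExec follows the usual signal.Notify/signal.Stop lifecycle: a buffered channel collects SIGWINCH while the children are spawned, and the draining goroutine exits once the channel is closed. A self-contained sketch of that lifecycle, assuming a Unix platform (syscall.Kill and SIGWINCH are unavailable on Windows):

    package main

    import (
        "fmt"
        "os"
        "os/signal"
        "sync"
        "syscall"
    )

    func main() {
        // Buffered so a delivery is not dropped while the drainer is busy.
        c := make(chan os.Signal, 8)
        signal.Notify(c, syscall.SIGWINCH)

        var wg sync.WaitGroup
        wg.Add(1)
        go func() {
            defer wg.Done()
            for range c { // drain until the channel is closed
            }
        }()

        // Deliver a signal to ourselves while the drainer is running.
        syscall.Kill(syscall.Getpid(), syscall.SIGWINCH)

        signal.Stop(c) // stop forwarding before closing the channel
        close(c)
        wg.Wait()
        fmt.Println("OK")
    }
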
diff --git a/libgo/go/runtime/time.go b/libgo/go/runtime/time.go
index cc167a8..f204830 100644
--- a/libgo/go/runtime/time.go
+++ b/libgo/go/runtime/time.go
@@ -31,6 +31,7 @@ var timers struct {
created bool
sleeping bool
rescheduling bool
+ sleepUntil int64
waitnote note
t []*timer
}
@@ -50,7 +51,12 @@ func timeSleep(ns int64) {
return
}
- t := new(timer)
+ t := getg().timer
+ if t == nil {
+ t = new(timer)
+ getg().timer = t
+ }
+ *t = timer{}
t.when = nanotime() + ns
t.f = goroutineReady
t.arg = getg()
@@ -207,6 +213,7 @@ func timerproc() {
}
// At least one timer pending. Sleep until then.
timers.sleeping = true
+ timers.sleepUntil = now + delta
noteclear(&timers.waitnote)
unlock(&timers.lock)
notetsleepg(&timers.waitnote, delta)
@@ -295,8 +302,8 @@ func siftdownTimer(i int) {
// Entry points for net, time to call nanotime.
-//go:linkname net_runtimeNano net.runtimeNano
-func net_runtimeNano() int64 {
+//go:linkname poll_runtimeNano internal_poll.runtimeNano
+func poll_runtimeNano() int64 {
return nanotime()
}
@@ -304,3 +311,5 @@ func net_runtimeNano() int64 {
func time_runtimeNano() int64 {
return nanotime()
}
+
+var startNano int64 = nanotime()
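
The timeSleep change above avoids allocating a new timer on every Sleep by caching one timer per goroutine and resetting it with a zero-value assignment before each reuse. The same reuse pattern in ordinary Go, with hypothetical types standing in for the runtime's timer and g:

    package main

    import "fmt"

    // timerRec stands in for runtime.timer in this sketch.
    type timerRec struct {
        when int64
        arg  interface{}
    }

    // worker caches a single record the way the runtime caches g.timer.
    type worker struct {
        cached *timerRec
    }

    func (w *worker) sleepRecord(when int64) *timerRec {
        t := w.cached
        if t == nil {
            t = new(timerRec) // first use: allocate once
            w.cached = t
        }
        *t = timerRec{} // reset every field before reuse
        t.when = when
        return t
    }

    func main() {
        var w worker
        a := w.sleepRecord(10)
        b := w.sleepRecord(20)
        fmt.Println(a == b, b.when) // true 20: the same record is reused
    }
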
diff --git a/libgo/go/runtime/timeasm.go b/libgo/go/runtime/timeasm.go
new file mode 100644
index 0000000..d5f5ea3
--- /dev/null
+++ b/libgo/go/runtime/timeasm.go
@@ -0,0 +1,17 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Declarations for operating systems implementing time.now directly in assembly.
+// Those systems are also expected to have nanotime subtract startNano,
+// so that time.now and nanotime return the same monotonic clock readings.
+
+// +build ignore
+// +build darwin,amd64 darwin,386 windows
+
+package runtime
+
+import _ "unsafe"
+
+//go:linkname time_now time.now
+func time_now() (sec int64, nsec int32, mono int64)
diff --git a/libgo/go/runtime/timestub.go b/libgo/go/runtime/timestub.go
new file mode 100644
index 0000000..033734e
--- /dev/null
+++ b/libgo/go/runtime/timestub.go
@@ -0,0 +1,21 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Declarations for operating systems implementing time.now
+// indirectly, in terms of walltime and nanotime assembly.
+
+// -build !darwin !amd64,!386
+// -build !windows
+
+package runtime
+
+import _ "unsafe" // for go:linkname
+
+func walltime() (sec int64, nsec int32)
+
+//go:linkname time_now time.now
+func time_now() (sec int64, nsec int32, mono int64) {
+ sec, nsec = walltime()
+ return sec, nsec, nanotime() - startNano
+}
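
With this stub, time.now returns the wall clock together with a monotonic reading measured from process start (nanotime() minus startNano), which is what lets the time package compute durations that ignore wall-clock adjustments. A short user-level illustration of the behaviour this enables:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        start := time.Now() // carries both wall-clock and monotonic readings
        time.Sleep(10 * time.Millisecond)

        // Since subtracts the monotonic readings, so the result is
        // unaffected by any wall-clock adjustment made in between.
        fmt.Println(time.Since(start) >= 10*time.Millisecond)
    }
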
diff --git a/libgo/go/runtime/trace.go b/libgo/go/runtime/trace.go
index 61cfa8e..af9313b 100644
--- a/libgo/go/runtime/trace.go
+++ b/libgo/go/runtime/trace.go
@@ -19,50 +19,52 @@ import (
// Event types in the trace, args are given in square brackets.
const (
- traceEvNone = 0 // unused
- traceEvBatch = 1 // start of per-P batch of events [pid, timestamp]
- traceEvFrequency = 2 // contains tracer timer frequency [frequency (ticks per second)]
- traceEvStack = 3 // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}]
- traceEvGomaxprocs = 4 // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id]
- traceEvProcStart = 5 // start of P [timestamp, thread id]
- traceEvProcStop = 6 // stop of P [timestamp]
- traceEvGCStart = 7 // GC start [timestamp, seq, stack id]
- traceEvGCDone = 8 // GC done [timestamp]
- traceEvGCScanStart = 9 // GC mark termination start [timestamp]
- traceEvGCScanDone = 10 // GC mark termination done [timestamp]
- traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id]
- traceEvGCSweepDone = 12 // GC sweep done [timestamp]
- traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id]
- traceEvGoStart = 14 // goroutine starts running [timestamp, goroutine id, seq]
- traceEvGoEnd = 15 // goroutine ends [timestamp]
- traceEvGoStop = 16 // goroutine stops (like in select{}) [timestamp, stack]
- traceEvGoSched = 17 // goroutine calls Gosched [timestamp, stack]
- traceEvGoPreempt = 18 // goroutine is preempted [timestamp, stack]
- traceEvGoSleep = 19 // goroutine calls Sleep [timestamp, stack]
- traceEvGoBlock = 20 // goroutine blocks [timestamp, stack]
- traceEvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack]
- traceEvGoBlockSend = 22 // goroutine blocks on chan send [timestamp, stack]
- traceEvGoBlockRecv = 23 // goroutine blocks on chan recv [timestamp, stack]
- traceEvGoBlockSelect = 24 // goroutine blocks on select [timestamp, stack]
- traceEvGoBlockSync = 25 // goroutine blocks on Mutex/RWMutex [timestamp, stack]
- traceEvGoBlockCond = 26 // goroutine blocks on Cond [timestamp, stack]
- traceEvGoBlockNet = 27 // goroutine blocks on network [timestamp, stack]
- traceEvGoSysCall = 28 // syscall enter [timestamp, stack]
- traceEvGoSysExit = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp]
- traceEvGoSysBlock = 30 // syscall blocks [timestamp]
- traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
- traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
- traceEvHeapAlloc = 33 // memstats.heap_live change [timestamp, heap_alloc]
- traceEvNextGC = 34 // memstats.next_gc change [timestamp, next_gc]
- traceEvTimerGoroutine = 35 // denotes timer goroutine [timer goroutine id]
- traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
- traceEvString = 37 // string dictionary entry [ID, length, string]
- traceEvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id]
- traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack]
- traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp]
- traceEvGoStartLabel = 41 // goroutine starts running with label [timestamp, goroutine id, seq, label string id]
- traceEvGoBlockGC = 42 // goroutine blocks on GC assist [timestamp, stack]
- traceEvCount = 43
+ traceEvNone = 0 // unused
+ traceEvBatch = 1 // start of per-P batch of events [pid, timestamp]
+ traceEvFrequency = 2 // contains tracer timer frequency [frequency (ticks per second)]
+ traceEvStack = 3 // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}]
+ traceEvGomaxprocs = 4 // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id]
+ traceEvProcStart = 5 // start of P [timestamp, thread id]
+ traceEvProcStop = 6 // stop of P [timestamp]
+ traceEvGCStart = 7 // GC start [timestamp, seq, stack id]
+ traceEvGCDone = 8 // GC done [timestamp]
+ traceEvGCScanStart = 9 // GC mark termination start [timestamp]
+ traceEvGCScanDone = 10 // GC mark termination done [timestamp]
+ traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id]
+ traceEvGCSweepDone = 12 // GC sweep done [timestamp, swept, reclaimed]
+ traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id]
+ traceEvGoStart = 14 // goroutine starts running [timestamp, goroutine id, seq]
+ traceEvGoEnd = 15 // goroutine ends [timestamp]
+ traceEvGoStop = 16 // goroutine stops (like in select{}) [timestamp, stack]
+ traceEvGoSched = 17 // goroutine calls Gosched [timestamp, stack]
+ traceEvGoPreempt = 18 // goroutine is preempted [timestamp, stack]
+ traceEvGoSleep = 19 // goroutine calls Sleep [timestamp, stack]
+ traceEvGoBlock = 20 // goroutine blocks [timestamp, stack]
+ traceEvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack]
+ traceEvGoBlockSend = 22 // goroutine blocks on chan send [timestamp, stack]
+ traceEvGoBlockRecv = 23 // goroutine blocks on chan recv [timestamp, stack]
+ traceEvGoBlockSelect = 24 // goroutine blocks on select [timestamp, stack]
+ traceEvGoBlockSync = 25 // goroutine blocks on Mutex/RWMutex [timestamp, stack]
+ traceEvGoBlockCond = 26 // goroutine blocks on Cond [timestamp, stack]
+ traceEvGoBlockNet = 27 // goroutine blocks on network [timestamp, stack]
+ traceEvGoSysCall = 28 // syscall enter [timestamp, stack]
+ traceEvGoSysExit = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp]
+ traceEvGoSysBlock = 30 // syscall blocks [timestamp]
+ traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
+ traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
+ traceEvHeapAlloc = 33 // memstats.heap_live change [timestamp, heap_alloc]
+ traceEvNextGC = 34 // memstats.next_gc change [timestamp, next_gc]
+ traceEvTimerGoroutine = 35 // denotes timer goroutine [timer goroutine id]
+ traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
+ traceEvString = 37 // string dictionary entry [ID, length, string]
+ traceEvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id]
+ traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack]
+ traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp]
+ traceEvGoStartLabel = 41 // goroutine starts running with label [timestamp, goroutine id, seq, label string id]
+ traceEvGoBlockGC = 42 // goroutine blocks on GC assist [timestamp, stack]
+ traceEvGCMarkAssistStart = 43 // GC mark assist start [timestamp, stack]
+ traceEvGCMarkAssistDone = 44 // GC mark assist done [timestamp]
+ traceEvCount = 45
)
const (
@@ -311,7 +313,7 @@ func StopTrace() {
// The world is started but we've set trace.shutdown, so new tracing can't start.
// Wait for the trace reader to flush pending buffers and stop.
- semacquire(&trace.shutdownSema, 0)
+ semacquire(&trace.shutdownSema)
if raceenabled {
raceacquire(unsafe.Pointer(&trace.shutdownSema))
}
@@ -380,7 +382,7 @@ func ReadTrace() []byte {
trace.headerWritten = true
trace.lockOwner = nil
unlock(&trace.lock)
- return []byte("go 1.8 trace\x00\x00\x00\x00")
+ return []byte("go 1.9 trace\x00\x00\x00\x00")
}
// Wait for new data.
if trace.fullHead == 0 && !trace.shutdown {
@@ -776,11 +778,12 @@ func (tab *traceStackTable) dump() {
for ; stk != nil; stk = stk.link.ptr() {
tmpbuf := tmp[:0]
tmpbuf = traceAppend(tmpbuf, uint64(stk.id))
- tmpbuf = traceAppend(tmpbuf, uint64(stk.n))
- for _, pc := range stk.stack() {
+ frames := stk.stack()
+ tmpbuf = traceAppend(tmpbuf, uint64(len(frames)))
+ for _, f := range frames {
var frame traceFrame
- frame, buf = traceFrameForPC(buf, pc)
- tmpbuf = traceAppend(tmpbuf, uint64(pc.pc))
+ frame, buf = traceFrameForPC(buf, f)
+ tmpbuf = traceAppend(tmpbuf, uint64(f.pc))
tmpbuf = traceAppend(tmpbuf, uint64(frame.funcID))
tmpbuf = traceAppend(tmpbuf, uint64(frame.fileID))
tmpbuf = traceAppend(tmpbuf, uint64(frame.line))
@@ -810,16 +813,17 @@ type traceFrame struct {
line uint64
}
-func traceFrameForPC(buf *traceBuf, loc location) (traceFrame, *traceBuf) {
+func traceFrameForPC(buf *traceBuf, f location) (traceFrame, *traceBuf) {
var frame traceFrame
- fn := loc.function
+
+ fn := f.function
const maxLen = 1 << 10
if len(fn) > maxLen {
fn = fn[len(fn)-maxLen:]
}
frame.funcID, buf = traceString(buf, fn)
- file, line := loc.filename, loc.lineno
- frame.line = uint64(line)
+ frame.line = uint64(f.lineno)
+ file := f.filename
if len(file) > maxLen {
file = file[len(file)-maxLen:]
}
@@ -921,12 +925,52 @@ func traceGCScanDone() {
traceEvent(traceEvGCScanDone, -1)
}
+// traceGCSweepStart prepares to trace a sweep loop. This does not
+// emit any events until traceGCSweepSpan is called.
+//
+// traceGCSweepStart must be paired with traceGCSweepDone and there
+// must be no preemption points between these two calls.
func traceGCSweepStart() {
- traceEvent(traceEvGCSweepStart, 1)
+ // Delay the actual GCSweepStart event until the first span
+ // sweep. If we don't sweep anything, don't emit any events.
+ _p_ := getg().m.p.ptr()
+ if _p_.traceSweep {
+ throw("double traceGCSweepStart")
+ }
+ _p_.traceSweep, _p_.traceSwept, _p_.traceReclaimed = true, 0, 0
+}
+
+// traceGCSweepSpan traces the sweep of a single page.
+//
+// This may be called outside a traceGCSweepStart/traceGCSweepDone
+// pair; however, it will not emit any trace events in this case.
+func traceGCSweepSpan(bytesSwept uintptr) {
+ _p_ := getg().m.p.ptr()
+ if _p_.traceSweep {
+ if _p_.traceSwept == 0 {
+ traceEvent(traceEvGCSweepStart, 1)
+ }
+ _p_.traceSwept += bytesSwept
+ }
}
func traceGCSweepDone() {
- traceEvent(traceEvGCSweepDone, -1)
+ _p_ := getg().m.p.ptr()
+ if !_p_.traceSweep {
+ throw("missing traceGCSweepStart")
+ }
+ if _p_.traceSwept != 0 {
+ traceEvent(traceEvGCSweepDone, -1, uint64(_p_.traceSwept), uint64(_p_.traceReclaimed))
+ }
+ _p_.traceSweep = false
+}
+
+func traceGCMarkAssistStart() {
+ traceEvent(traceEvGCMarkAssistStart, 1)
+}
+
+func traceGCMarkAssistDone() {
+ traceEvent(traceEvGCMarkAssistDone, -1)
}
func traceGoCreate(newg *g, pc uintptr) {
@@ -967,7 +1011,7 @@ func traceGoPreempt() {
traceEvent(traceEvGoPreempt, 1)
}
-func traceGoPark(traceEv byte, skip int, gp *g) {
+func traceGoPark(traceEv byte, skip int) {
if traceEv&traceFutileWakeup != 0 {
traceEvent(traceEvFutileWakeup, -1)
}
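
The sweep-tracing rework in trace.go above defers the GCSweepStart event until the first span is actually swept and then reports swept and reclaimed totals on GCSweepDone, so sweep loops that find nothing add no events at all. A rough sketch of that deferred-begin pattern with hypothetical names; the real code keeps these counters on the per-P structure:

    package main

    import "fmt"

    // sweepTrace mirrors, in plain Go, the per-P fields added by the patch
    // (traceSweep, traceSwept, traceReclaimed).
    type sweepTrace struct {
        active    bool
        started   bool
        swept     uint64
        reclaimed uint64
    }

    func (t *sweepTrace) start() {
        t.active, t.started, t.swept, t.reclaimed = true, false, 0, 0
    }

    func (t *sweepTrace) span(bytesSwept, bytesReclaimed uint64) {
        if !t.active {
            return // outside a start/done pair: emit nothing
        }
        if !t.started {
            fmt.Println("event: sweep start") // emitted lazily, on first real work
            t.started = true
        }
        t.swept += bytesSwept
        t.reclaimed += bytesReclaimed
    }

    func (t *sweepTrace) done() {
        if t.started {
            fmt.Printf("event: sweep done swept=%d reclaimed=%d\n", t.swept, t.reclaimed)
        }
        t.active = false
    }

    func main() {
        var t sweepTrace

        t.start() // nothing swept: no events are emitted
        t.done()

        t.start()
        t.span(4096, 1024)
        t.span(8192, 0)
        t.done()
    }
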
diff --git a/libgo/go/runtime/trace/trace_stack_test.go b/libgo/go/runtime/trace/trace_stack_test.go
index c37b33d..274cdf7 100644
--- a/libgo/go/runtime/trace/trace_stack_test.go
+++ b/libgo/go/runtime/trace/trace_stack_test.go
@@ -151,7 +151,7 @@ func TestTraceSymbolize(t *testing.T) {
{"testing.tRunner", 0},
}},
{trace.EvGoCreate, []frame{
- {"runtime/trace_test.TestTraceSymbolize", 39},
+ {"runtime/trace_test.TestTraceSymbolize", 37},
{"testing.tRunner", 0},
}},
{trace.EvGoStop, []frame{
@@ -231,6 +231,7 @@ func TestTraceSymbolize(t *testing.T) {
if runtime.GOOS != "windows" && runtime.GOOS != "plan9" {
want = append(want, []eventDesc{
{trace.EvGoBlockNet, []frame{
+ {"internal/poll.(*FD).Accept", 0},
{"net.(*netFD).accept", 0},
{"net.(*TCPListener).accept", 0},
{"net.(*TCPListener).Accept", 0},
@@ -239,6 +240,7 @@ func TestTraceSymbolize(t *testing.T) {
{trace.EvGoSysCall, []frame{
{"syscall.read", 0},
{"syscall.Read", 0},
+ {"internal/poll.(*FD).Read", 0},
{"os.(*File).read", 0},
{"os.(*File).Read", 0},
{"runtime/trace_test.TestTraceSymbolize.func11", 102},
@@ -274,9 +276,10 @@ func TestTraceSymbolize(t *testing.T) {
continue
}
for _, f := range ev.Stk {
- t.Logf(" %v:%v", f.Fn, f.Line)
+ t.Logf(" %v :: %s:%v", f.Fn, f.File, f.Line)
}
t.Logf("---")
}
+ t.Logf("======")
}
}
diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go
index 715772e..f29ccd7 100644
--- a/libgo/go/runtime/traceback_gccgo.go
+++ b/libgo/go/runtime/traceback_gccgo.go
@@ -12,14 +12,6 @@ import (
_ "unsafe" // for go:linkname
)
-// For gccgo, use go:linkname to rename compiler-called functions to
-// themselves, so that the compiler will export them.
-// These are temporary for C runtime code to call.
-//go:linkname traceback runtime.traceback
-//go:linkname printtrace runtime.printtrace
-//go:linkname goroutineheader runtime.goroutineheader
-//go:linkname printcreatedby runtime.printcreatedby
-
func printcreatedby(gp *g) {
// Show what created goroutine, except main goroutine (goid 1).
pc := gp.gopc
@@ -71,6 +63,7 @@ func traceback(skip int32) {
var locbuf [100]location
c := c_callers(skip+1, &locbuf[0], int32(len(locbuf)), false)
printtrace(locbuf[:c], getg())
+ printcreatedby(getg())
}
// printtrace prints a traceback from locbuf.
@@ -223,7 +216,7 @@ func tracebackothers(me *g) {
print("\tgoroutine running on other thread; stack unavailable\n")
printcreatedby(gp)
} else if readgstatus(gp)&^_Gscan == _Gsyscall {
- print("\tgoroutine in C code; stack unavailable\n")
+ print("\tin C code; stack unavailable\n")
printcreatedby(gp)
} else {
gp.traceback = &tb
diff --git a/libgo/go/runtime/write_err_android.go b/libgo/go/runtime/write_err_android.go
index 748dec6..bf99b5f 100644
--- a/libgo/go/runtime/write_err_android.go
+++ b/libgo/go/runtime/write_err_android.go
@@ -144,7 +144,7 @@ func writeLogdHeader() int {
// hdr[3:7] sec unsigned uint32, little endian.
// hdr[7:11] nsec unsigned uint32, little endian.
hdr[0] = 0 // LOG_ID_MAIN
- sec, nsec := time_now()
+ sec, nsec := walltime()
packUint32(hdr[3:7], uint32(sec))
packUint32(hdr[7:11], uint32(nsec))