Diffstat (limited to 'libgo/go/runtime/proc.go')
-rw-r--r-- | libgo/go/runtime/proc.go | 647
1 file changed, 375 insertions, 272 deletions
diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go index a025137..c0e8577 100644 --- a/libgo/go/runtime/proc.go +++ b/libgo/go/runtime/proc.go @@ -548,6 +548,7 @@ func schedinit() { usestackmaps = probestackmaps() mallocinit() + fastrandinit() // must run before mcommoninit mcommoninit(_g_.m) cpuinit() // must run before alginit alginit() // maps must not be used before this call @@ -622,8 +623,8 @@ func mcommoninit(mp *m) { sched.mnext++ checkmcount() - mp.fastrand[0] = 1597334677 * uint32(mp.id) - mp.fastrand[1] = uint32(cputicks()) + mp.fastrand[0] = uint32(int64Hash(uint64(mp.id), fastrandseed)) + mp.fastrand[1] = uint32(int64Hash(uint64(cputicks()), ^fastrandseed)) if mp.fastrand[0]|mp.fastrand[1] == 0 { mp.fastrand[1] = 1 } @@ -640,6 +641,13 @@ func mcommoninit(mp *m) { unlock(&sched.lock) } +var fastrandseed uintptr + +func fastrandinit() { + s := (*[unsafe.Sizeof(fastrandseed)]byte)(unsafe.Pointer(&fastrandseed))[:] + getRandomData(s) +} + // Mark gp ready to run. func ready(gp *g, traceskip int, next bool) { if trace.enabled { @@ -704,18 +712,6 @@ func readgstatus(gp *g) uint32 { return atomic.Load(&gp.atomicstatus) } -// Ownership of gcscanvalid: -// -// If gp is running (meaning status == _Grunning or _Grunning|_Gscan), -// then gp owns gp.gcscanvalid, and other goroutines must not modify it. -// -// Otherwise, a second goroutine can lock the scan state by setting _Gscan -// in the status bit and then modify gcscanvalid, and then unlock the scan state. -// -// Note that the first condition implies an exception to the second: -// if a second goroutine changes gp's status to _Grunning|_Gscan, -// that second goroutine still does not have the right to modify gcscanvalid. - // The Gscanstatuses are acting like locks and this releases them. // If it proves to be a performance hit we should be able to make these // simple atomic stores but for now we are going to throw if @@ -732,7 +728,8 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) { case _Gscanrunnable, _Gscanwaiting, _Gscanrunning, - _Gscansyscall: + _Gscansyscall, + _Gscanpreempted: if newval == oldval&^_Gscan { success = atomic.Cas(&gp.atomicstatus, oldval, newval) } @@ -774,17 +771,6 @@ func casgstatus(gp *g, oldval, newval uint32) { }) } - if oldval == _Grunning && gp.gcscanvalid { - // If oldvall == _Grunning, then the actual status must be - // _Grunning or _Grunning|_Gscan; either way, - // we own gp.gcscanvalid, so it's safe to read. - // gp.gcscanvalid must not be true when we are running. - systemstack(func() { - print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n") - throw("casgstatus") - }) - } - // See https://golang.org/cl/21503 for justification of the yield delay. const yieldDelay = 5 * 1000 var nextYield int64 @@ -795,14 +781,6 @@ func casgstatus(gp *g, oldval, newval uint32) { if oldval == _Gwaiting && gp.atomicstatus == _Grunnable { throw("casgstatus: waiting for Gwaiting but is Grunnable") } - // Help GC if needed. - // if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) { - // gp.preemptscan = false - // systemstack(func() { - // gcphasework(gp) - // }) - // } - // But meanwhile just yield. if i == 0 { nextYield = nanotime() + yieldDelay } @@ -815,174 +793,28 @@ func casgstatus(gp *g, oldval, newval uint32) { nextYield = nanotime() + yieldDelay/2 } } - if newval == _Grunning { - gp.gcscanvalid = false - } } -// scang blocks until gp's stack has been scanned. 
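The fastrandinit hunk above seeds the per-M fastrand state from a process-wide random seed rather than from m.id and cputicks alone. A minimal standalone sketch of the byte-slice-over-a-word idiom it uses, with crypto/rand standing in for the runtime-internal getRandomData:

package main

import (
	"crypto/rand"
	"fmt"
	"unsafe"
)

var seed uintptr

func seedInit() {
	// Reinterpret &seed as a fixed-size byte array and slice it, mirroring
	// the (*[unsafe.Sizeof(fastrandseed)]byte)(...) cast in fastrandinit.
	s := (*[unsafe.Sizeof(seed)]byte)(unsafe.Pointer(&seed))[:]
	// crypto/rand.Read is the stand-in here for the runtime's getRandomData.
	if _, err := rand.Read(s); err != nil {
		panic(err)
	}
}

func main() {
	seedInit()
	fmt.Printf("seed = %#x\n", seed)
}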
-// It might be scanned by scang or it might be scanned by the goroutine itself. -// Either way, the stack scan has completed when scang returns. -func scang(gp *g, gcw *gcWork) { - // Invariant; we (the caller, markroot for a specific goroutine) own gp.gcscandone. - // Nothing is racing with us now, but gcscandone might be set to true left over - // from an earlier round of stack scanning (we scan twice per GC). - // We use gcscandone to record whether the scan has been done during this round. - - gp.gcscandone = false - - // See https://golang.org/cl/21503 for justification of the yield delay. - const yieldDelay = 10 * 1000 - var nextYield int64 - - // Endeavor to get gcscandone set to true, - // either by doing the stack scan ourselves or by coercing gp to scan itself. - // gp.gcscandone can transition from false to true when we're not looking - // (if we asked for preemption), so any time we lock the status using - // castogscanstatus we have to double-check that the scan is still not done. -loop: - for i := 0; !gp.gcscandone; i++ { - switch s := readgstatus(gp); s { - default: - dumpgstatus(gp) - throw("stopg: invalid status") - - case _Gdead: - // No stack. - gp.gcscandone = true - break loop - - case _Gcopystack: - // Stack being switched. Go around again. - - case _Gsyscall: - if usestackmaps { - // Claim goroutine by setting scan bit. - // Racing with execution or readying of gp. - // The scan bit keeps them from running - // the goroutine until we're done. - if castogscanstatus(gp, s, s|_Gscan) { - if gp.scanningself { - // Don't try to scan the stack - // if the goroutine is going to do - // it itself. - // FIXME: can this happen? - restartg(gp) - break - } - if !gp.gcscandone { - // Send a signal to let the goroutine scan - // itself. This races with enter/exitsyscall. - // If the goroutine is not stopped at a safepoint, - // it will not scan the stack and we'll try again. - mp := gp.m - noteclear(&mp.scannote) - gp.scangcw = uintptr(unsafe.Pointer(gcw)) - tgkill(getpid(), _pid_t(mp.procid), _SIGURG) - - // Wait for gp to scan its own stack. - notesleep(&mp.scannote) - - if !gp.gcscandone { - // The signal delivered at a bad time. - // Try again. - restartg(gp) - break - } - } - restartg(gp) - break loop - } - break - } - fallthrough - - case _Grunnable, _Gwaiting: - // Claim goroutine by setting scan bit. - // Racing with execution or readying of gp. - // The scan bit keeps them from running - // the goroutine until we're done. - if castogscanstatus(gp, s, s|_Gscan) { - if gp.scanningself { - // Don't try to scan the stack - // if the goroutine is going to do - // it itself. - restartg(gp) - break - } - if !gp.gcscandone { - scanstack(gp, gcw) - gp.gcscandone = true - } - restartg(gp) - break loop - } - - case _Gexitingsyscall: - // This is a transient state during which we should not scan its stack. - // Try again. - - case _Gscanwaiting: - // newstack is doing a scan for us right now. Wait. - - case _Gscanrunning: - // checkPreempt is scanning. Wait. - - case _Grunning: - // Goroutine running. Try to preempt execution so it can scan itself. - // The preemption handler (in newstack) does the actual scan. - - // Optimization: if there is already a pending preemption request - // (from the previous loop iteration), don't bother with the atomics. - if gp.preemptscan && gp.preempt { - break - } - - // Ask for preemption and self scan. 
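The scang/restartg machinery being deleted here relied on the _Gscan bit acting as a lock on the goroutine's status word (via castogscanstatus/casfrom_Gscanstatus, which remain). A rough, self-contained sketch of that claim-by-CAS pattern, using simplified stand-in status values rather than the runtime's real constants:

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	statusRunnable uint32 = 1
	statusScanBit  uint32 = 0x1000
)

// tryLockScan claims the goroutine by moving status from old to old|scanBit.
func tryLockScan(status *uint32, old uint32) bool {
	return atomic.CompareAndSwapUint32(status, old, old|statusScanBit)
}

// unlockScan releases the scan bit, restoring the underlying status.
func unlockScan(status *uint32, locked uint32) {
	if !atomic.CompareAndSwapUint32(status, locked, locked&^statusScanBit) {
		panic("status changed while scan bit was held")
	}
}

func main() {
	s := statusRunnable
	if tryLockScan(&s, statusRunnable) {
		// ... scan the stack while other transitions are blocked ...
		unlockScan(&s, statusRunnable|statusScanBit)
	}
	fmt.Printf("status = %#x\n", s)
}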
- if castogscanstatus(gp, _Grunning, _Gscanrunning) { - if !gp.gcscandone { - gp.preemptscan = true - gp.preempt = true - } - casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning) - } - } - - if i == 0 { - nextYield = nanotime() + yieldDelay - } - if nanotime() < nextYield { - procyield(10) - } else { - osyield() - nextYield = nanotime() + yieldDelay/2 - } +// casGToPreemptScan transitions gp from _Grunning to _Gscan|_Gpreempted. +// +// TODO(austin): This is the only status operation that both changes +// the status and locks the _Gscan bit. Rethink this. +func casGToPreemptScan(gp *g, old, new uint32) { + if old != _Grunning || new != _Gscan|_Gpreempted { + throw("bad g transition") } - - gp.preemptscan = false // cancel scan request if no longer needed -} - -// The GC requests that this routine be moved from a scanmumble state to a mumble state. -func restartg(gp *g) { - if gp.scang != 0 || gp.scangcw != 0 { - print("g ", gp.goid, "is being scanned scang=", gp.scang, " scangcw=", gp.scangcw, "\n") - throw("restartg: being scanned") + for !atomic.Cas(&gp.atomicstatus, _Grunning, _Gscan|_Gpreempted) { } +} - s := readgstatus(gp) - switch s { - default: - dumpgstatus(gp) - throw("restartg: unexpected status") - - case _Gdead: - // ok - - case _Gscanrunnable, - _Gscanwaiting, - _Gscansyscall: - casfrom_Gscanstatus(gp, s, s&^_Gscan) +// casGFromPreempted attempts to transition gp from _Gpreempted to +// _Gwaiting. If successful, the caller is responsible for +// re-scheduling gp. +func casGFromPreempted(gp *g, old, new uint32) bool { + if old != _Gpreempted || new != _Gwaiting { + throw("bad g transition") } + return atomic.Cas(&gp.atomicstatus, _Gpreempted, _Gwaiting) } // stopTheWorld stops all P's from executing goroutines, interrupting @@ -1001,8 +833,23 @@ func restartg(gp *g) { // goroutines. func stopTheWorld(reason string) { semacquire(&worldsema) - getg().m.preemptoff = reason - systemstack(stopTheWorldWithSema) + gp := getg() + gp.m.preemptoff = reason + systemstack(func() { + // Mark the goroutine which called stopTheWorld preemptible so its + // stack may be scanned. + // This lets a mark worker scan us while we try to stop the world + // since otherwise we could get in a mutual preemption deadlock. + // We must not modify anything on the G stack because a stack shrink + // may occur. A stack shrink is otherwise OK though because in order + // to return from this function (and to leave the system stack) we + // must have preempted all goroutines, including any attempting + // to scan our stack, in which case, any stack shrinking will + // have already completed by the time we exit. + casgstatus(gp, _Grunning, _Gwaiting) + stopTheWorldWithSema() + casgstatus(gp, _Gwaiting, _Grunning) + }) } // startTheWorld undoes the effects of stopTheWorld. @@ -1014,10 +861,31 @@ func startTheWorld() { getg().m.preemptoff = "" } -// Holding worldsema grants an M the right to try to stop the world -// and prevents gomaxprocs from changing concurrently. +// stopTheWorldGC has the same effect as stopTheWorld, but blocks +// until the GC is not running. It also blocks a GC from starting +// until startTheWorldGC is called. +func stopTheWorldGC(reason string) { + semacquire(&gcsema) + stopTheWorld(reason) +} + +// startTheWorldGC undoes the effects of stopTheWorldGC. +func startTheWorldGC() { + startTheWorld() + semrelease(&gcsema) +} + +// Holding worldsema grants an M the right to try to stop the world. 
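The new stopTheWorldGC/startTheWorldGC pair introduced in this hunk takes gcsema before worldsema and releases them in the reverse order, so a holder also keeps a GC from starting. A toy sketch of that nesting, using channel-based semaphores instead of the runtime's semacquire/semrelease:

package main

import "fmt"

type sema chan struct{}

func newSema() sema { s := make(sema, 1); s <- struct{}{}; return s }

func (s sema) acquire() { <-s }
func (s sema) release() { s <- struct{}{} }

var (
	gcSema    = newSema() // stands in for gcsema
	worldSema = newSema() // stands in for worldsema
)

func stopTheWorldGC(reason string) {
	gcSema.acquire() // block a GC from starting
	worldSema.acquire()
	fmt.Println("world stopped:", reason)
}

func startTheWorldGC() {
	worldSema.release()
	gcSema.release()
	fmt.Println("world (and GC) released")
}

func main() {
	stopTheWorldGC("example")
	startTheWorldGC()
}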
var worldsema uint32 = 1 +// Holding gcsema grants the M the right to block a GC, and blocks +// until the current GC is done. In particular, it prevents gomaxprocs +// from changing concurrently. +// +// TODO(mknyszek): Once gomaxprocs and the execution tracer can handle +// being changed/enabled during a GC, remove this. +var gcsema uint32 = 1 + // stopTheWorldWithSema is the core implementation of stopTheWorld. // The caller is responsible for acquiring worldsema and disabling // preemption first and then should stopTheWorldWithSema on the system @@ -1119,7 +987,7 @@ func stopTheWorldWithSema() { func startTheWorldWithSema(emitTraceEvent bool) int64 { mp := acquirem() // disable preemption because it can be holding p in a local var if netpollinited() { - list := netpoll(false) // non-blocking + list := netpoll(0) // non-blocking injectglist(&list) } lock(&sched.lock) @@ -1299,6 +1167,11 @@ func mexit(osStack bool) { // Free the gsignal stack. if m.gsignal != nil { stackfree(m.gsignal) + // On some platforms, when calling into VDSO (e.g. nanotime) + // we store our g on the gsignal stack, if there is one. + // Now the stack is freed, unlink it from the m, so we + // won't write to it when calling VDSO code. + m.gsignal = nil } // Remove m from allm. @@ -1637,8 +1510,6 @@ func oneNewExtraM() { // the goroutine stack ends. mp, g0SP, g0SPSize := allocm(nil, nil, true) gp := malg(true, false, nil, nil) - gp.gcscanvalid = true - gp.gcscandone = true // malg returns status as _Gidle. Change to _Gdead before // adding to allg where GC can see it. We use _Gdead to hide // this from tracebacks and stack scans since it isn't a @@ -1704,6 +1575,7 @@ func dropm() { // Return mp.curg to dead state. casgstatus(mp.curg, _Gsyscall, _Gdead) + mp.curg.preemptStop = false atomic.Xadd(&sched.ngsys, +1) // Block signals before unminit. @@ -2034,6 +1906,9 @@ func handoffp(_p_ *p) { startm(_p_, false) return } + if when := nobarrierWakeTime(_p_); when != 0 { + wakeNetPoller(when) + } pidleput(_p_) unlock(&sched.lock) } @@ -2135,14 +2010,16 @@ func gcstopm() { func execute(gp *g, inheritTime bool) { _g_ := getg() + // Assign gp.m before entering _Grunning so running Gs have an + // M. + _g_.m.curg = gp + gp.m = _g_.m casgstatus(gp, _Grunnable, _Grunning) gp.waitsince = 0 gp.preempt = false if !inheritTime { _g_.m.p.ptr().schedtick++ } - _g_.m.curg = gp - gp.m = _g_.m // Check whether the profiler needs to be turned on or off. hz := sched.profilehz @@ -2163,7 +2040,7 @@ func execute(gp *g, inheritTime bool) { } // Finds a runnable goroutine to execute. -// Tries to steal from other P's, get g from global queue, poll network. +// Tries to steal from other P's, get g from local or global queue, poll network. func findrunnable() (gp *g, inheritTime bool) { _g_ := getg() @@ -2180,6 +2057,9 @@ top: if _p_.runSafePointFn != 0 { runSafePointFn() } + + now, pollUntil, _ := checkTimers(_p_, 0) + if fingwait && fingwake { if gp := wakefing(); gp != nil { ready(gp, 0, true) @@ -2212,7 +2092,7 @@ top: // not set lastpoll yet), this thread will do blocking netpoll below // anyway. if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 { - if list := netpoll(false); !list.empty() { // non-blocking + if list := netpoll(0); !list.empty() { // non-blocking gp := list.pop() injectglist(&list) casgstatus(gp, _Gwaiting, _Grunnable) @@ -2225,12 +2105,7 @@ top: // Steal work from other P's. 
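Several hunks above switch netpoll from a boolean to a delay in nanoseconds: netpoll(0) polls without blocking, a negative delay blocks indefinitely, and a positive delay blocks at most that long. A toy, channel-based poller illustrating the same three cases (pollEvents and its work channel are invented for this sketch):

package main

import (
	"fmt"
	"time"
)

func pollEvents(work <-chan string, delay int64) (string, bool) {
	switch {
	case delay == 0:
		// Non-blocking poll, like netpoll(0).
		select {
		case w := <-work:
			return w, true
		default:
			return "", false
		}
	case delay < 0:
		// Block until something arrives, like netpoll(-1).
		return <-work, true
	default:
		// Block for at most delay nanoseconds, like netpoll(delta).
		select {
		case w := <-work:
			return w, true
		case <-time.After(time.Duration(delay)):
			return "", false
		}
	}
}

func main() {
	work := make(chan string, 1)
	work <- "ready goroutine"
	w, ok := pollEvents(work, 0)
	fmt.Println(w, ok)
}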
procs := uint32(gomaxprocs) - if atomic.Load(&sched.npidle) == procs-1 { - // Either GOMAXPROCS=1 or everybody, except for us, is idle already. - // New work can appear from returning syscall/cgocall, network or timers. - // Neither of that submits to local run queues, so no point in stealing. - goto stop - } + ranTimer := false // If number of spinning M's >= number of busy P's, block. // This is necessary to prevent excessive CPU consumption // when GOMAXPROCS>>1 but the program parallelism is low. @@ -2247,11 +2122,48 @@ top: goto top } stealRunNextG := i > 2 // first look for ready queues with more than 1 g - if gp := runqsteal(_p_, allp[enum.position()], stealRunNextG); gp != nil { + p2 := allp[enum.position()] + if _p_ == p2 { + continue + } + if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil { return gp, false } + + // Consider stealing timers from p2. + // This call to checkTimers is the only place where + // we hold a lock on a different P's timers. + // Lock contention can be a problem here, so avoid + // grabbing the lock if p2 is running and not marked + // for preemption. If p2 is running and not being + // preempted we assume it will handle its own timers. + if i > 2 && shouldStealTimers(p2) { + tnow, w, ran := checkTimers(p2, now) + now = tnow + if w != 0 && (pollUntil == 0 || w < pollUntil) { + pollUntil = w + } + if ran { + // Running the timers may have + // made an arbitrary number of G's + // ready and added them to this P's + // local run queue. That invalidates + // the assumption of runqsteal + // that is always has room to add + // stolen G's. So check now if there + // is a local G to run. + if gp, inheritTime := runqget(_p_); gp != nil { + return gp, inheritTime + } + ranTimer = true + } + } } } + if ranTimer { + // Running a timer may have made some goroutine ready. + goto top + } stop: @@ -2268,10 +2180,16 @@ stop: return gp, false } + delta := int64(-1) + if pollUntil != 0 { + // checkTimers ensures that polluntil > now. + delta = pollUntil - now + } + // wasm only: // If a callback returned and no other goroutine is awake, // then pause execution until a callback was triggered. - if beforeIdle() { + if beforeIdle(delta) { // At least one goroutine got woken. goto top } @@ -2359,21 +2277,35 @@ stop: } // poll network - if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Xchg64(&sched.lastpoll, 0) != 0 { + if netpollinited() && (atomic.Load(&netpollWaiters) > 0 || pollUntil != 0) && atomic.Xchg64(&sched.lastpoll, 0) != 0 { + atomic.Store64(&sched.pollUntil, uint64(pollUntil)) if _g_.m.p != 0 { throw("findrunnable: netpoll with p") } if _g_.m.spinning { throw("findrunnable: netpoll with spinning") } - list := netpoll(true) // block until new work is available + if faketime != 0 { + // When using fake time, just poll. + delta = 0 + } + list := netpoll(delta) // block until new work is available + atomic.Store64(&sched.pollUntil, 0) atomic.Store64(&sched.lastpoll, uint64(nanotime())) - if !list.empty() { - lock(&sched.lock) - _p_ = pidleget() - unlock(&sched.lock) - if _p_ != nil { - acquirep(_p_) + if faketime != 0 && list.empty() { + // Using fake time and nothing is ready; stop M. + // When all M's stop, checkdead will call timejump. 
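The delta computed above turns the earliest timer deadline (pollUntil) into the argument for the blocking netpoll call: -1 when no timer is pending, otherwise the nanoseconds remaining until that timer fires. The same arithmetic as a standalone helper, with an illustrative name:

package main

import (
	"fmt"
	"time"
)

// pollDelay mirrors the delta computation in findrunnable: pollUntil and now
// are nanosecond timestamps, and a zero pollUntil means "no pending timer".
func pollDelay(pollUntil, now int64) int64 {
	if pollUntil == 0 {
		return -1 // block indefinitely
	}
	// checkTimers guarantees pollUntil > now, so this stays positive.
	return pollUntil - now
}

func main() {
	now := time.Now().UnixNano()
	fmt.Println(pollDelay(0, now))                              // -1: no timers
	fmt.Println(pollDelay(now+int64(50*time.Millisecond), now)) // 50ms in ns
}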
+ stopm() + goto top + } + lock(&sched.lock) + _p_ = pidleget() + unlock(&sched.lock) + if _p_ == nil { + injectglist(&list) + } else { + acquirep(_p_) + if !list.empty() { gp := list.pop() injectglist(&list) casgstatus(gp, _Gwaiting, _Grunnable) @@ -2382,7 +2314,16 @@ stop: } return gp, false } - injectglist(&list) + if wasSpinning { + _g_.m.spinning = true + atomic.Xadd(&sched.nmspinning, 1) + } + goto top + } + } else if pollUntil != 0 && netpollinited() { + pollerPollUntil := int64(atomic.Load64(&sched.pollUntil)) + if pollerPollUntil == 0 || pollerPollUntil > pollUntil { + netpollBreak() } } stopm() @@ -2402,7 +2343,7 @@ func pollWork() bool { return true } if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll != 0 { - if list := netpoll(false); !list.empty() { + if list := netpoll(0); !list.empty() { injectglist(&list) return true } @@ -2410,6 +2351,22 @@ func pollWork() bool { return false } +// wakeNetPoller wakes up the thread sleeping in the network poller, +// if there is one, and if it isn't going to wake up anyhow before +// the when argument. +func wakeNetPoller(when int64) { + if atomic.Load64(&sched.lastpoll) == 0 { + // In findrunnable we ensure that when polling the pollUntil + // field is either zero or the time to which the current + // poll is expected to run. This can have a spurious wakeup + // but should never miss a wakeup. + pollerPollUntil := int64(atomic.Load64(&sched.pollUntil)) + if pollerPollUntil == 0 || pollerPollUntil > when { + netpollBreak() + } + } +} + func resetspinning() { _g_ := getg() if !_g_.m.spinning { @@ -2474,14 +2431,26 @@ func schedule() { } top: + pp := _g_.m.p.ptr() + pp.preempt = false + if sched.gcwaiting != 0 { gcstopm() goto top } - if _g_.m.p.ptr().runSafePointFn != 0 { + if pp.runSafePointFn != 0 { runSafePointFn() } + // Sanity check: if we are spinning, the run queue should be empty. + // Check this before calling checkTimers, as that might call + // goready to put a ready goroutine on the local run queue. + if _g_.m.spinning && (pp.runnext != 0 || pp.runqhead != pp.runqtail) { + throw("schedule: spinning with local work") + } + + checkTimers(pp, 0) + var gp *g var inheritTime bool @@ -2513,9 +2482,8 @@ top: } if gp == nil { gp, inheritTime = runqget(_g_.m.p.ptr()) - if gp != nil && _g_.m.spinning { - throw("schedule: spinning with local work") - } + // We can see gp != nil here even if the M is spinning, + // if checkTimers added a local goroutine via goready. // Because gccgo does not implement preemption as a stack check, // we need to check for preemption here for fairness. @@ -2591,6 +2559,62 @@ func dropg() { setGNoWB(&_g_.m.curg, nil) } +// checkTimers runs any timers for the P that are ready. +// If now is not 0 it is the current time. +// It returns the current time or 0 if it is not known, +// and the time when the next timer should run or 0 if there is no next timer, +// and reports whether it ran any timers. +// If the time when the next timer should run is not 0, +// it is always larger than the returned time. +// We pass now in and out to avoid extra calls of nanotime. +//go:yeswritebarrierrec +func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) { + lock(&pp.timersLock) + + adjusttimers(pp) + + rnow = now + if len(pp.timers) > 0 { + if rnow == 0 { + rnow = nanotime() + } + for len(pp.timers) > 0 { + // Note that runtimer may temporarily unlock + // pp.timersLock. 
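wakeNetPoller, added just below, only interrupts a sleeping poller when the new deadline is earlier than the one the poller already plans to wake for, accepting spurious wakeups but never missing one. A simplified sketch of that decision, with a buffered channel standing in for netpollBreak:

package main

import (
	"fmt"
	"sync/atomic"
)

var pollUntil int64 // 0 means the poller has no wake-up deadline
var breakCh = make(chan struct{}, 1)

// wakePoller interrupts the poller only if it would otherwise sleep past
// `when`; a spurious wakeup is acceptable, a missed one is not.
func wakePoller(when int64) {
	cur := atomic.LoadInt64(&pollUntil)
	if cur == 0 || cur > when {
		select {
		case breakCh <- struct{}{}: // like netpollBreak
		default: // a break is already pending
		}
	}
}

func main() {
	atomic.StoreInt64(&pollUntil, 2_000)
	wakePoller(1_000) // earlier deadline: wake the poller
	fmt.Println("pending breaks:", len(breakCh))
}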
+ if tw := runtimer(pp, rnow); tw != 0 { + if tw > 0 { + pollUntil = tw + } + break + } + ran = true + } + } + + unlock(&pp.timersLock) + + return rnow, pollUntil, ran +} + +// shouldStealTimers reports whether we should try stealing the timers from p2. +// We don't steal timers from a running P that is not marked for preemption, +// on the assumption that it will run its own timers. This reduces +// contention on the timers lock. +func shouldStealTimers(p2 *p) bool { + if p2.status != _Prunning { + return true + } + mp := p2.m.ptr() + if mp == nil || mp.locks > 0 { + return false + } + gp := mp.curg + if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt { + return false + } + return true +} + func parkunlock_c(gp *g, lock unsafe.Pointer) bool { unlock((*mutex)(lock)) return true @@ -2604,8 +2628,8 @@ func park_m(gp *g) { traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip) } - dropg() casgstatus(gp, _Grunning, _Gwaiting) + dropg() if fn := _g_.m.waitunlockf; fn != nil { ok := fn(gp, _g_.m.waitlock) @@ -2628,8 +2652,8 @@ func goschedImpl(gp *g) { dumpgstatus(gp) throw("bad g status") } - dropg() casgstatus(gp, _Grunning, _Grunnable) + dropg() lock(&sched.lock) globrunqput(gp) unlock(&sched.lock) @@ -2648,7 +2672,7 @@ func gosched_m(gp *g) { // goschedguarded is a forbidden-states-avoided version of gosched_m func goschedguarded_m(gp *g) { - if gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" || gp.m.p.ptr().status != _Prunning { + if !canPreemptM(gp.m) { gogo(gp) // never return } @@ -2665,6 +2689,47 @@ func gopreempt_m(gp *g) { goschedImpl(gp) } +// preemptPark parks gp and puts it in _Gpreempted. +// +//go:systemstack +func preemptPark(gp *g) { + if trace.enabled { + traceGoPark(traceEvGoBlock, 0) + } + status := readgstatus(gp) + if status&^_Gscan != _Grunning { + dumpgstatus(gp) + throw("bad g status") + } + gp.waitreason = waitReasonPreempted + // Transition from _Grunning to _Gscan|_Gpreempted. We can't + // be in _Grunning when we dropg because then we'd be running + // without an M, but the moment we're in _Gpreempted, + // something could claim this G before we've fully cleaned it + // up. Hence, we set the scan bit to lock down further + // transitions until we can dropg. + casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted) + dropg() + casfrom_Gscanstatus(gp, _Gscan|_Gpreempted, _Gpreempted) + schedule() +} + +// goyield is like Gosched, but it: +// - does not emit a GoSched trace event +// - puts the current G on the runq of the current P instead of the globrunq +func goyield() { + checkTimeouts() + mcall(goyield_m) +} + +func goyield_m(gp *g) { + pp := gp.m.p.ptr() + casgstatus(gp, _Grunning, _Grunnable) + dropg() + runqput(pp, gp, false) + schedule() +} + // Finishes execution of the current goroutine. func goexit1() { if trace.enabled { @@ -2687,6 +2752,7 @@ func goexit0(gp *g) { gp.lockedm = 0 _g_.m.lockedg = 0 gp.entry = nil + gp.preemptStop = false gp.paniconfault = false gp._defer = nil // should be true already but just in case. gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data. @@ -2705,9 +2771,6 @@ func goexit0(gp *g) { gp.gcAssistBytes = 0 } - // Note that gp's stack scan is now "valid" because it has no - // stack. 
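The loop at the top of this hunk keeps calling runtimer until it reports the deadline of the first timer that is not yet due (a nonzero return), which then bounds how long findrunnable may poll. A toy version of that contract using a sorted slice instead of the runtime's per-P timer heap:

package main

import (
	"fmt"
	"sort"
)

type timer struct {
	when int64
	f    func()
}

// runReadyTimers runs due timers and returns the next deadline (0 if none)
// plus whether any timer ran, mirroring checkTimers' (pollUntil, ran) pair.
func runReadyTimers(timers *[]timer, now int64) (next int64, ran bool) {
	sort.Slice(*timers, func(i, j int) bool { return (*timers)[i].when < (*timers)[j].when })
	for len(*timers) > 0 {
		t := (*timers)[0]
		if t.when > now {
			return t.when, ran // first not-yet-due timer bounds the poll
		}
		*timers = (*timers)[1:]
		t.f()
		ran = true
	}
	return 0, ran
}

func main() {
	ts := []timer{
		{when: 5, f: func() { fmt.Println("t=5 fired") }},
		{when: 50, f: func() {}},
	}
	next, ran := runReadyTimers(&ts, 10)
	fmt.Println("next:", next, "ran:", ran)
}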
- gp.gcscanvalid = true dropg() if GOARCH == "wasm" { // no threads yet on wasm @@ -3215,7 +3278,6 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g { if isSystemGoroutine(newg, false) { atomic.Xadd(&sched.ngsys, +1) } - newg.gcscanvalid = false casgstatus(newg, _Gdead, _Grunnable) if _p_.goidcache == _p_.goidcacheend { @@ -3736,6 +3798,20 @@ func (pp *p) destroy() { globrunqputhead(pp.runnext.ptr()) pp.runnext = 0 } + if len(pp.timers) > 0 { + plocal := getg().m.p.ptr() + // The world is stopped, but we acquire timersLock to + // protect against sysmon calling timeSleepUntil. + // This is the only case where we hold the timersLock of + // more than one P, so there are no deadlock concerns. + lock(&plocal.timersLock) + lock(&pp.timersLock) + moveTimers(plocal, pp.timers) + pp.timers = nil + pp.adjustTimers = 0 + unlock(&pp.timersLock) + unlock(&plocal.timersLock) + } // If there's a background worker, make it runnable and put // it on the global queue so it can clean itself up. if gp := pp.gcBgMarkWorker.ptr(); gp != nil { @@ -3761,14 +3837,18 @@ func (pp *p) destroy() { pp.deferpoolbuf[i] = nil } pp.deferpool = pp.deferpoolbuf[:0] + systemstack(func() { + for i := 0; i < pp.mspancache.len; i++ { + // Safe to call since the world is stopped. + mheap_.spanalloc.free(unsafe.Pointer(pp.mspancache.buf[i])) + } + pp.mspancache.len = 0 + pp.pcache.flush(&mheap_.pages) + }) freemcache(pp.mcache) pp.mcache = nil gfpurge(pp) traceProcFree(pp) - if raceenabled { - raceprocdestroy(pp.raceprocctx) - pp.raceprocctx = 0 - } pp.gcAssistTime = 0 pp.status = _Pdead } @@ -4016,7 +4096,8 @@ func checkdead() { } s := readgstatus(gp) switch s &^ _Gscan { - case _Gwaiting: + case _Gwaiting, + _Gpreempted: grunning++ case _Grunnable, _Grunning, @@ -4028,17 +4109,18 @@ func checkdead() { } unlock(&allglock) if grunning == 0 { // possible if main goroutine calls runtime·Goexit() + unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang throw("no goroutines (main called runtime.Goexit) - deadlock!") } // Maybe jump time forward for playground. - gp := timejump() - if gp != nil { - casgstatus(gp, _Gwaiting, _Grunnable) - globrunqput(gp) - _p_ := pidleget() - if _p_ == nil { - throw("checkdead: no p for timer") + _p_ := timejump() + if _p_ != nil { + for pp := &sched.pidle; *pp != 0; pp = &(*pp).ptr().link { + if (*pp).ptr() == _p_ { + *pp = _p_.link + break + } } mp := mget() if mp == nil { @@ -4051,7 +4133,15 @@ func checkdead() { return } + // There are no goroutines running, so we can look at the P's. + for _, _p_ := range allp { + if len(_p_.timers) > 0 { + return + } + } + getg().m.throwing = -1 // do not dump full stacks + unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang throw("all goroutines are asleep - deadlock!") } @@ -4084,32 +4174,34 @@ func sysmon() { delay = 10 * 1000 } usleep(delay) + now := nanotime() + next := timeSleepUntil() if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) { lock(&sched.lock) if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) { - atomic.Store(&sched.sysmonwait, 1) - unlock(&sched.lock) - // Make wake-up period small enough - // for the sampling to be correct. 
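In the (*p).destroy hunk above, the dying P's timers are moved onto the current P while both per-P timersLocks are held; since the world is stopped and this is the only place two of these locks are taken together, the fixed local-then-remote order cannot deadlock. A simplified standalone sketch of that hand-off with toy types:

package main

import (
	"fmt"
	"sync"
)

type proc struct {
	mu     sync.Mutex // stands in for p.timersLock
	timers []int64    // just deadlines, for the sketch
}

func moveTimers(dst, src *proc) {
	dst.mu.Lock() // local P first...
	src.mu.Lock() // ...then the P being destroyed
	dst.timers = append(dst.timers, src.timers...)
	src.timers = nil
	src.mu.Unlock()
	dst.mu.Unlock()
}

func main() {
	local := &proc{timers: []int64{100}}
	dying := &proc{timers: []int64{20, 30}}
	moveTimers(local, dying)
	fmt.Println(local.timers, dying.timers)
}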
- maxsleep := forcegcperiod / 2 - shouldRelax := true - if osRelaxMinNS > 0 { - next := timeSleepUntil() - now := nanotime() - if next-now < osRelaxMinNS { - shouldRelax = false + if next > now { + atomic.Store(&sched.sysmonwait, 1) + unlock(&sched.lock) + // Make wake-up period small enough + // for the sampling to be correct. + sleep := forcegcperiod / 2 + if next-now < sleep { + sleep = next - now } + shouldRelax := sleep >= osRelaxMinNS + if shouldRelax { + osRelax(true) + } + notetsleep(&sched.sysmonnote, sleep) + if shouldRelax { + osRelax(false) + } + now = nanotime() + next = timeSleepUntil() + lock(&sched.lock) + atomic.Store(&sched.sysmonwait, 0) + noteclear(&sched.sysmonnote) } - if shouldRelax { - osRelax(true) - } - notetsleep(&sched.sysmonnote, maxsleep) - if shouldRelax { - osRelax(false) - } - lock(&sched.lock) - atomic.Store(&sched.sysmonwait, 0) - noteclear(&sched.sysmonnote) idle = 0 delay = 20 } @@ -4121,10 +4213,9 @@ func sysmon() { } // poll network if not polled for more than 10ms lastpoll := int64(atomic.Load64(&sched.lastpoll)) - now := nanotime() if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now { atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now)) - list := netpoll(false) // non-blocking - returns list of goroutines + list := netpoll(0) // non-blocking - returns list of goroutines if !list.empty() { // Need to decrement number of idle locked M's // (pretending that one more is running) before injectglist. @@ -4138,6 +4229,12 @@ func sysmon() { incidlelocked(1) } } + if next < now { + // There are timers that should have already run, + // perhaps because there is an unpreemptible P. + // Try to start an M to run them. + startm(nil, false) + } // retake P's blocked in syscalls // and preempt long running G's if retake(now) != 0 { @@ -4296,6 +4393,12 @@ func preemptone(_p_ *p) bool { // and a few other places, which is at least better than doing // nothing at all. + // Request an async preemption of this P. + if preemptMSupported && debug.asyncpreemptoff == 0 { + _p_.preempt = true + preemptM(mp) + } + return true } @@ -4324,7 +4427,7 @@ func schedtrace(detailed bool) { if mp != nil { id = mp.id } - print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, "\n") + print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, " timerslen=", len(_p_.timers), "\n") } else { // In non-detailed mode format lengths of per-P run queues as: // [len1 len2 len3 len4] |
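The rewritten sysmon block near the top of this hunk sleeps for the smaller of half the forced-GC period and the time until the next timer, and only enters osRelax when that sleep is long enough to be worthwhile. The same arithmetic as a standalone helper (the constants here are illustrative, not the runtime's exact values):

package main

import (
	"fmt"
	"time"
)

const (
	forcegcperiod = int64(2 * time.Minute)      // illustrative
	osRelaxMinNS  = int64(9 * time.Millisecond) // illustrative, OS-dependent in the runtime
)

// sysmonSleep returns how long sysmon should sleep and whether the sleep is
// long enough to ask the OS to relax its timer resolution.
func sysmonSleep(now, next int64) (sleep int64, relax bool) {
	sleep = forcegcperiod / 2
	if next-now < sleep {
		sleep = next - now
	}
	return sleep, sleep >= osRelaxMinNS
}

func main() {
	now := time.Now().UnixNano()
	s, relax := sysmonSleep(now, now+int64(5*time.Millisecond))
	fmt.Println(time.Duration(s), relax)
}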