author     Ian Lance Taylor <ian@gcc.gnu.org>        2013-07-16 06:54:42 +0000
committer  Ian Lance Taylor <ian@gcc.gnu.org>        2013-07-16 06:54:42 +0000
commit     be47d6eceffd2c5dbbc1566d5eea490527fb2bd4 (patch)
tree       0e8fda573576bb4181dba29d0e88380a8c38fafd /libgo/runtime
parent     efb30cdeb003fd7c585ee0d7657340086abcbd9e (diff)
libgo: Update to Go 1.1.1.
From-SVN: r200974
Diffstat (limited to 'libgo/runtime')
39 files changed, 4078 insertions, 1354 deletions
diff --git a/libgo/runtime/chan.c b/libgo/runtime/chan.c index a79ee9e..6f52a1d 100644 --- a/libgo/runtime/chan.c +++ b/libgo/runtime/chan.c @@ -35,6 +35,8 @@ struct WaitQ SudoG* last; }; +// The garbage collector is assuming that Hchan can only contain pointers into the stack +// and cannot contain pointers into the heap. struct Hchan { uintgo qcount; // total data in the q @@ -49,6 +51,8 @@ struct Hchan Lock; }; +uint32 runtime_Hchansize = sizeof(Hchan); + // Buffer follows Hchan immediately in memory. // chanbuf(c, i) is pointer to the i'th slot in the buffer. #define chanbuf(c, i) ((byte*)((c)+1)+(uintptr)(c)->elemsize*(i)) @@ -107,6 +111,7 @@ runtime_makechan_c(ChanType *t, int64 hint) c->elemsize = elem->__size; c->elemalign = elem->__align; c->dataqsiz = hint; + runtime_settype(c, (uintptr)t | TypeInfo_Chan); if(debug) runtime_printf("makechan: chan=%p; elemsize=%D; elemalign=%d; dataqsiz=%D\n", @@ -875,16 +880,27 @@ sellock(Select *sel) static void selunlock(Select *sel) { - uint32 i; - Hchan *c, *c0; + int32 i, n, r; + Hchan *c; - c = nil; - for(i=sel->ncase; i-->0;) { - c0 = sel->lockorder[i]; - if(c0 && c0 != c) { - c = c0; - runtime_unlock(c); - } + // We must be very careful here to not touch sel after we have unlocked + // the last lock, because sel can be freed right after the last unlock. + // Consider the following situation. + // First M calls runtime_park() in runtime_selectgo() passing the sel. + // Once runtime_park() has unlocked the last lock, another M makes + // the G that calls select runnable again and schedules it for execution. + // When the G runs on another M, it locks all the locks and frees sel. + // Now if the first M touches sel, it will access freed memory. + n = (int32)sel->ncase; + r = 0; + // skip the default case + if(n>0 && sel->lockorder[0] == nil) + r = 1; + for(i = n-1; i >= r; i--) { + c = sel->lockorder[i]; + if(i>0 && sel->lockorder[i-1] == c) + continue; // will unlock it on the next iteration + runtime_unlock(c); } } @@ -910,7 +926,7 @@ static int selectgo(Select **selp) { Select *sel; - uint32 o, i, j; + uint32 o, i, j, k; Scase *cas, *dfl; Hchan *c; SudoG *sg; @@ -946,12 +962,42 @@ selectgo(Select **selp) } // sort the cases by Hchan address to get the locking order. + // simple heap sort, to guarantee n log n time and constant stack footprint. 
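
The new comment above announces that the per-case insertion sort of channel addresses has been replaced by a heap sort with guaranteed n log n time and a constant stack footprint; the sift-up and sift-down passes follow immediately below in the same hunk. As a standalone illustration (not part of the patch), here is a minimal C sketch of that same heap-sort pattern applied to a plain array of pointers, with illustrative names:

/* Standalone sketch: heap-sort an array of pointers in ascending
   order using the same sift-up (build) and sift-down (extract)
   passes as the selectgo() lock-ordering code in the hunk above. */
#include <stdio.h>
#include <stddef.h>

static void
heapsort_ptrs(void **a, size_t n)
{
	size_t i, j, k;
	void *c;

	/* Build a max-heap by sifting each element up. */
	for(i = 0; i < n; i++) {
		j = i;
		c = a[j];
		while(j > 0 && a[k = (j-1)/2] < c) {
			a[j] = a[k];
			j = k;
		}
		a[j] = c;
	}
	/* Repeatedly move the maximum to the end and sift the hole down. */
	for(i = n; i-- > 0; ) {
		c = a[i];
		a[i] = a[0];
		j = 0;
		for(;;) {
			k = j*2 + 1;
			if(k >= i)
				break;
			if(k+1 < i && a[k] < a[k+1])
				k++;
			if(c < a[k]) {
				a[j] = a[k];
				j = k;
				continue;
			}
			break;
		}
		a[j] = c;
	}
}

int
main(void)
{
	int dummy[5];
	void *chans[5] = { &dummy[3], &dummy[0], &dummy[4], &dummy[1], &dummy[2] };
	size_t i;

	heapsort_ptrs(chans, 5);
	for(i = 0; i < 5; i++)
		printf("%p\n", chans[i]);
	return 0;
}

Sorting by address gives every goroutine the same lock-acquisition order, which is what keeps the sellock/selunlock pair above deadlock-free.
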
for(i=0; i<sel->ncase; i++) { - c = sel->scase[i].chan; - for(j=i; j>0 && sel->lockorder[j-1] >= c; j--) - sel->lockorder[j] = sel->lockorder[j-1]; + j = i; + c = sel->scase[j].chan; + while(j > 0 && sel->lockorder[k=(j-1)/2] < c) { + sel->lockorder[j] = sel->lockorder[k]; + j = k; + } sel->lockorder[j] = c; } + for(i=sel->ncase; i-->0; ) { + c = sel->lockorder[i]; + sel->lockorder[i] = sel->lockorder[0]; + j = 0; + for(;;) { + k = j*2+1; + if(k >= i) + break; + if(k+1 < i && sel->lockorder[k] < sel->lockorder[k+1]) + k++; + if(c < sel->lockorder[k]) { + sel->lockorder[j] = sel->lockorder[k]; + j = k; + continue; + } + break; + } + sel->lockorder[j] = c; + } + /* + for(i=0; i+1<sel->ncase; i++) + if(sel->lockorder[i] > sel->lockorder[i+1]) { + runtime_printf("i=%d %p %p\n", i, sel->lockorder[i], sel->lockorder[i+1]); + runtime_throw("select: broken sort"); + } + */ sellock(sel); loop: @@ -1048,7 +1094,7 @@ loop: c = cas->chan; if(c->dataqsiz > 0) - runtime_throw("selectgo: shouldnt happen"); + runtime_throw("selectgo: shouldn't happen"); if(debug) runtime_printf("wait-return: sel=%p c=%p cas=%p kind=%d\n", diff --git a/libgo/runtime/cpuprof.c b/libgo/runtime/cpuprof.c index 3ef08ef..5163873 100644 --- a/libgo/runtime/cpuprof.c +++ b/libgo/runtime/cpuprof.c @@ -121,7 +121,9 @@ static uintptr eod[3] = {0, 1, 0}; // LostProfileData is a no-op function used in profiles // to mark the number of profiling stack traces that were // discarded due to slow data writers. -static void LostProfileData(void) { +static void +LostProfileData(void) +{ } extern void runtime_SetCPUProfileRate(intgo) @@ -365,7 +367,7 @@ getprofile(Profile *p) return ret; // Wait for new log. - runtime_entersyscall(); + runtime_entersyscallblock(); runtime_notesleep(&p->wait); runtime_exitsyscall(); runtime_noteclear(&p->wait); diff --git a/libgo/runtime/go-main.c b/libgo/runtime/go-main.c index 97d1405..77233d3 100644 --- a/libgo/runtime/go-main.c +++ b/libgo/runtime/go-main.c @@ -30,9 +30,6 @@ extern char **environ; -extern void runtime_main (void); -static void mainstart (void *); - /* The main function. 
*/ int @@ -42,13 +39,7 @@ main (int argc, char **argv) runtime_args (argc, (byte **) argv); runtime_osinit (); runtime_schedinit (); - __go_go (mainstart, NULL); + __go_go (runtime_main, NULL); runtime_mstart (runtime_m ()); abort (); } - -static void -mainstart (void *arg __attribute__ ((unused))) -{ - runtime_main (); -} diff --git a/libgo/runtime/go-map-index.c b/libgo/runtime/go-map-index.c index a602d2a..499641c 100644 --- a/libgo/runtime/go-map-index.c +++ b/libgo/runtime/go-map-index.c @@ -98,7 +98,7 @@ __go_map_index (struct __go_map *map, const void *key, _Bool insert) key_descriptor = descriptor->__map_descriptor->__key_type; key_offset = descriptor->__key_offset; key_size = key_descriptor->__size; - __go_assert (key_size != 0 && key_size != -1UL); + __go_assert (key_size != -1UL); equalfn = key_descriptor->__equalfn; key_hash = key_descriptor->__hashfn (key, key_size); diff --git a/libgo/runtime/go-reflect-map.c b/libgo/runtime/go-reflect-map.c index 3697537..1ae7c96 100644 --- a/libgo/runtime/go-reflect-map.c +++ b/libgo/runtime/go-reflect-map.c @@ -238,3 +238,12 @@ makemap (const struct __go_map_type *t) __builtin_memcpy (ret, &map, sizeof (void *)); return (uintptr_t) ret; } + +extern _Bool ismapkey (const struct __go_type_descriptor *) + __asm__ (GOSYM_PREFIX "reflect.ismapkey"); + +_Bool +ismapkey (const struct __go_type_descriptor *typ) +{ + return typ != NULL && typ->__hashfn != __go_type_hash_error; +} diff --git a/libgo/runtime/go-signal.c b/libgo/runtime/go-signal.c index 1965e05..1e80057 100644 --- a/libgo/runtime/go-signal.c +++ b/libgo/runtime/go-signal.c @@ -12,6 +12,7 @@ #include "runtime.h" #include "go-assert.h" #include "go-panic.h" +#include "signal_unix.h" #ifndef SA_RESTART #define SA_RESTART 0 @@ -157,12 +158,15 @@ runtime_badsignal(int32 sig) /* Handle a signal, for cases where we don't panic. We can split the stack here. */ -static void -sig_handler (int sig) +void +runtime_sighandler (int sig, Siginfo *info, + void *context __attribute__ ((unused)), G *gp) { + M *m; int i; - if (runtime_m () == NULL) + m = runtime_m (); + if (m == NULL) { runtime_badsignal (sig); return; @@ -171,7 +175,8 @@ sig_handler (int sig) #ifdef SIGPROF if (sig == SIGPROF) { - runtime_sigprof (); + if (gp != runtime_m ()->g0 && gp != runtime_m ()->gsignal) + runtime_sigprof (); return; } #endif @@ -179,13 +184,18 @@ sig_handler (int sig) for (i = 0; runtime_sigtab[i].sig != -1; ++i) { SigTab *t; + bool notify, crash; t = &runtime_sigtab[i]; if (t->sig != sig) continue; - if ((t->flags & SigNotify) != 0) + notify = false; +#ifdef SA_SIGINFO + notify = info != NULL && info->si_code == SI_USER; +#endif + if (notify || (t->flags & SigNotify) != 0) { if (__go_sigsend (sig)) return; @@ -210,9 +220,15 @@ sig_handler (int sig) runtime_printf ("%s\n", name); } + if (m->lockedg != NULL && m->ncgo > 0 && gp == m->g0) + { + runtime_printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } + runtime_printf ("\n"); - if (runtime_gotraceback ()) + if (runtime_gotraceback (&crash)) { G *g; @@ -225,6 +241,9 @@ sig_handler (int sig) a readable form. */ } + if (crash) + runtime_crash (); + runtime_exit (2); } @@ -259,15 +278,14 @@ sig_panic_leadin (int sig) permitted to split the stack. 
*/ static void -sig_panic_info_handler (int sig, siginfo_t *info, - void *context __attribute__ ((unused))) +sig_panic_info_handler (int sig, Siginfo *info, void *context) { G *g; g = runtime_g (); if (g == NULL || info->si_code == SI_USER) { - sig_handler (sig); + runtime_sighandler (sig, info, context, g); return; } @@ -331,7 +349,7 @@ sig_panic_handler (int sig) g = runtime_g (); if (g == NULL) { - sig_handler (sig); + runtime_sighandler (sig, NULL, NULL, g); return; } @@ -373,10 +391,10 @@ sig_panic_handler (int sig) the stack. */ static void -sig_tramp (int) __attribute__ ((no_split_stack)); +sig_tramp_info (int, Siginfo *, void *) __attribute__ ((no_split_stack)); static void -sig_tramp (int sig) +sig_tramp_info (int sig, Siginfo *info, void *context) { G *gp; M *mp; @@ -403,7 +421,7 @@ sig_tramp (int sig) #endif } - sig_handler (sig); + runtime_sighandler (sig, info, context, gp); /* We are going to return back to the signal trampoline and then to whatever we were doing before we got the signal. Restore the @@ -418,8 +436,20 @@ sig_tramp (int sig) } } +#ifndef SA_SIGINFO + +static void sig_tramp (int sig) __attribute__ ((no_split_stack)); + +static void +sig_tramp (int sig) +{ + sig_tramp_info (sig, NULL, NULL); +} + +#endif + void -runtime_setsig (int32 i, bool def __attribute__ ((unused)), bool restart) +runtime_setsig (int32 i, GoSighandler *fn, bool restart) { struct sigaction sa; int r; @@ -434,17 +464,30 @@ runtime_setsig (int32 i, bool def __attribute__ ((unused)), bool restart) if ((t->flags & SigPanic) == 0) { +#ifdef SA_SIGINFO + sa.sa_flags = SA_ONSTACK | SA_SIGINFO; + if (fn == runtime_sighandler) + fn = (void *) sig_tramp_info; + sa.sa_sigaction = (void *) fn; +#else sa.sa_flags = SA_ONSTACK; - sa.sa_handler = sig_tramp; + if (fn == runtime_sighandler) + fn = (void *) sig_tramp; + sa.sa_handler = (void *) fn; +#endif } else { #ifdef SA_SIGINFO sa.sa_flags = SA_SIGINFO; - sa.sa_sigaction = sig_panic_info_handler; + if (fn == runtime_sighandler) + fn = (void *) sig_panic_info_handler; + sa.sa_sigaction = (void *) fn; #else sa.sa_flags = 0; - sa.sa_handler = sig_panic_handler; + if (fn == runtime_sighandler) + fn = (void *) sig_panic_handler; + sa.sa_handler = (void *) fn; #endif } @@ -455,6 +498,37 @@ runtime_setsig (int32 i, bool def __attribute__ ((unused)), bool restart) __go_assert (0); } +GoSighandler* +runtime_getsig (int32 i) +{ + struct sigaction sa; + int r; + SigTab *t; + + memset (&sa, 0, sizeof sa); + + r = sigemptyset (&sa.sa_mask); + __go_assert (r == 0); + + t = &runtime_sigtab[i]; + + if (sigaction (t->sig, NULL, &sa) != 0) + runtime_throw ("sigaction read failure"); + + if ((void *) sa.sa_handler == sig_tramp_info) + return runtime_sighandler; +#ifdef SA_SIGINFO + if ((void *) sa.sa_handler == sig_panic_info_handler) + return runtime_sighandler; +#else + if ((void *) sa.sa_handler == sig_tramp + || (void *) sa.sa_handler == sig_panic_handler) + return runtime_sighandler; +#endif + + return (void *) sa.sa_handler; +} + /* Used by the os package to raise SIGPIPE. */ void os_sigpipe (void) __asm__ (GOSYM_PREFIX "os.sigpipe"); diff --git a/libgo/runtime/go-unsafe-new.c b/libgo/runtime/go-unsafe-new.c index 54788f1..7848642 100644 --- a/libgo/runtime/go-unsafe-new.c +++ b/libgo/runtime/go-unsafe-new.c @@ -21,14 +21,5 @@ void *unsafe_New (const struct __go_type_descriptor *) void * unsafe_New (const struct __go_type_descriptor *descriptor) { - uint32 flag; - void *ret; - - flag = (descriptor->__code & GO_NO_POINTERS) != 0 ? 
FlagNoPointers : 0; - ret = runtime_mallocgc (descriptor->__size, flag, 1, 1); - - if (UseSpanType && flag == 0) - runtime_settype (ret, (uintptr) descriptor | TypeInfo_SingleObject); - - return ret; + return runtime_cnew (descriptor); } diff --git a/libgo/runtime/go-unsafe-newarray.c b/libgo/runtime/go-unsafe-newarray.c index e4fb336..f5c5efc 100644 --- a/libgo/runtime/go-unsafe-newarray.c +++ b/libgo/runtime/go-unsafe-newarray.c @@ -21,21 +21,5 @@ void *unsafe_NewArray (const struct __go_type_descriptor *, intgo) void * unsafe_NewArray (const struct __go_type_descriptor *descriptor, intgo n) { - uint64 size; - void *ret; - - size = n * descriptor->__size; - if (size == 0) - ret = &runtime_zerobase; - else if ((descriptor->__code & GO_NO_POINTERS) != 0) - ret = runtime_mallocgc (size, FlagNoPointers, 1, 1); - else - { - ret = runtime_mallocgc (size, 0, 1, 1); - - if (UseSpanType) - runtime_settype (ret, (uintptr) descriptor | TypeInfo_Array); - } - - return ret; + return runtime_cnewarray (descriptor, n); } diff --git a/libgo/runtime/lock_futex.c b/libgo/runtime/lock_futex.c index 5374aff..4b9651a 100644 --- a/libgo/runtime/lock_futex.c +++ b/libgo/runtime/lock_futex.c @@ -41,7 +41,7 @@ runtime_lock(Lock *l) runtime_throw("runtime_lock: lock count"); // Speculative grab for lock. - v = runtime_xchg(&l->key, MUTEX_LOCKED); + v = runtime_xchg((uint32*)&l->key, MUTEX_LOCKED); if(v == MUTEX_UNLOCKED) return; @@ -64,7 +64,7 @@ runtime_lock(Lock *l) // Try for lock, spinning. for(i = 0; i < spin; i++) { while(l->key == MUTEX_UNLOCKED) - if(runtime_cas(&l->key, MUTEX_UNLOCKED, wait)) + if(runtime_cas((uint32*)&l->key, MUTEX_UNLOCKED, wait)) return; runtime_procyield(ACTIVE_SPIN_CNT); } @@ -72,17 +72,17 @@ runtime_lock(Lock *l) // Try for lock, rescheduling. for(i=0; i < PASSIVE_SPIN; i++) { while(l->key == MUTEX_UNLOCKED) - if(runtime_cas(&l->key, MUTEX_UNLOCKED, wait)) + if(runtime_cas((uint32*)&l->key, MUTEX_UNLOCKED, wait)) return; runtime_osyield(); } // Sleep. - v = runtime_xchg(&l->key, MUTEX_SLEEPING); + v = runtime_xchg((uint32*)&l->key, MUTEX_SLEEPING); if(v == MUTEX_UNLOCKED) return; wait = MUTEX_SLEEPING; - runtime_futexsleep(&l->key, MUTEX_SLEEPING, -1); + runtime_futexsleep((uint32*)&l->key, MUTEX_SLEEPING, -1); } } @@ -94,11 +94,11 @@ runtime_unlock(Lock *l) if(--runtime_m()->locks < 0) runtime_throw("runtime_unlock: lock count"); - v = runtime_xchg(&l->key, MUTEX_UNLOCKED); + v = runtime_xchg((uint32*)&l->key, MUTEX_UNLOCKED); if(v == MUTEX_UNLOCKED) runtime_throw("unlock of unlocked lock"); if(v == MUTEX_SLEEPING) - runtime_futexwakeup(&l->key, 1); + runtime_futexwakeup((uint32*)&l->key, 1); } // One-time notifications. 
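
The lock_futex.c hunk above only adds (uint32*) casts (Lock.key is now shared, as a uintptr, with the semaphore-based implementation in lock_sema.c below), but it also shows the shape of the algorithm: a speculative atomic exchange, a bounded spin with compare-and-swap, then sleeping until woken. A minimal, portable C11 sketch of that shape, using sched_yield() as a stand-in for futexsleep/futexwakeup (the names and constants here are illustrative, not the runtime's API):

/* Sketch of the speculative-grab / spin / yield pattern used by
   runtime_lock() above.  The real code sleeps on a futex instead of
   yielding in a loop. */
#include <stdatomic.h>
#include <sched.h>

enum { UNLOCKED = 0, LOCKED = 1, ACTIVE_SPIN = 4, SPIN_CNT = 30 };

typedef struct { atomic_uint key; } SketchLock;

static void
sketch_lock(SketchLock *l)
{
	unsigned v, i;

	/* Speculative grab. */
	v = atomic_exchange(&l->key, LOCKED);
	if(v == UNLOCKED)
		return;

	for(;;) {
		/* Try for the lock, spinning briefly. */
		for(i = 0; i < ACTIVE_SPIN*SPIN_CNT; i++) {
			unsigned expect = UNLOCKED;
			if(atomic_load(&l->key) == UNLOCKED &&
			   atomic_compare_exchange_weak(&l->key, &expect, LOCKED))
				return;
		}
		/* Give up the CPU; the runtime would futexsleep() here. */
		sched_yield();
	}
}

static void
sketch_unlock(SketchLock *l)
{
	atomic_store(&l->key, UNLOCKED);
	/* The runtime would futexwakeup() a sleeper here if one exists. */
}

int
main(void)
{
	SketchLock l;

	atomic_init(&l.key, UNLOCKED);
	sketch_lock(&l);
	sketch_unlock(&l);
	return 0;
}
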
@@ -111,9 +111,9 @@ runtime_noteclear(Note *n) void runtime_notewakeup(Note *n) { - if(runtime_xchg(&n->key, 1)) + if(runtime_xchg((uint32*)&n->key, 1)) runtime_throw("notewakeup - double wakeup"); - runtime_futexwakeup(&n->key, 1); + runtime_futexwakeup((uint32*)&n->key, 1); } void @@ -121,8 +121,8 @@ runtime_notesleep(Note *n) { if(runtime_m()->profilehz > 0) runtime_setprof(false); - while(runtime_atomicload(&n->key) == 0) - runtime_futexsleep(&n->key, 0, -1); + while(runtime_atomicload((uint32*)&n->key) == 0) + runtime_futexsleep((uint32*)&n->key, 0, -1); if(runtime_m()->profilehz > 0) runtime_setprof(true); } @@ -137,15 +137,15 @@ runtime_notetsleep(Note *n, int64 ns) return; } - if(runtime_atomicload(&n->key) != 0) + if(runtime_atomicload((uint32*)&n->key) != 0) return; if(runtime_m()->profilehz > 0) runtime_setprof(false); deadline = runtime_nanotime() + ns; for(;;) { - runtime_futexsleep(&n->key, 0, ns); - if(runtime_atomicload(&n->key) != 0) + runtime_futexsleep((uint32*)&n->key, 0, ns); + if(runtime_atomicload((uint32*)&n->key) != 0) break; now = runtime_nanotime(); if(now >= deadline) diff --git a/libgo/runtime/lock_sema.c b/libgo/runtime/lock_sema.c index 8c4b397..2663c54 100644 --- a/libgo/runtime/lock_sema.c +++ b/libgo/runtime/lock_sema.c @@ -43,7 +43,7 @@ runtime_lock(Lock *l) runtime_throw("runtime_lock: lock count"); // Speculative grab for lock. - if(runtime_casp(&l->waitm, nil, (void*)LOCKED)) + if(runtime_casp((void**)&l->key, nil, (void*)LOCKED)) return; if(m->waitsema == 0) @@ -56,10 +56,10 @@ runtime_lock(Lock *l) spin = ACTIVE_SPIN; for(i=0;; i++) { - v = (uintptr)runtime_atomicloadp(&l->waitm); + v = (uintptr)runtime_atomicloadp((void**)&l->key); if((v&LOCKED) == 0) { unlocked: - if(runtime_casp(&l->waitm, (void*)v, (void*)(v|LOCKED))) + if(runtime_casp((void**)&l->key, (void*)v, (void*)(v|LOCKED))) return; i = 0; } @@ -74,9 +74,9 @@ unlocked: // Queue this M. for(;;) { m->nextwaitm = (void*)(v&~LOCKED); - if(runtime_casp(&l->waitm, (void*)v, (void*)((uintptr)m|LOCKED))) + if(runtime_casp((void**)&l->key, (void*)v, (void*)((uintptr)m|LOCKED))) break; - v = (uintptr)runtime_atomicloadp(&l->waitm); + v = (uintptr)runtime_atomicloadp((void**)&l->key); if((v&LOCKED) == 0) goto unlocked; } @@ -99,15 +99,15 @@ runtime_unlock(Lock *l) runtime_throw("runtime_unlock: lock count"); for(;;) { - v = (uintptr)runtime_atomicloadp(&l->waitm); + v = (uintptr)runtime_atomicloadp((void**)&l->key); if(v == LOCKED) { - if(runtime_casp(&l->waitm, (void*)LOCKED, nil)) + if(runtime_casp((void**)&l->key, (void*)LOCKED, nil)) break; } else { // Other M's are waiting for the lock. // Dequeue an M. mp = (void*)(v&~LOCKED); - if(runtime_casp(&l->waitm, (void*)v, mp->nextwaitm)) { + if(runtime_casp((void**)&l->key, (void*)v, mp->nextwaitm)) { // Dequeued an M. Wake it. runtime_semawakeup(mp); break; @@ -120,7 +120,7 @@ runtime_unlock(Lock *l) void runtime_noteclear(Note *n) { - n->waitm = nil; + n->key = 0; } void @@ -129,8 +129,8 @@ runtime_notewakeup(Note *n) M *mp; do - mp = runtime_atomicloadp(&n->waitm); - while(!runtime_casp(&n->waitm, mp, (void*)LOCKED)); + mp = runtime_atomicloadp((void**)&n->key); + while(!runtime_casp((void**)&n->key, mp, (void*)LOCKED)); // Successfully set waitm to LOCKED. // What was it before? 
@@ -153,8 +153,8 @@ runtime_notesleep(Note *n) m = runtime_m(); if(m->waitsema == 0) m->waitsema = runtime_semacreate(); - if(!runtime_casp(&n->waitm, nil, m)) { // must be LOCKED (got wakeup) - if(n->waitm != (void*)LOCKED) + if(!runtime_casp((void**)&n->key, nil, m)) { // must be LOCKED (got wakeup) + if(n->key != LOCKED) runtime_throw("notesleep - waitm out of sync"); return; } @@ -183,8 +183,8 @@ runtime_notetsleep(Note *n, int64 ns) m->waitsema = runtime_semacreate(); // Register for wakeup on n->waitm. - if(!runtime_casp(&n->waitm, nil, m)) { // must be LOCKED (got wakeup already) - if(n->waitm != (void*)LOCKED) + if(!runtime_casp((void**)&n->key, nil, m)) { // must be LOCKED (got wakeup already) + if(n->key != LOCKED) runtime_throw("notetsleep - waitm out of sync"); return; } @@ -219,10 +219,10 @@ runtime_notetsleep(Note *n, int64 ns) // so that any notewakeup racing with the return does not // try to grant us the semaphore when we don't expect it. for(;;) { - mp = runtime_atomicloadp(&n->waitm); + mp = runtime_atomicloadp((void**)&n->key); if(mp == m) { // No wakeup yet; unregister if possible. - if(runtime_casp(&n->waitm, mp, nil)) + if(runtime_casp((void**)&n->key, mp, nil)) return; } else if(mp == (M*)LOCKED) { // Wakeup happened so semaphore is available. diff --git a/libgo/runtime/malloc.goc b/libgo/runtime/malloc.goc index a484642..dfab683 100644 --- a/libgo/runtime/malloc.goc +++ b/libgo/runtime/malloc.goc @@ -18,7 +18,7 @@ package runtime #include "go-type.h" #include "race.h" -MHeap runtime_mheap; +MHeap *runtime_mheap; int32 runtime_checking; @@ -46,7 +46,7 @@ runtime_mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) g = runtime_g(); if(g->status == Gsyscall) dogc = 0; - if(runtime_gcwaiting && g != m->g0 && m->locks == 0 && g->status != Gsyscall) { + if(runtime_gcwaiting && g != m->g0 && m->locks == 0 && dogc) { runtime_gosched(); m = runtime_m(); } @@ -78,7 +78,7 @@ runtime_mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) npages = size >> PageShift; if((size & PageMask) != 0) npages++; - s = runtime_MHeap_Alloc(&runtime_mheap, npages, 0, 1, zeroed); + s = runtime_MHeap_Alloc(runtime_mheap, npages, 0, 1, zeroed); if(s == nil) runtime_throw("out of memory"); size = npages<<PageShift; @@ -92,9 +92,9 @@ runtime_mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) if (sizeof(void*) == 4 && c->local_total_alloc >= (1<<30)) { // purge cache stats to prevent overflow - runtime_lock(&runtime_mheap); + runtime_lock(runtime_mheap); runtime_purgecachedstats(c); - runtime_unlock(&runtime_mheap); + runtime_unlock(runtime_mheap); } if(!(flag & FlagNoGC)) @@ -175,17 +175,17 @@ __go_free(void *v) if(sizeclass == 0) { // Large object. size = s->npages<<PageShift; - *(uintptr*)(s->start<<PageShift) = 1; // mark as "needs to be zeroed" + *(uintptr*)(s->start<<PageShift) = (uintptr)0xfeedfeedfeedfeedll; // mark as "needs to be zeroed" // Must mark v freed before calling unmarkspan and MHeap_Free: // they might coalesce v into other spans and change the bitmap further. runtime_markfreed(v, size); runtime_unmarkspan(v, 1<<PageShift); - runtime_MHeap_Free(&runtime_mheap, s, 1); + runtime_MHeap_Free(runtime_mheap, s, 1); } else { // Small object. 
size = runtime_class_to_size[sizeclass]; if(size > sizeof(uintptr)) - ((uintptr*)v)[1] = 1; // mark as "needs to be zeroed" + ((uintptr*)v)[1] = (uintptr)0xfeedfeedfeedfeedll; // mark as "needs to be zeroed" // Must mark v freed before calling MCache_Free: // it might coalesce v and other blocks into a bigger span // and change the bitmap further. @@ -213,12 +213,12 @@ runtime_mlookup(void *v, byte **base, uintptr *size, MSpan **sp) m->mcache->local_nlookup++; if (sizeof(void*) == 4 && m->mcache->local_nlookup >= (1<<30)) { // purge cache stats to prevent overflow - runtime_lock(&runtime_mheap); + runtime_lock(runtime_mheap); runtime_purgecachedstats(m->mcache); - runtime_unlock(&runtime_mheap); + runtime_unlock(runtime_mheap); } - s = runtime_MHeap_LookupMaybe(&runtime_mheap, v); + s = runtime_MHeap_LookupMaybe(runtime_mheap, v); if(sp) *sp = s; if(s == nil) { @@ -262,11 +262,11 @@ runtime_allocmcache(void) intgo rate; MCache *c; - runtime_lock(&runtime_mheap); - c = runtime_FixAlloc_Alloc(&runtime_mheap.cachealloc); - mstats.mcache_inuse = runtime_mheap.cachealloc.inuse; - mstats.mcache_sys = runtime_mheap.cachealloc.sys; - runtime_unlock(&runtime_mheap); + runtime_lock(runtime_mheap); + c = runtime_FixAlloc_Alloc(&runtime_mheap->cachealloc); + mstats.mcache_inuse = runtime_mheap->cachealloc.inuse; + mstats.mcache_sys = runtime_mheap->cachealloc.sys; + runtime_unlock(runtime_mheap); runtime_memclr((byte*)c, sizeof(*c)); // Set first allocation sample size. @@ -283,10 +283,10 @@ void runtime_freemcache(MCache *c) { runtime_MCache_ReleaseAll(c); - runtime_lock(&runtime_mheap); + runtime_lock(runtime_mheap); runtime_purgecachedstats(c); - runtime_FixAlloc_Free(&runtime_mheap.cachealloc, c); - runtime_unlock(&runtime_mheap); + runtime_FixAlloc_Free(&runtime_mheap->cachealloc, c); + runtime_unlock(runtime_mheap); } void @@ -334,9 +334,15 @@ runtime_mallocinit(void) USED(arena_size); USED(bitmap_size); + if((runtime_mheap = runtime_SysAlloc(sizeof(*runtime_mheap))) == nil) + runtime_throw("runtime: cannot allocate heap metadata"); + runtime_InitSizes(); - limit = runtime_memlimit(); + // limit = runtime_memlimit(); + // See https://code.google.com/p/go/issues/detail?id=5049 + // TODO(rsc): Fix after 1.1. + limit = 0; // Set up the allocation arena, a contiguous area of memory where // allocated data will be found. The arena begins with a bitmap large @@ -414,13 +420,13 @@ runtime_mallocinit(void) if((uintptr)p & (((uintptr)1<<PageShift)-1)) runtime_throw("runtime: SysReserve returned unaligned address"); - runtime_mheap.bitmap = p; - runtime_mheap.arena_start = p + bitmap_size; - runtime_mheap.arena_used = runtime_mheap.arena_start; - runtime_mheap.arena_end = runtime_mheap.arena_start + arena_size; + runtime_mheap->bitmap = p; + runtime_mheap->arena_start = p + bitmap_size; + runtime_mheap->arena_used = runtime_mheap->arena_start; + runtime_mheap->arena_end = runtime_mheap->arena_start + arena_size; // Initialize the rest of the allocator. - runtime_MHeap_Init(&runtime_mheap, runtime_SysAlloc); + runtime_MHeap_Init(runtime_mheap, runtime_SysAlloc); runtime_m()->mcache = runtime_allocmcache(); // See if it works. 
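
runtime_mallocinit above now allocates the MHeap itself with runtime_SysAlloc and then reserves a single contiguous arena (bitmap followed by heap pages) whose pages are only committed as they are used; the corresponding mmap changes to SysAlloc/SysMap appear in the mem.c hunk later in this diff. A small Linux-flavoured sketch of that reserve-then-commit pattern (sizes and flags here are illustrative):

/* Sketch of the reserve-then-commit arena pattern (SysReserve /
   SysMap): reserve a large range of address space without backing
   memory, then commit pages as they are needed. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int
main(void)
{
	size_t reserve = 1UL << 30;        /* reserve 1 GB of address space */
	size_t page = 1UL << 12;

	/* PROT_NONE + MAP_NORESERVE: address space only, no memory yet. */
	void *arena = mmap(NULL, reserve, PROT_NONE,
	                   MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0);
	if(arena == MAP_FAILED) {
		perror("mmap reserve");
		return 1;
	}

	/* Commit the first page.  Note PROT_READ|PROT_WRITE only: the
	   mem.c hunk further down also drops PROT_EXEC from the
	   runtime's heap mappings. */
	if(mprotect(arena, page, PROT_READ|PROT_WRITE) != 0) {
		perror("mprotect commit");
		return 1;
	}
	memset(arena, 0, page);
	printf("arena reserved at %p, first page committed\n", arena);
	return 0;
}
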
@@ -519,8 +525,8 @@ runtime_settype_flush(M *mp, bool sysalloc) // (Manually inlined copy of runtime_MHeap_Lookup) p = (uintptr)v>>PageShift; if(sizeof(void*) == 8) - p -= (uintptr)runtime_mheap.arena_start >> PageShift; - s = runtime_mheap.map[p]; + p -= (uintptr)runtime_mheap->arena_start >> PageShift; + s = runtime_mheap->map[p]; if(s->sizeclass == 0) { s->types.compression = MTypes_Single; @@ -537,9 +543,11 @@ runtime_settype_flush(M *mp, bool sysalloc) nbytes3 = 8*sizeof(uintptr) + 1*ntypes; if(!sysalloc) { - data3 = runtime_mallocgc(nbytes3, FlagNoPointers, 0, 1); + data3 = runtime_mallocgc(nbytes3, FlagNoProfiling|FlagNoPointers, 0, 1); } else { data3 = runtime_SysAlloc(nbytes3); + if(data3 == nil) + runtime_throw("runtime: cannot allocate memory"); if(0) runtime_printf("settype(0->3): SysAlloc(%x) --> %p\n", (uint32)nbytes3, data3); } @@ -573,9 +581,11 @@ runtime_settype_flush(M *mp, bool sysalloc) nbytes2 = ntypes * sizeof(uintptr); if(!sysalloc) { - data2 = runtime_mallocgc(nbytes2, FlagNoPointers, 0, 1); + data2 = runtime_mallocgc(nbytes2, FlagNoProfiling|FlagNoPointers, 0, 1); } else { data2 = runtime_SysAlloc(nbytes2); + if(data2 == nil) + runtime_throw("runtime: cannot allocate memory"); if(0) runtime_printf("settype.(3->2): SysAlloc(%x) --> %p\n", (uint32)nbytes2, data2); } @@ -633,7 +643,7 @@ runtime_settype(void *v, uintptr t) } if(DebugTypeAtBlockEnd) { - s = runtime_MHeap_Lookup(&runtime_mheap, v); + s = runtime_MHeap_Lookup(runtime_mheap, v); *(uintptr*)((uintptr)v+s->elemsize-sizeof(uintptr)) = t; } } @@ -672,7 +682,7 @@ runtime_gettype(void *v) uintptr t, ofs; byte *data; - s = runtime_MHeap_LookupMaybe(&runtime_mheap, v); + s = runtime_MHeap_LookupMaybe(runtime_mheap, v); if(s != nil) { t = 0; switch(s->types.compression) { @@ -731,9 +741,8 @@ runtime_new(const Type *typ) ret = runtime_mallocgc(typ->__size, flag, 1, 1); if(UseSpanType && !flag) { - if(false) { + if(false) runtime_printf("new %S: %p\n", *typ->__reflection, ret); - } runtime_settype(ret, (uintptr)typ | TypeInfo_SingleObject); } } @@ -741,6 +750,45 @@ runtime_new(const Type *typ) return ret; } +static void* +cnew(const Type *typ, intgo n, int32 objtyp) +{ + uint32 flag; + void *ret; + + if((objtyp&(PtrSize-1)) != objtyp) + runtime_throw("runtime: invalid objtyp"); + if(n < 0 || (typ->__size > 0 && (uintptr)n > (MaxMem/typ->__size))) + runtime_panicstring("runtime: allocation size out of range"); + if(typ->__size == 0 || n == 0) { + // All 0-length allocations use this pointer. + // The language does not require the allocations to + // have distinct values. + return &runtime_zerobase; + } + flag = typ->__code&GO_NO_POINTERS ? 
FlagNoPointers : 0; + ret = runtime_mallocgc(typ->__size*n, flag, 1, 1); + if(UseSpanType && !flag) { + if(false) + runtime_printf("cnew [%D]%S: %p\n", (int64)n, *typ->__reflection, ret); + runtime_settype(ret, (uintptr)typ | TypeInfo_SingleObject); + } + return ret; +} + +// same as runtime_new, but callable from C +void* +runtime_cnew(const Type *typ) +{ + return cnew(typ, 1, TypeInfo_SingleObject); +} + +void* +runtime_cnewarray(const Type *typ, intgo n) +{ + return cnew(typ, n, TypeInfo_Array); +} + func GC() { runtime_gc(1); } diff --git a/libgo/runtime/malloc.h b/libgo/runtime/malloc.h index 7ebb762..ebea34e 100644 --- a/libgo/runtime/malloc.h +++ b/libgo/runtime/malloc.h @@ -86,6 +86,7 @@ typedef struct MSpan MSpan; typedef struct MStats MStats; typedef struct MLink MLink; typedef struct MTypes MTypes; +typedef struct GCStats GCStats; enum { @@ -114,10 +115,18 @@ enum HeapAllocChunk = 1<<20, // Chunk size for heap growth // Number of bits in page to span calculations (4k pages). - // On 64-bit, we limit the arena to 128GB, or 37 bits. + // On Windows 64-bit we limit the arena to 32GB or 35 bits (see below for reason). + // On other 64-bit platforms, we limit the arena to 128GB, or 37 bits. // On 32-bit, we don't bother limiting anything, so we use the full 32-bit address. #if __SIZEOF_POINTER__ == 8 +#ifdef GOOS_windows + // Windows counts memory used by page table into committed memory + // of the process, so we can't reserve too much memory. + // See http://golang.org/issue/5402 and http://golang.org/issue/5236. + MHeapMap_Bits = 35 - PageShift, +#else MHeapMap_Bits = 37 - PageShift, +#endif #else MHeapMap_Bits = 32 - PageShift, #endif @@ -133,7 +142,7 @@ enum // This must be a #define instead of an enum because it // is so large. #if __SIZEOF_POINTER__ == 8 -#define MaxMem (1ULL<<(MHeapMap_Bits+PageShift)) /* 128 GB */ +#define MaxMem (1ULL<<(MHeapMap_Bits+PageShift)) /* 128 GB or 32 GB */ #else #define MaxMem ((uintptr)-1) #endif @@ -229,7 +238,7 @@ struct MStats uint64 buckhash_sys; // profiling bucket hash table // Statistics about garbage collector. - // Protected by stopping the world during GC. + // Protected by mheap or stopping the world during GC. uint64 next_gc; // next GC (in heap_alloc time) uint64 last_gc; // last GC (in absolute time) uint64 pause_total_ns; @@ -249,7 +258,6 @@ struct MStats extern MStats mstats __asm__ (GOSYM_PREFIX "runtime.VmemStats"); - // Size classes. Computed and initialized by InitSizes. // // SizeToClass(0 <= n <= MaxSmallSize) returns the size class, @@ -416,18 +424,18 @@ struct MHeap byte *arena_end; // central free lists for small size classes. - // the union makes sure that the MCentrals are + // the padding makes sure that the MCentrals are // spaced CacheLineSize bytes apart, so that each MCentral.Lock // gets its own cache line. 
- union { + struct { MCentral; - byte pad[CacheLineSize]; + byte pad[64]; } central[NumSizeClasses]; FixAlloc spanalloc; // allocator for Span* FixAlloc cachealloc; // allocator for MCache* }; -extern MHeap runtime_mheap; +extern MHeap *runtime_mheap; void runtime_MHeap_Init(MHeap *h, void *(*allocator)(uintptr)); MSpan* runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed); @@ -452,8 +460,8 @@ void runtime_unmarkspan(void *v, uintptr size); bool runtime_blockspecial(void*); void runtime_setblockspecial(void*, bool); void runtime_purgecachedstats(MCache*); -void* runtime_new(const Type *); -#define runtime_cnew(T) runtime_new(T) +void* runtime_cnew(const Type*); +void* runtime_cnewarray(const Type*, intgo); void runtime_settype(void*, uintptr); void runtime_settype_flush(M*, bool); @@ -493,6 +501,7 @@ enum TypeInfo_SingleObject = 0, TypeInfo_Array = 1, TypeInfo_Map = 2, + TypeInfo_Chan = 3, // Enables type information at the end of blocks allocated from heap DebugTypeAtBlockEnd = 0, @@ -504,4 +513,5 @@ void runtime_gc_itab_ptr(Eface*); void runtime_memorydump(void); +void runtime_proc_scan(void (*)(Obj)); void runtime_time_scan(void (*)(Obj)); diff --git a/libgo/runtime/mcache.c b/libgo/runtime/mcache.c index 570c06a..45bac4f 100644 --- a/libgo/runtime/mcache.c +++ b/libgo/runtime/mcache.c @@ -21,7 +21,7 @@ runtime_MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed) l = &c->list[sizeclass]; if(l->list == nil) { // Replenish using central lists. - n = runtime_MCentral_AllocList(&runtime_mheap.central[sizeclass], + n = runtime_MCentral_AllocList(&runtime_mheap->central[sizeclass], runtime_class_to_transfercount[sizeclass], &first); if(n == 0) runtime_throw("out of memory"); @@ -69,7 +69,7 @@ ReleaseN(MCache *c, MCacheList *l, int32 n, int32 sizeclass) c->size -= n*runtime_class_to_size[sizeclass]; // Return them to central free list. - runtime_MCentral_FreeList(&runtime_mheap.central[sizeclass], n, first); + runtime_MCentral_FreeList(&runtime_mheap->central[sizeclass], n, first); } void diff --git a/libgo/runtime/mcentral.c b/libgo/runtime/mcentral.c index b405438..b3108a1 100644 --- a/libgo/runtime/mcentral.c +++ b/libgo/runtime/mcentral.c @@ -108,7 +108,7 @@ MCentral_Free(MCentral *c, void *v) int32 size; // Find span for v. 
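
Near the top of the malloc.h hunk above, MHeap.central changes from a union into a struct padded to 64 bytes, so that each per-size-class MCentral and its Lock sit on their own cache line and concurrent allocators do not contend on neighbouring locks (false sharing). A tiny sketch of the same padding trick, assuming a 64-byte cache line:

/* Sketch: pad per-entry state to a cache line so that two CPUs
   updating neighbouring entries do not ping-pong the same line.
   64 is an assumed cache-line size, matching the pad[64] above. */
#include <stdio.h>

#define CACHE_LINE 64

struct counter {
	unsigned long n;
};

struct padded_counter {
	struct counter c;
	char pad[CACHE_LINE - sizeof(struct counter)];
};

static _Alignas(CACHE_LINE) struct padded_counter counters[4];

int
main(void)
{
	printf("sizeof(struct counter)        = %zu\n", sizeof(struct counter));
	printf("sizeof(struct padded_counter) = %zu\n", sizeof(struct padded_counter));
	printf("stride between entries        = %zu\n",
	       (size_t)((char*)&counters[1] - (char*)&counters[0]));
	/* Each counters[i] starts on its own cache line, so concurrent
	   updates to different entries never share a line. */
	return 0;
}
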
- s = runtime_MHeap_Lookup(&runtime_mheap, v); + s = runtime_MHeap_Lookup(runtime_mheap, v); if(s == nil || s->ref == 0) runtime_throw("invalid free"); @@ -133,7 +133,7 @@ MCentral_Free(MCentral *c, void *v) s->freelist = nil; c->nfree -= (s->npages << PageShift) / size; runtime_unlock(c); - runtime_MHeap_Free(&runtime_mheap, s, 0); + runtime_MHeap_Free(runtime_mheap, s, 0); runtime_lock(c); } } @@ -168,7 +168,7 @@ runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *e c->nfree -= (s->npages << PageShift) / size; runtime_unlock(c); runtime_unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift); - runtime_MHeap_Free(&runtime_mheap, s, 0); + runtime_MHeap_Free(runtime_mheap, s, 0); } else { runtime_unlock(c); } @@ -200,7 +200,7 @@ MCentral_Grow(MCentral *c) runtime_unlock(c); runtime_MGetSizeClassInfo(c->sizeclass, &size, &npages, &n); - s = runtime_MHeap_Alloc(&runtime_mheap, npages, c->sizeclass, 0, 1); + s = runtime_MHeap_Alloc(runtime_mheap, npages, c->sizeclass, 0, 1); if(s == nil) { // TODO(rsc): Log out of memory runtime_lock(c); diff --git a/libgo/runtime/mem.c b/libgo/runtime/mem.c index e606bdd..8481e95 100644 --- a/libgo/runtime/mem.c +++ b/libgo/runtime/mem.c @@ -78,7 +78,7 @@ runtime_SysAlloc(uintptr n) fd = dev_zero; #endif - p = runtime_mmap(nil, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, fd, 0); + p = runtime_mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, fd, 0); if (p == MAP_FAILED) { if(errno == EACCES) { runtime_printf("runtime: mmap: access denied\n"); @@ -169,7 +169,7 @@ runtime_SysMap(void *v, uintptr n) // On 64-bit, we don't actually have v reserved, so tread carefully. if(sizeof(void*) == 8 && (uintptr)v >= 0xffffffffU) { - p = mmap_fixed(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, fd, 0); + p = mmap_fixed(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, fd, 0); if(p == MAP_FAILED && errno == ENOMEM) runtime_throw("runtime: out of memory"); if(p != v) { @@ -179,7 +179,9 @@ runtime_SysMap(void *v, uintptr n) return; } - p = runtime_mmap(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_FIXED|MAP_PRIVATE, fd, 0); + p = runtime_mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, fd, 0); + if(p == MAP_FAILED && errno == ENOMEM) + runtime_throw("runtime: out of memory"); if(p != v) runtime_throw("runtime: cannot map pages in arena address space"); } diff --git a/libgo/runtime/mfixalloc.c b/libgo/runtime/mfixalloc.c index 109cfe8..6e4f0c6 100644 --- a/libgo/runtime/mfixalloc.c +++ b/libgo/runtime/mfixalloc.c @@ -30,6 +30,11 @@ void* runtime_FixAlloc_Alloc(FixAlloc *f) { void *v; + + if(f->size == 0) { + runtime_printf("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n"); + runtime_throw("runtime: internal error"); + } if(f->list) { v = f->list; diff --git a/libgo/runtime/mgc0.c b/libgo/runtime/mgc0.c index 88283cc..36afd2b 100644 --- a/libgo/runtime/mgc0.c +++ b/libgo/runtime/mgc0.c @@ -21,8 +21,11 @@ #define tab __methods // Eface aka __go_empty_interface. #define type __type_descriptor +// Hmap aka __go_map +typedef struct __go_map Hmap; // Type aka __go_type_descriptor #define kind __code +#define string __reflection #define KindPtr GO_PTR #define KindNoPointers GO_NO_POINTERS // PtrType aka __go_ptr_type @@ -41,6 +44,9 @@ extern void * __splitstack_find_context (void *context[10], size_t *, void **, enum { Debug = 0, DebugMark = 0, // run second pass to check mark + CollectStats = 0, + ScanStackByFrames = 0, + IgnorePreciseGC = 0, // Four bits per word (see #defines below). 
wordsPerBitmapWord = sizeof(void*)*8/4, @@ -147,6 +153,7 @@ static Workbuf* getempty(Workbuf*); static Workbuf* getfull(Workbuf*); static void putempty(Workbuf*); static Workbuf* handoff(Workbuf*); +static void gchelperstart(void); static struct { uint64 full; // lock-free list of full blocks @@ -170,11 +177,114 @@ static struct { } work; enum { - // TODO(atom): to be expanded in a next CL GC_DEFAULT_PTR = GC_NUM_INSTR, + GC_MAP_NEXT, + GC_CHAN, + + GC_NUM_INSTR2 }; -// PtrTarget and BitTarget are structures used by intermediate buffers. +static struct { + struct { + uint64 sum; + uint64 cnt; + } ptr; + uint64 nbytes; + struct { + uint64 sum; + uint64 cnt; + uint64 notype; + uint64 typelookup; + } obj; + uint64 rescan; + uint64 rescanbytes; + uint64 instr[GC_NUM_INSTR2]; + uint64 putempty; + uint64 getfull; +} gcstats; + +// markonly marks an object. It returns true if the object +// has been marked by this function, false otherwise. +// This function doesn't append the object to any buffer. +static bool +markonly(void *obj) +{ + byte *p; + uintptr *bitp, bits, shift, x, xbits, off; + MSpan *s; + PageID k; + + // Words outside the arena cannot be pointers. + if((byte*)obj < runtime_mheap->arena_start || (byte*)obj >= runtime_mheap->arena_used) + return false; + + // obj may be a pointer to a live object. + // Try to find the beginning of the object. + + // Round down to word boundary. + obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); + + // Find bits for this word. + off = (uintptr*)obj - (uintptr*)runtime_mheap->arena_start; + bitp = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + xbits = *bitp; + bits = xbits >> shift; + + // Pointing at the beginning of a block? + if((bits & (bitAllocated|bitBlockBoundary)) != 0) + goto found; + + // Otherwise consult span table to find beginning. + // (Manually inlined copy of MHeap_LookupMaybe.) + k = (uintptr)obj>>PageShift; + x = k; + if(sizeof(void*) == 8) + x -= (uintptr)runtime_mheap->arena_start>>PageShift; + s = runtime_mheap->map[x]; + if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) + return false; + p = (byte*)((uintptr)s->start<<PageShift); + if(s->sizeclass == 0) { + obj = p; + } else { + if((byte*)obj >= (byte*)s->limit) + return false; + uintptr size = s->elemsize; + int32 i = ((byte*)obj - p)/size; + obj = p+i*size; + } + + // Now that we know the object header, reload bits. + off = (uintptr*)obj - (uintptr*)runtime_mheap->arena_start; + bitp = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + xbits = *bitp; + bits = xbits >> shift; + +found: + // Now we have bits, bitp, and shift correct for + // obj pointing at the base of the object. + // Only care about allocated and not marked. + if((bits & (bitAllocated|bitMarked)) != bitAllocated) + return false; + if(work.nproc == 1) + *bitp |= bitMarked<<shift; + else { + for(;;) { + x = *bitp; + if(x & (bitMarked<<shift)) + return false; + if(runtime_casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift)))) + break; + } + } + + // The object is now marked + return true; +} + +// PtrTarget is a structure used by intermediate buffers. // The intermediate buffers hold GC data before it // is moved/flushed to the work buffer (Workbuf). 
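
The new markonly() above locates the mark bits for an object and then sets bitMarked either directly (single collector thread) or via a compare-and-swap retry loop, so that with several GC workers exactly one of them observes the unmarked-to-marked transition and no update is lost. A self-contained sketch of that loop, using C11 atomics in place of the runtime's casp():

/* Sketch of the lock-free "set a bit, report whether we set it"
   loop used by markonly() above. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
mark_bit(_Atomic uintptr_t *word, unsigned shift)
{
	uintptr_t old, want;

	for(;;) {
		old = atomic_load(word);
		if(old & ((uintptr_t)1 << shift))
			return false;               /* already marked by someone else */
		want = old | ((uintptr_t)1 << shift);
		if(atomic_compare_exchange_weak(word, &old, want))
			return true;                /* this caller performed the marking */
		/* CAS failed: another thread changed the word; retry. */
	}
}

int
main(void)
{
	_Atomic uintptr_t bitmap = 0;

	printf("first  mark: %d\n", mark_bit(&bitmap, 3));   /* 1 */
	printf("second mark: %d\n", mark_bit(&bitmap, 3));   /* 0 */
	return 0;
}
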
// The size of an intermediate buffer is very small, @@ -186,24 +296,16 @@ struct PtrTarget uintptr ti; }; -typedef struct BitTarget BitTarget; -struct BitTarget -{ - void *p; - uintptr ti; - uintptr *bitp, shift; -}; - typedef struct BufferList BufferList; struct BufferList { PtrTarget ptrtarget[IntermediateBufferCapacity]; - BitTarget bittarget[IntermediateBufferCapacity]; - BufferList *next; + Obj obj[IntermediateBufferCapacity]; + uint32 busy; + byte pad[CacheLineSize]; }; -static BufferList *bufferList; +static BufferList bufferList[MaxGcproc]; -static Lock lock; static Type *itabtype; static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj); @@ -214,7 +316,6 @@ static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj); // and are prepared to be scanned by the garbage collector. // // _wp, _wbuf, _nobj are input/output parameters and are specifying the work buffer. -// bitbuf holds temporary data generated by this function. // // A simplified drawing explaining how the todo-list moves from a structure to another: // @@ -222,14 +323,12 @@ static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj); // (find pointers) // Obj ------> PtrTarget (pointer targets) // ↑ | -// | | flushptrbuf (1st part, -// | | find block start) -// | ↓ -// `--------- BitTarget (pointer targets and the corresponding locations in bitmap) -// flushptrbuf -// (2nd part, mark and enqueue) +// | | +// `----------' +// flushptrbuf +// (find block start, mark and enqueue) static void -flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj, BitTarget *bitbuf) +flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj) { byte *p, *arena_start, *obj; uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti, n; @@ -238,9 +337,8 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf Obj *wp; Workbuf *wbuf; PtrTarget *ptrbuf_end; - BitTarget *bitbufpos, *bt; - arena_start = runtime_mheap.arena_start; + arena_start = runtime_mheap->arena_start; wp = *_wp; wbuf = *_wbuf; @@ -250,6 +348,11 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf n = ptrbuf_end - ptrbuf; *ptrbufpos = ptrbuf; + if(CollectStats) { + runtime_xadd64(&gcstats.ptr.sum, n); + runtime_xadd64(&gcstats.ptr.cnt, 1); + } + // If buffer is nearly full, get a new one. if(wbuf == nil || nobj+n >= nelem(wbuf->obj)) { if(wbuf != nil) @@ -267,8 +370,6 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf { // Multi-threaded version. - bitbufpos = bitbuf; - while(ptrbuf < ptrbuf_end) { obj = ptrbuf->p; ti = ptrbuf->ti; @@ -276,7 +377,7 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf // obj belongs to interval [mheap.arena_start, mheap.arena_used). 
if(Debug > 1) { - if(obj < runtime_mheap.arena_start || obj >= runtime_mheap.arena_used) + if(obj < runtime_mheap->arena_start || obj >= runtime_mheap->arena_used) runtime_throw("object is outside of mheap"); } @@ -319,7 +420,7 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf x = k; if(sizeof(void*) == 8) x -= (uintptr)arena_start>>PageShift; - s = runtime_mheap.map[x]; + s = runtime_mheap->map[x]; if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) continue; p = (byte*)((uintptr)s->start<<PageShift); @@ -346,40 +447,36 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf // Only care about allocated and not marked. if((bits & (bitAllocated|bitMarked)) != bitAllocated) continue; - - *bitbufpos++ = (BitTarget){obj, ti, bitp, shift}; - } - - runtime_lock(&lock); - for(bt=bitbuf; bt<bitbufpos; bt++){ - xbits = *bt->bitp; - bits = xbits >> bt->shift; - if((bits & bitMarked) != 0) - continue; - - // Mark the block - *bt->bitp = xbits | (bitMarked << bt->shift); + if(work.nproc == 1) + *bitp |= bitMarked<<shift; + else { + for(;;) { + x = *bitp; + if(x & (bitMarked<<shift)) + goto continue_obj; + if(runtime_casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift)))) + break; + } + } // If object has no pointers, don't need to scan further. if((bits & bitNoPointers) != 0) continue; - obj = bt->p; - // Ask span about size class. // (Manually inlined copy of MHeap_Lookup.) x = (uintptr)obj >> PageShift; if(sizeof(void*) == 8) x -= (uintptr)arena_start>>PageShift; - s = runtime_mheap.map[x]; + s = runtime_mheap->map[x]; PREFETCH(obj); - *wp = (Obj){obj, s->elemsize, bt->ti}; + *wp = (Obj){obj, s->elemsize, ti}; wp++; nobj++; + continue_obj:; } - runtime_unlock(&lock); // If another proc wants a pointer, give it some. if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) { @@ -395,9 +492,73 @@ flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf *_nobj = nobj; } +static void +flushobjbuf(Obj *objbuf, Obj **objbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj) +{ + uintptr nobj, off; + Obj *wp, obj; + Workbuf *wbuf; + Obj *objbuf_end; + + wp = *_wp; + wbuf = *_wbuf; + nobj = *_nobj; + + objbuf_end = *objbufpos; + *objbufpos = objbuf; + + while(objbuf < objbuf_end) { + obj = *objbuf++; + + // Align obj.b to a word boundary. + off = (uintptr)obj.p & (PtrSize-1); + if(off != 0) { + obj.p += PtrSize - off; + obj.n -= PtrSize - off; + obj.ti = 0; + } + + if(obj.p == nil || obj.n == 0) + continue; + + // If buffer is full, get a new one. + if(wbuf == nil || nobj >= nelem(wbuf->obj)) { + if(wbuf != nil) + wbuf->nobj = nobj; + wbuf = getempty(wbuf); + wp = wbuf->obj; + nobj = 0; + } + + *wp = obj; + wp++; + nobj++; + } + + // If another proc wants a pointer, give it some. 
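
Both flush helpers above finish with the same hand-off rule: if other GC workers are idle (work.nwait > 0) and this worker has accumulated more than handoffThreshold objects while the shared full list is empty, it gives part of its buffer away so the idle workers have something to scan. A schematic sketch of splitting a local buffer for hand-off (the queue and names here are stand-ins, not the runtime's work.full machinery):

/* Schematic sketch of the hand-off idea: when other workers are idle
   and this worker's buffer is large, give half of it away and keep
   scanning the rest. */
#include <stdio.h>
#include <stddef.h>
#include <string.h>

enum { CAP = 512, HANDOFF_THRESHOLD = 4 };

typedef struct {
	void *obj[CAP];
	size_t nobj;
} WorkBuf;

/* Stand-in for publishing a buffer where idle workers can steal it. */
static WorkBuf pending;
static void publish(WorkBuf *b) { pending = *b; }

static void
maybe_handoff(WorkBuf *mine, int idle_workers)
{
	WorkBuf give;
	size_t half;

	if(idle_workers == 0 || mine->nobj <= HANDOFF_THRESHOLD)
		return;                 /* keep everything, nobody is waiting */

	half = mine->nobj / 2;
	give.nobj = half;
	memcpy(give.obj, mine->obj + (mine->nobj - half), half * sizeof give.obj[0]);
	mine->nobj -= half;
	publish(&give);             /* idle workers pick this up */
}

int
main(void)
{
	static WorkBuf b;
	int x[10];

	for(b.nobj = 0; b.nobj < 10; b.nobj++)
		b.obj[b.nobj] = &x[b.nobj];
	maybe_handoff(&b, 1);
	printf("kept %zu, published %zu\n", b.nobj, pending.nobj);   /* kept 5, published 5 */
	return 0;
}
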
+ if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) { + wbuf->nobj = nobj; + wbuf = handoff(wbuf); + nobj = wbuf->nobj; + wp = wbuf->obj + nobj; + } + + *_wp = wp; + *_wbuf = wbuf; + *_nobj = nobj; +} + // Program that scans the whole block and treats every block element as a potential pointer static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR}; +#if 0 +// Hashmap iterator program +static uintptr mapProg[2] = {0, GC_MAP_NEXT}; + +// Hchan program +static uintptr chanProg[2] = {0, GC_CHAN}; +#endif + // Local variables of a program fragment or loop typedef struct Frame Frame; struct Frame { @@ -405,6 +566,61 @@ struct Frame { uintptr *loop_or_ret; }; +// Sanity check for the derived type info objti. +static void +checkptr(void *obj, uintptr objti) +{ + uintptr type, tisize, i, x; + byte *objstart; + Type *t; + MSpan *s; + + if(!Debug) + runtime_throw("checkptr is debug only"); + + if((byte*)obj < runtime_mheap->arena_start || (byte*)obj >= runtime_mheap->arena_used) + return; + type = runtime_gettype(obj); + t = (Type*)(type & ~(uintptr)(PtrSize-1)); + if(t == nil) + return; + x = (uintptr)obj >> PageShift; + if(sizeof(void*) == 8) + x -= (uintptr)(runtime_mheap->arena_start)>>PageShift; + s = runtime_mheap->map[x]; + objstart = (byte*)((uintptr)s->start<<PageShift); + if(s->sizeclass != 0) { + i = ((byte*)obj - objstart)/s->elemsize; + objstart += i*s->elemsize; + } + tisize = *(uintptr*)objti; + // Sanity check for object size: it should fit into the memory block. + if((byte*)obj + tisize > objstart + s->elemsize) + runtime_throw("invalid gc type info"); + if(obj != objstart) + return; + // If obj points to the beginning of the memory block, + // check type info as well. + if(t->string == nil || + // Gob allocates unsafe pointers for indirection. + (runtime_strcmp((const char *)t->string->str, (const char*)"unsafe.Pointer") && + // Runtime and gc think differently about closures. + runtime_strstr((const char *)t->string->str, (const char*)"struct { F uintptr") != (const char *)t->string->str)) { +#if 0 + pc1 = (uintptr*)objti; + pc2 = (uintptr*)t->gc; + // A simple best-effort check until first GC_END. + for(j = 1; pc1[j] != GC_END && pc2[j] != GC_END; j++) { + if(pc1[j] != pc2[j]) { + runtime_printf("invalid gc type info for '%s' at %p, type info %p, block info %p\n", + t->string ? (const int8*)t->string->str : (const int8*)"?", j, pc1[j], pc2[j]); + runtime_throw("invalid gc type info"); + } + } +#endif + } +} + // scanblock scans a block of n bytes starting at pointer b for references // to other objects, scanning any it finds recursively until there are no // unscanned objects left. 
Instead of using an explicit recursion, it keeps @@ -419,49 +635,64 @@ static void scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) { byte *b, *arena_start, *arena_used; - uintptr n, i, end_b, elemsize, ti, objti, count /* , type */; + uintptr n, i, end_b, elemsize, size, ti, objti, count /* , type */; uintptr *pc, precise_type, nominal_size; +#if 0 + uintptr *map_ret, mapkey_size, mapval_size, mapkey_ti, mapval_ti, *chan_ret, chancap; +#endif void *obj; const Type *t; Slice *sliceptr; Frame *stack_ptr, stack_top, stack[GC_STACK_CAPACITY+4]; BufferList *scanbuffers; PtrTarget *ptrbuf, *ptrbuf_end, *ptrbufpos; - BitTarget *bitbuf; + Obj *objbuf, *objbuf_end, *objbufpos; Eface *eface; Iface *iface; +#if 0 + Hmap *hmap; + MapType *maptype; + bool mapkey_kind, mapval_kind; + struct hash_gciter map_iter; + struct hash_gciter_data d; + Hchan *chan; + ChanType *chantype; +#endif if(sizeof(Workbuf) % PageSize != 0) runtime_throw("scanblock: size of Workbuf is suboptimal"); // Memory arena parameters. - arena_start = runtime_mheap.arena_start; - arena_used = runtime_mheap.arena_used; + arena_start = runtime_mheap->arena_start; + arena_used = runtime_mheap->arena_used; stack_ptr = stack+nelem(stack)-1; precise_type = false; nominal_size = 0; - // Allocate ptrbuf, bitbuf + // Allocate ptrbuf { - runtime_lock(&lock); - - if(bufferList == nil) { - bufferList = runtime_SysAlloc(sizeof(*bufferList)); - bufferList->next = nil; - } - scanbuffers = bufferList; - bufferList = bufferList->next; - + scanbuffers = &bufferList[runtime_m()->helpgc]; ptrbuf = &scanbuffers->ptrtarget[0]; ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget); - bitbuf = &scanbuffers->bittarget[0]; - - runtime_unlock(&lock); + objbuf = &scanbuffers->obj[0]; + objbuf_end = &scanbuffers->obj[0] + nelem(scanbuffers->obj); } ptrbufpos = ptrbuf; + objbufpos = objbuf; + + // (Silence the compiler) +#if 0 + map_ret = nil; + mapkey_size = mapval_size = 0; + mapkey_kind = mapval_kind = false; + mapkey_ti = mapval_ti = 0; + chan = nil; + chantype = nil; + chan_ret = nil; +#endif goto next_block; @@ -472,7 +703,13 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) runtime_printf("scanblock %p %D\n", b, (int64)n); } - if(ti != 0 && 0) { + if(CollectStats) { + runtime_xadd64(&gcstats.nbytes, n); + runtime_xadd64(&gcstats.obj.sum, nobj); + runtime_xadd64(&gcstats.obj.cnt, 1); + } + + if(ti != 0 && false) { pc = (uintptr*)(ti & ~(uintptr)PC_BITS); precise_type = (ti & PRECISE); stack_top.elemsize = pc[0]; @@ -484,10 +721,27 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) } else { stack_top.count = 1; } - } else if(UseSpanType && 0) { + if(Debug) { + // Simple sanity check for provided type info ti: + // The declared size of the object must be not larger than the actual size + // (it can be smaller due to inferior pointers). + // It's difficult to make a comprehensive check due to inferior pointers, + // reflection, gob, etc. 
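
As the comment above explains, scanblock() avoids C-stack recursion: every object that still needs scanning is appended to a work buffer and the main loop pops blocks until none remain, so arbitrarily deep object graphs cannot overflow the (possibly split) stack. A compact sketch of that explicit-work-list traversal over a toy node graph:

/* Sketch of the "explicit work list instead of recursion" structure
   used by scanblock(): unscanned nodes are pushed onto a stack and
   popped in a loop. */
#include <stdio.h>
#include <stddef.h>

typedef struct Node Node;
struct Node {
	int visited;
	Node *left, *right;
};

enum { STACK_CAP = 128 };

static void
scan_iterative(Node *root)
{
	Node *stack[STACK_CAP];
	size_t top = 0;
	Node *n;

	if(root != NULL)
		stack[top++] = root;

	while(top > 0) {
		n = stack[--top];
		if(n == NULL || n->visited)
			continue;
		n->visited = 1;              /* "mark" */
		/* Enqueue children instead of recursing into them. */
		if(top+2 <= STACK_CAP) {
			stack[top++] = n->left;
			stack[top++] = n->right;
		}
	}
}

int
main(void)
{
	Node c = {0, NULL, NULL}, b = {0, &c, NULL}, a = {0, &b, &c};

	scan_iterative(&a);
	printf("%d %d %d\n", a.visited, b.visited, c.visited);   /* 1 1 1 */
	return 0;
}
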
+ if(pc[0] > n) { + runtime_printf("invalid gc type info: type info size %p, block size %p\n", pc[0], n); + runtime_throw("invalid gc type info"); + } + } + } else if(UseSpanType && false) { + if(CollectStats) + runtime_xadd64(&gcstats.obj.notype, 1); + #if 0 type = runtime_gettype(b); if(type != 0) { + if(CollectStats) + runtime_xadd64(&gcstats.obj.typelookup, 1); + t = (Type*)(type & ~(uintptr)(PtrSize-1)); switch(type & (PtrSize-1)) { case TypeInfo_SingleObject: @@ -506,8 +760,27 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) stack_top.loop_or_ret = pc+1; break; case TypeInfo_Map: - // TODO(atom): to be expanded in a next CL - pc = defaultProg; + hmap = (Hmap*)b; + maptype = (MapType*)t; + if(hash_gciter_init(hmap, &map_iter)) { + mapkey_size = maptype->key->size; + mapkey_kind = maptype->key->kind; + mapkey_ti = (uintptr)maptype->key->gc | PRECISE; + mapval_size = maptype->elem->size; + mapval_kind = maptype->elem->kind; + mapval_ti = (uintptr)maptype->elem->gc | PRECISE; + + map_ret = nil; + pc = mapProg; + } else { + goto next_block; + } + break; + case TypeInfo_Chan: + chan = (Hchan*)b; + chantype = (ChanType*)t; + chan_ret = nil; + pc = chanProg; break; default: runtime_throw("scanblock: invalid type"); @@ -521,12 +794,18 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) pc = defaultProg; } + if(IgnorePreciseGC) + pc = defaultProg; + pc++; stack_top.b = (uintptr)b; end_b = (uintptr)b + n - PtrSize; for(;;) { + if(CollectStats) + runtime_xadd64(&gcstats.instr[pc[0]], 1); + obj = nil; objti = 0; switch(pc[0]) { @@ -534,13 +813,19 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) obj = *(void**)(stack_top.b + pc[1]); objti = pc[2]; pc += 3; + if(Debug) + checkptr(obj, objti); break; case GC_SLICE: sliceptr = (Slice*)(stack_top.b + pc[1]); if(sliceptr->cap != 0) { obj = sliceptr->array; - objti = pc[2] | PRECISE | LOOP; + // Can't use slice element type for scanning, + // because if it points to an array embedded + // in the beginning of a struct, + // we will scan the whole struct as the slice. + // So just obtain type info from heap. 
} pc += 3; break; @@ -552,17 +837,31 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) case GC_STRING: obj = *(void**)(stack_top.b + pc[1]); + markonly(obj); pc += 2; - break; + continue; case GC_EFACE: eface = (Eface*)(stack_top.b + pc[1]); pc += 2; - if(eface->type != nil && ((byte*)eface->__object >= arena_start && (byte*)eface->__object < arena_used)) { - t = eface->type; + if(eface->type == nil) + continue; + + // eface->type + t = eface->type; + if((const byte*)t >= arena_start && (const byte*)t < arena_used) { + union { const Type *tc; Type *tr; } u; + u.tc = t; + *ptrbufpos++ = (struct PtrTarget){(void*)u.tr, 0}; + if(ptrbufpos == ptrbuf_end) + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); + } + + // eface->__object + if((byte*)eface->__object >= arena_start && (byte*)eface->__object < arena_used) { if(t->__size <= sizeof(void*)) { if((t->kind & KindNoPointers)) - break; + continue; obj = eface->__object; if((t->kind & ~KindNoPointers) == KindPtr) @@ -580,14 +879,14 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) iface = (Iface*)(stack_top.b + pc[1]); pc += 2; if(iface->tab == nil) - break; + continue; // iface->tab if((byte*)iface->tab >= arena_start && (byte*)iface->tab < arena_used) { // *ptrbufpos++ = (struct PtrTarget){iface->tab, (uintptr)itabtype->gc}; *ptrbufpos++ = (struct PtrTarget){iface->tab, 0}; if(ptrbufpos == ptrbuf_end) - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj, bitbuf); + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); } // iface->data @@ -596,7 +895,7 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) t = nil; if(t->__size <= sizeof(void*)) { if((t->kind & KindNoPointers)) - break; + continue; obj = iface->__object; if((t->kind & ~KindNoPointers) == KindPtr) @@ -611,13 +910,13 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) break; case GC_DEFAULT_PTR: - while((i = stack_top.b) <= end_b) { + while(stack_top.b <= end_b) { + obj = *(byte**)stack_top.b; stack_top.b += PtrSize; - obj = *(byte**)i; if((byte*)obj >= arena_start && (byte*)obj < arena_used) { *ptrbufpos++ = (struct PtrTarget){obj, 0}; if(ptrbufpos == ptrbuf_end) - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj, bitbuf); + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); } } goto next_block; @@ -625,9 +924,8 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) case GC_END: if(--stack_top.count != 0) { // Next iteration of a loop if possible. - elemsize = stack_top.elemsize; - stack_top.b += elemsize; - if(stack_top.b + elemsize <= end_b+PtrSize) { + stack_top.b += stack_top.elemsize; + if(stack_top.b + stack_top.elemsize <= end_b+PtrSize) { pc = stack_top.loop_or_ret; continue; } @@ -648,6 +946,10 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // Found a value that may be a pointer. // Do a rescan of the entire block. enqueue((Obj){b, n, 0}, &wbuf, &wp, &nobj); + if(CollectStats) { + runtime_xadd64(&gcstats.rescan, 1); + runtime_xadd64(&gcstats.rescanbytes, n); + } break; } } @@ -680,20 +982,136 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // Stack push. *stack_ptr-- = stack_top; stack_top = (Frame){1, 0, stack_top.b + pc[1], pc+3 /*return address*/}; - pc = (uintptr*)pc[2]; // target of the CALL instruction + pc = (uintptr*)((byte*)pc + *(int32*)(pc+2)); // target of the CALL instruction continue; +#if 0 case GC_MAP_PTR: - // TODO(atom): to be expanded in a next CL. Same as GC_APTR for now. 
- obj = *(void**)(stack_top.b + pc[1]); - pc += 3; - break; + hmap = *(Hmap**)(stack_top.b + pc[1]); + if(hmap == nil) { + pc += 3; + continue; + } + if(markonly(hmap)) { + maptype = (MapType*)pc[2]; + if(hash_gciter_init(hmap, &map_iter)) { + mapkey_size = maptype->key->size; + mapkey_kind = maptype->key->kind; + mapkey_ti = (uintptr)maptype->key->gc | PRECISE; + mapval_size = maptype->elem->size; + mapval_kind = maptype->elem->kind; + mapval_ti = (uintptr)maptype->elem->gc | PRECISE; + + // Start mapProg. + map_ret = pc+3; + pc = mapProg+1; + } else { + pc += 3; + } + } else { + pc += 3; + } + continue; + + case GC_MAP_NEXT: + // Add all keys and values to buffers, mark all subtables. + while(hash_gciter_next(&map_iter, &d)) { + // buffers: reserve space for 2 objects. + if(ptrbufpos+2 >= ptrbuf_end) + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); + if(objbufpos+2 >= objbuf_end) + flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + + if(d.st != nil) + markonly(d.st); + + if(d.key_data != nil) { + if(!(mapkey_kind & KindNoPointers) || d.indirectkey) { + if(!d.indirectkey) + *objbufpos++ = (Obj){d.key_data, mapkey_size, mapkey_ti}; + else { + if(Debug) { + obj = *(void**)d.key_data; + if(!(arena_start <= obj && obj < arena_used)) + runtime_throw("scanblock: inconsistent hashmap"); + } + *ptrbufpos++ = (struct PtrTarget){*(void**)d.key_data, mapkey_ti}; + } + } + if(!(mapval_kind & KindNoPointers) || d.indirectval) { + if(!d.indirectval) + *objbufpos++ = (Obj){d.val_data, mapval_size, mapval_ti}; + else { + if(Debug) { + obj = *(void**)d.val_data; + if(!(arena_start <= obj && obj < arena_used)) + runtime_throw("scanblock: inconsistent hashmap"); + } + *ptrbufpos++ = (struct PtrTarget){*(void**)d.val_data, mapval_ti}; + } + } + } + } + if(map_ret == nil) + goto next_block; + pc = map_ret; + continue; +#endif case GC_REGION: - // TODO(atom): to be expanded in a next CL. Same as GC_APTR for now. obj = (void*)(stack_top.b + pc[1]); + size = pc[2]; + objti = pc[3]; pc += 4; - break; + + *objbufpos++ = (Obj){obj, size, objti}; + if(objbufpos == objbuf_end) + flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + continue; + +#if 0 + case GC_CHAN_PTR: + // Similar to GC_MAP_PTR + chan = *(Hchan**)(stack_top.b + pc[1]); + if(chan == nil) { + pc += 3; + continue; + } + if(markonly(chan)) { + chantype = (ChanType*)pc[2]; + if(!(chantype->elem->kind & KindNoPointers)) { + // Start chanProg. + chan_ret = pc+3; + pc = chanProg+1; + continue; + } + } + pc += 3; + continue; + + case GC_CHAN: + // There are no heap pointers in struct Hchan, + // so we can ignore the leading sizeof(Hchan) bytes. + if(!(chantype->elem->kind & KindNoPointers)) { + // Channel's buffer follows Hchan immediately in memory. + // Size of buffer (cap(c)) is second int in the chan struct. + chancap = ((uintgo*)chan)[1]; + if(chancap > 0) { + // TODO(atom): split into two chunks so that only the + // in-use part of the circular buffer is scanned. + // (Channel routines zero the unused part, so the current + // code does not lead to leaks, it's just a little inefficient.) 
+ *objbufpos++ = (Obj){(byte*)chan+runtime_Hchansize, chancap*chantype->elem->size, + (uintptr)chantype->elem->gc | PRECISE | LOOP}; + if(objbufpos == objbuf_end) + flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + } + } + if(chan_ret == nil) + goto next_block; + pc = chan_ret; + continue; +#endif default: runtime_throw("scanblock: invalid GC instruction"); @@ -701,9 +1119,9 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) } if((byte*)obj >= arena_start && (byte*)obj < arena_used) { - *ptrbufpos++ = (PtrTarget){obj, objti}; + *ptrbufpos++ = (struct PtrTarget){obj, objti}; if(ptrbufpos == ptrbuf_end) - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj, bitbuf); + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); } } @@ -712,7 +1130,8 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // the loop by setting b, n, ti to the parameters for the next block. if(nobj == 0) { - flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj, bitbuf); + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj); + flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); if(nobj == 0) { if(!keepworking) { @@ -737,11 +1156,7 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) nobj--; } -endscan: - runtime_lock(&lock); - scanbuffers->next = bufferList; - bufferList = scanbuffers; - runtime_unlock(&lock); +endscan:; } // debug_scanblock is the debug copy of scanblock. @@ -776,14 +1191,14 @@ debug_scanblock(byte *b, uintptr n) obj = (byte*)vp[i]; // Words outside the arena cannot be pointers. - if((byte*)obj < runtime_mheap.arena_start || (byte*)obj >= runtime_mheap.arena_used) + if((byte*)obj < runtime_mheap->arena_start || (byte*)obj >= runtime_mheap->arena_used) continue; // Round down to word boundary. obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); // Consult span table to find beginning. - s = runtime_MHeap_LookupMaybe(&runtime_mheap, obj); + s = runtime_MHeap_LookupMaybe(runtime_mheap, obj); if(s == nil) continue; @@ -799,8 +1214,8 @@ debug_scanblock(byte *b, uintptr n) } // Now that we know the object header, reload bits. 
- off = (uintptr*)obj - (uintptr*)runtime_mheap.arena_start; - bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)obj - (uintptr*)runtime_mheap->arena_start; + bitp = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; xbits = *bitp; bits = xbits >> shift; @@ -906,6 +1321,8 @@ getempty(Workbuf *b) if(work.nchunk < sizeof *b) { work.nchunk = 1<<20; work.chunk = runtime_SysAlloc(work.nchunk); + if(work.chunk == nil) + runtime_throw("runtime: cannot allocate memory"); } b = (Workbuf*)work.chunk; work.chunk += sizeof *b; @@ -919,6 +1336,9 @@ getempty(Workbuf *b) static void putempty(Workbuf *b) { + if(CollectStats) + runtime_xadd64(&gcstats.putempty, 1); + runtime_lfstackpush(&work.empty, &b->node); } @@ -929,6 +1349,9 @@ getfull(Workbuf *b) M *m; int32 i; + if(CollectStats) + runtime_xadd64(&gcstats.getfull, 1); + if(b != nil) runtime_lfstackpush(&work.empty, &b->node); b = (Workbuf*)runtime_lfstackpop(&work.full); @@ -994,6 +1417,8 @@ addroot(Obj obj) if(cap < 2*work.rootcap) cap = 2*work.rootcap; new = (Obj*)runtime_SysAlloc(cap*sizeof(Obj)); + if(new == nil) + runtime_throw("runtime: cannot allocate memory"); if(work.roots != nil) { runtime_memmove(new, work.roots, work.rootcap*sizeof(Obj)); runtime_SysFree(work.roots, work.rootcap*sizeof(Obj)); @@ -1081,13 +1506,14 @@ static void addfinroots(void *v) { uintptr size; + void *base; size = 0; - if(!runtime_mlookup(v, (byte**)&v, &size, nil) || !runtime_blockspecial(v)) + if(!runtime_mlookup(v, (byte**)&base, &size, nil) || !runtime_blockspecial(base)) runtime_throw("mark - finalizer inconsistency"); // do not mark the finalizer block itself. just mark the things it points at. - addroot((Obj){v, size, 0}); + addroot((Obj){base, size, 0}); } static struct root_list* roots; @@ -1128,22 +1554,27 @@ addroots(void) addroot((Obj){(byte*)&runtime_g0, sizeof runtime_g0, 0}); addroot((Obj){(byte*)&runtime_allg, sizeof runtime_allg, 0}); addroot((Obj){(byte*)&runtime_allm, sizeof runtime_allm, 0}); + addroot((Obj){(byte*)&runtime_allp, sizeof runtime_allp, 0}); + runtime_proc_scan(addroot); runtime_MProf_Mark(addroot); runtime_time_scan(addroot); // MSpan.types - allspans = runtime_mheap.allspans; - for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) { + allspans = runtime_mheap->allspans; + for(spanidx=0; spanidx<runtime_mheap->nspan; spanidx++) { s = allspans[spanidx]; if(s->state == MSpanInUse) { + // The garbage collector ignores type pointers stored in MSpan.types: + // - Compiler-generated types are stored outside of heap. + // - The reflect package has runtime-generated types cached in its data structures. + // The garbage collector relies on finding the references via that cache. 
switch(s->types.compression) { case MTypes_Empty: case MTypes_Single: break; case MTypes_Words: case MTypes_Bytes: - // TODO(atom): consider using defaultProg instead of 0 - addroot((Obj){(byte*)&s->types.data, sizeof(void*), 0}); + markonly((byte*)s->types.data); break; } } @@ -1196,6 +1627,8 @@ handlespecial(byte *p, uintptr size) if(finq == nil || finq->cnt == finq->cap) { if(finc == nil) { finc = runtime_SysAlloc(PageSize); + if(finc == nil) + runtime_throw("runtime: cannot allocate memory"); finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1; finc->alllink = allfin; allfin = finc; @@ -1235,10 +1668,10 @@ sweepspan(ParFor *desc, uint32 idx) m = runtime_m(); USED(&desc); - s = runtime_mheap.allspans[idx]; + s = runtime_mheap->allspans[idx]; if(s->state != MSpanInUse) return; - arena_start = runtime_mheap.arena_start; + arena_start = runtime_mheap->arena_start; p = (byte*)(s->start << PageShift); cl = s->sizeclass; size = s->elemsize; @@ -1301,8 +1734,8 @@ sweepspan(ParFor *desc, uint32 idx) if(cl == 0) { // Free large span. runtime_unmarkspan(p, 1<<PageShift); - *(uintptr*)p = 1; // needs zeroing - runtime_MHeap_Free(&runtime_mheap, s, 1); + *(uintptr*)p = (uintptr)0xdeaddeaddeaddeadll; // needs zeroing + runtime_MHeap_Free(runtime_mheap, s, 1); c->local_alloc -= size; c->local_nfree++; } else { @@ -1316,7 +1749,7 @@ sweepspan(ParFor *desc, uint32 idx) break; } if(size > sizeof(uintptr)) - ((uintptr*)p)[1] = 1; // mark as "needs to be zeroed" + ((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll; // mark as "needs to be zeroed" end->next = (MLink*)p; end = (MLink*)p; @@ -1330,7 +1763,7 @@ sweepspan(ParFor *desc, uint32 idx) c->local_nfree += nfree; c->local_cachealloc -= nfree * size; c->local_objects -= nfree; - runtime_MCentral_FreeSpan(&runtime_mheap.central[cl], s, nfree, head.next, end); + runtime_MCentral_FreeSpan(&runtime_mheap->central[cl], s, nfree, head.next, end); } } @@ -1344,10 +1777,10 @@ dumpspan(uint32 idx) MSpan *s; bool allocated, special; - s = runtime_mheap.allspans[idx]; + s = runtime_mheap->allspans[idx]; if(s->state != MSpanInUse) return; - arena_start = runtime_mheap.arena_start; + arena_start = runtime_mheap->arena_start; p = (byte*)(s->start << PageShift); sizeclass = s->sizeclass; size = s->elemsize; @@ -1405,7 +1838,7 @@ runtime_memorydump(void) { uint32 spanidx; - for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) { + for(spanidx=0; spanidx<runtime_mheap->nspan; spanidx++) { dumpspan(spanidx); } } @@ -1413,6 +1846,8 @@ runtime_memorydump(void) void runtime_gchelper(void) { + gchelperstart(); + // parallel mark for over gc roots runtime_parfordo(work.markfor); @@ -1426,10 +1861,13 @@ runtime_gchelper(void) } runtime_parfordo(work.sweepfor); + bufferList[runtime_m()->helpgc].busy = 0; if(runtime_xadd(&work.ndone, +1) == work.nproc-1) runtime_notewakeup(&work.alldone); } +#define GcpercentUnknown (-2) + // Initialized from $GOGC. GOGC=off means no gc. // // Next gc is after we've allocated an extra amount of @@ -1439,22 +1877,14 @@ runtime_gchelper(void) // proportion to the allocation cost. Adjusting gcpercent // just changes the linear constant (and also the amount of // extra memory used). 
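[Editorial aside, not part of the patch] The comment above describes GOGC's pacing rule, but the actual trigger computation is not shown in these hunks. A minimal sketch of the conventional rule — collect again once the heap has grown by gcpercent% over what was live after the previous collection — assuming the runtime's uint64/int32 typedefs:

	// Illustrative pacing model only; gcpercent < 0 corresponds to GOGC=off.
	static uint64
	next_gc_target(uint64 heap_alloc, int32 gcpercent)
	{
		if(gcpercent < 0)
			return (uint64)-1;	// never trigger automatically
		return heap_alloc + heap_alloc*(uint64)gcpercent/100;
	}

With GOGC=100 and 4 MB live after a collection, the next cycle starts near 8 MB; GOGC=200 would let the heap grow to roughly 12 MB first.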
-static int32 gcpercent = -2; - -static void -stealcache(void) -{ - M *mp; - - for(mp=runtime_allm; mp; mp=mp->alllink) - runtime_MCache_ReleaseAll(mp->mcache); -} +static int32 gcpercent = GcpercentUnknown; static void cachestats(GCStats *stats) { M *mp; MCache *c; + P *p, **pp; uint32 i; uint64 stacks_inuse; uint64 *src, *dst; @@ -1463,9 +1893,7 @@ cachestats(GCStats *stats) runtime_memclr((byte*)stats, sizeof(*stats)); stacks_inuse = 0; for(mp=runtime_allm; mp; mp=mp->alllink) { - c = mp->mcache; - runtime_purgecachedstats(c); - // stacks_inuse += mp->stackinuse*FixedStack; + //stacks_inuse += mp->stackinuse*FixedStack; if(stats) { src = (uint64*)&mp->gcstats; dst = (uint64*)stats; @@ -1473,6 +1901,12 @@ cachestats(GCStats *stats) dst[i] += src[i]; runtime_memclr((byte*)&mp->gcstats, sizeof(mp->gcstats)); } + } + for(pp=runtime_allp; (p=*pp) != nil; pp++) { + c = p->mcache; + if(c==nil) + continue; + runtime_purgecachedstats(c); for(i=0; i<nelem(c->local_by_size); i++) { mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc; c->local_by_size[i].nmalloc = 0; @@ -1492,6 +1926,19 @@ struct gc_args static void gc(struct gc_args *args); +static int32 +readgogc(void) +{ + const byte *p; + + p = runtime_getenv("GOGC"); + if(p == nil || p[0] == '\0') + return 100; + if(runtime_strcmp((const char *)p, "off") == 0) + return -1; + return runtime_atoi(p); +} + void runtime_gc(int32 force) { @@ -1504,6 +1951,8 @@ runtime_gc(int32 force) // a problem in the past. if((((uintptr)&work.empty) & 7) != 0) runtime_throw("runtime: gc work buffer is misaligned"); + if((((uintptr)&work.full) & 7) != 0) + runtime_throw("runtime: gc work buffer is misaligned"); // Make sure all registers are saved on stack so that // scanstack sees them. @@ -1521,14 +1970,8 @@ runtime_gc(int32 force) if(!mstats.enablegc || m->locks > 0 || runtime_panicking) return; - if(gcpercent == -2) { // first time through - p = runtime_getenv("GOGC"); - if(p == nil || p[0] == '\0') - gcpercent = 100; - else if(runtime_strcmp((const char*)p, "off") == 0) - gcpercent = -1; - else - gcpercent = runtime_atoi(p); + if(gcpercent == GcpercentUnknown) { // first time through + gcpercent = readgogc(); p = runtime_getenv("GOGCTRACE"); if(p != nil) @@ -1555,7 +1998,7 @@ gc(struct gc_args *args) { M *m; int64 t0, t1, t2, t3, t4; - uint64 heap0, heap1, obj0, obj1; + uint64 heap0, heap1, obj0, obj1, ninstr; GCStats stats; M *mp; uint32 i; @@ -1574,6 +2017,9 @@ gc(struct gc_args *args) m->gcing = 1; runtime_stoptheworld(); + if(CollectStats) + runtime_memclr((byte*)&gcstats, sizeof(gcstats)); + for(mp=runtime_allm; mp; mp=mp->alllink) runtime_settype_flush(mp, false); @@ -1604,7 +2050,7 @@ gc(struct gc_args *args) work.nproc = runtime_gcprocs(); addroots(); runtime_parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot); - runtime_parforsetup(work.sweepfor, work.nproc, runtime_mheap.nspan, nil, true, sweepspan); + runtime_parforsetup(work.sweepfor, work.nproc, runtime_mheap->nspan, nil, true, sweepspan); if(work.nproc > 1) { runtime_noteclear(&work.alldone); runtime_helpgc(work.nproc); @@ -1612,6 +2058,7 @@ gc(struct gc_args *args) t1 = runtime_nanotime(); + gchelperstart(); runtime_parfordo(work.markfor); scanblock(nil, nil, 0, true); @@ -1623,14 +2070,14 @@ gc(struct gc_args *args) t2 = runtime_nanotime(); runtime_parfordo(work.sweepfor); + bufferList[m->helpgc].busy = 0; t3 = runtime_nanotime(); - stealcache(); - cachestats(&stats); - if(work.nproc > 1) runtime_notesleep(&work.alldone); + cachestats(&stats); + stats.nprocyield += 
work.sweepfor->nprocyield; stats.nosyield += work.sweepfor->nosyield; stats.nsleep += work.sweepfor->nsleep; @@ -1670,6 +2117,27 @@ gc(struct gc_args *args) stats.nhandoff, stats.nhandoffcnt, work.sweepfor->nsteal, work.sweepfor->nstealcnt, stats.nprocyield, stats.nosyield, stats.nsleep); + if(CollectStats) { + runtime_printf("scan: %D bytes, %D objects, %D untyped, %D types from MSpan\n", + gcstats.nbytes, gcstats.obj.cnt, gcstats.obj.notype, gcstats.obj.typelookup); + if(gcstats.ptr.cnt != 0) + runtime_printf("avg ptrbufsize: %D (%D/%D)\n", + gcstats.ptr.sum/gcstats.ptr.cnt, gcstats.ptr.sum, gcstats.ptr.cnt); + if(gcstats.obj.cnt != 0) + runtime_printf("avg nobj: %D (%D/%D)\n", + gcstats.obj.sum/gcstats.obj.cnt, gcstats.obj.sum, gcstats.obj.cnt); + runtime_printf("rescans: %D, %D bytes\n", gcstats.rescan, gcstats.rescanbytes); + + runtime_printf("instruction counts:\n"); + ninstr = 0; + for(i=0; i<nelem(gcstats.instr); i++) { + runtime_printf("\t%d:\t%D\n", i, gcstats.instr[i]); + ninstr += gcstats.instr[i]; + } + runtime_printf("\ttotal:\t%D\n", ninstr); + + runtime_printf("putempty: %D, getfull: %D\n", gcstats.putempty, gcstats.getfull); + } } runtime_MProf_GC(); @@ -1704,6 +2172,71 @@ runtime_ReadMemStats(MStats *stats) runtime_starttheworld(); } +void runtime_debug_readGCStats(Slice*) + __asm__("runtime_debug.readGCStats"); + +void +runtime_debug_readGCStats(Slice *pauses) +{ + uint64 *p; + uint32 i, n; + + // Calling code in runtime/debug should make the slice large enough. + if((size_t)pauses->cap < nelem(mstats.pause_ns)+3) + runtime_throw("runtime: short slice passed to readGCStats"); + + // Pass back: pauses, last gc (absolute time), number of gc, total pause ns. + p = (uint64*)pauses->array; + runtime_lock(runtime_mheap); + n = mstats.numgc; + if(n > nelem(mstats.pause_ns)) + n = nelem(mstats.pause_ns); + + // The pause buffer is circular. The most recent pause is at + // pause_ns[(numgc-1)%nelem(pause_ns)], and then backward + // from there to go back farther in time. We deliver the times + // most recent first (in p[0]). 
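A quick worked example of the circular indexing described in the comment above (editorial aside; assumes the 256-entry pause_ns buffer used by MemStats):

	// numgc = 300, nelem(pause_ns) = 256, so n is clamped to 256 entries:
	//   i = 0  -> pause_ns[(300-1-0)%256] = pause_ns[43]   (most recent pause)
	//   i = 1  -> pause_ns[298%256]       = pause_ns[42]
	//   ...
	//   i = 43 -> pause_ns[256%256]       = pause_ns[0]
	//   i = 44 -> pause_ns[255%256]       = pause_ns[255]  (index wraps to the top)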
+ for(i=0; i<n; i++) + p[i] = mstats.pause_ns[(mstats.numgc-1-i)%nelem(mstats.pause_ns)]; + + p[n] = mstats.last_gc; + p[n+1] = mstats.numgc; + p[n+2] = mstats.pause_total_ns; + runtime_unlock(runtime_mheap); + pauses->__count = n+3; +} + +intgo runtime_debug_setGCPercent(intgo) + __asm__("runtime_debug.setGCPercent"); + +intgo +runtime_debug_setGCPercent(intgo in) +{ + intgo out; + + runtime_lock(runtime_mheap); + if(gcpercent == GcpercentUnknown) + gcpercent = readgogc(); + out = gcpercent; + if(in < 0) + in = -1; + gcpercent = in; + runtime_unlock(runtime_mheap); + return out; +} + +static void +gchelperstart(void) +{ + M *m; + + m = runtime_m(); + if(m->helpgc < 0 || m->helpgc >= MaxGcproc) + runtime_throw("gchelperstart: bad m->helpgc"); + if(runtime_xchg(&bufferList[m->helpgc].busy, 1)) + runtime_throw("gchelperstart: already busy"); +} + static void runfinq(void* dummy __attribute__ ((unused))) { @@ -1757,11 +2290,11 @@ runtime_markallocated(void *v, uintptr n, bool noptr) if(0) runtime_printf("markallocated %p+%p\n", v, n); - if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start) + if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) runtime_throw("markallocated: bad pointer"); - off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; // word offset - b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; // word offset + b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; for(;;) { @@ -1789,11 +2322,11 @@ runtime_markfreed(void *v, uintptr n) if(0) runtime_printf("markallocated %p+%p\n", v, n); - if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start) + if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) runtime_throw("markallocated: bad pointer"); - off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; // word offset - b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; // word offset + b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; for(;;) { @@ -1819,11 +2352,11 @@ runtime_checkfreed(void *v, uintptr n) if(!runtime_checking) return; - if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start) + if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) return; // not allocated, so okay - off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; // word offset - b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; // word offset + b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; bits = *b>>shift; @@ -1842,7 +2375,7 @@ runtime_markspan(void *v, uintptr size, uintptr n, bool leftover) uintptr *b, off, shift; byte *p; - if((byte*)v+size*n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start) + if((byte*)v+size*n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) runtime_throw("markspan: bad pointer"); p = v; @@ -1853,8 +2386,8 @@ runtime_markspan(void *v, uintptr size, uintptr n, bool leftover) // the entire span, and each bitmap word has bits for only // one span, so no other goroutines are changing these // bitmap words. 
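The off/b/shift arithmetic repeated throughout these mark functions follows one addressing scheme: the heap bitmap sits immediately below arena_start and grows downward, with one bitmap word covering wordsPerBitmapWord heap words. A minimal restatement of that calculation (editorial sketch only; wordsPerBitmapWord is passed in here to keep it self-contained, and the runtime's uintptr/byte typedefs are assumed):

	// Return the bitmap word describing the heap word at v, and the bit shift
	// of that heap word's bits within it, mirroring the pattern used above.
	static uintptr*
	bitmapword(void *v, byte *arena_start, uintptr wordsPerBitmapWord, uintptr *shift)
	{
		uintptr off;

		off = (uintptr*)v - (uintptr*)arena_start;	// word offset into the arena
		*shift = off % wordsPerBitmapWord;		// bit position inside the bitmap word
		return (uintptr*)arena_start - off/wordsPerBitmapWord - 1;	// bitmap grows down
	}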
- off = (uintptr*)p - (uintptr*)runtime_mheap.arena_start; // word offset - b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)p - (uintptr*)runtime_mheap->arena_start; // word offset + b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; *b = (*b & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); } @@ -1866,14 +2399,14 @@ runtime_unmarkspan(void *v, uintptr n) { uintptr *p, *b, off; - if((byte*)v+n > (byte*)runtime_mheap.arena_used || (byte*)v < runtime_mheap.arena_start) + if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start) runtime_throw("markspan: bad pointer"); p = v; - off = p - (uintptr*)runtime_mheap.arena_start; // word offset + off = p - (uintptr*)runtime_mheap->arena_start; // word offset if(off % wordsPerBitmapWord != 0) runtime_throw("markspan: unaligned pointer"); - b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; + b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; n /= PtrSize; if(n%wordsPerBitmapWord != 0) runtime_throw("unmarkspan: unaligned length"); @@ -1894,8 +2427,8 @@ runtime_blockspecial(void *v) if(DebugMark) return true; - off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; - b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; + b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; return (*b & (bitSpecial<<shift)) != 0; @@ -1909,8 +2442,8 @@ runtime_setblockspecial(void *v, bool s) if(DebugMark) return; - off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; - b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start; + b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; for(;;) { diff --git a/libgo/runtime/mgc0.h b/libgo/runtime/mgc0.h index a2798ef..d14fb37 100644 --- a/libgo/runtime/mgc0.h +++ b/libgo/runtime/mgc0.h @@ -12,17 +12,22 @@ // Meaning of arguments: // off Offset (in bytes) from the start of the current object // objgc Pointer to GC info of an object +// objgcrel Offset to GC info of an object // len Length of an array // elemsize Size (in bytes) of an element // size Size (in bytes) +// +// NOTE: There is a copy of these in ../reflect/type.go. +// They must be kept in sync. enum { GC_END, // End of object, loop or subroutine. Args: none GC_PTR, // A typed pointer. Args: (off, objgc) GC_APTR, // Pointer to an arbitrary object. Args: (off) GC_ARRAY_START, // Start an array with a fixed length. Args: (off, len, elemsize) GC_ARRAY_NEXT, // The next element of an array. Args: none - GC_CALL, // Call a subroutine. Args: (off, objgc) + GC_CALL, // Call a subroutine. Args: (off, objgcrel) GC_MAP_PTR, // Go map. Args: (off, MapType*) + GC_CHAN_PTR, // Go channel. Args: (off, ChanType*) GC_STRING, // Go string. Args: (off) GC_EFACE, // interface{}. Args: (off) GC_IFACE, // interface{...}. 
Args: (off) diff --git a/libgo/runtime/mheap.c b/libgo/runtime/mheap.c index 6636b01..b4d94b6 100644 --- a/libgo/runtime/mheap.c +++ b/libgo/runtime/mheap.c @@ -37,6 +37,8 @@ RecordSpan(void *vh, byte *p) if(cap < h->nspancap*3/2) cap = h->nspancap*3/2; all = (MSpan**)runtime_SysAlloc(cap*sizeof(all[0])); + if(all == nil) + runtime_throw("runtime: cannot allocate memory"); if(h->allspans) { runtime_memmove(all, h->allspans, h->nspancap*sizeof(all[0])); runtime_SysFree(h->allspans, h->nspancap*sizeof(all[0])); @@ -119,6 +121,25 @@ HaveSpan: s->state = MSpanInUse; mstats.heap_idle -= s->npages<<PageShift; mstats.heap_released -= s->npreleased<<PageShift; + if(s->npreleased > 0) { + // We have called runtime_SysUnused with these pages, and on + // Unix systems it called madvise. At this point at least + // some BSD-based kernels will return these pages either as + // zeros or with the old data. For our caller, the first word + // in the page indicates whether the span contains zeros or + // not (this word was set when the span was freed by + // MCentral_Free or runtime_MCentral_FreeSpan). If the first + // page in the span is returned as zeros, and some subsequent + // page is returned with the old data, then we will be + // returning a span that is assumed to be all zeros, but the + // actual data will not be all zeros. Avoid that problem by + // explicitly marking the span as not being zeroed, just in + // case. The beadbead constant we use here means nothing, it + // is just a unique constant not seen elsewhere in the + // runtime, as a clue in case it turns up unexpectedly in + // memory or in a stack trace. + *(uintptr*)(s->start<<PageShift) = (uintptr)0xbeadbeadbeadbeadULL; + } s->npreleased = 0; if(s->npages > npage) { @@ -356,23 +377,64 @@ forcegchelper(void *vnote) runtime_notewakeup(note); } +static uintptr +scavengelist(MSpan *list, uint64 now, uint64 limit) +{ + uintptr released, sumreleased; + MSpan *s; + + if(runtime_MSpanList_IsEmpty(list)) + return 0; + + sumreleased = 0; + for(s=list->next; s != list; s=s->next) { + if((now - s->unusedsince) > limit) { + released = (s->npages - s->npreleased) << PageShift; + mstats.heap_released += released; + sumreleased += released; + s->npreleased = s->npages; + runtime_SysUnused((void*)(s->start << PageShift), s->npages << PageShift); + } + } + return sumreleased; +} + +static uintptr +scavenge(uint64 now, uint64 limit) +{ + uint32 i; + uintptr sumreleased; + MHeap *h; + + h = runtime_mheap; + sumreleased = 0; + for(i=0; i < nelem(h->free); i++) + sumreleased += scavengelist(&h->free[i], now, limit); + sumreleased += scavengelist(&h->large, now, limit); + return sumreleased; +} + // Release (part of) unused memory to OS. // Goroutine created at startup. // Loop forever. void runtime_MHeap_Scavenger(void* dummy) { + G *g; MHeap *h; - MSpan *s, *list; uint64 tick, now, forcegc, limit; - uint32 k, i; - uintptr released, sumreleased; + uint32 k; + uintptr sumreleased; const byte *env; bool trace; Note note, *notep; USED(dummy); + g = runtime_g(); + g->issystem = true; + g->isbackground = true; + // If we go two minutes without a garbage collection, force one to run. 
forcegc = 2*60*1e9; // If a span goes unused for 5 minutes after a garbage collection, @@ -389,10 +451,10 @@ runtime_MHeap_Scavenger(void* dummy) if(env != nil) trace = runtime_atoi(env) > 0; - h = &runtime_mheap; + h = runtime_mheap; for(k=0;; k++) { runtime_noteclear(¬e); - runtime_entersyscall(); + runtime_entersyscallblock(); runtime_notetsleep(¬e, tick); runtime_exitsyscall(); @@ -406,7 +468,7 @@ runtime_MHeap_Scavenger(void* dummy) runtime_noteclear(¬e); notep = ¬e; __go_go(forcegchelper, (void*)notep); - runtime_entersyscall(); + runtime_entersyscallblock(); runtime_notesleep(¬e); runtime_exitsyscall(); if(trace) @@ -414,24 +476,7 @@ runtime_MHeap_Scavenger(void* dummy) runtime_lock(h); now = runtime_nanotime(); } - sumreleased = 0; - for(i=0; i < nelem(h->free)+1; i++) { - if(i < nelem(h->free)) - list = &h->free[i]; - else - list = &h->large; - if(runtime_MSpanList_IsEmpty(list)) - continue; - for(s=list->next; s != list; s=s->next) { - if((now - s->unusedsince) > limit) { - released = (s->npages - s->npreleased) << PageShift; - mstats.heap_released += released; - sumreleased += released; - s->npreleased = s->npages; - runtime_SysUnused((void*)(s->start << PageShift), s->npages << PageShift); - } - } - } + sumreleased = scavenge(now, limit); runtime_unlock(h); if(trace) { @@ -444,6 +489,17 @@ runtime_MHeap_Scavenger(void* dummy) } } +void runtime_debug_freeOSMemory(void) __asm__("runtime_debug.freeOSMemory"); + +void +runtime_debug_freeOSMemory(void) +{ + runtime_gc(1); + runtime_lock(runtime_mheap); + scavenge(~(uintptr)0, 0); + runtime_unlock(runtime_mheap); +} + // Initialize a new span with the given start and npages. void runtime_MSpan_Init(MSpan *span, PageID start, uintptr npages) diff --git a/libgo/runtime/mprof.goc b/libgo/runtime/mprof.goc index c1b09be..73d9379 100644 --- a/libgo/runtime/mprof.goc +++ b/libgo/runtime/mprof.goc @@ -14,7 +14,43 @@ package runtime #include "go-string.h" // NOTE(rsc): Everything here could use cas if contention became an issue. -static Lock proflock; +static Lock proflock, alloclock; + +// All memory allocations are local and do not escape outside of the profiler. +// The profiler is forbidden from referring to garbage-collected memory. + +static byte *pool; // memory allocation pool +static uintptr poolfree; // number of bytes left in the pool +enum { + Chunk = 32*PageSize, // initial size of the pool +}; + +// Memory allocation local to this file. +// There is no way to return the allocated memory back to the OS. +static void* +allocate(uintptr size) +{ + void *v; + + if(size == 0) + return nil; + + if(size >= Chunk/2) + return runtime_SysAlloc(size); + + runtime_lock(&alloclock); + if(size > poolfree) { + pool = runtime_SysAlloc(Chunk); + if(pool == nil) + runtime_throw("runtime: cannot allocate memory"); + poolfree = Chunk; + } + v = pool; + pool += size; + poolfree -= size; + runtime_unlock(&alloclock); + return v; +} enum { MProf, BProf }; // profile types @@ -26,6 +62,8 @@ struct Bucket Bucket *next; // next in hash list Bucket *allnext; // next in list of all mbuckets/bbuckets int32 typ; + // Generally unions can break precise GC, + // this one is fine because it does not contain pointers. 
union { struct // typ == MProf @@ -67,6 +105,8 @@ stkbucket(int32 typ, Location *stk, int32 nstk, bool alloc) if(buckhash == nil) { buckhash = runtime_SysAlloc(BuckHashSize*sizeof buckhash[0]); + if(buckhash == nil) + runtime_throw("runtime: cannot allocate memory"); mstats.buckhash_sys += BuckHashSize*sizeof buckhash[0]; } @@ -97,7 +137,9 @@ stkbucket(int32 typ, Location *stk, int32 nstk, bool alloc) if(!alloc) return nil; - b = runtime_mallocgc(sizeof *b + nstk*sizeof stk[0], FlagNoProfiling, 0, 1); + b = allocate(sizeof *b + nstk*sizeof stk[0]); + if(b == nil) + runtime_throw("runtime: cannot allocate memory"); bucketmem += sizeof *b + nstk*sizeof stk[0]; runtime_memmove(b->stk, stk, nstk*sizeof stk[0]); b->typ = typ; @@ -115,13 +157,11 @@ stkbucket(int32 typ, Location *stk, int32 nstk, bool alloc) return b; } -// Record that a gc just happened: all the 'recent' statistics are now real. -void -runtime_MProf_GC(void) +static void +MProf_GC(void) { Bucket *b; - - runtime_lock(&proflock); + for(b=mbuckets; b; b=b->allnext) { b->allocs += b->recent_allocs; b->frees += b->recent_frees; @@ -132,6 +172,14 @@ runtime_MProf_GC(void) b->recent_alloc_bytes = 0; b->recent_free_bytes = 0; } +} + +// Record that a gc just happened: all the 'recent' statistics are now real. +void +runtime_MProf_GC(void) +{ + runtime_lock(&proflock); + MProf_GC(); runtime_unlock(&proflock); } @@ -166,7 +214,7 @@ struct AddrEntry Bucket *b; }; -static AddrHash *addrhash[1<<AddrHashBits]; +static AddrHash **addrhash; // points to (AddrHash*)[1<<AddrHashBits] static AddrEntry *addrfree; static uintptr addrmem; @@ -193,7 +241,7 @@ setaddrbucket(uintptr addr, Bucket *b) if(ah->addr == (addr>>AddrHashShift)) goto found; - ah = runtime_mallocgc(sizeof *ah, FlagNoProfiling, 0, 1); + ah = allocate(sizeof *ah); addrmem += sizeof *ah; ah->next = addrhash[h]; ah->addr = addr>>AddrHashShift; @@ -201,7 +249,7 @@ setaddrbucket(uintptr addr, Bucket *b) found: if((e = addrfree) == nil) { - e = runtime_mallocgc(64*sizeof *e, FlagNoProfiling, 0, 0); + e = allocate(64*sizeof *e); addrmem += 64*sizeof *e; for(i=0; i+1<64; i++) e[i].next = &e[i+1]; @@ -353,12 +401,28 @@ record(Record *r, Bucket *b) func MemProfile(p Slice, include_inuse_zero bool) (n int, ok bool) { Bucket *b; Record *r; + bool clear; runtime_lock(&proflock); n = 0; - for(b=mbuckets; b; b=b->allnext) + clear = true; + for(b=mbuckets; b; b=b->allnext) { if(include_inuse_zero || b->alloc_bytes != b->free_bytes) n++; + if(b->allocs != 0 || b->frees != 0) + clear = false; + } + if(clear) { + // Absolutely no data, suggesting that a garbage collection + // has not yet happened. In order to allow profiling when + // garbage collection is disabled from the beginning of execution, + // accumulate stats as if a GC just happened, and recount buckets. + MProf_GC(); + n = 0; + for(b=mbuckets; b; b=b->allnext) + if(include_inuse_zero || b->alloc_bytes != b->free_bytes) + n++; + } ok = false; if(n <= p.__count) { ok = true; @@ -531,3 +595,8 @@ func GoroutineProfile(b Slice) (n int, ok bool) { } } +void +runtime_mprofinit(void) +{ + addrhash = allocate((1<<AddrHashBits)*sizeof *addrhash); +} diff --git a/libgo/runtime/netpoll.goc b/libgo/runtime/netpoll.goc new file mode 100644 index 0000000..a0bd735 --- /dev/null +++ b/libgo/runtime/netpoll.goc @@ -0,0 +1,356 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build darwin linux + +package net + +#include "runtime.h" +#include "defs.h" +#include "arch.h" +#include "malloc.h" + +// Map gccgo field names to gc field names. +// Eface aka __go_empty_interface. +#define type __type_descriptor +#define data __object + +// Integrated network poller (platform-independent part). +// A particular implementation (epoll/kqueue) must define the following functions: +// void runtime_netpollinit(void); // to initialize the poller +// int32 runtime_netpollopen(int32 fd, PollDesc *pd); // to arm edge-triggered notifications + // and associate fd with pd. +// An implementation must call the following function to denote that the pd is ready. +// void runtime_netpollready(G **gpp, PollDesc *pd, int32 mode); + +#define READY ((G*)1) + +struct PollDesc +{ + PollDesc* link; // in pollcache, protected by pollcache.Lock + Lock; // protectes the following fields + int32 fd; + bool closing; + uintptr seq; // protects from stale timers and ready notifications + G* rg; // G waiting for read or READY (binary semaphore) + Timer rt; // read deadline timer (set if rt.fv != nil) + int64 rd; // read deadline + G* wg; // the same for writes + Timer wt; + int64 wd; +}; + +static struct +{ + Lock; + PollDesc* first; + // PollDesc objects must be type-stable, + // because we can get ready notification from epoll/kqueue + // after the descriptor is closed/reused. + // Stale notifications are detected using seq variable, + // seq is incremented when deadlines are changed or descriptor is reused. +} pollcache; + +static void netpollblock(PollDesc*, int32); +static G* netpollunblock(PollDesc*, int32); +static void deadline(int64, Eface); +static void readDeadline(int64, Eface); +static void writeDeadline(int64, Eface); +static PollDesc* allocPollDesc(void); +static intgo checkerr(PollDesc *pd, int32 mode); + +static FuncVal deadlineFn = {(void(*)(void))deadline}; +static FuncVal readDeadlineFn = {(void(*)(void))readDeadline}; +static FuncVal writeDeadlineFn = {(void(*)(void))writeDeadline}; + +func runtime_pollServerInit() { + runtime_netpollinit(); +} + +func runtime_pollOpen(fd int) (pd *PollDesc, errno int) { + pd = allocPollDesc(); + runtime_lock(pd); + if(pd->wg != nil && pd->wg != READY) + runtime_throw("runtime_pollOpen: blocked write on free descriptor"); + if(pd->rg != nil && pd->rg != READY) + runtime_throw("runtime_pollOpen: blocked read on free descriptor"); + pd->fd = fd; + pd->closing = false; + pd->seq++; + pd->rg = nil; + pd->rd = 0; + pd->wg = nil; + pd->wd = 0; + runtime_unlock(pd); + + errno = runtime_netpollopen(fd, pd); +} + +func runtime_pollClose(pd *PollDesc) { + if(!pd->closing) + runtime_throw("runtime_pollClose: close w/o unblock"); + if(pd->wg != nil && pd->wg != READY) + runtime_throw("runtime_pollClose: blocked write on closing descriptor"); + if(pd->rg != nil && pd->rg != READY) + runtime_throw("runtime_pollClose: blocked read on closing descriptor"); + runtime_netpollclose(pd->fd); + runtime_lock(&pollcache); + pd->link = pollcache.first; + pollcache.first = pd; + runtime_unlock(&pollcache); +} + +func runtime_pollReset(pd *PollDesc, mode int) (err int) { + runtime_lock(pd); + err = checkerr(pd, mode); + if(err) + goto ret; + if(mode == 'r') + pd->rg = nil; + else if(mode == 'w') + pd->wg = nil; +ret: + runtime_unlock(pd); +} + +func runtime_pollWait(pd *PollDesc, mode int) (err int) { + runtime_lock(pd); + err = checkerr(pd, mode); + if(err) + goto ret; + netpollblock(pd, mode); + err = checkerr(pd, mode); +ret: + runtime_unlock(pd); +} + +func 
runtime_pollSetDeadline(pd *PollDesc, d int64, mode int) { + runtime_lock(pd); + if(pd->closing) + goto ret; + pd->seq++; // invalidate current timers + // Reset current timers. + if(pd->rt.fv) { + runtime_deltimer(&pd->rt); + pd->rt.fv = nil; + } + if(pd->wt.fv) { + runtime_deltimer(&pd->wt); + pd->wt.fv = nil; + } + // Setup new timers. + if(d != 0 && d <= runtime_nanotime()) { + d = -1; + } + if(mode == 'r' || mode == 'r'+'w') + pd->rd = d; + if(mode == 'w' || mode == 'r'+'w') + pd->wd = d; + if(pd->rd > 0 && pd->rd == pd->wd) { + pd->rt.fv = &deadlineFn; + pd->rt.when = pd->rd; + // Copy current seq into the timer arg. + // Timer func will check the seq against current descriptor seq, + // if they differ the descriptor was reused or timers were reset. + pd->rt.arg.type = (Type*)pd->seq; + pd->rt.arg.data = pd; + runtime_addtimer(&pd->rt); + } else { + if(pd->rd > 0) { + pd->rt.fv = &readDeadlineFn; + pd->rt.when = pd->rd; + pd->rt.arg.type = (Type*)pd->seq; + pd->rt.arg.data = pd; + runtime_addtimer(&pd->rt); + } + if(pd->wd > 0) { + pd->wt.fv = &writeDeadlineFn; + pd->wt.when = pd->wd; + pd->wt.arg.type = (Type*)pd->seq; + pd->wt.arg.data = pd; + runtime_addtimer(&pd->wt); + } + } +ret: + runtime_unlock(pd); +} + +func runtime_pollUnblock(pd *PollDesc) { + G *rg, *wg; + + runtime_lock(pd); + if(pd->closing) + runtime_throw("runtime_pollUnblock: already closing"); + pd->closing = true; + pd->seq++; + rg = netpollunblock(pd, 'r'); + wg = netpollunblock(pd, 'w'); + if(pd->rt.fv) { + runtime_deltimer(&pd->rt); + pd->rt.fv = nil; + } + if(pd->wt.fv) { + runtime_deltimer(&pd->wt); + pd->wt.fv = nil; + } + runtime_unlock(pd); + if(rg) + runtime_ready(rg); + if(wg) + runtime_ready(wg); +} + +// make pd ready, newly runnable goroutines (if any) are enqueued info gpp list +void +runtime_netpollready(G **gpp, PollDesc *pd, int32 mode) +{ + G *rg, *wg; + + rg = wg = nil; + runtime_lock(pd); + if(mode == 'r' || mode == 'r'+'w') + rg = netpollunblock(pd, 'r'); + if(mode == 'w' || mode == 'r'+'w') + wg = netpollunblock(pd, 'w'); + runtime_unlock(pd); + if(rg) { + rg->schedlink = *gpp; + *gpp = rg; + } + if(wg) { + wg->schedlink = *gpp; + *gpp = wg; + } +} + +static intgo +checkerr(PollDesc *pd, int32 mode) +{ + if(pd->closing) + return 1; // errClosing + if((mode == 'r' && pd->rd < 0) || (mode == 'w' && pd->wd < 0)) + return 2; // errTimeout + return 0; +} + +static void +netpollblock(PollDesc *pd, int32 mode) +{ + G **gpp; + + gpp = &pd->rg; + if(mode == 'w') + gpp = &pd->wg; + if(*gpp == READY) { + *gpp = nil; + return; + } + if(*gpp != nil) + runtime_throw("epoll: double wait"); + *gpp = runtime_g(); + runtime_park(runtime_unlock, &pd->Lock, "IO wait"); + runtime_lock(pd); +} + +static G* +netpollunblock(PollDesc *pd, int32 mode) +{ + G **gpp, *old; + + gpp = &pd->rg; + if(mode == 'w') + gpp = &pd->wg; + if(*gpp == READY) + return nil; + if(*gpp == nil) { + *gpp = READY; + return nil; + } + old = *gpp; + *gpp = nil; + return old; +} + +static void +deadlineimpl(int64 now, Eface arg, bool read, bool write) +{ + PollDesc *pd; + uint32 seq; + G *rg, *wg; + + USED(now); + pd = (PollDesc*)arg.data; + // This is the seq when the timer was set. + // If it's stale, ignore the timer event. + seq = (uintptr)arg.type; + rg = wg = nil; + runtime_lock(pd); + if(seq != pd->seq) { + // The descriptor was reused or timers were reset. 
+ runtime_unlock(pd); + return; + } + if(read) { + if(pd->rd <= 0 || pd->rt.fv == nil) + runtime_throw("deadlineimpl: inconsistent read deadline"); + pd->rd = -1; + pd->rt.fv = nil; + rg = netpollunblock(pd, 'r'); + } + if(write) { + if(pd->wd <= 0 || (pd->wt.fv == nil && !read)) + runtime_throw("deadlineimpl: inconsistent write deadline"); + pd->wd = -1; + pd->wt.fv = nil; + wg = netpollunblock(pd, 'w'); + } + runtime_unlock(pd); + if(rg) + runtime_ready(rg); + if(wg) + runtime_ready(wg); +} + +static void +deadline(int64 now, Eface arg) +{ + deadlineimpl(now, arg, true, true); +} + +static void +readDeadline(int64 now, Eface arg) +{ + deadlineimpl(now, arg, true, false); +} + +static void +writeDeadline(int64 now, Eface arg) +{ + deadlineimpl(now, arg, false, true); +} + +static PollDesc* +allocPollDesc(void) +{ + PollDesc *pd; + uint32 i, n; + + runtime_lock(&pollcache); + if(pollcache.first == nil) { + n = PageSize/sizeof(*pd); + if(n == 0) + n = 1; + // Must be in non-GC memory because can be referenced + // only from epoll/kqueue internals. + pd = runtime_SysAlloc(n*sizeof(*pd)); + for(i = 0; i < n; i++) { + pd[i].link = pollcache.first; + pollcache.first = &pd[i]; + } + } + pd = pollcache.first; + pollcache.first = pd->link; + runtime_unlock(&pollcache); + return pd; +} diff --git a/libgo/runtime/netpoll_epoll.c b/libgo/runtime/netpoll_epoll.c new file mode 100644 index 0000000..04f9c75 --- /dev/null +++ b/libgo/runtime/netpoll_epoll.c @@ -0,0 +1,154 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux + +#include <errno.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/epoll.h> + +#include "runtime.h" +#include "defs.h" + +#ifndef EPOLLRDHUP +#define EPOLLRDHUP 0x2000 +#endif + +#ifndef EPOLL_CLOEXEC +#define EPOLL_CLOEXEC 02000000 +#endif + +typedef struct epoll_event EpollEvent; + +static int32 +runtime_epollcreate(int32 size) +{ + int r; + + r = epoll_create(size); + if(r >= 0) + return r; + return - errno; +} + +static int32 +runtime_epollcreate1(int32 flags) +{ + int r; + + r = epoll_create1(flags); + if(r >= 0) + return r; + return - errno; +} + +static int32 +runtime_epollctl(int32 epfd, int32 op, int32 fd, EpollEvent *ev) +{ + int r; + + r = epoll_ctl(epfd, op, fd, ev); + if(r >= 0) + return r; + return - errno; +} + +static int32 +runtime_epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout) +{ + int r; + + r = epoll_wait(epfd, ev, nev, timeout); + if(r >= 0) + return r; + return - errno; +} + +static void +runtime_closeonexec(int32 fd) +{ + fcntl(fd, F_SETFD, FD_CLOEXEC); +} + +static int32 epfd = -1; // epoll descriptor + +void +runtime_netpollinit(void) +{ + epfd = runtime_epollcreate1(EPOLL_CLOEXEC); + if(epfd >= 0) + return; + epfd = runtime_epollcreate(1024); + if(epfd >= 0) { + runtime_closeonexec(epfd); + return; + } + runtime_printf("netpollinit: failed to create descriptor (%d)\n", -epfd); + runtime_throw("netpollinit: failed to create descriptor"); +} + +int32 +runtime_netpollopen(int32 fd, PollDesc *pd) +{ + EpollEvent ev; + int32 res; + + ev.events = EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET; + ev.data.ptr = (void*)pd; + res = runtime_epollctl(epfd, EPOLL_CTL_ADD, fd, &ev); + return -res; +} + +int32 +runtime_netpollclose(int32 fd) +{ + EpollEvent ev; + int32 res; + + res = runtime_epollctl(epfd, EPOLL_CTL_DEL, fd, &ev); + return -res; +} + +// polls for ready network connections +// returns list of goroutines that 
become runnable +G* +runtime_netpoll(bool block) +{ + static int32 lasterr; + EpollEvent events[128], *ev; + int32 n, i, waitms, mode; + G *gp; + + if(epfd == -1) + return nil; + waitms = -1; + if(!block) + waitms = 0; +retry: + n = runtime_epollwait(epfd, events, nelem(events), waitms); + if(n < 0) { + if(n != -EINTR && n != lasterr) { + lasterr = n; + runtime_printf("runtime: epollwait on fd %d failed with %d\n", epfd, -n); + } + goto retry; + } + gp = nil; + for(i = 0; i < n; i++) { + ev = &events[i]; + if(ev->events == 0) + continue; + mode = 0; + if(ev->events & (EPOLLIN|EPOLLRDHUP|EPOLLHUP|EPOLLERR)) + mode += 'r'; + if(ev->events & (EPOLLOUT|EPOLLHUP|EPOLLERR)) + mode += 'w'; + if(mode) + runtime_netpollready(&gp, (void*)ev->data.ptr, mode); + } + if(block && gp == nil) + goto retry; + return gp; +} diff --git a/libgo/runtime/netpoll_kqueue.c b/libgo/runtime/netpoll_kqueue.c new file mode 100644 index 0000000..9b79b20 --- /dev/null +++ b/libgo/runtime/netpoll_kqueue.c @@ -0,0 +1,108 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" + +// Integrated network poller (kqueue-based implementation). + +int32 runtime_kqueue(void); +int32 runtime_kevent(int32, Kevent*, int32, Kevent*, int32, Timespec*); +void runtime_closeonexec(int32); + +static int32 kq = -1; + +void +runtime_netpollinit(void) +{ + kq = runtime_kqueue(); + if(kq < 0) { + runtime_printf("netpollinit: kqueue failed with %d\n", -kq); + runtime_throw("netpollinit: kqueue failed"); + } + runtime_closeonexec(kq); +} + +int32 +runtime_netpollopen(int32 fd, PollDesc *pd) +{ + Kevent ev[2]; + int32 n; + + // Arm both EVFILT_READ and EVFILT_WRITE in edge-triggered mode (EV_CLEAR) + // for the whole fd lifetime. The notifications are automatically unregistered + // when fd is closed. + ev[0].ident = fd; + ev[0].filter = EVFILT_READ; + ev[0].flags = EV_ADD|EV_RECEIPT|EV_CLEAR; + ev[0].fflags = 0; + ev[0].data = 0; + ev[0].udata = (byte*)pd; + ev[1] = ev[0]; + ev[1].filter = EVFILT_WRITE; + n = runtime_kevent(kq, ev, 2, ev, 2, nil); + if(n < 0) + return -n; + if(n != 2 || + (ev[0].flags&EV_ERROR) == 0 || ev[0].ident != fd || ev[0].filter != EVFILT_READ || + (ev[1].flags&EV_ERROR) == 0 || ev[1].ident != fd || ev[1].filter != EVFILT_WRITE) + return EFAULT; // just to mark out from other errors + if(ev[0].data != 0) + return ev[0].data; + if(ev[1].data != 0) + return ev[1].data; + return 0; +} + +int32 +runtime_netpollclose(int32 fd) +{ + // Don't need to unregister because calling close() + // on fd will remove any kevents that reference the descriptor. + USED(fd); + return 0; +} + +// Polls for ready network connections. +// Returns list of goroutines that become runnable. 
+G* +runtime_netpoll(bool block) +{ + static int32 lasterr; + Kevent events[64], *ev; + Timespec ts, *tp; + int32 n, i; + G *gp; + + if(kq == -1) + return nil; + tp = nil; + if(!block) { + ts.tv_sec = 0; + ts.tv_nsec = 0; + tp = &ts; + } + gp = nil; +retry: + n = runtime_kevent(kq, nil, 0, events, nelem(events), tp); + if(n < 0) { + if(n != -EINTR && n != lasterr) { + lasterr = n; + runtime_printf("runtime: kevent on fd %d failed with %d\n", kq, -n); + } + goto retry; + } + for(i = 0; i < n; i++) { + ev = &events[i]; + if(ev->filter == EVFILT_READ) + runtime_netpollready(&gp, (PollDesc*)ev->udata, 'r'); + if(ev->filter == EVFILT_WRITE) + runtime_netpollready(&gp, (PollDesc*)ev->udata, 'w'); + } + if(block && gp == nil) + goto retry; + return gp; +} diff --git a/libgo/runtime/netpoll_stub.c b/libgo/runtime/netpoll_stub.c new file mode 100644 index 0000000..e28e38e --- /dev/null +++ b/libgo/runtime/netpoll_stub.c @@ -0,0 +1,18 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build freebsd netbsd openbsd plan9 windows + +#include "runtime.h" + +// Polls for ready network connections. +// Returns list of goroutines that become runnable. +G* +runtime_netpoll(bool block) +{ + // Implementation for platforms that do not support + // integrated network poller. + USED(block); + return nil; +} diff --git a/libgo/runtime/panic.c b/libgo/runtime/panic.c index 7b9b578..7d79256 100644 --- a/libgo/runtime/panic.c +++ b/libgo/runtime/panic.c @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "runtime.h" +#include "malloc.h" #include "go-defer.h" #include "go-panic.h" @@ -37,6 +38,11 @@ runtime_startpanic(void) M *m; m = runtime_m(); + if(runtime_mheap == 0 || runtime_mheap->cachealloc.size == 0) { // very early + runtime_printf("runtime: panic before malloc heap initialized\n"); + m->mallocing = 1; // tell rest of panic not to try to malloc + } else if(m->mcache == nil) // can happen if called from signal handler or throw + m->mcache = runtime_allocmcache(); if(m->dying) { runtime_printf("panic during panic\n"); runtime_exit(3); @@ -51,13 +57,14 @@ runtime_dopanic(int32 unused __attribute__ ((unused))) { G *g; static bool didothers; + bool crash; g = runtime_g(); if(g->sig != 0) runtime_printf("[signal %x code=%p addr=%p]\n", g->sig, (void*)g->sigcode0, (void*)g->sigcode1); - if(runtime_gotraceback()){ + if(runtime_gotraceback(&crash)){ if(g != runtime_m()->g0) { runtime_printf("\n"); runtime_goroutineheader(g); @@ -79,6 +86,9 @@ runtime_dopanic(int32 unused __attribute__ ((unused))) runtime_lock(&deadlock); runtime_lock(&deadlock); } + + if(crash) + runtime_crash(); runtime_exit(2); } diff --git a/libgo/runtime/parfor.c b/libgo/runtime/parfor.c index 65ca586..c0e40f5 100644 --- a/libgo/runtime/parfor.c +++ b/libgo/runtime/parfor.c @@ -49,6 +49,7 @@ void runtime_parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void (*body)(ParFor*, uint32)) { uint32 i, begin, end; + uint64 *pos; if(desc == nil || nthr == 0 || nthr > desc->nthrmax || body == nil) { runtime_printf("desc=%p nthr=%d count=%d body=%p\n", desc, nthr, n, body); @@ -70,7 +71,10 @@ runtime_parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, v for(i=0; i<nthr; i++) { begin = (uint64)n*i / nthr; end = (uint64)n*(i+1) / nthr; - desc->thr[i].pos = (uint64)begin | (((uint64)end)<<32); + pos = &desc->thr[i].pos; + if(((uintptr)pos & 7) != 0) + 
runtime_throw("parforsetup: pos is not aligned"); + *pos = (uint64)begin | (((uint64)end)<<32); } } @@ -152,7 +156,7 @@ runtime_parfordo(ParFor *desc) // See if it has any work. begin = (uint32)pos; end = (uint32)(pos>>32); - if(begin >= end-1) { + if(begin+1 >= end) { begin = end = 0; break; } diff --git a/libgo/runtime/print.c b/libgo/runtime/print.c index 9e0c45b..f5c6e82 100644 --- a/libgo/runtime/print.c +++ b/libgo/runtime/print.c @@ -88,6 +88,9 @@ go_vprintf(const char *s, va_list va) case 'a': runtime_printslice(va_arg(va, Slice)); break; + case 'c': + runtime_printbyte(va_arg(va, int32)); + break; case 'd': runtime_printint(va_arg(va, int32)); break; @@ -154,6 +157,12 @@ runtime_printbool(_Bool v) } void +runtime_printbyte(int8 c) +{ + gwrite(&c, 1); +} + +void runtime_printfloat(double v) { byte buf[20]; diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c index 9b563a5..9639922 100644 --- a/libgo/runtime/proc.c +++ b/libgo/runtime/proc.c @@ -56,15 +56,8 @@ extern void __splitstack_block_signals_context (void *context[10], int *, uintptr runtime_stacks_sys; -static void schedule(G*); - static void gtraceback(G*); -typedef struct Sched Sched; - -M runtime_m0; -G runtime_g0; // idle goroutine for m0 - #ifdef __rtems__ #define __thread #endif @@ -166,194 +159,61 @@ runtime_m(void) return m; } -int32 runtime_gcwaiting; - -G* runtime_allg; -G* runtime_lastg; -M* runtime_allm; - -int8* runtime_goos; -int32 runtime_ncpu; - -// The static TLS size. See runtime_newm. -static int tlssize; - -#ifdef HAVE_DL_ITERATE_PHDR - -// Called via dl_iterate_phdr. - -static int -addtls(struct dl_phdr_info* info, size_t size __attribute__ ((unused)), void *data) -{ - size_t *total = (size_t *)data; - unsigned int i; - - for(i = 0; i < info->dlpi_phnum; ++i) { - if(info->dlpi_phdr[i].p_type == PT_TLS) - *total += info->dlpi_phdr[i].p_memsz; - } - return 0; -} - -// Set the total TLS size. - -static void -inittlssize() +// Set m and g. +void +runtime_setmg(M* mp, G* gp) { - size_t total = 0; - - dl_iterate_phdr(addtls, (void *)&total); - tlssize = total; + m = mp; + g = gp; } -#else +// The static TLS size. See runtime_newm. +static int tlssize; +// Start a new thread. static void -inittlssize() +runtime_newosproc(M *mp) { -} - -#endif - -// Go scheduler -// -// The go scheduler's job is to match ready-to-run goroutines (`g's) -// with waiting-for-work schedulers (`m's). If there are ready g's -// and no waiting m's, ready() will start a new m running in a new -// OS thread, so that all ready g's can run simultaneously, up to a limit. -// For now, m's never go away. -// -// By default, Go keeps only one kernel thread (m) running user code -// at a single time; other threads may be blocked in the operating system. -// Setting the environment variable $GOMAXPROCS or calling -// runtime.GOMAXPROCS() will change the number of user threads -// allowed to execute simultaneously. $GOMAXPROCS is thus an -// approximation of the maximum number of cores to use. -// -// Even a program that can run without deadlock in a single process -// might use more m's if given the chance. For example, the prime -// sieve will use as many m's as there are primes (up to runtime_sched.mmax), -// allowing different stages of the pipeline to execute in parallel. -// We could revisit this choice, only kicking off new m's for blocking -// system calls, but that would limit the amount of parallel computation -// that go would try to do. 
-// -// In general, one could imagine all sorts of refinements to the -// scheduler, but the goal now is just to get something working on -// Linux and OS X. - -struct Sched { - Lock; - - G *gfree; // available g's (status == Gdead) - int64 goidgen; - - G *ghead; // g's waiting to run - G *gtail; - int32 gwait; // number of g's waiting to run - int32 gcount; // number of g's that are alive - int32 grunning; // number of g's running on cpu or in syscall - - M *mhead; // m's waiting for work - int32 mwait; // number of m's waiting for work - int32 mcount; // number of m's that have been created + pthread_attr_t attr; + size_t stacksize; + sigset_t clear, old; + pthread_t tid; + int ret; - volatile uint32 atomic; // atomic scheduling word (see below) + if(pthread_attr_init(&attr) != 0) + runtime_throw("pthread_attr_init"); + if(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0) + runtime_throw("pthread_attr_setdetachstate"); - int32 profilehz; // cpu profiling rate + stacksize = PTHREAD_STACK_MIN; - bool init; // running initialization - bool lockmain; // init called runtime.LockOSThread + // With glibc before version 2.16 the static TLS size is taken + // out of the stack size, and we get an error or a crash if + // there is not enough stack space left. Add it back in if we + // can, in case the program uses a lot of TLS space. FIXME: + // This can be disabled in glibc 2.16 and later, if the bug is + // indeed fixed then. + stacksize += tlssize; - Note stopped; // one g can set waitstop and wait here for m's to stop -}; + if(pthread_attr_setstacksize(&attr, stacksize) != 0) + runtime_throw("pthread_attr_setstacksize"); -// The atomic word in sched is an atomic uint32 that -// holds these fields. -// -// [15 bits] mcpu number of m's executing on cpu -// [15 bits] mcpumax max number of m's allowed on cpu -// [1 bit] waitstop some g is waiting on stopped -// [1 bit] gwaiting gwait != 0 -// -// These fields are the information needed by entersyscall -// and exitsyscall to decide whether to coordinate with the -// scheduler. Packing them into a single machine word lets -// them use a fast path with a single atomic read/write and -// no lock/unlock. This greatly reduces contention in -// syscall- or cgo-heavy multithreaded programs. -// -// Except for entersyscall and exitsyscall, the manipulations -// to these fields only happen while holding the schedlock, -// so the routines holding schedlock only need to worry about -// what entersyscall and exitsyscall do, not the other routines -// (which also use the schedlock). -// -// In particular, entersyscall and exitsyscall only read mcpumax, -// waitstop, and gwaiting. They never write them. Thus, writes to those -// fields can be done (holding schedlock) without fear of write conflicts. -// There may still be logic conflicts: for example, the set of waitstop must -// be conditioned on mcpu >= mcpumax or else the wait may be a -// spurious sleep. The Promela model in proc.p verifies these accesses. -enum { - mcpuWidth = 15, - mcpuMask = (1<<mcpuWidth) - 1, - mcpuShift = 0, - mcpumaxShift = mcpuShift + mcpuWidth, - waitstopShift = mcpumaxShift + mcpuWidth, - gwaitingShift = waitstopShift+1, - - // The max value of GOMAXPROCS is constrained - // by the max value we can store in the bit fields - // of the atomic word. Reserve a few high values - // so that we can detect accidental decrement - // beyond zero. 
- maxgomaxprocs = mcpuMask - 10, -}; + // Block signals during pthread_create so that the new thread + // starts with signals disabled. It will enable them in minit. + sigfillset(&clear); -#define atomic_mcpu(v) (((v)>>mcpuShift)&mcpuMask) -#define atomic_mcpumax(v) (((v)>>mcpumaxShift)&mcpuMask) -#define atomic_waitstop(v) (((v)>>waitstopShift)&1) -#define atomic_gwaiting(v) (((v)>>gwaitingShift)&1) - -Sched runtime_sched; -int32 runtime_gomaxprocs; -bool runtime_singleproc; - -static bool canaddmcpu(void); - -// An m that is waiting for notewakeup(&m->havenextg). This may -// only be accessed while the scheduler lock is held. This is used to -// minimize the number of times we call notewakeup while the scheduler -// lock is held, since the m will normally move quickly to lock the -// scheduler itself, producing lock contention. -static M* mwakeup; - -// Scheduling helpers. Sched must be locked. -static void gput(G*); // put/get on ghead/gtail -static G* gget(void); -static void mput(M*); // put/get on mhead -static M* mget(G*); -static void gfput(G*); // put/get on gfree -static G* gfget(void); -static void matchmg(void); // match m's to g's -static void readylocked(G*); // ready, but sched is locked -static void mnextg(M*, G*); -static void mcommoninit(M*); +#ifdef SIGTRAP + // Blocking SIGTRAP reportedly breaks gdb on Alpha GNU/Linux. + sigdelset(&clear, SIGTRAP); +#endif -void -setmcpumax(uint32 n) -{ - uint32 v, w; + sigemptyset(&old); + sigprocmask(SIG_BLOCK, &clear, &old); + ret = pthread_create(&tid, &attr, runtime_mstart, mp); + sigprocmask(SIG_SETMASK, &old, nil); - for(;;) { - v = runtime_sched.atomic; - w = v; - w &= ~(mcpuMask<<mcpumaxShift); - w |= n<<mcpumaxShift; - if(runtime_cas(&runtime_sched.atomic, v, w)) - break; - } + if (ret != 0) + runtime_throw("pthread_create"); } // First function run by a new goroutine. This replaces gogocall. @@ -449,8 +309,142 @@ runtime_mcall(void (*pfn)(G*)) } } -// Keep trace of scavenger's goroutine for deadlock detection. -static G *scvg; +#ifdef HAVE_DL_ITERATE_PHDR + +// Called via dl_iterate_phdr. + +static int +addtls(struct dl_phdr_info* info, size_t size __attribute__ ((unused)), void *data) +{ + size_t *total = (size_t *)data; + unsigned int i; + + for(i = 0; i < info->dlpi_phnum; ++i) { + if(info->dlpi_phdr[i].p_type == PT_TLS) + *total += info->dlpi_phdr[i].p_memsz; + } + return 0; +} + +// Set the total TLS size. + +static void +inittlssize() +{ + size_t total = 0; + + dl_iterate_phdr(addtls, (void *)&total); + tlssize = total; +} + +#else + +static void +inittlssize() +{ +} + +#endif + +// Goroutine scheduler +// The scheduler's job is to distribute ready-to-run goroutines over worker threads. +// +// The main concepts are: +// G - goroutine. +// M - worker thread, or machine. +// P - processor, a resource that is required to execute Go code. +// M must have an associated P to execute Go code, however it can be +// blocked or in a syscall w/o an associated P. +// +// Design doc at http://golang.org/s/go11sched. + +typedef struct Sched Sched; +struct Sched { + Lock; + + uint64 goidgen; + M* midle; // idle m's waiting for work + int32 nmidle; // number of idle m's waiting for work + int32 mlocked; // number of locked m's waiting for work + int32 mcount; // number of m's that have been created + + P* pidle; // idle P's + uint32 npidle; + uint32 nmspinning; + + // Global runnable queue. + G* runqhead; + G* runqtail; + int32 runqsize; + + // Global cache of dead G's. 
+ Lock gflock; + G* gfree; + + int32 stopwait; + Note stopnote; + uint32 sysmonwait; + Note sysmonnote; + uint64 lastpoll; + + int32 profilehz; // cpu profiling rate +}; + +// The max value of GOMAXPROCS. +// There are no fundamental restrictions on the value. +enum { MaxGomaxprocs = 1<<8 }; + +Sched runtime_sched; +int32 runtime_gomaxprocs; +bool runtime_singleproc; +bool runtime_iscgo; +uint32 runtime_gcwaiting; +M runtime_m0; +G runtime_g0; // idle goroutine for m0 +G* runtime_allg; +G* runtime_lastg; +M* runtime_allm; +P** runtime_allp; +M* runtime_extram; +int8* runtime_goos; +int32 runtime_ncpu; +static int32 newprocs; + +void* runtime_mstart(void*); +static void runqput(P*, G*); +static G* runqget(P*); +static void runqgrow(P*); +static G* runqsteal(P*, P*); +static void mput(M*); +static M* mget(void); +static void mcommoninit(M*); +static void schedule(void); +static void procresize(int32); +static void acquirep(P*); +static P* releasep(void); +static void newm(void(*)(void), P*); +static void stopm(void); +static void startm(P*, bool); +static void handoffp(P*); +static void wakep(void); +static void stoplockedm(void); +static void startlockedm(G*); +static void sysmon(void); +static uint32 retake(uint32*); +static void inclocked(int32); +static void checkdead(void); +static void exitsyscall0(G*); +static void park0(G*); +static void gosched0(G*); +static void goexit0(G*); +static void gfput(P*, G*); +static G* gfget(P*); +static void gfpurge(P*); +static void globrunqput(G*); +static G* globrunqget(P*); +static P* pidleget(void); +static void pidleput(P*); +static void injectglist(G*); // The bootstrap sequence is: // @@ -463,7 +457,7 @@ static G *scvg; void runtime_schedinit(void) { - int32 n; + int32 n, procs; const byte *p; m = &runtime_m0; @@ -476,6 +470,7 @@ runtime_schedinit(void) inittlssize(); m->nomemprof++; + runtime_mprofinit(); runtime_mallocinit(); mcommoninit(m); @@ -487,28 +482,20 @@ runtime_schedinit(void) // so that we don't need to call malloc when we crash. // runtime_findfunc(0); - runtime_gomaxprocs = 1; + runtime_sched.lastpoll = runtime_nanotime(); + procs = 1; p = runtime_getenv("GOMAXPROCS"); - if(p != nil && (n = runtime_atoi(p)) != 0) { - if(n > maxgomaxprocs) - n = maxgomaxprocs; - runtime_gomaxprocs = n; + if(p != nil && (n = runtime_atoi(p)) > 0) { + if(n > MaxGomaxprocs) + n = MaxGomaxprocs; + procs = n; } - // wait for the main goroutine to start before taking - // GOMAXPROCS into account. - setmcpumax(1); - runtime_singleproc = runtime_gomaxprocs == 1; - - canaddmcpu(); // mcpu++ to account for bootstrap m - m->helpgc = 1; // flag to tell schedule() to mcpu-- - runtime_sched.grunning++; + runtime_allp = runtime_malloc((MaxGomaxprocs+1)*sizeof(runtime_allp[0])); + procresize(procs); // Can not enable GC until all roots are registered. // mstats.enablegc = 1; m->nomemprof--; - - if(raceenabled) - runtime_raceinit(); } extern void main_init(void) __asm__ (GOSYM_PREFIX "__go_init_main"); @@ -516,70 +503,44 @@ extern void main_main(void) __asm__ (GOSYM_PREFIX "main.main"); // The main goroutine. void -runtime_main(void) +runtime_main(void* dummy __attribute__((unused))) { + newm(sysmon, nil); + // Lock the main goroutine onto this, the main OS thread, // during initialization. Most programs won't care, but a few // do require certain calls to be made by the main thread. // Those can arrange for main.main to run in the main thread // by calling runtime.LockOSThread during initialization // to preserve the lock. 
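The comment above is about keeping main.main on the main OS thread. In this patch that promise reduces to a pair of pointers, m->lockedg and g->lockedm, which schedule() later honours by handing a P straight to the locked M (see startlockedm/stoplockedm further down in the diff). A stand-alone sketch of that bookkeeping, with toy types rather than the runtime's M and G, and not code from this patch:

    /* Illustrative sketch only: what thread locking boils down to. */
    #include <stdio.h>
    #include <stddef.h>

    typedef struct ToyG ToyG;
    typedef struct ToyM ToyM;
    struct ToyM { ToyG *lockedg; };
    struct ToyG { ToyM *lockedm; };

    static void
    lock_thread(ToyM *m, ToyG *g)
    {
        m->lockedg = g;         /* this M may only run g ... */
        g->lockedm = m;         /* ... and g may only run on this M */
    }

    static int
    can_run(ToyM *m, ToyG *g)
    {
        /* a scheduler would hand g to g->lockedm instead of running it here */
        return g->lockedm == NULL || g->lockedm == m;
    }

    int
    main(void)
    {
        ToyM m0 = {NULL}, m1 = {NULL};
        ToyG main_g = {NULL};

        lock_thread(&m0, &main_g);
        printf("m0 can run main: %d\n", can_run(&m0, &main_g));  /* 1 */
        printf("m1 can run main: %d\n", can_run(&m1, &main_g));  /* 0 */
        return 0;
    }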
- runtime_LockOSThread(); - // From now on, newgoroutines may use non-main threads. - setmcpumax(runtime_gomaxprocs); - runtime_sched.init = true; - scvg = __go_go(runtime_MHeap_Scavenger, nil); - scvg->issystem = true; + runtime_lockOSThread(); + if(m != &runtime_m0) + runtime_throw("runtime_main not on m0"); + __go_go(runtime_MHeap_Scavenger, nil); main_init(); - runtime_sched.init = false; - if(!runtime_sched.lockmain) - runtime_UnlockOSThread(); + runtime_unlockOSThread(); // For gccgo we have to wait until after main is initialized // to enable GC, because initializing main registers the GC // roots. mstats.enablegc = 1; - // The deadlock detection has false negatives. - // Let scvg start up, to eliminate the false negative - // for the trivial program func main() { select{} }. - runtime_gosched(); - main_main(); if(raceenabled) runtime_racefini(); + + // Make racy client program work: if panicking on + // another goroutine at the same time as main returns, + // let the other goroutine finish printing the panic trace. + // Once it does, it will exit. See issue 3934. + if(runtime_panicking) + runtime_park(nil, nil, "panicwait"); + runtime_exit(0); for(;;) *(int32*)0 = 0; } -// Lock the scheduler. -static void -schedlock(void) -{ - runtime_lock(&runtime_sched); -} - -// Unlock the scheduler. -static void -schedunlock(void) -{ - M *mp; - - mp = mwakeup; - mwakeup = nil; - runtime_unlock(&runtime_sched); - if(mp != nil) - runtime_notewakeup(&mp->havenextg); -} - -void -runtime_goexit(void) -{ - g->status = Gmoribund; - runtime_gosched(); -} - void runtime_goroutineheader(G *gp) { @@ -604,9 +565,6 @@ runtime_goroutineheader(G *gp) else status = "waiting"; break; - case Gmoribund: - status = "moribund"; - break; default: status = "???"; break; @@ -644,7 +602,7 @@ runtime_tracebackothers(G * volatile me) int32 traceback; tb.gp = me; - traceback = runtime_gotraceback(); + traceback = runtime_gotraceback(nil); for(gp = runtime_allg; gp != nil; gp = gp->alllink) { if(gp == me || gp->status == Gdead) continue; @@ -698,28 +656,20 @@ gtraceback(G* gp) runtime_gogo(traceback->gp); } -// Mark this g as m's idle goroutine. -// This functionality might be used in environments where programs -// are limited to a single thread, to simulate a select-driven -// network server. It is not exposed via the standard runtime API. -void -runtime_idlegoroutine(void) -{ - if(g->idlem != nil) - runtime_throw("g is already an idle goroutine"); - g->idlem = m; -} - static void mcommoninit(M *mp) { - mp->id = runtime_sched.mcount++; + // If there is no mcache runtime_callers() will crash, + // and we are most likely in sysmon thread so the stack is senseless anyway. + if(m->mcache) + runtime_callers(1, mp->createstack, nelem(mp->createstack)); + mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks(); - if(mp->mcache == nil) - mp->mcache = runtime_allocmcache(); + runtime_lock(&runtime_sched); + mp->id = runtime_sched.mcount++; - runtime_callers(1, mp->createstack, nelem(mp->createstack)); + runtime_mpreinit(mp); // Add to runtime_allm so garbage collector doesn't free m // when it is just in a register or thread-local storage. @@ -727,324 +677,77 @@ mcommoninit(M *mp) // runtime_NumCgoCall() iterates over allm w/o schedlock, // so we need to publish it safely. runtime_atomicstorep(&runtime_allm, mp); + runtime_unlock(&runtime_sched); } -// Try to increment mcpu. Report whether succeeded. 
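mcommoninit above publishes the new M on runtime_allm with runtime_atomicstorep, because runtime_NumCgoCall walks allm without holding the scheduler lock. A minimal sketch of that publish pattern, using GCC's __atomic builtins in place of the runtime's own atomics; the Node type and names here are hypothetical, not patch code:

    /* Sketch: initialize a node fully, then publish it with a release
       store so lock-free readers always see a consistent list. */
    #include <stdio.h>

    typedef struct Node { int id; struct Node *next; } Node;

    static Node *allnodes;          /* stand-in for runtime_allm */

    static void
    publish(Node *n, int id)
    {
        n->id = id;                 /* initialize first ... */
        n->next = allnodes;
        __atomic_store_n(&allnodes, n, __ATOMIC_RELEASE);  /* ... then publish */
    }

    static int
    count(void)                     /* reader, no lock held */
    {
        Node *n;
        int c = 0;

        for(n = __atomic_load_n(&allnodes, __ATOMIC_ACQUIRE); n != NULL; n = n->next)
            c++;
        return c;
    }

    int
    main(void)
    {
        static Node a, b;

        publish(&a, 1);
        publish(&b, 2);
        printf("%d nodes\n", count());
        return 0;
    }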
-static bool -canaddmcpu(void) -{ - uint32 v; - - for(;;) { - v = runtime_sched.atomic; - if(atomic_mcpu(v) >= atomic_mcpumax(v)) - return 0; - if(runtime_cas(&runtime_sched.atomic, v, v+(1<<mcpuShift))) - return 1; - } -} - -// Put on `g' queue. Sched must be locked. -static void -gput(G *gp) -{ - M *mp; - - // If g is wired, hand it off directly. - if((mp = gp->lockedm) != nil && canaddmcpu()) { - mnextg(mp, gp); - return; - } - - // If g is the idle goroutine for an m, hand it off. - if(gp->idlem != nil) { - if(gp->idlem->idleg != nil) { - runtime_printf("m%d idle out of sync: g%D g%D\n", - gp->idlem->id, - gp->idlem->idleg->goid, gp->goid); - runtime_throw("runtime: double idle"); - } - gp->idlem->idleg = gp; - return; - } - - gp->schedlink = nil; - if(runtime_sched.ghead == nil) - runtime_sched.ghead = gp; - else - runtime_sched.gtail->schedlink = gp; - runtime_sched.gtail = gp; - - // increment gwait. - // if it transitions to nonzero, set atomic gwaiting bit. - if(runtime_sched.gwait++ == 0) - runtime_xadd(&runtime_sched.atomic, 1<<gwaitingShift); -} - -// Report whether gget would return something. -static bool -haveg(void) -{ - return runtime_sched.ghead != nil || m->idleg != nil; -} - -// Get from `g' queue. Sched must be locked. -static G* -gget(void) -{ - G *gp; - - gp = runtime_sched.ghead; - if(gp) { - runtime_sched.ghead = gp->schedlink; - if(runtime_sched.ghead == nil) - runtime_sched.gtail = nil; - // decrement gwait. - // if it transitions to zero, clear atomic gwaiting bit. - if(--runtime_sched.gwait == 0) - runtime_xadd(&runtime_sched.atomic, -1<<gwaitingShift); - } else if(m->idleg != nil) { - gp = m->idleg; - m->idleg = nil; - } - return gp; -} - -// Put on `m' list. Sched must be locked. -static void -mput(M *mp) -{ - mp->schedlink = runtime_sched.mhead; - runtime_sched.mhead = mp; - runtime_sched.mwait++; -} - -// Get an `m' to run `g'. Sched must be locked. -static M* -mget(G *gp) -{ - M *mp; - - // if g has its own m, use it. - if(gp && (mp = gp->lockedm) != nil) - return mp; - - // otherwise use general m pool. - if((mp = runtime_sched.mhead) != nil) { - runtime_sched.mhead = mp->schedlink; - runtime_sched.mwait--; - } - return mp; -} - -// Mark g ready to run. +// Mark gp ready to run. void runtime_ready(G *gp) { - schedlock(); - readylocked(gp); - schedunlock(); -} - -// Mark g ready to run. Sched is already locked. -// G might be running already and about to stop. -// The sched lock protects g->status from changing underfoot. -static void -readylocked(G *gp) -{ - if(gp->m) { - // Running on another machine. - // Ready it when it stops. - gp->readyonstop = 1; - return; - } - // Mark runnable. - if(gp->status == Grunnable || gp->status == Grunning) { + if(gp->status != Gwaiting) { runtime_printf("goroutine %D has status %d\n", gp->goid, gp->status); runtime_throw("bad g->status in ready"); } gp->status = Grunnable; - - gput(gp); - matchmg(); -} - -// Same as readylocked but a different symbol so that -// debuggers can set a breakpoint here and catch all -// new goroutines. -static void -newprocreadylocked(G *gp) -{ - readylocked(gp); -} - -// Pass g to m for running. -// Caller has already incremented mcpu. -static void -mnextg(M *mp, G *gp) -{ - runtime_sched.grunning++; - mp->nextg = gp; - if(mp->waitnextg) { - mp->waitnextg = 0; - if(mwakeup != nil) - runtime_notewakeup(&mwakeup->havenextg); - mwakeup = mp; - } -} - -// Get the next goroutine that m should run. -// Sched must be locked on entry, is unlocked on exit. 
-// Makes sure that at most $GOMAXPROCS g's are -// running on cpus (not in system calls) at any given time. -static G* -nextgandunlock(void) -{ - G *gp; - uint32 v; - -top: - if(atomic_mcpu(runtime_sched.atomic) >= maxgomaxprocs) - runtime_throw("negative mcpu"); - - // If there is a g waiting as m->nextg, the mcpu++ - // happened before it was passed to mnextg. - if(m->nextg != nil) { - gp = m->nextg; - m->nextg = nil; - schedunlock(); - return gp; - } - - if(m->lockedg != nil) { - // We can only run one g, and it's not available. - // Make sure some other cpu is running to handle - // the ordinary run queue. - if(runtime_sched.gwait != 0) { - matchmg(); - // m->lockedg might have been on the queue. - if(m->nextg != nil) { - gp = m->nextg; - m->nextg = nil; - schedunlock(); - return gp; - } - } - } else { - // Look for work on global queue. - while(haveg() && canaddmcpu()) { - gp = gget(); - if(gp == nil) - runtime_throw("gget inconsistency"); - - if(gp->lockedm) { - mnextg(gp->lockedm, gp); - continue; - } - runtime_sched.grunning++; - schedunlock(); - return gp; - } - - // The while loop ended either because the g queue is empty - // or because we have maxed out our m procs running go - // code (mcpu >= mcpumax). We need to check that - // concurrent actions by entersyscall/exitsyscall cannot - // invalidate the decision to end the loop. - // - // We hold the sched lock, so no one else is manipulating the - // g queue or changing mcpumax. Entersyscall can decrement - // mcpu, but if does so when there is something on the g queue, - // the gwait bit will be set, so entersyscall will take the slow path - // and use the sched lock. So it cannot invalidate our decision. - // - // Wait on global m queue. - mput(m); - } - - // Look for deadlock situation. - // There is a race with the scavenger that causes false negatives: - // if the scavenger is just starting, then we have - // scvg != nil && grunning == 0 && gwait == 0 - // and we do not detect a deadlock. It is possible that we should - // add that case to the if statement here, but it is too close to Go 1 - // to make such a subtle change. Instead, we work around the - // false negative in trivial programs by calling runtime.gosched - // from the main goroutine just before main.main. - // See runtime_main above. - // - // On a related note, it is also possible that the scvg == nil case is - // wrong and should include gwait, but that does not happen in - // standard Go programs, which all start the scavenger. - // - if((scvg == nil && runtime_sched.grunning == 0) || - (scvg != nil && runtime_sched.grunning == 1 && runtime_sched.gwait == 0 && - (scvg->status == Grunning || scvg->status == Gsyscall))) { - m->throwing = -1; // do not dump full stacks - runtime_throw("all goroutines are asleep - deadlock!"); - } - - m->nextg = nil; - m->waitnextg = 1; - runtime_noteclear(&m->havenextg); - - // Stoptheworld is waiting for all but its cpu to go to stop. - // Entersyscall might have decremented mcpu too, but if so - // it will see the waitstop and take the slow path. - // Exitsyscall never increments mcpu beyond mcpumax. 
- v = runtime_atomicload(&runtime_sched.atomic); - if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) { - // set waitstop = 0 (known to be 1) - runtime_xadd(&runtime_sched.atomic, -1<<waitstopShift); - runtime_notewakeup(&runtime_sched.stopped); - } - schedunlock(); - - runtime_notesleep(&m->havenextg); - if(m->helpgc) { - runtime_gchelper(); - m->helpgc = 0; - runtime_lock(&runtime_sched); - goto top; - } - if((gp = m->nextg) == nil) - runtime_throw("bad m->nextg in nextgoroutine"); - m->nextg = nil; - return gp; + runqput(m->p, gp); + if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0) // TODO: fast atomic + wakep(); } int32 runtime_gcprocs(void) { int32 n; - + // Figure out how many CPUs to use during GC. // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc. + runtime_lock(&runtime_sched); n = runtime_gomaxprocs; if(n > runtime_ncpu) n = runtime_ncpu > 0 ? runtime_ncpu : 1; if(n > MaxGcproc) n = MaxGcproc; - if(n > runtime_sched.mwait+1) // one M is currently running - n = runtime_sched.mwait+1; + if(n > runtime_sched.nmidle+1) // one M is currently running + n = runtime_sched.nmidle+1; + runtime_unlock(&runtime_sched); return n; } +static bool +needaddgcproc(void) +{ + int32 n; + + runtime_lock(&runtime_sched); + n = runtime_gomaxprocs; + if(n > runtime_ncpu) + n = runtime_ncpu; + if(n > MaxGcproc) + n = MaxGcproc; + n -= runtime_sched.nmidle+1; // one M is currently running + runtime_unlock(&runtime_sched); + return n > 0; +} + void runtime_helpgc(int32 nproc) { M *mp; - int32 n; + int32 n, pos; runtime_lock(&runtime_sched); - for(n = 1; n < nproc; n++) { // one M is currently running - mp = mget(nil); + pos = 0; + for(n = 1; n < nproc; n++) { // one M is currently running + if(runtime_allp[pos]->mcache == m->mcache) + pos++; + mp = mget(); if(mp == nil) runtime_throw("runtime_gcprocs inconsistency"); - mp->helpgc = 1; - mp->waitnextg = 0; - runtime_notewakeup(&mp->havenextg); + mp->helpgc = n; + mp->mcache = runtime_allp[pos]->mcache; + pos++; + runtime_notewakeup(&mp->park); } runtime_unlock(&runtime_sched); } @@ -1052,57 +755,104 @@ runtime_helpgc(int32 nproc) void runtime_stoptheworld(void) { - uint32 v; - - schedlock(); - runtime_gcwaiting = 1; - - setmcpumax(1); - - // while mcpu > 1 - for(;;) { - v = runtime_sched.atomic; - if(atomic_mcpu(v) <= 1) - break; - - // It would be unsafe for multiple threads to be using - // the stopped note at once, but there is only - // ever one thread doing garbage collection. - runtime_noteclear(&runtime_sched.stopped); - if(atomic_waitstop(v)) - runtime_throw("invalid waitstop"); + int32 i; + uint32 s; + P *p; + bool wait; - // atomic { waitstop = 1 }, predicated on mcpu <= 1 check above - // still being true. 
- if(!runtime_cas(&runtime_sched.atomic, v, v+(1<<waitstopShift))) - continue; + runtime_lock(&runtime_sched); + runtime_sched.stopwait = runtime_gomaxprocs; + runtime_atomicstore((uint32*)&runtime_gcwaiting, 1); + // stop current P + m->p->status = Pgcstop; + runtime_sched.stopwait--; + // try to retake all P's in Psyscall status + for(i = 0; i < runtime_gomaxprocs; i++) { + p = runtime_allp[i]; + s = p->status; + if(s == Psyscall && runtime_cas(&p->status, s, Pgcstop)) + runtime_sched.stopwait--; + } + // stop idle P's + while((p = pidleget()) != nil) { + p->status = Pgcstop; + runtime_sched.stopwait--; + } + wait = runtime_sched.stopwait > 0; + runtime_unlock(&runtime_sched); - schedunlock(); - runtime_notesleep(&runtime_sched.stopped); - schedlock(); + // wait for remaining P's to stop voluntary + if(wait) { + runtime_notesleep(&runtime_sched.stopnote); + runtime_noteclear(&runtime_sched.stopnote); + } + if(runtime_sched.stopwait) + runtime_throw("stoptheworld: not stopped"); + for(i = 0; i < runtime_gomaxprocs; i++) { + p = runtime_allp[i]; + if(p->status != Pgcstop) + runtime_throw("stoptheworld: not stopped"); } - runtime_singleproc = runtime_gomaxprocs == 1; - schedunlock(); +} + +static void +mhelpgc(void) +{ + m->helpgc = -1; } void runtime_starttheworld(void) { + P *p, *p1; M *mp; - int32 max; - - // Figure out how many CPUs GC could possibly use. - max = runtime_gomaxprocs; - if(max > runtime_ncpu) - max = runtime_ncpu > 0 ? runtime_ncpu : 1; - if(max > MaxGcproc) - max = MaxGcproc; - - schedlock(); + G *gp; + bool add; + + gp = runtime_netpoll(false); // non-blocking + injectglist(gp); + add = needaddgcproc(); + runtime_lock(&runtime_sched); + if(newprocs) { + procresize(newprocs); + newprocs = 0; + } else + procresize(runtime_gomaxprocs); runtime_gcwaiting = 0; - setmcpumax(runtime_gomaxprocs); - matchmg(); - if(runtime_gcprocs() < max && canaddmcpu()) { + + p1 = nil; + while((p = pidleget()) != nil) { + // procresize() puts p's with work at the beginning of the list. + // Once we reach a p without a run queue, the rest don't have one either. + if(p->runqhead == p->runqtail) { + pidleput(p); + break; + } + mp = mget(); + if(mp == nil) { + p->link = p1; + p1 = p; + continue; + } + if(mp->nextp) + runtime_throw("starttheworld: inconsistent mp->nextp"); + mp->nextp = p; + runtime_notewakeup(&mp->park); + } + if(runtime_sched.sysmonwait) { + runtime_sched.sysmonwait = false; + runtime_notewakeup(&runtime_sched.sysmonnote); + } + runtime_unlock(&runtime_sched); + + while(p1) { + p = p1; + p1 = p1->link; + add = false; + newm(nil, p); + } + + if(add) { // If GC could have used another helper proc, start one now, // in the hope that it will be available next time. // It would have been even better to start it before the collection, @@ -1110,17 +860,8 @@ runtime_starttheworld(void) // coordinate. This lazy approach works out in practice: // we don't mind if the first couple gc rounds don't have quite // the maximum number of procs. - // canaddmcpu above did mcpu++ - // (necessary, because m will be doing various - // initialization work so is definitely running), - // but m is not running a specific goroutine, - // so set the helpgc flag as a signal to m's - // first schedule(nil) to mcpu-- and grunning--. - mp = runtime_newm(); - mp->helpgc = 1; - runtime_sched.grunning++; + newm(mhelpgc, nil); } - schedunlock(); } // Called to start an M. 
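The new runtime_stoptheworld above counts the P's that still have to reach a safe point in stopwait and sleeps on stopnote until the last one checks in; starttheworld then reverses the process. The same countdown-and-notify handshake, modelled here with plain pthreads rather than the runtime's Lock/Note primitives (an illustrative sketch, not code from this patch):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t sched = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t stopnote = PTHREAD_COND_INITIALIZER;
    static int stopwait;

    /* Each worker calls this when it reaches a safe point. */
    static void*
    worker_stops(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&sched);
        if(--stopwait == 0)
            pthread_cond_signal(&stopnote);   /* last one in wakes the stopper */
        pthread_mutex_unlock(&sched);
        return NULL;
    }

    int
    main(void)
    {
        pthread_t tids[3];
        int i;

        stopwait = 3;
        for(i = 0; i < 3; i++)
            pthread_create(&tids[i], NULL, worker_stops, NULL);

        pthread_mutex_lock(&sched);
        while(stopwait > 0)                   /* wait for remaining workers */
            pthread_cond_wait(&stopnote, &sched);
        pthread_mutex_unlock(&sched);
        printf("world stopped\n");

        for(i = 0; i < 3; i++)
            pthread_join(tids[i], NULL);
        return 0;
    }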
@@ -1167,10 +908,23 @@ runtime_mstart(void* mp) // Install signal handlers; after minit so that minit can // prepare the thread to be able to handle the signals. - if(m == &runtime_m0) + if(m == &runtime_m0) { runtime_initsig(); + if(runtime_iscgo) + runtime_newextram(); + } + + if(m->mstartfn) + m->mstartfn(); - schedule(nil); + if(m->helpgc) { + m->helpgc = 0; + stopm(); + } else if(m != &runtime_m0) { + acquirep(m->nextp); + m->nextp = nil; + } + schedule(); // TODO(brainman): This point is never reached, because scheduler // does not release os threads at the moment. But once this path @@ -1187,43 +941,17 @@ struct CgoThreadStart void (*fn)(void); }; -// Kick off new m's as needed (up to mcpumax). -// Sched is locked. -static void -matchmg(void) -{ - G *gp; - M *mp; - - if(m->mallocing || m->gcing) - return; - - while(haveg() && canaddmcpu()) { - gp = gget(); - if(gp == nil) - runtime_throw("gget inconsistency"); - - // Find the m that will run gp. - if((mp = mget(gp)) == nil) - mp = runtime_newm(); - mnextg(mp, gp); - } -} - -// Create a new m. It will start off with a call to runtime_mstart. +// Allocate a new m unassociated with any thread. +// Can use p for allocation context if needed. M* -runtime_newm(void) +runtime_allocm(P *p) { M *mp; - pthread_attr_t attr; - pthread_t tid; - size_t stacksize; - sigset_t clear; - sigset_t old; - int ret; + m->locks++; // disable GC because it can be called from sysmon + if(m->p == nil) + acquirep(p); // temporarily borrow p for mallocs in this function #if 0 - static const Type *mtype; // The Go type M if(mtype == nil) { Eface e; runtime_gc_m_ptr(&e); @@ -1235,112 +963,418 @@ runtime_newm(void) mcommoninit(mp); mp->g0 = runtime_malg(-1, nil, nil); - if(pthread_attr_init(&attr) != 0) - runtime_throw("pthread_attr_init"); - if(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0) - runtime_throw("pthread_attr_setdetachstate"); + if(p == m->p) + releasep(); + m->locks--; - stacksize = PTHREAD_STACK_MIN; + return mp; +} - // With glibc before version 2.16 the static TLS size is taken - // out of the stack size, and we get an error or a crash if - // there is not enough stack space left. Add it back in if we - // can, in case the program uses a lot of TLS space. FIXME: - // This can be disabled in glibc 2.16 and later, if the bug is - // indeed fixed then. - stacksize += tlssize; +static M* lockextra(bool nilokay); +static void unlockextra(M*); - if(pthread_attr_setstacksize(&attr, stacksize) != 0) - runtime_throw("pthread_attr_setstacksize"); +// needm is called when a cgo callback happens on a +// thread without an m (a thread not created by Go). +// In this case, needm is expected to find an m to use +// and return with m, g initialized correctly. +// Since m and g are not set now (likely nil, but see below) +// needm is limited in what routines it can call. In particular +// it can only call nosplit functions (textflag 7) and cannot +// do any scheduling that requires an m. +// +// In order to avoid needing heavy lifting here, we adopt +// the following strategy: there is a stack of available m's +// that can be stolen. Using compare-and-swap +// to pop from the stack has ABA races, so we simulate +// a lock by doing an exchange (via casp) to steal the stack +// head and replace the top pointer with MLOCKED (1). +// This serves as a simple spin lock that we can use even +// without an m. The thread that locks the stack in this way +// unlocks the stack by storing a valid stack head pointer. 
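The locking scheme described above, treating the extra-M list head as a spin lock by swapping in a sentinel pointer, can be sketched in isolation. The following is not the patch's lockextra/unlockextra, just a toy list using GCC's compare-and-swap builtin and a LOCKED sentinel:

    #include <stdio.h>
    #include <sched.h>

    typedef struct Item { int id; struct Item *next; } Item;

    #define LOCKED ((Item*)1)

    static Item *head;

    static Item*
    list_lock(void)
    {
        Item *h;

        for(;;) {
            h = __atomic_load_n(&head, __ATOMIC_ACQUIRE);
            if(h == LOCKED) {       /* someone else holds the "lock" */
                sched_yield();
                continue;
            }
            if(__atomic_compare_exchange_n(&head, &h, LOCKED, 0,
                                           __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE))
                return h;           /* list is now locked; h is the old head */
        }
    }

    static void
    list_unlock(Item *newhead)
    {
        __atomic_store_n(&head, newhead, __ATOMIC_RELEASE);
    }

    int
    main(void)
    {
        static Item a = { 42, NULL };
        Item *h;

        list_unlock(&a);            /* seed the list */
        h = list_lock();            /* take the head, leaving the list locked */
        list_unlock(h ? h->next : NULL);
        printf("popped %d\n", h ? h->id : -1);
        return 0;
    }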
+// +// In order to make sure that there is always an m structure +// available to be stolen, we maintain the invariant that there +// is always one more than needed. At the beginning of the +// program (if cgo is in use) the list is seeded with a single m. +// If needm finds that it has taken the last m off the list, its job +// is - once it has installed its own m so that it can do things like +// allocate memory - to create a spare m and put it on the list. +// +// Each of these extra m's also has a g0 and a curg that are +// pressed into service as the scheduling stack and current +// goroutine for the duration of the cgo callback. +// +// When the callback is done with the m, it calls dropm to +// put the m back on the list. +void +runtime_needm(void) +{ + M *mp; - // Block signals during pthread_create so that the new thread - // starts with signals disabled. It will enable them in minit. - sigfillset(&clear); + // Lock extra list, take head, unlock popped list. + // nilokay=false is safe here because of the invariant above, + // that the extra list always contains or will soon contain + // at least one m. + mp = lockextra(false); + + // Set needextram when we've just emptied the list, + // so that the eventual call into cgocallbackg will + // allocate a new m for the extra list. We delay the + // allocation until then so that it can be done + // after exitsyscall makes sure it is okay to be + // running at all (that is, there's no garbage collection + // running right now). + mp->needextram = mp->schedlink == nil; + unlockextra(mp->schedlink); + + // Install m and g (= m->g0) and set the stack bounds + // to match the current stack. We don't actually know + // how big the stack is, like we don't know how big any + // scheduling stack is, but we assume there's at least 32 kB, + // which is more than enough for us. + runtime_setmg(mp, mp->g0); + + // We assume that the split stack support has been initialized + // for this new thread. + + // Initialize this thread to use the m. + runtime_minit(); +} -#ifdef SIGTRAP - // Blocking SIGTRAP reportedly breaks gdb on Alpha GNU/Linux. - sigdelset(&clear, SIGTRAP); -#endif +// newextram allocates an m and puts it on the extra list. +// It is called with a working local m, so that it can do things +// like call schedlock and allocate. +void +runtime_newextram(void) +{ + M *mp, *mnext; + G *gp; - sigemptyset(&old); - sigprocmask(SIG_BLOCK, &clear, &old); - ret = pthread_create(&tid, &attr, runtime_mstart, mp); - sigprocmask(SIG_SETMASK, &old, nil); + // Create extra goroutine locked to extra m. + // The goroutine is the context in which the cgo callback will run. + // The sched.pc will never be returned to, but setting it to + // runtime.goexit makes clear to the traceback routines where + // the goroutine stack ends. + mp = runtime_allocm(nil); + gp = runtime_malg(StackMin, nil, nil); + gp->status = Gsyscall; + mp->curg = gp; + mp->locked = LockInternal; + mp->lockedg = gp; + gp->lockedm = mp; + // put on allg for garbage collector + runtime_lock(&runtime_sched); + if(runtime_lastg == nil) + runtime_allg = gp; + else + runtime_lastg->alllink = gp; + runtime_lastg = gp; + runtime_unlock(&runtime_sched); + gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1); - if (ret != 0) - runtime_throw("pthread_create"); + // Add m to the extra list. 
+ mnext = lockextra(true); + mp->schedlink = mnext; + unlockextra(mp); +} - return mp; +// dropm is called when a cgo callback has called needm but is now +// done with the callback and returning back into the non-Go thread. +// It puts the current m back onto the extra list. +// +// The main expense here is the call to signalstack to release the +// m's signal stack, and then the call to needm on the next callback +// from this thread. It is tempting to try to save the m for next time, +// which would eliminate both these costs, but there might not be +// a next time: the current thread (which Go does not control) might exit. +// If we saved the m for that thread, there would be an m leak each time +// such a thread exited. Instead, we acquire and release an m on each +// call. These should typically not be scheduling operations, just a few +// atomics, so the cost should be small. +// +// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread +// variable using pthread_key_create. Unlike the pthread keys we already use +// on OS X, this dummy key would never be read by Go code. It would exist +// only so that we could register at thread-exit-time destructor. +// That destructor would put the m back onto the extra list. +// This is purely a performance optimization. The current version, +// in which dropm happens on each cgo call, is still correct too. +// We may have to keep the current version on systems with cgo +// but without pthreads, like Windows. +void +runtime_dropm(void) +{ + M *mp, *mnext; + + // Undo whatever initialization minit did during needm. + runtime_unminit(); + + // Clear m and g, and return m to the extra list. + // After the call to setmg we can only call nosplit functions. + mp = m; + runtime_setmg(nil, nil); + + mnext = lockextra(true); + mp->schedlink = mnext; + unlockextra(mp); } -// One round of scheduler: find a goroutine and run it. -// The argument is the goroutine that was running before -// schedule was called, or nil if this is the first call. -// Never returns. -static void -schedule(G *gp) +#define MLOCKED ((M*)1) + +// lockextra locks the extra list and returns the list head. +// The caller must unlock the list by storing a new list head +// to runtime.extram. If nilokay is true, then lockextra will +// return a nil list head if that's what it finds. If nilokay is false, +// lockextra will keep waiting until the list head is no longer nil. +static M* +lockextra(bool nilokay) { - int32 hz; - uint32 v; + M *mp; + void (*yield)(void); - schedlock(); - if(gp != nil) { - // Just finished running gp. - gp->m = nil; - runtime_sched.grunning--; - - // atomic { mcpu-- } - v = runtime_xadd(&runtime_sched.atomic, -1<<mcpuShift); - if(atomic_mcpu(v) > maxgomaxprocs) - runtime_throw("negative mcpu in scheduler"); - - switch(gp->status) { - case Grunnable: - case Gdead: - // Shouldn't have been running! 
- runtime_throw("bad gp->status in sched"); - case Grunning: - gp->status = Grunnable; - gput(gp); - break; - case Gmoribund: - if(raceenabled) - runtime_racegoend(gp->goid); - gp->status = Gdead; - if(gp->lockedm) { - gp->lockedm = nil; - m->lockedg = nil; - } - gp->idlem = nil; - runtime_memclr(&gp->context, sizeof gp->context); - gfput(gp); - if(--runtime_sched.gcount == 0) - runtime_exit(0); - break; + for(;;) { + mp = runtime_atomicloadp(&runtime_extram); + if(mp == MLOCKED) { + yield = runtime_osyield; + yield(); + continue; } - if(gp->readyonstop) { - gp->readyonstop = 0; - readylocked(gp); + if(mp == nil && !nilokay) { + runtime_usleep(1); + continue; } - } else if(m->helpgc) { - // Bootstrap m or new m started by starttheworld. - // atomic { mcpu-- } - v = runtime_xadd(&runtime_sched.atomic, -1<<mcpuShift); - if(atomic_mcpu(v) > maxgomaxprocs) - runtime_throw("negative mcpu in scheduler"); - // Compensate for increment in starttheworld(). - runtime_sched.grunning--; + if(!runtime_casp(&runtime_extram, mp, MLOCKED)) { + yield = runtime_osyield; + yield(); + continue; + } + break; + } + return mp; +} + +static void +unlockextra(M *mp) +{ + runtime_atomicstorep(&runtime_extram, mp); +} + + +// Create a new m. It will start off with a call to fn, or else the scheduler. +static void +newm(void(*fn)(void), P *p) +{ + M *mp; + + mp = runtime_allocm(p); + mp->nextp = p; + mp->mstartfn = fn; + + runtime_newosproc(mp); +} + +// Stops execution of the current m until new work is available. +// Returns with acquired P. +static void +stopm(void) +{ + if(m->locks) + runtime_throw("stopm holding locks"); + if(m->p) + runtime_throw("stopm holding p"); + if(m->spinning) { + m->spinning = false; + runtime_xadd(&runtime_sched.nmspinning, -1); + } + +retry: + runtime_lock(&runtime_sched); + mput(m); + runtime_unlock(&runtime_sched); + runtime_notesleep(&m->park); + runtime_noteclear(&m->park); + if(m->helpgc) { + runtime_gchelper(); m->helpgc = 0; - } else if(m->nextg != nil) { - // New m started by matchmg. - } else { - runtime_throw("invalid m state in scheduler"); + m->mcache = nil; + goto retry; } + acquirep(m->nextp); + m->nextp = nil; +} + +static void +mspinning(void) +{ + m->spinning = true; +} + +// Schedules some M to run the p (creates an M if necessary). +// If p==nil, tries to get an idle P, if no idle P's returns false. +static void +startm(P *p, bool spinning) +{ + M *mp; + void (*fn)(void); - // Find (or wait for) g to run. Unlocks runtime_sched. - gp = nextgandunlock(); - gp->readyonstop = 0; + runtime_lock(&runtime_sched); + if(p == nil) { + p = pidleget(); + if(p == nil) { + runtime_unlock(&runtime_sched); + if(spinning) + runtime_xadd(&runtime_sched.nmspinning, -1); + return; + } + } + mp = mget(); + runtime_unlock(&runtime_sched); + if(mp == nil) { + fn = nil; + if(spinning) + fn = mspinning; + newm(fn, p); + return; + } + if(mp->spinning) + runtime_throw("startm: m is spinning"); + if(mp->nextp) + runtime_throw("startm: m has p"); + mp->spinning = spinning; + mp->nextp = p; + runtime_notewakeup(&mp->park); +} + +// Hands off P from syscall or locked M. 
+static void +handoffp(P *p) +{ + // if it has local work, start it straight away + if(p->runqhead != p->runqtail || runtime_sched.runqsize) { + startm(p, false); + return; + } + // no local work, check that there are no spinning/idle M's, + // otherwise our help is not required + if(runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) == 0 && // TODO: fast atomic + runtime_cas(&runtime_sched.nmspinning, 0, 1)) { + startm(p, true); + return; + } + runtime_lock(&runtime_sched); + if(runtime_gcwaiting) { + p->status = Pgcstop; + if(--runtime_sched.stopwait == 0) + runtime_notewakeup(&runtime_sched.stopnote); + runtime_unlock(&runtime_sched); + return; + } + if(runtime_sched.runqsize) { + runtime_unlock(&runtime_sched); + startm(p, false); + return; + } + // If this is the last running P and nobody is polling network, + // need to wakeup another M to poll network. + if(runtime_sched.npidle == (uint32)runtime_gomaxprocs-1 && runtime_atomicload64(&runtime_sched.lastpoll) != 0) { + runtime_unlock(&runtime_sched); + startm(p, false); + return; + } + pidleput(p); + runtime_unlock(&runtime_sched); +} + +// Tries to add one more P to execute G's. +// Called when a G is made runnable (newproc, ready). +static void +wakep(void) +{ + // be conservative about spinning threads + if(!runtime_cas(&runtime_sched.nmspinning, 0, 1)) + return; + startm(nil, true); +} + +// Stops execution of the current m that is locked to a g until the g is runnable again. +// Returns with acquired P. +static void +stoplockedm(void) +{ + P *p; + + if(m->lockedg == nil || m->lockedg->lockedm != m) + runtime_throw("stoplockedm: inconsistent locking"); + if(m->p) { + // Schedule another M to run this p. + p = releasep(); + handoffp(p); + } + inclocked(1); + // Wait until another thread schedules lockedg again. + runtime_notesleep(&m->park); + runtime_noteclear(&m->park); + if(m->lockedg->status != Grunnable) + runtime_throw("stoplockedm: not runnable"); + acquirep(m->nextp); + m->nextp = nil; +} + +// Schedules the locked m to run the locked gp. +static void +startlockedm(G *gp) +{ + M *mp; + P *p; + + mp = gp->lockedm; + if(mp == m) + runtime_throw("startlockedm: locked to me"); + if(mp->nextp) + runtime_throw("startlockedm: m has p"); + // directly handoff current P to the locked m + inclocked(-1); + p = releasep(); + mp->nextp = p; + runtime_notewakeup(&mp->park); + stopm(); +} + +// Stops the current m for stoptheworld. +// Returns when the world is restarted. +static void +gcstopm(void) +{ + P *p; + + if(!runtime_gcwaiting) + runtime_throw("gcstopm: not waiting for gc"); + if(m->spinning) { + m->spinning = false; + runtime_xadd(&runtime_sched.nmspinning, -1); + } + p = releasep(); + runtime_lock(&runtime_sched); + p->status = Pgcstop; + if(--runtime_sched.stopwait == 0) + runtime_notewakeup(&runtime_sched.stopnote); + runtime_unlock(&runtime_sched); + stopm(); +} + +// Schedules gp to run on the current M. +// Never returns. +static void +execute(G *gp) +{ + int32 hz; + + if(gp->status != Grunnable) { + runtime_printf("execute: bad g status %d\n", gp->status); + runtime_throw("execute: bad g status"); + } gp->status = Grunning; + m->p->tick++; m->curg = gp; gp->m = m; @@ -1352,30 +1386,261 @@ schedule(G *gp) runtime_gogo(gp); } -// Enter scheduler. If g->status is Grunning, -// re-queues g and runs everyone else who is waiting -// before running g again. If g->status is Gmoribund, -// kills off g. -void -runtime_gosched(void) +// Finds a runnable goroutine to execute. 
+// Tries to steal from other P's, get g from global queue, poll network. +static G* +findrunnable(void) +{ + G *gp; + P *p; + int32 i; + +top: + if(runtime_gcwaiting) { + gcstopm(); + goto top; + } + // local runq + gp = runqget(m->p); + if(gp) + return gp; + // global runq + if(runtime_sched.runqsize) { + runtime_lock(&runtime_sched); + gp = globrunqget(m->p); + runtime_unlock(&runtime_sched); + if(gp) + return gp; + } + // poll network + gp = runtime_netpoll(false); // non-blocking + if(gp) { + injectglist(gp->schedlink); + gp->status = Grunnable; + return gp; + } + // If number of spinning M's >= number of busy P's, block. + // This is necessary to prevent excessive CPU consumption + // when GOMAXPROCS>>1 but the program parallelism is low. + if(!m->spinning && 2 * runtime_atomicload(&runtime_sched.nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched.npidle)) // TODO: fast atomic + goto stop; + if(!m->spinning) { + m->spinning = true; + runtime_xadd(&runtime_sched.nmspinning, 1); + } + // random steal from other P's + for(i = 0; i < 2*runtime_gomaxprocs; i++) { + if(runtime_gcwaiting) + goto top; + p = runtime_allp[runtime_fastrand1()%runtime_gomaxprocs]; + if(p == m->p) + gp = runqget(p); + else + gp = runqsteal(m->p, p); + if(gp) + return gp; + } +stop: + // return P and block + runtime_lock(&runtime_sched); + if(runtime_gcwaiting) { + runtime_unlock(&runtime_sched); + goto top; + } + if(runtime_sched.runqsize) { + gp = globrunqget(m->p); + runtime_unlock(&runtime_sched); + return gp; + } + p = releasep(); + pidleput(p); + runtime_unlock(&runtime_sched); + if(m->spinning) { + m->spinning = false; + runtime_xadd(&runtime_sched.nmspinning, -1); + } + // check all runqueues once again + for(i = 0; i < runtime_gomaxprocs; i++) { + p = runtime_allp[i]; + if(p && p->runqhead != p->runqtail) { + runtime_lock(&runtime_sched); + p = pidleget(); + runtime_unlock(&runtime_sched); + if(p) { + acquirep(p); + goto top; + } + break; + } + } + // poll network + if(runtime_xchg64(&runtime_sched.lastpoll, 0) != 0) { + if(m->p) + runtime_throw("findrunnable: netpoll with p"); + if(m->spinning) + runtime_throw("findrunnable: netpoll with spinning"); + gp = runtime_netpoll(true); // block until new work is available + runtime_atomicstore64(&runtime_sched.lastpoll, runtime_nanotime()); + if(gp) { + runtime_lock(&runtime_sched); + p = pidleget(); + runtime_unlock(&runtime_sched); + if(p) { + acquirep(p); + injectglist(gp->schedlink); + gp->status = Grunnable; + return gp; + } + injectglist(gp); + } + } + stopm(); + goto top; +} + +// Injects the list of runnable G's into the scheduler. +// Can run concurrently with GC. +static void +injectglist(G *glist) { - if(m->locks != 0) - runtime_throw("gosched holding locks"); - if(g == m->g0) - runtime_throw("gosched of g0"); - runtime_mcall(schedule); + int32 n; + G *gp; + + if(glist == nil) + return; + runtime_lock(&runtime_sched); + for(n = 0; glist; n++) { + gp = glist; + glist = gp->schedlink; + gp->status = Grunnable; + globrunqput(gp); + } + runtime_unlock(&runtime_sched); + + for(; n && runtime_sched.npidle; n--) + startm(nil, false); +} + +// One round of scheduler: find a runnable goroutine and execute it. +// Never returns. 
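findrunnable above declines to start spinning once 2*nmspinning >= gomaxprocs - npidle, that is, when the M's already spinning cover at least half of the busy P's. A quick numeric check of that condition; a sketch only, with stand-in names:

    #include <stdio.h>

    static int
    should_block(int nmspinning, int gomaxprocs, int npidle)
    {
        /* busy P's = gomaxprocs - npidle; block if spinners already cover half */
        return 2*nmspinning >= gomaxprocs - npidle;
    }

    int
    main(void)
    {
        /* 8 P's, 2 idle, so 6 busy; 3 spinning M's already cover them. */
        printf("%d\n", should_block(3, 8, 2));   /* prints 1: block */
        printf("%d\n", should_block(1, 8, 2));   /* prints 0: start spinning */
        return 0;
    }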
+static void +schedule(void) +{ + G *gp; + + if(m->locks) + runtime_throw("schedule: holding locks"); + +top: + if(runtime_gcwaiting) { + gcstopm(); + goto top; + } + + gp = runqget(m->p); + if(gp == nil) + gp = findrunnable(); + + if(m->spinning) { + m->spinning = false; + runtime_xadd(&runtime_sched.nmspinning, -1); + } + + // M wakeup policy is deliberately somewhat conservative (see nmspinning handling), + // so see if we need to wakeup another M here. + if (m->p->runqhead != m->p->runqtail && + runtime_atomicload(&runtime_sched.nmspinning) == 0 && + runtime_atomicload(&runtime_sched.npidle) > 0) // TODO: fast atomic + wakep(); + + if(gp->lockedm) { + startlockedm(gp); + goto top; + } + + execute(gp); } // Puts the current goroutine into a waiting state and unlocks the lock. // The goroutine can be made runnable again by calling runtime_ready(gp). void -runtime_park(void (*unlockf)(Lock*), Lock *lock, const char *reason) +runtime_park(void(*unlockf)(Lock*), Lock *lock, const char *reason) { - g->status = Gwaiting; + m->waitlock = lock; + m->waitunlockf = unlockf; g->waitreason = reason; - if(unlockf) - unlockf(lock); - runtime_gosched(); + runtime_mcall(park0); +} + +// runtime_park continuation on g0. +static void +park0(G *gp) +{ + gp->status = Gwaiting; + gp->m = nil; + m->curg = nil; + if(m->waitunlockf) { + m->waitunlockf(m->waitlock); + m->waitunlockf = nil; + m->waitlock = nil; + } + if(m->lockedg) { + stoplockedm(); + execute(gp); // Never returns. + } + schedule(); +} + +// Scheduler yield. +void +runtime_gosched(void) +{ + runtime_mcall(gosched0); +} + +// runtime_gosched continuation on g0. +static void +gosched0(G *gp) +{ + gp->status = Grunnable; + gp->m = nil; + m->curg = nil; + runtime_lock(&runtime_sched); + globrunqput(gp); + runtime_unlock(&runtime_sched); + if(m->lockedg) { + stoplockedm(); + execute(gp); // Never returns. + } + schedule(); +} + +// Finishes execution of the current goroutine. +void +runtime_goexit(void) +{ + if(raceenabled) + runtime_racegoend(); + runtime_mcall(goexit0); +} + +// runtime_goexit continuation on g0. +static void +goexit0(G *gp) +{ + gp->status = Gdead; + gp->entry = nil; + gp->m = nil; + gp->lockedm = nil; + m->curg = nil; + m->lockedg = nil; + if(m->locked & ~LockExternal) { + runtime_printf("invalid m->locked = %d", m->locked); + runtime_throw("internal lockOSThread error"); + } + m->locked = 0; + gfput(m->p, gp); + schedule(); } // The goroutine g is about to enter a system call. @@ -1386,17 +1651,12 @@ runtime_park(void (*unlockf)(Lock*), Lock *lock, const char *reason) // Entersyscall cannot split the stack: the runtime_gosave must // make g->sched refer to the caller's stack segment, because // entersyscall is going to return immediately after. -// It's okay to call matchmg and notewakeup even after -// decrementing mcpu, because we haven't released the -// sched lock yet, so the garbage collector cannot be running. void runtime_entersyscall(void) __attribute__ ((no_split_stack)); void -runtime_entersyscall(void) +runtime_entersyscall() { - uint32 v; - if(m->profilehz > 0) runtime_setprof(false); @@ -1415,30 +1675,57 @@ runtime_entersyscall(void) g->status = Gsyscall; - // Fast path. - // The slow path inside the schedlock/schedunlock will get - // through without stopping if it does: - // mcpu-- - // gwait not true - // waitstop && mcpu <= mcpumax not true - // If we can do the same with a single atomic add, - // then we can skip the locks. 
- v = runtime_xadd(&runtime_sched.atomic, -1<<mcpuShift); - if(!atomic_gwaiting(v) && (!atomic_waitstop(v) || atomic_mcpu(v) > atomic_mcpumax(v))) - return; - - schedlock(); - v = runtime_atomicload(&runtime_sched.atomic); - if(atomic_gwaiting(v)) { - matchmg(); - v = runtime_atomicload(&runtime_sched.atomic); + if(runtime_atomicload(&runtime_sched.sysmonwait)) { // TODO: fast atomic + runtime_lock(&runtime_sched); + if(runtime_atomicload(&runtime_sched.sysmonwait)) { + runtime_atomicstore(&runtime_sched.sysmonwait, 0); + runtime_notewakeup(&runtime_sched.sysmonnote); + } + runtime_unlock(&runtime_sched); } - if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) { - runtime_xadd(&runtime_sched.atomic, -1<<waitstopShift); - runtime_notewakeup(&runtime_sched.stopped); + + m->mcache = nil; + m->p->tick++; + m->p->m = nil; + runtime_atomicstore(&m->p->status, Psyscall); + if(runtime_gcwaiting) { + runtime_lock(&runtime_sched); + if (runtime_sched.stopwait > 0 && runtime_cas(&m->p->status, Psyscall, Pgcstop)) { + if(--runtime_sched.stopwait == 0) + runtime_notewakeup(&runtime_sched.stopnote); + } + runtime_unlock(&runtime_sched); } +} + +// The same as runtime_entersyscall(), but with a hint that the syscall is blocking. +void +runtime_entersyscallblock(void) +{ + P *p; + + if(m->profilehz > 0) + runtime_setprof(false); + + // Leave SP around for gc and traceback. +#ifdef USING_SPLIT_STACK + g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size, + &g->gcnext_segment, &g->gcnext_sp, + &g->gcinitial_sp); +#else + g->gcnext_sp = (byte *) &v; +#endif + + // Save the registers in the g structure so that any pointers + // held in registers will be seen by the garbage collector. + getcontext(&g->gcregs); - schedunlock(); + g->status = Gsyscall; + + p = releasep(); + handoffp(p); + if(g->isbackground) // do not consider blocked scavenger for deadlock detection + inclocked(1); } // The goroutine g exited its system call. @@ -1449,46 +1736,53 @@ void runtime_exitsyscall(void) { G *gp; - uint32 v; - - // Fast path. - // If we can do the mcpu++ bookkeeping and - // find that we still have mcpu <= mcpumax, then we can - // start executing Go code immediately, without having to - // schedlock/schedunlock. - // Also do fast return if any locks are held, so that - // panic code can use syscalls to open a file. + P *p; + + // Check whether the profiler needs to be turned on. + if(m->profilehz > 0) + runtime_setprof(true); + gp = g; - v = runtime_xadd(&runtime_sched.atomic, (1<<mcpuShift)); - if((m->profilehz == runtime_sched.profilehz && atomic_mcpu(v) <= atomic_mcpumax(v)) || m->locks > 0) { + // Try to re-acquire the last P. + if(m->p && m->p->status == Psyscall && runtime_cas(&m->p->status, Psyscall, Prunning)) { // There's a cpu for us, so we can run. + m->mcache = m->p->mcache; + m->p->m = m; + m->p->tick++; gp->status = Grunning; // Garbage collector isn't running (since we are), - // so okay to clear gcstack. + // so okay to clear gcstack and gcsp. #ifdef USING_SPLIT_STACK gp->gcstack = nil; #endif gp->gcnext_sp = nil; runtime_memclr(&gp->gcregs, sizeof gp->gcregs); - - if(m->profilehz > 0) - runtime_setprof(true); return; } - // Tell scheduler to put g back on the run queue: - // mostly equivalent to g->status = Grunning, - // but keeps the garbage collector from thinking - // that g is running right now, which it's not. - gp->readyonstop = 1; + if(gp->isbackground) // do not consider blocked scavenger for deadlock detection + inclocked(-1); + // Try to get any other idle P. 
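The fast path of runtime_exitsyscall above re-acquires the P it was using before the syscall with a single compare-and-swap on p->status, from Psyscall back to Prunning; if sysmon's retake (or a stop-the-world) got there first, the CAS fails and the slow path continues below. A stand-alone sketch of that status CAS, with toy types rather than the patch's P:

    #include <stdio.h>

    enum { Pidle, Prunning, Psyscall, Pgcstop };

    typedef struct ToyP { unsigned int status; } ToyP;

    static int
    reacquire(ToyP *p)
    {
        unsigned int want = Psyscall;

        /* Succeeds only if nobody changed the status in the meantime. */
        return __atomic_compare_exchange_n(&p->status, &want, Prunning, 0,
                                           __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
    }

    int
    main(void)
    {
        ToyP p = { Psyscall };

        printf("reacquired: %d\n", reacquire(&p));  /* 1: fast path */
        p.status = Pidle;                           /* retaken meanwhile */
        printf("reacquired: %d\n", reacquire(&p));  /* 0: take slow path */
        return 0;
    }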
+ m->p = nil; + if(runtime_sched.pidle) { + runtime_lock(&runtime_sched); + p = pidleget(); + runtime_unlock(&runtime_sched); + if(p) { + acquirep(p); +#ifdef USING_SPLIT_STACK + gp->gcstack = nil; +#endif + gp->gcnext_sp = nil; + runtime_memclr(&gp->gcregs, sizeof gp->gcregs); + return; + } + } - // All the cpus are taken. - // The scheduler will ready g and put this m to sleep. - // When the scheduler takes g away from m, - // it will undo the runtime_sched.mcpu++ above. - runtime_gosched(); + // Call the scheduler. + runtime_mcall(exitsyscall0); - // Gosched returned, so we're allowed to run now. + // Scheduler returned, so we're allowed to run now. // Delete the gcstack information that we left for // the garbage collector during the system call. // Must wait until now because until gosched returns @@ -1501,6 +1795,34 @@ runtime_exitsyscall(void) runtime_memclr(&gp->gcregs, sizeof gp->gcregs); } +// runtime_exitsyscall slow path on g0. +// Failed to acquire P, enqueue gp as runnable. +static void +exitsyscall0(G *gp) +{ + P *p; + + gp->status = Grunnable; + gp->m = nil; + m->curg = nil; + runtime_lock(&runtime_sched); + p = pidleget(); + if(p == nil) + globrunqput(gp); + runtime_unlock(&runtime_sched); + if(p) { + acquirep(p); + execute(gp); // Never returns. + } + if(m->lockedg) { + // Wait until another thread schedules gp and so m again. + stoplockedm(); + execute(gp); // Never returns. + } + stopm(); + schedule(); // Never returns. +} + // Allocate a new g, with a stack big enough for stacksize bytes. G* runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize) @@ -1554,15 +1876,10 @@ __go_go(void (*fn)(void*), void* arg) byte *sp; size_t spsize; G *newg; - int64 goid; - - goid = runtime_xadd64((uint64*)&runtime_sched.goidgen, 1); - if(raceenabled) - runtime_racegostart(goid, runtime_getcallerpc(&fn)); - schedlock(); + m->locks++; // disable preemption because it can be holding p in a local var - if((newg = gfget()) != nil) { + if((newg = gfget(m->p)) != nil) { #ifdef USING_SPLIT_STACK int dont_block_signals = 0; @@ -1579,24 +1896,20 @@ __go_go(void (*fn)(void*), void* arg) #endif } else { newg = runtime_malg(StackMin, &sp, &spsize); + runtime_lock(&runtime_sched); if(runtime_lastg == nil) runtime_allg = newg; else runtime_lastg->alllink = newg; runtime_lastg = newg; + runtime_unlock(&runtime_sched); } - newg->status = Gwaiting; - newg->waitreason = "new goroutine"; newg->entry = (byte*)fn; newg->param = arg; newg->gopc = (uintptr)__builtin_return_address(0); - - runtime_sched.gcount++; - newg->goid = goid; - - if(sp == nil) - runtime_throw("nil g->stack0"); + newg->status = Grunnable; + newg->goid = runtime_xadd64(&runtime_sched.goidgen, 1); { // Avoid warnings about variables clobbered by @@ -1613,33 +1926,87 @@ __go_go(void (*fn)(void*), void* arg) vnewg->context.uc_stack.ss_size = vspsize; makecontext(&vnewg->context, kickoff, 0); - newprocreadylocked(vnewg); - schedunlock(); + runqput(m->p, vnewg); + if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main) // TODO: fast atomic + wakep(); + m->locks--; return vnewg; } } -// Put on gfree list. Sched must be locked. +// Put on gfree list. +// If local list is too long, transfer a batch to the global list. 
static void -gfput(G *gp) -{ - gp->schedlink = runtime_sched.gfree; - runtime_sched.gfree = gp; +gfput(P *p, G *gp) +{ + gp->schedlink = p->gfree; + p->gfree = gp; + p->gfreecnt++; + if(p->gfreecnt >= 64) { + runtime_lock(&runtime_sched.gflock); + while(p->gfreecnt >= 32) { + p->gfreecnt--; + gp = p->gfree; + p->gfree = gp->schedlink; + gp->schedlink = runtime_sched.gfree; + runtime_sched.gfree = gp; + } + runtime_unlock(&runtime_sched.gflock); + } } -// Get from gfree list. Sched must be locked. +// Get from gfree list. +// If local list is empty, grab a batch from global list. static G* -gfget(void) +gfget(P *p) { G *gp; - gp = runtime_sched.gfree; - if(gp) - runtime_sched.gfree = gp->schedlink; +retry: + gp = p->gfree; + if(gp == nil && runtime_sched.gfree) { + runtime_lock(&runtime_sched.gflock); + while(p->gfreecnt < 32 && runtime_sched.gfree) { + p->gfreecnt++; + gp = runtime_sched.gfree; + runtime_sched.gfree = gp->schedlink; + gp->schedlink = p->gfree; + p->gfree = gp; + } + runtime_unlock(&runtime_sched.gflock); + goto retry; + } + if(gp) { + p->gfree = gp->schedlink; + p->gfreecnt--; + } return gp; } +// Purge all cached G's from gfree list to the global list. +static void +gfpurge(P *p) +{ + G *gp; + + runtime_lock(&runtime_sched.gflock); + while(p->gfreecnt) { + p->gfreecnt--; + gp = p->gfree; + p->gfree = gp->schedlink; + gp->schedlink = runtime_sched.gfree; + runtime_sched.gfree = gp; + } + runtime_unlock(&runtime_sched.gflock); +} + +void +runtime_Breakpoint(void) +{ + runtime_breakpoint(); +} + void runtime_Gosched (void) __asm__ (GOSYM_PREFIX "runtime.Gosched"); void @@ -1649,67 +2016,82 @@ runtime_Gosched(void) } // Implementation of runtime.GOMAXPROCS. -// delete when scheduler is stronger +// delete when scheduler is even stronger int32 runtime_gomaxprocsfunc(int32 n) { int32 ret; - uint32 v; - schedlock(); + if(n > MaxGomaxprocs) + n = MaxGomaxprocs; + runtime_lock(&runtime_sched); ret = runtime_gomaxprocs; - if(n <= 0) - n = ret; - if(n > maxgomaxprocs) - n = maxgomaxprocs; - runtime_gomaxprocs = n; - if(runtime_gomaxprocs > 1) - runtime_singleproc = false; - if(runtime_gcwaiting != 0) { - if(atomic_mcpumax(runtime_sched.atomic) != 1) - runtime_throw("invalid mcpumax during gc"); - schedunlock(); + if(n <= 0 || n == ret) { + runtime_unlock(&runtime_sched); return ret; } + runtime_unlock(&runtime_sched); - setmcpumax(n); + runtime_semacquire(&runtime_worldsema); + m->gcing = 1; + runtime_stoptheworld(); + newprocs = n; + m->gcing = 0; + runtime_semrelease(&runtime_worldsema); + runtime_starttheworld(); - // If there are now fewer allowed procs - // than procs running, stop. 
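gfput and gfget above keep a per-P cache of dead G's and exchange them with the global free list in batches (spill once the local list reaches 64, refill up to 32), so gflock is only taken for whole batches. A simplified, single-threaded sketch of that batching policy, with toy types and not the patch's code:

    #include <stdio.h>

    typedef struct ToyG { struct ToyG *schedlink; } ToyG;

    typedef struct {
        ToyG *gfree;
        int gfreecnt;
    } LocalCache;

    static ToyG *globalfree;

    static void
    cache_put(LocalCache *c, ToyG *gp)
    {
        gp->schedlink = c->gfree;
        c->gfree = gp;
        c->gfreecnt++;
        if(c->gfreecnt >= 64) {
            while(c->gfreecnt >= 32) {      /* spill a batch to the global list */
                c->gfreecnt--;
                gp = c->gfree;
                c->gfree = gp->schedlink;
                gp->schedlink = globalfree;
                globalfree = gp;
            }
        }
    }

    int
    main(void)
    {
        static ToyG pool[70];
        LocalCache c = { NULL, 0 };
        int i, n;

        for(i = 0; i < 70; i++)
            cache_put(&c, &pool[i]);
        for(n = 0; globalfree; globalfree = globalfree->schedlink)
            n++;
        printf("local %d, global %d\n", c.gfreecnt, n);
        return 0;
    }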
- v = runtime_atomicload(&runtime_sched.atomic); - if((int32)atomic_mcpu(v) > n) { - schedunlock(); - runtime_gosched(); - return ret; - } - // handle more procs - matchmg(); - schedunlock(); return ret; } -void -runtime_LockOSThread(void) +static void +LockOSThread(void) { - if(m == &runtime_m0 && runtime_sched.init) { - runtime_sched.lockmain = true; - return; - } m->lockedg = g; g->lockedm = m; } +void runtime_LockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.LockOSThread"); void -runtime_UnlockOSThread(void) +runtime_LockOSThread(void) { - if(m == &runtime_m0 && runtime_sched.init) { - runtime_sched.lockmain = false; + m->locked |= LockExternal; + LockOSThread(); +} + +void +runtime_lockOSThread(void) +{ + m->locked += LockInternal; + LockOSThread(); +} + +static void +UnlockOSThread(void) +{ + if(m->locked != 0) return; - } m->lockedg = nil; g->lockedm = nil; } +void runtime_UnlockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.UnlockOSThread"); + +void +runtime_UnlockOSThread(void) +{ + m->locked &= ~LockExternal; + UnlockOSThread(); +} + +void +runtime_unlockOSThread(void) +{ + if(m->locked < LockInternal) + runtime_throw("runtime: internal error: misuse of lockOSThread/unlockOSThread"); + m->locked -= LockInternal; + UnlockOSThread(); +} + bool runtime_lockedOSThread(void) { @@ -1740,13 +2122,28 @@ intgo runtime_NumGoroutine (void) intgo runtime_NumGoroutine() { - return runtime_sched.gcount; + return runtime_gcount(); } int32 runtime_gcount(void) { - return runtime_sched.gcount; + G *gp; + int32 n, s; + + n = 0; + runtime_lock(&runtime_sched); + // TODO(dvyukov): runtime.NumGoroutine() is O(N). + // We do not want to increment/decrement centralized counter in newproc/goexit, + // just to make runtime.NumGoroutine() faster. + // Compromise solution is to introduce per-P counters of active goroutines. + for(gp = runtime_allg; gp; gp = gp->alllink) { + s = gp->status; + if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting) + n++; + } + runtime_unlock(&runtime_sched); + return n; } int32 @@ -1769,6 +2166,9 @@ runtime_sigprof() { int32 n, i; + // Windows does profiling in a dedicated thread w/o m. + if(!Windows && (m == nil || m->mcache == nil)) + return; if(prof.fn == nil || prof.hz == 0) return; @@ -1813,3 +2213,555 @@ runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz) if(hz != 0) runtime_resetcpuprofiler(hz); } + +// Change number of processors. The world is stopped, sched is locked. +static void +procresize(int32 new) +{ + int32 i, old; + G *gp; + P *p; + + old = runtime_gomaxprocs; + if(old < 0 || old > MaxGomaxprocs || new <= 0 || new >MaxGomaxprocs) + runtime_throw("procresize: invalid arg"); + // initialize new P's + for(i = 0; i < new; i++) { + p = runtime_allp[i]; + if(p == nil) { + p = (P*)runtime_mallocgc(sizeof(*p), 0, 0, 1); + p->status = Pgcstop; + runtime_atomicstorep(&runtime_allp[i], p); + } + if(p->mcache == nil) { + if(old==0 && i==0) + p->mcache = m->mcache; // bootstrap + else + p->mcache = runtime_allocmcache(); + } + if(p->runq == nil) { + p->runqsize = 128; + p->runq = (G**)runtime_mallocgc(p->runqsize*sizeof(G*), 0, 0, 1); + } + } + + // redistribute runnable G's evenly + for(i = 0; i < old; i++) { + p = runtime_allp[i]; + while((gp = runqget(p)) != nil) + globrunqput(gp); + } + // start at 1 because current M already executes some G and will acquire allp[0] below, + // so if we have a spare G we want to put it into allp[1]. 
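The loop that follows deals the remaining globally queued G's out round-robin with i%new, starting at allp[1] for the reason given in the comment above. A toy illustration of how that distribution comes out (not patch code):

    #include <stdio.h>

    int
    main(void)
    {
        int new = 4;            /* new GOMAXPROCS */
        int gqueue = 10;        /* pretend 10 G's sit on the global queue */
        int count[4] = {0};
        int i;

        for(i = 1; gqueue > 0; i++, gqueue--)
            count[i % new]++;   /* start at 1, then wrap around all P's */
        for(i = 0; i < new; i++)
            printf("allp[%d] gets %d\n", i, count[i]);
        return 0;
    }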
+ for(i = 1; runtime_sched.runqhead; i++) { + gp = runtime_sched.runqhead; + runtime_sched.runqhead = gp->schedlink; + runqput(runtime_allp[i%new], gp); + } + runtime_sched.runqtail = nil; + runtime_sched.runqsize = 0; + + // free unused P's + for(i = new; i < old; i++) { + p = runtime_allp[i]; + runtime_freemcache(p->mcache); + p->mcache = nil; + gfpurge(p); + p->status = Pdead; + // can't free P itself because it can be referenced by an M in syscall + } + + if(m->p) + m->p->m = nil; + m->p = nil; + m->mcache = nil; + p = runtime_allp[0]; + p->m = nil; + p->status = Pidle; + acquirep(p); + for(i = new-1; i > 0; i--) { + p = runtime_allp[i]; + p->status = Pidle; + pidleput(p); + } + runtime_singleproc = new == 1; + runtime_atomicstore((uint32*)&runtime_gomaxprocs, new); +} + +// Associate p and the current m. +static void +acquirep(P *p) +{ + if(m->p || m->mcache) + runtime_throw("acquirep: already in go"); + if(p->m || p->status != Pidle) { + runtime_printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? p->m->id : 0, p->status); + runtime_throw("acquirep: invalid p state"); + } + m->mcache = p->mcache; + m->p = p; + p->m = m; + p->status = Prunning; +} + +// Disassociate p and the current m. +static P* +releasep(void) +{ + P *p; + + if(m->p == nil || m->mcache == nil) + runtime_throw("releasep: invalid arg"); + p = m->p; + if(p->m != m || p->mcache != m->mcache || p->status != Prunning) { + runtime_printf("releasep: m=%p m->p=%p p->m=%p m->mcache=%p p->mcache=%p p->status=%d\n", + m, m->p, p->m, m->mcache, p->mcache, p->status); + runtime_throw("releasep: invalid p state"); + } + m->p = nil; + m->mcache = nil; + p->m = nil; + p->status = Pidle; + return p; +} + +static void +inclocked(int32 v) +{ + runtime_lock(&runtime_sched); + runtime_sched.mlocked += v; + if(v > 0) + checkdead(); + runtime_unlock(&runtime_sched); +} + +// Check for deadlock situation. +// The check is based on number of running M's, if 0 -> deadlock. +static void +checkdead(void) +{ + G *gp; + int32 run, grunning, s; + + // -1 for sysmon + run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.mlocked - 1; + if(run > 0) + return; + if(run < 0) { + runtime_printf("checkdead: nmidle=%d mlocked=%d mcount=%d\n", + runtime_sched.nmidle, runtime_sched.mlocked, runtime_sched.mcount); + runtime_throw("checkdead: inconsistent counts"); + } + grunning = 0; + for(gp = runtime_allg; gp; gp = gp->alllink) { + if(gp->isbackground) + continue; + s = gp->status; + if(s == Gwaiting) + grunning++; + else if(s == Grunnable || s == Grunning || s == Gsyscall) { + runtime_printf("checkdead: find g %D in status %d\n", gp->goid, s); + runtime_throw("checkdead: runnable g"); + } + } + if(grunning == 0) // possible if main goroutine calls runtime_Goexit() + runtime_exit(0); + m->throwing = -1; // do not dump full stacks + runtime_throw("all goroutines are asleep - deadlock!"); +} + +static void +sysmon(void) +{ + uint32 idle, delay; + int64 now, lastpoll; + G *gp; + uint32 ticks[MaxGomaxprocs]; + + idle = 0; // how many cycles in succession we had not wokeup somebody + delay = 0; + for(;;) { + if(idle == 0) // start with 20us sleep... + delay = 20; + else if(idle > 50) // start doubling the sleep after 1ms... 
+ delay *= 2; + if(delay > 10*1000) // up to 10ms + delay = 10*1000; + runtime_usleep(delay); + if(runtime_gcwaiting || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) { // TODO: fast atomic + runtime_lock(&runtime_sched); + if(runtime_atomicload(&runtime_gcwaiting) || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) { + runtime_atomicstore(&runtime_sched.sysmonwait, 1); + runtime_unlock(&runtime_sched); + runtime_notesleep(&runtime_sched.sysmonnote); + runtime_noteclear(&runtime_sched.sysmonnote); + idle = 0; + delay = 20; + } else + runtime_unlock(&runtime_sched); + } + // poll network if not polled for more than 10ms + lastpoll = runtime_atomicload64(&runtime_sched.lastpoll); + now = runtime_nanotime(); + if(lastpoll != 0 && lastpoll + 10*1000*1000 > now) { + gp = runtime_netpoll(false); // non-blocking + injectglist(gp); + } + // retake P's blocked in syscalls + if(retake(ticks)) + idle = 0; + else + idle++; + } +} + +static uint32 +retake(uint32 *ticks) +{ + uint32 i, s, n; + int64 t; + P *p; + + n = 0; + for(i = 0; i < (uint32)runtime_gomaxprocs; i++) { + p = runtime_allp[i]; + if(p==nil) + continue; + t = p->tick; + if(ticks[i] != t) { + ticks[i] = t; + continue; + } + s = p->status; + if(s != Psyscall) + continue; + if(p->runqhead == p->runqtail && runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0) // TODO: fast atomic + continue; + // Need to increment number of locked M's before the CAS. + // Otherwise the M from which we retake can exit the syscall, + // increment nmidle and report deadlock. + inclocked(-1); + if(runtime_cas(&p->status, s, Pidle)) { + n++; + handoffp(p); + } + inclocked(1); + } + return n; +} + +// Put mp on midle list. +// Sched must be locked. +static void +mput(M *mp) +{ + mp->schedlink = runtime_sched.midle; + runtime_sched.midle = mp; + runtime_sched.nmidle++; + checkdead(); +} + +// Try to get an m from midle list. +// Sched must be locked. +static M* +mget(void) +{ + M *mp; + + if((mp = runtime_sched.midle) != nil){ + runtime_sched.midle = mp->schedlink; + runtime_sched.nmidle--; + } + return mp; +} + +// Put gp on the global runnable queue. +// Sched must be locked. +static void +globrunqput(G *gp) +{ + gp->schedlink = nil; + if(runtime_sched.runqtail) + runtime_sched.runqtail->schedlink = gp; + else + runtime_sched.runqhead = gp; + runtime_sched.runqtail = gp; + runtime_sched.runqsize++; +} + +// Try get a batch of G's from the global runnable queue. +// Sched must be locked. +static G* +globrunqget(P *p) +{ + G *gp, *gp1; + int32 n; + + if(runtime_sched.runqsize == 0) + return nil; + n = runtime_sched.runqsize/runtime_gomaxprocs+1; + if(n > runtime_sched.runqsize) + n = runtime_sched.runqsize; + runtime_sched.runqsize -= n; + if(runtime_sched.runqsize == 0) + runtime_sched.runqtail = nil; + gp = runtime_sched.runqhead; + runtime_sched.runqhead = gp->schedlink; + n--; + while(n--) { + gp1 = runtime_sched.runqhead; + runtime_sched.runqhead = gp1->schedlink; + runqput(p, gp1); + } + return gp; +} + +// Put p to on pidle list. +// Sched must be locked. +static void +pidleput(P *p) +{ + p->link = runtime_sched.pidle; + runtime_sched.pidle = p; + runtime_xadd(&runtime_sched.npidle, 1); // TODO: fast atomic +} + +// Try get a p from pidle list. +// Sched must be locked. 
+static P* +pidleget(void) +{ + P *p; + + p = runtime_sched.pidle; + if(p) { + runtime_sched.pidle = p->link; + runtime_xadd(&runtime_sched.npidle, -1); // TODO: fast atomic + } + return p; +} + +// Put g on local runnable queue. +// TODO(dvyukov): consider using lock-free queue. +static void +runqput(P *p, G *gp) +{ + int32 h, t, s; + + runtime_lock(p); +retry: + h = p->runqhead; + t = p->runqtail; + s = p->runqsize; + if(t == h-1 || (h == 0 && t == s-1)) { + runqgrow(p); + goto retry; + } + p->runq[t++] = gp; + if(t == s) + t = 0; + p->runqtail = t; + runtime_unlock(p); +} + +// Get g from local runnable queue. +static G* +runqget(P *p) +{ + G *gp; + int32 t, h, s; + + if(p->runqhead == p->runqtail) + return nil; + runtime_lock(p); + h = p->runqhead; + t = p->runqtail; + s = p->runqsize; + if(t == h) { + runtime_unlock(p); + return nil; + } + gp = p->runq[h++]; + if(h == s) + h = 0; + p->runqhead = h; + runtime_unlock(p); + return gp; +} + +// Grow local runnable queue. +// TODO(dvyukov): consider using fixed-size array +// and transfer excess to the global list (local queue can grow way too big). +static void +runqgrow(P *p) +{ + G **q; + int32 s, t, h, t2; + + h = p->runqhead; + t = p->runqtail; + s = p->runqsize; + t2 = 0; + q = runtime_malloc(2*s*sizeof(*q)); + while(t != h) { + q[t2++] = p->runq[h++]; + if(h == s) + h = 0; + } + runtime_free(p->runq); + p->runq = q; + p->runqhead = 0; + p->runqtail = t2; + p->runqsize = 2*s; +} + +// Steal half of elements from local runnable queue of p2 +// and put onto local runnable queue of p. +// Returns one of the stolen elements (or nil if failed). +static G* +runqsteal(P *p, P *p2) +{ + G *gp, *gp1; + int32 t, h, s, t2, h2, s2, c, i; + + if(p2->runqhead == p2->runqtail) + return nil; + // sort locks to prevent deadlocks + if(p < p2) + runtime_lock(p); + runtime_lock(p2); + if(p2->runqhead == p2->runqtail) { + runtime_unlock(p2); + if(p < p2) + runtime_unlock(p); + return nil; + } + if(p >= p2) + runtime_lock(p); + // now we've locked both queues and know the victim is not empty + h = p->runqhead; + t = p->runqtail; + s = p->runqsize; + h2 = p2->runqhead; + t2 = p2->runqtail; + s2 = p2->runqsize; + gp = p2->runq[h2++]; // return value + if(h2 == s2) + h2 = 0; + // steal roughly half + if(t2 > h2) + c = (t2 - h2) / 2; + else + c = (s2 - h2 + t2) / 2; + // copy + for(i = 0; i != c; i++) { + // the target queue is full? + if(t == h-1 || (h == 0 && t == s-1)) + break; + // the victim queue is empty? 
+ if(t2 == h2) + break; + gp1 = p2->runq[h2++]; + if(h2 == s2) + h2 = 0; + p->runq[t++] = gp1; + if(t == s) + t = 0; + } + p->runqtail = t; + p2->runqhead = h2; + runtime_unlock(p2); + runtime_unlock(p); + return gp; +} + +void runtime_testSchedLocalQueue(void) + __asm__("runtime.testSchedLocalQueue"); + +void +runtime_testSchedLocalQueue(void) +{ + P p; + G gs[1000]; + int32 i, j; + + runtime_memclr((byte*)&p, sizeof(p)); + p.runqsize = 1; + p.runqhead = 0; + p.runqtail = 0; + p.runq = runtime_malloc(p.runqsize*sizeof(*p.runq)); + + for(i = 0; i < (int32)nelem(gs); i++) { + if(runqget(&p) != nil) + runtime_throw("runq is not empty initially"); + for(j = 0; j < i; j++) + runqput(&p, &gs[i]); + for(j = 0; j < i; j++) { + if(runqget(&p) != &gs[i]) { + runtime_printf("bad element at iter %d/%d\n", i, j); + runtime_throw("bad element"); + } + } + if(runqget(&p) != nil) + runtime_throw("runq is not empty afterwards"); + } +} + +void runtime_testSchedLocalQueueSteal(void) + __asm__("runtime.testSchedLocalQueueSteal"); + +void +runtime_testSchedLocalQueueSteal(void) +{ + P p1, p2; + G gs[1000], *gp; + int32 i, j, s; + + runtime_memclr((byte*)&p1, sizeof(p1)); + p1.runqsize = 1; + p1.runqhead = 0; + p1.runqtail = 0; + p1.runq = runtime_malloc(p1.runqsize*sizeof(*p1.runq)); + + runtime_memclr((byte*)&p2, sizeof(p2)); + p2.runqsize = nelem(gs); + p2.runqhead = 0; + p2.runqtail = 0; + p2.runq = runtime_malloc(p2.runqsize*sizeof(*p2.runq)); + + for(i = 0; i < (int32)nelem(gs); i++) { + for(j = 0; j < i; j++) { + gs[j].sig = 0; + runqput(&p1, &gs[j]); + } + gp = runqsteal(&p2, &p1); + s = 0; + if(gp) { + s++; + gp->sig++; + } + while((gp = runqget(&p2)) != nil) { + s++; + gp->sig++; + } + while((gp = runqget(&p1)) != nil) + gp->sig++; + for(j = 0; j < i; j++) { + if(gs[j].sig != 1) { + runtime_printf("bad element %d(%d) at iter %d\n", j, gs[j].sig, i); + runtime_throw("bad element"); + } + } + if(s != i/2 && s != i/2+1) { + runtime_printf("bad steal %d, want %d or %d, iter %d\n", + s, i/2, i/2+1, i); + runtime_throw("bad steal"); + } + } +} + +void +runtime_proc_scan(void (*addroot)(Obj)) +{ + addroot((Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0}); +} diff --git a/libgo/runtime/race.h b/libgo/runtime/race.h index 9f3b3ec..3357bed 100644 --- a/libgo/runtime/race.h +++ b/libgo/runtime/race.h @@ -11,17 +11,19 @@ enum { raceenabled = 0 }; #endif // Initialize race detection subsystem. -void runtime_raceinit(void); +uintptr runtime_raceinit(void); // Finalize race detection subsystem, does not return. 
void runtime_racefini(void); void runtime_racemapshadow(void *addr, uintptr size); void runtime_racemalloc(void *p, uintptr sz, void *pc); void runtime_racefree(void *p); -void runtime_racegostart(int32 goid, void *pc); -void runtime_racegoend(int32 goid); +uintptr runtime_racegostart(void *pc); +void runtime_racegoend(void); void runtime_racewritepc(void *addr, void *callpc, void *pc); void runtime_racereadpc(void *addr, void *callpc, void *pc); +void runtime_racewriterangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc); +void runtime_racereadrangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc); void runtime_racefingo(void); void runtime_raceacquire(void *addr); void runtime_raceacquireg(G *gp, void *addr); diff --git a/libgo/runtime/runtime.c b/libgo/runtime/runtime.c index 48ece55..138e5af 100644 --- a/libgo/runtime/runtime.c +++ b/libgo/runtime/runtime.c @@ -10,14 +10,27 @@ #include "array.h" #include "go-panic.h" +// The GOTRACEBACK environment variable controls the +// behavior of a Go program that is crashing and exiting. +// GOTRACEBACK=0 suppress all tracebacks +// GOTRACEBACK=1 default behavior - show tracebacks but exclude runtime frames +// GOTRACEBACK=2 show tracebacks including runtime frames +// GOTRACEBACK=crash show tracebacks including runtime frames, then crash (core dump etc) int32 -runtime_gotraceback(void) +runtime_gotraceback(bool *crash) { const byte *p; + if(crash != nil) + *crash = false; p = runtime_getenv("GOTRACEBACK"); if(p == nil || p[0] == '\0') return 1; // default is on + if(runtime_strcmp((const char *)p, "crash") == 0) { + if(crash != nil) + *crash = true; + return 2; // extra information + } return runtime_atoi(p); } @@ -44,6 +57,11 @@ runtime_progname() return argc == 0 ? nil : argv[0]; } +// Information about what cpu features are available. +// Set on startup in asm_{x86/amd64}.s. 
+uint32 runtime_cpuid_ecx; +uint32 runtime_cpuid_edx; + void runtime_goargs(void) { @@ -90,6 +108,52 @@ runtime_atoi(const byte *p) return n; } +static struct root_list runtime_roots = +{ nil, + { { &syscall_Envs, sizeof syscall_Envs }, + { &os_Args, sizeof os_Args }, + { nil, 0 } }, +}; + +static void +TestAtomic64(void) +{ + uint64 z64, x64; + + z64 = 42; + x64 = 0; + PREFETCH(&z64); + if(runtime_cas64(&z64, &x64, 1)) + runtime_throw("cas64 failed"); + if(x64 != 42) + runtime_throw("cas64 failed"); + if(!runtime_cas64(&z64, &x64, 1)) + runtime_throw("cas64 failed"); + if(x64 != 42 || z64 != 1) + runtime_throw("cas64 failed"); + if(runtime_atomicload64(&z64) != 1) + runtime_throw("load64 failed"); + runtime_atomicstore64(&z64, (1ull<<40)+1); + if(runtime_atomicload64(&z64) != (1ull<<40)+1) + runtime_throw("store64 failed"); + if(runtime_xadd64(&z64, (1ull<<40)+1) != (2ull<<40)+2) + runtime_throw("xadd64 failed"); + if(runtime_atomicload64(&z64) != (2ull<<40)+2) + runtime_throw("xadd64 failed"); + if(runtime_xchg64(&z64, (3ull<<40)+3) != (2ull<<40)+2) + runtime_throw("xchg64 failed"); + if(runtime_atomicload64(&z64) != (3ull<<40)+3) + runtime_throw("xchg64 failed"); +} + +void +runtime_check(void) +{ + __go_register_gc_roots(&runtime_roots); + + TestAtomic64(); +} + uint32 runtime_fastrand1(void) { @@ -105,19 +169,6 @@ runtime_fastrand1(void) return x; } -static struct root_list runtime_roots = -{ nil, - { { &syscall_Envs, sizeof syscall_Envs }, - { &os_Args, sizeof os_Args }, - { nil, 0 } }, -}; - -void -runtime_check(void) -{ - __go_register_gc_roots(&runtime_roots); -} - int64 runtime_cputicks(void) { @@ -139,7 +190,7 @@ runtime_showframe(String s, bool current) if(current && runtime_m()->throwing > 0) return 1; if(traceback < 0) - traceback = runtime_gotraceback(); + traceback = runtime_gotraceback(nil); return traceback > 1 || (__builtin_memchr(s.str, '.', s.len) != nil && __builtin_memcmp(s.str, "runtime.", 7) != 0); } diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h index 959220d..5b2a64f 100644 --- a/libgo/runtime/runtime.h +++ b/libgo/runtime/runtime.h @@ -54,9 +54,11 @@ typedef uint8 bool; typedef uint8 byte; typedef struct Func Func; typedef struct G G; -typedef union Lock Lock; +typedef struct Lock Lock; typedef struct M M; -typedef union Note Note; +typedef struct P P; +typedef struct Note Note; +typedef struct String String; typedef struct FuncVal FuncVal; typedef struct SigTab SigTab; typedef struct MCache MCache; @@ -64,14 +66,14 @@ typedef struct FixAlloc FixAlloc; typedef struct Hchan Hchan; typedef struct Timers Timers; typedef struct Timer Timer; -typedef struct GCStats GCStats; -typedef struct LFNode LFNode; -typedef struct ParFor ParFor; -typedef struct ParForThread ParForThread; -typedef struct CgoMal CgoMal; +typedef struct GCStats GCStats; +typedef struct LFNode LFNode; +typedef struct ParFor ParFor; +typedef struct ParForThread ParForThread; +typedef struct CgoMal CgoMal; +typedef struct PollDesc PollDesc; typedef struct __go_open_array Slice; -typedef struct String String; typedef struct __go_interface Iface; typedef struct __go_empty_interface Eface; typedef struct __go_type_descriptor Type; @@ -81,6 +83,7 @@ typedef struct __go_panic_stack Panic; typedef struct __go_ptr_type PtrType; typedef struct __go_func_type FuncType; typedef struct __go_map_type MapType; +typedef struct __go_channel_type ChanType; typedef struct Traceback Traceback; @@ -110,11 +113,20 @@ enum Grunning, Gsyscall, Gwaiting, - Gmoribund, + Gmoribund_unused, // currently unused, 
but hardcoded in gdb scripts Gdead, }; enum { + // P status + Pidle, + Prunning, + Psyscall, + Pgcstop, + Pdead, +}; +enum +{ true = 1, false = 0, }; @@ -129,19 +141,22 @@ enum // Global <-> per-M stack segment cache transfer batch size. StackCacheBatch = 16, }; - /* * structures */ -union Lock +struct Lock { - uint32 key; // futex-based impl - M* waitm; // linked list of waiting M's (sema-based impl) + // Futex-based impl treats it as uint32 key, + // while sema-based impl as M* waitm. + // Used to be a union, but unions break precise GC. + uintptr key; }; -union Note +struct Note { - uint32 key; // futex-based impl - M* waitm; // waiting M (sema-based impl) + // Futex-based impl treats it as uint32 key, + // while sema-based impl as M* waitm. + // Used to be a union, but unions break precise GC. + uintptr key; }; struct String { @@ -194,13 +209,12 @@ struct G uint32 selgen; // valid sudog pointer const char* waitreason; // if status==Gwaiting G* schedlink; - bool readyonstop; bool ispanic; - bool issystem; - int8 raceignore; // ignore race detection events + bool issystem; // do not output in stack dump + bool isbackground; // ignore in deadlock detector + bool blockingsyscall; // hint that the next syscall will block M* m; // for debuggers, but offset not hard-coded M* lockedm; - M* idlem; int32 sig; int32 writenbuf; byte* writebuf; @@ -224,34 +238,44 @@ struct M { G* g0; // goroutine with scheduling stack G* gsignal; // signal-handling G + byte* gsignalstack; + size_t gsignalstacksize; + void (*mstartfn)(void); G* curg; // current running goroutine + P* p; // attached P for executing Go code (nil if not executing Go code) + P* nextp; int32 id; int32 mallocing; int32 throwing; int32 gcing; int32 locks; int32 nomemprof; - int32 waitnextg; int32 dying; int32 profilehz; int32 helpgc; + bool blockingsyscall; + bool spinning; uint32 fastrand; uint64 ncgocall; // number of cgo calls in total - Note havenextg; - G* nextg; + int32 ncgo; // number of cgo calls currently in progress + CgoMal* cgomal; + Note park; M* alllink; // on allm M* schedlink; MCache *mcache; G* lockedg; - G* idleg; Location createstack[32]; // Stack that created this thread. + uint32 locked; // tracking for LockOSThread M* nextwaitm; // next M waiting for lock uintptr waitsema; // semaphore for parking on locks uint32 waitsemacount; uint32 waitsemalock; GCStats gcstats; bool racecall; + bool needextram; void* racepc; + void (*waitunlockf)(Lock*); + void* waitlock; uintptr settype_buf[1024]; uintptr settype_bufsize; @@ -259,6 +283,38 @@ struct M uintptr end[]; }; +struct P +{ + Lock; + + uint32 status; // one of Pidle/Prunning/... + P* link; + uint32 tick; // incremented on every scheduler or system call + M* m; // back-link to associated M (nil if idle) + MCache* mcache; + + // Queue of runnable goroutines. + G** runq; + int32 runqhead; + int32 runqtail; + int32 runqsize; + + // Available G's (status == Gdead) + G* gfree; + int32 gfreecnt; + + byte pad[64]; +}; + +// The m->locked word holds a single bit saying whether +// external calls to LockOSThread are in effect, and then a counter +// of the internal nesting depth of lockOSThread / unlockOSThread. 
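// Worked example (illustration only, using the values defined just below):
// a single runtime_LockOSThread call sets the LockExternal bit, so
// m->locked == 1; two nested runtime_lockOSThread calls then add
// LockInternal twice, giving m->locked == 1 + 2*2 == 5.  Each
// runtime_unlockOSThread subtracts LockInternal again, and the static
// UnlockOSThread helper clears lockedg/lockedm only once m->locked has
// dropped back to 0.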
+enum +{ + LockExternal = 1, + LockInternal = 2, +}; + struct SigTab { int32 sig; @@ -271,6 +327,8 @@ enum SigThrow = 1<<2, // if signal.Notify doesn't take it, exit loudly SigPanic = 1<<3, // if the signal is from the kernel, panic SigDefault = 1<<4, // if the signal isn't explicitly requested, don't monitor it + SigHandling = 1<<5, // our signal handler is registered + SigIgnored = 1<<6, // the signal was ignored before we registered for it }; #ifndef NSIG @@ -343,6 +401,7 @@ struct ParFor bool wait; // if true, wait while all threads finish processing, // otherwise parfor may return while other threads are still working ParForThread *thr; // array of thread descriptors + uint32 pad; // to align ParForThread.pos for 64-bit atomic operations // stats uint64 nsteal; uint64 nstealcnt; @@ -356,7 +415,7 @@ struct ParFor struct CgoMal { CgoMal *next; - byte *alloc; + void *alloc; }; /* @@ -369,6 +428,19 @@ struct CgoMal #define USED(v) ((void) v) #define ROUND(x, n) (((x)+(n)-1)&~((n)-1)) /* all-caps to mark as macro: it evaluates n twice */ +byte* runtime_startup_random_data; +uint32 runtime_startup_random_data_len; +void runtime_get_random_data(byte**, int32*); + +enum { + // hashinit wants this many random bytes + HashRandomBytes = 32 +}; +void runtime_hashinit(void); + +void runtime_traceback(); +void runtime_tracebackothers(G*); + /* * external data */ @@ -376,21 +448,27 @@ extern uintptr runtime_zerobase; extern G* runtime_allg; extern G* runtime_lastg; extern M* runtime_allm; +extern P** runtime_allp; extern int32 runtime_gomaxprocs; extern bool runtime_singleproc; extern uint32 runtime_panicking; -extern int32 runtime_gcwaiting; // gc is waiting to run +extern uint32 runtime_gcwaiting; // gc is waiting to run +extern int8* runtime_goos; extern int32 runtime_ncpu; +extern void (*runtime_sysargs)(int32, uint8**); /* * common functions and data */ +#define runtime_strcmp(s1, s2) __builtin_strcmp((s1), (s2)) +#define runtime_strstr(s1, s2) __builtin_strstr((s1), (s2)) intgo runtime_findnull(const byte*); void runtime_dump(byte*, int32); /* * very low level c-called */ +struct __go_func_type; void runtime_args(int32, byte**); void runtime_osinit(); void runtime_goargs(void); @@ -400,42 +478,98 @@ void runtime_throw(const char*) __attribute__ ((noreturn)); void runtime_panicstring(const char*) __attribute__ ((noreturn)); void runtime_prints(const char*); void runtime_printf(const char*, ...); +#define runtime_mcmp(a, b, s) __builtin_memcmp((a), (b), (s)) +#define runtime_memmove(a, b, s) __builtin_memmove((a), (b), (s)) void* runtime_mal(uintptr); +String runtime_gostring(const byte*); +String runtime_gostringnocopy(const byte*); void runtime_schedinit(void); void runtime_initsig(void); void runtime_sigenable(uint32 sig); -int32 runtime_gotraceback(void); +void runtime_sigdisable(uint32 sig); +int32 runtime_gotraceback(bool *crash); void runtime_goroutineheader(G*); void runtime_goroutinetrailer(G*); -void runtime_traceback(); -void runtime_tracebackothers(G*); void runtime_printtrace(Location*, int32, bool); -String runtime_gostring(const byte*); -String runtime_gostringnocopy(const byte*); +#define runtime_open(p, f, m) open((p), (f), (m)) +#define runtime_read(d, v, n) read((d), (v), (n)) +#define runtime_write(d, v, n) write((d), (v), (n)) +#define runtime_close(d) close(d) +#define runtime_cas(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) +#define runtime_cas64(pval, pold, new) __atomic_compare_exchange_n (pval, pold, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) +#define 
runtime_casp(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) +// Don't confuse with XADD x86 instruction, +// this one is actually 'addx', that is, add-and-fetch. +#define runtime_xadd(p, v) __sync_add_and_fetch (p, v) +#define runtime_xadd64(p, v) __sync_add_and_fetch (p, v) +#define runtime_xchg(p, v) __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST) +#define runtime_xchg64(p, v) __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST) +#define runtime_atomicload(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) +#define runtime_atomicstore(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) +#define runtime_atomicstore64(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) +#define runtime_atomicload64(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) +#define runtime_atomicloadp(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) +#define runtime_atomicstorep(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) +void runtime_ready(G*); +const byte* runtime_getenv(const char*); +int32 runtime_atoi(const byte*); void* runtime_mstart(void*); G* runtime_malg(int32, byte**, size_t*); +void runtime_mpreinit(M*); void runtime_minit(void); +void runtime_unminit(void); +void runtime_signalstack(byte*, int32); +MCache* runtime_allocmcache(void); +void runtime_freemcache(MCache*); void runtime_mallocinit(void); +void runtime_mprofinit(void); +#define runtime_malloc(s) __go_alloc(s) +#define runtime_free(p) __go_free(p) +bool runtime_addfinalizer(void*, FuncVal *fn, const struct __go_func_type *); +#define runtime_getcallersp(p) __builtin_frame_address(1) +int32 runtime_mcount(void); +int32 runtime_gcount(void); +uint32 runtime_fastrand1(void); + +void runtime_setmg(M*, G*); +void runtime_newextram(void); +#define runtime_exit(s) exit(s) +#define runtime_breakpoint() __builtin_trap() void runtime_gosched(void); void runtime_park(void(*)(Lock*), Lock*, const char*); void runtime_tsleep(int64, const char*); M* runtime_newm(void); void runtime_goexit(void); void runtime_entersyscall(void) __asm__ (GOSYM_PREFIX "syscall.Entersyscall"); +void runtime_entersyscallblock(void); void runtime_exitsyscall(void) __asm__ (GOSYM_PREFIX "syscall.Exitsyscall"); +G* __go_go(void (*pfn)(void*), void*); void siginit(void); bool __go_sigsend(int32 sig); int32 runtime_callers(int32, Location*, int32); int64 runtime_nanotime(void); +void runtime_dopanic(int32) __attribute__ ((noreturn)); +void runtime_startpanic(void); +void runtime_sigprof(); +void runtime_resetcpuprofiler(int32); +void runtime_setcpuprofilerate(void(*)(uintptr*, int32), int32); +void runtime_usleep(uint32); int64 runtime_cputicks(void); int64 runtime_tickspersecond(void); void runtime_blockevent(int64, int32); extern int64 runtime_blockprofilerate; +void runtime_addtimer(Timer*); +bool runtime_deltimer(Timer*); +G* runtime_netpoll(bool); +void runtime_netpollinit(void); +int32 runtime_netpollopen(int32, PollDesc*); +int32 runtime_netpollclose(int32); +void runtime_netpollready(G**, PollDesc*, int32); +void runtime_crash(void); void runtime_stoptheworld(void); void runtime_starttheworld(void); extern uint32 runtime_worldsema; -G* __go_go(void (*pfn)(void*), void*); /* * mutual exclusion locks. 
in the uncontended case, @@ -533,6 +667,7 @@ void __wrap_rtems_task_variable_add(void **); * runtime go-called */ void runtime_printbool(_Bool); +void runtime_printbyte(int8); void runtime_printfloat(double); void runtime_printint(int64); void runtime_printiface(Iface); @@ -544,53 +679,10 @@ void runtime_printuint(uint64); void runtime_printhex(uint64); void runtime_printslice(Slice); void runtime_printcomplex(__complex double); - -struct __go_func_type; void reflect_call(const struct __go_func_type *, FuncVal *, _Bool, _Bool, void **, void **) __asm__ (GOSYM_PREFIX "reflect.call"); - -/* Functions. */ #define runtime_panic __go_panic -#define runtime_write(d, v, n) write((d), (v), (n)) -#define runtime_malloc(s) __go_alloc(s) -#define runtime_free(p) __go_free(p) -#define runtime_strcmp(s1, s2) __builtin_strcmp((s1), (s2)) -#define runtime_mcmp(a, b, s) __builtin_memcmp((a), (b), (s)) -#define runtime_memmove(a, b, s) __builtin_memmove((a), (b), (s)) -#define runtime_exit(s) exit(s) -MCache* runtime_allocmcache(void); -void free(void *v); -#define runtime_cas(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) -#define runtime_casp(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) -#define runtime_cas64(pval, pold, new) __atomic_compare_exchange_n (pval, pold, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) -#define runtime_xadd(p, v) __sync_add_and_fetch (p, v) -#define runtime_xadd64(p, v) __sync_add_and_fetch (p, v) -#define runtime_xchg(p, v) __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST) -#define runtime_atomicload(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) -#define runtime_atomicstore(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) -#define runtime_atomicloadp(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) -#define runtime_atomicstorep(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) -#define runtime_atomicload64(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) -#define runtime_atomicstore64(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) -#define PREFETCH(p) __builtin_prefetch(p) - -struct __go_func_type; -bool runtime_addfinalizer(void*, FuncVal *fn, const struct __go_func_type *); -#define runtime_getcallersp(p) __builtin_frame_address(1) -int32 runtime_mcount(void); -int32 runtime_gcount(void); -void runtime_dopanic(int32) __attribute__ ((noreturn)); -void runtime_startpanic(void); -void runtime_ready(G*); -const byte* runtime_getenv(const char*); -int32 runtime_atoi(const byte*); -uint32 runtime_fastrand1(void); - -void runtime_sigprof(); -void runtime_resetcpuprofiler(int32); -void runtime_setcpuprofilerate(void(*)(uintptr*, int32), int32); -void runtime_usleep(uint32); /* * runtime c-called (but written in Go) @@ -605,14 +697,13 @@ void runtime_newErrorString(String, Eface*) /* * wrapped for go users */ -#define ISNAN(f) __builtin_isnan(f) void runtime_semacquire(uint32 volatile *); void runtime_semrelease(uint32 volatile *); int32 runtime_gomaxprocsfunc(int32 n); void runtime_procyield(uint32); void runtime_osyield(void); -void runtime_LockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.LockOSThread"); -void runtime_UnlockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.UnlockOSThread"); +void runtime_lockOSThread(void); +void runtime_unlockOSThread(void); bool runtime_showframe(String, bool); @@ -628,12 +719,13 @@ uintptr runtime_memlimit(void); // This is a no-op on other systems. 
void runtime_setprof(bool); +#define ISNAN(f) __builtin_isnan(f) + enum { - UseSpanType = 1, + UseSpanType = 0, }; -void runtime_setsig(int32, bool, bool); #define runtime_setitimer setitimer void runtime_check(void); @@ -658,5 +750,8 @@ struct backtrace_state; extern struct backtrace_state *__go_get_backtrace_state(void); extern _Bool __go_file_line(uintptr, String*, String*, intgo *); extern byte* runtime_progname(); +extern void runtime_main(void*); int32 getproccount(void); + +#define PREFETCH(p) __builtin_prefetch(p) diff --git a/libgo/runtime/sema.goc b/libgo/runtime/sema.goc index 4622f6c..be971bd 100644 --- a/libgo/runtime/sema.goc +++ b/libgo/runtime/sema.goc @@ -44,12 +44,12 @@ struct SemaRoot // Prime to not correlate with any user patterns. #define SEMTABLESZ 251 -union semtable +struct semtable { SemaRoot; - uint8 pad[CacheLineSize]; + uint8 pad[CacheLineSize-sizeof(SemaRoot)]; }; -static union semtable semtable[SEMTABLESZ]; +static struct semtable semtable[SEMTABLESZ]; static SemaRoot* semroot(uint32 volatile *addr) diff --git a/libgo/runtime/signal_unix.c b/libgo/runtime/signal_unix.c index 3b8f439..5a506c8 100644 --- a/libgo/runtime/signal_unix.c +++ b/libgo/runtime/signal_unix.c @@ -8,6 +8,7 @@ #include "runtime.h" #include "defs.h" +#include "signal_unix.h" extern SigTab runtime_sigtab[]; @@ -22,7 +23,21 @@ runtime_initsig(void) t = &runtime_sigtab[i]; if((t->flags == 0) || (t->flags & SigDefault)) continue; - runtime_setsig(i, false, true); + + // For some signals, we respect an inherited SIG_IGN handler + // rather than insist on installing our own default handler. + // Even these signals can be fetched using the os/signal package. + switch(t->sig) { + case SIGHUP: + case SIGINT: + if(runtime_getsig(i) == GO_SIG_IGN) { + t->flags = SigNotify | SigIgnored; + continue; + } + } + + t->flags |= SigHandling; + runtime_setsig(i, runtime_sighandler, true); } } @@ -32,16 +47,49 @@ runtime_sigenable(uint32 sig) int32 i; SigTab *t; + t = nil; for(i = 0; runtime_sigtab[i].sig != -1; i++) { - // ~0 means all signals. - if(~sig == 0 || runtime_sigtab[i].sig == (int32)sig) { + if(runtime_sigtab[i].sig == (int32)sig) { t = &runtime_sigtab[i]; - if(t->flags & SigDefault) { - runtime_setsig(i, false, true); - t->flags &= ~SigDefault; // make this idempotent - } + break; } } + + if(t == nil) + return; + + if((t->flags & SigNotify) && !(t->flags & SigHandling)) { + t->flags |= SigHandling; + if(runtime_getsig(i) == GO_SIG_IGN) + t->flags |= SigIgnored; + runtime_setsig(i, runtime_sighandler, true); + } +} + +void +runtime_sigdisable(uint32 sig) +{ + int32 i; + SigTab *t; + + t = nil; + for(i = 0; runtime_sigtab[i].sig != -1; i++) { + if(runtime_sigtab[i].sig == (int32)sig) { + t = &runtime_sigtab[i]; + break; + } + } + + if(t == nil) + return; + + if((t->flags & SigNotify) && (t->flags & SigHandling)) { + t->flags &= ~SigHandling; + if(t->flags & SigIgnored) + runtime_setsig(i, GO_SIG_IGN, true); + else + runtime_setsig(i, GO_SIG_DFL, true); + } } void @@ -62,3 +110,44 @@ runtime_resetcpuprofiler(int32 hz) } runtime_m()->profilehz = hz; } + +void +os_sigpipe(void) +{ + int32 i; + + for(i = 0; runtime_sigtab[i].sig != -1; i++) + if(runtime_sigtab[i].sig == SIGPIPE) + break; + runtime_setsig(i, GO_SIG_DFL, false); + runtime_raise(SIGPIPE); +} + +void +runtime_crash(void) +{ + int32 i; + +#ifdef GOOS_darwin + // OS X core dumps are linear dumps of the mapped memory, + // from the first virtual byte to the last, with zeros in the gaps. 
+ // Because of the way we arrange the address space on 64-bit systems, + // this means the OS X core file will be >128 GB and even on a zippy + // workstation can take OS X well over an hour to write (uninterruptible). + // Save users from making that mistake. + if(sizeof(void*) == 8) + return; +#endif + + for(i = 0; runtime_sigtab[i].sig != -1; i++) + if(runtime_sigtab[i].sig == SIGABRT) + break; + runtime_setsig(i, GO_SIG_DFL, false); + runtime_raise(SIGABRT); +} + +void +runtime_raise(int32 sig) +{ + raise(sig); +} diff --git a/libgo/runtime/signal_unix.h b/libgo/runtime/signal_unix.h new file mode 100644 index 0000000..1c51740 --- /dev/null +++ b/libgo/runtime/signal_unix.h @@ -0,0 +1,22 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <signal.h> + +#define GO_SIG_DFL ((void*)SIG_DFL) +#define GO_SIG_IGN ((void*)SIG_IGN) + +#ifdef SA_SIGINFO +typedef siginfo_t Siginfo; +#else +typedef void *Siginfo; +#endif + +typedef void GoSighandler(int32, Siginfo*, void*, G*); +void runtime_setsig(int32, GoSighandler*, bool); +GoSighandler* runtime_getsig(int32); + +void runtime_sighandler(int32 sig, Siginfo *info, void *context, G *gp); +void runtime_raise(int32); + diff --git a/libgo/runtime/sigqueue.goc b/libgo/runtime/sigqueue.goc index 82b0400..8657216 100644 --- a/libgo/runtime/sigqueue.goc +++ b/libgo/runtime/sigqueue.goc @@ -107,7 +107,7 @@ func signal_recv() (m uint32) { new = HASWAITER; if(runtime_cas(&sig.state, old, new)) { if (new == HASWAITER) { - runtime_entersyscall(); + runtime_entersyscallblock(); runtime_notesleep(&sig); runtime_exitsyscall(); runtime_noteclear(&sig); @@ -135,8 +135,6 @@ done:; // Must only be called from a single goroutine at a time. func signal_enable(s uint32) { - int32 i; - if(!sig.inuse) { // The first call to signal_enable is for us // to use for initialization. It does not pass @@ -146,16 +144,16 @@ func signal_enable(s uint32) { return; } - if(~s == 0) { - // Special case: want everything. - for(i=0; (size_t)i<nelem(sig.wanted); i++) - sig.wanted[i] = ~(uint32)0; - runtime_sigenable(s); - return; - } - if(s >= nelem(sig.wanted)*32) return; sig.wanted[s/32] |= 1U<<(s&31); runtime_sigenable(s); } + +// Must only be called from a single goroutine at a time. +func signal_disable(s uint32) { + if(s >= nelem(sig.wanted)*32) + return; + sig.wanted[s/32] &= ~(1U<<(s&31)); + runtime_sigdisable(s); +} diff --git a/libgo/runtime/string.goc b/libgo/runtime/string.goc index 04ecbe6..64ed4f6e 100644 --- a/libgo/runtime/string.goc +++ b/libgo/runtime/string.goc @@ -7,6 +7,7 @@ package runtime #include "arch.h" #include "malloc.h" #include "go-string.h" +#include "race.h" #define charntorune(pv, str, len) __go_get_rune(str, len, pv) diff --git a/libgo/runtime/thread-linux.c b/libgo/runtime/thread-linux.c index 13d23c4..74139ea 100644 --- a/libgo/runtime/thread-linux.c +++ b/libgo/runtime/thread-linux.c @@ -15,6 +15,7 @@ // Futexsleep is allowed to wake up spuriously. #include <errno.h> +#include <signal.h> #include <string.h> #include <time.h> #include <sys/types.h> @@ -83,3 +84,48 @@ runtime_goenvs(void) { runtime_goenvs_unix(); } + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. 
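// In this port that means allocating the signal-handling G and its stack
// with runtime_malg; the stack is only installed via sigaltstack later,
// on the new thread itself, by runtime_minit/runtime_signalstack below.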
+void +runtime_mpreinit(M *mp) +{ + mp->gsignal = runtime_malg(32*1024, &mp->gsignalstack, &mp->gsignalstacksize); // OS X wants >=8K, Linux >=2K +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime_minit(void) +{ + M* m; + sigset_t sigs; + + // Initialize signal handling. + m = runtime_m(); + runtime_signalstack(m->gsignalstack, m->gsignalstacksize); + if (sigemptyset(&sigs) != 0) + runtime_throw("sigemptyset"); + sigprocmask(SIG_SETMASK, &sigs, nil); +} + +// Called from dropm to undo the effect of an minit. +void +runtime_unminit(void) +{ + runtime_signalstack(nil, 0); +} + +void +runtime_signalstack(byte *p, int32 n) +{ + stack_t st; + + st.ss_sp = p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + if(sigaltstack(&st, nil) < 0) + *(int *)0xf1 = 0xf1; +} diff --git a/libgo/runtime/thread.c b/libgo/runtime/thread.c index 12d0099..83ee006 100644 --- a/libgo/runtime/thread.c +++ b/libgo/runtime/thread.c @@ -133,27 +133,6 @@ __sync_add_and_fetch_8 (uint64* ptr, uint64 add) #endif -// Called to initialize a new m (including the bootstrap m). -void -runtime_minit(void) -{ - byte* stack; - size_t stacksize; - stack_t ss; - sigset_t sigs; - - // Initialize signal handling. - runtime_m()->gsignal = runtime_malg(32*1024, &stack, &stacksize); // OS X wants >=8K, Linux >=2K - ss.ss_sp = stack; - ss.ss_flags = 0; - ss.ss_size = stacksize; - if(sigaltstack(&ss, nil) < 0) - *(int *)0xf1 = 0xf1; - if (sigemptyset(&sigs) != 0) - runtime_throw("sigemptyset"); - sigprocmask(SIG_SETMASK, &sigs, nil); -} - uintptr runtime_memlimit(void) { diff --git a/libgo/runtime/time.goc b/libgo/runtime/time.goc index e9f087a..e06b75c 100644 --- a/libgo/runtime/time.goc +++ b/libgo/runtime/time.goc @@ -14,7 +14,6 @@ package time static Timers timers; static void addtimer(Timer*); -static bool deltimer(Timer*); // Package time APIs. // Godoc uses the comments in package time, not these. @@ -30,15 +29,13 @@ func Sleep(ns int64) { func startTimer(t *Timer) { if(raceenabled) runtime_racerelease(t); - runtime_lock(&timers); - addtimer(t); - runtime_unlock(&timers); + runtime_addtimer(t); } // stopTimer removes t from the timer heap if it is there. // It returns true if t was removed, false if t wasn't even there. func stopTimer(t *Timer) (stopped bool) { - stopped = deltimer(t); + stopped = runtime_deltimer(t); } // C runtime. @@ -80,6 +77,14 @@ runtime_tsleep(int64 ns, const char *reason) runtime_park(runtime_unlock, &timers, reason); } +void +runtime_addtimer(Timer *t) +{ + runtime_lock(&timers); + addtimer(t); + runtime_unlock(&timers); +} + // Add a timer to the heap and start or kick the timer proc // if the new timer is earlier than any of the others. static void @@ -122,8 +127,8 @@ addtimer(Timer *t) // Delete timer t from the heap. // Do not need to update the timerproc: // if it wakes up early, no big deal. -static bool -deltimer(Timer *t) +bool +runtime_deltimer(Timer *t) { int32 i; @@ -205,7 +210,7 @@ timerproc(void* dummy __attribute__ ((unused))) timers.sleeping = true; runtime_noteclear(&timers.waitnote); runtime_unlock(&timers); - runtime_entersyscall(); + runtime_entersyscallblock(); runtime_notetsleep(&timers.waitnote, delta); runtime_exitsyscall(); } |
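The runtime_entersyscallblock() calls introduced above in sigqueue.goc and in
timerproc here replace runtime_entersyscall() around waits that are known to
block, so the P can be handed off immediately instead of waiting for sysmon's
retake loop. A minimal sketch of the resulting pattern, using only entry
points declared in this patch (the helper name and its timeout convention are
illustrative assumptions, not part of the change):

static void
sleep_on_note(Note *n, int64 ns)
{
	runtime_entersyscallblock();	// known to block: give up the P right away
	if(ns < 0)
		runtime_notesleep(n);	// sleep until another thread calls notewakeup
	else
		runtime_notetsleep(n, ns);	// sleep with a timeout, as timerproc does
	runtime_exitsyscall();		// reacquire a P before running Go code again
	runtime_noteclear(n);		// reset the note for the next wait
}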