| author | Ian Lance Taylor <ian@gcc.gnu.org> | 2014-06-06 22:37:27 +0000 |
|---|---|---|
| committer | Ian Lance Taylor <ian@gcc.gnu.org> | 2014-06-06 22:37:27 +0000 |
| commit | 6736ef96eab222e58e6294f42be981a5afb59811 | |
| tree | 2bc668fae9bf96f9a3988e0b0a16685bde8c4f0b /libgo/runtime | |
| parent | 38a138411da4206c53f9a153ee9c3624fce58a52 | |
libgo: Merge to master revision 19184.
The next revision, 19185, renames several runtime files, and
will be handled in a separate change.
From-SVN: r211328
Diffstat (limited to 'libgo/runtime')
35 files changed, 2143 insertions, 1517 deletions
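Much of the chan.c diff below replaces the old runtime_chansend/runtime_chanrecv entry points, which reported non-blocking results through out-parameters (`*pres`, `*selected`), with static chansend/chanrecv helpers that take a `block` flag and return whether the operation completed. The following standalone sketch models only that calling-convention change; the types, names, and bodies are placeholders for illustration, not libgo code.

```c
#include <stdbool.h>
#include <stdio.h>

typedef struct Chan { int buffered; } Chan;   /* stand-in for Hchan */

/* New-style helper: takes a block flag and returns whether the send
 * completed, mirroring `static bool chansend(..., bool block, ...)`
 * in the patch below. */
static bool
chansend_model(Chan *c, int val, bool block)
{
	(void)val;
	if (c->buffered)        /* buffer space available: send succeeds */
		return true;
	if (!block)             /* non-blocking caller: report failure */
		return false;
	/* the real runtime would park the goroutine here until a receiver
	 * arrives; this model just pretends the send eventually completes */
	return true;
}

/* Non-blocking wrapper, analogous to runtime_selectnbsend in the patch:
 * the former out-parameter result becomes the return value. */
static bool
selectnbsend_model(Chan *c, int val)
{
	return chansend_model(c, val, false);
}

int
main(void)
{
	Chan full = { 0 };      /* no buffer space */
	Chan open = { 1 };      /* buffer space available */

	printf("non-blocking send, no buffer:   %d\n", selectnbsend_model(&full, 42));
	printf("non-blocking send, with buffer: %d\n", selectnbsend_model(&open, 42));
	return 0;
}
```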
diff --git a/libgo/runtime/chan.c b/libgo/runtime/chan.c index 6bd12e4..cd3a2c5 100644 --- a/libgo/runtime/chan.c +++ b/libgo/runtime/chan.c @@ -8,8 +8,6 @@ #include "race.h" #include "malloc.h" -#define NOSELGEN 1 - typedef struct WaitQ WaitQ; typedef struct SudoG SudoG; typedef struct Select Select; @@ -20,8 +18,8 @@ typedef struct __go_channel_type ChanType; struct SudoG { - G* g; // g and selgen constitute - uint32 selgen; // a weak pointer to g + G* g; + uint32* selectdone; SudoG* link; int64 releasetime; byte* elem; // data element @@ -43,6 +41,7 @@ struct Hchan uint8 elemalign; uint8 pad; // ensures proper alignment of the buffer that follows Hchan in memory bool closed; + const Type* elemtype; // element type uintgo sendx; // send index uintgo recvx; // receive index WaitQ recvq; // list of recv waiters @@ -89,8 +88,8 @@ static SudoG* dequeue(WaitQ*); static void enqueue(WaitQ*, SudoG*); static void racesync(Hchan*, SudoG*); -Hchan* -runtime_makechan_c(ChanType *t, int64 hint) +static Hchan* +makechan(ChanType *t, int64 hint) { Hchan *c; uintptr n; @@ -102,16 +101,16 @@ runtime_makechan_c(ChanType *t, int64 hint) if(elem->__size >= (1<<16)) runtime_throw("makechan: invalid channel element type"); - if(hint < 0 || (intgo)hint != hint || (elem->__size > 0 && (uintptr)hint > MaxMem / elem->__size)) + if(hint < 0 || (intgo)hint != hint || (elem->__size > 0 && (uintptr)hint > (MaxMem - sizeof(*c)) / elem->__size)) runtime_panicstring("makechan: size out of range"); n = sizeof(*c); n = ROUND(n, elem->__align); // allocate memory in one call - c = (Hchan*)runtime_mallocgc(n + hint*elem->__size, (uintptr)t | TypeInfo_Chan, 0); + c = (Hchan*)runtime_mallocgc(sizeof(*c) + hint*elem->__size, (uintptr)t | TypeInfo_Chan, 0); c->elemsize = elem->__size; - c->elemalign = elem->__align; + c->elemtype = elem; c->dataqsiz = hint; if(debug) @@ -131,7 +130,7 @@ reflect_makechan(ChanType *t, uint64 size) { Hchan *c; - c = runtime_makechan_c(t, size); + c = makechan(t, size); return c; } @@ -139,13 +138,13 @@ reflect_makechan(ChanType *t, uint64 size) Hchan* __go_new_channel(ChanType *t, uintptr hint) { - return runtime_makechan_c(t, hint); + return makechan(t, hint); } Hchan* __go_new_channel_big(ChanType *t, uint64 hint) { - return runtime_makechan_c(t, hint); + return makechan(t, hint); } /* @@ -162,8 +161,8 @@ __go_new_channel_big(ChanType *t, uint64 hint) * been closed. it is easiest to loop and re-run * the operation; we'll see that it's now closed. 
*/ -void -runtime_chansend(ChanType *t, Hchan *c, byte *ep, bool *pres, void *pc) +static bool +chansend(ChanType *t, Hchan *c, byte *ep, bool block, void *pc) { SudoG *sg; SudoG mysg; @@ -173,14 +172,15 @@ runtime_chansend(ChanType *t, Hchan *c, byte *ep, bool *pres, void *pc) g = runtime_g(); + if(raceenabled) + runtime_racereadobjectpc(ep, t->__element_type, runtime_getcallerpc(&t), chansend); + if(c == nil) { USED(t); - if(pres != nil) { - *pres = false; - return; - } + if(!block) + return false; runtime_park(nil, nil, "chan send (nil chan)"); - return; // not reached + return false; // not reached } if(runtime_gcwaiting()) @@ -199,7 +199,7 @@ runtime_chansend(ChanType *t, Hchan *c, byte *ep, bool *pres, void *pc) runtime_lock(c); if(raceenabled) - runtime_racereadpc(c, pc, runtime_chansend); + runtime_racereadpc(c, pc, chansend); if(c->closed) goto closed; @@ -219,24 +219,20 @@ runtime_chansend(ChanType *t, Hchan *c, byte *ep, bool *pres, void *pc) if(sg->releasetime) sg->releasetime = runtime_cputicks(); runtime_ready(gp); - - if(pres != nil) - *pres = true; - return; + return true; } - if(pres != nil) { + if(!block) { runtime_unlock(c); - *pres = false; - return; + return false; } mysg.elem = ep; mysg.g = g; - mysg.selgen = NOSELGEN; + mysg.selectdone = nil; g->param = nil; enqueue(&c->sendq, &mysg); - runtime_park(runtime_unlock, c, "chan send"); + runtime_parkunlock(c, "chan send"); if(g->param == nil) { runtime_lock(c); @@ -248,23 +244,22 @@ runtime_chansend(ChanType *t, Hchan *c, byte *ep, bool *pres, void *pc) if(mysg.releasetime > 0) runtime_blockevent(mysg.releasetime - t0, 2); - return; + return true; asynch: if(c->closed) goto closed; if(c->qcount >= c->dataqsiz) { - if(pres != nil) { + if(!block) { runtime_unlock(c); - *pres = false; - return; + return false; } mysg.g = g; mysg.elem = nil; - mysg.selgen = NOSELGEN; + mysg.selectdone = nil; enqueue(&c->sendq, &mysg); - runtime_park(runtime_unlock, c, "chan send"); + runtime_parkunlock(c, "chan send"); runtime_lock(c); goto asynch; @@ -287,20 +282,19 @@ asynch: runtime_ready(gp); } else runtime_unlock(c); - if(pres != nil) - *pres = true; if(mysg.releasetime > 0) runtime_blockevent(mysg.releasetime - t0, 2); - return; + return true; closed: runtime_unlock(c); runtime_panicstring("send on closed channel"); + return false; // not reached } -void -runtime_chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *received) +static bool +chanrecv(ChanType *t, Hchan* c, byte *ep, bool block, bool *received) { SudoG *sg; SudoG mysg; @@ -311,6 +305,8 @@ runtime_chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *received if(runtime_gcwaiting()) runtime_gosched(); + // raceenabled: don't need to check ep, as it is always on the stack. 
+ if(debug) runtime_printf("chanrecv: chan=%p\n", c); @@ -318,12 +314,10 @@ runtime_chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *received if(c == nil) { USED(t); - if(selected != nil) { - *selected = false; - return; - } + if(!block) + return false; runtime_park(nil, nil, "chan receive (nil chan)"); - return; // not reached + return false; // not reached } t0 = 0; @@ -354,25 +348,22 @@ runtime_chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *received sg->releasetime = runtime_cputicks(); runtime_ready(gp); - if(selected != nil) - *selected = true; if(received != nil) *received = true; - return; + return true; } - if(selected != nil) { + if(!block) { runtime_unlock(c); - *selected = false; - return; + return false; } mysg.elem = ep; mysg.g = g; - mysg.selgen = NOSELGEN; + mysg.selectdone = nil; g->param = nil; enqueue(&c->recvq, &mysg); - runtime_park(runtime_unlock, c, "chan receive"); + runtime_parkunlock(c, "chan receive"); if(g->param == nil) { runtime_lock(c); @@ -385,25 +376,24 @@ runtime_chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *received *received = true; if(mysg.releasetime > 0) runtime_blockevent(mysg.releasetime - t0, 2); - return; + return true; asynch: if(c->qcount <= 0) { if(c->closed) goto closed; - if(selected != nil) { + if(!block) { runtime_unlock(c); - *selected = false; if(received != nil) *received = false; - return; + return false; } mysg.g = g; mysg.elem = nil; - mysg.selgen = NOSELGEN; + mysg.selectdone = nil; enqueue(&c->recvq, &mysg); - runtime_park(runtime_unlock, c, "chan receive"); + runtime_parkunlock(c, "chan receive"); runtime_lock(c); goto asynch; @@ -429,19 +419,15 @@ asynch: } else runtime_unlock(c); - if(selected != nil) - *selected = true; if(received != nil) *received = true; if(mysg.releasetime > 0) runtime_blockevent(mysg.releasetime - t0, 2); - return; + return true; closed: if(ep != nil) runtime_memclr(ep, c->elemsize); - if(selected != nil) - *selected = true; if(received != nil) *received = false; if(raceenabled) @@ -449,6 +435,7 @@ closed: runtime_unlock(c); if(mysg.releasetime > 0) runtime_blockevent(mysg.releasetime - t0, 2); + return true; } // The compiler generates a call to __go_send_small to send a value 8 @@ -461,46 +448,46 @@ __go_send_small(ChanType *t, Hchan* c, uint64 val) byte b[sizeof(uint64)]; uint64 v; } u; - byte *p; + byte *v; u.v = val; #ifndef WORDS_BIGENDIAN - p = u.b; + v = u.b; #else - p = u.b + sizeof(uint64) - t->__element_type->__size; + v = u.b + sizeof(uint64) - t->__element_type->__size; #endif - runtime_chansend(t, c, p, nil, runtime_getcallerpc(&t)); + chansend(t, c, v, true, runtime_getcallerpc(&t)); } // The compiler generates a call to __go_send_big to send a value // larger than 8 bytes or smaller. void -__go_send_big(ChanType *t, Hchan* c, byte* p) +__go_send_big(ChanType *t, Hchan* c, byte* v) { - runtime_chansend(t, c, p, nil, runtime_getcallerpc(&t)); + chansend(t, c, v, true, runtime_getcallerpc(&t)); } // The compiler generates a call to __go_receive to receive a // value from a channel. 
void -__go_receive(ChanType *t, Hchan* c, byte* p) +__go_receive(ChanType *t, Hchan* c, byte* v) { - runtime_chanrecv(t, c, p, nil, nil); + chanrecv(t, c, v, true, nil); } -_Bool runtime_chanrecv2(ChanType *t, Hchan* c, byte* p) +_Bool runtime_chanrecv2(ChanType *t, Hchan* c, byte* v) __asm__ (GOSYM_PREFIX "runtime.chanrecv2"); _Bool -runtime_chanrecv2(ChanType *t, Hchan* c, byte* p) +runtime_chanrecv2(ChanType *t, Hchan* c, byte* v) { bool received; - runtime_chanrecv(t, c, p, nil, &received); + chanrecv(t, c, v, true, &received); return received; } -// func selectnbsend(c chan any, elem any) bool +// func selectnbsend(c chan any, elem *any) bool // // compiler implements // @@ -520,12 +507,12 @@ runtime_chanrecv2(ChanType *t, Hchan* c, byte* p) // } // _Bool -runtime_selectnbsend(ChanType *t, Hchan *c, byte *p) +runtime_selectnbsend(ChanType *t, Hchan *c, byte *val) { bool res; - runtime_chansend(t, c, p, &res, runtime_getcallerpc(&t)); - return res; + res = chansend(t, c, val, false, runtime_getcallerpc(&t)); + return (_Bool)res; } // func selectnbrecv(elem *any, c chan any) bool @@ -552,8 +539,8 @@ runtime_selectnbrecv(ChanType *t, byte *v, Hchan *c) { bool selected; - runtime_chanrecv(t, c, v, &selected, nil); - return selected; + selected = chanrecv(t, c, v, false, nil); + return (_Bool)selected; } // func selectnbrecv2(elem *any, ok *bool, c chan any) bool @@ -582,88 +569,60 @@ runtime_selectnbrecv2(ChanType *t, byte *v, _Bool *received, Hchan *c) bool r; r = false; - runtime_chanrecv(t, c, v, &selected, received == nil ? nil : &r); + selected = chanrecv(t, c, v, false, received == nil ? nil : &r); if(received != nil) *received = r; return selected; } // For reflect: -// func chansend(c chan, val iword, nb bool) (selected bool) -// where an iword is the same word an interface value would use: -// the actual data if it fits, or else a pointer to the data. +// func chansend(c chan, val *any, nb bool) (selected bool) +// where val points to the data to be sent. +// +// The "uintptr selected" is really "bool selected" but saying +// uintptr gets us the right alignment for the output parameter block. -_Bool reflect_chansend(ChanType *, Hchan *, uintptr, _Bool) +_Bool reflect_chansend(ChanType *, Hchan *, byte *, _Bool) __asm__ (GOSYM_PREFIX "reflect.chansend"); _Bool -reflect_chansend(ChanType *t, Hchan *c, uintptr val, _Bool nb) +reflect_chansend(ChanType *t, Hchan *c, byte *val, _Bool nb) { bool selected; - bool *sp; - byte *vp; - if(nb) { - selected = false; - sp = (bool*)&selected; - } else { - selected = true; - sp = nil; - } - if(__go_is_pointer_type(t->__element_type)) - vp = (byte*)&val; - else - vp = (byte*)val; - runtime_chansend(t, c, vp, sp, runtime_getcallerpc(&t)); - return selected; + selected = chansend(t, c, val, !nb, runtime_getcallerpc(&t)); + return (_Bool)selected; } // For reflect: -// func chanrecv(c chan, nb bool) (val iword, selected, received bool) -// where an iword is the same word an interface value would use: -// the actual data if it fits, or else a pointer to the data. +// func chanrecv(c chan, nb bool, val *any) (selected, received bool) +// where val points to a data area that will be filled in with the +// received value. val must have the size and type of the channel element type. 
struct chanrecv_ret { - uintptr val; _Bool selected; _Bool received; }; -struct chanrecv_ret reflect_chanrecv(ChanType *, Hchan *, _Bool) +struct chanrecv_ret reflect_chanrecv(ChanType *, Hchan *, _Bool, byte *val) __asm__ (GOSYM_PREFIX "reflect.chanrecv"); struct chanrecv_ret -reflect_chanrecv(ChanType *t, Hchan *c, _Bool nb) +reflect_chanrecv(ChanType *t, Hchan *c, _Bool nb, byte *val) { struct chanrecv_ret ret; - byte *vp; - bool *sp; bool selected; bool received; - if(nb) { - selected = false; - sp = &selected; - } else { - ret.selected = true; - sp = nil; - } received = false; - if(__go_is_pointer_type(t->__element_type)) { - vp = (byte*)&ret.val; - } else { - vp = runtime_mal(t->__element_type->__size); - ret.val = (uintptr)vp; - } - runtime_chanrecv(t, c, vp, sp, &received); - if(nb) - ret.selected = selected; - ret.received = received; + selected = chanrecv(t, c, val, !nb, &received); + ret.selected = (_Bool)selected; + ret.received = (_Bool)received; return ret; } -static void newselect(int32, Select**); +static Select* newselect(int32); // newselect(size uint32) (sel *byte); @@ -672,14 +631,11 @@ void* runtime_newselect(int32) __asm__ (GOSYM_PREFIX "runtime.newselect"); void* runtime_newselect(int32 size) { - Select *sel; - - newselect(size, &sel); - return (void*)sel; + return (void*)newselect(size); } -static void -newselect(int32 size, Select **selp) +static Select* +newselect(int32 size) { int32 n; Select *sel; @@ -701,10 +657,10 @@ newselect(int32 size, Select **selp) sel->ncase = 0; sel->lockorder = (void*)(sel->scase + size); sel->pollorder = (void*)(sel->lockorder + size); - *selp = sel; if(debug) runtime_printf("newselect s=%p size=%d\n", sel, size); + return sel; } // cut in half to give stack a chance to split @@ -880,6 +836,14 @@ selunlock(Select *sel) } } +static bool +selparkcommit(G *gp, void *sel) +{ + USED(gp); + selunlock(sel); + return true; +} + void runtime_block(void) { @@ -902,7 +866,7 @@ static int selectgo(Select **selp) { Select *sel; - uint32 o, i, j, k; + uint32 o, i, j, k, done; int64 t0; Scase *cas, *dfl; Hchan *c; @@ -1008,7 +972,7 @@ loop: case CaseSend: if(raceenabled) - runtime_racereadpc(c, runtime_selectgo, runtime_chansend); + runtime_racereadpc(c, runtime_selectgo, chansend); if(c->closed) goto sclose; if(c->dataqsiz > 0) { @@ -1035,13 +999,14 @@ loop: // pass 2 - enqueue on all chans + done = 0; for(i=0; i<sel->ncase; i++) { o = sel->pollorder[i]; cas = &sel->scase[o]; c = cas->chan; sg = &cas->sg; sg->g = g; - sg->selgen = g->selgen; + sg->selectdone = &done; switch(cas->kind) { case CaseRecv: @@ -1055,7 +1020,7 @@ loop: } g->param = nil; - runtime_park((void(*)(Lock*))selunlock, (Lock*)sel, "select"); + runtime_park(selparkcommit, sel, "select"); sellock(sel); sg = g->param; @@ -1091,13 +1056,23 @@ loop: *cas->receivedp = true; } + if(raceenabled) { + if(cas->kind == CaseRecv && cas->sg.elem != nil) + runtime_racewriteobjectpc(cas->sg.elem, c->elemtype, selectgo, chanrecv); + else if(cas->kind == CaseSend) + runtime_racereadobjectpc(cas->sg.elem, c->elemtype, selectgo, chansend); + } + selunlock(sel); goto retc; asyncrecv: // can receive from buffer - if(raceenabled) + if(raceenabled) { + if(cas->sg.elem != nil) + runtime_racewriteobjectpc(cas->sg.elem, c->elemtype, selectgo, chanrecv); runtime_raceacquire(chanbuf(c, c->recvx)); + } if(cas->receivedp != nil) *cas->receivedp = true; if(cas->sg.elem != nil) @@ -1120,8 +1095,10 @@ asyncrecv: asyncsend: // can send to buffer - if(raceenabled) + if(raceenabled) { runtime_racerelease(chanbuf(c, 
c->sendx)); + runtime_racereadobjectpc(cas->sg.elem, c->elemtype, selectgo, chansend); + } runtime_memmove(chanbuf(c, c->sendx), cas->sg.elem, c->elemsize); if(++c->sendx == c->dataqsiz) c->sendx = 0; @@ -1140,8 +1117,11 @@ asyncsend: syncrecv: // can receive from sleeping sender (sg) - if(raceenabled) + if(raceenabled) { + if(cas->sg.elem != nil) + runtime_racewriteobjectpc(cas->sg.elem, c->elemtype, selectgo, chanrecv); racesync(c, sg); + } selunlock(sel); if(debug) runtime_printf("syncrecv: sel=%p c=%p o=%d\n", sel, c, o); @@ -1169,8 +1149,10 @@ rclose: syncsend: // can send to sleeping receiver (sg) - if(raceenabled) + if(raceenabled) { + runtime_racereadobjectpc(cas->sg.elem, c->elemtype, selectgo, chansend); racesync(c, sg); + } selunlock(sel); if(debug) runtime_printf("syncsend: sel=%p c=%p o=%d\n", sel, c, o); @@ -1204,7 +1186,7 @@ struct runtimeSelect uintptr dir; ChanType *typ; Hchan *ch; - uintptr val; + byte *val; }; // This enum must match ../reflect/value.go:/SelectDir. @@ -1214,14 +1196,13 @@ enum SelectDir { SelectDefault, }; +// func rselect(cases []runtimeSelect) (chosen int, recvOK bool) + struct rselect_ret { intgo chosen; - uintptr word; - bool recvOK; + _Bool recvOK; }; -// func rselect(cases []runtimeSelect) (chosen int, word uintptr, recvOK bool) - struct rselect_ret reflect_rselect(Slice) __asm__ (GOSYM_PREFIX "reflect.rselect"); @@ -1229,36 +1210,18 @@ struct rselect_ret reflect_rselect(Slice cases) { struct rselect_ret ret; + intgo chosen; + bool recvOK; int32 i; Select *sel; runtimeSelect* rcase, *rc; - void *elem; - void *recvptr; - uintptr maxsize; - bool onlyptr; - ret.chosen = -1; - ret.word = 0; - ret.recvOK = false; + chosen = -1; + recvOK = false; - maxsize = 0; - onlyptr = true; rcase = (runtimeSelect*)cases.__values; - for(i=0; i<cases.__count; i++) { - rc = &rcase[i]; - if(rc->dir == SelectRecv && rc->ch != nil) { - if(maxsize < rc->typ->__element_type->__size) - maxsize = rc->typ->__element_type->__size; - if(!__go_is_pointer_type(rc->typ->__element_type)) - onlyptr = false; - } - } - - recvptr = nil; - if(!onlyptr) - recvptr = runtime_mal(maxsize); - newselect(cases.__count, &sel); + sel = newselect(cases.__count); for(i=0; i<cases.__count; i++) { rc = &rcase[i]; switch(rc->dir) { @@ -1268,28 +1231,20 @@ reflect_rselect(Slice cases) case SelectSend: if(rc->ch == nil) break; - if(!__go_is_pointer_type(rc->typ->__element_type)) - elem = (void*)rc->val; - else - elem = (void*)&rc->val; - selectsend(sel, rc->ch, i, elem); + selectsend(sel, rc->ch, i, rc->val); break; case SelectRecv: if(rc->ch == nil) break; - if(!__go_is_pointer_type(rc->typ->__element_type)) - elem = recvptr; - else - elem = &ret.word; - selectrecv(sel, rc->ch, i, elem, &ret.recvOK); + selectrecv(sel, rc->ch, i, rc->val, &recvOK); break; } } - ret.chosen = (intgo)(uintptr)selectgo(&sel); - if(rcase[ret.chosen].dir == SelectRecv && !__go_is_pointer_type(rcase[ret.chosen].typ->__element_type)) - ret.word = (uintptr)recvptr; + chosen = (intgo)(uintptr)selectgo(&sel); + ret.chosen = chosen; + ret.recvOK = (_Bool)recvOK; return ret; } @@ -1428,12 +1383,11 @@ loop: return nil; q->first = sgp->link; - // if sgp is stale, ignore it - if(sgp->selgen != NOSELGEN && - (sgp->selgen != sgp->g->selgen || - !runtime_cas(&sgp->g->selgen, sgp->selgen, sgp->selgen + 2))) { - //prints("INVALID PSEUDOG POINTER\n"); - goto loop; + // if sgp participates in a select and is already signaled, ignore it + if(sgp->selectdone != nil) { + // claim the right to signal + if(*sgp->selectdone != 0 || 
!runtime_cas(sgp->selectdone, 0, 1)) + goto loop; } return sgp; diff --git a/libgo/runtime/env_posix.c b/libgo/runtime/env_posix.c index 3219550..93f90f5 100644 --- a/libgo/runtime/env_posix.c +++ b/libgo/runtime/env_posix.c @@ -2,10 +2,12 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build darwin dragonfly freebsd linux netbsd openbsd windows +// +build darwin dragonfly freebsd linux netbsd openbsd solaris windows #include "runtime.h" #include "array.h" +#include "arch.h" +#include "malloc.h" extern Slice syscall_Envs __asm__ (GOSYM_PREFIX "syscall.Envs"); diff --git a/libgo/runtime/go-append.c b/libgo/runtime/go-append.c index 8d5dee2..1b2d49e 100644 --- a/libgo/runtime/go-append.c +++ b/libgo/runtime/go-append.c @@ -37,6 +37,7 @@ __go_append (struct __go_open_array a, void *bvalues, uintptr_t bcount, if (count > a.__capacity) { intgo m; + uintptr capmem; void *n; m = a.__capacity; @@ -57,7 +58,9 @@ __go_append (struct __go_open_array a, void *bvalues, uintptr_t bcount, if (element_size > 0 && (uintptr) m > MaxMem / element_size) runtime_panicstring ("growslice: cap out of range"); - n = __go_alloc (m * element_size); + capmem = runtime_roundupsize (m * element_size); + + n = __go_alloc (capmem); __builtin_memcpy (n, a.__values, a.__count * element_size); a.__values = n; diff --git a/libgo/runtime/go-defer.c b/libgo/runtime/go-defer.c index 4c61ae7..5dd8c31 100644 --- a/libgo/runtime/go-defer.c +++ b/libgo/runtime/go-defer.c @@ -20,7 +20,7 @@ __go_defer (_Bool *frame, void (*pfn) (void *), void *arg) struct __go_defer_stack *n; g = runtime_g (); - n = (struct __go_defer_stack *) __go_alloc (sizeof (struct __go_defer_stack)); + n = runtime_newdefer (); n->__next = g->defer; n->__frame = frame; n->__panic = g->panic; @@ -28,7 +28,7 @@ __go_defer (_Bool *frame, void (*pfn) (void *), void *arg) n->__arg = arg; n->__retaddr = NULL; n->__makefunc_can_recover = 0; - n->__free = 1; + n->__special = 0; g->defer = n; } @@ -44,7 +44,6 @@ __go_undefer (_Bool *frame) { struct __go_defer_stack *d; void (*pfn) (void *); - M *m; d = g->defer; pfn = d->__pfn; @@ -59,9 +58,8 @@ __go_undefer (_Bool *frame) call to syscall.CgocallBackDone, in which case we will not have a memory context. Don't try to free anything in that case--the GC will release it later. */ - m = runtime_m (); - if (m != NULL && m->mcache != NULL && d->__free) - __go_free (d); + if (runtime_m () != NULL) + runtime_freedefer (d); /* Since we are executing a defer function here, we know we are returning from the calling function. If the calling diff --git a/libgo/runtime/go-defer.h b/libgo/runtime/go-defer.h index d110a87..acf2d40 100644 --- a/libgo/runtime/go-defer.h +++ b/libgo/runtime/go-defer.h @@ -20,8 +20,8 @@ struct __go_defer_stack /* The value of the panic stack when this function is deferred. This function can not recover this value from the panic stack. - This can happen if a deferred function uses its own defer - statement. */ + This can happen if a deferred function has a defer statement + itself. */ struct __go_panic_stack *__panic; /* The function to call. */ @@ -41,7 +41,7 @@ struct __go_defer_stack useful. */ _Bool __makefunc_can_recover; - /* Set to true if this defer stack entry should be freed when - done. */ - _Bool __free; + /* Set to true if this defer stack entry is not part of the defer + pool. 
*/ + _Bool __special; }; diff --git a/libgo/runtime/go-panic.c b/libgo/runtime/go-panic.c index 0cacbcd..77975c6 100644 --- a/libgo/runtime/go-panic.c +++ b/libgo/runtime/go-panic.c @@ -54,7 +54,6 @@ __go_panic (struct __go_empty_interface arg) { struct __go_defer_stack *d; void (*pfn) (void *); - M *m; d = g->defer; if (d == NULL) @@ -101,9 +100,8 @@ __go_panic (struct __go_empty_interface arg) call to syscall.CgocallBackDone, in which case we will not have a memory context. Don't try to free anything in that case--the GC will release it later. */ - m = runtime_m (); - if (m != NULL && m->mcache != NULL && d->__free) - __go_free (d); + if (runtime_m () != NULL) + runtime_freedefer (d); } /* The panic was not recovered. */ diff --git a/libgo/runtime/go-setenv.c b/libgo/runtime/go-setenv.c index 6c7378c..a75d7c4 100644 --- a/libgo/runtime/go-setenv.c +++ b/libgo/runtime/go-setenv.c @@ -11,6 +11,8 @@ #include "go-alloc.h" #include "runtime.h" +#include "arch.h" +#include "malloc.h" /* Set the C environment from Go. This is called by syscall.Setenv. */ @@ -23,6 +25,7 @@ setenv_c (String k, String v) unsigned char *kn; const byte *vs; unsigned char *vn; + intgo len; ks = k.str; if (ks == NULL) @@ -38,14 +41,22 @@ setenv_c (String k, String v) if (ks != NULL && ks[k.len] != 0) { - kn = __go_alloc (k.len + 1); + // Objects that are explicitly freed must be at least 16 bytes in size, + // so that they are not allocated using tiny alloc. + len = k.len + 1; + if (len < TinySize) + len = TinySize; + kn = __go_alloc (len); __builtin_memcpy (kn, ks, k.len); ks = kn; } if (vs != NULL && vs[v.len] != 0) { - vn = __go_alloc (v.len + 1); + len = v.len + 1; + if (len < TinySize) + len = TinySize; + vn = __go_alloc (len); __builtin_memcpy (vn, vs, v.len); vs = vn; } @@ -54,7 +65,10 @@ setenv_c (String k, String v) #else /* !defined(HAVE_SETENV) */ - kn = __go_alloc (k.len + v.len + 2); + len = k.len + v.len + 2; + if (len < TinySize) + len = TinySize; + kn = __go_alloc (len); __builtin_memcpy (kn, ks, k.len); kn[k.len] = '='; __builtin_memcpy (kn + k.len + 1, vs, v.len); diff --git a/libgo/runtime/go-string-to-byte-array.c b/libgo/runtime/go-string-to-byte-array.c index 5e03033..a4edb50 100644 --- a/libgo/runtime/go-string-to-byte-array.c +++ b/libgo/runtime/go-string-to-byte-array.c @@ -12,14 +12,17 @@ struct __go_open_array __go_string_to_byte_array (String str) { + uintptr cap; unsigned char *data; struct __go_open_array ret; - data = (unsigned char *) runtime_mallocgc (str.len, 0, - FlagNoScan | FlagNoZero); + cap = runtime_roundupsize (str.len); + data = (unsigned char *) runtime_mallocgc (cap, 0, FlagNoScan | FlagNoZero); __builtin_memcpy (data, str.str, str.len); + if (cap != (uintptr) str.len) + __builtin_memset (data + str.len, 0, cap - (uintptr) str.len); ret.__values = (void *) data; ret.__count = str.len; - ret.__capacity = str.len; + ret.__capacity = (intgo) cap; return ret; } diff --git a/libgo/runtime/go-string-to-int-array.c b/libgo/runtime/go-string-to-int-array.c index d91c9e2..5546889 100644 --- a/libgo/runtime/go-string-to-int-array.c +++ b/libgo/runtime/go-string-to-int-array.c @@ -17,6 +17,7 @@ __go_string_to_int_array (String str) size_t c; const unsigned char *p; const unsigned char *pend; + uintptr mem; uint32_t *data; uint32_t *pd; struct __go_open_array ret; @@ -32,8 +33,11 @@ __go_string_to_int_array (String str) p += __go_get_rune (p, pend - p, &rune); } - data = (uint32_t *) runtime_mallocgc (c * sizeof (uint32_t), 0, - FlagNoScan | FlagNoZero); + if (c > MaxMem / sizeof 
(uint32_t)) + runtime_throw ("out of memory"); + + mem = runtime_roundupsize (c * sizeof (uint32_t)); + data = (uint32_t *) runtime_mallocgc (mem, 0, FlagNoScan | FlagNoZero); p = str.str; pd = data; while (p < pend) @@ -43,9 +47,10 @@ __go_string_to_int_array (String str) p += __go_get_rune (p, pend - p, &rune); *pd++ = rune; } - + if (mem > (uintptr) c * sizeof (uint32_t)) + __builtin_memset (data + c, 0, mem - (uintptr) c * sizeof (uint32_t)); ret.__values = (void *) data; ret.__count = c; - ret.__capacity = c; + ret.__capacity = (intgo) (mem / sizeof (uint32_t)); return ret; } diff --git a/libgo/runtime/go-unwind.c b/libgo/runtime/go-unwind.c index 04b0a28..849256b 100644 --- a/libgo/runtime/go-unwind.c +++ b/libgo/runtime/go-unwind.c @@ -80,7 +80,6 @@ __go_check_defer (_Bool *frame) { struct __go_defer_stack *d; void (*pfn) (void *); - M *m; d = g->defer; if (d == NULL || d->__frame != frame || d->__pfn == NULL) @@ -91,9 +90,8 @@ __go_check_defer (_Bool *frame) (*pfn) (d->__arg); - m = runtime_m (); - if (m != NULL && m->mcache != NULL && d->__free) - __go_free (d); + if (runtime_m () != NULL) + runtime_freedefer (d); if (n->__was_recovered) { @@ -122,7 +120,6 @@ __go_check_defer (_Bool *frame) && g->defer->__frame == frame) { struct __go_defer_stack *d; - M *m; /* This is the defer function which called recover. Simply return to stop the stack unwind, and let the Go code continue @@ -130,9 +127,8 @@ __go_check_defer (_Bool *frame) d = g->defer; g->defer = d->__next; - m = runtime_m (); - if (m != NULL && m->mcache != NULL && d->__free) - __go_free (d); + if (runtime_m () != NULL) + runtime_freedefer (d); /* We are returning from this function. */ *frame = 1; diff --git a/libgo/runtime/go-varargs.c b/libgo/runtime/go-varargs.c index 682c08d..705f55e 100644 --- a/libgo/runtime/go-varargs.c +++ b/libgo/runtime/go-varargs.c @@ -26,6 +26,12 @@ __go_fcntl (int fd, int cmd, int arg) return fcntl (fd, cmd, arg); } +int +__go_fcntl_flock (int fd, int cmd, struct flock *arg) +{ + return fcntl (fd, cmd, arg); +} + #ifdef HAVE_OPEN64 int diff --git a/libgo/runtime/lock_futex.c b/libgo/runtime/lock_futex.c index fa27013..33ef073 100644 --- a/libgo/runtime/lock_futex.c +++ b/libgo/runtime/lock_futex.c @@ -124,26 +124,36 @@ runtime_notewakeup(Note *n) void runtime_notesleep(Note *n) { + M *m = runtime_m(); + /* For gccgo it's OK to sleep in non-g0, and it happens in stoptheworld because we have not implemented preemption. if(runtime_g() != runtime_m()->g0) runtime_throw("notesleep not on g0"); */ - while(runtime_atomicload((uint32*)&n->key) == 0) + while(runtime_atomicload((uint32*)&n->key) == 0) { + m->blocked = true; runtime_futexsleep((uint32*)&n->key, 0, -1); + m->blocked = false; + } } static bool notetsleep(Note *n, int64 ns, int64 deadline, int64 now) { + M *m = runtime_m(); + // Conceptually, deadline and now are local variables. // They are passed as arguments so that the space for them // does not count against our nosplit stack sequence. 
if(ns < 0) { - while(runtime_atomicload((uint32*)&n->key) == 0) + while(runtime_atomicload((uint32*)&n->key) == 0) { + m->blocked = true; runtime_futexsleep((uint32*)&n->key, 0, -1); + m->blocked = false; + } return true; } @@ -152,7 +162,9 @@ notetsleep(Note *n, int64 ns, int64 deadline, int64 now) deadline = runtime_nanotime() + ns; for(;;) { + m->blocked = true; runtime_futexsleep((uint32*)&n->key, 0, ns); + m->blocked = false; if(runtime_atomicload((uint32*)&n->key) != 0) break; now = runtime_nanotime(); diff --git a/libgo/runtime/lock_sema.c b/libgo/runtime/lock_sema.c index 000b9fc..d0d551d 100644 --- a/libgo/runtime/lock_sema.c +++ b/libgo/runtime/lock_sema.c @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build darwin netbsd openbsd plan9 windows +// +build darwin netbsd openbsd plan9 solaris windows #include "runtime.h" @@ -167,7 +167,9 @@ runtime_notesleep(Note *n) return; } // Queued. Sleep. + m->blocked = true; runtime_semasleep(-1); + m->blocked = false; } static bool @@ -190,18 +192,23 @@ notetsleep(Note *n, int64 ns, int64 deadline, M *mp) if(ns < 0) { // Queued. Sleep. + m->blocked = true; runtime_semasleep(-1); + m->blocked = false; return true; } deadline = runtime_nanotime() + ns; for(;;) { // Registered. Sleep. + m->blocked = true; if(runtime_semasleep(ns) >= 0) { + m->blocked = false; // Acquired semaphore, semawakeup unregistered us. // Done. return true; } + m->blocked = false; // Interrupted or timed out. Still registered. Semaphore not acquired. ns = deadline - runtime_nanotime(); @@ -223,8 +230,10 @@ notetsleep(Note *n, int64 ns, int64 deadline, M *mp) } else if(mp == (M*)LOCKED) { // Wakeup happened so semaphore is available. // Grab it to avoid getting out of sync. + m->blocked = true; if(runtime_semasleep(-1) < 0) runtime_throw("runtime: unable to acquire - semaphore out of sync"); + m->blocked = false; return true; } else runtime_throw("runtime: unexpected waitm - semaphore out of sync"); diff --git a/libgo/runtime/malloc.goc b/libgo/runtime/malloc.goc index 33d0c39..798d875 100644 --- a/libgo/runtime/malloc.goc +++ b/libgo/runtime/malloc.goc @@ -54,6 +54,7 @@ package runtime // Mark mheap as 'no pointers', it does not contain interesting pointers but occupies ~45K. MHeap runtime_mheap; +MStats mstats; int32 runtime_checking; @@ -62,6 +63,9 @@ extern MStats mstats; // defined in zruntime_def_$GOOS_$GOARCH.go extern volatile intgo runtime_MemProfileRate __asm__ (GOSYM_PREFIX "runtime.MemProfileRate"); +static void* largealloc(uint32, uintptr*); +static void profilealloc(void *v, uintptr size, uintptr typ); + // Allocate an object of at least size bytes. // Small objects are allocated from the per-thread cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. @@ -72,12 +76,12 @@ runtime_mallocgc(uintptr size, uintptr typ, uint32 flag) M *m; G *g; int32 sizeclass; + uintptr tinysize, size1; intgo rate; MCache *c; MCacheList *l; - uintptr npages; - MSpan *s; - MLink *v; + MLink *v, *next; + byte *tiny; bool incallback; if(size == 0) { @@ -119,6 +123,81 @@ runtime_mallocgc(uintptr size, uintptr typ, uint32 flag) c = m->mcache; if(!runtime_debug.efence && size <= MaxSmallSize) { + if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) { + // Tiny allocator. + // + // Tiny allocator combines several tiny allocation requests + // into a single memory block. The resulting memory block + // is freed when all subobjects are unreachable. 
The subobjects + // must be FlagNoScan (don't have pointers), this ensures that + // the amount of potentially wasted memory is bounded. + // + // Size of the memory block used for combining (TinySize) is tunable. + // Current setting is 16 bytes, which relates to 2x worst case memory + // wastage (when all but one subobjects are unreachable). + // 8 bytes would result in no wastage at all, but provides less + // opportunities for combining. + // 32 bytes provides more opportunities for combining, + // but can lead to 4x worst case wastage. + // The best case winning is 8x regardless of block size. + // + // Objects obtained from tiny allocator must not be freed explicitly. + // So when an object will be freed explicitly, we ensure that + // its size >= TinySize. + // + // SetFinalizer has a special case for objects potentially coming + // from tiny allocator, it such case it allows to set finalizers + // for an inner byte of a memory block. + // + // The main targets of tiny allocator are small strings and + // standalone escaping variables. On a json benchmark + // the allocator reduces number of allocations by ~12% and + // reduces heap size by ~20%. + + tinysize = c->tinysize; + if(size <= tinysize) { + tiny = c->tiny; + // Align tiny pointer for required (conservative) alignment. + if((size&7) == 0) + tiny = (byte*)ROUND((uintptr)tiny, 8); + else if((size&3) == 0) + tiny = (byte*)ROUND((uintptr)tiny, 4); + else if((size&1) == 0) + tiny = (byte*)ROUND((uintptr)tiny, 2); + size1 = size + (tiny - c->tiny); + if(size1 <= tinysize) { + // The object fits into existing tiny block. + v = (MLink*)tiny; + c->tiny += size1; + c->tinysize -= size1; + m->mallocing = 0; + m->locks--; + if(incallback) + runtime_entersyscall(); + return v; + } + } + // Allocate a new TinySize block. + l = &c->list[TinySizeClass]; + if(l->list == nil) + runtime_MCache_Refill(c, TinySizeClass); + v = l->list; + next = v->next; + if(next != nil) // prefetching nil leads to a DTLB miss + PREFETCH(next); + l->list = next; + l->nlist--; + ((uint64*)v)[0] = 0; + ((uint64*)v)[1] = 0; + // See if we need to replace the existing tiny block with the new one + // based on amount of remaining free space. + if(TinySize-size > tinysize) { + c->tiny = (byte*)v + size; + c->tinysize = TinySize - size; + } + size = TinySize; + goto done; + } // Allocate from mcache free lists. // Inlined version of SizeToClass(). if(size <= 1024-8) @@ -130,31 +209,22 @@ runtime_mallocgc(uintptr size, uintptr typ, uint32 flag) if(l->list == nil) runtime_MCache_Refill(c, sizeclass); v = l->list; - l->list = v->next; + next = v->next; + if(next != nil) // prefetching nil leads to a DTLB miss + PREFETCH(next); + l->list = next; l->nlist--; if(!(flag & FlagNoZero)) { v->next = nil; // block is zeroed iff second word is zero ... - if(size > sizeof(uintptr) && ((uintptr*)v)[1] != 0) + if(size > 2*sizeof(uintptr) && ((uintptr*)v)[1] != 0) runtime_memclr((byte*)v, size); } + done: c->local_cachealloc += size; } else { - // TODO(rsc): Report tracebacks for very large allocations. - // Allocate directly from heap. 
- npages = size >> PageShift; - if((size & PageMask) != 0) - npages++; - s = runtime_MHeap_Alloc(&runtime_mheap, npages, 0, 1, !(flag & FlagNoZero)); - if(s == nil) - runtime_throw("out of memory"); - s->limit = (byte*)(s->start<<PageShift) + size; - size = npages<<PageShift; - v = (void*)(s->start << PageShift); - - // setup for mark sweep - runtime_markspan(v, 0, 0, true); + v = largealloc(flag, &size); } if(flag & FlagNoGC) @@ -180,40 +250,83 @@ runtime_mallocgc(uintptr size, uintptr typ, uint32 flag) m->mallocing = 0; if(UseSpanType && !(flag & FlagNoScan) && typ != 0 && m->settype_bufsize == nelem(m->settype_buf)) runtime_settype_flush(m); - m->locks--; + if(raceenabled) + runtime_racemalloc(v, size); if(runtime_debug.allocfreetrace) goto profile; if(!(flag & FlagNoProfiling) && (rate = runtime_MemProfileRate) > 0) { - if(size >= (uint32) rate) - goto profile; - if((uint32) m->mcache->next_sample > size) - m->mcache->next_sample -= size; + if(size < (uintptr)rate && size < (uintptr)(uint32)c->next_sample) + c->next_sample -= size; else { - // pick next profile time - // If you change this, also change allocmcache. - if(rate > 0x3fffffff) // make 2*rate not overflow - rate = 0x3fffffff; - m->mcache->next_sample = runtime_fastrand1() % (2*rate); profile: - runtime_setblockspecial(v, true); - runtime_MProf_Malloc(v, size, typ); + profilealloc(v, size, typ); } } + m->locks--; + if(!(flag & FlagNoInvokeGC) && mstats.heap_alloc >= mstats.next_gc) runtime_gc(0); - if(raceenabled) - runtime_racemalloc(v, size); - if(incallback) runtime_entersyscall(); return v; } +static void* +largealloc(uint32 flag, uintptr *sizep) +{ + uintptr npages, size; + MSpan *s; + void *v; + + // Allocate directly from heap. + size = *sizep; + if(size + PageSize < size) + runtime_throw("out of memory"); + npages = size >> PageShift; + if((size & PageMask) != 0) + npages++; + s = runtime_MHeap_Alloc(&runtime_mheap, npages, 0, 1, !(flag & FlagNoZero)); + if(s == nil) + runtime_throw("out of memory"); + s->limit = (byte*)(s->start<<PageShift) + size; + *sizep = npages<<PageShift; + v = (void*)(s->start << PageShift); + // setup for mark sweep + runtime_markspan(v, 0, 0, true); + return v; +} + +static void +profilealloc(void *v, uintptr size, uintptr typ) +{ + uintptr rate; + int32 next; + MCache *c; + + c = runtime_m()->mcache; + rate = runtime_MemProfileRate; + if(size < rate) { + // pick next profile time + // If you change this, also change allocmcache. + if(rate > 0x3fffffff) // make 2*rate not overflow + rate = 0x3fffffff; + next = runtime_fastrand1() % (2*rate); + // Subtract the "remainder" of the current allocation. + // Otherwise objects that are close in size to sampling rate + // will be under-sampled, because we consistently discard this remainder. + next -= (size - c->next_sample); + if(next < 0) + next = 0; + c->next_sample = next; + } + runtime_MProf_Malloc(v, size, typ); +} + void* __go_alloc(uintptr size) { @@ -228,7 +341,6 @@ __go_free(void *v) int32 sizeclass; MSpan *s; MCache *c; - uint32 prof; uintptr size; if(v == nil) @@ -246,18 +358,27 @@ __go_free(void *v) runtime_printf("free %p: not an allocated block\n", v); runtime_throw("free runtime_mlookup"); } - prof = runtime_blockspecial(v); + size = s->elemsize; + sizeclass = s->sizeclass; + // Objects that are smaller than TinySize can be allocated using tiny alloc, + // if then such object is combined with an object with finalizer, we will crash. 
+ if(size < TinySize) + runtime_throw("freeing too small block"); if(raceenabled) runtime_racefree(v); - // Find size class for v. - sizeclass = s->sizeclass; + // Ensure that the span is swept. + // If we free into an unswept span, we will corrupt GC bitmaps. + runtime_MSpan_EnsureSwept(s); + + if(s->specials != nil) + runtime_freeallspecials(s, v, size); + c = m->mcache; if(sizeclass == 0) { // Large object. - size = s->npages<<PageShift; - *(uintptr*)(s->start<<PageShift) = (uintptr)0xfeedfeedfeedfeedll; // mark as "needs to be zeroed" + s->needzero = 1; // Must mark v freed before calling unmarkspan and MHeap_Free: // they might coalesce v into other spans and change the bitmap further. runtime_markfreed(v, size); @@ -270,9 +391,10 @@ __go_free(void *v) c->local_largefree += size; } else { // Small object. - size = runtime_class_to_size[sizeclass]; - if(size > sizeof(uintptr)) + if(size > 2*sizeof(uintptr)) ((uintptr*)v)[1] = (uintptr)0xfeedfeedfeedfeedll; // mark as "needs to be zeroed" + else if(size > sizeof(uintptr)) + ((uintptr*)v)[1] = 0; // Must mark v freed before calling MCache_Free: // it might coalesce v and other blocks into a bigger span // and change the bitmap further. @@ -280,8 +402,6 @@ __go_free(void *v) c->local_nsmallfree[sizeclass]++; runtime_MCache_Free(c, v, sizeclass, size); } - if(prof) - runtime_MProf_Free(v, size); m->mallocing = 0; } @@ -392,6 +512,12 @@ runtime_purgecachedstats(MCache *c) extern uintptr runtime_sizeof_C_MStats __asm__ (GOSYM_PREFIX "runtime.Sizeof_C_MStats"); +// Size of the trailing by_size array differs between Go and C, +// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility. +// sizeof_C_MStats is what C thinks about size of Go struct. + +// Initialized in mallocinit because it's defined in go/runtime/mem.go. + #define MaxArena32 (2U<<30) void @@ -400,11 +526,10 @@ runtime_mallocinit(void) byte *p; uintptr arena_size, bitmap_size, spans_size; extern byte _end[]; - byte *want; uintptr limit; uint64 i; - runtime_sizeof_C_MStats = sizeof(MStats); + runtime_sizeof_C_MStats = sizeof(MStats) - (NumSizeClasses - 61) * sizeof(mstats.by_size[0]); p = nil; arena_size = 0; @@ -419,6 +544,9 @@ runtime_mallocinit(void) runtime_InitSizes(); + if(runtime_class_to_size[TinySizeClass] != TinySize) + runtime_throw("bad TinySizeClass"); + // limit = runtime_memlimit(); // See https://code.google.com/p/go/issues/detail?id=5049 // TODO(rsc): Fix after 1.1. @@ -457,7 +585,7 @@ runtime_mallocinit(void) spans_size = arena_size / PageSize * sizeof(runtime_mheap.spans[0]); spans_size = ROUND(spans_size, PageSize); for(i = 0; i < HeapBaseOptions; i++) { - p = runtime_SysReserve(HeapBase(i), bitmap_size + spans_size + arena_size); + p = runtime_SysReserve(HeapBase(i), bitmap_size + spans_size + arena_size + PageSize); if(p != nil) break; } @@ -499,18 +627,16 @@ runtime_mallocinit(void) // So adjust it upward a little bit ourselves: 1/4 MB to get // away from the running binary image and then round up // to a MB boundary. 
- want = (byte*)ROUND((uintptr)_end + (1<<18), 1<<20); - if(0xffffffff - (uintptr)want <= bitmap_size + spans_size + arena_size) - want = 0; - p = runtime_SysReserve(want, bitmap_size + spans_size + arena_size); + p = (byte*)ROUND((uintptr)_end + (1<<18), 1<<20); + p = runtime_SysReserve(p, bitmap_size + spans_size + arena_size + PageSize); if(p == nil) runtime_throw("runtime: cannot reserve arena virtual address space"); - if((uintptr)p & (((uintptr)1<<PageShift)-1)) - runtime_printf("runtime: SysReserve returned unaligned address %p; asked for %p", p, - bitmap_size+spans_size+arena_size); } - if((uintptr)p & (((uintptr)1<<PageShift)-1)) - runtime_throw("runtime: SysReserve returned unaligned address"); + + // PageSize can be larger than OS definition of page size, + // so SysReserve can give us a PageSize-unaligned pointer. + // To overcome this we ask for PageSize more and round up the pointer. + p = (byte*)ROUND((uintptr)p, PageSize); runtime_mheap.spans = (MSpan**)p; runtime_mheap.bitmap = p + spans_size; @@ -523,7 +649,7 @@ runtime_mallocinit(void) runtime_m()->mcache = runtime_allocmcache(); // See if it works. - runtime_free(runtime_malloc(1)); + runtime_free(runtime_malloc(TinySize)); } void* @@ -828,16 +954,18 @@ func SetFinalizer(obj Eface, finalizer Eface) { goto throw; } ot = (const PtrType*)obj.type; - if(ot->__element_type != nil && ot->__element_type->__size == 0) { + // As an implementation detail we do not run finalizers for zero-sized objects, + // because we use &runtime_zerobase for all such allocations. + if(ot->__element_type != nil && ot->__element_type->__size == 0) return; - } if(!runtime_mlookup(obj.__object, &base, &size, nil) || obj.__object != base) { - runtime_printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n"); - goto throw; + // As an implementation detail we allow to set finalizers for an inner byte + // of an object if it could come from tiny alloc (see mallocgc for details). + if(ot->__element_type == nil || (ot->__element_type->__code&GO_NO_POINTERS) == 0 || ot->__element_type->__size >= TinySize) { + runtime_printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n"); + goto throw; + } } - ft = nil; - ot = (const PtrType*)obj.__type_descriptor; - fint = nil; if(finalizer.__type_descriptor != nil) { if(finalizer.__type_descriptor->__code != GO_FUNC) goto badfunc; @@ -856,11 +984,15 @@ func SetFinalizer(obj Eface, finalizer Eface) { // ok - satisfies non-empty interface } else goto badfunc; - } - if(!runtime_addfinalizer(obj.__object, finalizer.__type_descriptor != nil ? *(void**)finalizer.__object : nil, ft, ot)) { - runtime_printf("runtime.SetFinalizer: finalizer already set\n"); - goto throw; + ot = (const PtrType*)obj.__type_descriptor; + if(!runtime_addfinalizer(obj.__object, *(FuncVal**)finalizer.__object, ft, ot)) { + runtime_printf("runtime.SetFinalizer: finalizer already set\n"); + goto throw; + } + } else { + // NOTE: asking to remove a finalizer when there currently isn't one set is OK. + runtime_removefinalizer(obj.__object); } return; diff --git a/libgo/runtime/malloc.h b/libgo/runtime/malloc.h index 16f51a5..b5dc5a4 100644 --- a/libgo/runtime/malloc.h +++ b/libgo/runtime/malloc.h @@ -66,14 +66,14 @@ // // The small objects on the MCache and MCentral free lists // may or may not be zeroed. They are zeroed if and only if -// the second word of the object is zero. The spans in the -// page heap are always zeroed. 
When a span full of objects -// is returned to the page heap, the objects that need to be -// are zeroed first. There are two main benefits to delaying the +// the second word of the object is zero. A span in the +// page heap is zeroed unless s->needzero is set. When a span +// is allocated to break into small objects, it is zeroed if needed +// and s->needzero is set. There are two main benefits to delaying the // zeroing this way: // // 1. stack frames allocated from the small object lists -// can avoid zeroing altogether. +// or the page heap can avoid zeroing altogether. // 2. the cost of zeroing when reusing a small object is // charged to the mutator, not the garbage collector. // @@ -90,7 +90,7 @@ typedef struct GCStats GCStats; enum { - PageShift = 12, + PageShift = 13, PageSize = 1<<PageShift, PageMask = PageSize - 1, }; @@ -103,11 +103,15 @@ enum // size classes. NumSizeClasses is that number. It's needed here // because there are static arrays of this length; when msize runs its // size choosing algorithm it double-checks that NumSizeClasses agrees. - NumSizeClasses = 61, + NumSizeClasses = 67, // Tunable constants. MaxSmallSize = 32<<10, + // Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc. + TinySize = 16, + TinySizeClass = 2, + FixAllocChunk = 16<<10, // Chunk size for FixAlloc MaxMHeapList = 1<<(20 - PageShift), // Maximum page length for fixed-size list in MHeap. HeapAllocChunk = 1<<20, // Chunk size for heap growth @@ -256,7 +260,7 @@ struct MStats }; extern MStats mstats - __asm__ (GOSYM_PREFIX "runtime.VmemStats"); + __asm__ (GOSYM_PREFIX "runtime.memStats"); // Size classes. Computed and initialized by InitSizes. // @@ -269,6 +273,7 @@ extern MStats mstats // making new objects in class i int32 runtime_SizeToClass(int32); +uintptr runtime_roundupsize(uintptr); extern int32 runtime_class_to_size[NumSizeClasses]; extern int32 runtime_class_to_allocnpages[NumSizeClasses]; extern int8 runtime_size_to_class8[1024/8 + 1]; @@ -291,6 +296,10 @@ struct MCache // so they are grouped here for better caching. int32 next_sample; // trigger heap sample after allocating this many bytes intptr local_cachealloc; // bytes allocated (or freed) from cache since last lock of heap + // Allocator cache for tiny objects w/o pointers. + // See "Tiny allocator" comment in malloc.goc. + byte* tiny; + uintptr tinysize; // The rest is not accessed on every malloc. MCacheList list[NumSizeClasses]; // Local allocator stats, flushed during GC. @@ -341,6 +350,43 @@ struct MTypes uintptr data; }; +enum +{ + KindSpecialFinalizer = 1, + KindSpecialProfile = 2, + // Note: The finalizer special must be first because if we're freeing + // an object, a finalizer special will cause the freeing operation + // to abort, and we want to keep the other special records around + // if that happens. +}; + +typedef struct Special Special; +struct Special +{ + Special* next; // linked list in span + uint16 offset; // span offset of object + byte kind; // kind of Special +}; + +// The described object has a finalizer set for it. +typedef struct SpecialFinalizer SpecialFinalizer; +struct SpecialFinalizer +{ + Special; + FuncVal* fn; + const FuncType* ft; + const PtrType* ot; +}; + +// The described object is being heap profiled. +typedef struct Bucket Bucket; // from mprof.goc +typedef struct SpecialProfile SpecialProfile; +struct SpecialProfile +{ + Special; + Bucket* b; +}; + // An MSpan is a run of pages. 
enum { @@ -356,17 +402,28 @@ struct MSpan PageID start; // starting page number uintptr npages; // number of pages in span MLink *freelist; // list of free objects - uint32 ref; // number of allocated objects in this span - int32 sizeclass; // size class + // sweep generation: + // if sweepgen == h->sweepgen - 2, the span needs sweeping + // if sweepgen == h->sweepgen - 1, the span is currently being swept + // if sweepgen == h->sweepgen, the span is swept and ready to use + // h->sweepgen is incremented by 2 after every GC + uint32 sweepgen; + uint16 ref; // number of allocated objects in this span + uint8 sizeclass; // size class + uint8 state; // MSpanInUse etc + uint8 needzero; // needs to be zeroed before allocation uintptr elemsize; // computed from sizeclass or from npages - uint32 state; // MSpanInUse etc int64 unusedsince; // First time spotted by GC in MSpanFree state uintptr npreleased; // number of pages released to the OS byte *limit; // end of data in span MTypes types; // types of allocated objects in this span + Lock specialLock; // TODO: use to protect types also (instead of settype_lock) + Special *specials; // linked list of special records sorted by offset. }; void runtime_MSpan_Init(MSpan *span, PageID start, uintptr npages); +void runtime_MSpan_EnsureSwept(MSpan *span); +bool runtime_MSpan_Sweep(MSpan *span); // Every MSpan is in one doubly-linked list, // either one of the MHeap's free lists or one of the @@ -374,6 +431,7 @@ void runtime_MSpan_Init(MSpan *span, PageID start, uintptr npages); void runtime_MSpanList_Init(MSpan *list); bool runtime_MSpanList_IsEmpty(MSpan *list); void runtime_MSpanList_Insert(MSpan *list, MSpan *span); +void runtime_MSpanList_InsertBack(MSpan *list, MSpan *span); void runtime_MSpanList_Remove(MSpan *span); // from whatever list it is in @@ -390,7 +448,7 @@ struct MCentral void runtime_MCentral_Init(MCentral *c, int32 sizeclass); int32 runtime_MCentral_AllocList(MCentral *c, MLink **first); void runtime_MCentral_FreeList(MCentral *c, MLink *first); -void runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end); +bool runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end); // Main malloc heap. // The heap itself is the "free[]" and "large" arrays, @@ -399,10 +457,15 @@ struct MHeap { Lock; MSpan free[MaxMHeapList]; // free lists of given length - MSpan large; // free lists length >= MaxMHeapList - MSpan **allspans; + MSpan freelarge; // free lists length >= MaxMHeapList + MSpan busy[MaxMHeapList]; // busy lists of large objects of given length + MSpan busylarge; // busy lists of large objects length >= MaxMHeapList + MSpan **allspans; // all spans out there + MSpan **sweepspans; // copy of allspans referenced by sweeper uint32 nspan; uint32 nspancap; + uint32 sweepgen; // sweep generation, see comment in MSpan + uint32 sweepdone; // all spans are swept // span lookup MSpan** spans; @@ -426,6 +489,9 @@ struct MHeap FixAlloc spanalloc; // allocator for Span* FixAlloc cachealloc; // allocator for MCache* + FixAlloc specialfinalizeralloc; // allocator for SpecialFinalizer* + FixAlloc specialprofilealloc; // allocator for SpecialProfile* + Lock speciallock; // lock for sepcial record allocators. // Malloc stats. 
uint64 largefree; // bytes freed for large objects (>MaxSmallSize) @@ -435,7 +501,7 @@ struct MHeap extern MHeap runtime_mheap; void runtime_MHeap_Init(MHeap *h); -MSpan* runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed); +MSpan* runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool needzero); void runtime_MHeap_Free(MHeap *h, MSpan *s, int32 acct); MSpan* runtime_MHeap_Lookup(MHeap *h, void *v); MSpan* runtime_MHeap_LookupMaybe(MHeap *h, void *v); @@ -449,6 +515,7 @@ void* runtime_mallocgc(uintptr size, uintptr typ, uint32 flag); void* runtime_persistentalloc(uintptr size, uintptr align, uint64 *stat); int32 runtime_mlookup(void *v, byte **base, uintptr *size, MSpan **s); void runtime_gc(int32 force); +uintptr runtime_sweepone(void); void runtime_markscan(void *v); void runtime_marknogc(void *v); void runtime_checkallocated(void *v, uintptr n); @@ -457,8 +524,6 @@ void runtime_checkfreed(void *v, uintptr n); extern int32 runtime_checking; void runtime_markspan(void *v, uintptr size, uintptr n, bool leftover); void runtime_unmarkspan(void *v, uintptr size); -bool runtime_blockspecial(void*); -void runtime_setblockspecial(void*, bool); void runtime_purgecachedstats(MCache*); void* runtime_cnew(const Type*); void* runtime_cnewarray(const Type*, intgo); @@ -486,17 +551,25 @@ struct Obj }; void runtime_MProf_Malloc(void*, uintptr, uintptr); -void runtime_MProf_Free(void*, uintptr); +void runtime_MProf_Free(Bucket*, void*, uintptr, bool); void runtime_MProf_GC(void); -void runtime_MProf_Mark(void (*addroot)(Obj)); +void runtime_MProf_TraceGC(void); +struct Workbuf; +void runtime_MProf_Mark(struct Workbuf**, void (*)(struct Workbuf**, Obj)); int32 runtime_gcprocs(void); void runtime_helpgc(int32 nproc); void runtime_gchelper(void); +void runtime_setprofilebucket(void *p, Bucket *b); + struct __go_func_type; struct __go_ptr_type; -bool runtime_getfinalizer(void *p, bool del, FuncVal **fn, const struct __go_func_type **ft, const struct __go_ptr_type **ot); -void runtime_walkfintab(void (*fn)(void*), void (*scan)(Obj)); +bool runtime_addfinalizer(void *p, FuncVal *fn, const struct __go_func_type*, const struct __go_ptr_type*); +void runtime_removefinalizer(void*); +void runtime_queuefinalizer(void *p, FuncVal *fn, const struct __go_func_type *ft, const struct __go_ptr_type *ot); + +void runtime_freeallspecials(MSpan *span, void *p, uintptr size); +bool runtime_freespecial(Special *s, void *p, uintptr size, bool freed); enum { @@ -514,6 +587,6 @@ void runtime_gc_itab_ptr(Eface*); void runtime_memorydump(void); -void runtime_proc_scan(void (*)(Obj)); -void runtime_time_scan(void (*)(Obj)); -void runtime_netpoll_scan(void (*)(Obj)); +void runtime_proc_scan(struct Workbuf**, void (*)(struct Workbuf**, Obj)); +void runtime_time_scan(struct Workbuf**, void (*)(struct Workbuf**, Obj)); +void runtime_netpoll_scan(struct Workbuf**, void (*)(struct Workbuf**, Obj)); diff --git a/libgo/runtime/mcentral.c b/libgo/runtime/mcentral.c index 8191610..1285336 100644 --- a/libgo/runtime/mcentral.c +++ b/libgo/runtime/mcentral.c @@ -39,17 +39,58 @@ runtime_MCentral_AllocList(MCentral *c, MLink **pfirst) { MSpan *s; int32 cap, n; + uint32 sg; runtime_lock(c); - // Replenish central list if empty. 
- if(runtime_MSpanList_IsEmpty(&c->nonempty)) { - if(!MCentral_Grow(c)) { + sg = runtime_mheap.sweepgen; +retry: + for(s = c->nonempty.next; s != &c->nonempty; s = s->next) { + if(s->sweepgen == sg-2 && runtime_cas(&s->sweepgen, sg-2, sg-1)) { runtime_unlock(c); - *pfirst = nil; - return 0; + runtime_MSpan_Sweep(s); + runtime_lock(c); + // the span could have been moved to heap, retry + goto retry; + } + if(s->sweepgen == sg-1) { + // the span is being swept by background sweeper, skip + continue; + } + // we have a nonempty span that does not require sweeping, allocate from it + goto havespan; + } + + for(s = c->empty.next; s != &c->empty; s = s->next) { + if(s->sweepgen == sg-2 && runtime_cas(&s->sweepgen, sg-2, sg-1)) { + // we have an empty span that requires sweeping, + // sweep it and see if we can free some space in it + runtime_MSpanList_Remove(s); + // swept spans are at the end of the list + runtime_MSpanList_InsertBack(&c->empty, s); + runtime_unlock(c); + runtime_MSpan_Sweep(s); + runtime_lock(c); + // the span could be moved to nonempty or heap, retry + goto retry; + } + if(s->sweepgen == sg-1) { + // the span is being swept by background sweeper, skip + continue; } + // already swept empty span, + // all subsequent ones must also be either swept or in process of sweeping + break; + } + + // Replenish central list if empty. + if(!MCentral_Grow(c)) { + runtime_unlock(c); + *pfirst = nil; + return 0; } s = c->nonempty.next; + +havespan: cap = (s->npages << PageShift) / s->elemsize; n = cap - s->ref; *pfirst = s->freelist; @@ -57,7 +98,7 @@ runtime_MCentral_AllocList(MCentral *c, MLink **pfirst) s->ref += n; c->nfree -= n; runtime_MSpanList_Remove(s); - runtime_MSpanList_Insert(&c->empty, s); + runtime_MSpanList_InsertBack(&c->empty, s); runtime_unlock(c); return n; } @@ -106,7 +147,7 @@ MCentral_Free(MCentral *c, void *v) size = runtime_class_to_size[c->sizeclass]; runtime_MSpanList_Remove(s); runtime_unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift); - *(uintptr*)(s->start<<PageShift) = 1; // needs zeroing + s->needzero = 1; s->freelist = nil; c->nfree -= (s->npages << PageShift) / size; runtime_unlock(c); @@ -116,8 +157,9 @@ MCentral_Free(MCentral *c, void *v) } // Free n objects from a span s back into the central free list c. -// Called from GC. -void +// Called during sweep. +// Returns true if the span was returned to heap. +bool runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end) { int32 size; @@ -136,19 +178,21 @@ runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *e s->ref -= n; c->nfree += n; - // If s is completely freed, return it to the heap. - if(s->ref == 0) { - size = runtime_class_to_size[c->sizeclass]; - runtime_MSpanList_Remove(s); - *(uintptr*)(s->start<<PageShift) = 1; // needs zeroing - s->freelist = nil; - c->nfree -= (s->npages << PageShift) / size; - runtime_unlock(c); - runtime_unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift); - runtime_MHeap_Free(&runtime_mheap, s, 0); - } else { + if(s->ref != 0) { runtime_unlock(c); + return false; } + + // s is completely freed, return it to the heap. 
+ size = runtime_class_to_size[c->sizeclass]; + runtime_MSpanList_Remove(s); + s->needzero = 1; + s->freelist = nil; + c->nfree -= (s->npages << PageShift) / size; + runtime_unlock(c); + runtime_unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift); + runtime_MHeap_Free(&runtime_mheap, s, 0); + return true; } void diff --git a/libgo/runtime/mfinal.c b/libgo/runtime/mfinal.c deleted file mode 100644 index 625af52..0000000 --- a/libgo/runtime/mfinal.c +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright 2010 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "runtime.h" -#include "arch.h" -#include "malloc.h" -#include "go-type.h" - -enum { debug = 0 }; - -typedef struct Fin Fin; -struct Fin -{ - FuncVal *fn; - const struct __go_func_type *ft; - const struct __go_ptr_type *ot; -}; - -// Finalizer hash table. Direct hash, linear scan, at most 3/4 full. -// Table size is power of 3 so that hash can be key % max. -// Key[i] == (void*)-1 denotes free but formerly occupied entry -// (doesn't stop the linear scan). -// Key and val are separate tables because the garbage collector -// must be instructed to ignore the pointers in key but follow the -// pointers in val. -typedef struct Fintab Fintab; -struct Fintab -{ - Lock; - void **fkey; - Fin *val; - int32 nkey; // number of non-nil entries in key - int32 ndead; // number of dead (-1) entries in key - int32 max; // size of key, val allocations -}; - -#define TABSZ 17 -#define TAB(p) (&fintab[((uintptr)(p)>>3)%TABSZ]) - -static struct { - Fintab; - uint8 pad[0 /* CacheLineSize - sizeof(Fintab) */]; -} fintab[TABSZ]; - -static void -addfintab(Fintab *t, void *k, FuncVal *fn, const struct __go_func_type *ft, const struct __go_ptr_type *ot) -{ - int32 i, j; - - i = (uintptr)k % (uintptr)t->max; - for(j=0; j<t->max; j++) { - if(t->fkey[i] == nil) { - t->nkey++; - goto ret; - } - if(t->fkey[i] == (void*)-1) { - t->ndead--; - goto ret; - } - if(++i == t->max) - i = 0; - } - - // cannot happen - table is known to be non-full - runtime_throw("finalizer table inconsistent"); - -ret: - t->fkey[i] = k; - t->val[i].fn = fn; - t->val[i].ft = ft; - t->val[i].ot = ot; -} - -static bool -lookfintab(Fintab *t, void *k, bool del, Fin *f) -{ - int32 i, j; - - if(t->max == 0) - return false; - i = (uintptr)k % (uintptr)t->max; - for(j=0; j<t->max; j++) { - if(t->fkey[i] == nil) - return false; - if(t->fkey[i] == k) { - if(f) - *f = t->val[i]; - if(del) { - t->fkey[i] = (void*)-1; - t->val[i].fn = nil; - t->val[i].ft = nil; - t->val[i].ot = nil; - t->ndead++; - } - return true; - } - if(++i == t->max) - i = 0; - } - - // cannot happen - table is known to be non-full - runtime_throw("finalizer table inconsistent"); - return false; -} - -static void -resizefintab(Fintab *tab) -{ - Fintab newtab; - void *k; - int32 i; - - runtime_memclr((byte*)&newtab, sizeof newtab); - newtab.max = tab->max; - if(newtab.max == 0) - newtab.max = 3*3*3; - else if(tab->ndead < tab->nkey/2) { - // grow table if not many dead values. - // otherwise just rehash into table of same size. 
- newtab.max *= 3; - } - - newtab.fkey = runtime_mallocgc(newtab.max*sizeof newtab.fkey[0], 0, FlagNoInvokeGC|FlagNoScan); - newtab.val = runtime_mallocgc(newtab.max*sizeof newtab.val[0], 0, FlagNoInvokeGC); - - for(i=0; i<tab->max; i++) { - k = tab->fkey[i]; - if(k != nil && k != (void*)-1) - addfintab(&newtab, k, tab->val[i].fn, tab->val[i].ft, tab->val[i].ot); - } - - runtime_free(tab->fkey); - runtime_free(tab->val); - - tab->fkey = newtab.fkey; - tab->val = newtab.val; - tab->nkey = newtab.nkey; - tab->ndead = newtab.ndead; - tab->max = newtab.max; -} - -bool -runtime_addfinalizer(void *p, FuncVal *f, const struct __go_func_type *ft, const struct __go_ptr_type *ot) -{ - Fintab *tab; - byte *base; - - if(debug) { - if(!runtime_mlookup(p, &base, nil, nil) || p != base) - runtime_throw("addfinalizer on invalid pointer"); - } - - tab = TAB(p); - runtime_lock(tab); - if(f == nil) { - lookfintab(tab, p, true, nil); - runtime_unlock(tab); - return true; - } - - if(lookfintab(tab, p, false, nil)) { - runtime_unlock(tab); - return false; - } - - if(tab->nkey >= tab->max/2+tab->max/4) { - // keep table at most 3/4 full: - // allocate new table and rehash. - resizefintab(tab); - } - - addfintab(tab, p, f, ft, ot); - runtime_setblockspecial(p, true); - runtime_unlock(tab); - return true; -} - -// get finalizer; if del, delete finalizer. -// caller is responsible for updating RefHasFinalizer (special) bit. -bool -runtime_getfinalizer(void *p, bool del, FuncVal **fn, const struct __go_func_type **ft, const struct __go_ptr_type **ot) -{ - Fintab *tab; - bool res; - Fin f; - - tab = TAB(p); - runtime_lock(tab); - res = lookfintab(tab, p, del, &f); - runtime_unlock(tab); - if(res==false) - return false; - *fn = f.fn; - *ft = f.ft; - *ot = f.ot; - return true; -} - -void -runtime_walkfintab(void (*fn)(void*), void (*addroot)(Obj)) -{ - void **key; - void **ekey; - int32 i; - - for(i=0; i<TABSZ; i++) { - runtime_lock(&fintab[i]); - key = fintab[i].fkey; - ekey = key + fintab[i].max; - for(; key < ekey; key++) - if(*key != nil && *key != ((void*)-1)) - fn(*key); - addroot((Obj){(byte*)&fintab[i].fkey, sizeof(void*), 0}); - addroot((Obj){(byte*)&fintab[i].val, sizeof(void*), 0}); - runtime_unlock(&fintab[i]); - } -} diff --git a/libgo/runtime/mgc0.c b/libgo/runtime/mgc0.c index d665d92..10dd412 100644 --- a/libgo/runtime/mgc0.c +++ b/libgo/runtime/mgc0.c @@ -2,7 +2,53 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Garbage collector. +// Garbage collector (GC). +// +// GC is: +// - mark&sweep +// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc) +// - parallel (up to MaxGcproc threads) +// - partially concurrent (mark is stop-the-world, while sweep is concurrent) +// - non-moving/non-compacting +// - full (non-partial) +// +// GC rate. +// Next GC is after we've allocated an extra amount of memory proportional to +// the amount already in use. The proportion is controlled by GOGC environment variable +// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M +// (this mark is tracked in next_gc variable). This keeps the GC cost in linear +// proportion to the allocation cost. Adjusting GOGC just changes the linear constant +// (and also the amount of extra memory used). +// +// Concurrent sweep. +// The sweep phase proceeds concurrently with normal program execution. 
+// The heap is swept span-by-span both lazily (when a goroutine needs another span) +// and concurrently in a background goroutine (this helps programs that are not CPU bound). +// However, at the end of the stop-the-world GC phase we don't know the size of the live heap, +// and so next_gc calculation is tricky and happens as follows. +// At the end of the stop-the-world phase next_gc is conservatively set based on total +// heap size; all spans are marked as "needs sweeping". +// Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory. +// The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc +// closer to the target value. However, this is not enough to avoid over-allocating memory. +// Consider that a goroutine wants to allocate a new span for a large object and +// there are no free swept spans, but there are small-object unswept spans. +// If the goroutine naively allocates a new span, it can surpass the yet-unknown +// target next_gc value. In order to prevent such cases (1) when a goroutine needs +// to allocate a new small-object span, it sweeps small-object spans for the same +// object size until it frees at least one object; (2) when a goroutine needs to +// allocate large-object span from heap, it sweeps spans until it frees at least +// that many pages into heap. Together these two measures ensure that we don't surpass +// target next_gc value by a large margin. There is an exception: if a goroutine sweeps +// and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span, +// but there can still be other one-page unswept spans which could be combined into a two-page span. +// It's critical to ensure that no operations proceed on unswept spans (that would corrupt +// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache, +// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it. +// When a goroutine explicitly frees an object or sets a finalizer, it ensures that +// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish). +// The finalizer goroutine is kicked off only when all spans are swept. +// When the next GC starts, it sweeps all not-yet-swept spans (if any). #include <unistd.h> @@ -19,12 +65,9 @@ #define cap __capacity // Iface aka __go_interface #define tab __methods -// Eface aka __go_empty_interface. -#define type __type_descriptor // Hmap aka __go_map typedef struct __go_map Hmap; // Type aka __go_type_descriptor -#define kind __code #define string __reflection #define KindPtr GO_PTR #define KindNoPointers GO_NO_POINTERS @@ -43,15 +86,19 @@ extern void * __splitstack_find_context (void *context[10], size_t *, void **, enum { Debug = 0, - DebugMark = 0, // run second pass to check mark CollectStats = 0, ScanStackByFrames = 1, IgnorePreciseGC = 0, + ConcurrentSweep = 1, // Four bits per word (see #defines below). wordsPerBitmapWord = sizeof(void*)*8/4, bitShift = sizeof(void*)*8/4, + WorkbufSize = 16*1024, + RootBlockSize = 4*1024, + FinBlockSize = 4*1024, + handoffThreshold = 4, IntermediateBufferCapacity = 64, @@ -66,8 +113,20 @@ enum { BitsPointer = 1, BitsIface = 2, BitsEface = 3, + + RootData = 0, + RootBss = 1, + RootFinalizers = 2, + RootSpanTypes = 3, + RootFlushCaches = 4, + RootCount = 5, }; +#define GcpercentUnknown (-2) + +// Initialized from $GOGC. GOGC=off means no gc. 
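The pacing arithmetic described in the comment above, as a standalone sketch (function names are illustrative; numbers are invented; assumes gcpercent >= 0, i.e. GOGC is not "off"):

    #include <stdint.h>

    /* target set at the end of mark, same formula gc() uses below */
    static uint64_t
    next_gc_target(uint64_t heap_alloc, int32_t gcpercent)
    {
            return heap_alloc + heap_alloc*(uint64_t)gcpercent/100;
    }

    /* amount each swept span subtracts from next_gc, the same factor
       MSpan_Sweep applies below: freed*(gcpercent+100)/100 */
    static uint64_t
    sweep_credit(uint64_t freed_bytes, int32_t gcpercent)
    {
            return freed_bytes*(uint64_t)(gcpercent+100)/100;
    }

With GOGC=100 and a 4 MB live heap, next_gc_target(4<<20, 100) is 8 MB, matching the example in the comment; concurrent sweeping then subtracts sweep_credit() for every span it frees, pulling the conservatively set goal down as garbage is actually discovered.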
+static int32 gcpercent = GcpercentUnknown; + static struct { Lock; @@ -89,16 +148,34 @@ sync_runtime_registerPool(void **p) static void clearpools(void) { - void **p, **next; - - for(p = pools.head; p != nil; p = next) { - next = p[0]; - p[0] = nil; // next - p[1] = nil; // slice - p[2] = nil; - p[3] = nil; + void **pool, **next; + P *p, **pp; + MCache *c; + uintptr off; + + // clear sync.Pool's + for(pool = pools.head; pool != nil; pool = next) { + next = pool[0]; + pool[0] = nil; // next + pool[1] = nil; // local + pool[2] = nil; // localSize + off = (uintptr)pool[3] / sizeof(void*); + pool[off+0] = nil; // global slice + pool[off+1] = nil; + pool[off+2] = nil; } pools.head = nil; + + for(pp=runtime_allp; (p=*pp) != nil; pp++) { + // clear tinyalloc pool + c = p->mcache; + if(c != nil) { + c->tiny = nil; + c->tinysize = 0; + } + // clear defer pools + p->deferpool = nil; + } } // Bits in per-word bitmap. @@ -149,11 +226,10 @@ clearpools(void) // uint32 runtime_worldsema = 1; -// The size of Workbuf is N*PageSize. typedef struct Workbuf Workbuf; struct Workbuf { -#define SIZE (2*PageSize-sizeof(LFNode)-sizeof(uintptr)) +#define SIZE (WorkbufSize-sizeof(LFNode)-sizeof(uintptr)) LFNode node; // must be first uintptr nobj; Obj obj[SIZE/sizeof(Obj) - 1]; @@ -180,45 +256,42 @@ struct FinBlock Finalizer fin[1]; }; -static G *fing; -static FinBlock *finq; // list of finalizers that are to be executed -static FinBlock *finc; // cache of free blocks -static FinBlock *allfin; // list of all blocks -static Lock finlock; -static int32 fingwait; +static G *fing; +static FinBlock *finq; // list of finalizers that are to be executed +static FinBlock *finc; // cache of free blocks +static FinBlock *allfin; // list of all blocks +static int32 fingwait; +static Lock gclock; -static void runfinq(void*); +static void runfinq(void*); +static void bgsweep(void*); static Workbuf* getempty(Workbuf*); static Workbuf* getfull(Workbuf*); static void putempty(Workbuf*); static Workbuf* handoff(Workbuf*); static void gchelperstart(void); +static void flushallmcaches(void); +static void addstackroots(G *gp, Workbuf **wbufp); static struct { uint64 full; // lock-free list of full blocks uint64 empty; // lock-free list of empty blocks byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait uint32 nproc; + int64 tstart; volatile uint32 nwait; volatile uint32 ndone; - volatile uint32 debugmarkdone; Note alldone; ParFor *markfor; - ParFor *sweepfor; Lock; byte *chunk; uintptr nchunk; - - Obj *roots; - uint32 nroot; - uint32 rootcap; } work __attribute__((aligned(8))); enum { GC_DEFAULT_PTR = GC_NUM_INSTR, GC_CHAN, - GC_G_PTR, GC_NUM_INSTR2 }; @@ -250,6 +323,8 @@ static struct { uint64 foundword; uint64 foundspan; } markonly; + uint32 nbgsweep; + uint32 npausesweep; } gcstats; // markonly marks an object. It returns true if the object @@ -638,11 +713,6 @@ static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR}; static uintptr chanProg[2] = {0, GC_CHAN}; #endif -#if 0 -// G* program -static uintptr gptrProg[2] = {0, GC_G_PTR}; -#endif - // Local variables of a program fragment or loop typedef struct Frame Frame; struct Frame { @@ -713,15 +783,11 @@ checkptr(void *obj, uintptr objti) // a work list in the Workbuf* structures and loops in the main function // body. Keeping an explicit work list is easier on the stack allocator and // more efficient. 
-// -// wbuf: current work buffer -// wp: storage for next queued pointer (write pointer) -// nobj: number of queued objects static void -scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) +scanblock(Workbuf *wbuf, bool keepworking) { byte *b, *arena_start, *arena_used; - uintptr n, i, end_b, elemsize, size, ti, objti, count /* , type */; + uintptr n, i, end_b, elemsize, size, ti, objti, count, /* type, */ nobj; uintptr *pc, precise_type, nominal_size; #if 0 uintptr *chan_ret, chancap; @@ -738,8 +804,9 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) Hchan *chan; ChanType *chantype; #endif + Obj *wp; - if(sizeof(Workbuf) % PageSize != 0) + if(sizeof(Workbuf) % WorkbufSize != 0) runtime_throw("scanblock: size of Workbuf is suboptimal"); // Memory arena parameters. @@ -751,6 +818,14 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) precise_type = false; nominal_size = 0; + if(wbuf) { + nobj = wbuf->nobj; + wp = &wbuf->obj[nobj]; + } else { + nobj = 0; + wp = nil; + } + // Initialize sbuf scanbuffers = &bufferList[runtime_m()->helpgc]; @@ -904,11 +979,11 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) case GC_EFACE: eface = (Eface*)(stack_top.b + pc[1]); pc += 2; - if(eface->type == nil) + if(eface->__type_descriptor == nil) continue; // eface->type - t = eface->type; + t = eface->__type_descriptor; if((const byte*)t >= arena_start && (const byte*)t < arena_used) { union { const Type *tc; Type *tr; } u; u.tc = t; @@ -920,11 +995,11 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // eface->__object if((byte*)eface->__object >= arena_start && (byte*)eface->__object < arena_used) { if(t->__size <= sizeof(void*)) { - if((t->kind & KindNoPointers)) + if((t->__code & KindNoPointers)) continue; obj = eface->__object; - if((t->kind & ~KindNoPointers) == KindPtr) + if((t->__code & ~KindNoPointers) == KindPtr) // objti = (uintptr)((PtrType*)t)->elem->gc; objti = 0; } else { @@ -953,11 +1028,11 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) // t = iface->tab->type; t = nil; if(t->__size <= sizeof(void*)) { - if((t->kind & KindNoPointers)) + if((t->__code & KindNoPointers)) continue; obj = iface->__object; - if((t->kind & ~KindNoPointers) == KindPtr) + if((t->__code & ~KindNoPointers) == KindPtr) // objti = (uintptr)((const PtrType*)t)->elem->gc; objti = 0; } else { @@ -1064,7 +1139,7 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) } if(markonly(chan)) { chantype = (ChanType*)pc[2]; - if(!(chantype->elem->kind & KindNoPointers)) { + if(!(chantype->elem->__code & KindNoPointers)) { // Start chanProg. chan_ret = pc+3; pc = chanProg+1; @@ -1077,7 +1152,7 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) case GC_CHAN: // There are no heap pointers in struct Hchan, // so we can ignore the leading sizeof(Hchan) bytes. - if(!(chantype->elem->kind & KindNoPointers)) { + if(!(chantype->elem->__code & KindNoPointers)) { // Channel's buffer follows Hchan immediately in memory. // Size of buffer (cap(c)) is second int in the chan struct. 
chancap = ((uintgo*)chan)[1]; @@ -1098,13 +1173,6 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) continue; #endif -#if 0 - case GC_G_PTR: - obj = (void*)stack_top.b; - scanstack(obj, &sbuf); - goto next_block; -#endif - default: runtime_throw("scanblock: invalid GC instruction"); return; @@ -1149,80 +1217,15 @@ scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) } } -// debug_scanblock is the debug copy of scanblock. -// it is simpler, slower, single-threaded, recursive, -// and uses bitSpecial as the mark bit. -static void -debug_scanblock(byte *b, uintptr n) -{ - byte *obj, *p; - void **vp; - uintptr size, *bitp, bits, shift, i, xbits, off; - MSpan *s; - - if(!DebugMark) - runtime_throw("debug_scanblock without DebugMark"); - - if((intptr)n < 0) { - runtime_printf("debug_scanblock %p %D\n", b, (int64)n); - runtime_throw("debug_scanblock"); - } - - // Align b to a word boundary. - off = (uintptr)b & (PtrSize-1); - if(off != 0) { - b += PtrSize - off; - n -= PtrSize - off; - } - - vp = (void**)b; - n /= PtrSize; - for(i=0; i<(uintptr)n; i++) { - obj = (byte*)vp[i]; - - // Words outside the arena cannot be pointers. - if((byte*)obj < runtime_mheap.arena_start || (byte*)obj >= runtime_mheap.arena_used) - continue; - - // Round down to word boundary. - obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); - - // Consult span table to find beginning. - s = runtime_MHeap_LookupMaybe(&runtime_mheap, obj); - if(s == nil) - continue; - - p = (byte*)((uintptr)s->start<<PageShift); - size = s->elemsize; - if(s->sizeclass == 0) { - obj = p; - } else { - int32 i = ((byte*)obj - p)/size; - obj = p+i*size; - } - - // Now that we know the object header, reload bits. - off = (uintptr*)obj - (uintptr*)runtime_mheap.arena_start; - bitp = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; - shift = off % wordsPerBitmapWord; - xbits = *bitp; - bits = xbits >> shift; - - // Now we have bits, bitp, and shift correct for - // obj pointing at the base of the object. - // If not allocated or already marked, done. - if((bits & bitAllocated) == 0 || (bits & bitSpecial) != 0) // NOTE: bitSpecial not bitMarked - continue; - *bitp |= bitSpecial<<shift; - if(!(bits & bitMarked)) - runtime_printf("found unmarked block %p in %p\n", obj, vp+i); - - // If object has no pointers, don't need to scan further. - if((bits & bitScan) == 0) - continue; +static struct root_list* roots; - debug_scanblock(obj, size); - } +void +__go_register_gc_roots (struct root_list* r) +{ + // FIXME: This needs locking if multiple goroutines can call + // dlopen simultaneously. + r->next = roots; + roots = r; } // Append obj to the work buffer. @@ -1281,18 +1284,125 @@ enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj) } static void +enqueue1(Workbuf **wbufp, Obj obj) +{ + Workbuf *wbuf; + + wbuf = *wbufp; + if(wbuf->nobj >= nelem(wbuf->obj)) + *wbufp = wbuf = getempty(wbuf); + wbuf->obj[wbuf->nobj++] = obj; +} + +static void markroot(ParFor *desc, uint32 i) { - Obj *wp; Workbuf *wbuf; - uintptr nobj; + FinBlock *fb; + MHeap *h; + MSpan **allspans, *s; + uint32 spanidx, sg; + G *gp; + void *p; USED(&desc); - wp = nil; - wbuf = nil; - nobj = 0; - enqueue(work.roots[i], &wbuf, &wp, &nobj); - scanblock(wbuf, wp, nobj, false); + wbuf = getempty(nil); + switch(i) { + case RootData: + // For gccgo this is both data and bss. 
+ { + struct root_list *pl; + + for(pl = roots; pl != nil; pl = pl->next) { + struct root *pr = &pl->roots[0]; + while(1) { + void *decl = pr->decl; + if(decl == nil) + break; + enqueue1(&wbuf, (Obj){decl, pr->size, 0}); + pr++; + } + } + } + break; + + case RootBss: + // For gccgo we use this for all the other global roots. + enqueue1(&wbuf, (Obj){(byte*)&runtime_m0, sizeof runtime_m0, 0}); + enqueue1(&wbuf, (Obj){(byte*)&runtime_g0, sizeof runtime_g0, 0}); + enqueue1(&wbuf, (Obj){(byte*)&runtime_allg, sizeof runtime_allg, 0}); + enqueue1(&wbuf, (Obj){(byte*)&runtime_allm, sizeof runtime_allm, 0}); + enqueue1(&wbuf, (Obj){(byte*)&runtime_allp, sizeof runtime_allp, 0}); + enqueue1(&wbuf, (Obj){(byte*)&work, sizeof work, 0}); + runtime_proc_scan(&wbuf, enqueue1); + runtime_MProf_Mark(&wbuf, enqueue1); + runtime_time_scan(&wbuf, enqueue1); + runtime_netpoll_scan(&wbuf, enqueue1); + break; + + case RootFinalizers: + for(fb=allfin; fb; fb=fb->alllink) + enqueue1(&wbuf, (Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0}); + break; + + case RootSpanTypes: + // mark span types and MSpan.specials (to walk spans only once) + h = &runtime_mheap; + sg = h->sweepgen; + allspans = h->allspans; + for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) { + Special *sp; + SpecialFinalizer *spf; + + s = allspans[spanidx]; + if(s->sweepgen != sg) { + runtime_printf("sweep %d %d\n", s->sweepgen, sg); + runtime_throw("gc: unswept span"); + } + if(s->state != MSpanInUse) + continue; + // The garbage collector ignores type pointers stored in MSpan.types: + // - Compiler-generated types are stored outside of heap. + // - The reflect package has runtime-generated types cached in its data structures. + // The garbage collector relies on finding the references via that cache. + if(s->types.compression == MTypes_Words || s->types.compression == MTypes_Bytes) + markonly((byte*)s->types.data); + for(sp = s->specials; sp != nil; sp = sp->next) { + if(sp->kind != KindSpecialFinalizer) + continue; + // don't mark finalized object, but scan it so we + // retain everything it points to. + spf = (SpecialFinalizer*)sp; + // A finalizer can be set for an inner byte of an object, find object beginning. 
+ p = (void*)((s->start << PageShift) + spf->offset/s->elemsize*s->elemsize); + enqueue1(&wbuf, (Obj){p, s->elemsize, 0}); + enqueue1(&wbuf, (Obj){(void*)&spf->fn, PtrSize, 0}); + enqueue1(&wbuf, (Obj){(void*)&spf->ft, PtrSize, 0}); + enqueue1(&wbuf, (Obj){(void*)&spf->ot, PtrSize, 0}); + } + } + break; + + case RootFlushCaches: + flushallmcaches(); + break; + + default: + // the rest is scanning goroutine stacks + if(i - RootCount >= runtime_allglen) + runtime_throw("markroot: bad index"); + gp = runtime_allg[i - RootCount]; + // remember when we've first observed the G blocked + // needed only to output in traceback + if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince == 0) + gp->waitsince = work.tstart; + addstackroots(gp, &wbuf); + break; + + } + + if(wbuf) + scanblock(wbuf, false); } // Get an empty work buffer off the work.empty list, @@ -1395,32 +1505,22 @@ handoff(Workbuf *b) } static void -addroot(Obj obj) +addstackroots(G *gp, Workbuf **wbufp) { - uint32 cap; - Obj *new; - - if(work.nroot >= work.rootcap) { - cap = PageSize/sizeof(Obj); - if(cap < 2*work.rootcap) - cap = 2*work.rootcap; - new = (Obj*)runtime_SysAlloc(cap*sizeof(Obj), &mstats.gc_sys); - if(new == nil) - runtime_throw("runtime: cannot allocate memory"); - if(work.roots != nil) { - runtime_memmove(new, work.roots, work.rootcap*sizeof(Obj)); - runtime_SysFree(work.roots, work.rootcap*sizeof(Obj), &mstats.gc_sys); - } - work.roots = new; - work.rootcap = cap; + switch(gp->status){ + default: + runtime_printf("unexpected G.status %d (goroutine %p %D)\n", gp->status, gp, gp->goid); + runtime_throw("mark - bad status"); + case Gdead: + return; + case Grunning: + runtime_throw("mark - world not stopped"); + case Grunnable: + case Gsyscall: + case Gwaiting: + break; } - work.roots[work.nroot] = obj; - work.nroot++; -} -static void -addstackroots(G *gp) -{ #ifdef USING_SPLIT_STACK M *mp; void* sp; @@ -1458,11 +1558,11 @@ addstackroots(G *gp) } } if(sp != nil) { - addroot((Obj){sp, spsize, 0}); + enqueue1(wbufp, (Obj){sp, spsize, 0}); while((sp = __splitstack_find(next_segment, next_sp, &spsize, &next_segment, &next_sp, &initial_sp)) != nil) - addroot((Obj){sp, spsize, 0}); + enqueue1(wbufp, (Obj){sp, spsize, 0}); } #else M *mp; @@ -1484,159 +1584,23 @@ addstackroots(G *gp) } top = (byte*)gp->gcinitial_sp + gp->gcstack_size; if(top > bottom) - addroot((Obj){bottom, top - bottom, 0}); + enqueue1(wbufp, (Obj){bottom, top - bottom, 0}); else - addroot((Obj){top, bottom - top, 0}); + enqueue1(wbufp, (Obj){top, bottom - top, 0}); #endif } -static void -addfinroots(void *v) -{ - uintptr size; - void *base; - - size = 0; - if(!runtime_mlookup(v, (byte**)&base, &size, nil) || !runtime_blockspecial(base)) - runtime_throw("mark - finalizer inconsistency"); - - // do not mark the finalizer block itself. just mark the things it points at. - addroot((Obj){base, size, 0}); -} - -static struct root_list* roots; - void -__go_register_gc_roots (struct root_list* r) -{ - // FIXME: This needs locking if multiple goroutines can call - // dlopen simultaneously. - r->next = roots; - roots = r; -} - -static void -addroots(void) -{ - struct root_list *pl; - G *gp; - FinBlock *fb; - MSpan *s, **allspans; - uint32 spanidx; - - work.nroot = 0; - - // mark data+bss. 
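The offset arithmetic used just above (and again in MSpan_Sweep below) to map a special registered for an interior byte back to the start of its object is plain truncating division. A small standalone illustration with an invented example and an illustrative helper name:

    #include <stdint.h>

    /* specials record an offset from the span base; objects are elemsize
       bytes apart, so rounding the offset down to a multiple of elemsize
       yields the owning object's offset */
    static uintptr_t
    object_base_offset(uintptr_t offset, uintptr_t elemsize)
    {
            return offset/elemsize*elemsize;
    }

For example, with elemsize = 48 a special at offset 100 belongs to the object starting at object_base_offset(100, 48) = 96, i.e. the third object in the span.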
- for(pl = roots; pl != nil; pl = pl->next) { - struct root* pr = &pl->roots[0]; - while(1) { - void *decl = pr->decl; - if(decl == nil) - break; - addroot((Obj){decl, pr->size, 0}); - pr++; - } - } - - addroot((Obj){(byte*)&runtime_m0, sizeof runtime_m0, 0}); - addroot((Obj){(byte*)&runtime_g0, sizeof runtime_g0, 0}); - addroot((Obj){(byte*)&runtime_allg, sizeof runtime_allg, 0}); - addroot((Obj){(byte*)&runtime_allm, sizeof runtime_allm, 0}); - addroot((Obj){(byte*)&runtime_allp, sizeof runtime_allp, 0}); - runtime_proc_scan(addroot); - runtime_MProf_Mark(addroot); - runtime_time_scan(addroot); - runtime_netpoll_scan(addroot); - - // MSpan.types - allspans = runtime_mheap.allspans; - for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) { - s = allspans[spanidx]; - if(s->state == MSpanInUse) { - // The garbage collector ignores type pointers stored in MSpan.types: - // - Compiler-generated types are stored outside of heap. - // - The reflect package has runtime-generated types cached in its data structures. - // The garbage collector relies on finding the references via that cache. - switch(s->types.compression) { - case MTypes_Empty: - case MTypes_Single: - break; - case MTypes_Words: - case MTypes_Bytes: - markonly((byte*)s->types.data); - break; - } - } - } - - // stacks - for(gp=runtime_allg; gp!=nil; gp=gp->alllink) { - switch(gp->status){ - default: - runtime_printf("unexpected G.status %d\n", gp->status); - runtime_throw("mark - bad status"); - case Gdead: - break; - case Grunning: - runtime_throw("mark - world not stopped"); - case Grunnable: - case Gsyscall: - case Gwaiting: - addstackroots(gp); - break; - } - } - - runtime_walkfintab(addfinroots, addroot); - - for(fb=allfin; fb; fb=fb->alllink) - addroot((Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0}); - - addroot((Obj){(byte*)&work, sizeof work, 0}); -} - -static void -addfreelists(void) -{ - int32 i; - P *p, **pp; - MCache *c; - MLink *m; - - // Mark objects in the MCache of each P so we don't collect them. - for(pp=runtime_allp; (p=*pp); pp++) { - c = p->mcache; - if(c==nil) - continue; - for(i = 0; i < NumSizeClasses; i++) { - for(m = c->list[i].list; m != nil; m = m->next) { - markonly(m); - } - } - } - // Note: the sweeper will mark objects in each span's freelist. 
-} - -static bool -handlespecial(byte *p, uintptr size) +runtime_queuefinalizer(void *p, FuncVal *fn, const FuncType *ft, const PtrType *ot) { - FuncVal *fn; - const struct __go_func_type *ft; - const struct __go_ptr_type *ot; FinBlock *block; Finalizer *f; - - if(!runtime_getfinalizer(p, true, &fn, &ft, &ot)) { - runtime_setblockspecial(p, false); - runtime_MProf_Free(p, size); - return false; - } - runtime_lock(&finlock); + runtime_lock(&gclock); if(finq == nil || finq->cnt == finq->cap) { if(finc == nil) { - finc = runtime_persistentalloc(PageSize, 0, &mstats.gc_sys); - finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1; + finc = runtime_persistentalloc(FinBlockSize, 0, &mstats.gc_sys); + finc->cap = (FinBlockSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1; finc->alllink = allfin; allfin = finc; } @@ -1651,37 +1615,64 @@ handlespecial(byte *p, uintptr size) f->ft = ft; f->ot = ot; f->arg = p; - runtime_unlock(&finlock); - return true; + runtime_unlock(&gclock); +} + +void +runtime_MSpan_EnsureSwept(MSpan *s) +{ + M *m = runtime_m(); + uint32 sg; + + sg = runtime_mheap.sweepgen; + if(runtime_atomicload(&s->sweepgen) == sg) + return; + m->locks++; + if(runtime_cas(&s->sweepgen, sg-2, sg-1)) { + runtime_MSpan_Sweep(s); + m->locks--; + return; + } + m->locks--; + // unfortunate condition, and we don't have efficient means to wait + while(runtime_atomicload(&s->sweepgen) != sg) + runtime_osyield(); } // Sweep frees or collects finalizers for blocks not marked in the mark phase. // It clears the mark bits in preparation for the next GC round. -static void -sweepspan(ParFor *desc, uint32 idx) +// Returns true if the span was returned to heap. +bool +runtime_MSpan_Sweep(MSpan *s) { M *m; - int32 cl, n, npages; - uintptr size, off, *bitp, shift; + int32 cl, n, npages, nfree; + uintptr size, off, *bitp, shift, bits; + uint32 sweepgen; byte *p; MCache *c; byte *arena_start; MLink head, *end; - int32 nfree; byte *type_data; byte compression; uintptr type_data_inc; - MSpan *s; MLink *x; + Special *special, **specialp, *y; + bool res, sweepgenset; m = runtime_m(); - USED(&desc); - s = runtime_mheap.allspans[idx]; - if(s->state != MSpanInUse) - return; + // It's critical that we enter this function with preemption disabled, + // GC must not start while we are in the middle of this function. + if(m->locks == 0 && m->mallocing == 0 && runtime_g() != m->g0) + runtime_throw("MSpan_Sweep: m is not locked"); + sweepgen = runtime_mheap.sweepgen; + if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) { + runtime_printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n", + s->state, s->sweepgen, sweepgen); + runtime_throw("MSpan_Sweep: bad span state"); + } arena_start = runtime_mheap.arena_start; - p = (byte*)(s->start << PageShift); cl = s->sizeclass; size = s->elemsize; if(cl == 0) { @@ -1691,9 +1682,11 @@ sweepspan(ParFor *desc, uint32 idx) npages = runtime_class_to_allocnpages[cl]; n = (npages << PageShift) / size; } + res = false; nfree = 0; end = &head; c = m->mcache; + sweepgenset = false; // mark any free objects in this span so we don't collect them for(x = s->freelist; x != nil; x = x->next) { @@ -1706,6 +1699,35 @@ sweepspan(ParFor *desc, uint32 idx) *bitp |= bitMarked<<shift; } + // Unlink & free special records for any objects we're about to free. + specialp = &s->specials; + special = *specialp; + while(special != nil) { + // A finalizer can be set for an inner byte of an object, find object beginning. 
+ p = (byte*)(s->start << PageShift) + special->offset/size*size; + off = (uintptr*)p - (uintptr*)arena_start; + bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + bits = *bitp>>shift; + if((bits & (bitAllocated|bitMarked)) == bitAllocated) { + // Find the exact byte for which the special was setup + // (as opposed to object beginning). + p = (byte*)(s->start << PageShift) + special->offset; + // about to free object: splice out special record + y = special; + special = special->next; + *specialp = special; + if(!runtime_freespecial(y, p, size, false)) { + // stop freeing of object if it has a finalizer + *bitp |= bitMarked << shift; + } + } else { + // object is still live: keep special record + specialp = &special->next; + special = *specialp; + } + } + type_data = (byte*)s->types.data; type_data_inc = sizeof(uintptr); compression = s->types.compression; @@ -1719,9 +1741,8 @@ sweepspan(ParFor *desc, uint32 idx) // Sweep through n objects of given size starting at p. // This thread owns the span now, so it can manipulate // the block bitmap without atomic operations. + p = (byte*)(s->start << PageShift); for(; n > 0; n--, p += size, type_data+=type_data_inc) { - uintptr off, *bitp, shift, bits; - off = (uintptr*)p - (uintptr*)arena_start; bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; @@ -1731,36 +1752,28 @@ sweepspan(ParFor *desc, uint32 idx) continue; if((bits & bitMarked) != 0) { - if(DebugMark) { - if(!(bits & bitSpecial)) - runtime_printf("found spurious mark on %p\n", p); - *bitp &= ~(bitSpecial<<shift); - } *bitp &= ~(bitMarked<<shift); continue; } - // Special means it has a finalizer or is being profiled. - // In DebugMark mode, the bit has been coopted so - // we have to assume all blocks are special. - if(DebugMark || (bits & bitSpecial) != 0) { - if(handlespecial(p, size)) - continue; - } - // Clear mark, scan, and special bits. *bitp &= ~((bitScan|bitMarked|bitSpecial)<<shift); if(cl == 0) { // Free large span. runtime_unmarkspan(p, 1<<PageShift); - *(uintptr*)p = (uintptr)0xdeaddeaddeaddeadll; // needs zeroing + s->needzero = 1; + // important to set sweepgen before returning it to heap + runtime_atomicstore(&s->sweepgen, sweepgen); + sweepgenset = true; if(runtime_debug.efence) runtime_SysFree(p, size, &mstats.gc_sys); else runtime_MHeap_Free(&runtime_mheap, s, 1); c->local_nlargefree++; c->local_largefree += size; + runtime_xadd64(&mstats.next_gc, -(uint64)(size * (gcpercent + 100)/100)); + res = true; } else { // Free small object. switch(compression) { @@ -1771,19 +1784,106 @@ sweepspan(ParFor *desc, uint32 idx) *(byte*)type_data = 0; break; } - if(size > sizeof(uintptr)) + if(size > 2*sizeof(uintptr)) ((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll; // mark as "needs to be zeroed" - + else if(size > sizeof(uintptr)) + ((uintptr*)p)[1] = 0; + end->next = (MLink*)p; end = (MLink*)p; nfree++; } } + if(!sweepgenset) { + // The span must be in our exclusive ownership until we update sweepgen, + // check for potential races. 
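The bitmap indexing that MSpan_Sweep repeats above (off, bitp, shift) follows from the "four bits per word" layout declared earlier: the mark bitmap sits just below arena_start and grows downward, each bitmap word covers wordsPerBitmapWord heap words, and shift selects the lane for one heap word, after which flags are tested as (*bitp>>shift) & bitXxx. Sketch only, with an illustrative helper name:

    #include <stdint.h>

    enum { wordsPerBitmapWord = sizeof(void*)*8/4 };   /* 4 bits per heap word */

    /* given the heap word index 'off' (in pointer-sized words from arena_start),
       locate its bitmap word and lane, exactly as the sweep loop above does */
    static void
    heap_bits(uintptr_t *arena_start, uintptr_t off,
              uintptr_t **bitp, uintptr_t *shift)
    {
            *bitp  = arena_start - off/wordsPerBitmapWord - 1;  /* bitmap grows down */
            *shift = off % wordsPerBitmapWord;                  /* lane in that word */
    }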
+ if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) { + runtime_printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n", + s->state, s->sweepgen, sweepgen); + runtime_throw("MSpan_Sweep: bad span state after sweep"); + } + runtime_atomicstore(&s->sweepgen, sweepgen); + } if(nfree) { c->local_nsmallfree[cl] += nfree; c->local_cachealloc -= nfree * size; - runtime_MCentral_FreeSpan(&runtime_mheap.central[cl], s, nfree, head.next, end); + runtime_xadd64(&mstats.next_gc, -(uint64)(nfree * size * (gcpercent + 100)/100)); + res = runtime_MCentral_FreeSpan(&runtime_mheap.central[cl], s, nfree, head.next, end); + } + return res; +} + +// State of background sweep. +// Pretected by gclock. +static struct +{ + G* g; + bool parked; + + MSpan** spans; + uint32 nspan; + uint32 spanidx; +} sweep; + +// background sweeping goroutine +static void +bgsweep(void* dummy __attribute__ ((unused))) +{ + runtime_g()->issystem = 1; + for(;;) { + while(runtime_sweepone() != (uintptr)-1) { + gcstats.nbgsweep++; + runtime_gosched(); + } + runtime_lock(&gclock); + if(finq != nil) { + // kick off or wake up goroutine to run queued finalizers + if(fing == nil) + fing = __go_go(runfinq, nil); + else if(fingwait) { + fingwait = 0; + runtime_ready(fing); + } + } + sweep.parked = true; + runtime_parkunlock(&gclock, "GC sweep wait"); + } +} + +// sweeps one span +// returns number of pages returned to heap, or -1 if there is nothing to sweep +uintptr +runtime_sweepone(void) +{ + M *m = runtime_m(); + MSpan *s; + uint32 idx, sg; + uintptr npages; + + // increment locks to ensure that the goroutine is not preempted + // in the middle of sweep thus leaving the span in an inconsistent state for next GC + m->locks++; + sg = runtime_mheap.sweepgen; + for(;;) { + idx = runtime_xadd(&sweep.spanidx, 1) - 1; + if(idx >= sweep.nspan) { + runtime_mheap.sweepdone = true; + m->locks--; + return (uintptr)-1; + } + s = sweep.spans[idx]; + if(s->state != MSpanInUse) { + s->sweepgen = sg; + continue; + } + if(s->sweepgen != sg-2 || !runtime_cas(&s->sweepgen, sg-2, sg-1)) + continue; + npages = s->npages; + if(!runtime_MSpan_Sweep(s)) + npages = 0; + m->locks--; + return npages; } } @@ -1874,34 +1974,14 @@ runtime_gchelper(void) runtime_parfordo(work.markfor); // help other threads scan secondary blocks - scanblock(nil, nil, 0, true); + scanblock(nil, true); - if(DebugMark) { - // wait while the main thread executes mark(debug_scanblock) - while(runtime_atomicload(&work.debugmarkdone) == 0) - runtime_usleep(10); - } - - runtime_parfordo(work.sweepfor); bufferList[runtime_m()->helpgc].busy = 0; nproc = work.nproc; // work.nproc can change right after we increment work.ndone if(runtime_xadd(&work.ndone, +1) == nproc-1) runtime_notewakeup(&work.alldone); } -#define GcpercentUnknown (-2) - -// Initialized from $GOGC. GOGC=off means no gc. -// -// Next gc is after we've allocated an extra amount of -// memory proportional to the amount already in use. -// If gcpercent=100 and we're using 4M, we'll gc again -// when we get to 8M. This keeps the gc cost in linear -// proportion to the allocation cost. Adjusting gcpercent -// just changes the linear constant (and also the amount of -// extra memory used). -static int32 gcpercent = GcpercentUnknown; - static void cachestats(void) { @@ -1917,12 +1997,25 @@ cachestats(void) } static void +flushallmcaches(void) +{ + P *p, **pp; + MCache *c; + + // Flush MCache's to MCentral. 
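How callers drive the new sweeper, mirroring bgsweep above and the loops in gc() and MHeap_Reclaim below. Sketch only: it assumes the libgo runtime's internal headers rather than being standalone, and the wrapper name is illustrative:

    static void
    sweep_everything(void)
    {
            /* each call claims and sweeps at most one span and returns the
               number of pages it handed back to the heap; (uintptr)-1 means
               every span in this cycle is already swept or claimed */
            while(runtime_sweepone() != (uintptr)-1) {
                    /* nothing else to do: sweeping itself updates next_gc
                       and the central free lists */
            }
    }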
+ for(pp=runtime_allp; (p=*pp) != nil; pp++) { + c = p->mcache; + if(c==nil) + continue; + runtime_MCache_ReleaseAll(c); + } +} + +static void updatememstats(GCStats *stats) { M *mp; MSpan *s; - MCache *c; - P *p, **pp; uint32 i; uint64 stacks_inuse, smallfree; uint64 *src, *dst; @@ -1963,12 +2056,7 @@ updatememstats(GCStats *stats) } // Flush MCache's to MCentral. - for(pp=runtime_allp; (p=*pp) != nil; pp++) { - c = p->mcache; - if(c==nil) - continue; - runtime_MCache_ReleaseAll(c); - } + flushallmcaches(); // Aggregate local stats. cachestats(); @@ -2081,6 +2169,9 @@ runtime_gc(int32 force) a.start_time = runtime_nanotime(); m->gcing = 1; runtime_stoptheworld(); + + if(runtime_debug.allocfreetrace) + runtime_MProf_TraceGC(); clearpools(); @@ -2108,19 +2199,24 @@ runtime_gc(int32 force) m->locks--; // now that gc is done, kick off finalizer thread if needed - if(finq != nil) { - runtime_lock(&finlock); - // kick off or wake up goroutine to run queued finalizers - if(fing == nil) - fing = __go_go(runfinq, nil); - else if(fingwait) { - fingwait = 0; - runtime_ready(fing); + if(!ConcurrentSweep) { + if(finq != nil) { + runtime_lock(&gclock); + // kick off or wake up goroutine to run queued finalizers + if(fing == nil) + fing = __go_go(runfinq, nil); + else if(fingwait) { + fingwait = 0; + runtime_ready(fing); + } + runtime_unlock(&gclock); } - runtime_unlock(&finlock); + // give the queued finalizers, if any, a chance to run + runtime_gosched(); + } else { + // For gccgo, let other goroutines run. + runtime_gosched(); } - // give the queued finalizers, if any, a chance to run - runtime_gosched(); } static void @@ -2137,7 +2233,7 @@ gc(struct gc_args *args) { M *m; int64 t0, t1, t2, t3, t4; - uint64 heap0, heap1, obj0, obj1, ninstr; + uint64 heap0, heap1, obj, ninstr; GCStats stats; M *mp; uint32 i; @@ -2146,6 +2242,7 @@ gc(struct gc_args *args) m = runtime_m(); t0 = args->start_time; + work.tstart = args->start_time; if(CollectStats) runtime_memclr((byte*)&gcstats, sizeof(gcstats)); @@ -2153,61 +2250,50 @@ gc(struct gc_args *args) for(mp=runtime_allm; mp; mp=mp->alllink) runtime_settype_flush(mp); - heap0 = 0; - obj0 = 0; - if(runtime_debug.gctrace) { - updatememstats(nil); - heap0 = mstats.heap_alloc; - obj0 = mstats.nmalloc - mstats.nfree; - } - m->locks++; // disable gc during mallocs in parforalloc if(work.markfor == nil) work.markfor = runtime_parforalloc(MaxGcproc); - if(work.sweepfor == nil) - work.sweepfor = runtime_parforalloc(MaxGcproc); m->locks--; if(itabtype == nil) { // get C pointer to the Go type "itab" // runtime_gc_itab_ptr(&eface); - // itabtype = ((PtrType*)eface.type)->elem; + // itabtype = ((PtrType*)eface.__type_descriptor)->elem; } + t1 = runtime_nanotime(); + + // Sweep what is not sweeped by bgsweep. 
+ while(runtime_sweepone() != (uintptr)-1) + gcstats.npausesweep++; + work.nwait = 0; work.ndone = 0; - work.debugmarkdone = 0; work.nproc = runtime_gcprocs(); - addroots(); - addfreelists(); - runtime_parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot); - runtime_parforsetup(work.sweepfor, work.nproc, runtime_mheap.nspan, nil, true, sweepspan); + runtime_parforsetup(work.markfor, work.nproc, RootCount + runtime_allglen, nil, false, markroot); if(work.nproc > 1) { runtime_noteclear(&work.alldone); runtime_helpgc(work.nproc); } - t1 = runtime_nanotime(); + t2 = runtime_nanotime(); gchelperstart(); runtime_parfordo(work.markfor); - scanblock(nil, nil, 0, true); - - if(DebugMark) { - for(i=0; i<work.nroot; i++) - debug_scanblock(work.roots[i].p, work.roots[i].n); - runtime_atomicstore(&work.debugmarkdone, 1); - } - t2 = runtime_nanotime(); + scanblock(nil, true); - runtime_parfordo(work.sweepfor); - bufferList[m->helpgc].busy = 0; t3 = runtime_nanotime(); + bufferList[m->helpgc].busy = 0; if(work.nproc > 1) runtime_notesleep(&work.alldone); cachestats(); + // next_gc calculation is tricky with concurrent sweep since we don't know size of live heap + // estimate what was live heap size after previous GC (for tracing only) + heap0 = mstats.next_gc*100/(gcpercent+100); + // conservatively set next_gc to high value assuming that everything is live + // concurrent/lazy sweep will reduce this number while discovering new garbage mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; t4 = runtime_nanotime(); @@ -2221,20 +2307,23 @@ gc(struct gc_args *args) if(runtime_debug.gctrace) { updatememstats(&stats); heap1 = mstats.heap_alloc; - obj1 = mstats.nmalloc - mstats.nfree; + obj = mstats.nmalloc - mstats.nfree; - stats.nprocyield += work.sweepfor->nprocyield; - stats.nosyield += work.sweepfor->nosyield; - stats.nsleep += work.sweepfor->nsleep; + stats.nprocyield += work.markfor->nprocyield; + stats.nosyield += work.markfor->nosyield; + stats.nsleep += work.markfor->nsleep; - runtime_printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects," + runtime_printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB, %D (%D-%D) objects," + " %d/%d/%d sweeps," " %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n", - mstats.numgc, work.nproc, (t2-t1)/1000000, (t3-t2)/1000000, (t1-t0+t4-t3)/1000000, - heap0>>20, heap1>>20, obj0, obj1, + mstats.numgc, work.nproc, (t3-t2)/1000000, (t2-t1)/1000000, (t1-t0+t4-t3)/1000000, + heap0>>20, heap1>>20, obj, mstats.nmalloc, mstats.nfree, + sweep.nspan, gcstats.nbgsweep, gcstats.npausesweep, stats.nhandoff, stats.nhandoffcnt, - work.sweepfor->nsteal, work.sweepfor->nstealcnt, + work.markfor->nsteal, work.markfor->nstealcnt, stats.nprocyield, stats.nosyield, stats.nsleep); + gcstats.nbgsweep = gcstats.npausesweep = 0; if(CollectStats) { runtime_printf("scan: %D bytes, %D objects, %D untyped, %D types from MSpan\n", gcstats.nbytes, gcstats.obj.cnt, gcstats.obj.notype, gcstats.obj.typelookup); @@ -2261,9 +2350,44 @@ gc(struct gc_args *args) } } + // We cache current runtime_mheap.allspans array in sweep.spans, + // because the former can be resized and freed. + // Otherwise we would need to take heap lock every time + // we want to convert span index to span pointer. + + // Free the old cached array if necessary. + if(sweep.spans && sweep.spans != runtime_mheap.allspans) + runtime_SysFree(sweep.spans, sweep.nspan*sizeof(sweep.spans[0]), &mstats.other_sys); + // Cache the current array. 
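The gctrace estimate above (heap0 = next_gc*100/(gcpercent+100)) is just the inverse of the pacing formula sketched earlier: the previous cycle aimed for next_gc = live*(gcpercent+100)/100, so dividing by the same factor recovers an approximate live-heap size. Standalone sketch with an illustrative name:

    #include <stdint.h>

    /* estimate the live heap implied by the current GC goal */
    static uint64_t
    estimated_live_heap(uint64_t next_gc, int32_t gcpercent)
    {
            return next_gc*100/(uint64_t)(gcpercent+100);
    }

With GOGC=100 and next_gc = 8 MB this gives back the 4 MB live heap from the earlier example.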
+ runtime_mheap.sweepspans = runtime_mheap.allspans; + runtime_mheap.sweepgen += 2; + runtime_mheap.sweepdone = false; + sweep.spans = runtime_mheap.allspans; + sweep.nspan = runtime_mheap.nspan; + sweep.spanidx = 0; + + // Temporary disable concurrent sweep, because we see failures on builders. + if(ConcurrentSweep) { + runtime_lock(&gclock); + if(sweep.g == nil) + sweep.g = __go_go(bgsweep, nil); + else if(sweep.parked) { + sweep.parked = false; + runtime_ready(sweep.g); + } + runtime_unlock(&gclock); + } else { + // Sweep all spans eagerly. + while(runtime_sweepone() != (uintptr)-1) + gcstats.npausesweep++; + } + runtime_MProf_GC(); } +extern uintptr runtime_sizeof_C_MStats + __asm__ (GOSYM_PREFIX "runtime.Sizeof_C_MStats"); + void runtime_ReadMemStats(MStats *) __asm__ (GOSYM_PREFIX "runtime.ReadMemStats"); @@ -2281,7 +2405,9 @@ runtime_ReadMemStats(MStats *stats) m->gcing = 1; runtime_stoptheworld(); updatememstats(nil); - *stats = mstats; + // Size of the trailing by_size array differs between Go and C, + // NumSizeClasses was changed, but we can not change Go struct because of backward compatibility. + runtime_memmove(stats, &mstats, runtime_sizeof_C_MStats); m->gcing = 0; m->locks++; runtime_semrelease(&runtime_worldsema); @@ -2366,15 +2492,15 @@ runfinq(void* dummy __attribute__ ((unused))) Iface iface; for(;;) { - runtime_lock(&finlock); + runtime_lock(&gclock); fb = finq; finq = nil; if(fb == nil) { fingwait = 1; - runtime_park(runtime_unlock, &finlock, "finalizer wait"); + runtime_parkunlock(&gclock, "finalizer wait"); continue; } - runtime_unlock(&finlock); + runtime_unlock(&gclock); if(raceenabled) runtime_racefingo(); for(; fb; fb=next) { @@ -2385,12 +2511,12 @@ runfinq(void* dummy __attribute__ ((unused))) f = &fb->fin[i]; fint = ((const Type**)f->ft->__in.array)[0]; - if(fint->kind == KindPtr) { + if(fint->__code == KindPtr) { // direct use of pointer param = &f->arg; } else if(((const InterfaceType*)fint)->__methods.__count == 0) { // convert to empty interface - ef.type = (const Type*)f->ot; + ef.__type_descriptor = (const Type*)f->ot; ef.__object = f->arg; param = &ef; } else { @@ -2585,50 +2711,6 @@ runtime_unmarkspan(void *v, uintptr n) *b-- = 0; } -bool -runtime_blockspecial(void *v) -{ - uintptr *b, off, shift; - - if(DebugMark) - return true; - - off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; - b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; - shift = off % wordsPerBitmapWord; - - return (*b & (bitSpecial<<shift)) != 0; -} - -void -runtime_setblockspecial(void *v, bool s) -{ - uintptr *b, off, shift, bits, obits; - - if(DebugMark) - return; - - off = (uintptr*)v - (uintptr*)runtime_mheap.arena_start; - b = (uintptr*)runtime_mheap.arena_start - off/wordsPerBitmapWord - 1; - shift = off % wordsPerBitmapWord; - - for(;;) { - obits = *b; - if(s) - bits = obits | (bitSpecial<<shift); - else - bits = obits & ~(bitSpecial<<shift); - if(runtime_gomaxprocs == 1) { - *b = bits; - break; - } else { - // more than one goroutine is potentially running: use atomic op - if(runtime_casp((void**)b, (void*)obits, (void*)bits)) - break; - } - } -} - void runtime_MHeap_MapBits(MHeap *h) { diff --git a/libgo/runtime/mheap.c b/libgo/runtime/mheap.c index 1b8ab79..3a5eb15 100644 --- a/libgo/runtime/mheap.c +++ b/libgo/runtime/mheap.c @@ -41,7 +41,10 @@ RecordSpan(void *vh, byte *p) runtime_throw("runtime: cannot allocate memory"); if(h->allspans) { runtime_memmove(all, h->allspans, h->nspancap*sizeof(all[0])); - runtime_SysFree(h->allspans, 
h->nspancap*sizeof(all[0]), &mstats.other_sys); + // Don't free the old array if it's referenced by sweep. + // See the comment in mgc0.c. + if(h->allspans != runtime_mheap.sweepspans) + runtime_SysFree(h->allspans, h->nspancap*sizeof(all[0]), &mstats.other_sys); } h->allspans = all; h->nspancap = cap; @@ -57,10 +60,15 @@ runtime_MHeap_Init(MHeap *h) runtime_FixAlloc_Init(&h->spanalloc, sizeof(MSpan), RecordSpan, h, &mstats.mspan_sys); runtime_FixAlloc_Init(&h->cachealloc, sizeof(MCache), nil, nil, &mstats.mcache_sys); + runtime_FixAlloc_Init(&h->specialfinalizeralloc, sizeof(SpecialFinalizer), nil, nil, &mstats.other_sys); + runtime_FixAlloc_Init(&h->specialprofilealloc, sizeof(SpecialProfile), nil, nil, &mstats.other_sys); // h->mapcache needs no init - for(i=0; i<nelem(h->free); i++) + for(i=0; i<nelem(h->free); i++) { runtime_MSpanList_Init(&h->free[i]); - runtime_MSpanList_Init(&h->large); + runtime_MSpanList_Init(&h->busy[i]); + } + runtime_MSpanList_Init(&h->freelarge); + runtime_MSpanList_Init(&h->busylarge); for(i=0; i<nelem(h->central); i++) runtime_MCentral_Init(&h->central[i], i); } @@ -84,10 +92,86 @@ runtime_MHeap_MapSpans(MHeap *h) h->spans_mapped = n; } +// Sweeps spans in list until reclaims at least npages into heap. +// Returns the actual number of pages reclaimed. +static uintptr +MHeap_ReclaimList(MHeap *h, MSpan *list, uintptr npages) +{ + MSpan *s; + uintptr n; + uint32 sg; + + n = 0; + sg = runtime_mheap.sweepgen; +retry: + for(s = list->next; s != list; s = s->next) { + if(s->sweepgen == sg-2 && runtime_cas(&s->sweepgen, sg-2, sg-1)) { + runtime_MSpanList_Remove(s); + // swept spans are at the end of the list + runtime_MSpanList_InsertBack(list, s); + runtime_unlock(h); + n += runtime_MSpan_Sweep(s); + runtime_lock(h); + if(n >= npages) + return n; + // the span could have been moved elsewhere + goto retry; + } + if(s->sweepgen == sg-1) { + // the span is being sweept by background sweeper, skip + continue; + } + // already swept empty span, + // all subsequent ones must also be either swept or in process of sweeping + break; + } + return n; +} + +// Sweeps and reclaims at least npage pages into heap. +// Called before allocating npage pages. +static void +MHeap_Reclaim(MHeap *h, uintptr npage) +{ + uintptr reclaimed, n; + + // First try to sweep busy spans with large objects of size >= npage, + // this has good chances of reclaiming the necessary space. + for(n=npage; n < nelem(h->busy); n++) { + if(MHeap_ReclaimList(h, &h->busy[n], npage)) + return; // Bingo! + } + + // Then -- even larger objects. + if(MHeap_ReclaimList(h, &h->busylarge, npage)) + return; // Bingo! + + // Now try smaller objects. + // One such object is not enough, so we need to reclaim several of them. + reclaimed = 0; + for(n=0; n < npage && n < nelem(h->busy); n++) { + reclaimed += MHeap_ReclaimList(h, &h->busy[n], npage-reclaimed); + if(reclaimed >= npage) + return; + } + + // Now sweep everything that is not yet swept. + runtime_unlock(h); + for(;;) { + n = runtime_sweepone(); + if(n == (uintptr)-1) // all spans are swept + break; + reclaimed += n; + if(reclaimed >= npage) + break; + } + runtime_lock(h); +} + // Allocate a new span of npage pages from the heap // and record its size class in the HeapMap and HeapMapCache. 
MSpan* -runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed) +runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool needzero) { MSpan *s; @@ -97,14 +181,22 @@ runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 s = MHeap_AllocLocked(h, npage, sizeclass); if(s != nil) { mstats.heap_inuse += npage<<PageShift; - if(acct) { + if(large) { mstats.heap_objects++; mstats.heap_alloc += npage<<PageShift; + // Swept spans are at the end of lists. + if(s->npages < nelem(h->free)) + runtime_MSpanList_InsertBack(&h->busy[s->npages], s); + else + runtime_MSpanList_InsertBack(&h->busylarge, s); } } runtime_unlock(h); - if(s != nil && *(uintptr*)(s->start<<PageShift) != 0 && zeroed) - runtime_memclr((byte*)(s->start<<PageShift), s->npages<<PageShift); + if(s != nil) { + if(needzero && s->needzero) + runtime_memclr((byte*)(s->start<<PageShift), s->npages<<PageShift); + s->needzero = 0; + } return s; } @@ -115,6 +207,11 @@ MHeap_AllocLocked(MHeap *h, uintptr npage, int32 sizeclass) MSpan *s, *t; PageID p; + // To prevent excessive heap growth, before allocating n pages + // we need to sweep and reclaim at least n pages. + if(!h->sweepdone) + MHeap_Reclaim(h, npage); + // Try in fixed-size lists up to max. for(n=npage; n < nelem(h->free); n++) { if(!runtime_MSpanList_IsEmpty(&h->free[n])) { @@ -138,29 +235,12 @@ HaveSpan: if(s->npages < npage) runtime_throw("MHeap_AllocLocked - bad npages"); runtime_MSpanList_Remove(s); + runtime_atomicstore(&s->sweepgen, h->sweepgen); s->state = MSpanInUse; mstats.heap_idle -= s->npages<<PageShift; mstats.heap_released -= s->npreleased<<PageShift; - if(s->npreleased > 0) { - // We have called runtime_SysUnused with these pages, and on - // Unix systems it called madvise. At this point at least - // some BSD-based kernels will return these pages either as - // zeros or with the old data. For our caller, the first word - // in the page indicates whether the span contains zeros or - // not (this word was set when the span was freed by - // MCentral_Free or runtime_MCentral_FreeSpan). If the first - // page in the span is returned as zeros, and some subsequent - // page is returned with the old data, then we will be - // returning a span that is assumed to be all zeros, but the - // actual data will not be all zeros. Avoid that problem by - // explicitly marking the span as not being zeroed, just in - // case. The beadbead constant we use here means nothing, it - // is just a unique constant not seen elsewhere in the - // runtime, as a clue in case it turns up unexpectedly in - // memory or in a stack trace. + if(s->npreleased > 0) runtime_SysUsed((void*)(s->start<<PageShift), s->npages<<PageShift); - *(uintptr*)(s->start<<PageShift) = (uintptr)0xbeadbeadbeadbeadULL; - } s->npreleased = 0; if(s->npages > npage) { @@ -174,7 +254,8 @@ HaveSpan: h->spans[p-1] = s; h->spans[p] = t; h->spans[p+t->npages-1] = t; - *(uintptr*)(t->start<<PageShift) = *(uintptr*)(s->start<<PageShift); // copy "needs zeroing" mark + t->needzero = s->needzero; + runtime_atomicstore(&t->sweepgen, h->sweepgen); t->state = MSpanInUse; MHeap_FreeLocked(h, t); t->unusedsince = s->unusedsince; // preserve age @@ -197,7 +278,7 @@ HaveSpan: static MSpan* MHeap_AllocLarge(MHeap *h, uintptr npage) { - return BestFit(&h->large, npage, nil); + return BestFit(&h->freelarge, npage, nil); } // Search list for smallest span with >= npage pages. 
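The zeroing contract after this change, condensed: a span carries an explicit needzero flag instead of the old trick of stashing a "dirty" mark in its first word, and the flag is cleared exactly once, when the span is handed out. The sketch below relies on the runtime's internal MSpan declarations and an illustrative helper name, so it is a restatement of MHeap_Alloc's tail rather than standalone code:

    static byte*
    span_memory(MSpan *s, bool needzero)
    {
            byte *p = (byte*)(s->start<<PageShift);
            if(needzero && s->needzero)                       /* caller wants zeroed memory */
                    runtime_memclr(p, s->npages<<PageShift);  /* and the span is dirty */
            s->needzero = 0;
            return p;
    }

Freeing paths (MCentral_Free, MCentral_FreeSpan, the large-object case in MSpan_Sweep) set needzero back to 1, and span coalescing in MHeap_FreeLocked ORs the flags together so a merged span stays marked dirty.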
@@ -258,6 +339,7 @@ MHeap_Grow(MHeap *h, uintptr npage) p -= ((uintptr)h->arena_start>>PageShift); h->spans[p] = s; h->spans[p + s->npages - 1] = s; + runtime_atomicstore(&s->sweepgen, h->sweepgen); s->state = MSpanInUse; MHeap_FreeLocked(h, s); return true; @@ -319,20 +401,19 @@ runtime_MHeap_Free(MHeap *h, MSpan *s, int32 acct) static void MHeap_FreeLocked(MHeap *h, MSpan *s) { - uintptr *sp, *tp; MSpan *t; PageID p; s->types.compression = MTypes_Empty; - if(s->state != MSpanInUse || s->ref != 0) { - runtime_printf("MHeap_FreeLocked - span %p ptr %p state %d ref %d\n", s, s->start<<PageShift, s->state, s->ref); + if(s->state != MSpanInUse || s->ref != 0 || s->sweepgen != h->sweepgen) { + runtime_printf("MHeap_FreeLocked - span %p ptr %p state %d ref %d sweepgen %d/%d\n", + s, s->start<<PageShift, s->state, s->ref, s->sweepgen, h->sweepgen); runtime_throw("MHeap_FreeLocked - invalid free"); } mstats.heap_idle += s->npages<<PageShift; s->state = MSpanFree; runtime_MSpanList_Remove(s); - sp = (uintptr*)(s->start<<PageShift); // Stamp newly unused spans. The scavenger will use that // info to potentially give back some pages to the OS. s->unusedsince = runtime_nanotime(); @@ -342,13 +423,10 @@ MHeap_FreeLocked(MHeap *h, MSpan *s) p = s->start; p -= (uintptr)h->arena_start >> PageShift; if(p > 0 && (t = h->spans[p-1]) != nil && t->state != MSpanInUse) { - if(t->npreleased == 0) { // cant't touch this otherwise - tp = (uintptr*)(t->start<<PageShift); - *tp |= *sp; // propagate "needs zeroing" mark - } s->start = t->start; s->npages += t->npages; s->npreleased = t->npreleased; // absorb released pages + s->needzero |= t->needzero; p -= t->npages; h->spans[p] = s; runtime_MSpanList_Remove(t); @@ -356,12 +434,9 @@ MHeap_FreeLocked(MHeap *h, MSpan *s) runtime_FixAlloc_Free(&h->spanalloc, t); } if((p+s->npages)*sizeof(h->spans[0]) < h->spans_mapped && (t = h->spans[p+s->npages]) != nil && t->state != MSpanInUse) { - if(t->npreleased == 0) { // cant't touch this otherwise - tp = (uintptr*)(t->start<<PageShift); - *sp |= *tp; // propagate "needs zeroing" mark - } s->npages += t->npages; s->npreleased += t->npreleased; + s->needzero |= t->needzero; h->spans[p + s->npages - 1] = s; runtime_MSpanList_Remove(t); t->state = MSpanDead; @@ -372,7 +447,7 @@ MHeap_FreeLocked(MHeap *h, MSpan *s) if(s->npages < nelem(h->free)) runtime_MSpanList_Insert(&h->free[s->npages], s); else - runtime_MSpanList_Insert(&h->large, s); + runtime_MSpanList_Insert(&h->freelarge, s); } static void @@ -427,7 +502,7 @@ scavenge(int32 k, uint64 now, uint64 limit) sumreleased = 0; for(i=0; i < nelem(h->free); i++) sumreleased += scavengelist(&h->free[i], now, limit); - sumreleased += scavengelist(&h->large, now, limit); + sumreleased += scavengelist(&h->freelarge, now, limit); if(runtime_debug.gctrace > 0) { if(sumreleased > 0) @@ -516,10 +591,13 @@ runtime_MSpan_Init(MSpan *span, PageID start, uintptr npages) span->ref = 0; span->sizeclass = 0; span->elemsize = 0; - span->state = 0; + span->state = MSpanDead; span->unusedsince = 0; span->npreleased = 0; span->types.compression = MTypes_Empty; + span->specialLock.key = 0; + span->specials = nil; + span->needzero = 0; } // Initialize an empty doubly-linked list. 
@@ -561,4 +639,212 @@ runtime_MSpanList_Insert(MSpan *list, MSpan *span) span->prev->next = span; } +void +runtime_MSpanList_InsertBack(MSpan *list, MSpan *span) +{ + if(span->next != nil || span->prev != nil) { + runtime_printf("failed MSpanList_Insert %p %p %p\n", span, span->next, span->prev); + runtime_throw("MSpanList_Insert"); + } + span->next = list; + span->prev = list->prev; + span->next->prev = span; + span->prev->next = span; +} + +// Adds the special record s to the list of special records for +// the object p. All fields of s should be filled in except for +// offset & next, which this routine will fill in. +// Returns true if the special was successfully added, false otherwise. +// (The add will fail only if a record with the same p and s->kind +// already exists.) +static bool +addspecial(void *p, Special *s) +{ + MSpan *span; + Special **t, *x; + uintptr offset; + byte kind; + + span = runtime_MHeap_LookupMaybe(&runtime_mheap, p); + if(span == nil) + runtime_throw("addspecial on invalid pointer"); + + // Ensure that the span is swept. + // GC accesses specials list w/o locks. And it's just much safer. + runtime_MSpan_EnsureSwept(span); + + offset = (uintptr)p - (span->start << PageShift); + kind = s->kind; + + runtime_lock(&span->specialLock); + + // Find splice point, check for existing record. + t = &span->specials; + while((x = *t) != nil) { + if(offset == x->offset && kind == x->kind) { + runtime_unlock(&span->specialLock); + return false; // already exists + } + if(offset < x->offset || (offset == x->offset && kind < x->kind)) + break; + t = &x->next; + } + // Splice in record, fill in offset. + s->offset = offset; + s->next = x; + *t = s; + runtime_unlock(&span->specialLock); + return true; +} +// Removes the Special record of the given kind for the object p. +// Returns the record if the record existed, nil otherwise. +// The caller must FixAlloc_Free the result. +static Special* +removespecial(void *p, byte kind) +{ + MSpan *span; + Special *s, **t; + uintptr offset; + + span = runtime_MHeap_LookupMaybe(&runtime_mheap, p); + if(span == nil) + runtime_throw("removespecial on invalid pointer"); + + // Ensure that the span is swept. + // GC accesses specials list w/o locks. And it's just much safer. + runtime_MSpan_EnsureSwept(span); + + offset = (uintptr)p - (span->start << PageShift); + + runtime_lock(&span->specialLock); + t = &span->specials; + while((s = *t) != nil) { + // This function is used for finalizers only, so we don't check for + // "interior" specials (p must be exactly equal to s->offset). + if(offset == s->offset && kind == s->kind) { + *t = s->next; + runtime_unlock(&span->specialLock); + return s; + } + t = &s->next; + } + runtime_unlock(&span->specialLock); + return nil; +} + +// Adds a finalizer to the object p. Returns true if it succeeded. +bool +runtime_addfinalizer(void *p, FuncVal *f, const FuncType *ft, const PtrType *ot) +{ + SpecialFinalizer *s; + + runtime_lock(&runtime_mheap.speciallock); + s = runtime_FixAlloc_Alloc(&runtime_mheap.specialfinalizeralloc); + runtime_unlock(&runtime_mheap.speciallock); + s->kind = KindSpecialFinalizer; + s->fn = f; + s->ft = ft; + s->ot = ot; + if(addspecial(p, s)) + return true; + + // There was an old finalizer + runtime_lock(&runtime_mheap.speciallock); + runtime_FixAlloc_Free(&runtime_mheap.specialfinalizeralloc, s); + runtime_unlock(&runtime_mheap.speciallock); + return false; +} + +// Removes the finalizer (if any) from the object p. 
+void +runtime_removefinalizer(void *p) +{ + SpecialFinalizer *s; + + s = (SpecialFinalizer*)removespecial(p, KindSpecialFinalizer); + if(s == nil) + return; // there wasn't a finalizer to remove + runtime_lock(&runtime_mheap.speciallock); + runtime_FixAlloc_Free(&runtime_mheap.specialfinalizeralloc, s); + runtime_unlock(&runtime_mheap.speciallock); +} + +// Set the heap profile bucket associated with addr to b. +void +runtime_setprofilebucket(void *p, Bucket *b) +{ + SpecialProfile *s; + + runtime_lock(&runtime_mheap.speciallock); + s = runtime_FixAlloc_Alloc(&runtime_mheap.specialprofilealloc); + runtime_unlock(&runtime_mheap.speciallock); + s->kind = KindSpecialProfile; + s->b = b; + if(!addspecial(p, s)) + runtime_throw("setprofilebucket: profile already set"); +} + +// Do whatever cleanup needs to be done to deallocate s. It has +// already been unlinked from the MSpan specials list. +// Returns true if we should keep working on deallocating p. +bool +runtime_freespecial(Special *s, void *p, uintptr size, bool freed) +{ + SpecialFinalizer *sf; + SpecialProfile *sp; + + switch(s->kind) { + case KindSpecialFinalizer: + sf = (SpecialFinalizer*)s; + runtime_queuefinalizer(p, sf->fn, sf->ft, sf->ot); + runtime_lock(&runtime_mheap.speciallock); + runtime_FixAlloc_Free(&runtime_mheap.specialfinalizeralloc, sf); + runtime_unlock(&runtime_mheap.speciallock); + return false; // don't free p until finalizer is done + case KindSpecialProfile: + sp = (SpecialProfile*)s; + runtime_MProf_Free(sp->b, p, size, freed); + runtime_lock(&runtime_mheap.speciallock); + runtime_FixAlloc_Free(&runtime_mheap.specialprofilealloc, sp); + runtime_unlock(&runtime_mheap.speciallock); + return true; + default: + runtime_throw("bad special kind"); + return true; + } +} + +// Free all special records for p. +void +runtime_freeallspecials(MSpan *span, void *p, uintptr size) +{ + Special *s, **t, *list; + uintptr offset; + + // first, collect all specials into the list; then, free them + // this is required to not cause deadlock between span->specialLock and proflock + list = nil; + offset = (uintptr)p - (span->start << PageShift); + runtime_lock(&span->specialLock); + t = &span->specials; + while((s = *t) != nil) { + if(offset + size <= s->offset) + break; + if(offset <= s->offset) { + *t = s->next; + s->next = list; + list = s; + } else + t = &s->next; + } + runtime_unlock(&span->specialLock); + + while(list != nil) { + s = list; + list = s->next; + if(!runtime_freespecial(s, p, size, true)) + runtime_throw("can't explicitly free an object with a finalizer"); + } +} diff --git a/libgo/runtime/mprof.goc b/libgo/runtime/mprof.goc index 469ddfe..24f8fe5 100644 --- a/libgo/runtime/mprof.goc +++ b/libgo/runtime/mprof.goc @@ -23,7 +23,6 @@ enum { MProf, BProf }; // profile types // Per-call-stack profiling information. // Lookup by hashing call stack into a linked-list hash table. -typedef struct Bucket Bucket; struct Bucket { Bucket *next; // next in hash list @@ -35,14 +34,33 @@ struct Bucket { struct // typ == MProf { + // The following complex 3-stage scheme of stats accumulation + // is required to obtain a consistent picture of mallocs and frees + // for some point in time. + // The problem is that mallocs come in real time, while frees + // come only after a GC during concurrent sweeping. So if we would + // naively count them, we would get a skew toward mallocs. + // + // Mallocs are accounted in recent stats. + // Explicit frees are accounted in recent stats. + // GC frees are accounted in prev stats. 
+ // After GC prev stats are added to final stats and + // recent stats are moved into prev stats. uintptr allocs; uintptr frees; uintptr alloc_bytes; uintptr free_bytes; - uintptr recent_allocs; // since last gc + + uintptr prev_allocs; // since last but one till last gc + uintptr prev_frees; + uintptr prev_alloc_bytes; + uintptr prev_free_bytes; + + uintptr recent_allocs; // since last gc till now uintptr recent_frees; uintptr recent_alloc_bytes; uintptr recent_free_bytes; + }; struct // typ == BProf { @@ -50,7 +68,8 @@ struct Bucket int64 cycles; }; }; - uintptr hash; + uintptr hash; // hash of size + stk + uintptr size; uintptr nstk; Location stk[1]; }; @@ -64,7 +83,7 @@ static uintptr bucketmem; // Return the bucket for stk[0:nstk], allocating new bucket if needed. static Bucket* -stkbucket(int32 typ, Location *stk, int32 nstk, bool alloc) +stkbucket(int32 typ, uintptr size, Location *stk, int32 nstk, bool alloc) { int32 i, j; uintptr h; @@ -83,12 +102,17 @@ stkbucket(int32 typ, Location *stk, int32 nstk, bool alloc) h += h<<10; h ^= h>>6; } + // hash in size + h += size; + h += h<<10; + h ^= h>>6; + // finalize h += h<<3; h ^= h>>11; i = h%BuckHashSize; for(b = buckhash[i]; b; b=b->next) { - if(b->typ == typ && b->hash == h && b->nstk == (uintptr)nstk) { + if(b->typ == typ && b->hash == h && b->size == size && b->nstk == (uintptr)nstk) { for(j = 0; j < nstk; j++) { if(b->stk[j].pc != stk[j].pc || b->stk[j].lineno != stk[j].lineno || @@ -108,6 +132,7 @@ stkbucket(int32 typ, Location *stk, int32 nstk, bool alloc) runtime_memmove(b->stk, stk, nstk*sizeof stk[0]); b->typ = typ; b->hash = h; + b->size = size; b->nstk = nstk; b->next = buckhash[i]; buckhash[i] = b; @@ -127,10 +152,16 @@ MProf_GC(void) Bucket *b; for(b=mbuckets; b; b=b->allnext) { - b->allocs += b->recent_allocs; - b->frees += b->recent_frees; - b->alloc_bytes += b->recent_alloc_bytes; - b->free_bytes += b->recent_free_bytes; + b->allocs += b->prev_allocs; + b->frees += b->prev_frees; + b->alloc_bytes += b->prev_alloc_bytes; + b->free_bytes += b->prev_free_bytes; + + b->prev_allocs = b->recent_allocs; + b->prev_frees = b->recent_frees; + b->prev_alloc_bytes = b->recent_alloc_bytes; + b->prev_free_bytes = b->recent_free_bytes; + b->recent_allocs = 0; b->recent_frees = 0; b->recent_alloc_bytes = 0; @@ -147,115 +178,6 @@ runtime_MProf_GC(void) runtime_unlock(&proflock); } -// Map from pointer to Bucket* that allocated it. -// Three levels: -// Linked-list hash table for top N-AddrHashShift bits. -// Array index for next AddrDenseBits bits. -// Linked list for next AddrHashShift-AddrDenseBits bits. -// This is more efficient than using a general map, -// because of the typical clustering of the pointer keys. - -typedef struct AddrHash AddrHash; -typedef struct AddrEntry AddrEntry; - -enum { - AddrHashBits = 12, // good for 4GB of used address space - AddrHashShift = 20, // each AddrHash knows about 1MB of address space - AddrDenseBits = 8, // good for a profiling rate of 4096 bytes -}; - -struct AddrHash -{ - AddrHash *next; // next in top-level hash table linked list - uintptr addr; // addr>>20 - AddrEntry *dense[1<<AddrDenseBits]; -}; - -struct AddrEntry -{ - AddrEntry *next; // next in bottom-level linked list - uint32 addr; - Bucket *b; -}; - -static AddrHash **addrhash; // points to (AddrHash*)[1<<AddrHashBits] -static AddrEntry *addrfree; -static uintptr addrmem; - -// Multiplicative hash function: -// hashMultiplier is the bottom 32 bits of int((sqrt(5)-1)/2 * (1<<32)). 
-// This is a good multiplier as suggested in CLR, Knuth. The hash -// value is taken to be the top AddrHashBits bits of the bottom 32 bits -// of the multiplied value. -enum { - HashMultiplier = 2654435769U -}; - -// Set the bucket associated with addr to b. -static void -setaddrbucket(uintptr addr, Bucket *b) -{ - int32 i; - uint32 h; - AddrHash *ah; - AddrEntry *e; - - h = (uint32)((addr>>AddrHashShift)*HashMultiplier) >> (32-AddrHashBits); - for(ah=addrhash[h]; ah; ah=ah->next) - if(ah->addr == (addr>>AddrHashShift)) - goto found; - - ah = runtime_persistentalloc(sizeof *ah, 0, &mstats.buckhash_sys); - addrmem += sizeof *ah; - ah->next = addrhash[h]; - ah->addr = addr>>AddrHashShift; - addrhash[h] = ah; - -found: - if((e = addrfree) == nil) { - e = runtime_persistentalloc(64*sizeof *e, 0, &mstats.buckhash_sys); - addrmem += 64*sizeof *e; - for(i=0; i+1<64; i++) - e[i].next = &e[i+1]; - e[63].next = nil; - } - addrfree = e->next; - e->addr = (uint32)~(addr & ((1<<AddrHashShift)-1)); - e->b = b; - h = (addr>>(AddrHashShift-AddrDenseBits))&(nelem(ah->dense)-1); // entry in dense is top 8 bits of low 20. - e->next = ah->dense[h]; - ah->dense[h] = e; -} - -// Get the bucket associated with addr and clear the association. -static Bucket* -getaddrbucket(uintptr addr) -{ - uint32 h; - AddrHash *ah; - AddrEntry *e, **l; - Bucket *b; - - h = (uint32)((addr>>AddrHashShift)*HashMultiplier) >> (32-AddrHashBits); - for(ah=addrhash[h]; ah; ah=ah->next) - if(ah->addr == (addr>>AddrHashShift)) - goto found; - return nil; - -found: - h = (addr>>(AddrHashShift-AddrDenseBits))&(nelem(ah->dense)-1); // entry in dense is top 8 bits of low 20. - for(l=&ah->dense[h]; (e=*l) != nil; l=&e->next) { - if(e->addr == (uint32)~(addr & ((1<<AddrHashShift)-1))) { - *l = e->next; - b = e->b; - e->next = addrfree; - addrfree = e; - return b; - } - } - return nil; -} - static const char* typeinfoname(int32 typeinfo) { @@ -285,6 +207,18 @@ printstackframes(Location *stk, int32 nstk) } } +// Called by collector to report a gc in allocfreetrace mode. +void +runtime_MProf_TraceGC(void) +{ + Location stk[32]; + int32 nstk; + + nstk = runtime_callers(1, stk, nelem(stk)); + runtime_printf("MProf_TraceGC\n"); + printstackframes(stk, nstk); +} + // Called by malloc to record a profiled block. void runtime_MProf_Malloc(void *p, uintptr size, uintptr typ) @@ -295,39 +229,44 @@ runtime_MProf_Malloc(void *p, uintptr size, uintptr typ) const char *name; int32 nstk; - nstk = runtime_callers(1, stk, 32); + nstk = runtime_callers(1, stk, nelem(stk)); runtime_lock(&proflock); - if(runtime_debug.allocfreetrace) { + if(runtime_debug.allocfreetrace) { type = (Type*)(typ & ~3); name = typeinfoname(typ & 3); runtime_printf("MProf_Malloc(p=%p, size=%p, type=%p <%s", p, size, type, name); if(type != nil) - runtime_printf(" of %S", *type->__reflection); + runtime_printf(" of %S", *type->__reflection); runtime_printf(">)\n"); printstackframes(stk, nstk); } - b = stkbucket(MProf, stk, nstk, true); + b = stkbucket(MProf, size, stk, nstk, true); b->recent_allocs++; b->recent_alloc_bytes += size; - setaddrbucket((uintptr)p, b); runtime_unlock(&proflock); + + // Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock. + // This reduces potential contention and chances of deadlocks. + // Since the object must be alive during call to MProf_Malloc, + // it's fine to do this non-atomically. + runtime_setprofilebucket(p, b); } // Called when freeing a profiled block. 
void -runtime_MProf_Free(void *p, uintptr size) +runtime_MProf_Free(Bucket *b, void *p, uintptr size, bool freed) { - Bucket *b; - runtime_lock(&proflock); - b = getaddrbucket((uintptr)p); - if(b != nil) { + if(freed) { b->recent_frees++; b->recent_free_bytes += size; - if(runtime_debug.allocfreetrace) { - runtime_printf("MProf_Free(p=%p, size=%p)\n", p, size); - printstackframes(b->stk, b->nstk); - } + } else { + b->prev_frees++; + b->prev_free_bytes += size; + } + if(runtime_debug.allocfreetrace) { + runtime_printf("MProf_Free(p=%p, size=%p)\n", p, size); + printstackframes(b->stk, b->nstk); } runtime_unlock(&proflock); } @@ -366,9 +305,9 @@ runtime_blockevent(int64 cycles, int32 skip) if(rate <= 0 || (rate > cycles && runtime_fastrand1()%rate > cycles)) return; - nstk = runtime_callers(skip, stk, 32); + nstk = runtime_callers(skip, stk, nelem(stk)); runtime_lock(&proflock); - b = stkbucket(BProf, stk, nstk, true); + b = stkbucket(BProf, 0, stk, nstk, true); b->count++; b->cycles += cycles; runtime_unlock(&proflock); @@ -420,6 +359,7 @@ func MemProfile(p Slice, include_inuse_zero bool) (n int, ok bool) { // garbage collection is disabled from the beginning of execution, // accumulate stats as if a GC just happened, and recount buckets. MProf_GC(); + MProf_GC(); n = 0; for(b=mbuckets; b; b=b->allnext) if(include_inuse_zero || b->alloc_bytes != b->free_bytes) @@ -437,13 +377,11 @@ func MemProfile(p Slice, include_inuse_zero bool) (n int, ok bool) { } void -runtime_MProf_Mark(void (*addroot)(Obj)) +runtime_MProf_Mark(struct Workbuf **wbufp, void (*enqueue1)(struct Workbuf**, Obj)) { // buckhash is not allocated via mallocgc. - addroot((Obj){(byte*)&mbuckets, sizeof mbuckets, 0}); - addroot((Obj){(byte*)&bbuckets, sizeof bbuckets, 0}); - addroot((Obj){(byte*)&addrhash, sizeof addrhash, 0}); - addroot((Obj){(byte*)&addrfree, sizeof addrfree, 0}); + enqueue1(wbufp, (Obj){(byte*)&mbuckets, sizeof mbuckets, 0}); + enqueue1(wbufp, (Obj){(byte*)&bbuckets, sizeof bbuckets, 0}); } // Must match BlockProfileRecord in debug.go. @@ -568,6 +506,7 @@ saveg(G *gp, TRecord *r) } func GoroutineProfile(b Slice) (n int, ok bool) { + uintptr i; TRecord *r; G *gp; @@ -584,7 +523,8 @@ func GoroutineProfile(b Slice) (n int, ok bool) { ok = true; r = (TRecord*)b.__values; saveg(g, r++); - for(gp = runtime_allg; gp != nil; gp = gp->alllink) { + for(i = 0; i < runtime_allglen; i++) { + gp = runtime_allg[i]; if(gp == g || gp->status == Gdead) continue; saveg(gp, r++); @@ -596,9 +536,3 @@ func GoroutineProfile(b Slice) (n int, ok bool) { runtime_starttheworld(); } } - -void -runtime_mprofinit(void) -{ - addrhash = runtime_persistentalloc((1<<AddrHashBits)*sizeof *addrhash, 0, &mstats.buckhash_sys); -} diff --git a/libgo/runtime/msize.c b/libgo/runtime/msize.c index 745a769..34509d0 100644 --- a/libgo/runtime/msize.c +++ b/libgo/runtime/msize.c @@ -44,8 +44,8 @@ int32 runtime_class_to_allocnpages[NumSizeClasses]; int8 runtime_size_to_class8[1024/8 + 1]; int8 runtime_size_to_class128[(MaxSmallSize-1024)/128 + 1]; -static int32 -SizeToClass(int32 size) +int32 +runtime_SizeToClass(int32 size) { if(size > MaxSmallSize) runtime_throw("SizeToClass - invalid size"); @@ -90,9 +90,9 @@ runtime_InitSizes(void) // objects into the page, we might as well // use just this size instead of having two // different sizes. 
- if(sizeclass > 1 - && (int32)npages == runtime_class_to_allocnpages[sizeclass-1] - && allocsize/size == allocsize/runtime_class_to_size[sizeclass-1]) { + if(sizeclass > 1 && + (int32)npages == runtime_class_to_allocnpages[sizeclass-1] && + allocsize/size == allocsize/runtime_class_to_size[sizeclass-1]) { runtime_class_to_size[sizeclass-1] = size; continue; } @@ -119,7 +119,7 @@ runtime_InitSizes(void) // Double-check SizeToClass. if(0) { for(n=0; n < MaxSmallSize; n++) { - sizeclass = SizeToClass(n); + sizeclass = runtime_SizeToClass(n); if(sizeclass < 1 || sizeclass >= NumSizeClasses || runtime_class_to_size[sizeclass] < n) { runtime_printf("size=%d sizeclass=%d runtime_class_to_size=%d\n", n, sizeclass, runtime_class_to_size[sizeclass]); runtime_printf("incorrect SizeToClass"); @@ -158,3 +158,18 @@ dump: } runtime_throw("InitSizes failed"); } + +// Returns size of the memory block that mallocgc will allocate if you ask for the size. +uintptr +runtime_roundupsize(uintptr size) +{ + if(size < MaxSmallSize) { + if(size <= 1024-8) + return runtime_class_to_size[runtime_size_to_class8[(size+7)>>3]]; + else + return runtime_class_to_size[runtime_size_to_class128[(size-1024+127) >> 7]]; + } + if(size + PageSize < size) + return size; + return ROUND(size, PageSize); +} diff --git a/libgo/runtime/netpoll.goc b/libgo/runtime/netpoll.goc index 0270573..15dd58c 100644 --- a/libgo/runtime/netpoll.goc +++ b/libgo/runtime/netpoll.goc @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build darwin dragonfly freebsd linux netbsd openbsd windows +// +build darwin dragonfly freebsd linux netbsd openbsd solaris windows package net @@ -24,21 +24,45 @@ package net // An implementation must call the following function to denote that the pd is ready. // void runtime_netpollready(G **gpp, PollDesc *pd, int32 mode); +// PollDesc contains 2 binary semaphores, rg and wg, to park reader and writer +// goroutines respectively. The semaphore can be in the following states: +// READY - io readiness notification is pending; +// a goroutine consumes the notification by changing the state to nil. +// WAIT - a goroutine prepares to park on the semaphore, but not yet parked; +// the goroutine commits to park by changing the state to G pointer, +// or, alternatively, concurrent io notification changes the state to READY, +// or, alternatively, concurrent timeout/close changes the state to nil. +// G pointer - the goroutine is blocked on the semaphore; +// io notification or timeout/close changes the state to READY or nil respectively +// and unparks the goroutine. +// nil - nothing of the above. #define READY ((G*)1) +#define WAIT ((G*)2) + +enum +{ + PollBlockSize = 4*1024, +}; struct PollDesc { PollDesc* link; // in pollcache, protected by pollcache.Lock + + // The lock protects pollOpen, pollSetDeadline, pollUnblock and deadlineimpl operations. + // This fully covers seq, rt and wt variables. fd is constant throughout the PollDesc lifetime. + // pollReset, pollWait, pollWaitCanceled and runtime_netpollready (IO rediness notification) + // proceed w/o taking the lock. So closing, rg, rd, wg and wd are manipulated + // in a lock-free way by all operations. 
Lock; // protectes the following fields uintptr fd; bool closing; uintptr seq; // protects from stale timers and ready notifications - G* rg; // G waiting for read or READY (binary semaphore) + G* rg; // READY, WAIT, G waiting for read or nil Timer rt; // read deadline timer (set if rt.fv != nil) int64 rd; // read deadline - G* wg; // the same for writes - Timer wt; - int64 wd; + G* wg; // READY, WAIT, G waiting for write or nil + Timer wt; // write deadline timer + int64 wd; // write deadline }; static struct @@ -52,7 +76,7 @@ static struct // seq is incremented when deadlines are changed or descriptor is reused. } pollcache; -static bool netpollblock(PollDesc*, int32); +static bool netpollblock(PollDesc*, int32, bool); static G* netpollunblock(PollDesc*, int32, bool); static void deadline(int64, Eface); static void readDeadline(int64, Eface); @@ -102,7 +126,6 @@ func runtime_pollClose(pd *PollDesc) { } func runtime_pollReset(pd *PollDesc, mode int) (err int) { - runtime_lock(pd); err = checkerr(pd, mode); if(err) goto ret; @@ -111,14 +134,15 @@ func runtime_pollReset(pd *PollDesc, mode int) (err int) { else if(mode == 'w') pd->wg = nil; ret: - runtime_unlock(pd); } func runtime_pollWait(pd *PollDesc, mode int) (err int) { - runtime_lock(pd); err = checkerr(pd, mode); if(err == 0) { - while(!netpollblock(pd, mode)) { + // As for now only Solaris uses level-triggered IO. + if(Solaris) + runtime_netpollarm(pd->fd, mode); + while(!netpollblock(pd, mode, false)) { err = checkerr(pd, mode); if(err != 0) break; @@ -127,15 +151,13 @@ func runtime_pollWait(pd *PollDesc, mode int) (err int) { // Pretend it has not happened and retry. } } - runtime_unlock(pd); } func runtime_pollWaitCanceled(pd *PollDesc, mode int) { - runtime_lock(pd); - // wait for ioready, ignore closing or timeouts. - while(!netpollblock(pd, mode)) + // This function is used only on windows after a failed attempt to cancel + // a pending async IO operation. Wait for ioready, ignore closing or timeouts. + while(!netpollblock(pd, mode, true)) ; - runtime_unlock(pd); } func runtime_pollSetDeadline(pd *PollDesc, d int64, mode int) { @@ -190,7 +212,7 @@ func runtime_pollSetDeadline(pd *PollDesc, d int64, mode int) { } // If we set the new deadline in the past, unblock currently pending IO if any. 
rg = nil; - wg = nil; + runtime_atomicstorep(&wg, nil); // full memory barrier between stores to rd/wd and load of rg/wg in netpollunblock if(pd->rd < 0) rg = netpollunblock(pd, 'r', false); if(pd->wd < 0) @@ -210,6 +232,7 @@ func runtime_pollUnblock(pd *PollDesc) { runtime_throw("runtime_pollUnblock: already closing"); pd->closing = true; pd->seq++; + runtime_atomicstorep(&rg, nil); // full memory barrier between store to closing and read of rg/wg in netpollunblock rg = netpollunblock(pd, 'r', false); wg = netpollunblock(pd, 'w', false); if(pd->rt.fv) { @@ -240,12 +263,10 @@ runtime_netpollready(G **gpp, PollDesc *pd, int32 mode) G *rg, *wg; rg = wg = nil; - runtime_lock(pd); if(mode == 'r' || mode == 'r'+'w') rg = netpollunblock(pd, 'r', true); if(mode == 'w' || mode == 'r'+'w') wg = netpollunblock(pd, 'w', true); - runtime_unlock(pd); if(rg) { rg->schedlink = *gpp; *gpp = rg; @@ -266,51 +287,75 @@ checkerr(PollDesc *pd, int32 mode) return 0; } +static bool +blockcommit(G *gp, G **gpp) +{ + return runtime_casp(gpp, WAIT, gp); +} + // returns true if IO is ready, or false if timedout or closed +// waitio - wait only for completed IO, ignore errors static bool -netpollblock(PollDesc *pd, int32 mode) +netpollblock(PollDesc *pd, int32 mode, bool waitio) { - G **gpp; + G **gpp, *old; gpp = &pd->rg; if(mode == 'w') gpp = &pd->wg; - if(*gpp == READY) { - *gpp = nil; - return true; + + // set the gpp semaphore to WAIT + for(;;) { + old = *gpp; + if(old == READY) { + *gpp = nil; + return true; + } + if(old != nil) + runtime_throw("netpollblock: double wait"); + if(runtime_casp(gpp, nil, WAIT)) + break; } - if(*gpp != nil) - runtime_throw("netpollblock: double wait"); - *gpp = runtime_g(); - runtime_park(runtime_unlock, &pd->Lock, "IO wait"); - runtime_lock(pd); - if(runtime_g()->param) - return true; - return false; + + // need to recheck error states after setting gpp to WAIT + // this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl + // do the opposite: store to closing/rd/wd, membarrier, load of rg/wg + if(waitio || checkerr(pd, mode) == 0) + runtime_park((bool(*)(G*, void*))blockcommit, gpp, "IO wait"); + // be careful to not lose concurrent READY notification + old = runtime_xchgp(gpp, nil); + if(old > WAIT) + runtime_throw("netpollblock: corrupted state"); + return old == READY; } static G* netpollunblock(PollDesc *pd, int32 mode, bool ioready) { - G **gpp, *old; + G **gpp, *old, *new; gpp = &pd->rg; if(mode == 'w') gpp = &pd->wg; - if(*gpp == READY) - return nil; - if(*gpp == nil) { - // Only set READY for ioready. runtime_pollWait - // will check for timeout/cancel before waiting. + + for(;;) { + old = *gpp; + if(old == READY) + return nil; + if(old == nil && !ioready) { + // Only set READY for ioready. runtime_pollWait + // will check for timeout/cancel before waiting. 
+ return nil; + } + new = nil; if(ioready) - *gpp = READY; - return nil; + new = READY; + if(runtime_casp(gpp, old, new)) + break; } - old = *gpp; - // pass unblock reason onto blocked g - old->param = (void*)(uintptr)ioready; - *gpp = nil; - return old; + if(old > WAIT) + return old; // must be G* + return nil; } static void @@ -336,14 +381,14 @@ deadlineimpl(int64 now, Eface arg, bool read, bool write) if(pd->rd <= 0 || pd->rt.fv == nil) runtime_throw("deadlineimpl: inconsistent read deadline"); pd->rd = -1; - pd->rt.fv = nil; + runtime_atomicstorep(&pd->rt.fv, nil); // full memory barrier between store to rd and load of rg in netpollunblock rg = netpollunblock(pd, 'r', false); } if(write) { if(pd->wd <= 0 || (pd->wt.fv == nil && !read)) runtime_throw("deadlineimpl: inconsistent write deadline"); pd->wd = -1; - pd->wt.fv = nil; + runtime_atomicstorep(&pd->wt.fv, nil); // full memory barrier between store to wd and load of wg in netpollunblock wg = netpollunblock(pd, 'w', false); } runtime_unlock(pd); @@ -379,7 +424,7 @@ allocPollDesc(void) runtime_lock(&pollcache); if(pollcache.first == nil) { - n = PageSize/sizeof(*pd); + n = PollBlockSize/sizeof(*pd); if(n == 0) n = 1; // Must be in non-GC memory because can be referenced diff --git a/libgo/runtime/netpoll_epoll.c b/libgo/runtime/netpoll_epoll.c index 2acbca3..fe534c9 100644 --- a/libgo/runtime/netpoll_epoll.c +++ b/libgo/runtime/netpoll_epoll.c @@ -116,6 +116,14 @@ runtime_netpollclose(uintptr fd) return -res; } +void +runtime_netpollarm(uintptr fd, int32 mode) +{ + USED(fd); + USED(mode); + runtime_throw("unused"); +} + // polls for ready network connections // returns list of goroutines that become runnable G* @@ -159,7 +167,8 @@ retry: } void -runtime_netpoll_scan(void (*addroot)(Obj)) +runtime_netpoll_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj)) { - USED(addroot); + USED(wbufp); + USED(enqueue1); } diff --git a/libgo/runtime/netpoll_kqueue.c b/libgo/runtime/netpoll_kqueue.c index 5d3f856..bc38644 100644 --- a/libgo/runtime/netpoll_kqueue.c +++ b/libgo/runtime/netpoll_kqueue.c @@ -59,6 +59,13 @@ runtime_netpollclose(uintptr fd) return 0; } +void +runtime_netpollarm(uintptr fd, int32 mode) +{ + USED(fd, mode); + runtime_throw("unused"); +} + // Polls for ready network connections. // Returns list of goroutines that become runnable. 
G* @@ -104,7 +111,8 @@ retry: } void -runtime_netpoll_scan(void (*addroot)(Obj)) +runtime_netpoll_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj)) { - USED(addroot); + USED(wbufp); + USED(enqueue1); } diff --git a/libgo/runtime/netpoll_select.c b/libgo/runtime/netpoll_select.c index 788d19f..b461335 100644 --- a/libgo/runtime/netpoll_select.c +++ b/libgo/runtime/netpoll_select.c @@ -246,7 +246,7 @@ runtime_netpoll(bool block) } void -runtime_netpoll_scan(void (*addroot)(Obj)) +runtime_netpoll_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj)) { - addroot((Obj){(byte*)&data, sizeof data, 0}); + enqueue1(wbufp, (Obj){(byte*)&data, sizeof data, 0}); } diff --git a/libgo/runtime/netpoll_stub.c b/libgo/runtime/netpoll_stub.c index a88c9f5..468a610 100644 --- a/libgo/runtime/netpoll_stub.c +++ b/libgo/runtime/netpoll_stub.c @@ -19,7 +19,8 @@ runtime_netpoll(bool block) } void -runtime_netpoll_scan(void (*addroot)(Obj)) +runtime_netpoll_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj)) { + USED(wbufp); USED(addroot); } diff --git a/libgo/runtime/panic.c b/libgo/runtime/panic.c index 8fe321f..78d4dd9 100644 --- a/libgo/runtime/panic.c +++ b/libgo/runtime/panic.c @@ -12,6 +12,42 @@ uint32 runtime_panicking; static Lock paniclk; +// Allocate a Defer, usually using per-P pool. +// Each defer must be released with freedefer. +Defer* +runtime_newdefer() +{ + Defer *d; + P *p; + + d = nil; + p = runtime_m()->p; + d = p->deferpool; + if(d) + p->deferpool = d->__next; + if(d == nil) { + // deferpool is empty + d = runtime_malloc(sizeof(Defer)); + } + return d; +} + +// Free the given defer. +// The defer cannot be used after this call. +void +runtime_freedefer(Defer *d) +{ + P *p; + + if(d->__special) + return; + p = runtime_m()->p; + d->__next = p->deferpool; + p->deferpool = d; + // No need to wipe out pointers in argp/pc/fn/args, + // because we empty the pool before GC. +} + // Run all deferred functions for the current goroutine. static void rundefer(void) @@ -28,8 +64,7 @@ rundefer(void) d->__pfn = nil; if (pfn != nil) (*pfn)(d->__arg); - if (d->__free) - runtime_free(d); + runtime_freedefer(d); } } @@ -44,18 +79,34 @@ runtime_startpanic(void) m->mallocing = 1; // tell rest of panic not to try to malloc } else if(m->mcache == nil) // can happen if called from signal handler or throw m->mcache = runtime_allocmcache(); - if(m->dying) { + switch(m->dying) { + case 0: + m->dying = 1; + if(runtime_g() != nil) + runtime_g()->writebuf = nil; + runtime_xadd(&runtime_panicking, 1); + runtime_lock(&paniclk); + if(runtime_debug.schedtrace > 0 || runtime_debug.scheddetail > 0) + runtime_schedtrace(true); + runtime_freezetheworld(); + return; + case 1: + // Something failed while panicing, probably the print of the + // argument to panic(). Just print a stack trace and exit. + m->dying = 2; runtime_printf("panic during panic\n"); + runtime_dopanic(0); runtime_exit(3); + case 2: + // This is a genuine bug in the runtime, we couldn't even + // print the stack trace successfully. + m->dying = 3; + runtime_printf("stack trace unavailable\n"); + runtime_exit(4); + default: + // Can't even print! Just exit. 
+ runtime_exit(5); } - m->dying = 1; - if(runtime_g() != nil) - runtime_g()->writebuf = nil; - runtime_xadd(&runtime_panicking, 1); - runtime_lock(&paniclk); - if(runtime_debug.schedtrace > 0 || runtime_debug.scheddetail > 0) - runtime_schedtrace(true); - runtime_freezetheworld(); } void diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c index c627ac1..1e15519 100644 --- a/libgo/runtime/proc.c +++ b/libgo/runtime/proc.c @@ -392,17 +392,23 @@ struct Sched { int32 profilehz; // cpu profiling rate }; -// The max value of GOMAXPROCS. -// There are no fundamental restrictions on the value. -enum { MaxGomaxprocs = 1<<8 }; +enum +{ + // The max value of GOMAXPROCS. + // There are no fundamental restrictions on the value. + MaxGomaxprocs = 1<<8, + + // Number of goroutine ids to grab from runtime_sched.goidgen to local per-P cache at once. + // 16 seems to provide enough amortization, but other than that it's mostly arbitrary number. + GoidCacheBatch = 16, +}; Sched runtime_sched; int32 runtime_gomaxprocs; uint32 runtime_needextram = 1; bool runtime_iscgo = true; M runtime_m0; -G runtime_g0; // idle goroutine for m0 -G* runtime_allg; +G runtime_g0; // idle goroutine for m0 G* runtime_lastg; M* runtime_allm; P** runtime_allp; @@ -412,10 +418,15 @@ int32 runtime_ncpu; bool runtime_precisestack; static int32 newprocs; +static Lock allglock; // the following vars are protected by this lock or by stoptheworld +G** runtime_allg; +uintptr runtime_allglen; +static uintptr allgcap; + void* runtime_mstart(void*); static void runqput(P*, G*); static G* runqget(P*); -static void runqgrow(P*); +static bool runqputslow(P*, G*, uint32, uint32); static G* runqsteal(P*, P*); static void mput(M*); static M* mget(void); @@ -442,12 +453,14 @@ static void gfput(P*, G*); static G* gfget(P*); static void gfpurge(P*); static void globrunqput(G*); +static void globrunqputbatch(G*, G*, int32); static G* globrunqget(P*, int32); static P* pidleget(void); static void pidleput(P*); static void injectglist(G*); static bool preemptall(void); static bool exitsyscallfast(void); +static void allgadd(G*); // The bootstrap sequence is: // @@ -476,7 +489,6 @@ runtime_schedinit(void) runtime_sched.maxmcount = 10000; runtime_precisestack = 0; - runtime_mprofinit(); runtime_mallocinit(); mcommoninit(m); @@ -541,7 +553,7 @@ runtime_main(void* dummy __attribute__((unused))) d.__retaddr = nil; d.__makefunc_can_recover = 0; d.__frame = &frame; - d.__free = 0; + d.__special = true; g->defer = &d; if(m != &runtime_m0) @@ -579,6 +591,7 @@ void runtime_goroutineheader(G *gp) { const char *status; + int64 waitfor; switch(gp->status) { case Gidle: @@ -603,7 +616,16 @@ runtime_goroutineheader(G *gp) status = "???"; break; } - runtime_printf("goroutine %D [%s]:\n", gp->goid, status); + + // approx time the G is blocked, in minutes + waitfor = 0; + if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince != 0) + waitfor = (runtime_nanotime() - gp->waitsince) / (60LL*1000*1000*1000); + + if(waitfor < 1) + runtime_printf("goroutine %D [%s]:\n", gp->goid, status); + else + runtime_printf("goroutine %D [%s, %D minutes]:\n", gp->goid, status, waitfor); } void @@ -624,7 +646,7 @@ runtime_printcreatedby(G *g) struct Traceback { G* gp; - Location locbuf[100]; + Location locbuf[TracebackMaxFrames]; int32 c; }; @@ -634,6 +656,7 @@ runtime_tracebackothers(G * volatile me) G * volatile gp; Traceback tb; int32 traceback; + volatile uintptr i; tb.gp = me; traceback = runtime_gotraceback(nil); @@ -657,7 +680,9 @@ runtime_tracebackothers(G * 
volatile me) runtime_printcreatedby(gp); } - for(gp = runtime_allg; gp != nil; gp = gp->alllink) { + runtime_lock(&allglock); + for(i = 0; i < runtime_allglen; i++) { + gp = runtime_allg[i]; if(gp == me || gp == m->curg || gp->status == Gdead) continue; if(gp->issystem && traceback < 2) @@ -696,6 +721,7 @@ runtime_tracebackothers(G * volatile me) runtime_printcreatedby(gp); } } + runtime_unlock(&allglock); } static void @@ -1038,6 +1064,7 @@ struct CgoThreadStart { M *m; G *g; + uintptr *tls; void (*fn)(void); }; @@ -1200,14 +1227,7 @@ runtime_newextram(void) gp->lockedm = mp; gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1); // put on allg for garbage collector - runtime_lock(&runtime_sched); - if(runtime_lastg == nil) - runtime_allg = gp; - else - runtime_lastg->alllink = gp; - runtime_lastg = gp; - runtime_unlock(&runtime_sched); - gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1); + allgadd(gp); // The context for gp will be set up in runtime_needm. But // here we need to set up the context for g0. @@ -1379,7 +1399,7 @@ mspinning(void) } // Schedules some M to run the p (creates an M if necessary). -// If p==nil, tries to get an idle P, if no idle P's returns false. +// If p==nil, tries to get an idle P, if no idle P's does nothing. static void startm(P *p, bool spinning) { @@ -1543,6 +1563,7 @@ execute(G *gp) runtime_throw("execute: bad g status"); } gp->status = Grunning; + gp->waitsince = 0; m->p->schedtick++; m->curg = gp; gp->m = m; @@ -1760,10 +1781,10 @@ top: execute(gp); } -// Puts the current goroutine into a waiting state and unlocks the lock. -// The goroutine can be made runnable again by calling runtime_ready(gp). +// Puts the current goroutine into a waiting state and calls unlockf. +// If unlockf returns false, the goroutine is resumed. void -runtime_park(void(*unlockf)(Lock*), Lock *lock, const char *reason) +runtime_park(bool(*unlockf)(G*, void*), void *lock, const char *reason) { m->waitlock = lock; m->waitunlockf = unlockf; @@ -1771,17 +1792,39 @@ runtime_park(void(*unlockf)(Lock*), Lock *lock, const char *reason) runtime_mcall(park0); } +static bool +parkunlock(G *gp, void *lock) +{ + USED(gp); + runtime_unlock(lock); + return true; +} + +// Puts the current goroutine into a waiting state and unlocks the lock. +// The goroutine can be made runnable again by calling runtime_ready(gp). +void +runtime_parkunlock(Lock *lock, const char *reason) +{ + runtime_park(parkunlock, lock, reason); +} + // runtime_park continuation on g0. static void park0(G *gp) { + bool ok; + gp->status = Gwaiting; gp->m = nil; m->curg = nil; if(m->waitunlockf) { - m->waitunlockf(m->waitlock); + ok = m->waitunlockf(gp, m->waitlock); m->waitunlockf = nil; m->waitlock = nil; + if(!ok) { + gp->status = Grunnable; + execute(gp); // Schedule it back, never returns. + } } if(m->lockedg) { stoplockedm(); @@ -1968,6 +2011,7 @@ runtime_exitsyscall(void) if(gp->isbackground) // do not consider blocked scavenger for deadlock detection incidlelocked(-1); + g->waitsince = 0; if(exitsyscallfast()) { // There's a cpu for us, so we can run. 
m->p->syscalltick++; @@ -2160,11 +2204,13 @@ __go_go(void (*fn)(void*), void* arg) byte *sp; size_t spsize; G *newg; + P *p; //runtime_printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret); m->locks++; // disable preemption because it can be holding p in a local var - if((newg = gfget(m->p)) != nil) { + p = m->p; + if((newg = gfget(p)) != nil) { #ifdef USING_SPLIT_STACK int dont_block_signals = 0; @@ -2181,20 +2227,18 @@ __go_go(void (*fn)(void*), void* arg) #endif } else { newg = runtime_malg(StackMin, &sp, &spsize); - runtime_lock(&runtime_sched); - if(runtime_lastg == nil) - runtime_allg = newg; - else - runtime_lastg->alllink = newg; - runtime_lastg = newg; - runtime_unlock(&runtime_sched); + allgadd(newg); } newg->entry = (byte*)fn; newg->param = arg; newg->gopc = (uintptr)__builtin_return_address(0); newg->status = Grunnable; - newg->goid = runtime_xadd64(&runtime_sched.goidgen, 1); + if(p->goidcache == p->goidcacheend) { + p->goidcache = runtime_xadd64(&runtime_sched.goidgen, GoidCacheBatch); + p->goidcacheend = p->goidcache + GoidCacheBatch; + } + newg->goid = p->goidcache++; { // Avoid warnings about variables clobbered by @@ -2211,7 +2255,7 @@ __go_go(void (*fn)(void*), void* arg) vnewg->context.uc_stack.ss_size = vspsize; makecontext(&vnewg->context, kickoff, 0); - runqput(m->p, vnewg); + runqput(p, vnewg); if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main) // TODO: fast atomic wakep(); @@ -2220,6 +2264,31 @@ __go_go(void (*fn)(void*), void* arg) } } +static void +allgadd(G *gp) +{ + G **new; + uintptr cap; + + runtime_lock(&allglock); + if(runtime_allglen >= allgcap) { + cap = 4096/sizeof(new[0]); + if(cap < 2*allgcap) + cap = 2*allgcap; + new = runtime_malloc(cap*sizeof(new[0])); + if(new == nil) + runtime_throw("runtime: cannot allocate memory"); + if(runtime_allg != nil) { + runtime_memmove(new, runtime_allg, runtime_allglen*sizeof(new[0])); + runtime_free(runtime_allg); + } + runtime_allg = new; + allgcap = cap; + } + runtime_allg[runtime_allglen++] = gp; + runtime_unlock(&allglock); +} + // Put on gfree list. // If local list is too long, transfer a batch to the global list. static void @@ -2415,19 +2484,21 @@ runtime_gcount(void) { G *gp; int32 n, s; + uintptr i; n = 0; - runtime_lock(&runtime_sched); + runtime_lock(&allglock); // TODO(dvyukov): runtime.NumGoroutine() is O(N). // We do not want to increment/decrement centralized counter in newproc/goexit, // just to make runtime.NumGoroutine() faster. // Compromise solution is to introduce per-P counters of active goroutines. - for(gp = runtime_allg; gp; gp = gp->alllink) { + for(i = 0; i < runtime_allglen; i++) { + gp = runtime_allg[i]; s = gp->status; if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting) n++; } - runtime_unlock(&runtime_sched); + runtime_unlock(&allglock); return n; } @@ -2441,32 +2512,39 @@ static struct { Lock; void (*fn)(uintptr*, int32); int32 hz; - uintptr pcbuf[100]; - Location locbuf[100]; + uintptr pcbuf[TracebackMaxFrames]; + Location locbuf[TracebackMaxFrames]; } prof; -static void -System(void) -{ -} +static void System(void) {} +static void GC(void) {} // Called if we receive a SIGPROF signal. void runtime_sigprof() { + M *mp = m; int32 n, i; bool traceback; if(prof.fn == nil || prof.hz == 0) return; + + if(mp == nil) + return; + traceback = true; - // Windows does profiling in a dedicated thread w/o m. 
- if(!Windows && (m == nil || m->mcache == nil)) + + if(mp->mcache == nil) traceback = false; - + + // Profiling runs concurrently with GC, so it must not allocate. + mp->mallocing++; + runtime_lock(&prof); if(prof.fn == nil) { runtime_unlock(&prof); + mp->mallocing--; return; } n = 0; @@ -2484,13 +2562,17 @@ runtime_sigprof() for(i = 0; i < n; i++) prof.pcbuf[i] = prof.locbuf[i].pc; } - if (!traceback || n <= 0) { + if(!traceback || n <= 0) { n = 2; prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n); - prof.pcbuf[1] = (uintptr)System + 1; + if(mp->gcing || mp->helpgc) + prof.pcbuf[1] = (uintptr)GC; + else + prof.pcbuf[1] = (uintptr)System; } prof.fn(prof.pcbuf, n); runtime_unlock(&prof); + mp->mallocing--; } // Arrange to call fn with a traceback hz times a second. @@ -2533,6 +2615,7 @@ static void procresize(int32 new) { int32 i, old; + bool empty; G *gp; P *p; @@ -2554,27 +2637,42 @@ procresize(int32 new) else p->mcache = runtime_allocmcache(); } - if(p->runq == nil) { - p->runqsize = 128; - p->runq = (G**)runtime_mallocgc(p->runqsize*sizeof(G*), 0, FlagNoInvokeGC); - } } // redistribute runnable G's evenly - for(i = 0; i < old; i++) { - p = runtime_allp[i]; - while((gp = runqget(p)) != nil) - globrunqput(gp); + // collect all runnable goroutines in global queue preserving FIFO order + // FIFO order is required to ensure fairness even during frequent GCs + // see http://golang.org/issue/7126 + empty = false; + while(!empty) { + empty = true; + for(i = 0; i < old; i++) { + p = runtime_allp[i]; + if(p->runqhead == p->runqtail) + continue; + empty = false; + // pop from tail of local queue + p->runqtail--; + gp = p->runq[p->runqtail%nelem(p->runq)]; + // push onto head of global queue + gp->schedlink = runtime_sched.runqhead; + runtime_sched.runqhead = gp; + if(runtime_sched.runqtail == nil) + runtime_sched.runqtail = gp; + runtime_sched.runqsize++; + } } + // fill local queues with at most nelem(p->runq)/2 goroutines // start at 1 because current M already executes some G and will acquire allp[0] below, // so if we have a spare G we want to put it into allp[1]. - for(i = 1; runtime_sched.runqhead; i++) { + for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched.runqsize > 0; i++) { gp = runtime_sched.runqhead; runtime_sched.runqhead = gp->schedlink; + if(runtime_sched.runqhead == nil) + runtime_sched.runqtail = nil; + runtime_sched.runqsize--; runqput(runtime_allp[i%new], gp); } - runtime_sched.runqtail = nil; - runtime_sched.runqsize = 0; // free unused P's for(i = new; i < old; i++) { @@ -2656,28 +2754,39 @@ checkdead(void) { G *gp; int32 run, grunning, s; + uintptr i; // -1 for sysmon run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra(); if(run > 0) return; + // If we are dying because of a signal caught on an already idle thread, + // freezetheworld will cause all running threads to block. + // And runtime will essentially enter into deadlock state, + // except that there is a thread that will call runtime_exit soon. 
+ if(runtime_panicking > 0) + return; if(run < 0) { - runtime_printf("checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n", + runtime_printf("runtime: checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n", runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount); runtime_throw("checkdead: inconsistent counts"); } grunning = 0; - for(gp = runtime_allg; gp; gp = gp->alllink) { + runtime_lock(&allglock); + for(i = 0; i < runtime_allglen; i++) { + gp = runtime_allg[i]; if(gp->isbackground) continue; s = gp->status; if(s == Gwaiting) grunning++; else if(s == Grunnable || s == Grunning || s == Gsyscall) { - runtime_printf("checkdead: find g %D in status %d\n", gp->goid, s); + runtime_unlock(&allglock); + runtime_printf("runtime: checkdead: find g %D in status %d\n", gp->goid, s); runtime_throw("checkdead: runnable g"); } } + runtime_unlock(&allglock); if(grunning == 0) // possible if main goroutine calls runtime_Goexit() runtime_exit(0); m->throwing = -1; // do not dump full stacks @@ -2774,16 +2883,19 @@ retake(int64 now) pd = &pdesc[i]; s = p->status; if(s == Psyscall) { - // Retake P from syscall if it's there for more than 1 sysmon tick (20us). - // But only if there is other work to do. + // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us). t = p->syscalltick; if(pd->syscalltick != t) { pd->syscalltick = t; pd->syscallwhen = now; continue; } + // On the one hand we don't want to retake Ps if there is no other work to do, + // but on the other hand we want to retake them eventually + // because they can prevent the sysmon thread from deep sleep. if(p->runqhead == p->runqtail && - runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0) + runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0 && + pd->syscallwhen + 10*1000*1000 > now) continue; // Need to decrement number of idle locked M's // (pretending that one more is running) before the CAS. @@ -2828,7 +2940,8 @@ runtime_schedtrace(bool detailed) static int64 starttime; int64 now; int64 id1, id2, id3; - int32 i, q, t, h, s; + int32 i, t, h; + uintptr gi; const char *fmt; M *mp, *lockedm; G *gp, *lockedg; @@ -2855,15 +2968,11 @@ runtime_schedtrace(bool detailed) if(p == nil) continue; mp = p->m; - t = p->runqtail; - h = p->runqhead; - s = p->runqsize; - q = t - h; - if(q < 0) - q += s; + h = runtime_atomicload(&p->runqhead); + t = runtime_atomicload(&p->runqtail); if(detailed) - runtime_printf(" P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d/%d gfreecnt=%d\n", - i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, q, s, p->gfreecnt); + runtime_printf(" P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d gfreecnt=%d\n", + i, p->status, p->schedtick, p->syscalltick, mp ? 
mp->id : -1, t-h, p->gfreecnt); else { // In non-detailed mode format lengths of per-P run queues as: // [len1 len2 len3 len4] @@ -2874,7 +2983,7 @@ runtime_schedtrace(bool detailed) fmt = " [%d"; else if(i == runtime_gomaxprocs-1) fmt = " %d]\n"; - runtime_printf(fmt, q); + runtime_printf(fmt, t-h); } } if(!detailed) { @@ -2895,18 +3004,21 @@ runtime_schedtrace(bool detailed) if(lockedg) id3 = lockedg->goid; runtime_printf(" M%d: p=%D curg=%D mallocing=%d throwing=%d gcing=%d" - " locks=%d dying=%d helpgc=%d spinning=%d lockedg=%D\n", + " locks=%d dying=%d helpgc=%d spinning=%d blocked=%d lockedg=%D\n", mp->id, id1, id2, mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc, - mp->spinning, id3); + mp->spinning, m->blocked, id3); } - for(gp = runtime_allg; gp; gp = gp->alllink) { + runtime_lock(&allglock); + for(gi = 0; gi < runtime_allglen; gi++) { + gp = runtime_allg[gi]; mp = gp->m; lockedm = gp->lockedm; runtime_printf(" G%D: status=%d(%s) m=%d lockedm=%d\n", gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1, lockedm ? lockedm->id : -1); } + runtime_unlock(&allglock); runtime_unlock(&runtime_sched); } @@ -2949,6 +3061,20 @@ globrunqput(G *gp) runtime_sched.runqsize++; } +// Put a batch of runnable goroutines on the global runnable queue. +// Sched must be locked. +static void +globrunqputbatch(G *ghead, G *gtail, int32 n) +{ + gtail->schedlink = nil; + if(runtime_sched.runqtail) + runtime_sched.runqtail->schedlink = ghead; + else + runtime_sched.runqhead = ghead; + runtime_sched.runqtail = gtail; + runtime_sched.runqsize += n; +} + // Try get a batch of G's from the global runnable queue. // Sched must be locked. static G* @@ -2964,6 +3090,8 @@ globrunqget(P *p, int32 max) n = runtime_sched.runqsize; if(max > 0 && n > max) n = max; + if((uint32)n > nelem(p->runq)/2) + n = nelem(p->runq)/2; runtime_sched.runqsize -= n; if(runtime_sched.runqsize == 0) runtime_sched.runqtail = nil; @@ -3003,78 +3131,98 @@ pidleget(void) return p; } -// Put g on local runnable queue. -// TODO(dvyukov): consider using lock-free queue. +// Try to put g on local runnable queue. +// If it's full, put onto global queue. +// Executed only by the owner P. static void runqput(P *p, G *gp) { - int32 h, t, s; + uint32 h, t; - runtime_lock(p); retry: - h = p->runqhead; + h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with consumers t = p->runqtail; - s = p->runqsize; - if(t == h-1 || (h == 0 && t == s-1)) { - runqgrow(p); - goto retry; + if(t - h < nelem(p->runq)) { + p->runq[t%nelem(p->runq)] = gp; + runtime_atomicstore(&p->runqtail, t+1); // store-release, makes the item available for consumption + return; } - p->runq[t++] = gp; - if(t == s) - t = 0; - p->runqtail = t; - runtime_unlock(p); + if(runqputslow(p, gp, h, t)) + return; + // the queue is not full, now the put above must suceed + goto retry; +} + +// Put g and a batch of work from local runnable queue on global queue. +// Executed only by the owner P. +static bool +runqputslow(P *p, G *gp, uint32 h, uint32 t) +{ + G *batch[nelem(p->runq)/2+1]; + uint32 n, i; + + // First, grab a batch from local queue. + n = t-h; + n = n/2; + if(n != nelem(p->runq)/2) + runtime_throw("runqputslow: queue is not full"); + for(i=0; i<n; i++) + batch[i] = p->runq[(h+i)%nelem(p->runq)]; + if(!runtime_cas(&p->runqhead, h, h+n)) // cas-release, commits consume + return false; + batch[n] = gp; + // Link the goroutines. + for(i=0; i<n; i++) + batch[i]->schedlink = batch[i+1]; + // Now put the batch on global queue. 
+ runtime_lock(&runtime_sched); + globrunqputbatch(batch[0], batch[n], n+1); + runtime_unlock(&runtime_sched); + return true; } // Get g from local runnable queue. +// Executed only by the owner P. static G* runqget(P *p) { G *gp; - int32 t, h, s; + uint32 t, h; - if(p->runqhead == p->runqtail) - return nil; - runtime_lock(p); - h = p->runqhead; - t = p->runqtail; - s = p->runqsize; - if(t == h) { - runtime_unlock(p); - return nil; + for(;;) { + h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with other consumers + t = p->runqtail; + if(t == h) + return nil; + gp = p->runq[h%nelem(p->runq)]; + if(runtime_cas(&p->runqhead, h, h+1)) // cas-release, commits consume + return gp; } - gp = p->runq[h++]; - if(h == s) - h = 0; - p->runqhead = h; - runtime_unlock(p); - return gp; } -// Grow local runnable queue. -// TODO(dvyukov): consider using fixed-size array -// and transfer excess to the global list (local queue can grow way too big). -static void -runqgrow(P *p) +// Grabs a batch of goroutines from local runnable queue. +// batch array must be of size nelem(p->runq)/2. Returns number of grabbed goroutines. +// Can be executed by any P. +static uint32 +runqgrab(P *p, G **batch) { - G **q; - int32 s, t, h, t2; + uint32 t, h, n, i; - h = p->runqhead; - t = p->runqtail; - s = p->runqsize; - t2 = 0; - q = runtime_malloc(2*s*sizeof(*q)); - while(t != h) { - q[t2++] = p->runq[h++]; - if(h == s) - h = 0; + for(;;) { + h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with other consumers + t = runtime_atomicload(&p->runqtail); // load-acquire, synchronize with the producer + n = t-h; + n = n - n/2; + if(n == 0) + break; + if(n > nelem(p->runq)/2) // read inconsistent h and t + continue; + for(i=0; i<n; i++) + batch[i] = p->runq[(h+i)%nelem(p->runq)]; + if(runtime_cas(&p->runqhead, h, h+n)) // cas-release, commits consume + break; } - runtime_free(p->runq); - p->runq = q; - p->runqhead = 0; - p->runqtail = t2; - p->runqsize = 2*s; + return n; } // Steal half of elements from local runnable queue of p2 @@ -3083,57 +3231,24 @@ runqgrow(P *p) static G* runqsteal(P *p, P *p2) { - G *gp, *gp1; - int32 t, h, s, t2, h2, s2, c, i; + G *gp; + G *batch[nelem(p->runq)/2]; + uint32 t, h, n, i; - if(p2->runqhead == p2->runqtail) + n = runqgrab(p2, batch); + if(n == 0) return nil; - // sort locks to prevent deadlocks - if(p < p2) - runtime_lock(p); - runtime_lock(p2); - if(p2->runqhead == p2->runqtail) { - runtime_unlock(p2); - if(p < p2) - runtime_unlock(p); - return nil; - } - if(p >= p2) - runtime_lock(p); - // now we've locked both queues and know the victim is not empty - h = p->runqhead; + n--; + gp = batch[n]; + if(n == 0) + return gp; + h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with consumers t = p->runqtail; - s = p->runqsize; - h2 = p2->runqhead; - t2 = p2->runqtail; - s2 = p2->runqsize; - gp = p2->runq[h2++]; // return value - if(h2 == s2) - h2 = 0; - // steal roughly half - if(t2 > h2) - c = (t2 - h2) / 2; - else - c = (s2 - h2 + t2) / 2; - // copy - for(i = 0; i != c; i++) { - // the target queue is full? - if(t == h-1 || (h == 0 && t == s-1)) - break; - // the victim queue is empty? 
- if(t2 == h2) - break; - gp1 = p2->runq[h2++]; - if(h2 == s2) - h2 = 0; - p->runq[t++] = gp1; - if(t == s) - t = 0; - } - p->runqtail = t; - p2->runqhead = h2; - runtime_unlock(p2); - runtime_unlock(p); + if(t - h + n >= nelem(p->runq)) + runtime_throw("runqsteal: runq overflow"); + for(i=0; i<n; i++, t++) + p->runq[t%nelem(p->runq)] = batch[i]; + runtime_atomicstore(&p->runqtail, t); // store-release, makes the item available for consumption return gp; } @@ -3144,14 +3259,10 @@ void runtime_testSchedLocalQueue(void) { P p; - G gs[1000]; + G gs[nelem(p.runq)]; int32 i, j; runtime_memclr((byte*)&p, sizeof(p)); - p.runqsize = 1; - p.runqhead = 0; - p.runqtail = 0; - p.runq = runtime_malloc(p.runqsize*sizeof(*p.runq)); for(i = 0; i < (int32)nelem(gs); i++) { if(runqget(&p) != nil) @@ -3176,20 +3287,11 @@ void runtime_testSchedLocalQueueSteal(void) { P p1, p2; - G gs[1000], *gp; + G gs[nelem(p1.runq)], *gp; int32 i, j, s; runtime_memclr((byte*)&p1, sizeof(p1)); - p1.runqsize = 1; - p1.runqhead = 0; - p1.runqtail = 0; - p1.runq = runtime_malloc(p1.runqsize*sizeof(*p1.runq)); - runtime_memclr((byte*)&p2, sizeof(p2)); - p2.runqsize = nelem(gs); - p2.runqhead = 0; - p2.runqtail = 0; - p2.runq = runtime_malloc(p2.runqsize*sizeof(*p2.runq)); for(i = 0; i < (int32)nelem(gs); i++) { for(j = 0; j < i; j++) { @@ -3239,9 +3341,9 @@ runtime_debug_setMaxThreads(intgo in) } void -runtime_proc_scan(void (*addroot)(Obj)) +runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj)) { - addroot((Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0}); + enqueue1(wbufp, (Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0}); } // When a function calls a closure, it passes the closure value to @@ -3271,3 +3373,30 @@ runtime_gcwaiting(void) { return runtime_sched.gcwaiting; } + +// func runtime_procPin() int + +intgo sync_runtime_procPin(void) + __asm__(GOSYM_PREFIX "sync.runtime_procPin"); + +intgo +sync_runtime_procPin() +{ + M *mp; + + mp = m; + // Disable preemption. 
+ mp->locks++; + return mp->p->id; +} + +// func runtime_procUnpin() + +void sync_runtime_procUnpin(void) + __asm__ (GOSYM_PREFIX "sync.runtime_procUnpin"); + +void +sync_runtime_procUnpin(void) +{ + m->locks--; +} diff --git a/libgo/runtime/race.h b/libgo/runtime/race.h index c97e672..e84c5d4 100644 --- a/libgo/runtime/race.h +++ b/libgo/runtime/race.h @@ -24,8 +24,8 @@ void runtime_racewritepc(void *addr, void *callpc, void *pc); void runtime_racereadpc(void *addr, void *callpc, void *pc); void runtime_racewriterangepc(void *addr, uintptr sz, void *callpc, void *pc); void runtime_racereadrangepc(void *addr, uintptr sz, void *callpc, void *pc); -void runtime_racereadobjectpc(void *addr, Type *t, void *callpc, void *pc); -void runtime_racewriteobjectpc(void *addr, Type *t, void *callpc, void *pc); +void runtime_racereadobjectpc(void *addr, const Type *t, void *callpc, void *pc); +void runtime_racewriteobjectpc(void *addr, const Type *t, void *callpc, void *pc); void runtime_racefingo(void); void runtime_raceacquire(void *addr); void runtime_raceacquireg(G *gp, void *addr); diff --git a/libgo/runtime/runtime.c b/libgo/runtime/runtime.c index 4f9909b..eb32a8d 100644 --- a/libgo/runtime/runtime.c +++ b/libgo/runtime/runtime.c @@ -353,3 +353,12 @@ runtime_debug_setMaxStack(intgo in) runtime_maxstacksize = in; return out; } + +void memclrBytes(Slice) + __asm__ (GOSYM_PREFIX "runtime.memclrBytes"); + +void +memclrBytes(Slice s) +{ + runtime_memclr(s.__values, s.__count); +} diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h index ef6090f..9d5e42f 100644 --- a/libgo/runtime/runtime.h +++ b/libgo/runtime/runtime.h @@ -205,12 +205,12 @@ struct G void* gcinitial_sp; ucontext_t gcregs; byte* entry; // initial function - G* alllink; // on allg void* param; // passed parameter on wakeup bool fromgogo; // reached from gogo int16 status; uint32 selgen; // valid sudog pointer int64 goid; + int64 waitsince; // approx time when the G become blocked const char* waitreason; // if status==Gwaiting G* schedlink; bool ispanic; @@ -221,8 +221,6 @@ struct G int32 sig; int32 writenbuf; byte* writebuf; - // DeferChunk* dchunk; - // DeferChunk* dchunknext; uintptr sigcode0; uintptr sigcode1; // uintptr sigpc; @@ -256,7 +254,8 @@ struct M int32 dying; int32 profilehz; int32 helpgc; - bool spinning; + bool spinning; // M is out of work and is actively looking for work + bool blocked; // M is blocked on a Note uint32 fastrand; uint64 ncgocall; // number of cgo calls in total int32 ncgo; // number of cgo calls currently in progress @@ -276,8 +275,7 @@ struct M bool racecall; bool needextram; bool dropextram; // for gccgo: drop after call is done. - void* racepc; - void (*waitunlockf)(Lock*); + bool (*waitunlockf)(G*, void*); void* waitlock; uintptr settype_buf[1024]; @@ -297,12 +295,16 @@ struct P uint32 syscalltick; // incremented on every system call M* m; // back-link to associated M (nil if idle) MCache* mcache; + Defer* deferpool; // pool of available Defer structs (see panic.c) + + // Cache of goroutine ids, amortizes accesses to runtime_sched.goidgen. + uint64 goidcache; + uint64 goidcacheend; // Queue of runnable goroutines. 
- G** runq; - int32 runqhead; - int32 runqtail; - int32 runqsize; + uint32 runqhead; + uint32 runqtail; + G* runq[256]; // Available G's (status == Gdead) G* gfree; @@ -359,6 +361,15 @@ enum { Windows = 0 }; #endif +#ifdef GOOS_solaris +enum { + Solaris = 1 +}; +#else +enum { + Solaris = 0 +}; +#endif struct Timers { @@ -458,12 +469,18 @@ void runtime_hashinit(void); void runtime_traceback(void); void runtime_tracebackothers(G*); +enum +{ + // The maximum number of frames we print for a traceback + TracebackMaxFrames = 100, +}; /* * external data */ extern uintptr runtime_zerobase; -extern G* runtime_allg; +extern G** runtime_allg; +extern uintptr runtime_allglen; extern G* runtime_lastg; extern M* runtime_allm; extern P** runtime_allp; @@ -514,21 +531,6 @@ void runtime_printtrace(Location*, int32, bool); #define runtime_read(d, v, n) read((d), (v), (n)) #define runtime_write(d, v, n) write((d), (v), (n)) #define runtime_close(d) close(d) -#define runtime_cas(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) -#define runtime_cas64(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) -#define runtime_casp(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) -// Don't confuse with XADD x86 instruction, -// this one is actually 'addx', that is, add-and-fetch. -#define runtime_xadd(p, v) __sync_add_and_fetch (p, v) -#define runtime_xadd64(p, v) __sync_add_and_fetch (p, v) -#define runtime_xchg(p, v) __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST) -#define runtime_xchg64(p, v) __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST) -#define runtime_atomicload(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) -#define runtime_atomicstore(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) -#define runtime_atomicstore64(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) -#define runtime_atomicload64(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) -#define runtime_atomicloadp(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) -#define runtime_atomicstorep(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) void runtime_ready(G*); const byte* runtime_getenv(const char*); int32 runtime_atoi(const byte*); @@ -546,7 +548,6 @@ void runtime_mallocinit(void); void runtime_mprofinit(void); #define runtime_malloc(s) __go_alloc(s) #define runtime_free(p) __go_free(p) -bool runtime_addfinalizer(void*, FuncVal *fn, const struct __go_func_type *, const struct __go_ptr_type *); #define runtime_getcallersp(p) __builtin_frame_address(1) int32 runtime_mcount(void); int32 runtime_gcount(void); @@ -554,6 +555,24 @@ void runtime_mcall(void(*)(G*)); uint32 runtime_fastrand1(void); int32 runtime_timediv(int64, int32, int32*); +// atomic operations +#define runtime_cas(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) +#define runtime_cas64(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) +#define runtime_casp(pval, old, new) __sync_bool_compare_and_swap (pval, old, new) +// Don't confuse with XADD x86 instruction, +// this one is actually 'addx', that is, add-and-fetch. 
+#define runtime_xadd(p, v) __sync_add_and_fetch (p, v) +#define runtime_xadd64(p, v) __sync_add_and_fetch (p, v) +#define runtime_xchg(p, v) __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST) +#define runtime_xchg64(p, v) __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST) +#define runtime_xchgp(p, v) __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST) +#define runtime_atomicload(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) +#define runtime_atomicstore(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) +#define runtime_atomicstore64(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) +#define runtime_atomicload64(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) +#define runtime_atomicloadp(p) __atomic_load_n (p, __ATOMIC_SEQ_CST) +#define runtime_atomicstorep(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST) + void runtime_setmg(M*, G*); void runtime_newextram(void); #define runtime_exit(s) exit(s) @@ -561,7 +580,8 @@ void runtime_newextram(void); void runtime_gosched(void); void runtime_gosched0(G*); void runtime_schedtrace(bool); -void runtime_park(void(*)(Lock*), Lock*, const char*); +void runtime_park(bool(*)(G*, void*), void*, const char*); +void runtime_parkunlock(Lock*, const char*); void runtime_tsleep(int64, const char*); M* runtime_newm(void); void runtime_goexit(void); @@ -593,6 +613,7 @@ int32 runtime_netpollopen(uintptr, PollDesc*); int32 runtime_netpollclose(uintptr); void runtime_netpollready(G**, PollDesc*, int32); uintptr runtime_netpollfd(PollDesc*); +void runtime_netpollarm(uintptr, int32); void runtime_crash(void); void runtime_parsedebugvars(void); void _rt0_go(void); @@ -743,9 +764,6 @@ void runtime_lockOSThread(void); void runtime_unlockOSThread(void); bool runtime_showframe(String, bool); -Hchan* runtime_makechan_c(ChanType*, int64); -void runtime_chansend(ChanType*, Hchan*, byte*, bool*, void*); -void runtime_chanrecv(ChanType*, Hchan*, byte*, bool*, bool*); void runtime_printcreatedby(G*); uintptr runtime_memlimit(void); @@ -793,3 +811,5 @@ void* __go_get_closure(void); bool runtime_gcwaiting(void); void runtime_badsignal(int); +Defer* runtime_newdefer(void); +void runtime_freedefer(Defer*); diff --git a/libgo/runtime/sema.goc b/libgo/runtime/sema.goc index f5d5bc8..50f0e97 100644 --- a/libgo/runtime/sema.goc +++ b/libgo/runtime/sema.goc @@ -136,7 +136,7 @@ runtime_semacquire(uint32 volatile *addr, bool profile) // Any semrelease after the cansemacquire knows we're waiting // (we set nwait above), so go to sleep. semqueue(root, addr, &s); - runtime_park(runtime_unlock, root, "semacquire"); + runtime_parkunlock(root, "semacquire"); if(cansemacquire(addr)) { if(t0) runtime_blockevent(s.releasetime - t0, 3); @@ -259,7 +259,7 @@ func runtime_Syncsemacquire(s *SyncSema) { else s->tail->next = &w; s->tail = &w; - runtime_park(runtime_unlock, s, "semacquire"); + runtime_parkunlock(s, "semacquire"); if(t0) runtime_blockevent(w.releasetime - t0, 2); } @@ -293,7 +293,7 @@ func runtime_Syncsemrelease(s *SyncSema, n uint32) { else s->tail->next = &w; s->tail = &w; - runtime_park(runtime_unlock, s, "semarelease"); + runtime_parkunlock(s, "semarelease"); } else runtime_unlock(s); } diff --git a/libgo/runtime/signal_unix.c b/libgo/runtime/signal_unix.c index 6c191d0..66638de 100644 --- a/libgo/runtime/signal_unix.c +++ b/libgo/runtime/signal_unix.c @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-// +build darwin dragonfly freebsd linux openbsd netbsd
+// +build darwin dragonfly freebsd linux netbsd openbsd solaris
 
 #include <sys/time.h>
 
diff --git a/libgo/runtime/string.goc b/libgo/runtime/string.goc
index a7446e9..a0e80cc 100644
--- a/libgo/runtime/string.goc
+++ b/libgo/runtime/string.goc
@@ -43,11 +43,9 @@ gostringsize(intgo l, byte** pmem)
 		*pmem = nil;
 		return runtime_emptystring;
 	}
-	// leave room for NUL for C runtime (e.g., callers of getenv)
-	mem = runtime_mallocgc(l+1, 0, FlagNoScan|FlagNoZero);
+	mem = runtime_mallocgc(l, 0, FlagNoScan|FlagNoZero);
 	s.str = mem;
 	s.len = l;
-	mem[l] = 0;
 	*pmem = mem;
 	return s;
 }
diff --git a/libgo/runtime/time.goc b/libgo/runtime/time.goc
index e4e35ec..13ce41f 100644
--- a/libgo/runtime/time.goc
+++ b/libgo/runtime/time.goc
@@ -78,7 +78,7 @@ runtime_tsleep(int64 ns, const char *reason)
 	t.arg.__object = g;
 	runtime_lock(&timers);
 	addtimer(&t);
-	runtime_park(runtime_unlock, &timers, reason);
+	runtime_parkunlock(&timers, reason);
 }
 
 void
@@ -221,12 +221,20 @@ timerproc(void* dummy __attribute__ ((unused)))
 				runtime_raceacquire(t);
 			__go_set_closure(t->fv);
 			f(now, arg);
+
+			// clear f and arg to avoid leak while sleeping for next timer
+			f = nil;
+			USED(f);
+			arg.__type_descriptor = nil;
+			arg.__object = nil;
+			USED(&arg);
+
 			runtime_lock(&timers);
 		}
 		if(delta < 0) {
 			// No timers left - put goroutine to sleep.
 			timers.rescheduling = true;
-			runtime_park(runtime_unlock, &timers, "timer goroutine (idle)");
+			runtime_parkunlock(&timers, "timer goroutine (idle)");
 			continue;
 		}
 		// At least one timer pending. Sleep until then.
@@ -320,7 +328,7 @@ dumptimers(const char *msg)
 }
 
 void
-runtime_time_scan(void (*addroot)(Obj))
+runtime_time_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj))
 {
-	addroot((Obj){(byte*)&timers, sizeof timers, 0});
+	enqueue1(wbufp, (Obj){(byte*)&timers, sizeof timers, 0});
 }
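
A few illustrative sketches follow; none of this text is part of the patch itself.

The scheduler hunks above replace P's heap-allocated run queue with a fixed 256-entry ring (G* runq[256]) indexed by free-running uint32 head/tail counters: the unsigned difference tail - head gives the queue length even after the counters wrap, and a slot is addressed as counter % nelem(p->runq). The runqsteal tail shown above publishes stolen G's with runtime_atomicstore(&p->runqtail, t) as a store-release. For illustration only, a matching producer fast path could look like the sketch below; runqput_sketch is a hypothetical name, runqput itself is not in the hunks shown, and the overflow path that spills to the global run queue is omitted.

// Sketch only, not part of this patch: producer fast path for the new
// fixed-size per-P ring, assuming the P/G types, nelem and the atomic
// macros declared in runtime.h above.
static void
runqput_sketch(P *p, G *gp)
{
	uint32 h, t;

	h = runtime_atomicload(&p->runqhead);	// load-acquire, synchronize with stealers
	t = p->runqtail;			// only the owning M writes the tail
	if(t - h < nelem(p->runq)) {		// unsigned difference is correct across wraparound
		p->runq[t%nelem(p->runq)] = gp;
		runtime_atomicstore(&p->runqtail, t+1);	// store-release, publishes gp
		return;
	}
	// Ring full: the real code would move a batch of G's to the global run queue here.
}

Keeping head and tail as monotonically increasing counters means emptiness is simply head == tail and no separate size field is needed, which is why the testSchedLocalQueue functions above no longer set runqsize or malloc the runq array.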
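The new sync.runtime_procPin / sync.runtime_procUnpin entry points at the end of proc.c give the sync package a way to pin the current goroutine to its P: incrementing m->locks disables preemption, so the returned P id stays valid until the matching unpin. A hypothetical caller might use the pair as below; the array size of 256 and the function bump_local_counter are made up for illustration.

// Sketch only: hypothetical user of the pin/unpin pair added above.
intgo sync_runtime_procPin(void)
	__asm__ (GOSYM_PREFIX "sync.runtime_procPin");
void sync_runtime_procUnpin(void)
	__asm__ (GOSYM_PREFIX "sync.runtime_procUnpin");

static uint64 percpu_counter[256];	// size chosen only for illustration

static void
bump_local_counter(void)
{
	intgo pid;

	pid = sync_runtime_procPin();	// m->locks++: no preemption, so the P id stays stable
	percpu_counter[pid]++;		// exclusive use of this P's slot while pinned
	sync_runtime_procUnpin();	// m->locks--: preemption allowed again
}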
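Finally, runtime.h changes runtime_park to take a bool (*)(G*, void*) callback instead of void (*)(Lock*), and the runtime_park(runtime_unlock, l, reason) call sites in sema.goc and time.goc above become runtime_parkunlock(l, reason). The wrapper's definition lives in proc.c, outside the hunks shown here; a minimal sketch, assuming the callback only has to release the lock and confirm the park (returning false would presumably make the goroutine runnable again instead of blocking, matching the bool result of M's new waitunlockf field):

// Sketch only: one plausible shape for the convenience wrapper, given the
// new runtime_park signature declared in runtime.h above.
static bool
parkunlock_callback(G *gp, void *lock)
{
	USED(gp);
	runtime_unlock((Lock*)lock);
	return true;	// true: proceed with parking the goroutine
}

void
runtime_parkunlock(Lock *lock, const char *reason)
{
	runtime_park(parkunlock_callback, lock, reason);
}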