Diffstat (limited to 'libgo/runtime/proc.c')
-rw-r--r-- | libgo/runtime/proc.c | 720
1 file changed, 374 insertions(+), 346 deletions(-)
diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c
index defb0a5..1ac03a4 100644
--- a/libgo/runtime/proc.c
+++ b/libgo/runtime/proc.c
@@ -19,7 +19,6 @@
 #include "defs.h"
 #include "malloc.h"
 #include "go-type.h"
-#include "go-defer.h"
 
 #ifdef USING_SPLIT_STACK
 
@@ -62,7 +61,6 @@ static void gtraceback(G*);
 #endif
 
 static __thread G *g;
-static __thread M *m;
 
 #ifndef SETCONTEXT_CLOBBERS_TLS
 
@@ -179,14 +177,15 @@ M* runtime_m(void) __attribute__ ((noinline, no_split_stack));
 M*
 runtime_m(void)
 {
-	return m;
+	if(g == nil)
+		return nil;
+	return g->m;
 }
 
-// Set m and g.
+// Set g.
 void
-runtime_setmg(M* mp, G* gp)
+runtime_setg(G* gp)
 {
-	m = mp;
 	g = gp;
 }
 
@@ -242,12 +241,12 @@
 void
 runtime_gogo(G* newg)
 {
 #ifdef USING_SPLIT_STACK
-	__splitstack_setcontext(&newg->stack_context[0]);
+	__splitstack_setcontext(&newg->stackcontext[0]);
 #endif
 	g = newg;
 	newg->fromgogo = true;
-	fixcontext(&newg->context);
-	setcontext(&newg->context);
+	fixcontext((ucontext_t*)&newg->context[0]);
+	setcontext((ucontext_t*)&newg->context[0]);
 	runtime_throw("gogo setcontext returned");
 }
 
@@ -266,37 +265,37 @@ runtime_mcall(void (*pfn)(G*))
 	// collector.
 	__builtin_unwind_init();
 
-	mp = m;
 	gp = g;
+	mp = gp->m;
 	if(gp == mp->g0)
 		runtime_throw("runtime: mcall called on m->g0 stack");
 
 	if(gp != nil) {
 
 #ifdef USING_SPLIT_STACK
-		__splitstack_getcontext(&g->stack_context[0]);
+		__splitstack_getcontext(&g->stackcontext[0]);
 #else
-		gp->gcnext_sp = &pfn;
+		gp->gcnextsp = &pfn;
 #endif
 		gp->fromgogo = false;
-		getcontext(&gp->context);
+		getcontext((ucontext_t*)&gp->context[0]);
 
 		// When we return from getcontext, we may be running
-		// in a new thread. That means that m and g may have
-		// changed. They are global variables so we will
-		// reload them, but the addresses of m and g may be
-		// cached in our local stack frame, and those
-		// addresses may be wrong. Call functions to reload
-		// the values for this thread.
-		mp = runtime_m();
+		// in a new thread. That means that g may have
+		// changed. It is a global variable so we will
+		// reload it, but the address of g may be cached in
+		// our local stack frame, and that address may be
+		// wrong. Call the function to reload the value for
+		// this thread.
 		gp = runtime_g();
+		mp = gp->m;
 
 		if(gp->traceback != nil)
 			gtraceback(gp);
 	}
 	if (gp == nil || !gp->fromgogo) {
 #ifdef USING_SPLIT_STACK
-		__splitstack_setcontext(&mp->g0->stack_context[0]);
+		__splitstack_setcontext(&mp->g0->stackcontext[0]);
 #endif
 		mp->g0->entry = (byte*)pfn;
 		mp->g0->param = gp;
@@ -306,8 +305,8 @@ runtime_mcall(void (*pfn)(G*))
 		// the getcontext call just above.
 		g = mp->g0;
 
-		fixcontext(&mp->g0->context);
-		setcontext(&mp->g0->context);
+		fixcontext((ucontext_t*)&mp->g0->context[0]);
+		setcontext((ucontext_t*)&mp->g0->context[0]);
 		runtime_throw("runtime: mcall function returned");
 	}
 }
@@ -360,10 +359,6 @@ struct Sched {
 
 enum
 {
-	// The max value of GOMAXPROCS.
-	// There are no fundamental restrictions on the value.
-	MaxGomaxprocs = 1<<8,
-
 	// Number of goroutine ids to grab from runtime_sched.goidgen to local per-P cache at once.
 	// 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
 	GoidCacheBatch = 16,
@@ -442,6 +437,7 @@ bool runtime_isstarted;
 void
 runtime_schedinit(void)
 {
+	M *m;
 	int32 n, procs;
 	String s;
 	const byte *p;
@@ -481,11 +477,11 @@ runtime_schedinit(void)
 	s = runtime_getenv("GOMAXPROCS");
 	p = s.str;
 	if(p != nil && (n = runtime_atoi(p, s.len)) > 0) {
-		if(n > MaxGomaxprocs)
-			n = MaxGomaxprocs;
+		if(n > _MaxGomaxprocs)
+			n = _MaxGomaxprocs;
 		procs = n;
 	}
-	runtime_allp = runtime_malloc((MaxGomaxprocs+1)*sizeof(runtime_allp[0]));
+	runtime_allp = runtime_malloc((_MaxGomaxprocs+1)*sizeof(runtime_allp[0]));
 	procresize(procs);
 
 	// Can not enable GC until all roots are registered.
@@ -583,17 +579,17 @@ runtime_main(void* dummy __attribute__((unused)))
 	runtime_lockOSThread();
 	// Defer unlock so that runtime.Goexit during init does the unlock too.
-	d.__pfn = initDone;
-	d.__next = g->defer;
-	d.__arg = (void*)-1;
-	d.__panic = g->panic;
-	d.__retaddr = nil;
-	d.__makefunc_can_recover = 0;
-	d.__frame = &frame;
-	d.__special = true;
-	g->defer = &d;
-
-	if(m != &runtime_m0)
+	d.pfn = (uintptr)(void*)initDone;
+	d.next = g->_defer;
+	d.arg = (void*)-1;
+	d._panic = g->_panic;
+	d.retaddr = 0;
+	d.makefunccanrecover = 0;
+	d.frame = &frame;
+	d.special = true;
+	g->_defer = &d;
+
+	if(g->m != &runtime_m0)
 		runtime_throw("runtime_main not on m0");
 	__go_go(runtime_MHeap_Scavenger, nil);
@@ -605,9 +601,9 @@
 	closechan(runtime_main_init_done);
 
-	if(g->defer != &d || d.__pfn != initDone)
+	if(g->_defer != &d || (void*)d.pfn != initDone)
 		runtime_throw("runtime: bad defer entry after init");
-	g->defer = d.__next;
+	g->_defer = d.next;
 	runtime_unlockOSThread();
 
 	// For gccgo we have to wait until after main is initialized
@@ -640,42 +636,42 @@
 void
 runtime_goroutineheader(G *gp)
 {
-	const char *status;
+	String status;
 	int64 waitfor;
 
-	switch(gp->status) {
-	case Gidle:
-		status = "idle";
+	switch(gp->atomicstatus) {
+	case _Gidle:
+		status = runtime_gostringnocopy((const byte*)"idle");
 		break;
-	case Grunnable:
-		status = "runnable";
+	case _Grunnable:
+		status = runtime_gostringnocopy((const byte*)"runnable");
 		break;
-	case Grunning:
-		status = "running";
+	case _Grunning:
+		status = runtime_gostringnocopy((const byte*)"running");
 		break;
-	case Gsyscall:
-		status = "syscall";
+	case _Gsyscall:
+		status = runtime_gostringnocopy((const byte*)"syscall");
 		break;
-	case Gwaiting:
-		if(gp->waitreason)
+	case _Gwaiting:
+		if(gp->waitreason.len > 0)
 			status = gp->waitreason;
 		else
-			status = "waiting";
+			status = runtime_gostringnocopy((const byte*)"waiting");
 		break;
 	default:
-		status = "???";
+		status = runtime_gostringnocopy((const byte*)"???");
 		break;
 	}
 
 	// approx time the G is blocked, in minutes
 	waitfor = 0;
-	if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince != 0)
+	if((gp->atomicstatus == _Gwaiting || gp->atomicstatus == _Gsyscall) && gp->waitsince != 0)
 		waitfor = (runtime_nanotime() - gp->waitsince) / (60LL*1000*1000*1000);
 	if(waitfor < 1)
-		runtime_printf("goroutine %D [%s]:\n", gp->goid, status);
+		runtime_printf("goroutine %D [%S]:\n", gp->goid, status);
 	else
-		runtime_printf("goroutine %D [%s, %D minutes]:\n", gp->goid, status, waitfor);
+		runtime_printf("goroutine %D [%S, %D minutes]:\n", gp->goid, status, waitfor);
 }
 
@@ -693,13 +689,6 @@ runtime_printcreatedby(G *g)
 	}
 }
 
-struct Traceback
-{
-	G* gp;
-	Location locbuf[TracebackMaxFrames];
-	int32 c;
-};
-
 void
 runtime_tracebackothers(G * volatile me)
 {
@@ -712,15 +701,15 @@ runtime_tracebackothers(G * volatile me)
 	traceback = runtime_gotraceback(nil);
 
 	// Show the current goroutine first, if we haven't already.
-	if((gp = m->curg) != nil && gp != me) {
+	if((gp = g->m->curg) != nil && gp != me) {
 		runtime_printf("\n");
 		runtime_goroutineheader(gp);
 		gp->traceback = &tb;
 
 #ifdef USING_SPLIT_STACK
-		__splitstack_getcontext(&me->stack_context[0]);
+		__splitstack_getcontext(&me->stackcontext[0]);
 #endif
-		getcontext(&me->context);
+		getcontext((ucontext_t*)&me->context[0]);
 
 		if(gp->traceback != nil) {
 			runtime_gogo(gp);
@@ -733,7 +722,7 @@ runtime_tracebackothers(G * volatile me)
 	runtime_lock(&allglock);
 	for(i = 0; i < runtime_allglen; i++) {
 		gp = runtime_allg[i];
-		if(gp == me || gp == m->curg || gp->status == Gdead)
+		if(gp == me || gp == g->m->curg || gp->atomicstatus == _Gdead)
 			continue;
 		if(gp->issystem && traceback < 2)
 			continue;
@@ -749,19 +738,19 @@ runtime_tracebackothers(G * volatile me)
 		// This means that if g is running or in a syscall, we
 		// can't reliably print a stack trace. FIXME.
 
-		if(gp->status == Grunning) {
+		if(gp->atomicstatus == _Grunning) {
 			runtime_printf("\tgoroutine running on other thread; stack unavailable\n");
 			runtime_printcreatedby(gp);
-		} else if(gp->status == Gsyscall) {
+		} else if(gp->atomicstatus == _Gsyscall) {
 			runtime_printf("\tgoroutine in C code; stack unavailable\n");
 			runtime_printcreatedby(gp);
 		} else {
 			gp->traceback = &tb;
 
 #ifdef USING_SPLIT_STACK
-			__splitstack_getcontext(&me->stack_context[0]);
+			__splitstack_getcontext(&me->stackcontext[0]);
 #endif
-			getcontext(&me->context);
+			getcontext((ucontext_t*)&me->context[0]);
 
 			if(gp->traceback != nil) {
 				runtime_gogo(gp);
@@ -794,8 +783,12 @@ gtraceback(G* gp)
 	traceback = gp->traceback;
 	gp->traceback = nil;
+	if(gp->m != nil)
+		runtime_throw("gtraceback: m is not nil");
+	gp->m = traceback->gp->m;
 	traceback->c = runtime_callers(1, traceback->locbuf,
 		sizeof traceback->locbuf / sizeof traceback->locbuf[0], false);
+	gp->m = nil;
 	runtime_gogo(traceback->gp);
 }
 
@@ -804,7 +797,7 @@ mcommoninit(M *mp)
 {
 	// If there is no mcache runtime_callers() will crash,
 	// and we are most likely in sysmon thread so the stack is senseless anyway.
-	if(m->mcache)
+	if(g->m->mcache)
 		runtime_callers(1, mp->createstack, nelem(mp->createstack), false);
 
 	mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();
@@ -828,16 +821,16 @@ void
 runtime_ready(G *gp)
 {
 	// Mark runnable.
-	m->locks++;  // disable preemption because it can be holding p in a local var
-	if(gp->status != Gwaiting) {
-		runtime_printf("goroutine %D has status %d\n", gp->goid, gp->status);
-		runtime_throw("bad g->status in ready");
+	g->m->locks++;  // disable preemption because it can be holding p in a local var
+	if(gp->atomicstatus != _Gwaiting) {
+		runtime_printf("goroutine %D has status %d\n", gp->goid, gp->atomicstatus);
+		runtime_throw("bad g->atomicstatus in ready");
 	}
-	gp->status = Grunnable;
-	runqput(m->p, gp);
+	gp->atomicstatus = _Grunnable;
+	runqput((P*)g->m->p, gp);
 	if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0)  // TODO: fast atomic
 		wakep();
-	m->locks--;
+	g->m->locks--;
 }
 
 int32
@@ -884,7 +877,7 @@ runtime_helpgc(int32 nproc)
 	runtime_lock(&runtime_sched);
 	pos = 0;
 	for(n = 1; n < nproc; n++) {  // one M is currently running
-		if(runtime_allp[pos]->mcache == m->mcache)
+		if(runtime_allp[pos]->mcache == g->m->mcache)
 			pos++;
 		mp = mget();
 		if(mp == nil)
@@ -938,18 +931,18 @@ runtime_stoptheworld(void)
 	runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
 	preemptall();
 	// stop current P
-	m->p->status = Pgcstop;
+	((P*)g->m->p)->status = _Pgcstop;
 	runtime_sched.stopwait--;
-	// try to retake all P's in Psyscall status
+	// try to retake all P's in _Psyscall status
 	for(i = 0; i < runtime_gomaxprocs; i++) {
 		p = runtime_allp[i];
 		s = p->status;
-		if(s == Psyscall && runtime_cas(&p->status, s, Pgcstop))
+		if(s == _Psyscall && runtime_cas(&p->status, s, _Pgcstop))
 			runtime_sched.stopwait--;
 	}
 	// stop idle P's
 	while((p = pidleget()) != nil) {
-		p->status = Pgcstop;
+		p->status = _Pgcstop;
 		runtime_sched.stopwait--;
 	}
 	wait = runtime_sched.stopwait > 0;
@@ -964,7 +957,7 @@ runtime_stoptheworld(void)
 			runtime_throw("stoptheworld: not stopped");
 	for(i = 0; i < runtime_gomaxprocs; i++) {
 		p = runtime_allp[i];
-		if(p->status != Pgcstop)
+		if(p->status != _Pgcstop)
 			runtime_throw("stoptheworld: not stopped");
 	}
 }
@@ -972,7 +965,7 @@ runtime_stoptheworld(void)
 static void
 mhelpgc(void)
 {
-	m->helpgc = -1;
+	g->m->helpgc = -1;
 }
 
 void
@@ -983,7 +976,7 @@ runtime_starttheworld(void)
 	G *gp;
 	bool add;
 
-	m->locks++;  // disable preemption because it can be holding p in a local var
+	g->m->locks++;  // disable preemption because it can be holding p in a local var
 	gp = runtime_netpoll(false);  // non-blocking
 	injectglist(gp);
 	add = needaddgcproc();
@@ -1003,8 +996,8 @@ runtime_starttheworld(void)
 			pidleput(p);
 			break;
 		}
-		p->m = mget();
-		p->link = p1;
+		p->m = (uintptr)mget();
+		p->link = (uintptr)p1;
 		p1 = p;
 	}
 	if(runtime_sched.sysmonwait) {
@@ -1015,13 +1008,13 @@ runtime_starttheworld(void)
 
 	while(p1) {
 		p = p1;
-		p1 = p1->link;
+		p1 = (P*)p1->link;
 		if(p->m) {
-			mp = p->m;
-			p->m = nil;
+			mp = (M*)p->m;
+			p->m = 0;
 			if(mp->nextp)
 				runtime_throw("starttheworld: inconsistent mp->nextp");
-			mp->nextp = p;
+			mp->nextp = (uintptr)p;
 			runtime_notewakeup(&mp->park);
 		} else {
 			// Start M to run P. Do not start another M below.
@@ -1040,15 +1033,18 @@ runtime_starttheworld(void)
 		// the maximum number of procs.
 		newm(mhelpgc, nil);
 	}
-	m->locks--;
+	g->m->locks--;
 }
 
 // Called to start an M.
 void*
 runtime_mstart(void* mp)
 {
+	M *m;
+	m = (M*)mp;
 	g = m->g0;
+	g->m = m;
 
 	initcontext();
 
@@ -1059,15 +1055,15 @@ runtime_mstart(void* mp)
 	// Once we call schedule we're never coming back,
 	// so other calls can reuse this stack space.
 #ifdef USING_SPLIT_STACK
-	__splitstack_getcontext(&g->stack_context[0]);
+	__splitstack_getcontext(&g->stackcontext[0]);
 #else
-	g->gcinitial_sp = &mp;
-	// Setting gcstack_size to 0 is a marker meaning that gcinitial_sp
+	g->gcinitialsp = &mp;
+	// Setting gcstacksize to 0 is a marker meaning that gcinitialsp
 	// is the top of the stack, not the bottom.
-	g->gcstack_size = 0;
-	g->gcnext_sp = &mp;
+	g->gcstacksize = 0;
+	g->gcnextsp = &mp;
 #endif
-	getcontext(&g->context);
+	getcontext((ucontext_t*)&g->context[0]);
 
 	if(g->entry != nil) {
 		// Got here from mcall.
@@ -1097,14 +1093,14 @@ runtime_mstart(void* mp)
 	}
 
 	if(m->mstartfn)
-		m->mstartfn();
+		((void (*)(void))m->mstartfn)();
 
 	if(m->helpgc) {
 		m->helpgc = 0;
 		stopm();
 	} else if(m != &runtime_m0) {
-		acquirep(m->nextp);
-		m->nextp = nil;
+		acquirep((P*)m->nextp);
+		m->nextp = 0;
 	}
 	schedule();
 
@@ -1127,12 +1123,12 @@ struct CgoThreadStart
 // Allocate a new m unassociated with any thread.
 // Can use p for allocation context if needed.
 M*
-runtime_allocm(P *p, int32 stacksize, byte** ret_g0_stack, size_t* ret_g0_stacksize)
+runtime_allocm(P *p, int32 stacksize, byte** ret_g0_stack, uintptr* ret_g0_stacksize)
 {
 	M *mp;
 
-	m->locks++;  // disable GC because it can be called from sysmon
-	if(m->p == nil)
+	g->m->locks++;  // disable GC because it can be called from sysmon
+	if(g->m->p == 0)
 		acquirep(p);  // temporarily borrow p for mallocs in this function
 #if 0
 	if(mtype == nil) {
@@ -1145,10 +1141,11 @@ runtime_allocm(P *p, int32 stacksize, byte** ret_g0_stack, size_t* ret_g0_stacks
 	mp = runtime_mal(sizeof *mp);
 	mcommoninit(mp);
 	mp->g0 = runtime_malg(stacksize, ret_g0_stack, ret_g0_stacksize);
+	mp->g0->m = mp;
 
-	if(p == m->p)
+	if(p == (P*)g->m->p)
 		releasep();
-	m->locks--;
+	g->m->locks--;
 
 	return mp;
 }
@@ -1235,26 +1232,26 @@ runtime_needm(void)
 	// after exitsyscall makes sure it is okay to be
 	// running at all (that is, there's no garbage collection
 	// running right now).
-	mp->needextram = mp->schedlink == nil;
-	unlockextra(mp->schedlink);
+	mp->needextram = mp->schedlink == 0;
+	unlockextra((M*)mp->schedlink);
 
-	// Install m and g (= m->curg).
-	runtime_setmg(mp, mp->curg);
+	// Install g (= m->curg).
+	runtime_setg(mp->curg);
 
 	// Initialize g's context as in mstart.
 	initcontext();
-	g->status = Gsyscall;
+	g->atomicstatus = _Gsyscall;
 	g->entry = nil;
 	g->param = nil;
 #ifdef USING_SPLIT_STACK
-	__splitstack_getcontext(&g->stack_context[0]);
+	__splitstack_getcontext(&g->stackcontext[0]);
 #else
-	g->gcinitial_sp = &mp;
+	g->gcinitialsp = &mp;
 	g->gcstack = nil;
-	g->gcstack_size = 0;
-	g->gcnext_sp = &mp;
+	g->gcstacksize = 0;
+	g->gcnextsp = &mp;
 #endif
-	getcontext(&g->context);
+	getcontext((ucontext_t*)&g->context[0]);
 
 	if(g->entry != nil) {
 		// Got here from mcall.
@@ -1284,7 +1281,7 @@ runtime_newextram(void)
 	M *mp, *mnext;
 	G *gp;
 	byte *g0_sp, *sp;
-	size_t g0_spsize, spsize;
+	uintptr g0_spsize, spsize;
 
 	// Create extra goroutine locked to extra m.
 	// The goroutine is the context in which the cgo callback will run.
@@ -1293,9 +1290,10 @@ runtime_newextram(void)
 	// the goroutine stack ends.
 	mp = runtime_allocm(nil, StackMin, &g0_sp, &g0_spsize);
 	gp = runtime_malg(StackMin, &sp, &spsize);
-	gp->status = Gdead;
+	gp->atomicstatus = _Gdead;
+	gp->m = mp;
 	mp->curg = gp;
-	mp->locked = LockInternal;
+	mp->locked = _LockInternal;
 	mp->lockedg = gp;
 	gp->lockedm = mp;
 	gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
@@ -1304,14 +1302,14 @@ runtime_newextram(void)
 	// The context for gp will be set up in runtime_needm. But
 	// here we need to set up the context for g0.
-	getcontext(&mp->g0->context);
-	mp->g0->context.uc_stack.ss_sp = g0_sp;
-	mp->g0->context.uc_stack.ss_size = g0_spsize;
-	makecontext(&mp->g0->context, kickoff, 0);
+	getcontext((ucontext_t*)&mp->g0->context[0]);
+	((ucontext_t*)&mp->g0->context[0])->uc_stack.ss_sp = g0_sp;
+	((ucontext_t*)&mp->g0->context[0])->uc_stack.ss_size = (size_t)g0_spsize;
+	makecontext((ucontext_t*)&mp->g0->context[0], kickoff, 0);
 
 	// Add m to the extra list.
 	mnext = lockextra(true);
-	mp->schedlink = mnext;
+	mp->schedlink = (uintptr)mnext;
 	unlockextra(mp);
 }
 
@@ -1347,16 +1345,16 @@ runtime_dropm(void)
 	runtime_unminit();
 
 	// Clear m and g, and return m to the extra list.
-	// After the call to setmg we can only call nosplit functions.
-	mp = m;
-	runtime_setmg(nil, nil);
+	// After the call to setg we can only call nosplit functions.
+	mp = g->m;
+	runtime_setg(nil);
 
-	mp->curg->status = Gdead;
+	mp->curg->atomicstatus = _Gdead;
 	mp->curg->gcstack = nil;
-	mp->curg->gcnext_sp = nil;
+	mp->curg->gcnextsp = nil;
 
 	mnext = lockextra(true);
-	mp->schedlink = mnext;
+	mp->schedlink = (uintptr)mnext;
 	unlockextra(mp);
 }
 
@@ -1417,7 +1415,7 @@ countextra()
 			continue;
 		}
 		c = 0;
-		for(mc = mp; mc != nil; mc = mc->schedlink)
+		for(mc = mp; mc != nil; mc = (M*)mc->schedlink)
 			c++;
 		runtime_atomicstorep(&runtime_extram, mp);
 		return c;
@@ -1431,8 +1429,8 @@ newm(void(*fn)(void), P *p)
 	M *mp;
 
 	mp = runtime_allocm(p, -1, nil, nil);
-	mp->nextp = p;
-	mp->mstartfn = fn;
+	mp->nextp = (uintptr)p;
+	mp->mstartfn = (uintptr)(void*)fn;
 	runtime_newosproc(mp);
 }
 
@@ -1442,6 +1440,9 @@ newm(void(*fn)(void), P *p)
 static void
 stopm(void)
 {
+	M* m;
+
+	m = g->m;
 	if(m->locks)
 		runtime_throw("stopm holding locks");
 	if(m->p)
@@ -1456,6 +1457,7 @@ retry:
 	mput(m);
 	runtime_unlock(&runtime_sched);
 	runtime_notesleep(&m->park);
+	m = g->m;
 	runtime_noteclear(&m->park);
 	if(m->helpgc) {
 		runtime_gchelper();
@@ -1463,14 +1465,14 @@ retry:
 		m->mcache = nil;
 		goto retry;
 	}
-	acquirep(m->nextp);
-	m->nextp = nil;
+	acquirep((P*)m->nextp);
+	m->nextp = 0;
}
 
 static void
 mspinning(void)
 {
-	m->spinning = true;
+	g->m->spinning = true;
 }
 
 // Schedules some M to run the p (creates an M if necessary).
@@ -1505,7 +1507,7 @@ startm(P *p, bool spinning)
 	if(mp->nextp)
 		runtime_throw("startm: m has p");
 	mp->spinning = spinning;
-	mp->nextp = p;
+	mp->nextp = (uintptr)p;
 	runtime_notewakeup(&mp->park);
 }
 
@@ -1527,7 +1529,7 @@ handoffp(P *p)
 	}
 	runtime_lock(&runtime_sched);
 	if(runtime_sched.gcwaiting) {
-		p->status = Pgcstop;
+		p->status = _Pgcstop;
 		if(--runtime_sched.stopwait == 0)
 			runtime_notewakeup(&runtime_sched.stopnote);
 		runtime_unlock(&runtime_sched);
@@ -1565,8 +1567,10 @@ wakep(void)
 static void
 stoplockedm(void)
 {
+	M *m;
 	P *p;
 
+	m = g->m;
 	if(m->lockedg == nil || m->lockedg->lockedm != m)
 		runtime_throw("stoplockedm: inconsistent locking");
 	if(m->p) {
@@ -1577,11 +1581,12 @@ stoplockedm(void)
 	incidlelocked(1);
 	// Wait until another thread schedules lockedg again.
 	runtime_notesleep(&m->park);
+	m = g->m;
 	runtime_noteclear(&m->park);
-	if(m->lockedg->status != Grunnable)
+	if(m->lockedg->atomicstatus != _Grunnable)
 		runtime_throw("stoplockedm: not runnable");
-	acquirep(m->nextp);
-	m->nextp = nil;
+	acquirep((P*)m->nextp);
+	m->nextp = 0;
 }
 
 // Schedules the locked m to run the locked gp.
@@ -1592,14 +1597,14 @@ startlockedm(G *gp)
 	P *p;
 
 	mp = gp->lockedm;
-	if(mp == m)
+	if(mp == g->m)
 		runtime_throw("startlockedm: locked to me");
 	if(mp->nextp)
 		runtime_throw("startlockedm: m has p");
 	// directly handoff current P to the locked m
 	incidlelocked(-1);
 	p = releasep();
-	mp->nextp = p;
+	mp->nextp = (uintptr)p;
 	runtime_notewakeup(&mp->park);
 	stopm();
 }
@@ -1613,13 +1618,13 @@ gcstopm(void)
 
 	if(!runtime_sched.gcwaiting)
 		runtime_throw("gcstopm: not waiting for gc");
-	if(m->spinning) {
-		m->spinning = false;
+	if(g->m->spinning) {
+		g->m->spinning = false;
 		runtime_xadd(&runtime_sched.nmspinning, -1);
 	}
 	p = releasep();
 	runtime_lock(&runtime_sched);
-	p->status = Pgcstop;
+	p->status = _Pgcstop;
 	if(--runtime_sched.stopwait == 0)
 		runtime_notewakeup(&runtime_sched.stopnote);
 	runtime_unlock(&runtime_sched);
@@ -1633,19 +1638,19 @@ execute(G *gp)
 {
 	int32 hz;
 
-	if(gp->status != Grunnable) {
-		runtime_printf("execute: bad g status %d\n", gp->status);
+	if(gp->atomicstatus != _Grunnable) {
+		runtime_printf("execute: bad g status %d\n", gp->atomicstatus);
 		runtime_throw("execute: bad g status");
 	}
-	gp->status = Grunning;
+	gp->atomicstatus = _Grunning;
 	gp->waitsince = 0;
-	m->p->schedtick++;
-	m->curg = gp;
-	gp->m = m;
+	((P*)g->m->p)->schedtick++;
+	g->m->curg = gp;
+	gp->m = g->m;
 
 	// Check whether the profiler needs to be turned on or off.
 	hz = runtime_sched.profilehz;
-	if(m->profilehz != hz)
+	if(g->m->profilehz != hz)
 		runtime_resetcpuprofiler(hz);
 
 	runtime_gogo(gp);
@@ -1668,13 +1673,13 @@ top:
 	if(runtime_fingwait && runtime_fingwake && (gp = runtime_wakefing()) != nil)
 		runtime_ready(gp);
 	// local runq
-	gp = runqget(m->p);
+	gp = runqget((P*)g->m->p);
 	if(gp)
 		return gp;
 
 	// global runq
 	if(runtime_sched.runqsize) {
 		runtime_lock(&runtime_sched);
-		gp = globrunqget(m->p, 0);
+		gp = globrunqget((P*)g->m->p, 0);
 		runtime_unlock(&runtime_sched);
 		if(gp)
 			return gp;
@@ -1682,17 +1687,17 @@ top:
 	// poll network
 	gp = runtime_netpoll(false);  // non-blocking
 	if(gp) {
-		injectglist(gp->schedlink);
-		gp->status = Grunnable;
+		injectglist((G*)gp->schedlink);
+		gp->atomicstatus = _Grunnable;
 		return gp;
 	}
 
 	// If number of spinning M's >= number of busy P's, block.
 	// This is necessary to prevent excessive CPU consumption
 	// when GOMAXPROCS>>1 but the program parallelism is low.
-	if(!m->spinning && 2 * runtime_atomicload(&runtime_sched.nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched.npidle))  // TODO: fast atomic
+	if(!g->m->spinning && 2 * runtime_atomicload(&runtime_sched.nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched.npidle))  // TODO: fast atomic
 		goto stop;
-	if(!m->spinning) {
-		m->spinning = true;
+	if(!g->m->spinning) {
+		g->m->spinning = true;
 		runtime_xadd(&runtime_sched.nmspinning, 1);
 	}
 	// random steal from other P's
@@ -1700,10 +1705,10 @@ top:
 		if(runtime_sched.gcwaiting)
 			goto top;
 		p = runtime_allp[runtime_fastrand1()%runtime_gomaxprocs];
-		if(p == m->p)
+		if(p == (P*)g->m->p)
 			gp = runqget(p);
 		else
-			gp = runqsteal(m->p, p);
+			gp = runqsteal((P*)g->m->p, p);
 		if(gp)
 			return gp;
 	}
@@ -1715,15 +1720,15 @@ stop:
 		goto top;
 	}
 	if(runtime_sched.runqsize) {
-		gp = globrunqget(m->p, 0);
+		gp = globrunqget((P*)g->m->p, 0);
 		runtime_unlock(&runtime_sched);
 		return gp;
 	}
 	p = releasep();
 	pidleput(p);
 	runtime_unlock(&runtime_sched);
-	if(m->spinning) {
-		m->spinning = false;
+	if(g->m->spinning) {
+		g->m->spinning = false;
 		runtime_xadd(&runtime_sched.nmspinning, -1);
 	}
 	// check all runqueues once again
@@ -1742,9 +1747,9 @@ stop:
 	}
 	// poll network
 	if(runtime_xchg64(&runtime_sched.lastpoll, 0) != 0) {
-		if(m->p)
+		if(g->m->p)
 			runtime_throw("findrunnable: netpoll with p");
-		if(m->spinning)
+		if(g->m->spinning)
 			runtime_throw("findrunnable: netpoll with spinning");
 		gp = runtime_netpoll(true);  // block until new work is available
 		runtime_atomicstore64(&runtime_sched.lastpoll, runtime_nanotime());
@@ -1754,8 +1759,8 @@ stop:
 		runtime_unlock(&runtime_sched);
 		if(p) {
 			acquirep(p);
-			injectglist(gp->schedlink);
-			gp->status = Grunnable;
+			injectglist((G*)gp->schedlink);
+			gp->atomicstatus = _Grunnable;
 			return gp;
 		}
 		injectglist(gp);
@@ -1770,8 +1775,8 @@ resetspinning(void)
{
 	int32 nmspinning;
 
-	if(m->spinning) {
-		m->spinning = false;
+	if(g->m->spinning) {
+		g->m->spinning = false;
 		nmspinning = runtime_xadd(&runtime_sched.nmspinning, -1);
 		if(nmspinning < 0)
 			runtime_throw("findrunnable: negative nmspinning");
@@ -1797,8 +1802,8 @@ injectglist(G *glist)
 	runtime_lock(&runtime_sched);
 	for(n = 0; glist; n++) {
 		gp = glist;
-		glist = gp->schedlink;
-		gp->status = Grunnable;
+		glist = (G*)gp->schedlink;
+		gp->atomicstatus = _Grunnable;
 		globrunqput(gp);
 	}
 	runtime_unlock(&runtime_sched);
@@ -1815,7 +1820,7 @@ schedule(void)
 	G *gp;
 	uint32 tick;
 
-	if(m->locks)
+	if(g->m->locks)
 		runtime_throw("schedule: holding locks");
 
 top:
@@ -1828,19 +1833,19 @@ top:
 	// Check the global runnable queue once in a while to ensure fairness.
 	// Otherwise two goroutines can completely occupy the local runqueue
 	// by constantly respawning each other.
-	tick = m->p->schedtick;
+	tick = ((P*)g->m->p)->schedtick;
 	// This is a fancy way to say tick%61==0,
 	// it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors.
 	if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched.runqsize > 0) {
 		runtime_lock(&runtime_sched);
-		gp = globrunqget(m->p, 1);
+		gp = globrunqget((P*)g->m->p, 1);
 		runtime_unlock(&runtime_sched);
 		if(gp)
 			resetspinning();
 	}
 	if(gp == nil) {
-		gp = runqget(m->p);
-		if(gp && m->spinning)
+		gp = runqget((P*)g->m->p);
+		if(gp && g->m->spinning)
 			runtime_throw("schedule: spinning with local work");
 	}
 	if(gp == nil) {
@@ -1863,11 +1868,11 @@ top:
 void
 runtime_park(bool(*unlockf)(G*, void*), void *lock, const char *reason)
 {
-	if(g->status != Grunning)
+	if(g->atomicstatus != _Grunning)
 		runtime_throw("bad g status");
-	m->waitlock = lock;
-	m->waitunlockf = unlockf;
-	g->waitreason = reason;
+	g->m->waitlock = lock;
+	g->m->waitunlockf = unlockf;
+	g->waitreason = runtime_gostringnocopy((const byte*)reason);
 	runtime_mcall(park0);
 }
 
@@ -1891,17 +1896,19 @@ runtime_parkunlock(Lock *lock, const char *reason)
 static void
 park0(G *gp)
 {
+	M *m;
 	bool ok;
 
-	gp->status = Gwaiting;
+	m = g->m;
+	gp->atomicstatus = _Gwaiting;
 	gp->m = nil;
 	m->curg = nil;
 	if(m->waitunlockf) {
-		ok = m->waitunlockf(gp, m->waitlock);
+		ok = ((bool (*)(G*, void*))m->waitunlockf)(gp, m->waitlock);
 		m->waitunlockf = nil;
 		m->waitlock = nil;
 		if(!ok) {
-			gp->status = Grunnable;
+			gp->atomicstatus = _Grunnable;
 			execute(gp);  // Schedule it back, never returns.
 		}
 	}
@@ -1916,7 +1923,7 @@ park0(G *gp)
 void
 runtime_gosched(void)
 {
-	if(g->status != Grunning)
+	if(g->atomicstatus != _Grunning)
 		runtime_throw("bad g status");
 	runtime_mcall(runtime_gosched0);
 }
@@ -1925,7 +1932,10 @@ runtime_gosched(void)
 void
 runtime_gosched0(G *gp)
 {
-	gp->status = Grunnable;
+	M *m;
+
+	m = g->m;
+	gp->atomicstatus = _Grunnable;
 	gp->m = nil;
 	m->curg = nil;
 	runtime_lock(&runtime_sched);
@@ -1946,7 +1956,7 @@ void runtime_goexit(void) __attribute__ ((noinline));
 void
 runtime_goexit(void)
 {
-	if(g->status != Grunning)
+	if(g->atomicstatus != _Grunning)
 		runtime_throw("bad g status");
 	runtime_mcall(goexit0);
 }
@@ -1955,25 +1965,28 @@ runtime_goexit(void)
 static void
 goexit0(G *gp)
 {
-	gp->status = Gdead;
+	M *m;
+
+	m = g->m;
+	gp->atomicstatus = _Gdead;
 	gp->entry = nil;
 	gp->m = nil;
 	gp->lockedm = nil;
 	gp->paniconfault = 0;
-	gp->defer = nil;  // should be true already but just in case.
-	gp->panic = nil;  // non-nil for Goexit during panic. points at stack-allocated data.
+	gp->_defer = nil;  // should be true already but just in case.
+	gp->_panic = nil;  // non-nil for Goexit during panic. points at stack-allocated data.
 	gp->writenbuf = 0;
 	gp->writebuf = nil;
-	gp->waitreason = nil;
+	gp->waitreason = runtime_gostringnocopy(nil);
 	gp->param = nil;
 	m->curg = nil;
 	m->lockedg = nil;
-	if(m->locked & ~LockExternal) {
+	if(m->locked & ~_LockExternal) {
 		runtime_printf("invalid m->locked = %d\n", m->locked);
 		runtime_throw("internal lockOSThread error");
 	}
 	m->locked = 0;
-	gfput(m->p, gp);
+	gfput((P*)m->p, gp);
 	schedule();
 }
 
@@ -1994,7 +2007,7 @@ runtime_entersyscall()
 {
 	// Save the registers in the g structure so that any pointers
 	// held in registers will be seen by the garbage collector.
-	getcontext(&g->gcregs);
+	getcontext((ucontext_t*)&g->gcregs[0]);
 
 	// Do the work in a separate function, so that this function
 	// doesn't save any registers on its own stack. If this
@@ -2011,24 +2024,24 @@ runtime_entersyscall()
 static void
 doentersyscall()
 {
-	// Disable preemption because during this function g is in Gsyscall status,
+	// Disable preemption because during this function g is in _Gsyscall status,
 	// but can have inconsistent g->sched, do not let GC observe it.
-	m->locks++;
+	g->m->locks++;
 
 	// Leave SP around for GC and traceback.
 #ifdef USING_SPLIT_STACK
-	g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
-				       &g->gcnext_segment, &g->gcnext_sp,
-				       &g->gcinitial_sp);
+	g->gcstack = __splitstack_find(nil, nil, &g->gcstacksize,
+				       &g->gcnextsegment, &g->gcnextsp,
+				       &g->gcinitialsp);
 #else
 	{
 		void *v;
 
-		g->gcnext_sp = (byte *) &v;
+		g->gcnextsp = (byte *) &v;
 	}
 #endif
 
-	g->status = Gsyscall;
+	g->atomicstatus = _Gsyscall;
 
 	if(runtime_atomicload(&runtime_sched.sysmonwait)) {  // TODO: fast atomic
 		runtime_lock(&runtime_sched);
@@ -2039,19 +2052,19 @@ doentersyscall()
 		runtime_unlock(&runtime_sched);
 	}
 
-	m->mcache = nil;
-	m->p->m = nil;
-	runtime_atomicstore(&m->p->status, Psyscall);
+	g->m->mcache = nil;
+	((P*)(g->m->p))->m = 0;
+	runtime_atomicstore(&((P*)g->m->p)->status, _Psyscall);
 	if(runtime_atomicload(&runtime_sched.gcwaiting)) {
 		runtime_lock(&runtime_sched);
-		if (runtime_sched.stopwait > 0 && runtime_cas(&m->p->status, Psyscall, Pgcstop)) {
+		if (runtime_sched.stopwait > 0 && runtime_cas(&((P*)g->m->p)->status, _Psyscall, _Pgcstop)) {
			if(--runtime_sched.stopwait == 0)
				runtime_notewakeup(&runtime_sched.stopnote);
		}
		runtime_unlock(&runtime_sched);
	}
 
-	m->locks--;
+	g->m->locks--;
 }
 
 // The same as runtime_entersyscall(), but with a hint that the syscall is blocking.
@@ -2060,29 +2073,29 @@ runtime_entersyscallblock(void)
 {
 	P *p;
 
-	m->locks++;  // see comment in entersyscall
+	g->m->locks++;  // see comment in entersyscall
 
 	// Leave SP around for GC and traceback.
 #ifdef USING_SPLIT_STACK
-	g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
-				       &g->gcnext_segment, &g->gcnext_sp,
-				       &g->gcinitial_sp);
+	g->gcstack = __splitstack_find(nil, nil, &g->gcstacksize,
+				       &g->gcnextsegment, &g->gcnextsp,
+				       &g->gcinitialsp);
 #else
-	g->gcnext_sp = (byte *) &p;
+	g->gcnextsp = (byte *) &p;
 #endif
 
 	// Save the registers in the g structure so that any pointers
 	// held in registers will be seen by the garbage collector.
-	getcontext(&g->gcregs);
+	getcontext((ucontext_t*)&g->gcregs[0]);
 
-	g->status = Gsyscall;
+	g->atomicstatus = _Gsyscall;
 
 	p = releasep();
 	handoffp(p);
 	if(g->isbackground)  // do not consider blocked scavenger for deadlock detection
 		incidlelocked(1);
-	m->locks--;
+	g->m->locks--;
 }
 
 // The goroutine g exited its system call.
@@ -2094,29 +2107,29 @@ runtime_exitsyscall(void)
 {
 	G *gp;
 
-	m->locks++;  // see comment in entersyscall
-
 	gp = g;
+	gp->m->locks++;  // see comment in entersyscall
+
 	if(gp->isbackground)  // do not consider blocked scavenger for deadlock detection
 		incidlelocked(-1);
 
-	g->waitsince = 0;
+	gp->waitsince = 0;
 	if(exitsyscallfast()) {
 		// There's a cpu for us, so we can run.
-		m->p->syscalltick++;
-		gp->status = Grunning;
+		((P*)gp->m->p)->syscalltick++;
+		gp->atomicstatus = _Grunning;
 		// Garbage collector isn't running (since we are),
 		// so okay to clear gcstack and gcsp.
 #ifdef USING_SPLIT_STACK
 		gp->gcstack = nil;
 #endif
-		gp->gcnext_sp = nil;
-		runtime_memclr(&gp->gcregs, sizeof gp->gcregs);
-		m->locks--;
+		gp->gcnextsp = nil;
+		runtime_memclr(&gp->gcregs[0], sizeof gp->gcregs);
+		gp->m->locks--;
 		return;
 	}
 
-	m->locks--;
+	gp->m->locks--;
 
 	// Call the scheduler.
 	runtime_mcall(exitsyscall0);
@@ -2130,34 +2143,37 @@ runtime_exitsyscall(void)
 #ifdef USING_SPLIT_STACK
 	gp->gcstack = nil;
 #endif
-	gp->gcnext_sp = nil;
-	runtime_memclr(&gp->gcregs, sizeof gp->gcregs);
+	gp->gcnextsp = nil;
+	runtime_memclr(&gp->gcregs[0], sizeof gp->gcregs);
 
-	// Don't refer to m again, we might be running on a different
-	// thread after returning from runtime_mcall.
-	runtime_m()->p->syscalltick++;
+	// Note that this gp->m might be different than the earlier
+	// gp->m after returning from runtime_mcall.
+	((P*)gp->m->p)->syscalltick++;
 }
 
 static bool
 exitsyscallfast(void)
 {
+	G *gp;
 	P *p;
 
+	gp = g;
+
 	// Freezetheworld sets stopwait but does not retake P's.
 	if(runtime_sched.stopwait) {
-		m->p = nil;
+		gp->m->p = 0;
 		return false;
 	}
 
 	// Try to re-acquire the last P.
-	if(m->p && m->p->status == Psyscall && runtime_cas(&m->p->status, Psyscall, Prunning)) {
+	if(gp->m->p && ((P*)gp->m->p)->status == _Psyscall && runtime_cas(&((P*)gp->m->p)->status, _Psyscall, _Prunning)) {
 		// There's a cpu for us, so we can run.
-		m->mcache = m->p->mcache;
-		m->p->m = m;
+		gp->m->mcache = ((P*)gp->m->p)->mcache;
+		((P*)gp->m->p)->m = (uintptr)gp->m;
 		return true;
 	}
 	// Try to get any other idle P.
-	m->p = nil;
+	gp->m->p = 0;
 	if(runtime_sched.pidle) {
 		runtime_lock(&runtime_sched);
 		p = pidleget();
@@ -2179,9 +2195,11 @@ exitsyscallfast(void)
 static void
 exitsyscall0(G *gp)
 {
+	M *m;
 	P *p;
 
-	gp->status = Grunnable;
+	m = g->m;
+	gp->atomicstatus = _Grunnable;
 	gp->m = nil;
 	m->curg = nil;
 	runtime_lock(&runtime_sched);
@@ -2235,7 +2253,7 @@ syscall_runtime_AfterFork(void)
 
 // Allocate a new g, with a stack big enough for stacksize bytes.
 G*
-runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize)
+runtime_malg(int32 stacksize, byte** ret_stack, uintptr* ret_stacksize)
 {
 	G *newg;
 
@@ -2243,11 +2261,13 @@ runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize)
 	if(stacksize >= 0) {
 #if USING_SPLIT_STACK
 		int dont_block_signals = 0;
+		size_t ss_stacksize;
 
 		*ret_stack = __splitstack_makecontext(stacksize,
-						      &newg->stack_context[0],
-						      ret_stacksize);
-		__splitstack_block_signals_context(&newg->stack_context[0],
+						      &newg->stackcontext[0],
+						      &ss_stacksize);
+		*ret_stacksize = (uintptr)ss_stacksize;
+		__splitstack_block_signals_context(&newg->stackcontext[0],
 						   &dont_block_signals, nil);
 #else
 		// In 64-bit mode, the maximum Go allocation space is
@@ -2265,9 +2285,9 @@
 			*ret_stack = runtime_mallocgc(stacksize, 0, FlagNoProfiling|FlagNoGC);
 			runtime_xadd(&runtime_stacks_sys, stacksize);
 		}
-		*ret_stacksize = stacksize;
-		newg->gcinitial_sp = *ret_stack;
-		newg->gcstack_size = (size_t)stacksize;
+		*ret_stacksize = (uintptr)stacksize;
+		newg->gcinitialsp = *ret_stack;
+		newg->gcstacksize = (uintptr)stacksize;
 #endif
 	}
 	return newg;
@@ -2310,36 +2330,39 @@ __go_go(void (*fn)(void*), void* arg)
 	//runtime_printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret);
 	if(fn == nil) {
-		m->throwing = -1;  // do not dump full stacks
+		g->m->throwing = -1;  // do not dump full stacks
 		runtime_throw("go of nil func value");
 	}
-	m->locks++;  // disable preemption because it can be holding p in a local var
+	g->m->locks++;  // disable preemption because it can be holding p in a local var
 
-	p = m->p;
+	p = (P*)g->m->p;
 	if((newg = gfget(p)) != nil) {
 #ifdef USING_SPLIT_STACK
 		int dont_block_signals = 0;
 
-		sp = __splitstack_resetcontext(&newg->stack_context[0],
+		sp = __splitstack_resetcontext(&newg->stackcontext[0],
 					       &spsize);
-		__splitstack_block_signals_context(&newg->stack_context[0],
+		__splitstack_block_signals_context(&newg->stackcontext[0],
 						   &dont_block_signals, nil);
 #else
-		sp = newg->gcinitial_sp;
-		spsize = newg->gcstack_size;
+		sp = newg->gcinitialsp;
+		spsize = newg->gcstacksize;
 		if(spsize == 0)
 			runtime_throw("bad spsize in __go_go");
-		newg->gcnext_sp = sp;
+		newg->gcnextsp = sp;
 #endif
 	} else {
-		newg = runtime_malg(StackMin, &sp, &spsize);
+		uintptr malsize;
+
+		newg = runtime_malg(StackMin, &sp, &malsize);
+		spsize = (size_t)malsize;
 		allgadd(newg);
 	}
 
 	newg->entry = (byte*)fn;
 	newg->param = arg;
 	newg->gopc = (uintptr)__builtin_return_address(0);
-	newg->status = Grunnable;
+	newg->atomicstatus = _Grunnable;
 	if(p->goidcache == p->goidcacheend) {
 		p->goidcache = runtime_xadd64(&runtime_sched.goidgen, GoidCacheBatch);
 		p->goidcacheend = p->goidcache + GoidCacheBatch;
@@ -2353,19 +2376,19 @@ __go_go(void (*fn)(void*), void* arg)
 	size_t volatile vspsize = spsize;
 	G * volatile vnewg = newg;
 
-	getcontext(&vnewg->context);
-	vnewg->context.uc_stack.ss_sp = vsp;
+	getcontext((ucontext_t*)&vnewg->context[0]);
+	((ucontext_t*)&vnewg->context[0])->uc_stack.ss_sp = vsp;
 #ifdef MAKECONTEXT_STACK_TOP
-	vnewg->context.uc_stack.ss_sp += vspsize;
+	((ucontext_t*)&vnewg->context[0])->uc_stack.ss_sp += vspsize;
 #endif
-	vnewg->context.uc_stack.ss_size = vspsize;
-	makecontext(&vnewg->context, kickoff, 0);
+	((ucontext_t*)&vnewg->context[0])->uc_stack.ss_size = vspsize;
+	makecontext((ucontext_t*)&vnewg->context[0], kickoff, 0);
 
 	runqput(p, vnewg);
 
 	if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main)  // TODO: fast atomic
 		wakep();
-	m->locks--;
+	g->m->locks--;
 	return vnewg;
 	}
 }
@@ -2400,7 +2423,7 @@ allgadd(G *gp)
 static void
 gfput(P *p, G *gp)
 {
-	gp->schedlink = p->gfree;
+	gp->schedlink = (uintptr)p->gfree;
 	p->gfree = gp;
 	p->gfreecnt++;
 	if(p->gfreecnt >= 64) {
@@ -2408,8 +2431,8 @@ gfput(P *p, G *gp)
 		while(p->gfreecnt >= 32) {
 			p->gfreecnt--;
 			gp = p->gfree;
-			p->gfree = gp->schedlink;
-			gp->schedlink = runtime_sched.gfree;
+			p->gfree = (G*)gp->schedlink;
+			gp->schedlink = (uintptr)runtime_sched.gfree;
 			runtime_sched.gfree = gp;
 		}
 		runtime_unlock(&runtime_sched.gflock);
@@ -2430,15 +2453,15 @@ retry:
 		while(p->gfreecnt < 32 && runtime_sched.gfree) {
 			p->gfreecnt++;
 			gp = runtime_sched.gfree;
-			runtime_sched.gfree = gp->schedlink;
-			gp->schedlink = p->gfree;
+			runtime_sched.gfree = (G*)gp->schedlink;
+			gp->schedlink = (uintptr)p->gfree;
 			p->gfree = gp;
 		}
 		runtime_unlock(&runtime_sched.gflock);
 		goto retry;
 	}
 	if(gp) {
-		p->gfree = gp->schedlink;
+		p->gfree = (G*)gp->schedlink;
 		p->gfreecnt--;
 	}
 	return gp;
@@ -2454,8 +2477,8 @@ gfpurge(P *p)
 	while(p->gfreecnt) {
 		p->gfreecnt--;
 		gp = p->gfree;
-		p->gfree = gp->schedlink;
-		gp->schedlink = runtime_sched.gfree;
+		p->gfree = (G*)gp->schedlink;
+		gp->schedlink = (uintptr)runtime_sched.gfree;
 		runtime_sched.gfree = gp;
 	}
 	runtime_unlock(&runtime_sched.gflock);
@@ -2482,8 +2505,8 @@ runtime_gomaxprocsfunc(int32 n)
 {
 	int32 ret;
 
-	if(n > MaxGomaxprocs)
-		n = MaxGomaxprocs;
+	if(n > _MaxGomaxprocs)
+		n = _MaxGomaxprocs;
 	runtime_lock(&runtime_sched);
 	ret = runtime_gomaxprocs;
 	if(n <= 0 || n == ret) {
@@ -2493,10 +2516,10 @@ runtime_gomaxprocsfunc(int32 n)
 	runtime_unlock(&runtime_sched);
 
 	runtime_semacquire(&runtime_worldsema, false);
-	m->gcing = 1;
+	g->m->gcing = 1;
 	runtime_stoptheworld();
 	newprocs = n;
-	m->gcing = 0;
+	g->m->gcing = 0;
 	runtime_semrelease(&runtime_worldsema);
 	runtime_starttheworld();
 
@@ -2509,22 +2532,22 @@ runtime_gomaxprocsfunc(int32 n)
 static void
 lockOSThread(void)
 {
-	m->lockedg = g;
-	g->lockedm = m;
+	g->m->lockedg = g;
+	g->lockedm = g->m;
 }
 
 void	runtime_LockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.LockOSThread");
 void
 runtime_LockOSThread(void)
 {
-	m->locked |= LockExternal;
+	g->m->locked |= _LockExternal;
 	lockOSThread();
 }
 
 void
 runtime_lockOSThread(void)
 {
-	m->locked += LockInternal;
+	g->m->locked += _LockInternal;
 	lockOSThread();
 }
 
@@ -2535,9 +2558,9 @@ runtime_lockOSThread(void)
 static void
 unlockOSThread(void)
 {
-	if(m->locked != 0)
+	if(g->m->locked != 0)
 		return;
-	m->lockedg = nil;
+	g->m->lockedg = nil;
 	g->lockedm = nil;
 }
 
@@ -2546,23 +2569,23 @@ void runtime_UnlockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.UnlockOSThread"
 void
 runtime_UnlockOSThread(void)
 {
-	m->locked &= ~LockExternal;
+	g->m->locked &= ~_LockExternal;
 	unlockOSThread();
 }
 
 void
 runtime_unlockOSThread(void)
 {
-	if(m->locked < LockInternal)
+	if(g->m->locked < _LockInternal)
 		runtime_throw("runtime: internal error: misuse of lockOSThread/unlockOSThread");
-	m->locked -= LockInternal;
+	g->m->locked -= _LockInternal;
 	unlockOSThread();
 }
 
 bool
 runtime_lockedOSThread(void)
 {
-	return g->lockedm != nil && m->lockedg != nil;
+	return g->lockedm != nil && g->m->lockedg != nil;
 }
 
 int32
@@ -2580,8 +2603,8 @@ runtime_gcount(void)
 	// Compromise solution is to introduce per-P counters of active goroutines.
 	for(i = 0; i < runtime_allglen; i++) {
 		gp = runtime_allg[i];
-		s = gp->status;
-		if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting)
+		s = gp->atomicstatus;
+		if(s == _Grunnable || s == _Grunning || s == _Gsyscall || s == _Gwaiting)
 			n++;
 	}
 	runtime_unlock(&allglock);
@@ -2609,7 +2632,7 @@ static void GC(void) {}
 void
 runtime_sigprof()
 {
-	M *mp = m;
+	M *mp = g->m;
 	int32 n, i;
 	bool traceback;
 
@@ -2675,7 +2698,7 @@ runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz)
 
 	// Disable preemption, otherwise we can be rescheduled to another thread
 	// that has profiling enabled.
-	m->locks++;
+	g->m->locks++;
 
 	// Stop profiler on this thread so that it is safe to lock prof.
 	// if a profiling signal came in while we had prof locked,
@@ -2693,7 +2716,7 @@ runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz)
 	if(hz != 0)
 		runtime_resetcpuprofiler(hz);
 
-	m->locks--;
+	g->m->locks--;
 }
 
 // Change number of processors. The world is stopped, sched is locked.
@@ -2706,7 +2729,7 @@ procresize(int32 new)
 	P *p;
 
 	old = runtime_gomaxprocs;
-	if(old < 0 || old > MaxGomaxprocs || new <= 0 || new >MaxGomaxprocs)
+	if(old < 0 || old > _MaxGomaxprocs || new <= 0 || new >_MaxGomaxprocs)
 		runtime_throw("procresize: invalid arg");
 	// initialize new P's
 	for(i = 0; i < new; i++) {
@@ -2714,12 +2737,12 @@ procresize(int32 new)
 		if(p == nil) {
 			p = (P*)runtime_mallocgc(sizeof(*p), 0, FlagNoInvokeGC);
 			p->id = i;
-			p->status = Pgcstop;
+			p->status = _Pgcstop;
 			runtime_atomicstorep(&runtime_allp[i], p);
 		}
 		if(p->mcache == nil) {
 			if(old==0 && i==0)
-				p->mcache = m->mcache;  // bootstrap
+				p->mcache = g->m->mcache;  // bootstrap
 			else
 				p->mcache = runtime_allocmcache();
 		}
@@ -2739,9 +2762,9 @@ procresize(int32 new)
 			empty = false;
 			// pop from tail of local queue
 			p->runqtail--;
-			gp = p->runq[p->runqtail%nelem(p->runq)];
+			gp = (G*)p->runq[p->runqtail%nelem(p->runq)];
 			// push onto head of global queue
-			gp->schedlink = runtime_sched.runqhead;
+			gp->schedlink = (uintptr)runtime_sched.runqhead;
 			runtime_sched.runqhead = gp;
 			if(runtime_sched.runqtail == nil)
 				runtime_sched.runqtail = gp;
@@ -2753,7 +2776,7 @@ procresize(int32 new)
 	// so if we have a spare G we want to put it into allp[1].
 	for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched.runqsize > 0; i++) {
 		gp = runtime_sched.runqhead;
-		runtime_sched.runqhead = gp->schedlink;
+		runtime_sched.runqhead = (G*)gp->schedlink;
 		if(runtime_sched.runqhead == nil)
 			runtime_sched.runqtail = nil;
 		runtime_sched.runqsize--;
@@ -2766,21 +2789,21 @@ procresize(int32 new)
 		runtime_freemcache(p->mcache);
 		p->mcache = nil;
 		gfpurge(p);
-		p->status = Pdead;
+		p->status = _Pdead;
 		// can't free P itself because it can be referenced by an M in syscall
 	}
 
-	if(m->p)
-		m->p->m = nil;
-	m->p = nil;
-	m->mcache = nil;
+	if(g->m->p)
+		((P*)g->m->p)->m = 0;
+	g->m->p = 0;
+	g->m->mcache = nil;
 	p = runtime_allp[0];
-	p->m = nil;
-	p->status = Pidle;
+	p->m = 0;
+	p->status = _Pidle;
 	acquirep(p);
 	for(i = new-1; i > 0; i--) {
 		p = runtime_allp[i];
-		p->status = Pidle;
+		p->status = _Pidle;
 		pidleput(p);
 	}
 	runtime_atomicstore((uint32*)&runtime_gomaxprocs, new);
@@ -2790,36 +2813,41 @@ procresize(int32 new)
 static void
 acquirep(P *p)
 {
+	M *m;
+
+	m = g->m;
 	if(m->p || m->mcache)
 		runtime_throw("acquirep: already in go");
-	if(p->m || p->status != Pidle) {
-		runtime_printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? p->m->id : 0, p->status);
+	if(p->m || p->status != _Pidle) {
+		runtime_printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? ((M*)p->m)->id : 0, p->status);
 		runtime_throw("acquirep: invalid p state");
 	}
 	m->mcache = p->mcache;
-	m->p = p;
-	p->m = m;
-	p->status = Prunning;
+	m->p = (uintptr)p;
+	p->m = (uintptr)m;
+	p->status = _Prunning;
 }
 
 // Disassociate p and the current m.
 static P*
 releasep(void)
 {
+	M *m;
 	P *p;
 
-	if(m->p == nil || m->mcache == nil)
+	m = g->m;
+	if(m->p == 0 || m->mcache == nil)
 		runtime_throw("releasep: invalid arg");
-	p = m->p;
-	if(p->m != m || p->mcache != m->mcache || p->status != Prunning) {
+	p = (P*)m->p;
+	if((M*)p->m != m || p->mcache != m->mcache || p->status != _Prunning) {
 		runtime_printf("releasep: m=%p m->p=%p p->m=%p m->mcache=%p p->mcache=%p p->status=%d\n",
 			m, m->p, p->m, m->mcache, p->mcache, p->status);
 		runtime_throw("releasep: invalid p state");
 	}
-	m->p = nil;
+	m->p = 0;
 	m->mcache = nil;
-	p->m = nil;
-	p->status = Pidle;
+	p->m = 0;
+	p->status = _Pidle;
 	return p;
 }
 
@@ -2870,10 +2898,10 @@ checkdead(void)
 		gp = runtime_allg[i];
 		if(gp->isbackground)
 			continue;
-		s = gp->status;
-		if(s == Gwaiting)
+		s = gp->atomicstatus;
+		if(s == _Gwaiting)
 			grunning++;
-		else if(s == Grunnable || s == Grunning || s == Gsyscall) {
+		else if(s == _Grunnable || s == _Grunning || s == _Gsyscall) {
 			runtime_unlock(&allglock);
 			runtime_printf("runtime: checkdead: find g %D in status %d\n", gp->goid, s);
 			runtime_throw("checkdead: runnable g");
@@ -2882,7 +2910,7 @@ checkdead(void)
 	runtime_unlock(&allglock);
 	if(grunning == 0)  // possible if main goroutine calls runtime_Goexit()
 		runtime_throw("no goroutines (main called runtime.Goexit) - deadlock!");
-	m->throwing = -1;  // do not dump full stacks
+	g->m->throwing = -1;  // do not dump full stacks
 	runtime_throw("all goroutines are asleep - deadlock!");
 }
 
@@ -2958,7 +2986,7 @@ struct Pdesc
 	uint32 syscalltick;
 	int64 syscallwhen;
 };
-static Pdesc pdesc[MaxGomaxprocs];
+static Pdesc pdesc[_MaxGomaxprocs];
 
 static uint32
 retake(int64 now)
@@ -2975,7 +3003,7 @@ retake(int64 now)
 			continue;
 		pd = &pdesc[i];
 		s = p->status;
-		if(s == Psyscall) {
+		if(s == _Psyscall) {
 			// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
 			t = p->syscalltick;
 			if(pd->syscalltick != t) {
@@ -2995,12 +3023,12 @@ retake(int64 now)
 			// Otherwise the M from which we retake can exit the syscall,
 			// increment nmidle and report deadlock.
 			incidlelocked(-1);
-			if(runtime_cas(&p->status, s, Pidle)) {
+			if(runtime_cas(&p->status, s, _Pidle)) {
 				n++;
 				handoffp(p);
 			}
			incidlelocked(1);
-		} else if(s == Prunning) {
+		} else if(s == _Prunning) {
 			// Preempt G if it's running for more than 10ms.
 			t = p->schedtick;
 			if(pd->schedtick != t) {
@@ -3060,7 +3088,7 @@ runtime_schedtrace(bool detailed)
 		p = runtime_allp[i];
 		if(p == nil)
 			continue;
-		mp = p->m;
+		mp = (M*)p->m;
 		h = runtime_atomicload(&p->runqhead);
 		t = runtime_atomicload(&p->runqtail);
 		if(detailed)
@@ -3084,7 +3112,7 @@ runtime_schedtrace(bool detailed)
 		return;
 	}
 	for(mp = runtime_allm; mp; mp = mp->alllink) {
-		p = mp->p;
+		p = (P*)mp->p;
 		gp = mp->curg;
 		lockedg = mp->lockedg;
 		id1 = -1;
@@ -3100,15 +3128,15 @@ runtime_schedtrace(bool detailed)
 			" locks=%d dying=%d helpgc=%d spinning=%d blocked=%d lockedg=%D\n",
 			mp->id, id1, id2,
 			mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc,
-			mp->spinning, m->blocked, id3);
+			mp->spinning, mp->blocked, id3);
 	}
 	runtime_lock(&allglock);
 	for(gi = 0; gi < runtime_allglen; gi++) {
 		gp = runtime_allg[gi];
 		mp = gp->m;
 		lockedm = gp->lockedm;
-		runtime_printf("  G%D: status=%d(%s) m=%d lockedm=%d\n",
-			gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1,
+		runtime_printf("  G%D: status=%d(%S) m=%d lockedm=%d\n",
+			gp->goid, gp->atomicstatus, gp->waitreason, mp ? mp->id : -1,
 			lockedm ? lockedm->id : -1);
 	}
 	runtime_unlock(&allglock);
@@ -3120,7 +3148,7 @@ runtime_schedtrace(bool detailed)
 static void
 mput(M *mp)
 {
-	mp->schedlink = runtime_sched.midle;
+	mp->schedlink = (uintptr)runtime_sched.midle;
 	runtime_sched.midle = mp;
 	runtime_sched.nmidle++;
 	checkdead();
@@ -3134,7 +3162,7 @@ mget(void)
 	M *mp;
 
 	if((mp = runtime_sched.midle) != nil){
-		runtime_sched.midle = mp->schedlink;
+		runtime_sched.midle = (M*)mp->schedlink;
 		runtime_sched.nmidle--;
 	}
 	return mp;
@@ -3145,9 +3173,9 @@ mget(void)
 static void
 globrunqput(G *gp)
 {
-	gp->schedlink = nil;
+	gp->schedlink = 0;
 	if(runtime_sched.runqtail)
-		runtime_sched.runqtail->schedlink = gp;
+		runtime_sched.runqtail->schedlink = (uintptr)gp;
 	else
 		runtime_sched.runqhead = gp;
 	runtime_sched.runqtail = gp;
@@ -3159,9 +3187,9 @@ globrunqput(G *gp)
 static void
 globrunqputbatch(G *ghead, G *gtail, int32 n)
 {
-	gtail->schedlink = nil;
+	gtail->schedlink = 0;
 	if(runtime_sched.runqtail)
-		runtime_sched.runqtail->schedlink = ghead;
+		runtime_sched.runqtail->schedlink = (uintptr)ghead;
 	else
 		runtime_sched.runqhead = ghead;
 	runtime_sched.runqtail = gtail;
@@ -3189,11 +3217,11 @@ globrunqget(P *p, int32 max)
 	if(runtime_sched.runqsize == 0)
 		runtime_sched.runqtail = nil;
 	gp = runtime_sched.runqhead;
-	runtime_sched.runqhead = gp->schedlink;
+	runtime_sched.runqhead = (G*)gp->schedlink;
 	n--;
 	while(n--) {
 		gp1 = runtime_sched.runqhead;
-		runtime_sched.runqhead = gp1->schedlink;
+		runtime_sched.runqhead = (G*)gp1->schedlink;
 		runqput(p, gp1);
 	}
 	return gp;
@@ -3204,7 +3232,7 @@ globrunqget(P *p, int32 max)
 static void
 pidleput(P *p)
 {
-	p->link = runtime_sched.pidle;
+	p->link = (uintptr)runtime_sched.pidle;
 	runtime_sched.pidle = p;
 	runtime_xadd(&runtime_sched.npidle, 1);  // TODO: fast atomic
 }
@@ -3218,7 +3246,7 @@ pidleget(void)
 
 	p = runtime_sched.pidle;
 	if(p) {
-		runtime_sched.pidle = p->link;
+		runtime_sched.pidle = (P*)p->link;
 		runtime_xadd(&runtime_sched.npidle, -1);  // TODO: fast atomic
 	}
 	return p;
@@ -3236,7 +3264,7 @@ retry:
 	h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with consumers
 	t = p->runqtail;
 	if(t - h < nelem(p->runq)) {
-		p->runq[t%nelem(p->runq)] = gp;
+		p->runq[t%nelem(p->runq)] = (uintptr)gp;
 		runtime_atomicstore(&p->runqtail, t+1);  // store-release, makes the item available for consumption
 		return;
 	}
@@ -3260,13 +3288,13 @@ runqputslow(P *p, G *gp, uint32 h, uint32 t)
 	if(n != nelem(p->runq)/2)
 		runtime_throw("runqputslow: queue is not full");
 	for(i=0; i<n; i++)
-		batch[i] = p->runq[(h+i)%nelem(p->runq)];
+		batch[i] = (G*)p->runq[(h+i)%nelem(p->runq)];
 	if(!runtime_cas(&p->runqhead, h, h+n))  // cas-release, commits consume
 		return false;
 	batch[n] = gp;
 	// Link the goroutines.
 	for(i=0; i<n; i++)
-		batch[i]->schedlink = batch[i+1];
+		batch[i]->schedlink = (uintptr)batch[i+1];
 	// Now put the batch on global queue.
 	runtime_lock(&runtime_sched);
 	globrunqputbatch(batch[0], batch[n], n+1);
@@ -3287,7 +3315,7 @@ runqget(P *p)
 		t = p->runqtail;
 		if(t == h)
 			return nil;
-		gp = p->runq[h%nelem(p->runq)];
+		gp = (G*)p->runq[h%nelem(p->runq)];
 		if(runtime_cas(&p->runqhead, h, h+1))  // cas-release, commits consume
 			return gp;
 	}
@@ -3311,7 +3339,7 @@ runqgrab(P *p, G **batch)
 		if(n > nelem(p->runq)/2)  // read inconsistent h and t
 			continue;
 		for(i=0; i<n; i++)
-			batch[i] = p->runq[(h+i)%nelem(p->runq)];
+			batch[i] = (G*)p->runq[(h+i)%nelem(p->runq)];
 		if(runtime_cas(&p->runqhead, h, h+n))  // cas-release, commits consume
 			break;
 	}
@@ -3340,7 +3368,7 @@ runqsteal(P *p, P *p2)
 	if(t - h + n >= nelem(p->runq))
 		runtime_throw("runqsteal: runq overflow");
 	for(i=0; i<n; i++, t++)
-		p->runq[t%nelem(p->runq)] = batch[i];
+		p->runq[t%nelem(p->runq)] = (uintptr)batch[i];
 	runtime_atomicstore(&p->runqtail, t);  // store-release, makes the item available for consumption
 	return gp;
 }
@@ -3480,7 +3508,7 @@ sync_runtime_canSpin(intgo i)
 	if (i >= ACTIVE_SPIN || runtime_ncpu <= 1 || runtime_gomaxprocs <= (int32)(runtime_sched.npidle+runtime_sched.nmspinning)+1) {
 		return false;
 	}
-	p = m->p;
+	p = (P*)g->m->p;
 	return p != nil && p->runqhead == p->runqtail;
 }
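
Some notes on the recurring patterns in this patch follow, each with a small standalone C sketch. All struct definitions in these sketches are illustrative stand-ins, not the runtime's real declarations.

The headline change is the removal of the second thread-local variable: the runtime used to keep both g and m in TLS, and now keeps only g, reaching the M through g->m (hence runtime_setmg collapsing to runtime_setg). runtime_m() must now tolerate a nil g, which happens when a signal arrives on a thread that never ran Go code. A minimal sketch of that accessor discipline:

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins; the real G and M live in runtime.h. */
typedef struct M M;
typedef struct G { M *m; } G;
struct M { int id; };

/* The one remaining thread-local: g. */
static __thread G *g;

/* Mirrors the patched runtime_m(): tolerate g == NULL, as when a
 * signal lands on a non-Go thread. */
static M*
runtime_m(void)
{
	if(g == NULL)
		return NULL;
	return g->m;
}

int
main(void)
{
	static M m0 = {0};
	static G g0 = {&m0};

	printf("before setg: m=%p\n", (void*)runtime_m());  /* NULL */
	g = &g0;                                            /* what runtime_setg(&g0) does */
	printf("after setg: m id=%d\n", runtime_m()->id);   /* 0 */
	return 0;
}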
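
A second pattern: the G's saved context and gcregs are no longer declared as ucontext_t in C. They become opaque arrays shared with the Go side of the runtime, and every C use site casts, e.g. getcontext((ucontext_t*)&gp->context[0]). The sketch below shows the same cast-at-use-site idiom with a hypothetical G holding just an opaque buffer (sized here from sizeof(ucontext_t) for safety; in the real runtime the Go declaration fixes the size):

#include <stdio.h>
#include <ucontext.h>

/* Hypothetical stand-in: an untyped byte array, cast at each use. */
typedef struct {
	unsigned char context[sizeof(ucontext_t)];
} G;

static ucontext_t retctx;
static char stack[64*1024];

static void
kickoff(void)
{
	printf("running in the made context\n");
	/* returning resumes retctx via uc_link */
}

int
main(void)
{
	G gobj;
	ucontext_t *uc = (ucontext_t*)&gobj.context[0];

	getcontext(uc);               /* initialize the opaque buffer */
	uc->uc_stack.ss_sp = stack;   /* same fields the patch sets via casts */
	uc->uc_stack.ss_size = sizeof stack;
	uc->uc_link = &retctx;        /* where to resume when kickoff returns */
	makecontext(uc, kickoff, 0);
	swapcontext(&retctx, uc);     /* run kickoff, then come back */
	printf("back in main\n");
	return 0;
}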
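
Third, the pervasive (uintptr) and (M*)/(G*)/(P*) casts: scheduler links such as schedlink, p->m, m->p, p->link, the runq slots, and even Defer.pfn change from pointer types to uintptr. Declaring them as integers keeps the shared Go declarations free of pointers the garbage collector would otherwise trace, at the cost of an explicit cast at every traversal. A self-contained sketch of that traversal style, with assumed stand-in types:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in: schedlink is a uintptr so a collector reading
 * the struct does not treat it as a traceable pointer. */
typedef struct M M;
struct M {
	int       id;
	uintptr_t schedlink;  /* really an M*, hidden from the GC */
};

/* Mirrors loops like the one in countextra(): cast back at each hop. */
static int
count_list(M *head)
{
	int n = 0;
	for(M *mc = head; mc != NULL; mc = (M*)mc->schedlink)
		n++;
	return n;
}

int
main(void)
{
	M a = {0, 0};
	M b = {1, (uintptr_t)&a};  /* b -> a, stored as an integer */

	printf("%d\n", count_list(&b));  /* prints 2 */
	return 0;
}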
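
Fourth, the reload discipline: wherever the patch introduces a local M* (stopm, stoplockedm, park0, goexit0, ...), it re-executes m = g->m after any blocking point such as runtime_notesleep or getcontext, because the goroutine may resume on a different OS thread and a cached M pointer would be stale. A toy model of why the re-read matters, with block() standing in for the sleep:

#include <assert.h>
#include <stddef.h>

/* Hypothetical stand-ins modelling the stale-pointer hazard. */
typedef struct M { int id; } M;
typedef struct G { M *m; } G;

static M m0 = {0}, m1 = {1};
static G cur = {&m0};

static void
block(void)
{
	cur.m = &m1;  /* model: woke up bound to a different M */
}

int
main(void)
{
	G *g = &cur;
	M *m = g->m;  /* cached before blocking: stale after block() */

	block();
	m = g->m;     /* the patch re-reads g->m after every sleep */
	assert(m->id == 1);
	return 0;
}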
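
Fifth, waitreason changes from const char* to the runtime's String (a pointer/length pair built with runtime_gostringnocopy), and the print verb changes from %s to %S accordingly. A sketch of the conversion with an assumed stand-in String; plain C printf has no %S, so the demo prints with %.*s instead:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for the runtime's String. */
typedef struct {
	const unsigned char *str;
	intptr_t len;
} String;

/* Wraps a C string without copying, like runtime_gostringnocopy. */
static String
gostringnocopy(const char *p)
{
	String s = { (const unsigned char*)p, (intptr_t)strlen(p) };
	return s;
}

int
main(void)
{
	String status = gostringnocopy("runnable");

	/* runtime_printf understands %S; standard C needs %.*s */
	printf("goroutine 1 [%.*s]:\n", (int)status.len, status.str);
	return 0;
}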
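
The schedule() hunk keeps an arithmetic trick worth spelling out: tick - (((uint64)tick*0x4325c53fu)>>36)*61 computes tick%61 with two multiplies instead of a divide. The constant 0x4325c53f is ceil(2^36/61), and for any 32-bit tick the rounding error stays below 1/61, so the identity is exact. A small check program:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Verifies the fast mod-61 used for the fairness check in schedule(). */
static uint32_t
fastmod61(uint32_t tick)
{
	uint32_t q = (uint32_t)(((uint64_t)tick * 0x4325c53fu) >> 36);
	return tick - q*61;
}

int
main(void)
{
	uint32_t edges[] = {0, 60, 61, 62, 0x7FFFFFFFu, 0xFFFFFFFFu};

	for(uint32_t tick = 0; tick < 10*1000*1000u; tick++)
		assert(fastmod61(tick) == tick % 61);
	for(unsigned i = 0; i < sizeof edges / sizeof edges[0]; i++)
		assert(fastmod61(edges[i]) == edges[i] % 61);
	printf("fast mod 61 matches\n");
	return 0;
}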
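
Finally, the runqput/runqget/runqsteal hunks show the P-local run queue discipline the casts are serving: the owning P publishes at runqtail with a store-release, and consumers claim at runqhead with a CAS, over slots that now hold uintptr-typed G references. A simplified, single-threaded sketch of that ring (stand-in types; the real code also spills half the queue to the global list when full):

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NSLOT 256

typedef struct G { int id; } G;

static _Atomic uint32_t runqhead, runqtail;
static uintptr_t runq[NSLOT];  /* uintptr slots, as in the patch */

/* Owner-side push: store the slot, then release the new tail. */
static int
runqput(G *gp)
{
	uint32_t h = atomic_load_explicit(&runqhead, memory_order_acquire);
	uint32_t t = atomic_load_explicit(&runqtail, memory_order_relaxed);

	if(t - h >= NSLOT)
		return 0;  /* full; real runqputslow spills to the global queue */
	runq[t%NSLOT] = (uintptr_t)gp;
	atomic_store_explicit(&runqtail, t+1, memory_order_release);
	return 1;
}

/* Consumer-side pop: read the slot, then CAS the head to commit. */
static G*
runqget(void)
{
	for(;;) {
		uint32_t h = atomic_load_explicit(&runqhead, memory_order_acquire);
		uint32_t t = atomic_load_explicit(&runqtail, memory_order_relaxed);
		if(t == h)
			return NULL;
		G *gp = (G*)runq[h%NSLOT];
		if(atomic_compare_exchange_weak(&runqhead, &h, h+1))
			return gp;  /* cas commits the consume */
	}
}

int
main(void)
{
	G a = {1}, b = {2};

	runqput(&a);
	runqput(&b);
	assert(runqget()->id == 1);
	assert(runqget()->id == 2);
	assert(runqget() == NULL);
	printf("ok\n");
	return 0;
}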