Diffstat (limited to 'libgo/runtime/mgc0.c')
-rw-r--r-- | libgo/runtime/mgc0.c | 641
1 file changed, 337 insertions(+), 304 deletions(-)
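One central change in the patch is visible in the Workbuf declarations below: the mutex-protected full/empty buffer lists become lock-free stacks, and each Workbuf now starts with an LFNode link so the same pointer can be pushed onto work.full or work.empty as a list node and cast straight back to a Workbuf when popped. Here is a minimal standalone sketch of that layout trick, not runtime code; LFNode's fields are an assumption (only its use through &b->node appears in the diff), and the array bound simply keeps the whole buffer at 512 pointer-sized words under that assumption.

/* Illustrative sketch only; LFNode's members are assumed, not taken from the diff. */
#include <stdint.h>
#include <stdio.h>

typedef struct LFNode LFNode;
struct LFNode {
	LFNode *next;
	uintptr_t pushcnt;   /* assumed generation counter for the lock-free stack */
};

typedef struct Workbuf Workbuf;
struct Workbuf {
	LFNode node;         /* must be first, as in the patch */
	uintptr_t nobj;
	void *obj[512 - (sizeof(LFNode) + sizeof(uintptr_t)) / sizeof(void*)];
};

int main(void) {
	Workbuf b;
	/* A pointer to a struct points at its first member, so &b and &b.node
	   are the same address and no offset fix-up is needed after a pop. */
	printf("same address: %d\n", (void*)&b == (void*)&b.node);
	/* The array bound keeps the buffer at 512 pointer-sized words total. */
	printf("sizeof(Workbuf) = %zu bytes\n", sizeof(Workbuf));
	return 0;
}

With a 16-byte LFNode and 8-byte pointers this prints 4096, i.e. one 4K page per buffer; the exact figure depends on the real LFNode layout.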
diff --git a/libgo/runtime/mgc0.c b/libgo/runtime/mgc0.c
index d35cc0f..72feb1f 100644
--- a/libgo/runtime/mgc0.c
+++ b/libgo/runtime/mgc0.c
@@ -9,6 +9,7 @@
 #include "runtime.h"
 #include "arch.h"
 #include "malloc.h"
+#include "race.h"
 
 #ifdef USING_SPLIT_STACK
 
@@ -22,8 +23,8 @@ extern void * __splitstack_find_context (void *context[10], size_t *, void **,
 
 enum {
	Debug = 0,
-	PtrSize = sizeof(void*),
	DebugMark = 0,  // run second pass to check mark
+	DataBlock = 8*1024,
 
	// Four bits per word (see #defines below).
	wordsPerBitmapWord = sizeof(void*)*8/4,
@@ -78,17 +79,14 @@ enum {
 //
 uint32 runtime_worldsema = 1;
 
-// TODO: Make these per-M.
-static uint64 nhandoff;
-
 static int32 gctrace;
 
 typedef struct Workbuf Workbuf;
 struct Workbuf
 {
-	Workbuf *next;
+	LFNode node; // must be first
	uintptr nobj;
-	byte *obj[512-2];
+	byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)];
 };
 
 typedef struct Finalizer Finalizer;
@@ -122,22 +120,32 @@ static Workbuf* getfull(Workbuf*);
 static void putempty(Workbuf*);
 static Workbuf* handoff(Workbuf*);
 
+typedef struct GcRoot GcRoot;
+struct GcRoot
+{
+	byte *p;
+	uintptr n;
+};
+
 static struct {
-	Lock fmu;
-	Workbuf *full;
-	Lock emu;
-	Workbuf *empty;
+	uint64 full;  // lock-free list of full blocks
+	uint64 empty; // lock-free list of empty blocks
+	byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
	uint32 nproc;
	volatile uint32 nwait;
	volatile uint32 ndone;
+	volatile uint32 debugmarkdone;
	Note alldone;
-	Lock markgate;
-	Lock sweepgate;
-	MSpan *spans;
+	ParFor *markfor;
+	ParFor *sweepfor;
 
	Lock;
	byte *chunk;
	uintptr nchunk;
+
+	GcRoot *roots;
+	uint32 nroot;
+	uint32 rootcap;
 } work;
 
 // scanblock scans a block of n bytes starting at pointer b for references
@@ -147,7 +155,7 @@ static struct {
 // body. Keeping an explicit work list is easier on the stack allocator and
 // more efficient.
 static void
-scanblock(byte *b, int64 n)
+scanblock(byte *b, uintptr n)
 {
	byte *obj, *arena_start, *arena_used, *p;
	void **vp;
@@ -158,8 +166,8 @@ scanblock(byte *b, int64 n)
	Workbuf *wbuf;
	bool keepworking;
 
-	if((int64)(uintptr)n != n || n < 0) {
-		runtime_printf("scanblock %p %D\n", b, n);
+	if((intptr)n < 0) {
+		runtime_printf("scanblock %p %D\n", b, (int64)n);
		runtime_throw("scanblock");
	}
 
@@ -173,7 +181,7 @@ scanblock(byte *b, int64 n)
	nobj = 0;  // number of queued objects
 
	// Scanblock helpers pass b==nil.
-	// The main proc needs to return to make more
+	// Procs needs to return to make more
	// calls to scanblock. But if work.nproc==1 then
	// might as well process blocks as soon as we
	// have them.
@@ -190,7 +198,7 @@ scanblock(byte *b, int64 n)
		// Each iteration scans the block b of length n, queueing pointers in
		// the work buffer.
		if(Debug > 1)
-			runtime_printf("scanblock %p %D\n", b, n);
+			runtime_printf("scanblock %p %D\n", b, (int64)n);
 
		vp = (void**)b;
		n >>= (2+PtrSize/8); /* n /= PtrSize (4 or 8) */
@@ -257,6 +265,14 @@ scanblock(byte *b, int64 n)
			bits = xbits >> shift;
 
		found:
+			// If another proc wants a pointer, give it some.
+			if(work.nwait > 0 && nobj > 4 && work.full == 0) {
+				wbuf->nobj = nobj;
+				wbuf = handoff(wbuf);
+				nobj = wbuf->nobj;
+				wp = (void**)(wbuf->obj + nobj);
+			}
+
			// Now we have bits, bitp, and shift correct for
			// obj pointing at the base of the object.
			// Only care about allocated and not marked.
@@ -278,13 +294,7 @@ scanblock(byte *b, int64 n)
			if((bits & bitNoPointers) != 0)
				continue;
 
-			// If another proc wants a pointer, give it some.
-			if(nobj > 4 && work.nwait > 0 && work.full == nil) {
-				wbuf->nobj = nobj;
-				wbuf = handoff(wbuf);
-				nobj = wbuf->nobj;
-				wp = (void**)(wbuf->obj + nobj);
-			}
+			PREFETCH(obj);
 
			// If buffer is full, get a new one.
			if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
@@ -305,7 +315,8 @@ scanblock(byte *b, int64 n)
		// Fetch b from the work buffer.
		if(nobj == 0) {
			if(!keepworking) {
-				putempty(wbuf);
+				if(wbuf)
+					putempty(wbuf);
				return;
			}
			// Emptied our buffer: refill.
@@ -335,7 +346,7 @@ scanblock(byte *b, int64 n)
 // it is simpler, slower, single-threaded, recursive,
 // and uses bitSpecial as the mark bit.
 static void
-debug_scanblock(byte *b, int64 n)
+debug_scanblock(byte *b, uintptr n)
 {
	byte *obj, *p;
	void **vp;
@@ -345,8 +356,8 @@ debug_scanblock(byte *b, int64 n)
	if(!DebugMark)
		runtime_throw("debug_scanblock without DebugMark");
 
-	if((int64)(uintptr)n != n || n < 0) {
-		runtime_printf("debug_scanblock %p %D\n", b, n);
+	if((intptr)n < 0) {
+		runtime_printf("debug_scanblock %p %D\n", b, (int64)n);
		runtime_throw("debug_scanblock");
	}
 
@@ -374,7 +385,6 @@ debug_scanblock(byte *b, int64 n)
		if(s == nil)
			continue;
 
-
		p = (byte*)((uintptr)s->start<<PageShift);
		if(s->sizeclass == 0) {
			obj = p;
@@ -411,53 +421,33 @@ debug_scanblock(byte *b, int64 n)
	}
 }
 
+static void
+markroot(ParFor *desc, uint32 i)
+{
+	USED(&desc);
+	scanblock(work.roots[i].p, work.roots[i].n);
+}
+
 // Get an empty work buffer off the work.empty list,
 // allocating new buffers as needed.
 static Workbuf*
 getempty(Workbuf *b)
 {
-	if(work.nproc == 1) {
-		// Put b on full list.
-		if(b != nil) {
-			b->next = work.full;
-			work.full = b;
-		}
-		// Grab from empty list if possible.
-		b = work.empty;
-		if(b != nil) {
-			work.empty = b->next;
-			goto haveb;
-		}
-	} else {
-		// Put b on full list.
-		if(b != nil) {
-			runtime_lock(&work.fmu);
-			b->next = work.full;
-			work.full = b;
-			runtime_unlock(&work.fmu);
+	if(b != nil)
+		runtime_lfstackpush(&work.full, &b->node);
+	b = (Workbuf*)runtime_lfstackpop(&work.empty);
+	if(b == nil) {
+		// Need to allocate.
+		runtime_lock(&work);
+		if(work.nchunk < sizeof *b) {
+			work.nchunk = 1<<20;
+			work.chunk = runtime_SysAlloc(work.nchunk);
		}
-		// Grab from empty list if possible.
-		runtime_lock(&work.emu);
-		b = work.empty;
-		if(b != nil)
-			work.empty = b->next;
-		runtime_unlock(&work.emu);
-		if(b != nil)
-			goto haveb;
-	}
-
-	// Need to allocate.
-	runtime_lock(&work);
-	if(work.nchunk < sizeof *b) {
-		work.nchunk = 1<<20;
-		work.chunk = runtime_SysAlloc(work.nchunk);
+		b = (Workbuf*)work.chunk;
+		work.chunk += sizeof *b;
+		work.nchunk -= sizeof *b;
+		runtime_unlock(&work);
	}
-	b = (Workbuf*)work.chunk;
-	work.chunk += sizeof *b;
-	work.nchunk -= sizeof *b;
-	runtime_unlock(&work);
-
-haveb:
	b->nobj = 0;
	return b;
 }
@@ -465,112 +455,95 @@ haveb:
 static void
 putempty(Workbuf *b)
 {
-	if(b == nil)
-		return;
-
-	if(work.nproc == 1) {
-		b->next = work.empty;
-		work.empty = b;
-		return;
-	}
-
-	runtime_lock(&work.emu);
-	b->next = work.empty;
-	work.empty = b;
-	runtime_unlock(&work.emu);
+	runtime_lfstackpush(&work.empty, &b->node);
 }
 
 // Get a full work buffer off the work.full list, or return nil.
 static Workbuf*
 getfull(Workbuf *b)
 {
+	M *m;
	int32 i;
-	Workbuf *b1;
 
-	if(work.nproc == 1) {
-		// Put b on empty list.
-		if(b != nil) {
-			b->next = work.empty;
-			work.empty = b;
-		}
-		// Grab from full list if possible.
-		// Since work.nproc==1, no one else is
-		// going to give us work.
-		b = work.full;
-		if(b != nil)
-			work.full = b->next;
+	if(b != nil)
+		runtime_lfstackpush(&work.empty, &b->node);
+	b = (Workbuf*)runtime_lfstackpop(&work.full);
+	if(b != nil || work.nproc == 1)
		return b;
-	}
-
-	putempty(b);
-
-	// Grab buffer from full list if possible.
-	for(;;) {
-		b1 = work.full;
-		if(b1 == nil)
-			break;
-		runtime_lock(&work.fmu);
-		if(work.full != nil) {
-			b1 = work.full;
-			work.full = b1->next;
-			runtime_unlock(&work.fmu);
-			return b1;
-		}
-		runtime_unlock(&work.fmu);
-	}
 
+	m = runtime_m();
	runtime_xadd(&work.nwait, +1);
	for(i=0;; i++) {
-		b1 = work.full;
-		if(b1 != nil) {
-			runtime_lock(&work.fmu);
-			if(work.full != nil) {
-				runtime_xadd(&work.nwait, -1);
-				b1 = work.full;
-				work.full = b1->next;
-				runtime_unlock(&work.fmu);
-				return b1;
-			}
-			runtime_unlock(&work.fmu);
-			continue;
+		if(work.full != 0) {
+			runtime_xadd(&work.nwait, -1);
+			b = (Workbuf*)runtime_lfstackpop(&work.full);
+			if(b != nil)
+				return b;
+			runtime_xadd(&work.nwait, +1);
		}
		if(work.nwait == work.nproc)
			return nil;
-		if(i < 10)
+		if(i < 10) {
+			m->gcstats.nprocyield++;
			runtime_procyield(20);
-		else if(i < 20)
+		} else if(i < 20) {
+			m->gcstats.nosyield++;
			runtime_osyield();
-		else
+		} else {
+			m->gcstats.nsleep++;
			runtime_usleep(100);
+		}
	}
 }
 
 static Workbuf*
 handoff(Workbuf *b)
 {
+	M *m;
	int32 n;
	Workbuf *b1;
 
+	m = runtime_m();
+
	// Make new buffer with half of b's pointers.
	b1 = getempty(nil);
	n = b->nobj/2;
	b->nobj -= n;
	b1->nobj = n;
	runtime_memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
-	nhandoff += n;
+	m->gcstats.nhandoff++;
+	m->gcstats.nhandoffcnt += n;
 
	// Put b on full list - let first half of b get stolen.
-	runtime_lock(&work.fmu);
-	b->next = work.full;
-	work.full = b;
-	runtime_unlock(&work.fmu);
-
+	runtime_lfstackpush(&work.full, &b->node);
	return b1;
 }
 
-// Scanstack calls scanblock on each of gp's stack segments.
 static void
-scanstack(void (*scanblock)(byte*, int64), G *gp)
+addroot(byte *p, uintptr n)
+{
+	uint32 cap;
+	GcRoot *new;
+
+	if(work.nroot >= work.rootcap) {
+		cap = PageSize/sizeof(GcRoot);
+		if(cap < 2*work.rootcap)
+			cap = 2*work.rootcap;
+		new = (GcRoot*)runtime_SysAlloc(cap*sizeof(GcRoot));
+		if(work.roots != nil) {
+			runtime_memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
+			runtime_SysFree(work.roots, work.rootcap*sizeof(GcRoot));
		}
+		work.roots = new;
+		work.rootcap = cap;
+	}
+	work.roots[work.nroot].p = p;
+	work.roots[work.nroot].n = n;
+	work.nroot++;
+}
+
+static void
+addstackroots(G *gp)
 {
 #ifdef USING_SPLIT_STACK
	M *mp;
@@ -609,11 +582,11 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
		}
	}
	if(sp != nil) {
-		scanblock(sp, spsize);
+		addroot(sp, spsize);
		while((sp = __splitstack_find(next_segment, next_sp, &spsize,
					      &next_segment, &next_sp,
					      &initial_sp)) != nil)
-			scanblock(sp, spsize);
+			addroot(sp, spsize);
	}
 #else
	M *mp;
@@ -635,16 +608,14 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
	}
	top = (byte*)gp->gcinitial_sp + gp->gcstack_size;
	if(top > bottom)
-		scanblock(bottom, top - bottom);
+		addroot(bottom, top - bottom);
	else
-		scanblock(top, bottom - top);
+		addroot(top, bottom - top);
 #endif
 }
 
-// Markfin calls scanblock on the blocks that have finalizers:
-// the things pointed at cannot be freed until the finalizers have run.
 static void
-markfin(void *v)
+addfinroots(void *v)
 {
	uintptr size;
 
@@ -653,7 +624,7 @@ markfin(void *v)
		runtime_throw("mark - finalizer inconsistency");
 
	// do not mark the finalizer block itself. just mark the things it points at.
-	scanblock(v, size);
+	addroot(v, size);
 }
 
 static struct root_list* roots;
@@ -668,22 +639,15 @@ __go_register_gc_roots (struct root_list* r)
 }
 
 static void
-debug_markfin(void *v)
-{
-	uintptr size;
-
-	if(!runtime_mlookup(v, (byte**)&v, &size, nil))
-		runtime_throw("debug_mark - finalizer inconsistency");
-	debug_scanblock(v, size);
-}
-
-// Mark
-static void
-mark(void (*scan)(byte*, int64))
+addroots(void)
 {
	struct root_list *pl;
	G *gp;
	FinBlock *fb;
+	MSpan *s, **allspans;
+	uint32 spanidx;
+
+	work.nroot = 0;
 
	// mark data+bss.
	for(pl = roots; pl != nil; pl = pl->next) {
@@ -692,20 +656,36 @@ mark(void (*scan)(byte*, int64))
			void *decl = pr->decl;
			if(decl == nil)
				break;
-			scanblock(decl, pr->size);
+			addroot(decl, pr->size);
			pr++;
		}
	}
 
-	scan((byte*)&runtime_m0, sizeof runtime_m0);
-	scan((byte*)&runtime_g0, sizeof runtime_g0);
-	scan((byte*)&runtime_allg, sizeof runtime_allg);
-	scan((byte*)&runtime_allm, sizeof runtime_allm);
-	runtime_MProf_Mark(scan);
-	runtime_time_scan(scan);
-	runtime_trampoline_scan(scan);
+	addroot((byte*)&runtime_m0, sizeof runtime_m0);
+	addroot((byte*)&runtime_g0, sizeof runtime_g0);
+	addroot((byte*)&runtime_allg, sizeof runtime_allg);
+	addroot((byte*)&runtime_allm, sizeof runtime_allm);
+	runtime_MProf_Mark(addroot);
+	runtime_time_scan(addroot);
+	runtime_trampoline_scan(addroot);
+
+	// MSpan.types
+	allspans = runtime_mheap.allspans;
+	for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) {
+		s = allspans[spanidx];
+		if(s->state == MSpanInUse) {
+			switch(s->types.compression) {
+			case MTypes_Empty:
+			case MTypes_Single:
+				break;
+			case MTypes_Words:
+			case MTypes_Bytes:
+				addroot((byte*)&s->types.data, sizeof(void*));
+				break;
+			}
+		}
+	}
 
-	// mark stacks
	for(gp=runtime_allg; gp!=nil; gp=gp->alllink) {
		switch(gp->status){
		default:
@@ -716,27 +696,22 @@ mark(void (*scan)(byte*, int64))
		case Grunning:
			if(gp != runtime_g())
				runtime_throw("mark - world not stopped");
-			scanstack(scan, gp);
+			addstackroots(gp);
			break;
		case Grunnable:
		case Gsyscall:
		case Gwaiting:
-			scanstack(scan, gp);
+			addstackroots(gp);
			break;
		}
	}
 
-	// mark things pointed at by objects with finalizers
-	if(scan == debug_scanblock)
-		runtime_walkfintab(debug_markfin, scan);
-	else
-		runtime_walkfintab(markfin, scan);
+	runtime_walkfintab(addfinroots, addroot);
 
	for(fb=allfin; fb; fb=fb->alllink)
-		scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
+		addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
 
-	// in multiproc mode, join in the queued work.
-	scan(nil, 0);
+	addroot((byte*)&work, sizeof work);
 }
 
 static bool
@@ -771,122 +746,149 @@ handlespecial(byte *p, uintptr size)
	f->fn = fn;
	f->ft = ft;
	f->arg = p;
-	runtime_unlock(&finlock); 
+	runtime_unlock(&finlock);
	return true;
 }
 
 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
 // It clears the mark bits in preparation for the next GC round.
 static void
-sweep(void)
+sweepspan(ParFor *desc, uint32 idx)
 {
	M *m;
-	MSpan *s;
	int32 cl, n, npages;
	uintptr size;
	byte *p;
	MCache *c;
	byte *arena_start;
-	int64 now;
+	MLink head, *end;
+	int32 nfree;
+	byte *type_data;
+	byte compression;
+	uintptr type_data_inc;
+	MSpan *s;
 
	m = runtime_m();
+
+	USED(&desc);
+	s = runtime_mheap.allspans[idx];
+	// Stamp newly unused spans. The scavenger will use that
+	// info to potentially give back some pages to the OS.
+	if(s->state == MSpanFree && s->unusedsince == 0)
+		s->unusedsince = runtime_nanotime();
+	if(s->state != MSpanInUse)
+		return;
	arena_start = runtime_mheap.arena_start;
-	now = runtime_nanotime();
+	p = (byte*)(s->start << PageShift);
+	cl = s->sizeclass;
+	size = s->elemsize;
+	if(cl == 0) {
+		n = 1;
+	} else {
+		// Chunk full of small blocks.
+		npages = runtime_class_to_allocnpages[cl];
+		n = (npages << PageShift) / size;
+	}
+	nfree = 0;
+	end = &head;
+	c = m->mcache;
+
+	type_data = (byte*)s->types.data;
+	type_data_inc = sizeof(uintptr);
+	compression = s->types.compression;
+	switch(compression) {
+	case MTypes_Bytes:
+		type_data += 8*sizeof(uintptr);
+		type_data_inc = 1;
+		break;
+	}
 
-	for(;;) {
-		s = work.spans;
-		if(s == nil)
-			break;
-		if(!runtime_casp(&work.spans, s, s->allnext))
-			continue;
+	// Sweep through n objects of given size starting at p.
+	// This thread owns the span now, so it can manipulate
+	// the block bitmap without atomic operations.
+	for(; n > 0; n--, p += size, type_data+=type_data_inc) {
+		uintptr off, *bitp, shift, bits;
 
-		// Stamp newly unused spans. The scavenger will use that
-		// info to potentially give back some pages to the OS.
-		if(s->state == MSpanFree && s->unusedsince == 0)
-			s->unusedsince = now;
+		off = (uintptr*)p - (uintptr*)arena_start;
+		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
+		shift = off % wordsPerBitmapWord;
+		bits = *bitp>>shift;
 
-		if(s->state != MSpanInUse)
+		if((bits & bitAllocated) == 0)
			continue;
 
-		p = (byte*)(s->start << PageShift);
-		cl = s->sizeclass;
-		if(cl == 0) {
-			size = s->npages<<PageShift;
-			n = 1;
-		} else {
-			// Chunk full of small blocks.
-			size = runtime_class_to_size[cl];
-			npages = runtime_class_to_allocnpages[cl];
-			n = (npages << PageShift) / size;
+		if((bits & bitMarked) != 0) {
+			if(DebugMark) {
+				if(!(bits & bitSpecial))
+					runtime_printf("found spurious mark on %p\n", p);
+				*bitp &= ~(bitSpecial<<shift);
+			}
+			*bitp &= ~(bitMarked<<shift);
+			continue;
		}
 
-		// Sweep through n objects of given size starting at p.
-		// This thread owns the span now, so it can manipulate
-		// the block bitmap without atomic operations.
-		for(; n > 0; n--, p += size) {
-			uintptr off, *bitp, shift, bits;
-
-			off = (uintptr*)p - (uintptr*)arena_start;
-			bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-			shift = off % wordsPerBitmapWord;
-			bits = *bitp>>shift;
-
-			if((bits & bitAllocated) == 0)
+		// Special means it has a finalizer or is being profiled.
+		// In DebugMark mode, the bit has been coopted so
+		// we have to assume all blocks are special.
+		if(DebugMark || (bits & bitSpecial) != 0) {
+			if(handlespecial(p, size))
				continue;
+		}
 
-			if((bits & bitMarked) != 0) {
-				if(DebugMark) {
-					if(!(bits & bitSpecial))
-						runtime_printf("found spurious mark on %p\n", p);
-					*bitp &= ~(bitSpecial<<shift);
-				}
-				*bitp &= ~(bitMarked<<shift);
-				continue;
-			}
-
-			// Special means it has a finalizer or is being profiled.
-			// In DebugMark mode, the bit has been coopted so
-			// we have to assume all blocks are special.
-			if(DebugMark || (bits & bitSpecial) != 0) {
-				if(handlespecial(p, size))
-					continue;
-			}
-
-			// Mark freed; restore block boundary bit.
-			*bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
+		// Mark freed; restore block boundary bit.
+		*bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
 
-			c = m->mcache;
-			if(s->sizeclass == 0) {
-				// Free large span.
-				runtime_unmarkspan(p, 1<<PageShift);
-				*(uintptr*)p = 1;	// needs zeroing
-				runtime_MHeap_Free(&runtime_mheap, s, 1);
-			} else {
-				// Free small object.
-				if(size > sizeof(uintptr))
-					((uintptr*)p)[1] = 1;	// mark as "needs to be zeroed"
-				c->local_by_size[s->sizeclass].nfree++;
-				runtime_MCache_Free(c, p, s->sizeclass, size);
-			}
+		if(cl == 0) {
+			// Free large span.
+			runtime_unmarkspan(p, 1<<PageShift);
+			*(uintptr*)p = 1;	// needs zeroing
+			runtime_MHeap_Free(&runtime_mheap, s, 1);
			c->local_alloc -= size;
			c->local_nfree++;
+		} else {
+			// Free small object.
+			switch(compression) {
+			case MTypes_Words:
+				*(uintptr*)type_data = 0;
+				break;
+			case MTypes_Bytes:
+				*(byte*)type_data = 0;
+				break;
+			}
+			if(size > sizeof(uintptr))
+				((uintptr*)p)[1] = 1;	// mark as "needs to be zeroed"
+
+			end->next = (MLink*)p;
+			end = (MLink*)p;
+			nfree++;
		}
	}
+
+	if(nfree) {
+		c->local_by_size[cl].nfree += nfree;
+		c->local_alloc -= size * nfree;
+		c->local_nfree += nfree;
+		c->local_cachealloc -= nfree * size;
+		c->local_objects -= nfree;
+		runtime_MCentral_FreeSpan(&runtime_mheap.central[cl], s, nfree, head.next, end);
+	}
 }
 
 void
 runtime_gchelper(void)
 {
-	// Wait until main proc is ready for mark help.
-	runtime_lock(&work.markgate);
-	runtime_unlock(&work.markgate);
+	// parallel mark for over gc roots
+	runtime_parfordo(work.markfor);
+	// help other threads scan secondary blocks
	scanblock(nil, 0);
 
-	// Wait until main proc is ready for sweep help.
-	runtime_lock(&work.sweepgate);
-	runtime_unlock(&work.sweepgate);
-	sweep();
+	if(DebugMark) {
+		// wait while the main thread executes mark(debug_scanblock)
+		while(runtime_atomicload(&work.debugmarkdone) == 0)
+			runtime_usleep(10);
+	}
+
+	runtime_parfordo(work.sweepfor);
	if(runtime_xadd(&work.ndone, +1) == work.nproc-1)
		runtime_notewakeup(&work.alldone);
 }
@@ -912,21 +914,31 @@ stealcache(void)
 }
 
 static void
-cachestats(void)
+cachestats(GCStats *stats)
 {
	M *m;
	MCache *c;
	uint32 i;
	uint64 stacks_inuse;
	uint64 stacks_sys;
+	uint64 *src, *dst;
 
+	if(stats)
+		runtime_memclr((byte*)stats, sizeof(*stats));
	stacks_inuse = 0;
	stacks_sys = runtime_stacks_sys;
	for(m=runtime_allm; m; m=m->alllink) {
-		runtime_purgecachedstats(m);
+		c = m->mcache;
+		runtime_purgecachedstats(c);
		// stacks_inuse += m->stackalloc->inuse;
		// stacks_sys += m->stackalloc->sys;
-		c = m->mcache;
+		if(stats) {
+			src = (uint64*)&m->gcstats;
+			dst = (uint64*)stats;
+			for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
+				dst[i] += src[i];
+			runtime_memclr((byte*)&m->gcstats, sizeof(m->gcstats));
+		}
		for(i=0; i<nelem(c->local_by_size); i++) {
			mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc;
			c->local_by_size[i].nmalloc = 0;
@@ -945,7 +957,15 @@ runtime_gc(int32 force)
	int64 t0, t1, t2, t3;
	uint64 heap0, heap1, obj0, obj1;
	const byte *p;
-	bool extra;
+	GCStats stats;
+	M *m1;
+	uint32 i;
+
+	// The atomic operations are not atomic if the uint64s
+	// are not aligned on uint64 boundaries. This has been
+	// a problem in the past.
+	if((((uintptr)&work.empty) & 7) != 0)
+		runtime_throw("runtime: gc work buffer is misaligned");
 
	// Make sure all registers are saved on stack so that
	// scanstack sees them.
@@ -986,48 +1006,67 @@ runtime_gc(int32 force)
	}
 
	t0 = runtime_nanotime();
-	nhandoff = 0;
 
	m->gcing = 1;
	runtime_stoptheworld();
 
-	cachestats();
-	heap0 = mstats.heap_alloc;
-	obj0 = mstats.nmalloc - mstats.nfree;
+	for(m1=runtime_allm; m1; m1=m1->alllink)
+		runtime_settype_flush(m1, false);
 
-	runtime_lock(&work.markgate);
-	runtime_lock(&work.sweepgate);
-
-	extra = false;
-	work.nproc = 1;
-	if(runtime_gomaxprocs > 1 && runtime_ncpu > 1) {
-		runtime_noteclear(&work.alldone);
-		work.nproc += runtime_helpgc(&extra);
+	heap0 = 0;
+	obj0 = 0;
+	if(gctrace) {
+		cachestats(nil);
+		heap0 = mstats.heap_alloc;
+		obj0 = mstats.nmalloc - mstats.nfree;
	}
 
+	work.nwait = 0;
	work.ndone = 0;
+	work.debugmarkdone = 0;
+	work.nproc = runtime_gcprocs();
+	addroots();
+	m->locks++;	// disable gc during mallocs in parforalloc
+	if(work.markfor == nil)
+		work.markfor = runtime_parforalloc(MaxGcproc);
+	runtime_parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
+	if(work.sweepfor == nil)
+		work.sweepfor = runtime_parforalloc(MaxGcproc);
+	runtime_parforsetup(work.sweepfor, work.nproc, runtime_mheap.nspan, nil, true, sweepspan);
+	m->locks--;
+	if(work.nproc > 1) {
+		runtime_noteclear(&work.alldone);
+		runtime_helpgc(work.nproc);
+	}
 
-	runtime_unlock(&work.markgate);  // let the helpers in
-	mark(scanblock);
-	if(DebugMark)
-		mark(debug_scanblock);
+	runtime_parfordo(work.markfor);
+	scanblock(nil, 0);
+
+	if(DebugMark) {
+		for(i=0; i<work.nroot; i++)
+			debug_scanblock(work.roots[i].p, work.roots[i].n);
+		runtime_atomicstore(&work.debugmarkdone, 1);
+	}
	t1 = runtime_nanotime();
 
-	work.spans = runtime_mheap.allspans;
-	runtime_unlock(&work.sweepgate);  // let the helpers in
-	sweep();
-	if(work.nproc > 1)
-		runtime_notesleep(&work.alldone);
+	runtime_parfordo(work.sweepfor);
	t2 = runtime_nanotime();
 
	stealcache();
-	cachestats();
+	cachestats(&stats);
+
+	if(work.nproc > 1)
+		runtime_notesleep(&work.alldone);
+
+	stats.nprocyield += work.sweepfor->nprocyield;
+	stats.nosyield += work.sweepfor->nosyield;
+	stats.nsleep += work.sweepfor->nsleep;
 
	mstats.next_gc = mstats.heap_alloc+(mstats.heap_alloc-runtime_stacks_sys)*gcpercent/100;
	m->gcing = 0;
 
-	m->locks++;	// disable gc during the mallocs in newproc
	if(finq != nil) {
+		m->locks++;	// disable gc during the mallocs in newproc
		// kick off or wake up goroutine to run queued finalizers
		if(fing == nil)
			fing = __go_go(runfinq, nil);
@@ -1035,10 +1074,9 @@ runtime_gc(int32 force)
			fingwait = 0;
			runtime_ready(fing);
		}
+		m->locks--;
	}
-	m->locks--;
 
-	cachestats();
	heap1 = mstats.heap_alloc;
	obj1 = mstats.nmalloc - mstats.nfree;
 
@@ -1051,26 +1089,22 @@ runtime_gc(int32 force)
		runtime_printf("pause %D\n", t3-t0);
 
	if(gctrace) {
-		runtime_printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects\n",
+		runtime_printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects,"
+				" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
			mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
			heap0>>20, heap1>>20,
			obj0, obj1,
-			mstats.nmalloc, mstats.nfree);
+			mstats.nmalloc, mstats.nfree,
+			stats.nhandoff, stats.nhandoffcnt,
+			work.sweepfor->nsteal, work.sweepfor->nstealcnt,
+			stats.nprocyield, stats.nosyield, stats.nsleep);
	}
-	
+
	runtime_MProf_GC();
	runtime_semrelease(&runtime_worldsema);
+	runtime_starttheworld();
 
-	// If we could have used another helper proc, start one now,
-	// in the hope that it will be available next time.
-	// It would have been even better to start it before the collection,
-	// but doing so requires allocating memory, so it's tricky to
-	// coordinate. This lazy approach works out in practice:
-	// we don't mind if the first couple gc rounds don't have quite
-	// the maximum number of procs.
-	runtime_starttheworld(extra);
-
-	// give the queued finalizers, if any, a chance to run
-	if(finq != nil)
+	// give the queued finalizers, if any, a chance to run
+	if(finq != nil)
		runtime_gosched();
 
	if(gctrace > 1 && !force)
@@ -1093,22 +1127,23 @@ runtime_ReadMemStats(MStats *stats)
	m = runtime_m();
	m->gcing = 1;
	runtime_stoptheworld();
-	cachestats();
+	cachestats(nil);
	*stats = mstats;
	m->gcing = 0;
	runtime_semrelease(&runtime_worldsema);
-	runtime_starttheworld(false);
+	runtime_starttheworld();
 }
 
 static void
 runfinq(void* dummy __attribute__ ((unused)))
 {
-	G* gp;
	Finalizer *f;
	FinBlock *fb, *next;
	uint32 i;
 
-	gp = runtime_g();
+	if(raceenabled)
+		runtime_racefingo();
+
	for(;;) {
		// There's no need for a lock in this section
		// because it only conflicts with the garbage
@@ -1120,9 +1155,7 @@ runfinq(void* dummy __attribute__ ((unused)))
		finq = nil;
		if(fb == nil) {
			fingwait = 1;
-			gp->status = Gwaiting;
-			gp->waitreason = "finalizer wait";
-			runtime_gosched();
+			runtime_park(nil, nil, "finalizer wait");
			continue;
		}
		for(; fb; fb=next) {
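One more pattern worth calling out from the new sweepspan above: dead small objects are no longer returned to the per-M cache one call at a time; they are threaded into a free list through the blocks themselves with the MLink head/end pair and handed back with a single runtime_MCentral_FreeSpan call per span. The standalone sketch below reproduces just that batching idiom; free_batch() and the malloc'ed span are hypothetical stand-ins, not runtime APIs.

/* Illustrative sketch of the head/end batching used by sweepspan. */
#include <stdio.h>
#include <stdlib.h>

typedef struct MLink MLink;
struct MLink { MLink *next; };   /* the free list lives inside the dead blocks */

/* Stand-in for runtime_MCentral_FreeSpan: receives the whole chain at once. */
static void free_batch(MLink *first, MLink *last, int nfree) {
	(void)first; (void)last;
	printf("returning %d objects to the central list in one call\n", nfree);
}

int main(void) {
	enum { N = 8, SIZE = 64 };
	char *span = malloc(N * SIZE);           /* pretend this is one span */
	int dead[N] = {1, 0, 1, 1, 0, 0, 1, 0};  /* pretend sweep results */
	MLink head, *end = &head;                /* dummy head node, as in sweepspan */
	int nfree = 0;

	if (!span)
		return 1;
	for (int i = 0; i < N; i++) {
		if (!dead[i])
			continue;
		MLink *p = (MLink*)(span + i*SIZE);
		end->next = p;                   /* append with no empty-list special case */
		end = p;
		nfree++;
	}
	end->next = NULL;
	if (nfree)
		free_batch(head.next, end, nfree);  /* once per span, not once per object */
	free(span);
	return 0;
}

This keeps the per-object sweep loop free of calls into the allocator; everything freed from the span is returned in one operation at the end.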