Skip to content

Commit

Permalink
More fixes for #5654
Browse files Browse the repository at this point in the history
* In stg_ap_0_fast, if we're evaluating a thunk, the thunk might
  evaluate to a function in which case we may have to adjust its CCS.

* The interpreter has its own implementation of stg_ap_0_fast, so we
  have to do the same shenanigans with creating empty PAPs and copying
  PAPs there.

* GHCi creates Cost Centres as children of CCS_MAIN, which enterFunCCS()
  wrongly assumed to imply that they were CAFs.  Now we use the is_caf
  flag for this, which we have to correctly initialise when we create a
  Cost Centre in GHCi.
  • Loading branch information
simonmar committed Jan 6, 2017
1 parent 5088110 commit 3a18baf
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 62 deletions.
1 change: 1 addition & 0 deletions includes/stg/MiscClosures.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ RTS_RET(stg_maskUninterruptiblezh_ret);
RTS_RET(stg_maskAsyncExceptionszh_ret);
RTS_RET(stg_stack_underflow_frame);
RTS_RET(stg_restore_cccs);
RTS_RET(stg_restore_cccs_eval);

// RTS_FUN(stg_interp_constr1_entry);
// RTS_FUN(stg_interp_constr2_entry);
Expand Down
27 changes: 27 additions & 0 deletions rts/Apply.cmm
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ stg_ap_0_fast ( P_ fun )
The mechanism we use to wrap the function is to create a
zero-argument PAP as a proxy object to hold the new CCS, and return
that.

If the closure we evaluated is itself a PAP, we cannot make a nested
PAP, so we copy the original PAP and set the CCS in the new PAP to
enterFunCCS(pap->header.prof.ccs).
*/

again:
Expand Down Expand Up @@ -122,6 +126,8 @@ again:
CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
P_ pap;
pap = Hp - size + WDS(1);
// We'll lose the original PAP, so we should enter its CCS
ccall enterFunCCS(BaseReg "ptr", StgHeader_ccs(fun) "ptr");
SET_HDR(pap, stg_PAP_info, CCCS);
StgPAP_arity(pap) = StgPAP_arity(fun);
StgPAP_n_args(pap) = StgPAP_n_args(fun);
Expand All @@ -137,6 +143,27 @@ again:
goto loop;
}
}
case AP,
AP_STACK,
BLACKHOLE,
WHITEHOLE,
THUNK,
THUNK_1_0,
THUNK_0_1,
THUNK_2_0,
THUNK_1_1,
THUNK_0_2,
THUNK_STATIC,
THUNK_SELECTOR:
{
// The thunk might evaluate to a function, so we have to come
// back here again to adjust its CCS if necessary. The
// stg_restore_ccs_eval stack frame does that.
STK_CHK_GEN();
jump %ENTRY_CODE(info)
(stg_restore_cccs_eval_info, CCCS)
(UNTAG(fun));
}
default:
{
jump %ENTRY_CODE(info) (UNTAG(fun));
Expand Down
72 changes: 66 additions & 6 deletions rts/Interpreter.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,48 @@ void interp_shutdown ( void )

#endif

#ifdef PROFILING

//
// Build a zero-argument PAP with the current CCS
// See Note [Evaluating functions with profiling] in Apply.cmm
//
STATIC_INLINE
StgClosure * newEmptyPAP (Capability *cap,
StgClosure *tagged_obj, // a FUN or a BCO
uint32_t arity)
{
StgPAP *pap = (StgPAP *)allocate(cap, sizeofW(StgPAP));
SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
pap->arity = arity;
pap->n_args = 0;
pap->fun = tagged_obj;
return (StgClosure *)pap;
}

//
// Make an exact copy of a PAP, except that we combine the current CCS with the
// CCS in the PAP. See Note [Evaluating functions with profiling] in Apply.cmm
//
STATIC_INLINE
StgClosure * copyPAP (Capability *cap, StgPAP *oldpap)
{
uint32_t size = PAP_sizeW(oldpap->n_args);
StgPAP *pap = (StgPAP *)allocate(cap, size);
enterFunCCS(&cap->r, oldpap->header.prof.ccs);
SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
pap->arity = oldpap->arity;
pap->n_args = oldpap->n_args;
pap->fun = oldpap->fun;
uint32_t i;
for (i = 0; i < ((StgPAP *)pap)->n_args; i++) {
pap->payload[i] = oldpap->payload[i];
}
return (StgClosure *)pap;
}

#endif

static StgWord app_ptrs_itbl[] = {
(W_)&stg_ap_p_info,
(W_)&stg_ap_pp_info,
Expand Down Expand Up @@ -343,22 +385,39 @@ interpretBCO (Capability* cap)
case CONSTR_1_1:
case CONSTR_0_2:
case CONSTR_NOCAF:
break;

case FUN:
case FUN_1_0:
case FUN_0_1:
case FUN_2_0:
case FUN_1_1:
case FUN_0_2:
case FUN_STATIC:
#ifdef PROFILING
if (cap->r.rCCCS != obj->header.prof.ccs) {
tagged_obj =
newEmptyPAP(cap, tagged_obj, get_fun_itbl(obj)->f.arity);
}
#endif
break;

case PAP:
// already in WHNF
#ifdef PROFILING
if (cap->r.rCCCS != obj->header.prof.ccs) {
tagged_obj = copyPAP(cap, (StgPAP *)obj);
}
#endif
break;

case BCO:
{
ASSERT(((StgBCO *)obj)->arity > 0);
#ifdef PROFILING
if (cap->r.rCCCS != obj->header.prof.ccs) {
tagged_obj = newEmptyPAP(cap, tagged_obj, ((StgBCO *)obj)->arity);
}
#endif
break;
}

case AP: /* Copied from stg_AP_entry. */
{
Expand All @@ -380,7 +439,7 @@ interpretBCO (Capability* cap)
// restore the CCCS after evaluating the AP
Sp -= 2;
Sp[1] = (W_)cap->r.rCCCS;
Sp[0] = (W_)&stg_restore_cccs_info;
Sp[0] = (W_)&stg_restore_cccs_eval_info;
#endif

Sp -= sizeofW(StgUpdateFrame);
Expand Down Expand Up @@ -425,7 +484,7 @@ interpretBCO (Capability* cap)
// restore the CCCS after evaluating the closure
Sp -= 2;
Sp[1] = (W_)cap->r.rCCCS;
Sp[0] = (W_)&stg_restore_cccs_info;
Sp[0] = (W_)&stg_restore_cccs_eval_info;
#endif
Sp -= 2;
Sp[1] = (W_)tagged_obj;
Expand Down Expand Up @@ -465,7 +524,8 @@ interpretBCO (Capability* cap)
// NOTE: not using get_itbl().
info = ((StgClosure *)Sp)->header.info;

if (info == (StgInfoTable *)&stg_restore_cccs_info) {
if (info == (StgInfoTable *)&stg_restore_cccs_info ||
info == (StgInfoTable *)&stg_restore_cccs_eval_info) {
cap->r.rCCCS = (CostCentreStack*)Sp[1];
Sp += 2;
goto do_return;
Expand Down
5 changes: 5 additions & 0 deletions rts/Printer.c
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,11 @@ printStackChunk( StgPtr sp, StgPtr spBottom )
fprintCCS(stderr, (CostCentreStack*)sp[1]);
debugBelch("\n" );
continue;
} else if (c == (StgWord)&stg_restore_cccs_eval_info) {
debugBelch("stg_restore_cccs_eval_info\n" );
fprintCCS(stderr, (CostCentreStack*)sp[1]);
debugBelch("\n" );
continue;
#endif
} else {
debugBelch("RET_SMALL (%p)\n", info);
Expand Down
6 changes: 5 additions & 1 deletion rts/Profiling.c
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,10 @@ CostCentre *mkCostCentre (char *label, char *module, char *srcloc)
cc->label = label;
cc->module = module;
cc->srcloc = srcloc;
cc->is_caf = 0;
cc->mem_alloc = 0;
cc->time_ticks = 0;
cc->link = NULL;
return cc;
}

Expand Down Expand Up @@ -379,7 +383,7 @@ void enterFunCCS (StgRegTable *reg, CostCentreStack *ccsfn)
}

// common case 2: the function stack is empty, or just CAF
if (ccsfn->prevStack == CCS_MAIN) {
if (ccsfn->cc->is_caf) {
return;
}

Expand Down
10 changes: 10 additions & 0 deletions rts/StgMiscClosures.cmm
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@ INFO_TABLE_RET (stg_restore_cccs, RET_SMALL, W_ info_ptr, W_ cccs)
jump %ENTRY_CODE(Sp(0)) [*]; // NB. all registers live!
}


INFO_TABLE_RET (stg_restore_cccs_eval, RET_SMALL, W_ info_ptr, W_ cccs)
return (P_ ret)
{
#if defined(PROFILING)
CCCS = cccs;
#endif
jump stg_ap_0_fast(ret);
}

/* ----------------------------------------------------------------------------
Support for the bytecode interpreter.
------------------------------------------------------------------------- */
Expand Down
2 changes: 1 addition & 1 deletion testsuite/tests/codeGen/should_run/cgrun057.stderr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
*** Exception (reporting due to +RTS -xc): (THUNK_2_0), stack trace:
*** Exception (reporting due to +RTS -xc): (THUNK_STATIC), stack trace:
Main.g,
called from Main.f,
called from Main.main,
Expand Down
65 changes: 32 additions & 33 deletions testsuite/tests/profiling/should_run/T680.prof.sample
Original file line number Diff line number Diff line change
@@ -1,42 +1,41 @@
Thu Dec 8 15:23 2016 Time and Allocation Profiling Report (Final)
Tue Dec 20 13:18 2016 Time and Allocation Profiling Report (Final)

T680 +RTS -hc -p -RTS
T680 +RTS -p -RTS

total time = 0.00 secs (0 ticks @ 1000 us, 1 processor)
total alloc = 753,032 bytes (excludes profiling overheads)
total time = 0.20 secs (204 ticks @ 1000 us, 1 processor)
total alloc = 449,729,208 bytes (excludes profiling overheads)

COST CENTRE MODULE SRC %time %alloc
COST CENTRE MODULE SRC %time %alloc

CAF GHC.IO.Handle.FD <entire-module> 0.0 4.6
main Main T680.hs:20:1-14 0.0 1.2
foo.\ Main T680.hs:3:12-40 0.0 25.5
foo.bar Main T680.hs:(5,3)-(9,38) 0.0 29.8
foo.bar.\ Main T680.hs:(8,11)-(9,38) 0.0 38.2
foo.\ Main T680.hs:3:12-40 98.5 99.8


individual inherited
COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc

MAIN MAIN <built-in> 104 0 0.0 0.1 0.0 100.0
CAF Main <entire-module> 207 0 0.0 0.0 0.0 93.5
k Main T680.hs:12:1-17 211 1 0.0 0.0 0.0 0.0
foo Main T680.hs:(2,1)-(9,38) 212 1 0.0 0.0 0.0 0.0
foo.bar Main T680.hs:(5,3)-(9,38) 216 1 0.0 0.0 0.0 0.0
foo.bar.k' Main T680.hs:6:9-34 220 1 0.0 0.0 0.0 0.0
k.\ Main T680.hs:12:16 221 1 0.0 0.0 0.0 0.0
main Main T680.hs:20:1-14 208 1 0.0 0.0 0.0 0.0
r Main T680.hs:18:1-26 210 1 0.0 0.0 0.0 93.5
k Main T680.hs:12:1-17 213 0 0.0 0.0 0.0 93.5
foo Main T680.hs:(2,1)-(9,38) 214 0 0.0 0.0 0.0 93.5
foo.\ Main T680.hs:3:12-40 215 4001 0.0 25.5 0.0 25.5
foo.bar Main T680.hs:(5,3)-(9,38) 217 0 0.0 29.8 0.0 68.0
foo.bar.\ Main T680.hs:(8,11)-(9,38) 218 4001 0.0 38.2 0.0 38.2
foo.bar.\.k'' Main T680.hs:8:15-27 219 4000 0.0 0.0 0.0 0.0
foo.bar.k' Main T680.hs:6:9-34 222 0 0.0 0.0 0.0 0.0
CAF GHC.Conc.Signal <entire-module> 201 0 0.0 0.1 0.0 0.1
CAF GHC.IO.Encoding <entire-module> 191 0 0.0 0.4 0.0 0.4
CAF GHC.IO.Encoding.Iconv <entire-module> 189 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Handle.FD <entire-module> 181 0 0.0 4.6 0.0 4.6
CAF GHC.IO.Handle.Text <entire-module> 179 0 0.0 0.0 0.0 0.0
CAF GHC.Show <entire-module> 165 0 0.0 0.0 0.0 0.0
main Main T680.hs:20:1-14 209 0 0.0 1.2 0.0 1.2
MAIN MAIN <built-in> 108 0 0.0 0.0 100.0 100.0
CAF Main <entire-module> 215 0 0.0 0.0 100.0 100.0
k Main T680.hs:12:1-17 219 1 0.0 0.0 0.0 0.0
foo Main T680.hs:(2,1)-(9,38) 220 1 0.0 0.0 0.0 0.0
foo.bar Main T680.hs:(5,3)-(9,38) 224 1 0.0 0.0 0.0 0.0
foo.bar.k' Main T680.hs:6:9-34 228 1 0.0 0.0 0.0 0.0
k.\ Main T680.hs:12:16 229 1 0.0 0.0 0.0 0.0
main Main T680.hs:20:1-14 216 1 0.0 0.0 0.0 0.0
r Main T680.hs:18:1-26 218 1 0.0 0.0 100.0 100.0
k Main T680.hs:12:1-17 221 0 0.0 0.0 100.0 100.0
foo Main T680.hs:(2,1)-(9,38) 222 0 0.0 0.0 100.0 100.0
foo.\ Main T680.hs:3:12-40 223 4001 98.5 99.8 98.5 99.8
foo.bar Main T680.hs:(5,3)-(9,38) 225 0 0.5 0.0 1.5 0.2
foo.bar.\ Main T680.hs:(8,11)-(9,38) 226 4001 1.0 0.1 1.0 0.1
foo.bar.\.k'' Main T680.hs:8:15-27 227 4000 0.0 0.0 0.0 0.0
foo.bar.k' Main T680.hs:6:9-34 232 0 0.0 0.0 0.0 0.0
k.\ Main T680.hs:12:16 233 0 0.0 0.0 0.0 0.0
foo.bar.k' Main T680.hs:6:9-34 230 0 0.0 0.0 0.0 0.0
k.\ Main T680.hs:12:16 231 0 0.0 0.0 0.0 0.0
CAF GHC.Conc.Signal <entire-module> 206 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Encoding <entire-module> 194 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Encoding.Iconv <entire-module> 192 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Handle.FD <entire-module> 184 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Handle.Text <entire-module> 182 0 0.0 0.0 0.0 0.0
CAF GHC.Show <entire-module> 168 0 0.0 0.0 0.0 0.0
main Main T680.hs:20:1-14 217 0 0.0 0.0 0.0 0.0
3 changes: 1 addition & 2 deletions testsuite/tests/profiling/should_run/all.T
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,7 @@ test('T5559', [], compile_and_run, [''])

test('callstack001',
# unoptimised results are different w.r.t. CAF attribution
[ expect_broken_for_10037,
omit_ways(['ghci-ext-prof']), # produces a different stack
[ omit_ways(['ghci-ext-prof']), # produces a different stack
], compile_and_run,
['-fprof-auto-calls -fno-full-laziness -fno-state-hack'])

Expand Down
41 changes: 22 additions & 19 deletions testsuite/tests/profiling/should_run/toplevel_scc_1.prof.sample
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
Tue Jul 19 08:36 2016 Time and Allocation Profiling Report (Final)
Tue Dec 20 14:22 2016 Time and Allocation Profiling Report (Final)

toplevel_scc_1 +RTS -p -RTS

total time = 0.00 secs (0 ticks @ 1000 us, 1 processor)
total alloc = 79,792 bytes (excludes profiling overheads)
total alloc = 75,880 bytes (excludes profiling overheads)

COST CENTRE MODULE SRC %time %alloc

MAIN MAIN <built-in> 0.0 24.0
CAF GHC.Read <entire-module> 0.0 1.2
CAF GHC.IO.Handle.FD <entire-module> 0.0 64.9
CAF GHC.IO.Encoding <entire-module> 0.0 3.5
CAF Main <entire-module> 0.0 27.6
CAF GHC.IO.Handle.FD <entire-module> 0.0 68.3
CAF GHC.IO.Encoding <entire-module> 0.0 3.6


individual inherited
COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc
individual inherited
COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc

MAIN MAIN <built-in> 105 0 0.0 0.4 0.0 100.0
CAF Main <entire-module> 209 0 0.0 27.6 0.0 27.9
Main.f1 Main toplevel_scc_1.hs:4:1-2 212 1 0.0 0.0 0.0 0.0
Main.foo Main toplevel_scc_1.hs:7:1-2 210 1 0.0 0.2 0.0 0.3
Main.bar Main toplevel_scc_1.hs:10:5-6 211 1 0.0 0.1 0.0 0.1
CAF GHC.Conc.Signal <entire-module> 203 0 0.0 0.8 0.0 0.8
CAF GHC.IO.Encoding <entire-module> 193 0 0.0 3.5 0.0 3.5
CAF GHC.IO.Encoding.Iconv <entire-module> 191 0 0.0 0.3 0.0 0.3
CAF GHC.IO.Handle.FD <entire-module> 183 0 0.0 64.9 0.0 64.9
CAF GHC.IO.Handle.Text <entire-module> 181 0 0.0 0.1 0.0 0.1
CAF GHC.Read <entire-module> 171 0 0.0 1.2 0.0 1.2
CAF Text.Read.Lex <entire-module> 154 0 0.0 0.8 0.0 0.8
MAIN MAIN <built-in> 105 0 0.0 24.0 0.0 100.0
CAF Main <entire-module> 209 0 0.0 0.3 0.0 0.5
Main.f1 Main toplevel_scc_1.hs:4:1-2 214 1 0.0 0.0 0.0 0.0
Main.foo Main toplevel_scc_1.hs:7:1-2 210 1 0.0 0.2 0.0 0.2
Main.bar Main toplevel_scc_1.hs:10:5-6 212 1 0.0 0.0 0.0 0.0
CAF GHC.Conc.Signal <entire-module> 203 0 0.0 0.8 0.0 0.8
CAF GHC.IO.Encoding <entire-module> 193 0 0.0 3.6 0.0 3.6
CAF GHC.IO.Encoding.Iconv <entire-module> 191 0 0.0 0.3 0.0 0.3
CAF GHC.IO.Handle.FD <entire-module> 183 0 0.0 68.3 0.0 68.3
CAF GHC.IO.Handle.Text <entire-module> 181 0 0.0 0.1 0.0 0.1
CAF GHC.Read <entire-module> 171 0 0.0 1.2 0.0 1.2
CAF Text.Read.Lex <entire-module> 154 0 0.0 0.9 0.0 0.9
Main.f1 Main toplevel_scc_1.hs:4:1-2 215 0 0.0 0.0 0.0 0.0
Main.foo Main toplevel_scc_1.hs:7:1-2 211 0 0.0 0.1 0.0 0.1
Main.bar Main toplevel_scc_1.hs:10:5-6 213 0 0.0 0.1 0.0 0.1

0 comments on commit 3a18baf

Please sign in to comment.