aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/ThinLTO
diff options
context:
space:
mode:
authorTeresa Johnson <tejohnson@google.com>2023-01-05 09:55:33 -0800
committerTeresa Johnson <tejohnson@google.com>2023-05-03 13:34:00 -0700
commitbf6ff4fd4b735afffc65f92a4a79f6610e7174c3 (patch)
treeae95a405deb0ca4dd671842ed58abafd5f3d227e /llvm/test/ThinLTO
parent853d212e323abd471b09adfe06f3391eda2526f9 (diff)
downloadllvm-bf6ff4fd4b735afffc65f92a4a79f6610e7174c3.zip
llvm-bf6ff4fd4b735afffc65f92a4a79f6610e7174c3.tar.gz
llvm-bf6ff4fd4b735afffc65f92a4a79f6610e7174c3.tar.bz2
[MemProf] Context disambiguation cloning pass [patch 3/4]
Applies cloning decisions to the IR, cloning functions and updating calls. For Regular LTO, the IR is updated directly during function assignment, whereas for ThinLTO it is recorded in the summary index (a subsequent patch will apply to the IR via the index during the ThinLTO backend). The function assignment and cloning proceeds greedily, and we create new clones as needed when we find an incompatible assignment of function clones to callsite clones (i.e. when different callers need to invoke different combinations of callsite clones). Depends on D140949. Differential Revision: https://reviews.llvm.org/D141077
Diffstat (limited to 'llvm/test/ThinLTO')
-rw-r--r--llvm/test/ThinLTO/X86/memprof-basic.ll35
-rw-r--r--llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll47
-rw-r--r--llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll232
-rw-r--r--llvm/test/ThinLTO/X86/memprof-indirectcall.ll31
-rw-r--r--llvm/test/ThinLTO/X86/memprof-inlined.ll28
5 files changed, 366 insertions, 7 deletions
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index eaac271..06dc6b1 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -39,13 +39,35 @@
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should have cloned bar, baz, and foo, for the cold memory allocation.
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should have cloned bar, baz, and foo, for the cold memory allocation.
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
source_filename = "memprof-basic.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -227,6 +249,11 @@ uselistorder ptr @_Z3foov, { 1, 0 }
; DUMP: Clone of [[BAR]]
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
@@ -258,3 +285,9 @@ uselistorder ptr @_Z3foov, { 1, 0 }
; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"];
; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
; DOTCLONED: }
+
+
+; DISTRIB: ^[[BAZ:[0-9]+]] = gv: (guid: 5878270615442837395, {{.*}} callsites: ((callee: ^[[BAR:[0-9]+]], clones: (0, 1)
+; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6731117468105397038, {{.*}} callsites: ((callee: ^[[BAZ]], clones: (0, 1)
+; DISTRIB: ^[[BAR]] = gv: (guid: 9832687305761716512, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1)
diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
index 6f89b36..d4aceab 100644
--- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
@@ -1,7 +1,8 @@
;; Test callsite context graph generation for call graph with MIBs
;; that have pruned contexts that partially match multiple inlined
;; callsite contexts, requiring duplication of context ids and nodes
-;; while matching callsite nodes onto the graph.
+;; while matching callsite nodes onto the graph. Also tests graph and IR
+;; cloning.
;;
;; Original code looks like:
;;
@@ -60,7 +61,9 @@
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -68,6 +71,27 @@
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t2.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
+; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
+;; We should clone D once for the cold allocations via C. These checks read the dot files written by the distributed run (path prefix ending in "2.").
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
source_filename = "duplicate-context-ids.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -104,7 +128,13 @@ entry:
ret ptr null
}
-declare i32 @main()
+define i32 @main() { ;; main is now a definition (it was only declared before); it calls @_Z1Bv, @_Z1Ev and @_Z1Fv once each
+entry:
+ call ptr @_Z1Bv()
+ call ptr @_Z1Ev()
+ call ptr @_Z1Fv()
+ ret i32 0
+}
declare void @_ZdaPv()
@@ -268,6 +298,11 @@ declare i32 @sleep()
; DUMP: Clone of [[D]]
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOTPRE: digraph "prestackupdate" {
; DOTPRE: label="prestackupdate";
; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
@@ -305,3 +340,9 @@ declare i32 @sleep()
; DOTCLONED: Node[[E]] -> Node[[D2]][tooltip="ContextIds: 1",fillcolor="cyan"];
; DOTCLONED: Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
; DOTCLONED: }
+
+; DISTRIB: ^[[C:[0-9]+]] = gv: (guid: 1643923691937891493, {{.*}} callsites: ((callee: ^[[D:[0-9]+]], clones: (1)
+; DISTRIB: ^[[D]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
+; DISTRIB: ^[[F:[0-9]+]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0)
+; DISTRIB: ^[[E:[0-9]+]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
new file mode 100644
index 0000000..59d1b1b
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
@@ -0,0 +1,232 @@
+;; Test context disambiguation for a callgraph containing multiple memprof
+;; contexts and no inlining, where we need to perform additional cloning
+;; during function assignment/cloning to handle the combination of contexts
+;; to 2 different allocations.
+;;
+;; void E(char **buf1, char **buf2) {
+;; *buf1 = new char[10];
+;; *buf2 = new char[10];
+;; }
+;;
+;; void B(char **buf1, char **buf2) {
+;; E(buf1, buf2);
+;; }
+;;
+;; void C(char **buf1, char **buf2) {
+;; E(buf1, buf2);
+;; }
+;;
+;; void D(char **buf1, char **buf2) {
+;; E(buf1, buf2);
+;; }
+;; int main(int argc, char **argv) {
+;; char *cold1, *cold2, *default1, *default2, *default3, *default4;
+;; B(&default1, &default2);
+;; C(&default3, &cold1);
+;; D(&cold2, &default4);
+;; memset(cold1, 0, 10);
+;; memset(cold2, 0, 10);
+;; memset(default1, 0, 10);
+;; memset(default2, 0, 10);
+;; memset(default3, 0, 10);
+;; memset(default4, 0, 10);
+;; delete[] default1;
+;; delete[] default2;
+;; delete[] default3;
+;; delete[] default4;
+;; sleep(10);
+;; delete[] cold1;
+;; delete[] cold2;
+;; return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+
+source_filename = "funcassigncloning.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline optnone
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) { ;; E from the source above; %buf1/%buf2 and the call results are unused in this reduced IR
+entry:
+ %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7 ;; 1st allocation: profile !0 is cold via caller id !18, notcold via !17 and !16
+ %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15 ;; 2nd allocation: profile !8 is cold via caller id !17, notcold via !16 and !18
+ ret void
+}
+
+declare ptr @_Znam(i64)
+
+define internal void @_Z1BPPcS0_() { ;; B: its callsite id (!16) appears only in notcold contexts (!6 and !10)
+entry:
+ call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16
+ ret void
+}
+
+define internal void @_Z1CPPcS0_() { ;; C: its callsite id (!17) is notcold for the 1st allocation (!4) and cold for the 2nd (!12)
+entry:
+ call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17
+ ret void
+}
+
+define internal void @_Z1DPPcS0_() { ;; D: its callsite id (!18) is cold for the 1st allocation (!2) and notcold for the 2nd (!14)
+entry:
+ call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18
+ ret void
+}
+
+; Function Attrs: noinline optnone
+define i32 @main() { ;; calls B, C and D once each, in that order; these calls carry no !callsite metadata
+entry:
+ call void @_Z1BPPcS0_()
+ call void @_Z1CPPcS0_()
+ call void @_Z1DPPcS0_()
+ ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0 }
+
+!0 = !{!1, !3, !5} ;; !memprof profile attached to the first @_Znam call in E
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603} ;; stack ids: allocation callsite (!7), then the id used by D (!18)
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949} ;; stack ids: allocation callsite (!7), then the id used by C (!17)
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071} ;; stack ids: allocation callsite (!7), then the id used by B (!16)
+!7 = !{i64 -3461278137325233666} ;; !callsite of the first @_Znam call in E
+!8 = !{!9, !11, !13} ;; !memprof profile attached to the second @_Znam call in E
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071} ;; stack ids: allocation callsite (!15), then the id used by B (!16)
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949} ;; stack ids: allocation callsite (!15), then the id used by C (!17)
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603} ;; stack ids: allocation callsite (!15), then the id used by D (!18)
+!15 = !{i64 -1415475215210681400} ;; !callsite of the second @_Znam call in E
+!16 = !{i64 -2441057035866683071} ;; !callsite of the call to E in B
+!17 = !{i64 -3483158674395044949} ;; !callsite of the call to E in C
+!18 = !{i64 -7799663586031895603} ;; !callsite of the call to E in D
+
+
+;; Originally we create a single clone of each call to new from E, since each
+;; allocates cold memory for a single caller.
+
+; DUMP: CCG after cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 2 StackIds: 0
+; DUMP: AllocType 1 StackIds: 1
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 2 3
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Clones: [[ENEW1CLONE:0x[a-z0-9]+]]
+
+; DUMP: Node [[D:0x[a-z0-9]+]]
+; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 0 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 6
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: CallerEdges:
+
+; DUMP: Node [[C]]
+; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 1 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 2 5
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2
+; DUMP: Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: CallerEdges:
+
+; DUMP: Node [[B]]
+; DUMP: Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 2 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 3 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: CallerEdges:
+
+; DUMP: Node [[ENEW2ORIG]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: AllocType 2 StackIds: 1
+; DUMP: AllocType 1 StackIds: 0
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 4 6
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: Clones: [[ENEW2CLONE]]
+
+; DUMP: Node [[ENEW1CLONE]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 2 StackIds: 0
+; DUMP: AllocType 1 StackIds: 1
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: Clone of [[ENEW1ORIG]]
+
+; DUMP: Node [[ENEW2CLONE]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 2
+; DUMP: AllocType 2 StackIds: 1
+; DUMP: AllocType 1 StackIds: 0
+; DUMP: (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 5
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: Clone of [[ENEW2ORIG]]
+
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
index 8ba958a..a27b4b63 100644
--- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
+++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
@@ -1,7 +1,7 @@
;; Tests callsite context graph generation for call graph containing indirect
;; calls. Currently this should result in conservative behavior, such that the
;; indirect call receives a null call in its graph node, to prevent subsequent
-;; cloning.
+;; cloning. Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -61,7 +61,9 @@
; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should only create a single clone of foo, for the direct call
@@ -69,6 +71,26 @@
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \
+; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should only create a single clone of foo, for the direct call
+;; from main allocating cold memory. (Dot files are from the distributed run.)
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
source_filename = "indirectcall.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -359,6 +381,11 @@ uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
; DUMP: Clone of [[FOO]]
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"];
diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll
index d6fa0d3..ea4c9c7 100644
--- a/llvm/test/ThinLTO/X86/memprof-inlined.ll
+++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll
@@ -1,6 +1,7 @@
;; Test callsite context graph generation for call graph with two memprof
;; contexts and partial inlining, requiring generation of a new fused node to
;; represent the inlined sequence while matching callsite nodes onto the graph.
+;; Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -48,7 +49,9 @@
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should create clones for foo and bar for the call from main to allocate
@@ -56,6 +59,24 @@
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should create clones for foo and bar for the call from main to allocate
+;; cold memory. (Dot files are from the distributed run.)
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
source_filename = "inlined.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -257,6 +278,11 @@ declare i32 @sleep()
; DUMP: Clone of [[BAR]]
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"];