diff options
-rw-r--r-- | gprofng/doc/Makefile.am | 26 | ||||
-rw-r--r-- | gprofng/doc/Makefile.in | 59 | ||||
-rw-r--r-- | gprofng/doc/gp-archive.texi | 246 | ||||
-rw-r--r-- | gprofng/doc/gp-collect-app.texi | 380 | ||||
-rw-r--r-- | gprofng/doc/gp-display-html.texi | 252 | ||||
-rw-r--r-- | gprofng/doc/gp-display-src.texi | 246 | ||||
-rw-r--r-- | gprofng/doc/gp-display-text.texi | 437 | ||||
-rw-r--r-- | gprofng/doc/gp-macros.texi | 72 | ||||
-rw-r--r-- | gprofng/doc/gprofng.texi | 3636 | ||||
-rw-r--r-- | gprofng/doc/gprofng_ug.texi | 4396 | ||||
-rw-r--r-- | gprofng/doc/version.texi | 6 | ||||
-rw-r--r-- | gprofng/gp-display-html/Makefile.am | 32 | ||||
-rw-r--r-- | gprofng/gp-display-html/Makefile.in | 94 | ||||
-rw-r--r-- | gprofng/src/Makefile.am | 56 | ||||
-rw-r--r-- | gprofng/src/Makefile.in | 114 |
15 files changed, 6291 insertions, 3761 deletions
diff --git a/gprofng/doc/Makefile.am b/gprofng/doc/Makefile.am index 8be0e92..ad72c26 100644 --- a/gprofng/doc/Makefile.am +++ b/gprofng/doc/Makefile.am @@ -19,6 +19,9 @@ AUTOMAKE_OPTIONS = info-in-builddir foreign no-texinfo.tex +PDFS = gprofng.pdf +HTMLS = gprofng.html + # Options to extract the man page MANCONF = -Dman @@ -26,24 +29,31 @@ TEXI2POD = perl $(srcdir)/../../etc/texi2pod.pl $(AM_MAKEINFOFLAGS) POD2MAN = pod2man --center="User Commands" \ --release="binutils-$(VERSION)" --section=1 -info_TEXINFOS = gprofng.texi -gprofng_TEXINFOS = fdl.texi +info_TEXINFOS = gprofng_ug.texi +gprofng_ug_TEXINFOS = fdl.texi gp-macros.texi TEXINFO_TEX = . MAKEINFOHTML = $(MAKEINFO) --html --no-split -man_MANS = gprofng.1 +man_MANS = gprofng.1 gp-archive.1 gp-collect-app.1 gp-display-html.1 gp-display-src.1 gp-display-text.1 # Build the man page from the texinfo file # The sed command removes the no-adjust Nroff command so that # the man output looks standard. -gprofng.1: $(srcdir)/gprofng.texi +$(man_MANS): $(srcdir)/gp-macros.texi $(AM_V_GEN)touch $@ - $(AM_V_at)-$(TEXI2POD) $(MANCONF) < $(srcdir)/gprofng.texi > gprofng.pod - $(AM_V_at)-($(POD2MAN) gprofng.pod | \ - sed -e '/^.if n .na/d' > $@.tmp && \ + $(AM_V_at)-$(TEXI2POD) $(MANCONF) < $(srcdir)/`basename $@ .1`.texi > $@.pod + $(AM_V_at)-($(POD2MAN) $@.pod | sed -e '/^.if n .na/d' > $@.tmp && \ mv -f $@.tmp $@) || (rm -f $@.tmp && exit 1) - $(AM_V_at)rm -f gprofng.pod + $(AM_V_at)rm -f $@.pod + +gprofng.1: $(srcdir)/gprofng.texi +gp-archive.1: $(srcdir)/gp-archive.texi +gp-collect-app.1: $(srcdir)/gp-collect-app.texi +gp-display-html.1: $(srcdir)/gp-display-html.texi +gp-display-src.1: $(srcdir)/gp-display-src.texi +gp-display-text.1: $(srcdir)/gp-display-text.texi MAINTAINERCLEANFILES = gprofng.info $(man_MANS) +EXTRA_DIST = $(man_MANS) version.texi info: $(man_MANS) diff --git a/gprofng/doc/Makefile.in b/gprofng/doc/Makefile.in index 3cd2068..78f8ae1 100644 --- a/gprofng/doc/Makefile.in +++ b/gprofng/doc/Makefile.in @@ -168,11 +168,9 @@ am__v_texidevnull_0 = > /dev/null am__v_texidevnull_1 = INFO_DEPS = gprofng.info am__TEXINFO_TEX_DIR = $(srcdir)/. -DVIS = gprofng.dvi -PDFS = gprofng.pdf -PSS = gprofng.ps -HTMLS = gprofng.html -TEXINFOS = gprofng.texi +DVIS = gprofng_ug.dvi +PSS = gprofng_ug.ps +TEXINFOS = gprofng_ug.texi TEXI2DVI = texi2dvi TEXI2PDF = $(TEXI2DVI) --pdf --batch AM_MAKEINFOHTMLFLAGS = $(AM_MAKEINFOFLAGS) @@ -214,7 +212,7 @@ man1dir = $(mandir)/man1 NROFF = nroff MANS = $(man_MANS) am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) -am__DIST_COMMON = $(gprofng_TEXINFOS) $(srcdir)/Makefile.in \ +am__DIST_COMMON = $(gprofng_ug_TEXINFOS) $(srcdir)/Makefile.in \ $(top_srcdir)/../mkinstalldirs mdate-sh texinfo.tex DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ACLOCAL = @ACLOCAL@ @@ -364,6 +362,8 @@ top_srcdir = @top_srcdir@ zlibdir = @zlibdir@ zlibinc = @zlibinc@ AUTOMAKE_OPTIONS = info-in-builddir foreign no-texinfo.tex +PDFS = gprofng.pdf +HTMLS = gprofng.html # Options to extract the man page MANCONF = -Dman @@ -371,12 +371,13 @@ TEXI2POD = perl $(srcdir)/../../etc/texi2pod.pl $(AM_MAKEINFOFLAGS) POD2MAN = pod2man --center="User Commands" \ --release="binutils-$(VERSION)" --section=1 -info_TEXINFOS = gprofng.texi -gprofng_TEXINFOS = fdl.texi +info_TEXINFOS = gprofng_ug.texi +gprofng_ug_TEXINFOS = fdl.texi gp-macros.texi TEXINFO_TEX = . MAKEINFOHTML = $(MAKEINFO) --html --no-split -man_MANS = gprofng.1 +man_MANS = gprofng.1 gp-archive.1 gp-collect-app.1 gp-display-html.1 gp-display-src.1 gp-display-text.1 MAINTAINERCLEANFILES = gprofng.info $(man_MANS) +EXTRA_DIST = $(man_MANS) version.texi all: all-am .SUFFIXES: @@ -417,7 +418,7 @@ mostlyclean-libtool: clean-libtool: -rm -rf .libs _libs -gprofng.info: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS) +gprofng.info: gprofng_ug.texi $(srcdir)/version.texi $(gprofng_ug_TEXINFOS) $(AM_V_MAKEINFO)restore=: && backupdir="$(am__leading_dot)am$$$$" && \ rm -rf $$backupdir && mkdir $$backupdir && \ if ($(MAKEINFO) --version) >/dev/null 2>&1; then \ @@ -426,7 +427,7 @@ gprofng.info: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS) done; \ else :; fi && \ if $(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir) \ - -o $@ `test -f 'gprofng.texi' || echo '$(srcdir)/'`gprofng.texi; \ + -o $@ `test -f 'gprofng_ug.texi' || echo '$(srcdir)/'`gprofng_ug.texi; \ then \ rc=0; \ else \ @@ -435,31 +436,31 @@ gprofng.info: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS) fi; \ rm -rf $$backupdir; exit $$rc -gprofng.dvi: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS) +gprofng.dvi: gprofng_ug.texi $(srcdir)/version.texi $(gprofng_ug_TEXINFOS) $(AM_V_TEXI2DVI)TEXINPUTS="$(am__TEXINFO_TEX_DIR)$(PATH_SEPARATOR)$$TEXINPUTS" \ MAKEINFO='$(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir)' \ $(TEXI2DVI) $(AM_V_texinfo) --build-dir=$(@:.dvi=.t2d) -o $@ $(AM_V_texidevnull) \ - `test -f 'gprofng.texi' || echo '$(srcdir)/'`gprofng.texi + `test -f 'gprofng_ug.texi' || echo '$(srcdir)/'`gprofng_ug.texi -gprofng.pdf: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS) +gprofng.pdf: gprofng_ug.texi $(srcdir)/version.texi $(gprofng_ug_TEXINFOS) $(AM_V_TEXI2PDF)TEXINPUTS="$(am__TEXINFO_TEX_DIR)$(PATH_SEPARATOR)$$TEXINPUTS" \ MAKEINFO='$(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir)' \ $(TEXI2PDF) $(AM_V_texinfo) --build-dir=$(@:.pdf=.t2p) -o $@ $(AM_V_texidevnull) \ - `test -f 'gprofng.texi' || echo '$(srcdir)/'`gprofng.texi + `test -f 'gprofng_ug.texi' || echo '$(srcdir)/'`gprofng_ug.texi -gprofng.html: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS) +gprofng.html: gprofng_ug.texi $(srcdir)/version.texi $(gprofng_ug_TEXINFOS) $(AM_V_MAKEINFO)rm -rf $(@:.html=.htp) $(AM_V_at)if $(MAKEINFOHTML) $(AM_MAKEINFOHTMLFLAGS) $(MAKEINFOFLAGS) -I $(srcdir) \ - -o $(@:.html=.htp) `test -f 'gprofng.texi' || echo '$(srcdir)/'`gprofng.texi; \ + -o $(@:.html=.htp) `test -f 'gprofng_ug.texi' || echo '$(srcdir)/'`gprofng_ug.texi; \ then \ rm -rf $@ && mv $(@:.html=.htp) $@; \ else \ rm -rf $(@:.html=.htp); exit 1; \ fi $(srcdir)/version.texi: @MAINTAINER_MODE_TRUE@ $(srcdir)/stamp-vti -$(srcdir)/stamp-vti: gprofng.texi $(top_srcdir)/configure - @(dir=.; test -f ./gprofng.texi || dir=$(srcdir); \ - set `$(SHELL) $(srcdir)/mdate-sh $$dir/gprofng.texi`; \ +$(srcdir)/stamp-vti: gprofng_ug.texi $(top_srcdir)/configure + @(dir=.; test -f ./gprofng_ug.texi || dir=$(srcdir); \ + set `$(SHELL) $(srcdir)/mdate-sh $$dir/gprofng_ug.texi`; \ echo "@set UPDATED $$1 $$2 $$3"; \ echo "@set UPDATED-MONTH $$2 $$3"; \ echo "@set EDITION $(VERSION)"; \ @@ -557,7 +558,7 @@ dist-info: $(INFO_DEPS) done mostlyclean-aminfo: - -rm -rf gprofng.t2d gprofng.t2p + -rm -rf gprofng_ug.t2d gprofng_ug.t2p clean-aminfo: -test -z "gprofng.dvi gprofng.pdf gprofng.ps gprofng.html" \ @@ -874,13 +875,19 @@ uninstall-man: uninstall-man1 # Build the man page from the texinfo file # The sed command removes the no-adjust Nroff command so that # the man output looks standard. -gprofng.1: $(srcdir)/gprofng.texi +$(man_MANS): $(srcdir)/gp-macros.texi $(AM_V_GEN)touch $@ - $(AM_V_at)-$(TEXI2POD) $(MANCONF) < $(srcdir)/gprofng.texi > gprofng.pod - $(AM_V_at)-($(POD2MAN) gprofng.pod | \ - sed -e '/^.if n .na/d' > $@.tmp && \ + $(AM_V_at)-$(TEXI2POD) $(MANCONF) < $(srcdir)/`basename $@ .1`.texi > $@.pod + $(AM_V_at)-($(POD2MAN) $@.pod | sed -e '/^.if n .na/d' > $@.tmp && \ mv -f $@.tmp $@) || (rm -f $@.tmp && exit 1) - $(AM_V_at)rm -f gprofng.pod + $(AM_V_at)rm -f $@.pod + +gprofng.1: $(srcdir)/gprofng.texi +gp-archive.1: $(srcdir)/gp-archive.texi +gp-collect-app.1: $(srcdir)/gp-collect-app.texi +gp-display-html.1: $(srcdir)/gp-display-html.texi +gp-display-src.1: $(srcdir)/gp-display-src.texi +gp-display-text.1: $(srcdir)/gp-display-text.texi info: $(man_MANS) diff --git a/gprofng/doc/gp-archive.texi b/gprofng/doc/gp-archive.texi new file mode 100644 index 0000000..722a954 --- /dev/null +++ b/gprofng/doc/gp-archive.texi @@ -0,0 +1,246 @@ +@c ---------------------------------------------------------------------------- +@c This is the Texinfo source file for the gp-collect-app man page. +@c +@c Author: Ruud van der Pas +@c ---------------------------------------------------------------------------- +@ifset man +\input texinfo @c -*-texinfo-*- +@setfilename gprofng archive +@settitle Archive gprofng experiment data +@include gp-macros.texi +@end ifset + +@c ---------------------------------------------------------------------------- +@c This is from the man-pages(7) man page +@c +@c "The list below shows conventional or suggested sections. Most manual pages +@c should include at least the highlighted sections. Arrange a new manual +@c page so that sections are placed in the order shown in the list." +@c +@c NAME +@c SYNOPSIS +@c CONFIGURATION [Normally only in Section 4] +@c DESCRIPTION +@c OPTIONS [Normally only in Sections 1, 8] +@c EXIT STATUS [Normally only in Sections 1, 8] +@c RETURN VALUE [Normally only in Sections 2, 3] +@c ERRORS [Typically only in Sections 2, 3] +@c ENVIRONMENT +@c FILES +@c VERSIONS [Normally only in Sections 2, 3] +@c ATTRIBUTES [Normally only in Sections 2, 3] +@c CONFORMING TO +@c NOTES +@c BUGS +@c EXAMPLES +@c AUTHORS [Discouraged] +@c REPORTING BUGS [Not used in man-pages] +@c COPYRIGHT [Not used in man-pages] +@c SEE ALSO +@c +@c This is what the texi2pod.pl tool recognizes: +@c +@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES +@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) { +@c +@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which +@c makes sense and adhered to for the other formats. +@c ---------------------------------------------------------------------------- + +@c ---------------------------------------------------------------------------- +@c NAME section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NAME} +@c man begin NAME + +gprofng archive - Archive gprofng experiment data + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SYNOPSIS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SYNOPSIS} +@c man begin SYNOPSIS + +@command{gprofng archive} [@var{option(s)}] @var{experiment} + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c DESCRIPTION section +@c ---------------------------------------------------------------------------- + +@ManPageStart{DESCRIPTION} +@c man begin DESCRIPTION + +Archive the associated application binaries and source files in a gprofng +experiment to make it self contained and portable. + +By default, the binaries are archived, but the application source files +are not archived. Use this tool to change this and afterwards archive +additional components. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c OPTIONS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{OPTIONS} +@c man begin OPTIONS + +@table @gcctabopt + +@item --version +@ifclear man +@IndexSubentry{Options, @code{--version}} +@end ifclear + +Print the version number and exit. + +@item --help +@ifclear man +@IndexSubentry{Options, @code{--help}} +@end ifclear + +Print usage information and exit. + +@c -- @item --verbose @{on|off@} +@c -- @ifclear man +@c -- @IndexSubentry{Options, @code{--verbose}} +@c -- @end ifclear + +@c -- Enable (on) or disable (off) verbose mode; the default is @samp{off}. + +@item -a @{off|on|ldobjects|src|usedldobjects|usedsrc@} +@ifclear man +@IndexSubentry{Options, @code{-a}} +@end ifclear + +Specify archiving of binaries and other files. In addition to disable this +feature (off), or enable archiving off all loadobjects and sources (on), +the other op tions support a more refined selection. + +All of these options enable archiving, but the keyword controls what exactly +is selected: all load objects (ldobjects), all source files (src), the +loadobjects asscoiated with a program counter (usedldobjects), or the source +files associated with a program counter (usedsrc). +The default is @samp{-a ldobjects}. + +@item -n +@ifclear man +@IndexSubentry{Options, @code{-n}} +@end ifclear + +Archive the named experiment only, not any of its descendants. + +@item -m @var{regex} +@ifclear man +@IndexSubentry{Options, @code{-m}} +@end ifclear + +Archive only those source, object, and debug info files whose full path name +matches the given POSIX compliant @var{regex} regular expression. + +@item -q +@ifclear man +@IndexSubentry{Options, @code{-q}} +@end ifclear + +Do not write any warnings to stderr. Warnings are incorporated into the +.archive file in the experiment directory. They are shown in the output +of @command{gprofng display text}. + +@item -F +@ifclear man +@IndexSubentry{Options, @code{-F}} +@end ifclear + +Force writing or rewriting of the archive. This is ignored with the +@samp{-n} or @samp{-m} option, or if this is a subexperiment. + +@item -d @var{path} +@ifclear man +@IndexSubentry{Options, @code{-d}} +@end ifclear + +The @var{path} is the absolute path path to a common archive, which is a +directory that contains archived files. If the directory does not +exist, then it will be created. Files are saved in the common archive +directory, and a symbolic link is created in the experiment archive. + +@end table + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c NOTES section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NOTES} +@c man begin NOTES + +Default archiving does not occur in case the application profiled terminates +prematurely, or if archiving is disabled when collecting the performance data. +In such cases, this tool can be used to afterwards archive the information, +but it has to be run on the same system where the profiling data was recorded. + +Some Java applications store shared objects in jar files. By default, such +shared objects are not automatically archived. To archive shared objects +contained in jar files, the addpath directive in an .er.rc file. The addpath +directive should give the path to the jar file, including the jar file itself. +The .er.rc file should be saved in the user home directory or parent of the +experiment directory. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SEEALSO section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SEEALSO} +@c man begin SEEALSO + +gprofng(1), gp-collect-app(1), gp-display-html(1), gp-display-src(1), gp-display-text(1) + +The user guide for gprofng is maintained as a Texinfo manual. If the info +and gprofng programs are correctly installed, the command +@command{info gprofng} should give access to this document. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c COPYRIGHT section +@c ---------------------------------------------------------------------------- + +@ManPageStart{COPYRIGHT} +@c man begin COPYRIGHT + +Copyright @copyright{} 2022-2023 Free Software Foundation, Inc. + +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.3 +or any later version published by the Free Software Foundation; +with no Invariant Sections, with no Front-Cover Texts, and with no +Back-Cover Texts. A copy of the license is included in the +section entitled ``GNU Free Documentation License''. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c If this text is used for a man page, exit. Otherwise we need to continue. +@c ---------------------------------------------------------------------------- + +@ifset man +@bye +@end ifset diff --git a/gprofng/doc/gp-collect-app.texi b/gprofng/doc/gp-collect-app.texi new file mode 100644 index 0000000..7e81f85 --- /dev/null +++ b/gprofng/doc/gp-collect-app.texi @@ -0,0 +1,380 @@ +@c ---------------------------------------------------------------------------- +@c This is the Texinfo source file for the gp-collect-app man page. +@c +@c Author: Ruud van der Pas +@c ---------------------------------------------------------------------------- +@ifset man +\input texinfo @c -*-texinfo-*- +@setfilename gprofng collect app +@settitle Collect performance data for the target application +@include gp-macros.texi +@end ifset + +@c ---------------------------------------------------------------------------- +@c This is from the man-pages(7) man page +@c +@c "The list below shows conventional or suggested sections. Most manual pages +@c should include at least the highlighted sections. Arrange a new manual +@c page so that sections are placed in the order shown in the list." +@c +@c NAME +@c SYNOPSIS +@c CONFIGURATION [Normally only in Section 4] +@c DESCRIPTION +@c OPTIONS [Normally only in Sections 1, 8] +@c EXIT STATUS [Normally only in Sections 1, 8] +@c RETURN VALUE [Normally only in Sections 2, 3] +@c ERRORS [Typically only in Sections 2, 3] +@c ENVIRONMENT +@c FILES +@c VERSIONS [Normally only in Sections 2, 3] +@c ATTRIBUTES [Normally only in Sections 2, 3] +@c CONFORMING TO +@c NOTES +@c BUGS +@c EXAMPLES +@c AUTHORS [Discouraged] +@c REPORTING BUGS [Not used in man-pages] +@c COPYRIGHT [Not used in man-pages] +@c SEE ALSO +@c +@c This is what the texi2pod.pl tool recognizes: +@c +@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES +@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) { +@c +@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which +@c makes sense and adhered to for the other formats. +@c ---------------------------------------------------------------------------- + +@c ---------------------------------------------------------------------------- +@c NAME section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NAME} +@c man begin NAME + +gprofng collect app - Collect performance data for the target program + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SYNOPSIS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SYNOPSIS} +@c man begin SYNOPSIS + +@command{gprofng collect app} [@var{option(s)}] @var{target} [@var{option(s)}] + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c DESCRIPTION section +@c ---------------------------------------------------------------------------- + +@ManPageStart{DESCRIPTION} +@c man begin DESCRIPTION + +Collect performance data on the target program. In addition to Program Counter +(PC) sampling, hardware event counters and various tracing options are supported. + +For example, this command collects performance data for an executable called +@samp{a.out} and stores the data collected in an experiment directory with +the name @samp{example.er}. + +@smallexample +$ gprofng collect app -o example.er ./a.out +@end smallexample + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c OPTIONS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{OPTIONS} +@c man begin OPTIONS + +@table @gcctabopt + +@item --version +@ifclear man +@IndexSubentry{Options, @code{--version}} +@end ifclear + +Print the version number and exit. + +@item --help +@ifclear man +@IndexSubentry{Options, @code{--help}} +@end ifclear + +Print usage information and exit. + +@c -- @item --verbose @{on|off@} +@c -- @ifclear man +@c -- @IndexSubentry{Options, @code{--verbose}} +@c -- @end ifclear + +@c -- Enable (on) or disable (off) verbose mode; the default is @samp{off}. + +@item -p @{off|on|lo|hi|@var{<value>}@} +@ifclear man +@IndexSubentry{Options, @code{-p}} +@end ifclear + +Disable (off) or enable (on) clock-profiling using a default sampling +granularity, or enable clock-profiling implicitly by setting the sampling +granularity (lo, hi, or a specific value in ms). By default, clock profiling +is enabled (@samp{-p on}). + +@item -h @var{@{<ctr_def>...,<ctr_n_def>@}} +@ifclear man +@IndexSubentry{Options, @code{-h}} +@end ifclear +Enable hardware event counter profiling and select the counter(s). +To see the supported counters on this system, use the @samp{-h} option +without other arguments. + +@item -o @var{<exp_name>} +@ifclear man +@IndexSubentry{Options, @code{-o}} +@end ifclear + +Specify the name for the experiment directory. The name has to end with +@samp{.er} and may contain an absolute path (e.g. @file{/tmp/experiment.er}). + +@item -O @var{<exp_name>} +@ifclear man +@IndexSubentry{Options, @code{-O}} +@end ifclear + +This is the same as the @samp{-o} option, but unlike this option, silently +overwrites an existing experiment directory with the same name. + +@item -C @var{<comment_string>} +@ifclear man +@IndexSubentry{Options, @code{-C}} +@end ifclear + +Add up to 10 comment strings to the experiment. These comments appear in the +notes section of the header and can be retrieved with the +@command{gprofng display text} command using the @samp{-header} option. + +@item -j @{on|off|@var{<path>}@} +@ifclear man +@IndexSubentry{Options, @code{-j}} +@end ifclear + +Controls Java profiling when the target is a JVM machine. The allowed values of +this option are: enable (on), disable (off) Java profiling when the target +program is a JVM, or set @samp{<path>} to a non-default JVM. +The default is @samp{-j on} + +@table @gcctabopt + +@item on +Record profiling data for the JVM machine, and recognize methods compiled by +the Java HotSpot virtual machine. Also record Java call stacks. The default +is @samp{-j on}. + +@item off +Does not record Java profiling data. Profiling data for native call stacks is +still recorded. + +@item @var{<path>} +Records profiling data for the JVM, and use the JVM as installed in @var{<path>}. + +@end table + +@item -J @var{<jvm-options>} +@ifclear man +@IndexSubentry{Options, @code{-J}} +@end ifclear + +Specifies additional options to be passed to the JVM used. The +@var{jvm-options} list must be enclosed in quotation marks if it contains more +than one option. The items in the list need to be separated by spaces or tab. +Each item is passed as a separate option to the JVM. Note that this option +implies @samp{-j on}. + +@item -t @var{<duration>}[m|s] +@ifclear man +@IndexSubentry{Options, @code{-t}} +@end ifclear + +Collects data for the specified duration. The duration can be a single number, +optionally followed by either @samp{m} to specify minutes, or @samp{s} to +specify seconds, which is the default. + +The duration can also two numbers separated by minus (-) sign. If a single +number is given, data is collected from the start of the run until the given +time. If two numbers are given, data is collected from the first time to the +second. If the second time is zero, data is collected until the end of the +run. If two non-zero numbers are given, the first must be less than the second. + +@item -n +@ifclear man +@IndexSubentry{Options, @code{-n}} +@end ifclear + +This is used for a dry run. Several run-time settings are displayed, but the +target is not executed and no performance data is collected. + +@item -F @{off|on|=@var{regex}@} +@ifclear man +@IndexSubentry{Options, @code{-F}} +@end ifclear + +Control whether descendant processes should have their data recorded. +To disable/enable this feature, use @samp{off}/@samp{on}. Use +@samp{=}@var{regex} to record data on those processes whose executable name +matches the regular expression. Only the basename of the executable is used, +not the full path. If spaces or characters interpreted by the shell are used, +enclose the @var{regex} in single quotes. The default is @samp{-F on}. + +@item -a @{off|on|ldobjects|src|usedldobjects|usedsrc@} +@ifclear man +@IndexSubentry{Options, @code{-a}} +@end ifclear + +Specify archiving of binaries and other files. In addition to disable this +feature (off), or enable archiving off all loadobjects and sources (on), +the other op tions support a more refined selection. + +All of these options enable archiving, but the keyword controls what exactly +is selected: all load objects (ldobjects), all source files (src), the +loadobjects asscoiated with a program counter (usedldobjects), or the source +files associated with a program counter (usedsrc). +The default is @samp{-a ldobjects}. + +@item -S @{off|on|@var{<seconds>}@} +@ifclear man +@IndexSubentry{Options, @code{-S}} +@end ifclear + +Disable (off), or enable (on) periodic sampling of process-wide resource +utilization. By default, sampling occurs every second. Use the @var{<seconds>} +option to change this. The default is @samp{-S on}. + +@item -y @var{<signal>}[,r] +@ifclear man +@IndexSubentry{Options, @code{-y}} +@end ifclear + +Controls recording of data with the signal named @var{<signal>}, referred to +as the pause-resume signal. Whenever the given signal is delivered to the +process, switch between paused (no data is recorded) and resumed (data is +recorded) states. + +By default, data collection begins in the paused state. If the optional +@samp{r} is given, data collection begins in the resumed state and data +collection begins immediately. + +SIGUSR1 or SIGUSR2 are recommended for this use, but any signal that is +not used by the target can be used. + +@item -l @var{<signal>} +@ifclear man +@IndexSubentry{Options, @code{-l}} +@end ifclear + +Specify a signal that will trigger a sample of process-wide resource utilization. +When the named @var{<signal>} is delivered to the process, a sample is recorded. + +The signal can be specified using the full name, without the initial +letters @code{SIG}, or the signal number. Note that the @command{kill} +command can be used to deliver a signal. + +If both the @samp{-l} and @samp{-y} options are used, the signal must be +different. + +@item -s @var{<option>}[,@var{<API>}] +@ifclear man +@IndexSubentry{Options, @code{-s}} +@end ifclear + +Enable synchronization wait tracing, where @var{<option>} is used to define the +specifics of the tracing (on, off, @var{<threshold>}, or all). The API is +selected through the setting for @var{<API>}: @samp{n} selects native/Pthreads, +@samp{j} selects Java, and @samp{nj} selects both. The default is @samp{-s off}. + +@item -H @{off|on@} +@ifclear man +@IndexSubentry{Options, @code{-H}} +@end ifclear + +Disable (off), or enable (on) heap tracing. The default is @samp{-H off}. + +@item -i @{off|on@} +@ifclear man +@IndexSubentry{Options, @code{-i}} +@end ifclear + +Disable (off), or enable (on) I/O tracing. The default is @samp{-i off}. + +@end table + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c NOTES section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NOTES} +@c man begin NOTES + +Any executable in the ELF (Executable and Linkable Format) object format can +be used for profiling with gprofng. If debug information is available, +gprofng can provide more details, but this is not a requirement. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SEEALSO section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SEEALSO} +@c man begin SEEALSO + +gprofng(1), gp-archive(1), gp-display-html(1), gp-display-src(1), gp-display-text(1) + +The user guide for gprofng is maintained as a Texinfo manual. If the +@command{info} and @command{gprofng} programs are correctly installed, the +command @command{info gprofng} should give access to this document. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c COPYRIGHT section +@c ---------------------------------------------------------------------------- + +@ManPageStart{COPYRIGHT} +@c man begin COPYRIGHT + +Copyright @copyright{} 2022-2023 Free Software Foundation, Inc. + +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.3 +or any later version published by the Free Software Foundation; +with no Invariant Sections, with no Front-Cover Texts, and with no +Back-Cover Texts. A copy of the license is included in the +section entitled ``GNU Free Documentation License''. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c If this text is used for a man page, exit. Otherwise we need to continue. +@c ---------------------------------------------------------------------------- + +@ifset man +@bye +@end ifset diff --git a/gprofng/doc/gp-display-html.texi b/gprofng/doc/gp-display-html.texi new file mode 100644 index 0000000..de09c34 --- /dev/null +++ b/gprofng/doc/gp-display-html.texi @@ -0,0 +1,252 @@ +@c ---------------------------------------------------------------------------- +@c This is the Texinfo source file for the gp-collect-app man page. +@c +@c Author: Ruud van der Pas +@c ---------------------------------------------------------------------------- +@ifset man +\input texinfo @c -*-texinfo-*- +@setfilename gprofng display html +@settitle Generate an HTML based directory structure to browse the profiles +@include gp-macros.texi +@end ifset + +@c ---------------------------------------------------------------------------- +@c This is from the man-pages(7) man page +@c +@c "The list below shows conventional or suggested sections. Most manual pages +@c should include at least the highlighted sections. Arrange a new manual +@c page so that sections are placed in the order shown in the list." +@c +@c NAME +@c SYNOPSIS +@c CONFIGURATION [Normally only in Section 4] +@c DESCRIPTION +@c OPTIONS [Normally only in Sections 1, 8] +@c EXIT STATUS [Normally only in Sections 1, 8] +@c RETURN VALUE [Normally only in Sections 2, 3] +@c ERRORS [Typically only in Sections 2, 3] +@c ENVIRONMENT +@c FILES +@c VERSIONS [Normally only in Sections 2, 3] +@c ATTRIBUTES [Normally only in Sections 2, 3] +@c CONFORMING TO +@c NOTES +@c BUGS +@c EXAMPLES +@c AUTHORS [Discouraged] +@c REPORTING BUGS [Not used in man-pages] +@c COPYRIGHT [Not used in man-pages] +@c SEE ALSO +@c +@c This is what the texi2pod.pl tool recognizes: +@c +@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES +@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) { +@c +@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which +@c makes sense and adhered to for the other formats. +@c ---------------------------------------------------------------------------- + +@c ---------------------------------------------------------------------------- +@c NAME section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NAME} +@c man begin NAME + +gprofng display html - Generate an HTML based directory structure to browse the profiles + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SYNOPSIS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SYNOPSIS} +@c man begin SYNOPSIS + +@command{gprofng display html} [@var{option(s)}] @var{experiment(s)} + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c DESCRIPTION section +@c ---------------------------------------------------------------------------- + +@ManPageStart{DESCRIPTION} +@c man begin DESCRIPTION + +Process one or more experiments to generate a directory containing the +@file{index.html} file that may be used to browse the experiment data. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c OPTIONS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{OPTIONS} +@c man begin OPTIONS + +@table @gcctabopt + +@item --version +@ifclear man +@IndexSubentry{Options, @code{--version}} +@end ifclear + +Print the version number and exit. + +@item --help +@ifclear man +@IndexSubentry{Options, @code{--help}} +@end ifclear + +Print usage information and exit. + +@item --verbose @{on|off@} +@ifclear man +@IndexSubentry{Options, @code{--verbose}} +@end ifclear + +Enable (@samp{on}) or disable (@samp{off)} verbose mode. +The default is @samp{off}. + +@item --debug @{on|s|m|l|xl|off@} +@item -d @{on|s|m|l|xl|off@} +@ifclear man +@IndexSubentry{Options, @code{-d}} +@IndexSubentry{Options, @code{--debug}} +@end ifclear + +Control the printing of run time information to assist with troubleshooting, +or further development of this tool. The keyword is case insensitive. +A setting of @samp{on} gives a modest amount of information. The keywords +@samp{s}, @samp{m}, @samp{l}, and @samp{xl} give an increasing amount of +information, while @samp{off} disables the printing of debug information. +This is also the default. + +Note that currently @samp{on}, @samp{s}, @samp{m}, and @samp{l} are +equivalent. This is expected to change in future updates. + +@item ---highlight-percentage @var{value} +@item -hp @var{value} +@ifclear man +@IndexSubentry{Options, @code{--highlight-percentage}} +@IndexSubentry{Options, @code{-hp}} +@end ifclear + +Set a percentage value in the interval [0,100] to select and color code source +lines, as well as instructions, that are within this percentage of the maximum +metric value(s). The default is 90 (%). + +A value of zero @samp{(-hp 0)} disables this feature. + +@item --output @var{dirname} +@item -o @var{dirname} +@ifclear man +@IndexSubentry{Options, @code{--output}} +@IndexSubentry{Options, @code{-o}} +@end ifclear + +Use @var{dirname} as the directory name to store the HTML files in. +The default name is @samp{display.<n>.html} with @var{<n>} the first +positive integer number not in use. An existing directory with the +same name is not overwritten. + +@item --overwrite @var{dirname} +@item -O @var{dirname} +@ifclear man +@IndexSubentry{Options, @code{--overwrite}} +@IndexSubentry{Options, @code{-O}} +@end ifclear + +Use @var{dirname} as the directory name to store the HTML files in. + +@item --quiet @{on|off@} +@item -q @{on|off@} +@ifclear man +@IndexSubentry{Options, @code{--quiet}} +@IndexSubentry{Options, @code{-q}} +@end ifclear + +Control the display of all warning, debug and verbose messages. +If set to @samp{on}, the settings for verbose, warnings and debug are ignored. +By default the quiet mode is disabled (@samp{-q off}). + +@item --warnings @{on|off@} +@item -w @{on|off@} +@ifclear man +@IndexSubentry{Options, @code{--warnings}} +@IndexSubentry{Options, @code{-w}} +@end ifclear + +Enable (@samp{on}), or disable (@samp{off}) run time warning messages from +the tool. By default these are enabled. + +@end table + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c NOTES section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NOTES} +@c man begin NOTES + +When setting a directory name for the HTML files to be stored in, make sure that +umask is set to the correct access permissions. + +Regardless of the setting for the warning messages, any warnings are accessible +through the main @file{index.html} page. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SEEALSO section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SEEALSO} +@c man begin SEEALSO + +gprofng(1), gp-archive(1), gp-collect-app(1), gp-display-src(1), gp-display-text(1) + +The user guide for gprofng is maintained as a Texinfo manual. If the +@command{info} and @command{gprofng} programs are correctly installed, the +command @command{info gprofng} should give access to this document. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c COPYRIGHT section +@c ---------------------------------------------------------------------------- + +@ManPageStart{COPYRIGHT} +@c man begin COPYRIGHT + +Copyright @copyright{} 2022-2023 Free Software Foundation, Inc. + +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.3 +or any later version published by the Free Software Foundation; +with no Invariant Sections, with no Front-Cover Texts, and with no +Back-Cover Texts. A copy of the license is included in the +section entitled ``GNU Free Documentation License''. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c If this text is used for a man page, exit. Otherwise we need to continue. +@c ---------------------------------------------------------------------------- + +@ifset man +@bye +@end ifset diff --git a/gprofng/doc/gp-display-src.texi b/gprofng/doc/gp-display-src.texi new file mode 100644 index 0000000..6b32a99 --- /dev/null +++ b/gprofng/doc/gp-display-src.texi @@ -0,0 +1,246 @@ +@c ---------------------------------------------------------------------------- +@c This is the Texinfo source file for the gp-collect-app man page. +@c +@c Author: Ruud van der Pas +@c ---------------------------------------------------------------------------- +@ifset man +\input texinfo @c -*-texinfo-*- +@setfilename gprofng display src +@settitle Display the source code, optionally interleaved with the disassembly of the target object +@include gp-macros.texi +@end ifset + +@c ---------------------------------------------------------------------------- +@c This is from the man-pages(7) man page +@c +@c "The list below shows conventional or suggested sections. Most manual pages +@c should include at least the highlighted sections. Arrange a new manual +@c page so that sections are placed in the order shown in the list." +@c +@c NAME +@c SYNOPSIS +@c CONFIGURATION [Normally only in Section 4] +@c DESCRIPTION +@c OPTIONS [Normally only in Sections 1, 8] +@c EXIT STATUS [Normally only in Sections 1, 8] +@c RETURN VALUE [Normally only in Sections 2, 3] +@c ERRORS [Typically only in Sections 2, 3] +@c ENVIRONMENT +@c FILES +@c VERSIONS [Normally only in Sections 2, 3] +@c ATTRIBUTES [Normally only in Sections 2, 3] +@c CONFORMING TO +@c NOTES +@c BUGS +@c EXAMPLES +@c AUTHORS [Discouraged] +@c REPORTING BUGS [Not used in man-pages] +@c COPYRIGHT [Not used in man-pages] +@c SEE ALSO +@c +@c This is what the texi2pod.pl tool recognizes: +@c +@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES +@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) { +@c +@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which +@c makes sense and adhered to for the other formats. +@c ---------------------------------------------------------------------------- + +@c ---------------------------------------------------------------------------- +@c NAME section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NAME} +@c man begin NAME + +gprofng display src - Display the source code, optionally interleaved with the disassembly of the target object + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SYNOPSIS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SYNOPSIS} +@c man begin SYNOPSIS + +@command{gprofng display src} [@var{option(s)}] @var{target_file} + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c DESCRIPTION section +@c ---------------------------------------------------------------------------- + +@ManPageStart{DESCRIPTION} +@c man begin DESCRIPTION + +Display the source code listing, or source code interleaved with disassembly code, +as extracted from the target file (an executable, shared object, object file, or a +Java .class file). + +For example, this command displays the source code and disassembly listing for a +function called @samp{mxv_core} that is part of object file @samp{mxv.o}: + +@smallexample +$ gprofng display src -disasm mxv_core mxv.o +@end smallexample + +To list the source code and disassembly for all the functions in this file, +use the following command: + +@smallexample +$ gprofng display src -disasm all -1 mxv.o +@end smallexample + +The @var{target_file} is the name of an executable, a shared object, an object +file (.o), or a Java .class file. + +If no options are given, the source code listing of the @var{target_file} +is shown. This is equivalent to @samp{-source all -1}. If this information +is not available, a message to this extent is printed. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c OPTIONS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{OPTIONS} +@c man begin OPTIONS + +@table @gcctabopt + +@item --version +@ifclear man +@IndexSubentry{Options, @code{--version}} +@end ifclear + +Print the version number and exit. + +@item --help +@ifclear man +@IndexSubentry{Options, @code{--help}} +@end ifclear + +Print usage information and exit. + +@c -- @item --verbose @{on|off@} +@c -- @ifclear man +@c -- @IndexSubentry{Options, @code{--verbose}} +@c -- @end ifclear + +@c -- Enable (on) or disable (off) verbose mode; the default is @samp{off}. + +@item -functions +@ifclear man +@IndexSubentry{Options, @code{-functions}} +@IndexSubentry{Commands, @code{functions}} +@end ifclear +List all the functions from the given object. + +@item -source @var{item} @var{tag} +@ifclear man +@IndexSubentry{Options, @code{-source}} +@IndexSubentry{Commands, @code{source}} +@end ifclear +Show the source code for @var{item} in @var{target_file}. The @var{tag} +is used to differentiate in case there are multiple occurences with the same +name. +See the @samp{NOTES} section for the definition of @var{item} and @var{tag}. + +@item -disasm @var{item} @var{tag} +@ifclear man +@IndexSubentry{Options, @code{-disasm}} +@IndexSubentry{Commands, @code{disasm}} +@end ifclear +Include the disassembly in the source listing. The default listing does not +include the disassembly. If the source code is not available, show a listing +of the disassembly only. +See the @samp{NOTES} section for the definition of @var{item} and @var{tag}. + +@item -outfile @var{filename} +@ifclear man +@IndexSubentry{Options, @code{-outfile}} +@IndexSubentry{Commands, @code{outfile}} +@end ifclear +Write results to file @var{filename}. A dash (-) writes to stdout. This is also +the default. Note that this option only affects those options included to the +right of this option. + +@end table + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c NOTES section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NOTES} +@c man begin NOTES + +Use @var{item} to specify the name of a function, or of a source or object +file that was used to build the executable, or shared object. + +The @var{tag} is an index used to determine which item is being referred +to when multiple functions have the same name. It is required, but will +be ignored if not necessary to resolve the function. + +The @var{item} may also be specified in the form @samp{function`file`}, in +which case the source or disassembly of the named function in the source +context of the named file will be used. + +The special @var{item} and @var{tag} combination @samp{all -1}, is used to +indicate generating the source, or disassembly, for all functions in the +@var{target_file}. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SEEALSO section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SEEALSO} +@c man begin SEEALSO + +gprofng(1), gp-archive(1), gp-collect-app(1), gp-display-html(1), gp-display-text(1) + +The user guide for gprofng is maintained as a Texinfo manual. If the info +and gprofng programs are correctly installed, the command +@command{info gprofng} should give access to this document. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c COPYRIGHT section +@c ---------------------------------------------------------------------------- + +@ManPageStart{COPYRIGHT} +@c man begin COPYRIGHT + +Copyright @copyright{} 2022-2023 Free Software Foundation, Inc. + +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.3 +or any later version published by the Free Software Foundation; +with no Invariant Sections, with no Front-Cover Texts, and with no +Back-Cover Texts. A copy of the license is included in the +section entitled ``GNU Free Documentation License''. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c If this text is used for a man page, exit. Otherwise we need to continue. +@c ---------------------------------------------------------------------------- + +@ifset man +@bye +@end ifset diff --git a/gprofng/doc/gp-display-text.texi b/gprofng/doc/gp-display-text.texi new file mode 100644 index 0000000..993f9f0 --- /dev/null +++ b/gprofng/doc/gp-display-text.texi @@ -0,0 +1,437 @@ +@c ---------------------------------------------------------------------------- +@c This is the Texinfo source file for the gp-collect-app man page. +@c +@c Author: Ruud van der Pas +@c ---------------------------------------------------------------------------- +@ifset man +\input texinfo @c -*-texinfo-*- +@setfilename gprofng display text +@settitle Display the performance data in plain text format +@include gp-macros.texi +@end ifset + +@c ---------------------------------------------------------------------------- +@c This is from the man-pages(7) man page +@c +@c "The list below shows conventional or suggested sections. Most manual pages +@c should include at least the highlighted sections. Arrange a new manual +@c page so that sections are placed in the order shown in the list." +@c +@c NAME +@c SYNOPSIS +@c CONFIGURATION [Normally only in Section 4] +@c DESCRIPTION +@c OPTIONS [Normally only in Sections 1, 8] +@c EXIT STATUS [Normally only in Sections 1, 8] +@c RETURN VALUE [Normally only in Sections 2, 3] +@c ERRORS [Typically only in Sections 2, 3] +@c ENVIRONMENT +@c FILES +@c VERSIONS [Normally only in Sections 2, 3] +@c ATTRIBUTES [Normally only in Sections 2, 3] +@c CONFORMING TO +@c NOTES +@c BUGS +@c EXAMPLES +@c AUTHORS [Discouraged] +@c REPORTING BUGS [Not used in man-pages] +@c COPYRIGHT [Not used in man-pages] +@c SEE ALSO +@c +@c This is what the texi2pod.pl tool recognizes: +@c +@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES +@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) { +@c +@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which +@c makes sense and adhered to for the other formats. +@c ---------------------------------------------------------------------------- + +@c ---------------------------------------------------------------------------- +@c NAME section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NAME} +@c man begin NAME + +gprofng display text - Display the performance data in plain text format + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SYNOPSIS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SYNOPSIS} +@c man begin SYNOPSIS + +@command{gprofng display text} [@var{option(s)}] [@var{commands}] +[-script @var{script-file}] @var{experiment(s)} + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c DESCRIPTION section +@c ---------------------------------------------------------------------------- + +@ManPageStart{DESCRIPTION} +@c man begin DESCRIPTION + +Print a plain text version of the various displays supported by gprofng. + +The input consists of one or more experiment directories. Through commands, +the user controls the output. + +There is a rich set of commands to control the display of the data. The +@samp{NOTES} section lists the most common ones. The gprofng user guide +lists all the commands supported. + +Commands specified on the command line need to be prepended with the dash ('-') +symbol. + +In this example, a function overview will be shown, followed by the source +code listing of function @samp{my-func}, annotated with the +performance metrics that have been recorded during the data collection +and stored in experiment directory @samp{my-exp.er}: + +@smallexample +$ gprofng display text -functions -source my-func my-exp.er +@end smallexample + +Instead of, or in addition to, specifying these commands on the command line, +commands may also be included in a file called the @var{script-file}. + +Note that the commands are processed and interpreted from left to right, +@emph{so the order matters}. + +If this tool is invoked without options, commands, or a script file, it +starts in interpreter mode. The user can then issue the commands interactively. +The session is terminated with the @command{exit} command in the interpreter. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c OPTIONS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{OPTIONS} +@c man begin OPTIONS + +@table @gcctabopt + +@item --version +@ifclear man +@IndexSubentry{Options, @code{--version}} +@end ifclear + +Print the version number and exit. + +@item --help +@ifclear man +@IndexSubentry{Options, @code{--help}} +@end ifclear + +Print usage information and exit. + +@c -- @item --verbose @{on|off@} +@c -- @ifclear man +@c -- @IndexSubentry{Options, @code{--verbose}} +@c -- @end ifclear + +@c -- Enable (on) or disable (off) verbose mode; the default is @samp{off}. + +@item -script @var{script-file} +@ifclear man +@IndexSubentry{Options, @code{-script}} +@IndexSubentry{Commands, @code{script}} +@end ifclear + +Execute the commands stored in the script file. This feature may be combined +with commands specified at the command line. + +@end table + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c NOTES section +@c ---------------------------------------------------------------------------- + +@ManPageStart{NOTES} +@c man begin NOTES + +Many commands are supported. Below, the more common ones are listed in +mostly alphabetical order, because sometimes it is more logical to +swap the order of two entries. + +@ifset man +There are many more commands. These are documented in the user guide. +@end ifset + +@table @code + +@item callers-callees +@ifclear man +@IndexSubentry{Options, @code{-callers-callees}} +@IndexSubentry{Commands, @code{callers-callees}} +@end ifclear +In a callers-callees panel, it is shown which function(s) call the target +function (the @emph{callers}) and what functions it is calling (the +@emph{callees}). +This command prints the callers-callees panel for each of the functions, +in the order specified by the function sort metric. + +@item calltree +@ifclear man +@IndexSubentry{Options, @code{-calltree}} +@IndexSubentry{Commands, @code{calltree}} +@end ifclear +Display the dynamic call graph from the experiment, showing the hierarchical +metrics at each level. + +@item compare @{on | off | delta | ratio@} +@ifclear man +@IndexSubentry{Options, @code{-compare}} +@IndexSubentry{Commands, @code{compare}} +@end ifclear +By default, the results for multiple experiments are aggregated. This +command changes this to enable the comparison of experiments for certain +views (e.g. the function view). The first experiment specified is defined +to be the reference. The following options are supported: + +@table @code + +@item on +For each experiment specified on the command line, print the values for +the metrics that have been activated for the experiment. + +@item off +Disable the comparison of experiments. This is the default. + +@item delta +Print the values for the reference experiment. The results for the other +experiments are shown as a delta relative to the reference (current-reference). + +@item ratio +Print the values for the reference experiment. The results for the other +experiments are shown as a ratio relative to the reference (current/reference). + +@end table + +@item disasm @var{function-name} +@ifclear man +@IndexSubentry{Options, @code{-disasm}} +@IndexSubentry{Commands, @code{disasm}} +@end ifclear +List the source code and instructions for the function specified. The +instructions are annotated with the metrics used. + +@item fsingle @var{function-name} [@samp{n}] +@ifclear man +@IndexSubentry{Options, @code{-fsingle}} +@IndexSubentry{Commands, @code{fsingle}} +@end ifclear +Write a summary panel for the specified function. The optional parameter +@var{n} is needed for those cases where several functions have the same name. + +@item fsummary +@ifclear man +@IndexSubentry{Options, @code{-fsummary}} +@IndexSubentry{Commands, @code{fsummary}} +@end ifclear +Write a summary panel for each function in the function list. + +@item functions +@ifclear man +@IndexSubentry{Options, @code{-functions}} +@IndexSubentry{Commands, @code{functions}} +@end ifclear +Display a list of all functions executed. For each function the used metrics +(e.g. the CPU time) ar shown. + +@item header +@ifclear man +@IndexSubentry{Options, @code{-header}} +@IndexSubentry{Commands, @code{header}} +@end ifclear +Shows several operational characteristics of the experiment(s) specified +on the command line. + +@item limit @var{n} +@ifclear man +@IndexSubentry{Options, @code{-limit}} +@IndexSubentry{Commands, @code{limit}} +@end ifclear +Limit the output to @var{n} lines. + +@item lines +@ifclear man +@IndexSubentry{Options, @code{-lines}} +@IndexSubentry{Commands, @code{lines}} +@end ifclear +Write a list of source lines and their metrics, ordered by the current +sort metric. + +@item metric_list +@ifclear man +@IndexSubentry{Options, @code{-metric_list}} +@IndexSubentry{Commands, @code{metric_list}} +@end ifclear +Display the currently selected metrics in the function view and a list +of all the metrics available for the target experiment(s). + +@item metrics @var{metric-spec} +@ifclear man +@IndexSubentry{Options, @code{-metrics}} +@IndexSubentry{Commands, @code{metrics}} +@end ifclear +Define the metrics to be displayed in the function and callers-callees +overviews. + +The @var{metric-spec} can either be the keyword @samp{default} +to restore the default metrics selection, or a colon separated list +with metrics. + +The gprofng user guide has more details how to define metrics. + +@item name @{short | long | mangled@}[:@{soname | nosoname@}] +@ifclear man +@IndexSubentry{Options, @code{-name}} +@IndexSubentry{Commands, @code{name}} +@end ifclear +Specify whether to use the short, long, or mangled form of function names. +Optionally, the load object that the function is part of can be included in +the output by adding the @emph{soname} keyword. It can also be ommitted +(@emph{nosoname}), which is the default. + +Whether there is an actual difference between these types of names depends +on the language. + +Note that there should be no (white)space to the left and right of the +colon (@samp{:}). + +@item overview +@ifclear man +@IndexSubentry{Options, @code{-overview}} +@IndexSubentry{Commands, @code{overview}} +@end ifclear +Shows a summary of the recorded performance data for the experiment(s) +specified on the command line. + +@item pcs +@ifclear man +@IndexSubentry{Options, @code{-pcs}} +@IndexSubentry{Commands, @code{pcs}} +@end ifclear +Write a list of program counters (PCs) and their metrics, ordered by +the current sort metric. + +@item sort @var{metric-spec} +@ifclear man +@IndexSubentry{Options, @code{-sort}} +@IndexSubentry{Commands, @code{sort}} +@end ifclear +Sort the function list on the @var{metric-spec} given. + +@IndexSubentry{Sort, Reverse order} +The data can be sorted in reverse order by prepending the metric definition +with a minus (@samp{-}) sign. + +@noindent +For example @command{sort -e.totalcpu}. + +@IndexSubentry{Sort, Reset to default} +A default metric for the sort operation has been defined and since this is +a persistent command, this default can be restored with @code{default} as +the key (@command{sort default}). + +@item source @var{function-name} +@ifclear man +@IndexSubentry{Options, @code{-source}} +@IndexSubentry{Commands, @code{source}} +@end ifclear +List the source code for the function specified, annotated with the metrics +used. + +@item viewmode @{user | expert | machine@} +@ifclear man +@IndexSubentry{Options, @code{-viewmode}} +@IndexSubentry{Commands, @code{viewmode}} +@end ifclear +This command is only relevant for Java programs. For all other languages +supported, the viewmode setting has no effect. + +The following options are supported: + +@table @code + +@item user +Show the Java call stacks for Java threads, but do not show housekeeping +threads. The function view includes a function called @samp{<JVM-System>}. +This represents the aggregated time from non-Java threads. +In case the JVM software does not report a Java call stack, time is reported +against the function @samp{<no Java callstack recorded>}. + +@item expert +Show the Java call stacks for Java threads when the user Java code is executed, +and machine call stacks when JVM code is executed, or when the JVM software +does not report a Java call stack. Show the machine call stacks for +housekeeping threads. + +@item machine +Show the actual native call stacks for all threads. This is the view mode +for C, C++, and Fortran. + +@end table + +@end table + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c SEEALSO section +@c ---------------------------------------------------------------------------- + +@ManPageStart{SEEALSO} +@c man begin SEEALSO + +gprofng(1), gp-archive(1), gp-collect-app(1), gp-display-html(1), gp-display-src(1) + +The user guide for gprofng is maintained as a Texinfo manual. If the +@command{info} and @command{gprofng} programs are correctly installed, the +command @command{info gprofng} should give access to this document. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c COPYRIGHT section +@c ---------------------------------------------------------------------------- + +@ManPageStart{COPYRIGHT} +@c man begin COPYRIGHT + +Copyright @copyright{} 2022-2023 Free Software Foundation, Inc. + +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.3 +or any later version published by the Free Software Foundation; +with no Invariant Sections, with no Front-Cover Texts, and with no +Back-Cover Texts. A copy of the license is included in the +section entitled ``GNU Free Documentation License''. + +@c man end +@ManPageEnd{} + +@c ---------------------------------------------------------------------------- +@c If this text is used for a man page, exit. Otherwise we need to continue. +@c ---------------------------------------------------------------------------- + +@ifset man +@bye +@end ifset diff --git a/gprofng/doc/gp-macros.texi b/gprofng/doc/gp-macros.texi new file mode 100644 index 0000000..f4bd423 --- /dev/null +++ b/gprofng/doc/gp-macros.texi @@ -0,0 +1,72 @@ +@c -- Macro definitions ------------------------------------------------------- +@c +@c Since only letters can be used, we use capitalization to distinguish +@c different words. +@c ---------------------------------------------------------------------------- +@macro CollectApp{} +@command{gprofng collect app} +@end macro + +@macro DisplayHTML{} +@command{gprofng display html} +@end macro + +@macro DisplayText{} +@command{gprofng display text} +@end macro + +@macro DisplaySRC{} +@command{gprofng display src} +@end macro + +@macro Archive{} +@command{gprofng archive} +@end macro + +@macro Driver{} +@command{gprofng} +@end macro + +@macro ProductName{} +gprofng +@end macro + +@macro ToolName{} +@command{gprofng} +@end macro + +@macro IndexSubentry{label, string} +@c -- @cindex \label\ @subentry \string\ +@cindex \label\, \string\ +@end macro + +@macro vspace {lines} +@sp \lines\ +@end macro + +@c -- For some reason ending this macro with @noindent does not work out well. + +@macro OptionHeader {lines, option, description} +@sp \lines\ +@noindent +@code{\option\} @ @emph{\description\} +@c -- @sp 1 +@end macro + +@macro gcctabopt{body} +@code{\body\} +@end macro + +@macro ManPageStart{headername} +@ifclear man +@sp 1 +@noindent @b{\headername\} +@indentedblock +@end ifclear +@end macro + +@macro ManPageEnd{} +@ifclear man +@end indentedblock +@end ifclear +@end macro diff --git a/gprofng/doc/gprofng.texi b/gprofng/doc/gprofng.texi index 1a2c84b..d038a47 100644 --- a/gprofng/doc/gprofng.texi +++ b/gprofng/doc/gprofng.texi @@ -1,3568 +1,308 @@ -\input texinfo @c -*-texinfo-*- - -@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES -@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) { - @c ---------------------------------------------------------------------------- -@c This is the Texinfo source file for the GPROFNG manual. +@c This is the Texinfo source file for the gprofng man page. @c @c Author: Ruud van der Pas @c ---------------------------------------------------------------------------- +@ifset man +\input texinfo @c -*-texinfo-*- +@setfilename gprofng +@settitle The next generation GNU application profiling tool +@include gp-macros.texi +@end ifset -@c %**start of header - -@setfilename gprofng.info -@settitle GNU gprofng - -@c -- Set the indent for the @example command to 1 space, not 5 --------------- -@exampleindent 1 - -@c %**end of header - -@c -- Start a new chapter on a new, odd numbered, page ------------------------ -@setchapternewpage odd - -@c -- Merge all index entries into the Concepts Index ------------------------- -@syncodeindex fn cp -@syncodeindex ky cp -@syncodeindex pg cp -@syncodeindex vr cp +@c @ManPageStart{NAME} +@c @ManPageStart{SYNOPSIS} +@c @ManPageStart{DESCRIPTION} +@c @ManPageStart{OPTIONS} +@c @ManPageStart{NOTES} +@c @ManPageStart{SEEALSO} +@c @ManPageStart{COPYRIGHT} -@c -- Macro definitions ------------------------------------------------------- +@c ---------------------------------------------------------------------------- +@c This is from the man-pages(7) man page +@c +@c "The list below shows conventional or suggested sections. Most manual pages +@c should include at least the highlighted sections. Arrange a new manual +@c page so that sections are placed in the order shown in the list." +@c +@c NAME +@c SYNOPSIS +@c CONFIGURATION [Normally only in Section 4] +@c DESCRIPTION +@c OPTIONS [Normally only in Sections 1, 8] +@c EXIT STATUS [Normally only in Sections 1, 8] +@c RETURN VALUE [Normally only in Sections 2, 3] +@c ERRORS [Typically only in Sections 2, 3] +@c ENVIRONMENT +@c FILES +@c VERSIONS [Normally only in Sections 2, 3] +@c ATTRIBUTES [Normally only in Sections 2, 3] +@c CONFORMING TO +@c NOTES +@c BUGS +@c EXAMPLES +@c AUTHORS [Discouraged] +@c REPORTING BUGS [Not used in man-pages] +@c COPYRIGHT [Not used in man-pages] +@c SEE ALSO +@c +@c This is what the texi2pod.pl tool recognizes: +@c +@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES +@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) { @c -@c Since only letters can be used, we use capitalization to distinguish -@c different words. +@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which +@c makes sense and adhered to for the other formats. @c ---------------------------------------------------------------------------- -@macro CollectApp{} -@command{gprofng collect app} -@end macro - -@macro DisplayHTML{} -@command{gprofng display html} -@end macro - -@macro DisplayText{} -@command{gprofng display text} -@end macro - -@macro Driver{} -@command{gprofng} -@end macro - -@macro ProductName{} -gprofng -@end macro - -@macro ToolName{} -@command{gprofng} -@end macro - -@macro IndexSubentry{label, string} -@c -- @cindex \label\ @subentry \string\ -@cindex \label\, \string\ -@end macro - -@macro gcctabopt{body} -@code{\body\} -@end macro - -@c -- Get the version information --------------------------------------------- -@include version.texi - -@c -- Entry for the Info dir structure ---------------------------------------- -@ifnottex -@dircategory Software development -@direntry -* gprofng: (gprofng). The next generation profiling tool for Linux -@end direntry -@end ifnottex - -@c -- Copyright stuff --------------------------------------------------------- -@copying -This document is the manual for @ProductName{}, last updated @value{UPDATED}. -Copyright @copyright{} 2022-2023 Free Software Foundation, Inc. - -@c -- @quotation -Permission is granted to copy, distribute and/or modify this document -under the terms of the GNU Free Documentation License, -Version 1.3 or any later version published by the Free Software -Foundation; with no Invariant Sections, with no Front-Cover texts, -and with no Back-Cover Texts. A copy of the license is included in the -section entitled ``GNU Free Documentation License.'' - -@c -- @end quotation -@end copying - -@finalout -@smallbook - -@c -- Define the title page --------------------------------------------------- -@titlepage -@title GNU gprofng -@subtitle The next generation profiling tool for Linux -@subtitle version @value{VERSION} (last updated @value{UPDATED}) -@author Ruud van der Pas -@page -@vskip 0pt plus 1filll -@insertcopying - -@c man begin COPYRIGHT +@c ---------------------------------------------------------------------------- +@c NAME section +@c ---------------------------------------------------------------------------- -Copyright @copyright{} 2022-2023 Free Software Foundation, Inc. +@ManPageStart{NAME} +@c man begin NAME -Permission is granted to copy, distribute and/or modify this document -under the terms of the GNU Free Documentation License, Version 1.3 -or any later version published by the Free Software Foundation; -with no Invariant Sections, with no Front-Cover Texts, and with no -Back-Cover Texts. A copy of the license is included in the -section entitled ``GNU Free Documentation License''. +gprofng - The driver for the gprofng application profiling tool @c man end +@ManPageEnd{} -@end titlepage - -@c -- Generate the Table of Contents ------------------------------------------ -@contents - -@c -- The Top node ------------------------------------------------------------ -@c Should contain a short summary, copying permissions and a master menu. @c ---------------------------------------------------------------------------- -@ifnottex -@node Top -@top GNU Gprofng - -@insertcopying -@end ifnottex - -@ifinfo -@c -- The menu entries -------------------------------------------------------- - -@menu -* Introduction:: About this manual. -* Overview:: A brief overview of @ProductName{}. -* A Mini Tutorial:: A short tutorial covering the key features. -* Terminology:: Various concepts and some terminology explained. -* Other Document Formats:: How to create this document in other formats. -* Index:: The index. - -@detailmenu - ---- The Detailed Node Listing --- - -Introduction - -Overview - -* Main Features:: A high level overview. -* Sampling versus Tracing:: The pros and cons of sampling versus tracing. -* Steps Needed to Create a Profile:: How to create a profile. - -A Mini Tutorial - -* Getting Started:: The basics of profiling with @ProductName(). -* Support for Multithreading:: Commands specific to multithreaded applications. -* Viewing Multiple Experiments:: Analyze multiple experiments. -* Profile Hardware Event Counters:: How to use hardware event counters. -* Java Profiling:: How to profile a Java application. - -Terminology +@c SYNOPSIS section +@c ---------------------------------------------------------------------------- -* The Program Counter:: What is a Program Counter? -* Inclusive and Exclusive Metrics:: An explanation of inclusive and exclusive metrics. -* Metric Definitions:: Definitions associated with metrics. -* The Viewmode:: Select the way call stacks are presented. -* The Selection List:: How to define a selection. -* Load Objects and Functions:: The components in an application. -* The Concept of a CPU in @ProductName{}:: The definition of a CPU. -* Hardware Event Counters Explained:: What are event counters? -* apath:: Our generic definition of a path. +@ManPageStart{SYNOPSIS} +@c man begin SYNOPSIS -@c -- Index +@command{gprofng} [@var{option(s)}] @var{action} [@var{qualifier}] [@var{option(s)}] @var{target} [@var{options}] -@end detailmenu -@end menu -@end ifinfo +@c man end +@ManPageEnd{} -@ifset man +@c ---------------------------------------------------------------------------- +@c DESCRIPTION section +@c ---------------------------------------------------------------------------- -@c man title gprofng the driver for the gprofng tool suite +@ManPageStart{DESCRIPTION} +@c man begin DESCRIPTION -@c man begin SYNOPSIS -gprofng [OPTION(S)] ACTION [@b{QUALIFIER}] [ARGUMENTS] TARGET -@c man end +This is the driver for the gprofng tools suite to gather and analyze performance +data. -@c man begin DESCRIPTION -This is the driver for the GPROFNG tools suite to gather and analyze performance data. +The driver executes the @var{action} specified. An example of an action is +@samp{collect} to collect performance data. Depending on the action, a +@var{qualifier} may be needed to further define the command. +The last item is the @var{target} that the command applies to. -The driver executes the action specified. An example of an action is @code{collect} -to collect performance data. Depending on the action, a qualifier may be needed to -define the command. Several qualifiers support options. The last item on the command -is the target the command applies to. +There are three places where options are supported. The driver supports +options. These can be found below. The @var{action}, possibly in combination +with the @var{qualifier} also supports options. A description of these can be +found in the man page for the command. Any options needed to execute the +target command should follow the target name. -For example, to collect performance data for an application called @code{a.out} and -store the results in experiment directory @code{mydata.er}, the following command may -be used: +For example, to collect performance data for an application called +@command{a.out} and store the results in experiment directory @samp{mydata.er}, +the following command may be used: @smallexample -$ gprofng collect app -o mydata.er a.out +$ gprofng collect app -o mydata.er a.out -t 2 @end smallexample -In this example, the action is @code{collect}, the qualifier is @code{app}, the single -argument is @code{-o mydata.er} and the target is @code{a.out}. +In this example, the action is @samp{collect}, the qualifier is @samp{app}, the single +argument to the command is @code{-o mydata.er} and the target is @command{a.out}. +The target command is invoked with the @samp{-t 2} option. If gprofng is executed without any additional option, action, or target, a usage overview is printed. @c man end +@ManPageEnd{} +@c ---------------------------------------------------------------------------- +@c OPTIONS section +@c ---------------------------------------------------------------------------- + +@ManPageStart{OPTIONS} @c man begin OPTIONS @table @gcctabopt @item @var{--version} -print the version number and exit. +@ifclear man +@IndexSubentry{Options, @code{--version}} +@end ifclear +Print the version number and exit. @item @var{--help} -print usage information and exit. +@ifclear man +@IndexSubentry{Options, @code{--help}} +@end ifclear +Print usage information and exit. @end table @c man end +@ManPageEnd{} -@c man begin NOTES - -The gprofng driver supports the following commands. - -@c The man pages for the commands below can be viewed using the command name with "gprofng" replaced by "gp" and the spaces replaced by a dash ("-"). For example the man page -@c name for "gprofng collect app" is "gp-collect-app". - -Collect performance data: - -@table @code - -@item gprofng collect app -collect application performance data. - -@end table - -Display the performance results: - -@table @code - -@item gprofng display text -display the performance data in ASCII format. - -@item gprofng display html -generate an HTML file from one or more experiments. - -@end table - -Miscellaneous commands: - -@table @code - -@item gprofng display src -display source or disassembly with compiler annotations. - -@item gprofng archive -include binaries and source code in an experiment directory. - -@end table - -It is also possible to invoke the lower level commands directly, but since -these are subject to change, in particular the options, we recommend to -use the driver. - -@c man end +@c ----------------------------------------------------------------------------- +@c ENVIRONMENT SECTION +@c ----------------------------------------------------------------------------- +@ManPageStart{ENVIRONMENT} @c man begin ENVIRONMENT + The following environment variables are supported: -@table @code +@table @samp @item @env{GPROFNG_MAX_CALL_STACK_DEPTH} -set the depth of the call stack (default is 256). +@cindex Environment variables +Set the depth of the call stack (default is 256). @item @env{GPROFNG_USE_JAVA_OPTIONS} -may be set when profiling a C/C++ application that uses dlopen() to execute Java code. +@cindex Environment variables +May be set when profiling a C/C++ application that uses dlopen() to execute +Java code. -@item @env{GPROFNG_SSH_REMOTE_DISPLAY} -use this variable to define the ssh command executed by the remote display tool. +@c -- deferred @item @env{GPROFNG_SSH_REMOTE_DISPLAY} +@c -- deferred Use this variable to define the ssh command executed by the remote display tool. -@item @env{GPROFNG_SKIP_VALIDATION} -set this variable to disable checking hardware, system, and Java versions. +@c -- deferred @item @env{GPROFNG_SKIP_VALIDATION} +@c -- deferred Set this variable to disable checking hardware, system, and Java versions. @item @env{GPROFNG_ALLOW_CORE_DUMP} -set this variable to allow a core file to be generated; otherwise an error report is created on /tmp. +@cindex Environment variables +Set this variable to allow a core file to be generated; otherwise an error +report is created on /tmp. @item @env{GPROFNG_ARCHIVE} -use this variable to define the settings for automatic archiving upon experiment recording completion. +@cindex Environment variables +Use this variable to define the settings for automatic archiving upon experiment +recording completion. @item @env{GPROFNG_ARCHIVE_COMMON_DIR} -set this variable to the location of the common archive. +@cindex Environment variables +Set this variable to the location of the common archive. @item @env{GPROFNG_JAVA_MAX_CALL_STACK_DEPTH} -set the depth of the Java call stack; the default is 256; set to 0 to disable capturing of call stacks. +@cindex Environment variables +Set the depth of the Java call stack; the default is 256; set to 0 to disable +capturing of call stacks. @item @env{GPROFNG_JAVA_NATIVE_MAX_CALL_STACK_DEPTH} -set the depth of the Java native call stack; the default is 256; set to 0 to disable capturing of call stacks (JNI and assembly call stacks are not captured). +@cindex Environment variables +Set the depth of the Java native call stack; the default is 256; set to 0 to +disable capturing of call stacks (JNI and assembly call stacks are not +captured). @end table @c man end +@ManPageEnd{} -@c man begin SEEALSO -The man pages for the various gprofng commands are not available yet, but -the @option{--help} option supported on each of the commands lists the options -and provides more information. - -For example this displays the options supported on the @command{gprofng collect app} -command: - -@smallexample -$ gprofng collect app --help -@end smallexample - -The user guide is available as an Info entry for @file{gprofng}. -@c man end - -@end ifset - -@c man begin DESCRIPTION -@c man end - -@c -- A new node -------------------------------------------------------------- -@node Introduction -@chapter Introduction -@c ---------------------------------------------------------------------------- -The @ProductName{} tool is the next generation profiler for Linux. It consists -of various commands to generate and display profile information. - -This manual starts with a tutorial how to create and interpret a profile. This -part is highly practical and has the goal to get users up to speed as quickly -as possible. As soon as possible, we would like to show you how to get your -first profile on your screen. - -This is followed by more examples, covering many of the features. At the -end of this tutorial, you should feel confident enough to tackle the more -complex tasks. - -In a future update a more formal reference manual will be included as well. -Since even in this tutorial we use certain terminology, we have included a -chapter with descriptions at the end. In case you encounter unfamiliar -wordings or terminology, please check this chapter. - -One word of caution. In several cases we had to somewhat tweak the screen -output in order to make it fit. This is why the output may look somewhat -different when you try things yourself. - -For now, we wish you a smooth profiling experience with @ProductName{} and -good luck tackling performance bottlenecks. - -@c -- A new node -------------------------------------------------------------- -@c cccccc @node A Brief Overview of @ProductName{} -@node Overview -@chapter A Brief Overview of @ProductName{} -@c ---------------------------------------------------------------------------- - -@menu -* Main Features:: A high level overview. -* Sampling versus Tracing:: The pros and cons of sampling versus tracing. -* Steps Needed to Create a Profile:: How to create a profile. -@end menu - -Before we cover this tool in quite some detail, we start with a brief overview -of what it is, and the main features. Since we know that many of you would -like to get started rightaway, already in this first chapter we explain the -basics of profiling with @ToolName{}. - -@c ---------------------------------------------------------------------------- -@c TBD Review this text. Probably be more specific on the gcc releases and -@c processor specifics. -@c ---------------------------------------------------------------------------- - -@c -- A new node -------------------------------------------------------------- -@node Main Features -@section Main Features -@c ---------------------------------------------------------------------------- - -@noindent -These are the main features of the @ProductName{} tool: - -@itemize @bullet - -@item -Profiling is supported for an application written in C, C++, Java, or Scala. - -@c TBD Java: up to 1.8 full support, support other than for modules - -@item -Shared libraries are supported. The information is presented at the instruction -level. - -@item -The following multithreading programming models are supported: Pthreads, -OpenMP, and Java threads. - -@item -This tool works with unmodified production level executables. There is no need to -recompile the code, but if the @code{-g} option has been used when building -the application, source line level information is available. - -@item -The focus is on support for code generated with the @code{gcc} compiler, but -there is some limited support for the @code{icc} compiler as well. Future -improvements and enhancements will focus on @code{gcc} though. - -@item -Processors from Intel, AMD, and Arm are supported, but the level of support -depends on the architectural details. In particular, hardware event counters -may not be supported. - -@item -Several views into the data are supported. For example, a function overview -where the time is spent, but also a source line, disassembly, call tree and -a caller-callees overview are available. - -@item -Through filters, the user can zoom in on an area of interest. - -@item -Two or more profiles can be aggregated, or used in a comparison. This comparison -can be obtained at the function, source line, and disassembly level. - -@item -Through a scripting language, and customization of the metrics shown, -the generation and creation of a profile can be fully automated and provide -tailored output. - -@end itemize - -@c -- A new node -------------------------------------------------------------- -@node Sampling versus Tracing -@section Sampling versus Tracing -@c ---------------------------------------------------------------------------- - -A key difference with some other profiling tools is that the main data -collection command @CollectApp{} mostly uses -@cindex Program Counter sampling -@cindex PC sampling -Program Counter (PC) sampling -under the hood. - -With @emph{sampling}, the executable is stopped at regular intervals. Each time -it is halted, key information is gathered and stored. This includes the Program -Counter that keeps track of where the execution is. Hence the name. - -Together with operational -data, this information is stored in the experiment directory and can be -viewed in the second phase. - -For example, the PC information is used to derive where the program was when -it was halted. Since the sampling interval is known, it is relatively easy to -derive how much time was spent in the various parts of the program. - -The opposite technique is generally referred to as @emph{tracing}. With -tracing, the target is instrumented with specific calls that collect the -requested information. - -These are some of the pros and cons of PC sampling verus tracing: - -@itemize - -@item -Since there is no need to recompile, existing executables can be used -and the profile measures the behaviour of exactly the same executable that is -used in production runs. - -With sampling, one inherently profiles a different executable because -the calls to the instrumentation library may affect the compiler optimizations -and run time behaviour. - -@item -With sampling, there are very few restrictions on what can be profiled and even without -access to the source code, a basic profile can be made. - -@item -A downside of sampling is that, depending on the sampling frequency, small -functions may be missed or not captured accurately. Although this is rare, -this may happen and is the reason why the user has control over the sampling rate. - -@item -While tracing produces precise information, sampling is statistical in nature. -As a result, small variations may occur across seemingly identical runs. We -have not observed more than a few percent deviation though. Especially if -the target job executed for a sufficiently long time. - -@item -With sampling, it is not possible to get an accurate count how often -functions are called. - -@end itemize - -@c -- A new node -------------------------------------------------------------- -@node Steps Needed to Create a Profile -@section Steps Needed to Create a Profile -@c ---------------------------------------------------------------------------- - -Creating a profile takes two steps. First the profile data needs to be -generated. This is followed by a viewing step to create a report from the -information that has been gathered. - -Every @ProductName{} command starts with @ToolName{}, the name of the driver. This is followed -by a keyword to define the high level functionality. Depending on this -keyword, a third qualifier may be needed to further narrow down the request. -This combination is then followed by options that are specific to the functionality -desired. - -The command to gather, or ``collect'', the performance data is called -@CollectApp{}. Aside from numerous options, this command takes the name -of the target executable as an input parameter. - -Upon completion of the run, the performance data can be -found in the newly created -@cindex Experiment directory -experiment directory. - -Unless explicitly specified otherwise, a default -name for this directory is chosen. The name is @code{test.<n>.er} where -@code{n} is the first integer number not in use yet for such a name. - -For example, the first time @CollectApp{} is invoked, an experiment -directory with the name @code{test.1.er} is created. - -Upon a subsequent invocation of @CollectApp{} in the same directory, -an experiment directory with the name @code{test.2.er} will be created, -and so forth. - -Note that @CollectApp{} supports an option to explicitly name the experiment directory. -Outside of the restriction that the name of this directory has to end -with @code{.er}, any valid directory name can be used for this. - -Now that we have the performance data, the next step is to display it. - -@pindex @DisplayText{} -The most commonly used command to view the performance information is -@DisplayText{}. This is a very extensive and customizable tool that -produces the information in ASCII format. - -@pindex @DisplayHTML{} -Another option is to use @DisplayHTML{}. This tool generates a directory with -files in html format. These can be viewed in a browser, allowing for easy -navigation through the profile data. - -@c -- A new node -------------------------------------------------------------- -@node A Mini Tutorial -@chapter A Mini Tutorial -@c ---------------------------------------------------------------------------- - -In this chapter we present and discuss the main functionality of @ToolName{}. -This will be a practical approach, using an example code to generate profile -data and show how to get various performance reports. - -@menu -* Getting Started:: The basics of profiling with @ProductName(). -* Support for Multithreading:: Commands specific to multithreaded applications. -* Viewing Multiple Experiments:: Analyze multiple experiments. -* Profile Hardware Event Counters:: How to use hardware event counters. -* Java Profiling:: How to profile a Java application. -@end menu - -@c -- A new node -------------------------------------------------------------- -@node Getting Started -@section Getting Started -@c ---------------------------------------------------------------------------- - -The information presented here provides a good and common basis for many -profiling tasks, but there are more features that you may want to leverage. - -These are covered in subsequent sections in this chapter. - -@menu -* The Example Program:: A description of the example program used. -* A First Profile:: How to get the first profile. -* The Source Code View:: Display the metrics in the source code. -* The Disassembly View:: Display the metrics at the instruction level. -* Display and Define the Metrics:: An example how to customize the metrics. -* A First Customization of the Output:: An example how to customize the output. -* Name the Experiment Directory:: Change the name of the experiment directory. -* Control the Number of Lines in the Output:: Change the number of lines in the tables. -* Sorting the Performance Data:: How to set the metric to sort by. -* Scripting:: Use a script to execute the commands. -* A More Elaborate Example:: An example of customization. -* The Call Tree:: Display the dynamic call tree. -* More Information on the Experiment:: How to get additional statistics. -* Control the Sampling Frequency:: How to control the sampling granularity. -* Information on Load Objects:: How to get more information on load objects. -@end menu - -@c -- A new node -------------------------------------------------------------- -@node The Example Program -@subsection The Example Program -@c ---------------------------------------------------------------------------- - -Throughout this guide we use the same example C code that implements the -multiplication of a vector of length @math{n} by an @math{m} by @math{n} -matrix. The result is stored in a vector of length @math{m}. -@cindex Pthreads -@cindex Posix Threads -The algorithm has been parallelized using Posix Threads, or Pthreads for short. - -The code was built using the @code{gcc} compiler and the name of the executable -is -@cindex mxv-pthreads.exe -mxv-pthreads.exe. - -The matrix sizes can be set through the @code{-m} and @code{-n} options. The -number of threads is set with the @code{-t} option. To increase the duration -of the run, the multiplication is executed repeatedly. - -This is an example that multiplies a @math{3000} by @math{2000} matrix with -a vector of length @math{2000} using @math{2} threads: - -@smallexample -@verbatim -$ ./mxv-pthreads.exe -m 3000 -n 2000 -t 2 -mxv: error check passed - rows = 3000 columns = 2000 threads = 2 -$ -@end verbatim -@end smallexample - -The program performs an internal check to verify the results are correct. -The result of this check is printed, followed by the matrix sizes and the -number of threads used. - -@c -- A new node -------------------------------------------------------------- -@node A First Profile -@subsection A First Profile -@c ---------------------------------------------------------------------------- - -The first step is to collect the performance data. It is important to remember -that much more information is gathered than may be shown by default. Often a -single data collection run is sufficient to get a lot of insight. - -The @CollectApp{} command is used for the data collection. Nothing needs to be -changed in the way the application is executed. The only difference is that it -is now run under control of the tool, as shown below: - -@cartouche -@smallexample -$ gprofng collect app ./mxv.pthreads.exe -m 3000 -n 2000 -t 1 -@end smallexample -@end cartouche - -This command produces the following output: - -@smallexample -@verbatim -Creating experiment database test.1.er (Process ID: 2416504) ... -mxv: error check passed - rows = 3000 columns = 2000 threads = 1 -@end verbatim -@end smallexample - -We see the message that a directory with the name @code{test.1.er} -has been created. -The application then completes as usual and we have our first experiment -directory that can be analyzed. - -The tool we use for this is called @DisplayText{}. It takes the name of -the experiment directory as an argument. - -@cindex Interpreter mode -If invoked this way, the tool starts in the interactive @emph{interpreter} mode. -While in this environment, commands can be given and the tool responds. This is -illustrated below: - -@smallexample -@verbatim -$ gprofng display text test.1.er -Warning: History and command editing is not supported on this system. -(gp-display-text) quit -$ -@end verbatim -@end smallexample - -@cindex Command line mode -While useful in certain cases, we prefer to use this tool in command line mode, -by specifying the commands to be issued when invoking the tool. The way to do -this is to prepend the command with a hyphen (@code{-}) if used on the command -line. - -For example, -@IndexSubentry{Commands, @code{functions}} -with the @code{functions} command we request a list of the functions that -have been executed and their respective CPU times: - -@cartouche -@smallexample -$ gprofng display text -functions test.1.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -$ gprofng display text -functions test.1.er -Functions sorted by metric: Exclusive Total CPU Time - -Excl. Incl. Name -Total Total -CPU sec. CPU sec. -2.272 2.272 <Total> -2.160 2.160 mxv_core -0.047 0.103 init_data -0.030 0.043 erand48_r -0.013 0.013 __drand48_iterate -0.013 0.056 drand48 -0.008 0.010 _int_malloc -0.001 0.001 brk -0.001 0.002 sysmalloc -0. 0.001 __default_morecore -0. 0.113 __libc_start_main -0. 0.010 allocate_data -0. 2.160 collector_root -0. 2.160 driver_mxv -0. 0.113 main -0. 0.010 malloc -0. 0.001 sbrk -@end verbatim -@end smallexample - -As easy and simple as these steps are, we do have a first profile of our program! -There are three columns. The first two contain the -@cindex Total CPU time -@emph{Total CPU Time}, -which -is the sum of the user and system time. @xref{Inclusive and Exclusive Metrics} -for an explanation of ``exclusive'' and ``inclusive'' times. - -The first line echoes the metric that is used to sort the output. By default, this -is the exclusive CPU time, but the sort metric can be changed by the user. - -We then see three columns with the exclusive and inclusive CPU times, plus the -name of the function. - -@IndexSubentry{Miscellaneous, @code{<Total>}} -The function with the name @code{<Total>} is not a user function, but is introduced -by @ToolName{} and is used to display the accumulated metric values. In this case, -we see that the total CPU time of this job was @code{2.272} seconds. - -With @code{2.160} seconds, function @code{mxv_core} is the most time -consuming function. It is also a leaf function. - -The next function in the list is @code{init_data}. Although the CPU time spent in -this part is negligible, this is an interesting entry because the inclusive CPU -time of @code{0.103} seconds is higher than the exclusive CPU time of @code{0.047} -seconds. Clearly it is calling another function, -or even more than one function. -@xref{The Call Tree} for the details how to get more information on this. - -The function @code{collector_root} does not look familiar. It is one of the internal -functions used by @CollectApp{} and can be ignored. While the inclusive time is high, -the exclusive time is zero. This means it doesn't contribute to the performance. - -The question is how we know where this function originates from? There is a very useful -command to get more details on a function. @xref{Information on Load Objects}. - -@c -- A new node -------------------------------------------------------------- -@node The Source Code View -@subsection The Source Code View -@c ---------------------------------------------------------------------------- - -In general, you would like to focus the tuning efforts on the most time -consuming part(s) of the program. In this case that is easy, since 2.160 -seconds on a total of 2.272 seconds is spent in function @code{mxv_core}. -That is 95% of the total and it is time to dig deeper and look -@cindex Source level timings -at the time distribution at the source code level. - -@IndexSubentry{Commands, @code{source}} -The @code{source} command is used to accomplish this. It takes the name of the -function, not the source filename, as an argument. This is demonstrated -below, where the @DisplayText{} command is used to show the annotated -source listing of function @code{mxv_core}. - -Please note that the source code has to be compiled with the @code{-g} -option in order for the source code feature to work. Otherwise the -location can not be determined. - -@cartouche -@smallexample -$ gprofng display text -source mxv_core test.1.er -@end smallexample -@end cartouche - -The slightly modified output is as follows: - -@smallexample -@verbatim -Source file: <apath>/mxv.c -Object file: mxv-pthreads.exe (found as test.1.er/archives/...) -Load Object: mxv-pthreads.exe (found as test.1.er/archives/...) - - Excl. Incl. - Total Total - CPU sec. CPU sec. - - <lines deleted> - <Function: mxv_core> - 0. 0. 32. void __attribute__ ((noinline)) - mxv_core ( - uint64_t row_index_start, - uint64_t row_index_end, - uint64_t m, uint64_t n, - double **restrict A, - double *restrict b, - double *restrict c) - 0. 0. 33. { - 0. 0. 34. for (uint64_t i=row_index_start; - i<=row_index_end; i++) { - 0. 0. 35. double row_sum = 0.0; -## 1.687 1.687 36. for (int64_t j=0; j<n; j++) - 0.473 0.473 37. row_sum += A[i][j]*b[j]; - 0. 0. 38. c[i] = row_sum; - 39. } - 0. 0. 40. } -@end verbatim -@end smallexample - -The first three lines provide information on the location of the source file, -the object file and the load object (@xref{Load Objects and Functions}). - -Function @code{mxv_core} is part of a source file that has other functions -as well. These functions will be shown, but without timing information. They -have been removed in the output shown above. - -This is followed by the annotated source code listing. The selected metrics -are shown first, followed by a source line number, and the source code. -@IndexSubentry{Miscellaneous ,@code{##}} -The most time consuming line(s) are marked with the @code{##} symbol. In -this way they are easier to find. - -What we see is that all of the time is spent in lines 36-37. - -@IndexSubentry{Commands, @code{lines}} -A related command sometimes comes handy as well. It is called @code{lines} -and displays a list of the source lines and their metrics, ordered according -to the current sort metric (@xref{Sorting the Performance Data}). - -Below the command and the output. For lay-out reasons, only the top 10 is -shown here and the last part of the text on some lines has been replaced -by dots. - -@cartouche -@smallexample -$ gprofng display text -lines test.1.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -Lines sorted by metric: Exclusive Total CPU Time - -Excl. Incl. Name -Total Total -CPU sec. CPU sec. -2.272 2.272 <Total> -1.687 1.687 mxv_core, line 36 in "mxv.c" -0.473 0.473 mxv_core, line 37 in "mxv.c" -0.032 0.088 init_data, line 72 in "manage_data.c" -0.030 0.043 <Function: erand48_r, instructions without line numbers> -0.013 0.013 <Function: __drand48_iterate, instructions without ...> -0.013 0.056 <Function: drand48, instructions without line numbers> -0.012 0.012 init_data, line 77 in "manage_data.c" -0.008 0.010 <Function: _int_malloc, instructions without ...> -0.003 0.003 init_data, line 71 in "manage_data.c" -@end verbatim -@end smallexample - -What this overview immediately highlights is that the next most time consuming -source line takes 0.032 seconds only. With an inclusive time of 0.088 seconds, -it is also clear that this branch of the code does not impact the performance. - -@c -- A new node -------------------------------------------------------------- -@node The Disassembly View -@subsection The Disassembly View -@c ---------------------------------------------------------------------------- - -The source view is very useful to obtain more insight where the time is spent, -but sometimes this is not sufficient. This is when the disassembly view comes -in. It is activated with the -@IndexSubentry{Commands, @code{disasm}} -@code{disasm} -command and as with the source view, it displays an annotated listing. In this -@cindex Instruction level timings -case it shows the instructions with the metrics, interleaved with the -source lines. The -instructions have a reference in square brackets (@code{[} and @code{]}) -to the source line they correspond to. - -@noindent -This is what we get for our example: - -@cartouche -@smallexample -$ gprofng display text -disasm mxv_core test.1.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -Source file: <apath>/mxv.c -Object file: mxv-pthreads.exe (found as test.1.er/archives/...) -Load Object: mxv-pthreads.exe (found as test.1.er/archives/...) - - Excl. Incl. - Total Total - CPU sec. CPU sec. - - <lines deleted> - 32. void __attribute__ ((noinline)) - mxv_core ( - uint64_t row_index_start, - uint64_t row_index_end, - uint64_t m, uint64_t n, - double **restrict A, - double *restrict b, - double *restrict c) - 33. { - <Function: mxv_core> - 0. 0. [33] 4021ba: mov 0x8(%rsp),%r10 - 34. for (uint64_t i=row_index_start; - i<=row_index_end; i++) { - 0. 0. [34] 4021bf: cmp %rsi,%rdi - 0. 0. [34] 4021c2: jbe 0x37 - 0. 0. [34] 4021c4: ret - 35. double row_sum = 0.0; - 36. for (int64_t j=0; j<n; j++) - 37. row_sum += A[i][j]*b[j]; - 0. 0. [37] 4021c5: mov (%r8,%rdi,8),%rdx - 0. 0. [36] 4021c9: mov $0x0,%eax - 0. 0. [35] 4021ce: pxor %xmm1,%xmm1 - 0.002 0.002 [37] 4021d2: movsd (%rdx,%rax,8),%xmm0 - 0.096 0.096 [37] 4021d7: mulsd (%r9,%rax,8),%xmm0 - 0.375 0.375 [37] 4021dd: addsd %xmm0,%xmm1 -## 1.683 1.683 [36] 4021e1: add $0x1,%rax - 0.004 0.004 [36] 4021e5: cmp %rax,%rcx - 0. 0. [36] 4021e8: jne 0xffffffffffffffea - 38. c[i] = row_sum; - 0. 0. [38] 4021ea: movsd %xmm1,(%r10,%rdi,8) - 0. 0. [34] 4021f0: add $0x1,%rdi - 0. 0. [34] 4021f4: cmp %rdi,%rsi - 0. 0. [34] 4021f7: jb 0xd - 0. 0. [35] 4021f9: pxor %xmm1,%xmm1 - 0. 0. [36] 4021fd: test %rcx,%rcx - 0. 0. [36] 402200: jne 0xffffffffffffffc5 - 0. 0. [36] 402202: jmp 0xffffffffffffffe8 - 39. } - 40. } - 0. 0. [40] 402204: ret -@end verbatim -@end smallexample - -For each instruction, the timing values are given and we can exactly which ones -are the most expensive. As with the source level view, the most expensive -instructions are market with the @code{##} symbol. - -As illustrated below and similar to the @code{lines} command, we can get -an overview of the instructions executed by using the -@IndexSubentry{Commands, @code{pcs}} -@code{pcs} -command. - -@noindent -Below the command and the output, which again has been restricted -to 10 lines: - -@cartouche -@smallexample -$ gprofng display text -pcs test.1.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -PCs sorted by metric: Exclusive Total CPU Time - -Excl. Incl. Name -Total Total -CPU sec. CPU sec. -2.272 2.272 <Total> -1.683 1.683 mxv_core + 0x00000027, line 36 in "mxv.c" -0.375 0.375 mxv_core + 0x00000023, line 37 in "mxv.c" -0.096 0.096 mxv_core + 0x0000001D, line 37 in "mxv.c" -0.027 0.027 init_data + 0x000000BD, line 72 in "manage_data.c" -0.012 0.012 init_data + 0x00000117, line 77 in "manage_data.c" -0.008 0.008 _int_malloc + 0x00000A45 -0.007 0.007 erand48_r + 0x00000062 -0.006 0.006 drand48 + 0x00000000 -0.005 0.005 __drand48_iterate + 0x00000005 -@end verbatim -@end smallexample - -@c -- A new node -------------------------------------------------------------- -@node Display and Define the Metrics -@subsection Display and Define the Metrics -@c ---------------------------------------------------------------------------- - -The default metrics shown by @DisplayText{} are useful, but there is more -recorded than displayed. We can customize the values shown by defining the -metrics ourselves. - -@IndexSubentry{Commands, @code{metric_list}} -There are two commands related to changing the metrics shown: @code{metric_list} -and -@IndexSubentry{Commands, @code{metrics}} -@code{metrics}. - -The first command shows the metrics in use, plus all the metrics that have -been stored as part of the experiment. The second command may be used to -define the metric list. - -In our example we get the following values for the metrics: - -@IndexSubentry{Commands, @code{metric_list}} -@cartouche -@smallexample -$ gprofng display text -metric_list test.1.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -Current metrics: e.totalcpu:i.totalcpu:name -Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu ) -Available metrics: - Exclusive Total CPU Time: e.%totalcpu - Inclusive Total CPU Time: i.%totalcpu - Size: size - PC Address: address - Name: name -@end verbatim -@end smallexample - -This shows the metrics currently in use, the metric that is used to sort -the data and all the metrics that have been recorded, but are not necessarily -shown. - -@cindex Default metrics -In this case, the default metrics are set to the exclusive and inclusive -total CPU times, plus the name of the function, or load object. - -@IndexSubentry{Commands, @code{metrics}} -The @code{metrics} command is used to define the metrics that need to be -displayed. - -For example, to display the exclusive total CPU time, both as a number and a -percentage, use the following metric definition: @code{e.%totalcpu} - -Since the metrics can be tailored for different views, there is a way -to reset them to the default. This is done through the special keyword -@code{default}. - -@c -- A new node -------------------------------------------------------------- -@node A First Customization of the Output -@subsection A First Customization of the Output -@c ---------------------------------------------------------------------------- - -With the information just given, we can customize the function overview. -For sake of the example, we would like to display the name of the function -first, followed by the exclusive CPU time, given as an absolute number and -a percentage. - -Note that the commands are parsed in order of appearance. This is why we -need to define the metrics @emph{before} requesting the function overview: - -@cartouche -@smallexample -$ gprofng display text -metrics name:e.%totalcpu -functions test.1.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -Current metrics: name:e.%totalcpu -Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) -Functions sorted by metric: Exclusive Total CPU Time - -Name Excl. Total - CPU - sec. % - <Total> 2.272 100.00 - mxv_core 2.160 95.04 - init_data 0.047 2.06 - erand48_r 0.030 1.32 - __drand48_iterate 0.013 0.57 - drand48 0.013 0.57 - _int_malloc 0.008 0.35 - brk 0.001 0.04 - sysmalloc 0.001 0.04 - __default_morecore 0. 0. - __libc_start_main 0. 0. - allocate_data 0. 0. - collector_root 0. 0. - driver_mxv 0. 0. - main 0. 0. - malloc 0. 0. - sbrk 0. 0. -@end verbatim -@end smallexample - -This was a first and simple example how to customize the output. Note that we -did not rerun our profiling job and merely modified the display settings. -Below we will show other and also more advanced examples of customization. - - -@c -- A new node -------------------------------------------------------------- -@node Name the Experiment Directory -@subsection Name the Experiment Directory -@c ---------------------------------------------------------------------------- - -When using @CollectApp{}, the default names for experiments work fine, but -they are quite generic. It is often more convenient to select a more -descriptive name. For example, one that reflects conditions for the experiment -conducted. - -For this, the mutually exclusive @code{-o} and @code{-O} options come in handy. -Both may be used to provide a name for the experiment directory, but the -behaviour of @CollectApp{} is different. - -With the -@IndexSubentry{Options, @code{-o}} -@code{-o} -option, an existing experiment directory is not overwritten. You either -need to explicitly remove an existing directory first, or use a name that is not -in use yet. - -This is in contrast with the behaviour for the - @IndexSubentry{Options, @code{-O}} -@code{-O} -option. Any existing (experiment) directory with the same name is silently -overwritten. - -Be aware that the name of the experiment directory has to end with @code{.er}. - -@c -- A new node -------------------------------------------------------------- -@node Control the Number of Lines in the Output -@subsection Control the Number of Lines in the Output -@c ---------------------------------------------------------------------------- - -@IndexSubentry{Commands, @code{limit}} -The @code{limit <n>} command can be used to control the number of lines printed -in various overviews, including the function view, but it also takes effect -for other display commands, like @code{lines}. - -The argument @code{<n>} should be a positive integer number. It sets the number -of lines in the function view. A value of zero resets the limit to the default. - -Be aware that the pseudo-function @code{<Total>} counts as a regular function. -For example @code{limit 10} displays nine user level functions. - -@c -- A new node -------------------------------------------------------------- -@node Sorting the Performance Data -@subsection Sorting the Performance Data -@c ---------------------------------------------------------------------------- - -@IndexSubentry{Commands, @code{sort}} -The @code{sort <key>} command sets the key to be used when sorting the -performance data. - -The key is a valid metric definition, but the -@cindex Visibility field -visibility field -(@xref{Metric Definitions}) -in the metric -definition is ignored since this does not affect the outcome of the sorting -operation. -For example if we set the sort key to @code{e.totalcpu}, the values -will be sorted in descending order with respect to the exclusive total -CPU time. - -The data can be sorted in reverse order by prepending the metric definition -with a minus (@code{-}) sign. For example @code{sort -e.totalcpu}. - -A default metric for the sort operation has been defined and since this is -a persistent command, this default can be restored with @code{default} as -the key. - -@c -- A new node -------------------------------------------------------------- -@node Scripting -@subsection Scripting @c ---------------------------------------------------------------------------- - -As is probably clear by now, the list with commands for @DisplayText{} can be -very long. This is tedious and also error prone. Luckily, there is an easier and -more elegant way to control the behaviour of this tool. - -@IndexSubentry{Commands, @code{script}} -Through the @code{script} command, the name of a file with commands can be -passed in. These commands are parsed and executed as if they appeared on -the command line in the same order as encountered in the file. The commands -in this script file can actually be mixed with commands on the command line. - -The difference between the commands in the script file and those used on the -command line is that the latter require a leading dash (@code{-}) symbol. - -Comment lines are supported. They need to start with the @code{#} symbol. - -@c -- A new node -------------------------------------------------------------- -@node A More Elaborate Example -@subsection A More Elaborate Example +@c NOTES section @c ---------------------------------------------------------------------------- -With the information presented so far, we can customize our data -gathering and display commands. - -As an example, to reflect the name of the algorithm and the number of threads -that were used in the experiment, we select @code{mxv.1.thr.er} -as the name of the experiment directory. -All we then need to -do is to add the - @IndexSubentry{Options, @code{-O}} -@code{-O} -option followed by this name on the command line when running @CollectApp{}: - -@cartouche -@smallexample -$ exe=mxv-pthreads.exe -$ m=3000 -$ n=2000 -$ gprofng collect app -O mxv.1.thr.er ./$exe -m $m -n $n -t 1 -@end smallexample -@end cartouche - -The commands to generate the profile are put into a file that we simply call -@code{my-script}: - -@smallexample -@verbatim -$ cat my-script -# This is my first gprofng script -# Set the metrics -metrics i.%totalcpu:e.%totalcpu:name -# Use the exclusive time to sort -sort e.totalcpu -# Limit the function list to 5 lines -limit 5 -# Show the function list -functions -@end verbatim -@end smallexample - -This script file is then specified as input to the @DisplayText{} command -that is used to display the performance information stored in -@code{mxv.1.thr.er}: - -@cartouche -@smallexample -$ gprofng display text -script my-script mxv.1.thr.er -@end smallexample -@end cartouche - -The command above produces the following output: - -@smallexample -@verbatim -# This is my first gprofng script -# Set the metrics -Current metrics: i.%totalcpu:e.%totalcpu:name -Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) -# Use the exclusive time to sort -Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) -# Limit the function list to 5 lines -Print limit set to 5 -# Show the function list -Functions sorted by metric: Exclusive Total CPU Time - -Incl. Total Excl. Total Name -CPU CPU - sec. % sec. % -2.272 100.00 2.272 100.00 <Total> -2.159 95.00 2.159 95.00 mxv_core -0.102 4.48 0.054 2.37 init_data -0.035 1.54 0.025 1.10 erand48_r -0.048 2.11 0.013 0.57 drand48 -@end verbatim -@end smallexample - -In the first part of the output, our comment lines in the script file are -shown. These are interleaved with an acknowledgement message for the commands. - -This is followed by a profile consisting of 5 lines only. For both metrics, -the percentages plus the timings are given. The numbers are sorted with respect -to the exclusive total CPU time. - -It is now immediately clear that function @code{mxv_core} is responsbile for -95% of the CPU time and @code{init_data} takes 4.5% only. - -This is also where we see sampling in action. Although this is exactly the -same job we profiled before, the timings are somewhat different, but the -differences are very small. - -@c -- A new node -------------------------------------------------------------- -@node The Call Tree -@subsection The Call Tree -@c ---------------------------------------------------------------------------- - -The call tree shows the dynamic hierarchy of the application by displaying the -functions executed and their parent. It helps to find the most expensive path -in the program. - -@IndexSubentry{Commands, @code{calltree}} -This feature is enabled through the @code{calltree} command. This is how to get -this tree for our current experiment: - -@cartouche -@smallexample -$ gprofng display text -calltree mxv.1.thr.er -@end smallexample -@end cartouche - -This displays the following structure: - -@smallexample -@verbatim -Functions Call Tree. Metric: Attributed Total CPU Time - -Attr. Name -Total -CPU sec. -2.272 +-<Total> -2.159 +-collector_root -2.159 | +-driver_mxv -2.159 | +-mxv_core -0.114 +-__libc_start_main -0.114 +-main -0.102 +-init_data -0.048 | +-drand48 -0.035 | +-erand48_r -0.010 | +-__drand48_iterate -0.011 +-allocate_data -0.011 | +-malloc -0.011 | +-_int_malloc -0.001 | +-sysmalloc -0.001 +-check_results -0.001 +-malloc -0.001 +-_int_malloc -@end verbatim -@end smallexample - -At first sight this may not be what you expected and some explanation is in -place. - -@c ---------------------------------------------------------------------------- -@c TBD: Revise this text when we have user and machine mode. -@c ---------------------------------------------------------------------------- -First of all, function @code{collector_root} is internal to @ToolName{} and -should be hidden to the user. This is part of a planned future enhancement. - -Recall that the @code{objects} and @code{fsingle} commands are very useful -to find out more about load objects in general, but also to help identify -an unknown entry in the function overview. @xref{Load Objects and Functions}. - -Another thing to note is that there are two main branches. The one under -@code{collector_root} and the second one under @code{__libc_start_main}. -This reflects the fact that we are executing a parallel program. Even though -we only used one thread for this run, this is still executed in a separate -path. - -The main, sequential part of the program is displayed under @code{main} and -shows the functions called and the time they took. - -There are two things worth noting for the call tree feature: - -@itemize - -@item -This is a dynamic tree and since sampling is used, it most likely looks -slighlty different across seemingly identical profile runs. In case the -run times are short, it is worth considering to use a high resolution -through the -@IndexSubentry{Options, @code{-p}} -@code{-p} -option. For example to use @code{-p hi} to increase the sampling rate. - -@item -In case hardware event counters have been enabled -(@xref{Profile Hardware Event Counters}), these values are also displayed -in the call tree view. - -@end itemize - -@c -- A new node -------------------------------------------------------------- -@node More Information on the Experiment -@subsection More Information on the Experiment -@c ---------------------------------------------------------------------------- - -The experiment directory not only contains performance related data. Several -system characteristics, the actually command executed, and some global -performance statistics can be displayed. - -@IndexSubentry{Commands, @code{header}} -The @code{header} command displays information about the experiment(s). -For example, this is the command to extract this data from for our experiment -directory: - -@cartouche -@smallexample -$ gprofng display text -header mxv.1.thr.er -@end smallexample -@end cartouche - -The above command prints the following information. Note that some of the -lay-out and the information has been modified. The textual changes are -marked with the @code{<} and @code{>} symbols. - -@smallexample -@verbatim -Experiment: mxv.1.thr.er -No errors -No warnings -Archive command `gp-archive -n -a on - --outfile <exp_dir>/archive.log <exp_dir>' - -Target command (64-bit): './mxv-pthreads.exe -m 3000 -n 2000 -t 1' -Process pid 30591, ppid 30589, pgrp 30551, sid 30468 -Current working directory: <cwd> -Collector version: `2.36.50'; experiment version 12.4 (64-bit) -Host `<hostname>', OS `Linux <version>', page size 4096, - architecture `x86_64' - 16 CPUs, clock speed 1995 MHz. - Memory: 30871514 pages @ 4096 = 120591 MB. -Data collection parameters: - Clock-profiling, interval = 997 microsecs. - Periodic sampling, 1 secs. - Follow descendant processes from: fork|exec|combo - -Experiment started <date and time> - -Experiment Ended: 2.293162658 -Data Collection Duration: 2.293162658 -@end verbatim -@end smallexample - -The output above may assist in troubleshooting, or to verify some of the -operational conditions and we recommand to include this command when -generating a profile. - -@IndexSubentry{Options, @code{-C}} -Related to this command there is a useful option to record your own comment(s) in -an experiment. -To this end, use the @code{-C} option on the @CollectApp{} tool to -specify a comment string. Up to ten comment lines can be included. -These comments are displayed with the @code{header} command on -the @DisplayText{} tool. - -@IndexSubentry{Commands, @code{overview}} -The @code{overview} command displays information on the experiment(s) and also -shows a summary of the values for the metric(s) used. This is an example how to -use it on our newly created experiment directory: - -@cartouche -@smallexample -$ gprofng display text -overview mxv.1.thr.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -Experiment(s): - -Experiment :mxv.1.thr.er - Target : './mxv-pthreads.exe -m 3000 -n 2000 -t 1' - Host : <hostname> (<ISA>, Linux <version>) - Start Time : <date and time> - Duration : 2.293 Seconds - -Metrics: - - Experiment Duration (Seconds): [2.293] - Clock Profiling - [X]Total CPU Time - totalcpu (Seconds): [*2.272] - -Notes: '*' indicates hot metrics, '[X]' indicates currently enabled - metrics. - The metrics command can be used to change selections. The - metric_list command lists all available metrics. -@end verbatim -@end smallexample - -This command provides a dashboard overview that helps to easily identify -where the time is spent and in case hardware event counters are used, it -shows their total values. - -@c -- A new node -------------------------------------------------------------- -@node Control the Sampling Frequency -@subsection Control the Sampling Frequency -@c ---------------------------------------------------------------------------- - -So far we did not talk about the frequency of the sampling process, but in -some cases it is useful to change the default of 10 milliseconds. - -The advantage of increasing the sampling frequency is that functions that -do not take much time per invocation are more accurately captured. The -downside is that more data is gathered. This has an impact on the overhead -of the collection process and more disk space is required. - -In general this is not an immediate concern, but with heavily threaded -applications that run for an extended period of time, increasing the -frequency may have a more noticeable impact. - -@IndexSubentry{Options, @code{-p}} -The @code{-p} option on the @CollectApp{} tool is used to enable or disable -clock based profiling, or to explicitly set the sampling rate. -@cindex Sampling interval -This option takes one of the following keywords: - -@table @code - -@item off -Disable clock based profiling. - -@item on -Enable clock based profiling with a per thread sampling interval of 10 ms. This is the default. - -@item lo -Enable clock based profiling with a per thread sampling interval of 100 ms. - -@item hi -Enable clock based profiling with a per thread sampling interval of 1 ms. - -@item <value> -Enable clock based profiling with a per thread sampling interval of <value>. - -@end table - -One may wonder why there is an option to disable clock based profiling. This -is because by default, it is enabled when conducting hardware event counter -experiments (@xref{Profile Hardware Event Counters}). -With the @code{-p off} option, this can be disabled. - -If an explicit value is set for the sampling, the number can be an integer or a -floating-point number. -A suffix of @code{u} for microseconds, or @code{m} for milliseconds is supported. -If no suffix is used, the value is assumed to be in milliseconds. - -If the value is smaller than the clock profiling minimum, a warning message is issued -and it is set to the minimum. -In case it is not a multiple of the clock profiling resolution, it is silently rounded -down to the nearest multiple of the clock resolution. - -If the value exceeds the clock profiling maximum, is negative, or zero, an error is -reported. - -@IndexSubentry{Commands, @code{header}} -Note that the @code{header} command echoes the sampling rate used. - -@c -- A new node -------------------------------------------------------------- -@node Information on Load Objects -@subsection Information on Load Objects -@c ---------------------------------------------------------------------------- - -It may happen that the function list contains a function that is not known to -the user. This can easily happen with library functions for example. -Luckily there are three commands that come in handy then. - -@IndexSubentry{Commands, @code{objects}} -@IndexSubentry{Commands, @code{fsingle}} -@IndexSubentry{Commands, @code{fsummary}} -These commands are @code{objects}, @code{fsingle}, and @code{fsummary}. -They provide details on -@cindex Load objects -load objects (@xref{Load Objects and Functions}). - -The @code{objects} command lists all load objects that have been referenced -during the performance experiment. -Below we show the command and the result for our profile job. Like before, -the (long) path names in the output have been shortened and replaced by the -@IndexSubentry{Miscellaneous, @code{<apath>}} -@code{<apath>} symbol that represents an absolute directory path. - -@cartouche -@smallexample -$ gprofng display text -objects mxv.1.thr.er -@end smallexample -@end cartouche - -The output includes the name and path of the target executable: - -@smallexample -@verbatim - <Unknown> (<Unknown>) - <mxv-pthreads.exe> (<apath>/mxv-pthreads.exe) - <librt-2.17.so> (/usr/lib64/librt-2.17.so) - <libdl-2.17.so> (/usr/lib64/libdl-2.17.so) - <libbfd-2.36.50.20210505.so> (<apath>/libbfd-2.36.50 <etc>) - <libopcodes-2.36.50.20210505.so> (<apath>/libopcodes-2. <etc>) - <libc-2.17.so> (/usr/lib64/libc-2.17.so) - <libpthread-2.17.so> (/usr/lib64/libpthread-2.17.so) - <libm-2.17.so> (/usr/lib64/libm-2.17.so) - <libgp-collector.so> (<apath>/libgp-collector.so) - <ld-2.17.so> (/usr/lib64/ld-2.17.so) - <DYNAMIC_FUNCTIONS> (DYNAMIC_FUNCTIONS) -@end verbatim -@end smallexample - -@IndexSubentry{Commands, @code{fsingle}} -The @code{fsingle} command may be used to get more details on a specific entry -in the function view, say. For example, the command below provides additional -information on the @code{collector_root} function shown in the function overview. - -@cartouche -@smallexample -$ gprofng display text -fsingle collector_root mxv.1.thr.er -@end smallexample -@end cartouche - -Below the output from this command. It has been somewhat modified to match the -display requirements. - -@smallexample -@verbatim -collector_root - Exclusive Total CPU Time: 0. ( 0. %) - Inclusive Total CPU Time: 2.159 ( 95.0%) - Size: 401 - PC Address: 10:0x0001db60 - Source File: <apath>/dispatcher.c - Object File: mxv.1.thr.er/archives/libgp-collector.so_HpzZ6wMR-3b - Load Object: <apath>/libgp-collector.so - Mangled Name: - Aliases: -@end verbatim -@end smallexample - -In this table we not only see how much time was spent in this function, we -also see where it originates from. In addition to this, the size and start -address are given as well. If the source code location is known it is also -shown here. - -@IndexSubentry{Commands, @code{fsummary}} -The related @code{fsummary} command displays the same information as -@code{fsingle}, but for all functions in the function overview, -including @code{<Total>}: - -@cartouche -@smallexample -$ gprofng display text -fsummary mxv.1.thr.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -Functions sorted by metric: Exclusive Total CPU Time - -<Total> - Exclusive Total CPU Time: 2.272 (100.0%) - Inclusive Total CPU Time: 2.272 (100.0%) - Size: 0 - PC Address: 1:0x00000000 - Source File: (unknown) - Object File: (unknown) - Load Object: <Total> - Mangled Name: - Aliases: - -mxv_core - Exclusive Total CPU Time: 2.159 ( 95.0%) - Inclusive Total CPU Time: 2.159 ( 95.0%) - Size: 75 - PC Address: 2:0x000021ba - Source File: <apath>/mxv.c - Object File: mxv.1.thr.er/archives/mxv-pthreads.exe_hRxWdccbJPc - Load Object: <apath>/mxv-pthreads.exe - Mangled Name: - Aliases: - - ... etc ... -@end verbatim -@end smallexample - -@c -- A new node -------------------------------------------------------------- -@node Support for Multithreading -@section Support for Multithreading -@c ---------------------------------------------------------------------------- - -In this chapter we introduce and discuss the support for multithreading. As -is shown below, nothing needs to be changed when collecting the performance -data. - -The difference is that additional commands are available to get more -information on the parallel environment, plus that several filters allow -the user to zoom in on specific threads. - -@c -- A new node -------------------------------------------------------------- -@node Creating a Multithreading Experiment -@subsection Creating a Multithreading Experiment -@c ---------------------------------------------------------------------------- - -We demonstrate the support for multithreading using the same code and settings -as before, but this time we use 2 threads: - -@cartouche -@smallexample -$ exe=mxv-pthreads.exe -$ m=3000 -$ n=2000 -$ gprofng collect app -O mxv.2.thr.er ./$exe -m $m -n $n -t 2 -@end smallexample -@end cartouche - -First of all, note that we did not change anything, other than setting the -number of threads to 2. Nothing special is needed to profile a multithreaded -job when using @ToolName{}. - -The same is true when displaying the performance results. The same commands -that we used before work unmodified. For example, this is all that is needed to -get a function overview: - -@cartouche -@smallexample -$ gpprofng display text -limit 10 -functions mxv.2.thr.er -@end smallexample -@end cartouche - -This produces the following familiar looking output: - -@smallexample -@verbatim -Print limit set to 10 -Functions sorted by metric: Exclusive Total CPU Time - -Excl. Incl. Name -Total Total -CPU sec. CPU sec. -2.268 2.268 <Total> -2.155 2.155 mxv_core -0.044 0.103 init_data -0.030 0.046 erand48_r -0.016 0.016 __drand48_iterate -0.013 0.059 drand48 -0.008 0.011 _int_malloc -0.003 0.003 brk -0. 0.003 __default_morecore -0. 0.114 __libc_start_main -@end verbatim -@end smallexample - -@c -- A new node -------------------------------------------------------------- -@node Commands Specific to Multithreading -@subsection Commands Specific to Multithreading -@c ---------------------------------------------------------------------------- - -The function overview shown above shows the results aggregated over all the -threads. The interesting new element is that we can also look at the -performance data for the individual threads. - -@IndexSubentry{Commands, @code{thread_list}} -The @code{thread_list} command displays how many threads have been used: - -@cartouche -@smallexample -$ gprofng display text -thread_list mxv.2.thr.er -@end smallexample -@end cartouche - -This produces the following output, showing that three threads have -been used: - -@smallexample -@verbatim -Exp Sel Total -=== === ===== - 1 all 3 -@end verbatim -@end smallexample - -The output confirms there is one experiment and that by default all -threads are selected. - -It may seem surprising to see three threads here, since we used the -@code{-t 2} option, but it is common for a Pthreads program to use one -additional thread. This is typically the thread that runs from start to -finish and handles the sequential portions of the code, as well as takes -care of managing the threads. - -It is no different in our example code. At some point, the main thread -creates and activates the two threads that perform the multiplication -of the matrix with the vector. Upon completion of this computation, -the main thread continues. - -@IndexSubentry{Commands, @code{threads}} -The @code{threads} command is simple, yet very powerful. It shows the -total value of the metrics for each thread. To make it easier to -interpret the data, we modify the metrics to include percentages: - -@cartouche -@smallexample -$ gprofng display text -metrics e.%totalcpu -threads mxv.2.thr.er -@end smallexample -@end cartouche - -The command above produces the following overview: - -@smallexample -@verbatim -Current metrics: e.%totalcpu:name -Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) -Objects sorted by metric: Exclusive Total CPU Time - -Excl. Total Name -CPU - sec. % -2.258 100.00 <Total> -1.075 47.59 Process 1, Thread 3 -1.070 47.37 Process 1, Thread 2 -0.114 5.03 Process 1, Thread 1 -@end verbatim -@end smallexample - -The first line gives the total CPU time accumulated over the threads -selected. This is followed by the metric value(s) for each thread. - -From this it is clear that the main thread is responsible for 5% of -the total CPU time, while the other two threads take 47% each. - -This view is ideally suited to verify if there any load balancing -issues and also to find the most time consuming thread(s). - -@IndexSubentry{Filters, Thread selection} -While useful, often more information than this is needed. This is -@IndexSubentry{Commands, @code{thread_select}} -where the thread selection filter comes in. Through the @code{thread_select} -command, one or more threads may be selected -(@xref{The Selection List} how to define the selection list). - -Since it is most common to use this command in a script, we do so as -well here. Below the script we are using: - -@cartouche -@smallexample -# Define the metrics -metrics e.%totalcpu -# Limit the output to 10 lines -limit 10 -# Get the function overview for thread 1 -thread_select 1 -functions -# Get the function overview for thread 2 -thread_select 2 -functions -# Get the function overview for thread 3 -thread_select 3 -functions -@end smallexample -@end cartouche - -The definition of the metrics and the output limiter has been shown and -explained before and will be ignored. The new command we focus on is -@IndexSubentry{Commands, @code{thread_select}} -@code{thread_select}. - -This command takes a list (@xref{The Selection List}) to select specific -threads. In this case we simply use the individual thread numbers that we -obtained with the @code{thread_list} command earlier. - -This restricts the output of the @code{functions} command to the thread -number(s) specified. This means that the script above shows which -function(s) each thread executes and how much CPU time they consumed. -Both the timings and their percentages are given. - -This is the relevant part of the output for the first thread: - -@smallexample -@verbatim -# Get the function overview for thread 1 -Exp Sel Total -=== === ===== - 1 1 3 -Functions sorted by metric: Exclusive Total CPU Time - -Excl. Total Name -CPU - sec. % -0.114 100.00 <Total> -0.051 44.74 init_data -0.028 24.56 erand48_r -0.017 14.91 __drand48_iterate -0.010 8.77 _int_malloc -0.008 7.02 drand48 -0. 0. __libc_start_main -0. 0. allocate_data -0. 0. main -0. 0. malloc -@end verbatim -@end smallexample - -As usual, the comment lines are echoed. This is followed by a confirmation -of our selection. We see that indeed thread 1 has been selected. What is -displayed next is the function overview for this particular thread. Due to -the @code{limit 10} command, there are ten entries in this list. - -Below are the overviews for threads 2 and 3 respectively. We see that all -of the CPU time is spent in function @code{mxv_core} and that this time -is approximately the same for both threads. - -@smallexample -@verbatim -# Get the function overview for thread 2 -Exp Sel Total -=== === ===== - 1 2 3 -Functions sorted by metric: Exclusive Total CPU Time - -Excl. Total Name -CPU - sec. % -1.072 100.00 <Total> -1.072 100.00 mxv_core -0. 0. collector_root -0. 0. driver_mxv - -# Get the function overview for thread 3 -Exp Sel Total -=== === ===== - 1 3 3 -Functions sorted by metric: Exclusive Total CPU Time - -Excl. Total Name -CPU - sec. % -1.076 100.00 <Total> -1.076 100.00 mxv_core -0. 0. collector_root -0. 0. driver_mxv -@end verbatim -@end smallexample - -When analyzing the performance of a multithreaded application, it is sometimes -useful to know whether threads have mostly executed on the same core, say, or -if they have wandered across multiple cores. This sort of stickiness is usually -referred to as -@cindex Thread affinity -@emph{thread affinity}. - -Similar to the commands for the threads, there are several commands related -to the usage of the cores, or @emph{CPUs} as they are called in @ToolName{} -(@xref{The Concept of a CPU in @ProductName{}}). - -In order to have some more interesting data to look at, we created a new -experiment, this time using 8 threads: - -@cartouche -@smallexample -$ exe=mxv-pthreads.exe -$ m=3000 -$ n=2000 -$ gprofng collect app -O mxv.8.thr.er ./$exe -m $m -n $n -t 8 -@end smallexample -@end cartouche - -@IndexSubentry{Commands, @code{cpu_list}} -Similar to the @code{thread_list} command, the @code{cpu_list} command -displays how many CPUs have been used. -@IndexSubentry{Commands, @code{cpus}} -The equivalent of the @code{threads} threads command, is the @code{cpus} -command, which shows the CPU numbers that were used and how much time was -spent on each of them. Both are demonstrated below. - -@cartouche -@smallexample -$ gprofng display text -metrics e.%totalcpu -cpu_list -cpus mxv.8.thr.er -@end smallexample -@end cartouche - -This command produces the following output: - -@smallexample -@verbatim -Current metrics: e.%totalcpu:name -Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) -Exp Sel Total -=== === ===== - 1 all 10 -Objects sorted by metric: Exclusive Total CPU Time - -Excl. Total Name -CPU - sec. % -2.310 100.00 <Total> -0.286 12.39 CPU 7 -0.284 12.30 CPU 13 -0.282 12.21 CPU 5 -0.280 12.13 CPU 14 -0.266 11.52 CPU 9 -0.265 11.48 CPU 2 -0.264 11.44 CPU 11 -0.194 8.42 CPU 0 -0.114 4.92 CPU 1 -0.074 3.19 CPU 15 -@end verbatim -@end smallexample - -@c ---------------------------------------------------------------------------- -@c TBD - Ruud -@c I'd like to improve this and have a way to see where a thread has executed. -@c ---------------------------------------------------------------------------- - -What we see in this table is that a total of 10 CPUs have been used. This is -followed by a list with all the CPU numbers that have been used during the -run. For each CPU it is shown how much time was spent on it. - -While the table with thread times shown earlier may point at a load imbalance -in the application, this overview has a different purpose. - -For example, we see that 10 CPUs have been used, but we know that the -application uses 9 threads only. -This means that at least one thread has executed on more than one CPU. In -itself this is not something to worry about, but warrants a deeper -investigation. - -Honesty dictates that next we performed a pre-analysis to find out -which thread(s) have been running on more than one CPU. We found this -to be thread 7. It has executed on CPUs 0 and 15. - -With this knowledge, we wrote the script shown below. It zooms in on -the behaviour of thread 7. - -@cartouche -@smallexample -# Define the metrics -metrics e.%totalcpu -# Limit the output to 10 lines -limit 10 -functions -# Get the function overview for CPU 0 -cpu_select 0 -functions -# Get the function overview for CPU 15 -cpu_select 15 -functions -@end smallexample -@end cartouche - -From the earlier shown threads overview, we know that thread 7 has -used @code{0.268} seconds of CPU time.. - -By selecting CPUs 0 and 15, respectively, we get the following -function overviews: - -@smallexample -@verbatim -# Get the function overview for CPU 0 -Exp Sel Total -=== === ===== - 1 0 10 -Functions sorted by metric: Exclusive Total CPU Time - -Excl. Total Name -CPU - sec. % -0.194 100.00 <Total> -0.194 100.00 mxv_core -0. 0. collector_root -0. 0. driver_mxv - -# Get the function overview for CPU 15 -Exp Sel Total -=== === ===== - 1 15 10 -Functions sorted by metric: Exclusive Total CPU Time - -Excl. Total Name -CPU - sec. % -0.074 100.00 <Total> -0.074 100.00 mxv_core -0. 0. collector_root -0. 0. driver_mxv -@end verbatim -@end smallexample - -This shows that thread 7 spent @code{0.194} seconds on CPU 0 and -@code{0.074} seconds on CPU 15. - -@c -- A new node -------------------------------------------------------------- -@node Viewing Multiple Experiments -@section Viewing Multiple Experiments -@c ---------------------------------------------------------------------------- - -One thing we did not cover sofar is that @ToolName{} fully supports the analysis -of multiple experiments. The @DisplayText{} tool accepts a list of experiments. -The data can either be aggregated across the experiments, or used in a -comparison. - -Mention @code{experiment_list} - -@c -- A new node -------------------------------------------------------------- -@node Aggregation of Experiments -@subsection Aggregation of Experiments -@c ---------------------------------------------------------------------------- - -By default, the data for multiple experiments is aggregrated and the display -commands shows these combined results. - -For example, we can aggregate the data for our single and dual thread -experiments. Below is the script we used for this: - -@cartouche -@smallexample -# Define the metrics -metrics e.%totalcpu -# Limit the output to 10 lines -limit 10 -# Get the list with experiments -experiment_list -# Get the function overview -functions -@end smallexample -@end cartouche - -@IndexSubentry{Commands, @code{experiment_list}} -With the exception of the @code{experiment_list} command, all commands -used have been discussed earlier. - -The @code{experiment_list} command provides a list of the experiments -that have been loaded. This is is used to verify we are looking at the -experiments we intend to aggregate. - -@cartouche -@smallexample -$ gprofng display text -script my-script-agg mxv.1.thr.er mxv.2.thr.er -@end smallexample -@end cartouche - -With the command above, we get the following output: - -@smallexample -@verbatim -# Define the metrics -Current metrics: e.%totalcpu:name -Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) -# Limit the output to 10 lines -Print limit set to 10 -# Get the list with experiments -ID Sel PID Experiment -== === ===== ============ - 1 yes 30591 mxv.1.thr.er - 2 yes 11629 mxv.2.thr.er -# Get the function overview -Functions sorted by metric: Exclusive Total CPU Time - -Excl. Total Name -CPU - sec. % -4.533 100.00 <Total> -4.306 94.99 mxv_core -0.105 2.31 init_data -0.053 1.17 erand48_r -0.027 0.59 __drand48_iterate -0.021 0.46 _int_malloc -0.021 0.46 drand48 -0.001 0.02 sysmalloc -0. 0. __libc_start_main -0. 0. allocate_data -@end verbatim -@end smallexample - -The first five lines should look familiar. The five lines following, echo -the comment line in the script and show the overview of the experiments. -This confirms two experiments have been loaded and that both are active. - -This is followed by the function overview. The timings have been summed -up and the percentages are adjusted accordingly. For example, the total -accumulated time is indeed 2.272 + 2.261 = 4.533 seconds. - -@c -- A new node -------------------------------------------------------------- -@node Comparison of Experiments -@subsection Comparison of Experiments -@c ---------------------------------------------------------------------------- - -The support for multiple experiments really shines in comparison mode. This -feature is enabled through the command -@IndexSubentry{Commands, @code{compare on/off}} -@code{compare on} -and is disabled -by setting -@code{compare off}. - -@cindex Compare experiments -In comparison mode, the data for the various experiments is shown side by -side, as illustrated below where we compare the results for the multithreaded -experiments using one and two threads respectively: - -@cartouche -@smallexample -$ gprofng display text -compare on -functions mxv.1.thr.er mxv.2.thr.er -@end smallexample -@end cartouche - -@noindent -This produces the following output: - -@smallexample -@verbatim -Functions sorted by metric: Exclusive Total CPU Time - -mxv.1.thr.er mxv.2.thr.er mxv.1.thr.er mxv.2.thr.er -Excl. Total Excl. Total Incl. Total Incl. Total Name -CPU CPU CPU CPU - sec. sec. sec. sec. -2.272 2.261 2.272 2.261 <Total> -2.159 2.148 2.159 2.148 mxv_core -0.054 0.051 0.102 0.104 init_data -0.025 0.028 0.035 0.045 erand48_r -0.013 0.008 0.048 0.053 drand48 -0.011 0.010 0.012 0.010 _int_malloc -0.010 0.017 0.010 0.017 __drand48_iterate -0.001 0. 0.001 0. sysmalloc -0. 0. 0.114 0.114 __libc_start_main -0. 0. 0.011 0.010 allocate_data -0. 0. 0.001 0. check_results -0. 0. 2.159 2.148 collector_root -0. 0. 2.159 2.148 driver_mxv -0. 0. 0.114 0.114 main -0. 0. 0.012 0.010 malloc -@end verbatim -@end smallexample - -This table is already helpful to more easily compare (two) profiles, but -there is more that we can do here. - -By default, in comparison mode, all measured values are shown. Often -profiling is about comparing performance data. It is therefore -more useful to look at differences, or ratios, using one experiment as -a reference. - -The values shown are relative to this difference. For example if a ratio -is below one, it means the reference value was higher. - -@IndexSubentry{Commands, @code{compare on/off}} -This feature is supported on the @code{compare} command. In addition to @code{on}, -or @code{off}, this command also supports -@IndexSubentry{Commands, @code{compare delta}} -@code{delta}, or -@IndexSubentry{Commands, @code{compare ratio}} -@code{ratio}. - -Usage of one of these two keywords enables the comparison feature and shows -either the difference, or the ratio, relative to the reference data. - -In the example below, we use the same two experiments used in the comparison -above, but as before, the number of lines is restricted to 10 and we focus on -the exclusive timings plus percentages. For the comparison part we are -interested in the differences. - -This is the script that produces such an overview: - -@cartouche -@smallexample -# Define the metrics -metrics e.%totalcpu -# Limit the output to 10 lines -limit 10 -# Set the comparison mode to differences -compare delta -# Get the function overview -functions -@end smallexample -@end cartouche - -Assuming this script file is called @code{my-script-comp}, this is how we -get the table displayed on our screen: - -@cartouche -@smallexample -$ gprofng display text -script my-script-comp mxv.1.thr.er mxv.2.thr.er -@end smallexample -@end cartouche - -Leaving out some of the lines printed, but we have seen before, we get -the following table: - -@smallexample -@verbatim -mxv.1.thr.er mxv.2.thr.er -Excl. Total Excl. Total Name -CPU CPU - sec. % delta % -2.272 100.00 -0.011 100.00 <Total> -2.159 95.00 -0.011 94.97 mxv_core -0.054 2.37 -0.003 2.25 init_data -0.025 1.10 +0.003 1.23 erand48_r -0.013 0.57 -0.005 0.35 drand48 -0.011 0.48 -0.001 0.44 _int_malloc -0.010 0.44 +0.007 0.75 __drand48_iterate -0.001 0.04 -0.001 0. sysmalloc -0. 0. +0. 0. __libc_start_main -0. 0. +0. 0. allocate_data -@end verbatim -@end smallexample - -It is now easy to see that the CPU times for the most time consuming -functions in this code are practically the same. - -While in this case we used the delta as a comparison, - -Note that the comparison feature is supported at the function, source, and -disassembly level. There is no practical limit on the number of experiments -that can be used in a comparison. - - - -@c -- A new node -------------------------------------------------------------- -@node Profile Hardware Event Counters -@section Profile Hardware Event Counters -@c ---------------------------------------------------------------------------- - -Many processors provide a set of hardware event counters and @ToolName{} -provides support for this feature. -@xref{Hardware Event Counters Explained} for those readers that are not -familiar with such counters and like to learn more. - -In this section we explain how to get the details on the event counter -support for the processor used in the experiment(s), and show several -examples. - -@c -- A new node -------------------------------------------------------------- -@node Getting Information on the Counters Supported -@subsection Getting Information on the Counters Supported -@c ---------------------------------------------------------------------------- - -The first step is to check if the processor used for the experiments is -supported by @ToolName{}. - -@IndexSubentry{Options, @code{-h}} -The @code{-h} option on @CollectApp{} will show the event counter -information: - -@cartouche -@smallexample -$ gprofng collect app -h -@end smallexample -@end cartouche - -In case the counters are supported, a list with the events is printed. -Otherwise, a warning message will be issued. - -For example, below we show this command and the output on an Intel Xeon -Platinum 8167M (aka ``Skylake'') processor. The output has been split -into several sections and each section is commented upon separately. - -@smallexample -@verbatim -Run "gprofng collect app --help" for a usage message. - -Specifying HW counters on `Intel Arch PerfMon v2 on Family 6 Model 85' -(cpuver=2499): - - -h {auto|lo|on|hi} - turn on default set of HW counters at the specified rate - -h <ctr_def> [-h <ctr_def>]... - -h <ctr_def>[,<ctr_def>]... - specify HW counter profiling for up to 4 HW counters -@end verbatim -@end smallexample +@ManPageStart{NOTES} +@c man begin NOTES -The first line shows how to get a usage overview. This is followed by -some information on the target processor. +The gprofng driver supports the following commands. +@vspace{1} -The next five lines explain in what ways the @code{-h} option can be -used to define the events to be monitored. +@c The man pages for the commands below can be viewed using the command name with "gprofng" replaced by "gp" and the spaces replaced by a dash ("-"). For example the man page +@c name for "gprofng collect app" is "gp-collect-app". -The first version shown above enables a default set of counters. This -default depends on the processor this command is executed on. The -keyword following the @code{-h} option defines the sampling rate: +@i{Collect performance data:} @table @code -@item auto -Match the sample rate of used by clock profiling. If the latter is disabled, -Use a per thread sampling rate of approximately 100 samples per second. -This setting is the default and preferred. - -@item on -Use a per thread sampling rate of approximately 100 samples per second. - -@item lo -Use a per thread sampling rate of approximately 10 samples per second. - -@item hi -Use a per thread sampling rate of approximately 1000 samples per second. - -@end table - -The second and third variant define the events to be monitored. Note -that the number of simultaneous events supported is printed. In this -case we can monitor four events in a single profiling job. - -It is a matter of preference whether you like to use the @code{-h} -option for each event, or use it once, followed by a comma separated -list. - -There is one slight catch though. The counter definition below has -mandatory comma (@code{,}) between the event and the rate. While a -default can be used for the rate, the comma cannot be omitted. -This may result in a somewhat awkward counter definition in case -the default sampling rate is used. - -For example, the following two commands are equivalent. Note -the double comma in the second command. This is not a typo. - -@cartouche -@smallexample -$ gprofng collect app -h cycles -h insts ... -$ gprofng collect app -h cycles,,insts ... -@end smallexample -@end cartouche - -In the first command this comma is not needed, because a -comma (``@code{,}'') immediately followed by white space may -be omitted. - -This is why we prefer the this syntax and in the remainder will -use the first version of this command. - -@IndexSubentry{Hardware event counters, counter definition} -The counter definition takes an event name, plus optionally one or -more attributes, followed by a comma, and optionally the sampling rate. -The output section below shows the formal definition. +@item gprofng collect app +Collect application performance data. -@cartouche -@smallexample - <ctr_def> == <ctr>[[~<attr>=<val>]...],[<rate>] -@end smallexample -@end cartouche +@end table -The printed help then explains this syntax. Below we have summarized -and expanded this output: +@i{Display the performance results:} @table @code -@item <ctr> -The counter name must be selected from the available counters listed -as part of the output printed with the @code{-h} option. -On most systems, if a counter is not listed, it may still be specified -by its numeric value. +@item gprofng display text +Display the performance data in ASCII format. -@item ~<attr>=<val> -This is an optional attribute that depends on the processor. The list -of supported attributes is printed in the output. Examples of -attributes are ``user'', or ``system''. The value can given in decimal -or hexadecimal format. -Multiple attributes may be specified, and each must be preceded -by a ~. +@item gprofng display html +Generate an HTML file from one or more experiments. -@item <rate> +@end table -The sampling rate is one of the following: +@i{Miscellaneous commands:} @table @code -@item auto -This is the default and matches the rate used by clock profiling. -If clock profiling is disabled, use @code{on}. - -@item on -Set the per thread maximum sampling rate to ~100 samples/second - -@item lo -Set the per thread maximum sampling rate to ~10 samples/second - -@item hi -Set the per thread maximum sampling rate to ~1000 samples/second - -@item <interval> -Define the sampling interval. -@xref{Control the Sampling Frequency} how to define this. +@item gprofng display src +Display source or disassembly with compiler annotations. -@end table +@item gprofng archive +Include binaries and source code in an experiment directory. @end table -After the section with the formal definition of events and counters, a -processor specific list is displayed. This part starts with an overview -of the default set of counters and the aliased names supported -@emph{on this specific processor}. - -@smallexample -@verbatim -Default set of HW counters: - - -h cycles,,insts,,llm - -Aliases for most useful HW counters: - - alias raw name type units regs description - - cycles unhalted-core-cycles CPU-cycles 0123 CPU Cycles - insts instruction-retired events 0123 Instructions Executed - llm llc-misses events 0123 Last-Level Cache Misses - br_msp branch-misses-retired events 0123 Branch Mispredict - br_ins branch-instruction-retired events 0123 Branch Instructions -@end verbatim -@end smallexample - -The definitions given above may or may not be available on other processors, -but we try to maximize the overlap across alias sets. - -The table above shows the default set of counters defined for this processor, -and the aliases. For each alias the full ``raw'' name is given, plus the -unit of the number returned by the counter (CPU cycles, or a raw count), -the hardware counter the event is allowed to be mapped onto, and a short -description. - -The last part of the output contains all the events that can be monitored: - -@smallexample -@verbatim -Raw HW counters: - - name type units regs description - - unhalted-core-cycles CPU-cycles 0123 - unhalted-reference-cycles events 0123 - instruction-retired events 0123 - llc-reference events 0123 - llc-misses events 0123 - branch-instruction-retired events 0123 - branch-misses-retired events 0123 - ld_blocks.store_forward events 0123 - ld_blocks.no_sr events 0123 - ld_blocks_partial.address_alias events 0123 - dtlb_load_misses.miss_causes_a_walk events 0123 - dtlb_load_misses.walk_completed_4k events 0123 - - <many lines deleted> - - l2_lines_out.silent events 0123 - l2_lines_out.non_silent events 0123 - l2_lines_out.useless_hwpf events 0123 - sq_misc.split_lock events 0123 - -See Chapter 19 of the "Intel 64 and IA-32 Architectures Software -Developer's Manual Volume 3B: System Programming Guide" -@end verbatim -@end smallexample - -As can be seen, these names are not always easy to correlate to a specific -event of interest. The processor manual should provide more clarity on this. - -@c -- A new node -------------------------------------------------------------- -@node Examples Using Hardware Event Counters -@subsection Examples Using Hardware Event Counters -@c ---------------------------------------------------------------------------- - -The previous section may give the impression that these counters are hard to -use, but as we will show now, in practice it is quite simple. - -With the information from the @code{-h} option, we can easily set up our first -event counter experiment. - -We start by using the default set of counters defined for our processor and we -use 2 threads: - -@cartouche -@smallexample -$ exe=mxv-pthreads.exe -$ m=3000 -$ n=2000 -$ exp=mxv.hwc.def.2.thr.er -$ gprofng collect app -O $exp -h auto ./$exe -m $m -n $n -t 2 -@end smallexample -@end cartouche - -@IndexSubentry{Options, @code{-h}} -@IndexSubentry{Hardware event counters, @code{auto} option} -The new option here is @code{-h auto}. The @code{auto} keyword enables -hardware event counter profiling and selects the default set of counters -defined for this processor. - -As before, we can display the information, but there is one practical hurdle -to take. Unless we like to view all metrics recorded, we would need to know -the names of the events that have been enabled. This is tedious and also not -portable in case we would like to repeat this experiment on another processor. - -@IndexSubentry{Hardware event counters, @code{hwc} metric} -This is where the special @code{hwc} metric comes very handy. It -automatically expands to the active set of events used. - -With this, it is very easy to display the event counter values. Note that -although the regular clock based profiling was enabled, we only want to see -the counter values. We also request to see the percentages and limit the -output to the first 5 lines: - -@cartouche -@smallexample -$ exp=mxv.hwc.def.2.thr.er -$ gprofng display text -metrics e.%hwc -limit 5 -functions $exp -@end smallexample -@end cartouche - -@smallexample -@verbatim -Current metrics: e.%cycles:e+%insts:e+%llm:name -Current Sort Metric: Exclusive CPU Cycles ( e.%cycles ) -Print limit set to 5 -Functions sorted by metric: Exclusive CPU Cycles - -Excl. CPU Excl. Instructions Excl. Last-Level Name -Cycles Executed Cache Misses - sec. % % % -2.691 100.00 7906475309 100.00 122658983 100.00 <Total> -2.598 96.54 7432724378 94.01 121745696 99.26 mxv_core -0.035 1.31 188860269 2.39 70084 0.06 erand48_r -0.026 0.95 73623396 0.93 763116 0.62 init_data -0.018 0.66 76824434 0.97 40040 0.03 drand48 -@end verbatim -@end smallexample - -As we have seen before, the first few lines echo the settings. -This includes a list with the hardware event counters used by -default. - -The table that follows makes it very easy to get an overview where the -time is spent and how many of the target events have occurred. - -As before, we can drill down deeper and see the same metrics at the source -line and instruction level. Other than using @code{hwc} in the metrics -definitions, nothing has changed compared to the previous examples: - -@cartouche -@smallexample -$ exp=mxv.hwc.def.2.thr.er -$ gprofng display text -metrics e.hwc -source mxv_core $exp -@end smallexample -@end cartouche - -This is the relevant part of the output. Since the lines get very long, -we have somewhat modified the lay-out: - -@smallexample -@verbatim - Excl. CPU Excl. Excl. - Cycles Instructions Last-Level - sec. Executed Cache Misses - <Function: mxv_core> - 0. 0 0 32. void __attribute__ ((noinline)) - mxv_core(...) - 0. 0 0 33. { - 0. 0 0 34. for (uint64_t i=...) { - 0. 0 0 35. double row_sum = 0.0; -## 1.872 7291879319 88150571 36. for (int64_t j=0; j<n; j++) - 0.725 140845059 33595125 37. row_sum += A[i][j]*b[j]; - 0. 0 0 38. c[i] = row_sum; - 39. } - 0. 0 0 40. } -@end verbatim -@end smallexample - -In a smiliar way we can display the event counter values at the instruction -level. Again we have modified the lay-out due to page width limitations: - -@cartouche -@smallexample -$ exp=mxv.hwc.def.2.thr.er -$ gprofng display text -metrics e.hwc -disasm mxv_core $exp -@end smallexample -@end cartouche - -@smallexample -@verbatim - Excl. CPU Excl. Excl. - Cycles Instructions Last-Level - sec. Executed Cache Misses - <Function: mxv_core> - 0. 0 0 [33] 4021ba: mov 0x8(%rsp),%r10 - 34. for (uint64_t i=...) { - 0. 0 0 [34] 4021bf: cmp %rsi,%rdi - 0. 0 0 [34] 4021c2: jbe 0x37 - 0. 0 0 [34] 4021c4: ret - 35. double row_sum = 0.0; - 36. for (int64_t j=0; j<n; j++) - 37. row_sum += A[i][j]*b[j]; - 0. 0 0 [37] 4021c5: mov (%r8,%rdi,8),%rdx - 0. 0 0 [36] 4021c9: mov $0x0,%eax - 0. 0 0 [35] 4021ce: pxor %xmm1,%xmm1 - 0.002 12804230 321394 [37] 4021d2: movsd (%rdx,%rax,8),%xmm0 - 0.141 60819025 3866677 [37] 4021d7: mulsd (%r9,%rax,8),%xmm0 - 0.582 67221804 29407054 [37] 4021dd: addsd %xmm0,%xmm1 -## 1.871 7279075109 87989870 [36] 4021e1: add $0x1,%rax - 0.002 12804210 80351 [36] 4021e5: cmp %rax,%rcx - 0. 0 0 [36] 4021e8: jne 0xffffffffffffffea - 38. c[i] = row_sum; - 0. 0 0 [38] 4021ea: movsd %xmm1,(%r10,%rdi,8) - 0. 0 0 [34] 4021f0: add $0x1,%rdi - 0. 0 0 [34] 4021f4: cmp %rdi,%rsi - 0. 0 0 [34] 4021f7: jb 0xd - 0. 0 0 [35] 4021f9: pxor %xmm1,%xmm1 - 0. 0 0 [36] 4021fd: test %rcx,%rcx - 0. 0 80350 [36] 402200: jne 0xffffffffffffffc5 - 0. 0 0 [36] 402202: jmp 0xffffffffffffffe8 - 39. } - 40. } - 0. 0 0 [40] 402204: ret -@end verbatim -@end smallexample - -So far we have used the default settings for the event counters. It is -quite straightforward to select specific counters. For sake of the -example, let's assume we would like to count how many branch instructions -and retired memory load instructions that missed in the L1 cache have been -executed. We also want to count these events with a high resolution. - -This is the command to do so: - -@cartouche -@smallexample -$ exe=mxv-pthreads.exe -$ m=3000 -$ n=2000 -$ exp=mxv.hwc.sel.2.thr.er -$ hwc1=br_ins,hi -$ hwc2=mem_load_retired.l1_miss,hi -$ gprofng collect app -O $exp -h $hwc1 -h $hwc2 $exe -m $m -n $n -t 2 -@end smallexample -@end cartouche - -As before, we get a table with the event counts. Due to the very -long name for the second counter, we have somewhat modified the -output. - -@cartouche -@smallexample -$ gprofng display text -limit 10 -functions mxv.hwc.sel.2.thr.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -Functions sorted by metric: Exclusive Total CPU Time -Excl. Incl. Excl. Branch Excl. Name -Total Total Instructions mem_load_retired.l1_miss -CPU sec. CPU sec. Events -2.597 2.597 1305305319 4021340 <Total> -2.481 2.481 1233233242 3982327 mxv_core -0.040 0.107 19019012 9003 init_data -0.028 0.052 23023048 15006 erand48_r -0.024 0.024 19019008 9004 __drand48_iterate -0.015 0.067 11011009 2998 drand48 -0.008 0.010 0 3002 _int_malloc -0.001 0.001 0 0 brk -0.001 0.002 0 0 sysmalloc -0. 0.001 0 0 __default_morecore -@end verbatim -@end smallexample - -@IndexSubentry{Commands, @code{compare ratio}} -When using event counters, the values could be very large and it is not easy -to compare the numbers. As we will show next, the @code{ratio} feature is -very useful when comparing such profiles. - -To demonstrate this, we have set up another event counter experiment where -we would like to compare the number of last level cache miss and the number -of branch instructions executed when using a single thread, or two threads. - -These are the commands used to generate the experiment directories: - -@cartouche -@smallexample -$ exe=./mxv-pthreads.exe -$ m=3000 -$ n=2000 -$ exp1=mxv.hwc.comp.1.thr.er -$ exp2=mxv.hwc.comp.2.thr.er -$ gprofng collect app -O $exp1 -h llm -h br_ins $exe -m $m -n $n -t 1 -$ gprofng collect app -O $exp2 -h llm -h br_ins $exe -m $m -n $n -t 2 -@end smallexample -@end cartouche - -The following script has been used to get the tables. Due to lay-out -restrictions, we have to create two tables, one for each counter. - -@cartouche -@smallexample -# Limit the output to 5 lines -limit 5 -# Define the metrics -metrics name:e.llm -# Set the comparison to ratio -compare ratio -functions -# Define the metrics -metrics name:e.br_ins -# Set the comparison to ratio -compare ratio -functions -@end smallexample -@end cartouche - -Note that we print the name of the function first, followed by the counter -data. -The new element is that we set the comparison mode to @code{ratio}. This -divides the data in a column by its counterpart in the reference experiment. - -This is the command using this script and the two experiment directories as -input: - -@cartouche -@smallexample -$ gprofng display text -script my-script-comp-counters \ - mxv.hwc.comp.1.thr.er \ - mxv.hwc.comp.2.thr.er -@end smallexample -@end cartouche - -By design, we get two tables, one for each counter: - -@smallexample -@verbatim -Functions sorted by metric: Exclusive Last-Level Cache Misses - - mxv.hwc.comp.1.thr.er mxv.hwc.comp.2.thr.er -Name Excl. Last-Level Excl. Last-Level - Cache Misses Cache Misses - ratio - <Total> 122709276 x 0.788 - mxv_core 121796001 x 0.787 - init_data 723064 x 1.055 - erand48_r 100111 x 0.500 - drand48 60065 x 1.167 - -Functions sorted by metric: Exclusive Branch Instructions - - mxv.hwc.comp.1.thr.er mxv.hwc.comp.2.thr.er -Name Excl. Branch Excl. Branch - Instructions Instructions - ratio - <Total> 1307307316 x 0.997 - mxv_core 1235235239 x 0.997 - erand48_r 23023033 x 0.957 - drand48 20020009 x 0.600 - __drand48_iterate 17017028 x 0.882 -@end verbatim -@end smallexample - -A ratio less than one in the second column, means that this counter -value was smaller than the value from the reference experiment shown -in the first column. - -This kind of presentation of the results makes it much easier to -quickly interpret the data. - -We conclude this section with thread-level event counter overviews, -but before we go into this, there is an important metric we need to -mention. - -@IndexSubentry{Hardware event counters, IPC} -In case it is known how many instructions and CPU cycles have been executed, -the value for the IPC (``Instructions Per Clockycle'') can be computed. -@xref{Hardware Event Counters Explained}. -This is a derived metric that gives an indication how well the processor -is utilized. The inverse of the IPC is called CPI. - -The @DisplayText{} command automatically computes the IPC and CPI values -if an experiment contains the event counter values for the instructions -and CPU cycles executed. These are part of the metric list and can be -displayed, just like any other metric. - -@IndexSubentry{Commands, @code{metric_list}} -This can be verified through the @code{metric_list} command. If we go -back to our earlier experiment with the default event counters, we get -the following result. - -@cartouche -@smallexample -$ gprofng display text -metric_list mxv.hwc.def.2.thr.er -@end smallexample -@end cartouche - -@smallexample -@verbatim -Current metrics: e.totalcpu:i.totalcpu:e.cycles:e+insts:e+llm:name -Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu ) -Available metrics: - Exclusive Total CPU Time: e.%totalcpu - Inclusive Total CPU Time: i.%totalcpu - Exclusive CPU Cycles: e.+%cycles - Inclusive CPU Cycles: i.+%cycles - Exclusive Instructions Executed: e+%insts - Inclusive Instructions Executed: i+%insts -Exclusive Last-Level Cache Misses: e+%llm -Inclusive Last-Level Cache Misses: i+%llm - Exclusive Instructions Per Cycle: e+IPC - Inclusive Instructions Per Cycle: i+IPC - Exclusive Cycles Per Instruction: e+CPI - Inclusive Cycles Per Instruction: i+CPI - Size: size - PC Address: address - Name: name -@end verbatim -@end smallexample - -Among the other metrics, we see the new metrics for the IPC and CPI -listed. - -In the script below, we use this information and add the IPC and CPI -to the metrics to be displayed. We also use a the thread filter to -display these values for the individual threads. - -This is the complete script we have used. Other than a different selection -of the metrics, there are no new features. - -@cartouche -@smallexample -# Define the metrics -metrics e.insts:e.%cycles:e.IPC:e.CPI -# Sort with respect to cycles -sort e.cycles -# Limit the output to 5 lines -limit 5 -# Get the function overview for all threads -functions -# Get the function overview for thread 1 -thread_select 1 -functions -# Get the function overview for thread 2 -thread_select 2 -functions -# Get the function overview for thread 3 -thread_select 3 -functions -@end smallexample -@end cartouche - -In the metrics definition on the second line, we explicitly request the -counter values for the instructions (@code{e.insts}) and CPU cycles -(@code{e.cycles}) executed. These names can be found in output from the -@code{metric_list} commad above. -In addition to these metrics, we also request the IPC and CPI to be shown. - -As before, we used the @code{limit} command to control the number of -functions displayed. We then request an overview for all the threads, -followed by three sets of two commands to select a thread and display the -function overview. - -The script above is used as follows: - -@cartouche -@smallexample -$ gprofng display text -script my-script-ipc mxv.hwc.def.2.thr.er -@end smallexample -@end cartouche - -This script produces four tables. We list them separately below, -and have left out the additional output. - -The first table shows the accumulated values across the three -threads that have been active. - -@smallexample -@verbatim -Functions sorted by metric: Exclusive CPU Cycles - -Excl. Excl. CPU Excl. Excl. Name -Instructions Cycles IPC CPI -Executed sec. % -7906475309 2.691 100.00 1.473 0.679 <Total> -7432724378 2.598 96.54 1.434 0.697 mxv_core - 188860269 0.035 1.31 2.682 0.373 erand48_r - 73623396 0.026 0.95 1.438 0.696 init_data - 76824434 0.018 0.66 2.182 0.458 drand48 -@end verbatim -@end smallexample - -This shows that IPC of this program is completely dominated -by function @code{mxv_core}. It has a fairly low IPC value -of 1.43. - -The next table is for thread 1 and shows the values for the -main thread. - -@smallexample -@verbatim -Exp Sel Total -=== === ===== - 1 1 3 -Functions sorted by metric: Exclusive CPU Cycles - -Excl. Excl. CPU Excl. Excl. Name -Instructions Cycles IPC CPI -Executed sec. % -473750931 0.093 100.00 2.552 0.392 <Total> -188860269 0.035 37.93 2.682 0.373 erand48_r - 73623396 0.026 27.59 1.438 0.696 init_data - 76824434 0.018 18.97 2.182 0.458 drand48 -134442832 0.013 13.79 5.250 0.190 __drand48_iterate -@end verbatim -@end smallexample - -Although this thread hardly uses any CPU cycles, the overall IPC -of 2.55 is not all that bad. - -Last, we show the tables for threads 2 and 3: - -@smallexample -@verbatim -Exp Sel Total -=== === ===== - 1 2 3 -Functions sorted by metric: Exclusive CPU Cycles - -Excl. Excl. CPU Excl. Excl. Name -Instructions Cycles IPC CPI -Executed sec. % -3716362189 1.298 100.00 1.435 0.697 <Total> -3716362189 1.298 100.00 1.435 0.697 mxv_core - 0 0. 0. 0. 0. collector_root - 0 0. 0. 0. 0. driver_mxv - -Exp Sel Total -=== === ===== - 1 3 3 -Functions sorted by metric: Exclusive CPU Cycles - -Excl. Excl. CPU Excl. Excl. Name -Instructions Cycles IPC CPI -Executed sec. % -3716362189 1.300 100.00 1.433 0.698 <Total> -3716362189 1.300 100.00 1.433 0.698 mxv_core - 0 0. 0. 0. 0. collector_root - 0 0. 0. 0. 0. driver_mxv -@end verbatim -@end smallexample - -It is seen that both execute the same number of instructions and -take about the same number of CPU cycles. As a result, the IPC is -the same for both threads. +It is also possible to invoke the lower level commands directly, but since +these are subject to change, in particular the options, we recommend to +use the driver. -@c -- A new node -------------------------------------------------------------- -@c TBD @node Additional Features -@c TBD @section Additional Features -@c ---------------------------------------------------------------------------- +@c man end +@ManPageEnd{} -@c -- A new node -------------------------------------------------------------- -@c TBD @node More Filtering Capabilities -@c TBD @subsection More Filtering Capabilities @c ---------------------------------------------------------------------------- - -@c TBD Cover @code{samples} and @code{seconds} - -@c -- A new node -------------------------------------------------------------- -@node Java Profiling -@section Java Profiling +@c SEEALSO section @c ---------------------------------------------------------------------------- -@IndexSubentry{Java profiling, @code{-j on/off}} -The @CollectApp{} command supports Java profiling. The @code{-j on} option -can be used for this, but since this feature is enabled by default, there is -no need to set this explicitly. Java profiling may be disabled through the -@code{-j off} option. +@ManPageStart{SEEALSO} +@c man begin SEEALSO -The program is compiled as usual and the experiment directory is created -similar to what we have seen before. The only difference with a C/C++ -application is that the program has to be explicitly executed by java. +gp-archive(1), gp-collect-app(1), gp-display-html(1), gp-display-src(1), +gp-display-text(1) -For example, this is how to generate the experiment data for a Java -program that has the source code stored in file @code{Pi.java}: +Each gprofng command also supports the @option{--help} option. This lists the +options and a short description for each option. -@cartouche -@smallexample -$ javac Pi.java -$ gprofng collect app -j on -O pi.demo.er java Pi < pi.in -@end smallexample -@end cartouche - -Regarding which java is selected to generate the data, @ToolName{} -first looks for the JDK in the path set in either the -@IndexSubentry{Java profiling, @code{JDK_HOME}} -@code{JDK_HOME} environment variable, or in the -@IndexSubentry{Java profiling, @code{JAVA_PATH}} -@code{JAVA_PATH} environment variable. If neither of these variables is -set, it checks for a JDK in the search path (set in the PATH -environment variable). If there is no JDK in this path, it checks for -the java executable in @code{/usr/java/bin/java}. - -In case additional options need to be passed on to the JVM, the -@IndexSubentry{Java profiling, @code{-J <string>}} -@code{-J <string>} option can be used. The string with the -option(s) has to be delimited by quotation marks in case -there is more than one argument. - -The @DisplayText{} command may be used to view the performance data. There is -no need for any special options and the same commands as previously discussed -are supported. - -@IndexSubentry{Commands, @code{viewmode}} -@IndexSubentry{Java profiling, different view modes} -The @code{viewmode} command -@xref{The Viewmode} -is very useful to examine the call stacks. - -For example, this is how one can see the native call stacks. For -lay-out purposes we have restricted the list to the first five entries: - -@cartouche -@smallexample -$ gprofng display text -limit 5 -viewmode machine -calltree pi.demo.er -@end smallexample -@end cartouche +For example this displays the options supported on the +@command{gprofng collect app} command: @smallexample -@verbatim -Print limit set to 5 -Viewmode set to machine -Functions Call Tree. Metric: Attributed Total CPU Time - -Attr. Name -Total -CPU sec. -1.381 +-<Total> -1.171 +-Pi.calculatePi(double) -0.110 +-collector_root -0.110 | +-JavaMain -0.070 | +-jni_CallStaticVoidMethod -@end verbatim +$ gprofng collect app --help @end smallexample -@noindent -Note that the selection of the viewmode is echoed in the output. - -@c -- A new node -------------------------------------------------------------- -@c TBD @node Summary of Options and Commands -@c TBD @chapter Summary of Options and Commands -@c ---------------------------------------------------------------------------- - -@c -- A new node -------------------------------------------------------------- -@node Terminology -@chapter Terminology - -Throughout this manual, certain terminology specific to profiling tools, -or @ToolName{}, or even to this document only, is used. In this chapter we -explain this terminology in detail. - -@menu -* The Program Counter:: What is a Program Counter? -* Inclusive and Exclusive Metrics:: An explanation of inclusive and exclusive metrics. -* Metric Definitions:: Definitions associated with metrics. -* The Viewmode:: Select the way call stacks are presented. -* The Selection List:: How to define a selection. -* Load Objects and Functions:: The components in an application. -* The Concept of a CPU in @ProductName{}:: The definition of a CPU. -* Hardware Event Counters Explained:: What are event counters? -* apath:: Our generic definition of a path. -@end menu - -@c ---------------------------------------------------------------------------- -@node The Program Counter -@section The Program Counter -@c ---------------------------------------------------------------------------- - -@cindex PC -@cindex Program Counter -The @emph{Program Counter}, or PC for short, keeps track where program execution is. -The address of the next instruction to be executed is stored in a special -purpose register in the processor, or core. - -@cindex Instruction pointer -The PC is sometimes also referred to as the @emph{instruction pointer}, but -we will use Program Counter or PC throughout this document. - -@c ---------------------------------------------------------------------------- -@node Inclusive and Exclusive Metrics -@section Inclusive and Exclusive Metrics -@c ---------------------------------------------------------------------------- - -In the remainder, these two concepts occur quite often and for lack of a better -place, they are explained here. - -@cindex Inclusive metric -The @emph{inclusive} value for a metric includes all values that are part of -the dynamic extent of the target function. For example if function @code{A} -calls functions @code{B} and @code{C}, the inclusive CPU time for @code{A} -includes the CPU time spent in @code{B} and @code{C}. - -@cindex Exclusive metric -In contrast with this, the @emph{exclusive} value for a metric is computed -by excluding the metric values used by other functions called. In our imaginary -example, the exclusive CPU time for function @code{A} is the time spent outside -calling functions @code{B} and @code{C}. - -@cindex Leaf function -In case of a @emph{leaf function}, the inclusive and exclusive values for the -metric are the same since by definition, it is not calling any other -function(s). - -Why do we use these two different values? The inclusive metric shows the most -expensive path, in terms of this metric, in the application. For example, if -the metric is cache misses, the function with the highest inclusive metric -tells you where most of the cache misses come from. - -Within this branch of the application, the exclusive metric points to the -functions that contribute and help to identify which part(s) to consider -for further analysis. - -@c ---------------------------------------------------------------------------- -@node Metric Definitions -@section Metric Definitions -@c ---------------------------------------------------------------------------- -The metrics to be shown are highly customizable. In this section we explain -the definitions associated with metrics. - -@IndexSubentry{Commands, @code{metrics}} -The @code{metrics} command takes a colon (:) separated list with special -keywords. This keyword consists of the following three fields: -@code{<flavor>}@code{<visibility>}@code{<metric_name>}. - -@cindex Flavor field -@cindex Visibility field -@cindex Metric name field -The @emph{<flavor>} field is either an @code{e} for ``exclusive'', or @code{i} -for ``inclusive''. The @code{<metric_name>} field is the name of the metric -request. The @emph{<visibility>} field consists of one ore more characters -from the following table: - -@table @code - -@item . -Show the metric as time. This applies to timing metrics and hardware event counters -that measure cycles. Interpret as @code{+} for other metrics. - -@item % -Show the metric as a percentage of the total value for this metric. - -@item + -Show the metric as an absolute value. For hardware event counters this is -the event count. Interpret as @code{.} for timing metrics. - -@item | -Do not show any metric value. Cannot be used with other visibility characters. - -@end table - -@c ---------------------------------------------------------------------------- -@node The Viewmode -@section The Viewmode - -@cindex Viewmode -@IndexSubentry{Commands, @code{viewmode}} - -There are different ways to view a call stack in Java. In @ToolName{}, this -is called the @emph{viewmode} and the setting is controlled through a command -with the same name. - -The @code{viewmode} command takes one of the following keywords: - -@table @code - -@item user -This is the default and shows the Java call stacks for Java threads. -No call stacks for any housekeeping threads are shown. The function -list contains a function -@IndexSubentry{Java profiling, @code{<JVM-System>}} -@code{<JVM-System>} that represents the aggregated time from non-Java -threads. -When the JVM software does not report a Java call stack, time is reported -against the function -@IndexSubentry{Java profiling, @code{<no Java callstack recorded>}} -@code{<no Java callstack recorded>}. - - -@item expert -Show the Java call stacks for Java threads when the Java code from the -user is executed and machine call stacks when JVM code is executed, or -when the JVM software does not report a Java call stack. -Show the machine call stacks for housekeeping threads. - -@item machine -Show the actual native call stacks for all threads. - -@end table - -@c ---------------------------------------------------------------------------- -@c ---------------------------------------------------------------------------- -@node The Selection List -@section The Selection List -@c ---------------------------------------------------------------------------- - -@cindex Selection list -@cindex List specification -Several commands allow the user to specify a subset of a list. For example, -to select specific threads from all the threads that have been used when -conducting the experiment(s). - -Such a selection list (or ``list'' in the remainder of this section) can be a -single number, a contiguous range of numbers with the start and end numbers -separated by a hyphen (@code{-}), a comma-separated list of numbers and -ranges, or the @code{all} keyword. Lists must not contain spaces. +The user guide for gprofng is maintained as a Texinfo manual. If the +@command{info} and @command{gprofng} programs are correctly installed, the +command @command{info gprofng} should give access to this document. -Each list can optionally be preceded by an experiment list with a similar -format, separated from the list by a colon (:). -If no experiment list is included, the list applies to all experiments. - -Multiple lists can be concatenated by separating the individual lists -by a plus sign. - -These are some examples of various filters using a list: - -@table @code - -@item thread_select 1 -Select thread 1 from all experiments. - -@item thread_select all:1 -Select thread 1 from all experiments. - -@item thread_select 1:1+2:2 -Select thread 1 from experiment 1 and thread 2 from experiment 2. - -@item cpu_select all:1,3,5 -Selects cores 1, 3, and 5 from all experiments. - -@item cpu_select 1,2:all -Select all cores from experiments 1 and 2, as listed by the @code{by exp_list} command. - -@end table - -@c ---------------------------------------------------------------------------- -@node Load Objects and Functions -@section Load Objects and Functions -@c ---------------------------------------------------------------------------- - -An application consists of various components. The source code files are -compiled into object files. These are then glued together at link time to form -the executable. -During execution, the program may also dynamically load objects. - -@cindex Load object -A @emph{load object} is defined to be an executable, or shared object. A shared -library is an example of a load object in @ToolName{}. - -Each load object, contains a text section with the instructions generated by the -compiler, a data section for data, and various symbol tables. -All load objects must contain an -@cindex ELF -ELF -symbol table, which gives the names and addresses of all the globally known -functions in that object. - -Load objects compiled with the -g option contain additional symbolic information -that can augment the ELF symbol table and provide information about functions that -are not global, additional information about object modules from which the functions -came, and line number information relating addresses to source lines. - -The term -@cindex Function -@emph{function} -is used to describe a set of instructions that represent a high-level operation -described in the source code. The term also covers methods as used in C++ and in -the Java programming language. - -In the @ToolName{} context, functions are provided in source code format. -Normally their names appear in the symbol table representing a set of addresses. -@cindex Program Counter -@cindex PC -If the Program Counter (PC) is within that set, the program is executing within that function. - -In principle, any address within the text segment of a load object can be mapped to a -function. Exactly the same mapping is used for the leaf PC and all the other PCs on the -call stack. - -Most of the functions correspond directly to the source model of the program, but -there are exceptions. This topic is however outside of the scope of this guide. +@c man end +@ManPageEnd{} @c ---------------------------------------------------------------------------- -@node The Concept of a CPU in @ProductName{} -@section The Concept of a CPU in @ProductName{} +@c COPYRIGHT section @c ---------------------------------------------------------------------------- -@cindex CPU -In @ProductName{}, there is the concept of a CPU. Admittedly, this is not the -best word to describe what is meant here and may be replaced in the future. - -The word CPU is used in many of the displays. -In the context of @ProductName{}, it is meant to denote a part of the -processor that is capable of executing instructions and with its own state, -like the program counter. +@ManPageStart{COPYRIGHT} +@c man begin COPYRIGHT -For example, on a contemporary processor, a CPU could be a core. In case -hardware threads are supported within a core, it could be one of those -hardware threads. +Copyright @copyright{} 2022-2023 Free Software Foundation, Inc. -@c ---------------------------------------------------------------------------- -@node Hardware Event Counters Explained -@section Hardware Event Counters Explained -@c ---------------------------------------------------------------------------- +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.3 +or any later version published by the Free Software Foundation; +with no Invariant Sections, with no Front-Cover Texts, and with no +Back-Cover Texts. A copy of the license is included in the +section entitled ``GNU Free Documentation License''. -@IndexSubentry{Hardware event counters, description} -For quite a number of years now, many microprocessors have supported hardware -event counters. - -On the hardware side, this means that in the processor there are one or more -registers dedicated to count certain activities, or ``events''. -Examples of such events are the number of instructions executed, or the number -of cache misses at level 2 in the memory hierarchy. - -While there is a limited set of such registers, the user can map events onto -them. In case more than one register is available, this allows for the -simultaenous measurement of various events. - -A simple, yet powerful, example is to simultaneously count the number of CPU -cycles and the number of instructions excuted. These two numbers can then be -used to compute the -@cindex IPC -@emph{IPC} value. IPC stands for ``Instructions Per Clockcycle'' and each processor -has a maximum. For example, if this maximum number is 2, it means the -processor is capable of executing two instructions every clock cycle. - -Whether this is actually achieved, depends on several factors, including the -instruction characteristics. -However, in case the IPC value is well below this maximum in a time critical -part of the application and this cannot be easily explained, further -investigation is probably warranted. - -@cindex CPI -A related metric is called @emph{CPI}, or ``Clockcycles Per Instruction''. -It is the inverse of the CPI and can be compared against the theoretical -value(s) of the target instruction(s). A significant difference may point -at a bottleneck. - -One thing to keep in mind is that the value returned by a counter can either -be the number of times the event occured, or a CPU cycle count. In case of -the latter it is possible to convert this number to time. - -@IndexSubentry{Hardware event counters, variable CPU frequency} -This is often easier to interpret than a simple count, but there is one -caveat to keep in mind. The CPU frequency may not have been constant while -the experimen was recorded and this impacts the time reported. - -These event counters, or ``counters'' for short, provide great insight into -what happens deep inside the processor. In case higher level information does -not provide the insight needed, the counters provide the information to get -to the bottom of a performance problem. - -There are some things to consider though. - -@itemize @bullet - -@item -The event definitions and names vary across processors and it may even happen -that some events change with an update. -Unfortunately and this is luckily rare, there are sometimes bugs causing the -wrong count to be returned. - -@IndexSubentry{Hardware event counters, alias name} -In @ToolName{}, some of the processor specific event names have an alias -name. For example @code{insts} measures the instructions executed. -These aliases not only makes it easier to identify the functionality, but also -provide portability of certain events across processors. - -@item -Another complexity is that there are typically many events one can monitor. -There may up to hundreds of events available and it could require several -experiments to zoom in on the root cause of a performance problem. - -@item -There may be restrictions regarding the mapping of event(s) onto the -counters. For example, certain events may be restricted to specific -counters only. As a result, one may have to conduct additional experiments -to cover all the events of interest. - -@item -The names of the events may also not be easy to interpret. In such cases, -the description can be found in the architecture manual for the processor. - -@end itemize - -Despite these drawbacks, hardware event counters are extremely useful and -may even turn out to be indispensable. +@c man end +@ManPageEnd{} @c ---------------------------------------------------------------------------- -@node apath -@section What is <apath>? -@c ---------------------------------------------------------------------------- - -In most cases, @ToolName{} shows the absolute pathnames of directories. These -tend to be rather long, causing display issues in this document. - -Instead of wrapping these long pathnames over multiple lines, we decided to -represent them by the @code{<apath>} symbol, which stands for ``an absolute -pathname''. - -Note that different occurrences of @code{<apath>} may represent different -absolute pathnames. - -@c -- A new node -------------------------------------------------------------- -@node Other Document Formats -@chapter Other Document Formats +@c If this text is used for a man page, exit. Otherwise we need to continue. @c ---------------------------------------------------------------------------- -This document is written in Texinfo and the source text is made available as -part of the binutils distribution. The file name is @code{gprofng.texi} and -can be found in subdirectory @code{doc} under directory @code{gprofng} in the -top level directory. - -This file can be used to generate the document in the @code{info}, @code{html}, -and @code{pdf} formats. -The default installation procedure creates a file in the @code{info} format and -stores it in the documentation section of binutils. - -The probably easiest way to generate a different format from this Texinfo -document is to go to the distribution directory that was created when the -tools were built. -This is either the default distribution directory, or the one that has been set -with the @code{--prefix} option as part of the @code{configure} command. -In this example we symbolize this location with @code{<dist>}. - -The make file called @code{Makefile} in directory @code{<dist>/gprofng/doc} -supports several commands to generate this document in different formats. -We recommend to use these commands. - -They create the file(s) and install it in the documentation directory of binutils, -which is @code{<dist>/share/doc} in case @code{html} or @code{pdf} is selected and -@code{<dist>/share/info} for the file in the @code{info} format. - -To generate this document in the requested format and install it in the documentation -directory, the commands below should be executed. In this notation, @code{<format>} -is one of @code{info}, @code{html}, or @code{pdf}: - -@smallexample -@verbatim -$ cd <dist>/gprofng/doc -$ make install-<format> -@end verbatim -@end smallexample - -@noindent -Some things to note: - -@itemize - -@item -For the @code{pdf} file to be generated, the -@cindex TeX -TeX document formatting software is required and the relevant commmands need -to be included in the search path. An example of a popular TeX implementation -is @emph{TexLive}. It is beyond the scope of this document to go into the -details of installing and using TeX, but it is well documented elsewhere. - -@item -Instead of generating a single file in the @code{html} format, it is also -possible to create a directory with individual files for the various chapters. -To do so, remove the use of @code{--no-split} in variable @code{MAKEINFOHTML} -in the make file in the @code{doc} directory. - -@item -The make file also supports commands to only generate the file in the desired -format and not move them to the documentation directory. This is -accomplished through the @code{make <format>} command. - -@end itemize - -@ifnothtml -@node Index -@unnumbered Index -@printindex cp -@end ifnothtml - +@ifset man @bye +@end ifset diff --git a/gprofng/doc/gprofng_ug.texi b/gprofng/doc/gprofng_ug.texi new file mode 100644 index 0000000..1fe95c7 --- /dev/null +++ b/gprofng/doc/gprofng_ug.texi @@ -0,0 +1,4396 @@ +\input texinfo @c -*-texinfo-*- + +@c ---------------------------------------------------------------------------- +@c This is the Texinfo source file for the GPROFNG manual. This manual +@c includes the man pages for the various tools. +@c +@c Author: Ruud van der Pas +@c ---------------------------------------------------------------------------- + +@c %**start of header + +@setfilename gprofng.info +@settitle GNU gprofng + +@c -- Set the indent for the @example command to 1 space, not 5 --------------- +@exampleindent 1 + +@paragraphindent 3 + +@c %**end of header + +@c -- Start a new chapter on a new, odd numbered, page ------------------------ +@setchapternewpage odd + +@c -- Merge all index entries into the Concepts Index ------------------------- +@syncodeindex fn cp +@syncodeindex ky cp +@syncodeindex pg cp +@syncodeindex vr cp + +@c -- Macros specific to gprofng ---------------------------------------------- +@include gp-macros.texi + +@c -- Get the version information --------------------------------------------- +@include version.texi + +@c -- Entry for the Info dir structure ---------------------------------------- +@ifnottex +@dircategory Software development +@direntry +* gprofng: (gprofng). The next generation profiling tool for Linux +@end direntry +@end ifnottex + +@c -- Copyright stuff --------------------------------------------------------- +@copying +This document is the manual for @ProductName{}, last updated @value{UPDATED}. + +Copyright @copyright{} 2022-2023 Free Software Foundation, Inc. + +@c -- @quotation +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, +Version 1.3 or any later version published by the Free Software +Foundation; with no Invariant Sections, with no Front-Cover texts, +and with no Back-Cover Texts. A copy of the license is included in the +section entitled ``GNU Free Documentation License.'' + +@c -- @end quotation +@end copying + +@finalout +@smallbook + +@c -- Define the title page --------------------------------------------------- +@titlepage +@title GNU gprofng +@subtitle The next generation profiling tool for Linux +@subtitle version @value{VERSION} (last updated @value{UPDATED}) +@author Ruud van der Pas +@page +@vskip 0pt plus 1filll +@insertcopying +@end titlepage + +@c -- Generate the Table of Contents ------------------------------------------ +@contents + +@c -- The Top node ------------------------------------------------------------ +@c Should contain a short summary, copying permissions and a master menu. +@c ---------------------------------------------------------------------------- +@ifnottex +@node Top +@top GNU Gprofng + +@insertcopying +@end ifnottex + +@ifinfo +@c -- The menu entries -------------------------------------------------------- + +@c * Display Source Code:: Display the source code and disassembly. +@c * Archive Experiment Data:: Archive an experiment. + +@menu +* Introduction:: About this manual. +* Overview:: A brief overview of @ProductName{}. +* A Mini Tutorial:: A short tutorial covering the key features. +* The gprofng Tools:: An overview of the tools supported. +* Performance Data Collection:: Record the performance information. +* View the Performance Information:: Different ways to view the data. +* Terminology:: Concepts and terminology explained. +* Other Document Formats:: Create this document in other formats. +* The gprofng Man Pages:: The gprofng man pages. +* Index:: The index. + +@detailmenu + +--- The Detailed Node Listing --- + +Introduction + +Overview + +* Main Features:: A high level overview. +* Sampling versus Tracing:: The pros and cons of sampling versus tracing. +* Steps Needed to Create a Profile:: How to create a profile. + +A Mini Tutorial + +* Getting Started:: The basics of profiling with @ProductName(). +* Support for Multithreading:: Commands specific to multithreaded applications. +* View Multiple Experiments:: Analyze multiple experiments simultaneously. +* Profile Hardware Event Counters:: How to use hardware event counters. +* Java Profiling:: How to profile a Java application. + +The gprofng Tools + +* Tools Overview:: A brief description of the tools. +* The gprofng.rc file with default settings:: Customize the settings. +* Filters:: Filters. +* Supported Environment Variables:: The supported environment variables. + +Terminology + +* The Program Counter:: What is a Program Counter? +* Inclusive and Exclusive Metrics:: An explanation of inclusive and exclusive metrics. +* Metric Definitions:: Definitions associated with metrics. +* The Viewmode:: Select the way call stacks are presented. +* The Selection List:: How to define a selection. +* Load Objects and Functions:: The components in an application. +* The Concept of a CPU in @ProductName{}:: The definition of a CPU. +* Hardware Event Counters Explained:: What are event counters? +* apath:: Our generic definition of a path. + +The gprofng Man Pages + +* gprofng collect app:: The man page for gprofng collect app. +* gprofng display text:: The man page for gprofng display text. +* gprofng display src:: The man page for gprofng display src. +* gprofng display html:: The man page for gprofng display html. +* gprofng archive:: The man page for gprofng archive. + +@c -- Index + +@end detailmenu +@end menu +@end ifinfo + +@c -- A new node -------------------------------------------------------------- +@node Introduction +@chapter Introduction +@c ---------------------------------------------------------------------------- +The @ProductName{} tool is the next generation profiler for Linux. It consists +of various commands to generate and display profile information. + +This manual starts with a tutorial how to create and interpret a profile. This +part is highly practical and has the goal to get users up to speed as quickly +as possible. As soon as possible, we would like to show you how to get your +first profile on your screen. + +This is followed by more examples, covering many of the features. At the +end of this tutorial, you should feel confident enough to tackle the more +complex tasks. + +In a future update a more formal reference manual will be included as well. +Since even in this tutorial we use certain terminology, we have included a +chapter with descriptions at the end. In case you encounter unfamiliar +wordings or terminology, please check this chapter. + +One word of caution. In several cases we had to somewhat tweak the screen +output in order to make it fit. This is why the output may look somewhat +different when you try things yourself. + +For now, we wish you a smooth profiling experience with @ProductName{} and +good luck tackling performance bottlenecks. + +@c -- A new node -------------------------------------------------------------- +@c cccccc @node A Brief Overview of @ProductName{} +@node Overview +@chapter A Brief Overview of @ProductName{} +@c ---------------------------------------------------------------------------- + +@menu +* Main Features:: A high level overview. +* Sampling versus Tracing:: The pros and cons of sampling versus tracing. +* Steps Needed to Create a Profile:: How to create a profile. +@end menu + +Before we cover this tool in quite some detail, we start with a brief overview +of what it is, and the main features. Since we know that many of you would +like to get started rightaway, already in this first chapter we explain the +basics of profiling with @ToolName{}. + +@c ---------------------------------------------------------------------------- +@c TBD Review this text. Probably be more specific on the gcc releases and +@c processor specifics. +@c ---------------------------------------------------------------------------- + +@c -- A new node -------------------------------------------------------------- +@node Main Features +@section Main Features +@c ---------------------------------------------------------------------------- + +@noindent +These are the main features of the @ProductName{} tool: + +@itemize @bullet + +@item +Profiling is supported for an application written in C, C++, Java, or Scala. + +@c TBD Java: up to 1.8 full support, support other than for modules + +@item +Shared libraries are supported. The information is presented at the instruction +level. + +@item +The following multithreading programming models are supported: Pthreads, +OpenMP, and Java threads. + +@item +This tool works with unmodified production level executables. There is no need to +recompile the code, but if the @samp{-g} option has been used when building +the application, source line level information is available. + +@item +The focus is on support for code generated with the @command{gcc} compiler, but +there is some limited support for the @command{icc} compiler as well. Future +improvements and enhancements will focus on @command{gcc} though. + +@item +Processors from Intel, AMD, and Arm are supported, but the level of support +depends on the architectural details. In particular, hardware event counters +may not be supported. If this is the case, all views not related to these +counters still ought to work though. + +@item +Several views into the data are supported. For example, a function overview +where the time is spent, but also a source line, disassembly, call tree and +a caller-callees overview are available. + +@item +Through filters, the user can zoom in on an area of interest. + +@item +Two or more profiles can be aggregated, or used in a comparison. This comparison +can be obtained at the function, source line, and disassembly level. + +@item +Through a simple scripting language, and customization of the metrics shown, +the generation and creation of a profile can be fully automated and provide +tailored output. + +@end itemize + +@c -- A new node -------------------------------------------------------------- +@node Sampling versus Tracing +@section Sampling versus Tracing +@c ---------------------------------------------------------------------------- + +A key difference with some other profiling tools is that the main data +collection command @CollectApp{} mostly uses +@cindex Program Counter sampling +@cindex PC sampling +Program Counter (PC) sampling +under the hood. + +With @emph{sampling}, the executable is interrupted at regular intervals. Each +time it is halted, key information is gathered and stored. This includes the +Program Counter that keeps track of where the execution is. Hence the name. + +Together with operational data, this information is stored in the experiment +directory and can be viewed in the second phase. + +For example, the PC information is used to derive where the program was when +it was halted. Since the sampling interval is known, it is relatively easy to +derive how much time was spent in the various parts of the program. + +The opposite technique is generally referred to as @emph{tracing}. With +tracing, the target is instrumented with specific calls that collect the +requested information. + +These are some of the pros and cons of PC sampling verus tracing: + +@itemize + +@item +Since there is no need to recompile, existing executables can be used +and the profile measures the behaviour of exactly the same executable that is +used in production runs. + +With sampling, one inherently profiles a different executable, because +the calls to the instrumentation library may affect the compiler optimizations +and run time behaviour. + +@item +With sampling, there are very few restrictions on what can be profiled and even without +access to the source code, a basic profile can be made. + +@item +A downside of sampling is that, depending on the sampling frequency, small +functions may be missed or not captured accurately. Although this is rare, +this may happen and is the reason why the user has control over the sampling rate. + +@item +While tracing produces precise information, sampling is statistical in nature. +As a result, small variations may occur across seemingly identical runs. We +have not observed more than a few percent deviation though. Especially if +the target job executed for a sufficiently long time. + +@item +With sampling, it is not possible to get an accurate count how often +functions are called. + +@end itemize + +@c -- A new node -------------------------------------------------------------- +@node Steps Needed to Create a Profile +@section Steps Needed to Create a Profile +@c ---------------------------------------------------------------------------- + +Creating a profile takes two steps. First the profile data needs to be +generated. This is followed by a viewing step to create a report from the +information that has been gathered. + +Every @ProductName{} command starts with @ToolName{}, the name of the driver. +This is followed by a keyword to define the high level functionality. Depending +on this keyword, a third qualifier may be needed to further narrow down the request. +This combination is then followed by options that are specific to the functionality +desired. + +The command to gather, or ``collect'', the performance data is called +@CollectApp{}. Aside from numerous options, this command takes the name +of the target executable as an input parameter. + +Upon completion of the run, the performance data can be +found in the newly created +@cindex Experiment directory +experiment directory. + +Unless explicitly specified otherwise, a default +name for this directory is chosen. The name is @file{test.<n>.er} where +@var{<n>} is the first integer number not in use yet for such a name. + +For example, the first time @CollectApp{} is invoked, an experiment +directory with the name @file{test.1.er} is created. +Upon a subsequent invocation of @CollectApp{} in the same directory, +an experiment directory with the name @file{test.2.er} will be created, +and so forth. + +Note that @CollectApp{} supports an option to explicitly name the experiment +directory. +Aside from the restriction that the name of this directory has to end +with @samp{.er}, any valid directory name can be used for this. + +Now that we have the performance data, the next step is to display it. + +@IndexSubentry{@code{gprofng}, @code{display text}} +The most commonly used command to view the performance information is +@DisplayText{}. This is a very extensive and customizable tool that +produces the information in ASCII format. + +@IndexSubentry{@code{gprofng}, @code{display html}} +Another option is to use @DisplayHTML{}. This tool generates a directory with +files in html format. These can be viewed in a browser, allowing for easy +navigation through the profile data. + +@c -- A new node -------------------------------------------------------------- +@node A Mini Tutorial +@chapter A Mini Tutorial +@c ---------------------------------------------------------------------------- + +In this chapter we present and discuss the main functionality of @ToolName{}. +This will be a practical approach, using an example code to generate profile +data and show how to get various performance reports. + +@menu +* Getting Started:: The basics of profiling with @ProductName(). +* Support for Multithreading:: Commands specific to multithreaded applications. +* View Multiple Experiments:: Analyze multiple experiments simultaneously. +* Profile Hardware Event Counters:: How to use hardware event counters. +* Java Profiling:: How to profile a Java application. +@end menu + +@c -- A new node -------------------------------------------------------------- +@node Getting Started +@section Getting Started +@c ---------------------------------------------------------------------------- + +The information presented here provides a good and common basis for many +profiling tasks, but there are more features that you may want to leverage. + +These are covered in subsequent sections in this chapter. + +@menu +* The Example Program:: A description of the example program used. +* A First Profile:: How to get the first profile. +* The Source Code View:: Display the metrics in the source code. +* The Disassembly View:: Display the metrics at the instruction level. +* Display and Define the Metrics:: An example how to customize the metrics. +* Customization of the Output:: An example how to customize the output. +* Name the Experiment Directory:: Change the name of the experiment directory. +* Control the Number of Lines in the Output:: Change the number of lines in the tables. +* Sorting the Performance Data:: How to set the metric to sort by. +* Scripting:: Use a script to execute the commands. +* A More Elaborate Example:: An example of customization. +* The Call Tree:: Display the dynamic call tree. +* More Information on the Experiment:: How to get additional statistics. +* Control the Sampling Frequency:: How to control the sampling granularity. +* Information on Load Objects:: How to get more information on load objects. +@end menu + +@c -- A new node -------------------------------------------------------------- +@node The Example Program +@subsection The Example Program +@c ---------------------------------------------------------------------------- + +Throughout this guide we use the same example C code that implements the +multiplication of a vector of length @math{n} by an @math{m} by @math{n} +matrix. The result is stored in a vector of length @math{m}. +@cindex Pthreads +@cindex Posix Threads +The algorithm has been parallelized using Posix Threads, or Pthreads for short. + +The code was built using the @code{gcc} compiler and the name of the executable +is +@cindex mxv-pthreads +@command{mxv-pthreads}. + +The matrix sizes can be set through the @code{-m} and @code{-n} options. The +number of threads is set with the @code{-t} option. These are additional threads +that are used in the multiplication. To increase the duration of the run, the +computations are executed repeatedly. + +This is an example that multiplies a @math{8000} by @math{4000} matrix with +a vector of length @math{4000}. Although this is a multithreaded application, +initially we will be using @math{1} threads. Later on we will show examples +using multiple threads. + +@smallexample +@verbatim +$ ./mxv-pthreads -m 8000 -n 4000 -t 1 +mxv: error check passed - rows = 8000 columns = 4000 threads = 1 +$ +@end verbatim +@end smallexample + +The program performs an internal check to verify that the computed results +are correct. The result of this check is printed, as well as the matrix +sizes and the number of threads used. + +@c -- A new node -------------------------------------------------------------- +@node A First Profile +@subsection A First Profile +@c ---------------------------------------------------------------------------- + +The first step is to collect the performance data. It is important to remember +that much more information is gathered than may be shown by default. Often a +single data collection run is sufficient to get a lot of insight. + +The @CollectApp{} command is used for the data collection. Nothing needs to be +changed in the way the application is executed. The only difference is that it +is now run under control of the tool, as shown below: + +@cartouche +@smallexample +$ gprofng collect app ./mxv-pthreads -m 8000 -n 4000 -t 1 +@end smallexample +@end cartouche + +@noindent +This produces the following output: + +@smallexample +@verbatim +Creating experiment directory test.1.er (Process ID: 2749878) ... +mxv: error check passed - rows = 8000 columns = 4000 threads = 1 +@end verbatim +@end smallexample + +We see a message that an experiment directory with the name @file{test.1.er} +has been created. The process id is also echoed. The application completes +as usual and we have our first experiment directory that can be analyzed. + +The tool we use for this is called @DisplayText{}. It takes the name of +the experiment directory as an argument. + +@cindex Interpreter mode +If invoked this way, the tool starts in the interactive @emph{interpreter} mode. +While in this environment, commands can be given and the tool responds. This is +illustrated below: + +@smallexample +@verbatim +$ gprofng display text test.1.er +Warning: History and command editing is not supported on this system. +(gp-display-text) quit +$ +@end verbatim +@end smallexample + +@cindex Command line mode +While useful in certain cases, we prefer to use this tool in command line mode +by specifying the commands to be issued when invoking the tool. The way to do +this is to prepend the command(s) with a hyphen (@samp{-}) if used on the +command line. + +Since this makes the command appear to be an option, they are also sometimes +referred to as such, but technically they are commands. This is the +terminology we will use in this user guide, but for convenience the commands +are also listed as options in the index. + +For example, +@IndexSubentry{Options, @code{-functions}} +@IndexSubentry{Commands, @code{functions}} +below we use the @command{functions} command to request a list of the functions +that have been executed, plus their respective CPU times: + +@cartouche +@smallexample +$ gprofng display text -functions test.1.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +$ gprofng display text -functions test.1.er + +Functions sorted by metric: Exclusive Total CPU Time + +Excl. Total Incl. Total Name +CPU CPU + sec. % sec. % +9.367 100.00 9.367 100.00 <Total> +8.926 95.30 8.926 95.30 mxv_core +0.210 2.24 0.420 4.49 init_data +0.080 0.85 0.210 2.24 drand48 +0.070 0.75 0.130 1.39 erand48_r +0.060 0.64 0.060 0.64 __drand48_iterate +0.010 0.11 0.020 0.21 _int_malloc +0.010 0.11 0.010 0.11 sysmalloc +0. 0. 8.926 95.30 <static>@0x47960 (<libgp-collector.so>) +0. 0. 0.440 4.70 __libc_start_main +0. 0. 0.020 0.21 allocate_data +0. 0. 8.926 95.30 driver_mxv +0. 0. 0.440 4.70 main +0. 0. 0.020 0.21 malloc +0. 0. 8.926 95.30 start_thread +@end verbatim +@end smallexample + +As easy and simple as these steps are, we do have a first profile of our program! + +There are five columns. The first four contain the +@cindex Total CPU time +''Total CPU Time'', which +is the sum of the user and system time. @xref{Inclusive and Exclusive Metrics} +for an explanation of ``exclusive'' and ``inclusive'' times. + +The first line echoes the metric that is used to sort the output. By default, +this is the exclusive CPU time, but through the @command{sort} command, the sort +metric can be changed by the user. + +Next, there are four columns with the exclusive and inclusive CPU times and the +respective percentages. This is followed by the name of the function. + +@IndexSubentry{Miscellaneous, @code{<Total>}} +The function with the name @code{<Total>} is not a user function. It is a +pseudo function introduced by @ToolName{}. It is used to display the +accumulated measured metric values. In this example, we see that the total +CPU time of this job was 9.367 seconds and it is scaled to 100%. All +other percentages in the same column are relative to this number. + +@c -- If the metric is derived, for example the @code{IPC}, the value shown under +@c -- @code{<Total>} is based upon the total values of the that are metrics used to +@c -- compute the derived metric. +@c -- @IndexSubentry{Hardware event counters, IPC} + +With 8.926 seconds, function @code{mxv_core} takes 95.30% of the +total time and is by far the most time consuming function. +The exclusive and inclusive metrics are identical, which means that is a +leaf function not calling any other functions. + +The next function in the list is @code{init_data}. Although with 4.49%, +the CPU time spent in this part is modest, this is an interesting entry because +the inclusive CPU time of 0.420 seconds is twice the exclusive CPU time +of 0.210 seconds. Clearly this function is calling another function, +or even more than one function and collectively this takes 0.210 seconds. +Below we show the call tree feature that provides more details on the call +structure of the application. + +The function @code{<static>@@0x47960 (<libgp-collector.so>)} does odd and +certainly not familiar. It is one of the internal functions used by +@CollectApp{} and can be ignored. Also, while the inclusive time is high, +the exclusive time is zero. This means it doesn't contribute to the +performance. + +The question is how we know where this function originates from? There are +several commands to dig deeper an get more details on a function. +@xref{Information on Load Objects}. + +@c -- A new node -------------------------------------------------------------- +@node The Source Code View +@subsection The Source Code View +@c ---------------------------------------------------------------------------- + +In general, the tuning efforts are best focused on the most time consuming +part(s) of an application. In this case that is easy, since over 95% of +the total CPU time is spent in function @code{mxv_core}. +It is now time to dig deeper and look +@cindex Source level metrics +at the metrics distribution at the source code level. Since we measured +CPU times, these are the metrics shown. + +@IndexSubentry{Options, @code{-source}} +@IndexSubentry{Commands, @code{source}} +The @code{source} command is used to accomplish this. It takes the name of the +function, not the source filename, as an argument. This is demonstrated +below, where the @DisplayText{} command is used to show the annotated +source listing of function @code{mxv_core}. + +Be aware that when using the @command{gcc} compiler, the source code has to +be compiled with the @code{-g} option in order for the source code feature +to work. Otherwise the location(s) can not be determined. For other compilers +we recommend to check the documentation for such an option. + +Below the command to display the source code of a function is shown. Since at +this point we are primarily interested in the timings only, we use the +@code{metrics} command to request the exclusive and inclusive total CPU +timings only. @xref{Display and Define the Metrics} for more information +how to define the metrics to be displayed. + +@cartouche +@smallexample +$ gprofng display text -metrics ei.totalcpu -source mxv_core test.1.er +@end smallexample +@end cartouche + +The output is shown below. It has been somewhat modified to fit the formatting +constraints and reduce the number of lines. + +@smallexample +@verbatim +Current metrics: e.totalcpu:i.totalcpu:name +Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu ) +Source file: <apath>/mxv.c +Object file: mxv-pthreads (found as test.1.er/archives/...) +Load Object: mxv-pthreads (found as test.1.er/archives/...) + + Excl. Incl. + Total Total + CPU sec. CPU sec. + + <lines deleted> + <Function: mxv_core> + 43. void __attribute__ ((noinline)) + mxv_core (int64_t row_index_start, + 44. int64_t row_index_end, + 45. int64_t m, + 46. int64_t n, + 47. double **restrict A, + 48. double *restrict b, + 49. double *restrict c) + 50. { + 0. 0. 50. { + 0. 0. 51. for (int64_t i=row_index_start; + i<=row_index_end; i++) + 52. { + 0. 0. 53. double row_sum = 0.0; +## 4.613 4.613 54. for (int64_t j=0; j<n; j++) +## 4.313 4.313 55. row_sum += A[i][j] * b[j]; + 0. 0. 56. c[i] = row_sum; + 57. } + 0. 0. 58. } +@end verbatim +@end smallexample + +The first line echoes the metrics that have been selected. The second line +is not very meaningful when looking at the source code listing, but it shows +the metric that is used to sort the data. + +The next three lines provide information on the location of the source file, +the object file and the load object (@xref{Load Objects and Functions}). + +Function @code{mxv_core} is part of a source file that has other functions +as well. These functions will be shown with the values for the metrics, but +for lay-out purposes they have been removed in the output shown above. + +The header is followed by the annotated source code listing. The selected +metrics are shown first, followed by a source line number, and the source code. +@IndexSubentry{Miscellaneous ,@code{##}} +The most time consuming line(s) are marked with the @code{##} symbol. In +this way they are easier to identify and find with a search. + +What we see is that all of the time is spent in lines 54-55. + +@IndexSubentry{Options, @code{-lines}} +@IndexSubentry{Commands, @code{lines}} +A related command sometimes comes handy as well. It is called @code{lines} +and displays a list of the source lines and their metrics, ordered according +to the current sort metric (@xref{Sorting the Performance Data}). + +Below the command and the output. For lay-out reasons, only the top 10 is +shown here and the last part of the text on some lines has been replaced +by dots. The full text is @samp{instructions without line numbers} and +means that the line number information for that function was not found. + +@cartouche +@smallexample +$ gprofng display text -lines test.1.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +Lines sorted by metric: Exclusive Total CPU Time + +Excl. Total Incl. Total Name +CPU CPU + sec. % sec. % +9.367 100.00 9.367 100.00 <Total> +4.613 49.25 4.613 49.25 mxv_core, line 54 in "mxv.c" +4.313 46.05 4.313 46.05 mxv_core, line 55 in "mxv.c" +0.160 1.71 0.370 3.95 init_data, line 118 in "manage_data.c" +0.080 0.85 0.210 2.24 <Function: drand48, instructions ...> +0.070 0.75 0.130 1.39 <Function: erand48_r, instructions ...> +0.060 0.64 0.060 0.64 <Function: __drand48_iterate, ...> +0.040 0.43 0.040 0.43 init_data, line 124 in "manage_data.c" +0.010 0.11 0.020 0.21 <Function: _int_malloc, instructions ...> +0.010 0.11 0.010 0.11 <Function: sysmalloc, instructions ...> +@end verbatim +@end smallexample + +What this overview immediately highlights is that the third most time consuming +source line takes 0.370 seconds only. This means that the inclusive time is +only 3.95% and clearly this branch of the code hardly impacts the performance. + +@c -- A new node -------------------------------------------------------------- +@node The Disassembly View +@subsection The Disassembly View +@c ---------------------------------------------------------------------------- + +The source view is very useful to obtain more insight where the time is spent, +but sometimes this is not sufficient. The disassembly view provides more +details since it shows the metrics at the instruction level. + +This view is displayed with the +@IndexSubentry{Options, @code{-disasm}} +@IndexSubentry{Commands, @code{disasm}} +@command{disasm} +command and as with the source view, it displays an annotated listing. In this +@cindex Instruction level metrics +case it shows the instructions with the metrics, interleaved with the +source lines. The +instructions have a reference in square brackets (@code{[} and @code{]}) +to the source line they correspond to. + +@noindent +We again focus on the tmings only and set the metrics accordingly: + +@cartouche +@smallexample +$ gprofng display text -metrics ei.totalcpu -disasm mxv_core test.1.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +Current metrics: e.totalcpu:i.totalcpu:name +Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu ) +Source file: <apath>/src/mxv.c +Object file: mxv-pthreads (found as test.1.er/archives/...) +Load Object: mxv-pthreads (found as test.1.er/archives/...) + + Excl. Incl. + Total Total + CPU sec. CPU sec. + + <lines deleted> + 43. void __attribute__ ((noinline)) + mxv_core (int64_t row_index_start, + 44. int64_t row_index_end, + 45. int64_t m, + 46. int64_t n, + 47. double **restrict A, + 48. double *restrict b, + 49. double *restrict c) + 50. { + <Function: mxv_core> + 0. 0. [50] 401d56: mov 0x8(%rsp),%r10 + 51. for (int64_t i=row_index_start; + i<=row_index_end; i++) + 0. 0. [51] 401d5b: cmp %rsi,%rdi + 0. 0. [51] 401d5e: jg 0x47 + 0. 0. [51] 401d60: add $0x1,%rsi + 0. 0. [51] 401d64: jmp 0x36 + 52. { + 53. double row_sum = 0.0; + 54. for (int64_t j=0; j<n; j++) + 55 row_sum += A[i][j] * b[j]; + 0. 0. [55] 401d66: mov (%r8,%rdi,8),%rdx + 0. 0. [54] 401d6a: mov $0x0,%eax + 0. 0. [53] 401d6f: pxor %xmm1,%xmm1 + 0.110 0.110 [55] 401d73: movsd (%rdx,%rax,8),%xmm0 + 1.921 1.921 [55] 401d78: mulsd (%r9,%rax,8),%xmm0 + 2.282 2.282 [55] 401d7e: addsd %xmm0,%xmm1 +## 4.613 4.613 [54] 401d82: add $0x1,%rax + 0. 0. [54] 401d86: cmp %rax,%rcx + 0. 0. [54] 401d89: jne 0xffffffffffffffea + 56. c[i] = row_sum; + 0. 0. [56] 401d8b: movsd %xmm1,(%r10,%rdi,8) + 0. 0. [51] 401d91: add $0x1,%rdi + 0. 0. [51] 401d95: cmp %rsi,%rdi + 0. 0. [51] 401d98: je 0xd + 0. 0. [53] 401d9a: pxor %xmm1,%xmm1 + 0. 0. [54] 401d9e: test %rcx,%rcx + 0. 0. [54] 401da1: jg 0xffffffffffffffc5 + 0. 0. [54] 401da3: jmp 0xffffffffffffffe8 + 57. } + 58. } + 0. 0. [58] 401da5: ret +@end verbatim +@end smallexample + +For each instruction, the timing values are given and we can immediately +identify the most expensive instructions. As with the source level view, +these are marked with the @code{##} symbol. + +It comes as no surprise that the time consuming instructions originate from +the source code at lines 54-55. +One thing to note is that the source line numbers no longer appear in +sequential order. +This is because the compiler has re-ordered the instructions as part of +the code optimizations it has performed. + +As illustrated below and similar to the @command{lines} command, we can get +an overview of the instructions executed by using the +@IndexSubentry{Options, @code{-pcs}} +@IndexSubentry{Commands, @code{pcs}} +@command{pcs} +command. + +@noindent +Below the command and the output, which again has been restricted +to 10 lines. As before, some lines have been shortened for lay-out +purposes. + +@cartouche +@smallexample +$ gprofng display text -pcs test.1.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +PCs sorted by metric: Exclusive Total CPU Time + +Excl. Total Incl. Total Name +CPU CPU + sec. % sec. % +9.367 100.00 9.367 100.00 <Total> +4.613 49.25 4.613 49.25 mxv_core + 0x0000002C, line 54 in "mxv.c" +2.282 24.36 2.282 24.36 mxv_core + 0x00000028, line 55 in "mxv.c" +1.921 20.51 1.921 20.51 mxv_core + 0x00000022, line 55 in "mxv.c" +0.150 1.60 0.150 1.60 init_data + 0x000000AC, line 118 in ... +0.110 1.18 0.110 1.18 mxv_core + 0x0000001D, line 55 in "mxv.c" +0.040 0.43 0.040 0.43 drand48 + 0x00000022 +0.040 0.43 0.040 0.43 init_data + 0x000000F1, line 124 in ... +0.030 0.32 0.030 0.32 __drand48_iterate + 0x0000001E +0.020 0.21 0.020 0.21 __drand48_iterate + 0x00000038 +@end verbatim +@end smallexample + +@noindent +What we see is that the top three instructions take 94% of the total CPU time +and any optimizations should focus on this part of the code.. + +@c -- A new node -------------------------------------------------------------- +@node Display and Define the Metrics +@subsection Display and Define the Metrics +@c ---------------------------------------------------------------------------- + +The metrics shown by @DisplayText{} are useful, but there is more recorded +than displayed by default. We can customize the values shown by defining the +metrics ourselves. + +There are two commands related to changing the metrics shown: +@IndexSubentry{Options, @code{-metric_list}} +@IndexSubentry{Commands, @code{metric_list}} +@command{metric_list} and +@IndexSubentry{Options, @code{-metrics}} +@IndexSubentry{Commands, @code{metrics}} +@command{metrics}. + +The first command shows the currently selected metrics, plus all the metrics +that have been stored as part of the experiment. The second command may be +used to define the metric list. + +@noindent +This is the way to get the information about the metrics: + +@IndexSubentry{Options, @code{-metric_list}} +@IndexSubentry{Commands, @code{metric_list}} +@cartouche +@smallexample +$ gprofng display text -metric_list test.1.er +@end smallexample +@end cartouche + +@noindent +This is the output: + +@smallexample +@verbatim +Current metrics: e.%totalcpu:i.%totalcpu:name +Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) +Available metrics: +Exclusive Total CPU Time: e.%totalcpu +Inclusive Total CPU Time: i.%totalcpu + Size: size + PC Address: address + Name: name +@end verbatim +@end smallexample + +This shows the metrics that are currently used, the metric that is used to sort +the data and all the metrics that have been recorded, but are not necessarily +shown. + +@cindex Default metrics +In this case, the current metrics are set to the exclusive and inclusive +total CPU times, the respective percentages, and the name of the function, +or load object. + +@IndexSubentry{Options, @code{-metrics}} +@IndexSubentry{Commands, @code{metrics}} +The @code{metrics} command is used to define the metrics that need to be +displayed. + +For example, to swap the exclusive and inclusive metrics, use the following +metric definition: @code{i.%totalcpu:e.%totalcpu}. + +Since the metrics can be tailored for different views, there is also a way +to reset them to the default. This is done through the special keyword +@code{default} for the metrics definition (@command{-metrics default}). +@IndexSubentry{Metrics, Reset to default} + +@c -- A new node -------------------------------------------------------------- +@node Customization of the Output +@subsection Customization of the Output +@c ---------------------------------------------------------------------------- + +With the information just given, the function overview can be customized. +For sake of the example, we would like to display the name of the function +first, only followed by the exclusive CPU time, given as an absolute number +and a percentage. + +Note that the commands are parsed in order of appearance. This is why we +need to define the metrics @emph{before} requesting the function overview: + +@cartouche +@smallexample +$ gprofng display text -metrics name:e.%totalcpu -functions test.1.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +Current metrics: name:e.%totalcpu +Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) +Functions sorted by metric: Exclusive Total CPU Time + +Name Excl. Total + CPU + sec. % + <Total> 9.367 100.00 + mxv_core 8.926 95.30 + init_data 0.210 2.24 + drand48 0.080 0.85 + erand48_r 0.070 0.75 + __drand48_iterate 0.060 0.64 + _int_malloc 0.010 0.11 + sysmalloc 0.010 0.11 + <static>@0x47960 (<libgp-collector.so>) 0. 0. + __libc_start_main 0. 0. + allocate_data 0. 0. + driver_mxv 0. 0. + main 0. 0. + malloc 0. 0. + start_thread 0. 0. +@end verbatim +@end smallexample + +This was a first and simple example how to customize the output. Note that we +did not rerun our profiling job and merely modified the display settings. +Below we will show other and also more advanced examples of customization. + +@c -- A new node -------------------------------------------------------------- +@node Name the Experiment Directory +@subsection Name the Experiment Directory +@c ---------------------------------------------------------------------------- + +When using @CollectApp{}, the default names for experiments work fine, but +they are quite generic. It is often more convenient to select a more +descriptive name. For example, one that reflects conditions for the experiment +conducted, like the number of threads used. + +For this, the mutually exclusive @code{-o} and @code{-O} options come in handy. +Both may be used to provide a name for the experiment directory, but the +behaviour of @CollectApp{} is different. + +With the +@IndexSubentry{Options, @code{-o}} +@samp{-o} +option, an existing experiment directory is not overwritten. Any directory +with the same name either needs to be renamed, moved, or removed, before the +experiment can be conducted. + +This is in contrast with the behaviour for the +@IndexSubentry{Options, @code{-O}} +@samp{-O} +option. Any existing directory with the same name is silently overwritten. + +Be aware that the name of the experiment directory has to end with @file{.er}. + +@c -- A new node -------------------------------------------------------------- +@node Control the Number of Lines in the Output +@subsection Control the Number of Lines in the Output +@c ---------------------------------------------------------------------------- + +@IndexSubentry{Options, @code{-limit}} +@IndexSubentry{Commands, @code{limit}} +The @command{limit} @var{<n>} command can be used to control the number of lines +printed in various views. For example it impacts the function view, but also +takes effect for other display commands, like @command{lines}. + +The argument @var{<n>} should be a positive integer number. It sets the number +of lines in the (function) view. A value of zero resets the limit to the +default. + +Be aware that the pseudo-function @code{<Total>} counts as a regular function. +For example @command{limit 10} displays nine user level functions. + +@c -- A new node -------------------------------------------------------------- +@node Sorting the Performance Data +@subsection Sorting the Performance Data +@c ---------------------------------------------------------------------------- + +@IndexSubentry{Options, @code{-sort}} +@IndexSubentry{Commands, @code{sort}} +The @command{sort} @var{<key>} command sets the key to be used when sorting the +performance data. + +The key is a valid metric definition, but the +@IndexSubentry{Metrics, Visibility field} +visibility field +(@xref{Metric Definitions}) +in the metric +definition is ignored, since this does not affect the outcome of the sorting +operation. +For example if the sort key is set to @code{e.totalcpu}, the values +will be sorted in descending order with respect to the exclusive total +CPU time. + +@IndexSubentry{Sort, Reverse order} +The data can be sorted in reverse order by prepending the metric definition +with a minus (@samp{-}) sign. For example @command{sort -e.totalcpu}. + +@IndexSubentry{Sort, Reset to default} +A default metric for the sort operation has been defined and since this is +a persistent command, this default can be restored with @code{default} as +the key (@command{sort default}). + +@c -- A new node -------------------------------------------------------------- +@node Scripting +@subsection Scripting +@c ---------------------------------------------------------------------------- + +@cindex Script files +The list with commands for @DisplayText{} can be very long. This is tedious +and also error prone. Luckily, there is an easier and elegant way to control +the output of this tool. + +@IndexSubentry{Options, @code{-script}} +@IndexSubentry{Commands, @code{script}} +Through the @command{script} command, the name of a file with commands can be +passed in. These commands are parsed and executed as if they appeared on +the command line in the same order as encountered in the file. The commands +in this script file can actually be mixed with commands on the command line +and multiple script files may be used. +The difference between the commands in the script file and those used on the +command line is that the latter require a leading dash (@samp{-}) symbol. + +Comment lines in a script file are supported. They need to start with the +@samp{#} symbol. + +@c -- A new node -------------------------------------------------------------- +@node A More Elaborate Example +@subsection A More Elaborate Example +@c ---------------------------------------------------------------------------- + +With the information presented so far, we can customize our data +gathering and display commands. + +As an example, we would like to use @file{mxv.1.thr.er} as the name for the +experiment directory. In this way, the name of the algorithm and the +number of threads that were used are included in the name. +We also don't mind to overwrite an existing +experiment directory with the same name. + +All that needs to be done is to use the +@IndexSubentry{Options, @code{-O}} +@samp{-O} +option, followed by the directory name of choice when running @CollectApp{}: + +@cartouche +@smallexample +$ exe=mxv-pthreads +$ m=8000 +$ n=4000 +$ gprofng collect app -O mxv.1.thr.er ./$exe -m $m -n $n -t 1 +@end smallexample +@end cartouche + +Since we want to customize the profile and prefer to keep the command line +short, the commands to generate the profile are put into a file with the +name @file{my-script}: + +@smallexample +@verbatim +$ cat my-script +# This is my first gprofng script +# Set the metrics +metrics i.%totalcpu:e.%totalcpu:name +# Use the exclusive time to sort +sort e.totalcpu +# Limit the function list to 5 lines +limit 5 +# Show the function list +functions +@end verbatim +@end smallexample + +This script file is specified as input to the @DisplayText{} command +that is used to display the performance information stored in experiment +directory @file{mxv.1.thr.er}: + +@cartouche +@smallexample +$ gprofng display text -script my-script mxv.1.thr.er +@end smallexample +@end cartouche + +This command produces the following output: + +@smallexample +@verbatim +# This is my first gprofng script +# Set the metrics +Current metrics: i.%totalcpu:e.%totalcpu:name +Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) +# Use the exclusive time to sort +Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) +# Limit the function list to 5 lines +Print limit set to 5 +# Show the function list +Functions sorted by metric: Exclusive Total CPU Time + +Incl. Total Excl. Total Name +CPU CPU + sec. % sec. % +9.703 100.00 9.703 100.00 <Total> +9.226 95.09 9.226 95.09 mxv_core +0.455 4.69 0.210 2.17 init_data +0.169 1.75 0.123 1.26 erand48_r +0.244 2.52 0.075 0.77 drand48 +@end verbatim +@end smallexample + +In the first part of the output the comment lines in the script file are +echoed. These are interleaved with an acknowledgement message for the commands. + +This is followed by a profile consisting of 5 lines only. For both metrics, +the percentages plus the timings are given. The numbers are sorted with respect +to the exclusive total CPU time. Although this is the default, for +demonstration purposes we use the @command{sort} command to explicitly define +the metric for the sort. + +While we executed the same job as before and only changed the name of the +experiment directory, the results are somewhat different. This is sampling +in action. The numbers are not all that different though. +It is seen that function @code{mxv_core} is responsbile for +95% of the CPU time and @code{init_data} takes 4.5% only. + +@c -- A new node -------------------------------------------------------------- +@node The Call Tree +@subsection The Call Tree +@c ---------------------------------------------------------------------------- + +The call tree shows the dynamic structure of the application by displaying the +functions executed and their parent. The CPU time attributed to each function +is shown as well. This view helps to find the most expensive +execution path in the program. + +@IndexSubentry{Options, @code{-calltree}} +@IndexSubentry{Commands, @code{calltree}} +This feature is enabled through the @command{calltree} command. For example, +this is how to get the call tree for our current experiment: + +@cartouche +@smallexample +$ gprofng display text -calltree mxv.1.thr.er +@end smallexample +@end cartouche + +This displays the following structure: + +@smallexample +@verbatim +Functions Call Tree. Metric: Attributed Total CPU Time + +Attr. Total Name +CPU + sec. % +9.703 100.00 +-<Total> +9.226 95.09 +-start_thread +9.226 95.09 | +-<static>@0x47960 (<libgp-collector.so>) +9.226 95.09 | +-driver_mxv +9.226 95.09 | +-mxv_core +0.477 4.91 +-__libc_start_main +0.477 4.91 +-main +0.455 4.69 +-init_data +0.244 2.52 | +-drand48 +0.169 1.75 | +-erand48_r +0.047 0.48 | +-__drand48_iterate +0.021 0.22 +-allocate_data +0.021 0.22 | +-malloc +0.021 0.22 | +-_int_malloc +0.006 0.06 | +-sysmalloc +0.003 0.03 | +-__default_morecore +0.003 0.03 | +-sbrk +0.003 0.03 | +-brk +0.001 0.01 +-pthread_create +0.001 0.01 +-__pthread_create_2_1 +@end verbatim +@end smallexample + +At first sight this may not be what is expected and some explanation is in +place. + +@c ---------------------------------------------------------------------------- +@c TBD: Revise this text when we have user and machine mode. +@c ---------------------------------------------------------------------------- +The top function is the pseudo-function @code{<Total>} that we have seen +before. It is introduced and shown here to provide the total value of the +metric(s). + +We also see function @code{<static>@@0x47960} in the call tree and apparently +it is from @code{libgp-collector.so}, a library that is internal to +@ToolName{}. +The @code{<static>} marker, followed by the program counter, is shown if the +name of the function cannot be found. This function is part of the +implementation of the data collection process and should be hidden to the +user. This is part of a planned future enhancement. + +In general, if a view has a function that does not appear to be part of the +user code, or seems odd anyhow, the @command{objects} and @command{fsingle} +@IndexSubentry{Options, @code{-objects}} +@IndexSubentry{Commands, @code{objects}} +@IndexSubentry{Options, @code{-fsingle}} +@IndexSubentry{Commands, @code{fsingle}} +commands are very useful +to find out more about load objects in general, but also to help identify +an unknown entry in the function overview. @xref{Load Objects and Functions}. + +Another thing to note is that there are two main branches. The one under +@code{<static>@@0x47960} and the second one under @code{__libc_start_main}. +This reflects the fact that this is a multithreaded program and the +threaded part shows up as a separate branch in the call tree. + +The way to interpret this structure is as follows. The program starts +under control of @code{__libc_start_main}. This executes the main program +called @code{main}, which at the top level executes functions +@code{init_data}, @code{allocate_data}, and @code{pthread_create}. +The latter function creates and executes the additional thread(s). + +For this multithreaded part of the code, we need to look at the branch +under function @code{start_thread} that calls the driver code for the +matrix-vector multiplication (@code{driver_mxv}), which executes the function +that performs the actual multiplication (@code{mxv_core}). + +There are two things worth noting for the call tree feature: + +@itemize + +@item +This is a dynamic tree and since sampling is used, it most likely looks +slighlty different across seemingly identical profile runs. In case the +run times are short, it is worth considering to use a high resolution +through the +@IndexSubentry{Options, @code{-p}} +@samp{-p} +option. For example use @samp{-p hi} to increase the sampling rate. + +@item +In case hardware event counters have been enabled +(@xref{Profile Hardware Event Counters}), these values are also displayed +in the call tree view. + +@end itemize + +@c -- A new node -------------------------------------------------------------- +@node More Information on the Experiment +@subsection More Information on the Experiment +@c ---------------------------------------------------------------------------- + +The experiment directory not only contains performance related data. Several +system characteristics, the profiling command executed, plus some global +performance statistics are stored and can be displayed. + +@IndexSubentry{Options, @code{-header}} +@IndexSubentry{Commands, @code{header}} +The @command{header} command displays information about the experiment(s). +For example, this is command is used to extract this data from for our +experiment directory: + +@cartouche +@smallexample +$ gprofng display text -header mxv.1.thr.er +@end smallexample +@end cartouche + +The above command prints the following information. Note that some of the +lay-out and the information has been modified. Directory paths have been +replaced @code{<apath>} for example. Textual changes are +marked with the @samp{<} and @samp{>} symbols. + +@smallexample +@verbatim +Experiment: mxv.1.thr.er +No errors +No warnings +Archive command ` /usr/bin/gp-archive -n -a on --outfile + <apath>/archive.log <apath>/mxv.1.thr.er' + +Target command (64-bit): './mxv-pthreads -m 8000 -n 4000 -t 1' +Process pid 2750071, ppid 2750069, pgrp 2749860, sid 2742080 +Current working directory: <apath> +Collector version: `2.40.00'; experiment version 12.4 (64-bit) +Host `<the-host-name>', OS `Linux <version>', page size 4096, + architecture `x86_64' + 4 CPUs, clock speed 2294 MHz. + Memory: 3506491 pages @ 4096 = 13697 MB. +Data collection parameters: + Clock-profiling, interval = 997 microsecs. + Periodic sampling, 1 secs. + Follow descendant processes from: fork|exec|combo + +Experiment started <date and time> + +Experiment Ended: 9.801216173 +Data Collection Duration: 9.801216173 +@end verbatim +@end smallexample + +The output above may assist in troubleshooting, or to verify some of the +operational conditions and we recommend to include this command when +generating a profile. + +@IndexSubentry{Options, @code{-C}} +Related to this command there is a useful option to record comment(s) in +an experiment. +To this end, use the @samp{-C} option on the @CollectApp{} tool to +specify a comment string. Up to ten comment lines can be included. +These comments are displayed with the @command{header} command on +the @DisplayText{} tool. + +@IndexSubentry{Options, @code{-overview}} +@IndexSubentry{Commands, @code{overview}} +The @command{overview} command displays information on the experiment(s) and +also shows a summary of the values for the metric(s) used. This is an example +how to use it on the newly created experiment directory: + +@cartouche +@smallexample +$ gprofng display text -overview mxv.1.thr.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +Experiment(s): + +Experiment :mxv.1.thr.er + Target : './mxv-pthreads -m 8000 -n 4000 -t 1' + Host : <hostname> (<ISA>, Linux <version>) + Start Time : <date and time> + Duration : 9.801 Seconds + +Metrics: + + Experiment Duration (Seconds): [9.801] + Clock Profiling + [X]Total CPU Time - totalcpu (Seconds): [*9.703] + +Notes: '*' indicates hot metrics, '[X]' indicates currently enabled + metrics. + The metrics command can be used to change selections. The + metric_list command lists all available metrics. +@end verbatim +@end smallexample + +This command provides a dashboard overview that helps to easily identify +where the time is spent and in case hardware event counters are used, it +shows their total values. + +@c -- A new node -------------------------------------------------------------- +@node Control the Sampling Frequency +@subsection Control the Sampling Frequency +@c ---------------------------------------------------------------------------- + +@cindex Sampling frequency +So far we did not go into details on the frequency of the sampling process, +but in some cases it is useful to change the default of 10 milliseconds. + +The advantage of increasing the sampling frequency is that functions that +do not take much time per invocation are more accurately captured. The +downside is that more data is gathered. This has an impact on the overhead +of the collection process and more disk space is required. + +In general this is not an immediate concern, but with heavily threaded +applications that run for an extended period of time, increasing the +frequency may have a more noticeable impact. + +@IndexSubentry{Options, @code{-p}} +The @code{-p} option on the @CollectApp{} tool is used to enable or disable +clock based profiling, or to explicitly set the sampling rate. +@cindex Sampling interval +This option takes one of the following keywords: + +@table @code + +@item off +Disable clock based profiling. + +@item on +Enable clock based profiling with a per thread sampling interval of 10 ms. +This is the default. + +@item lo +Enable clock based profiling with a per thread sampling interval of 100 ms. + +@item hi +Enable clock based profiling with a per thread sampling interval of 1 ms. + +@item @var{value} +@cindex Sampling interval +Enable clock based profiling with a per thread sampling interval of +@var{value}. + +@end table + +It may seem unnecessary to have an option to disable clock based profiling, +but there is a good reason to support this. +By default, clock profiling is enabled when conducting hardware event counter +experiments (@xref{Profile Hardware Event Counters}). +With the @code{-p off} option, this can be disabled. + +If an explicit value is set for the sampling, the number can be an integer or a +floating-point number. +A suffix of @samp{u} for microseconds, or @samp{m} for milliseconds is supported. +If no suffix is used, the value is assumed to be in milliseconds. + +For example, the following command sets the sampling rate to +5123.4 microseconds: + +@cartouche +@smallexample +$ gprofng collect app -p 5123.4u ./mxv-pthreads -m 8000 -n 4000 -t 1 +@end smallexample +@end cartouche + +If the value is smaller than the clock profiling minimum, a warning message is issued +and it is set to the minimum. +In case it is not a multiple of the clock profiling resolution, it is silently rounded +down to the nearest multiple of the clock resolution. +If the value exceeds the clock profiling maximum, is negative, or zero, an error is +reported. + +@IndexSubentry{Options, @code{-header}} +@IndexSubentry{Commands, @code{header}} +@noindent +Note that the @code{header} command echoes the sampling rate used. + +@c -- A new node -------------------------------------------------------------- +@node Information on Load Objects +@subsection Information on Load Objects +@c ---------------------------------------------------------------------------- + +It may happen that the function view shows a function that is not known to +the user. This can easily happen with library functions for example. +Luckily there are three commands that come in handy then. + +@IndexSubentry{Options, @code{-objects}} +@IndexSubentry{Commands, @code{objects}} +@IndexSubentry{Options, @code{-fsingle}} +@IndexSubentry{Commands, @code{fsingle}} +@IndexSubentry{Options, @code{-fsummary}} +@IndexSubentry{Commands, @code{fsummary}} +These commands are @command{objects}, @command{fsingle}, and @command{fsummary}. +They provide details on +@cindex Load objects +load objects (@xref{Load Objects and Functions}). + +The @command{objects} command lists all load objects that have been referenced +during the performance experiment. +Below we show the command and the result for our profile job. Like before, +some path names in the output have been shortened and replaced by the +@IndexSubentry{Miscellaneous, @code{<apath>}} +@code{<apath>} symbol that represents an absolute directory path. + +@cartouche +@smallexample +$ gprofng display text -objects mxv.1.thr.er +@end smallexample +@end cartouche + +The output includes the name and path of the target executable: + +@smallexample +@verbatim +<Unknown> (<Unknown>) +<mxv-pthreads> (<apath>/mxv-pthreads) +<libdl-2.28.so> (/usr/lib64/libdl-2.28.so) +<librt-2.28.so> (/usr/lib64/librt-2.28.so) +<libc-2.28.so> (/usr/lib64/libc-2.28.so) +<libpthread-2.28.so> (/usr/lib64/libpthread-2.28.so) +<libm-2.28.so> (/usr/lib64/libm-2.28.so) +<libgp-collector.so> (/usr/lib64/gprofng/libgp-collector.so) +<ld-2.28.so> (/usr/lib64/ld-2.28.so) +<DYNAMIC_FUNCTIONS> (DYNAMIC_FUNCTIONS) +@end verbatim +@end smallexample + +@IndexSubentry{Options, @code{-fsingle}} +@IndexSubentry{Commands, @code{fsingle}} +The @command{fsingle} command may be used to get more details on a specific entry +in the function view, say. For example, the command below provides additional +information on the @code{pthread_create} function shown in the function overview. + +@cartouche +@smallexample +$ gprofng display text -fsingle pthread_create mxv.1.thr.er +@end smallexample +@end cartouche + +Below the output from this command. It has been somewhat modified to match the +display requirements. + +@smallexample +@verbatim ++ gprofng display text -fsingle pthread_create mxv.1.thr.er +pthread_create + Exclusive Total CPU Time: 0. ( 0. %) + Inclusive Total CPU Time: 0.001 ( 0.0%) + Size: 258 + PC Address: 8:0x00049f60 + Source File: (unknown) + Object File: (unknown) + Load Object: /usr/lib64/gprofng/libgp-collector.so + Mangled Name: + Aliases: +@end verbatim +@end smallexample + +In this table we not only see how much time was spent in this function, we +also see where it originates from. In addition to this, the size and start +address are given as well. If the source code location is known it is also +shown here. + +@IndexSubentry{Options, @code{-fsummary}} +@IndexSubentry{Commands, @code{fsummary}} +The related @code{fsummary} command displays the same information as +@code{fsingle}, but for all functions in the function overview, +including @code{<Total>}: + +@cartouche +@smallexample +$ gprofng display text -fsummary mxv.1.thr.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +Functions sorted by metric: Exclusive Total CPU Time + +<Total> + Exclusive Total CPU Time: 9.703 (100.0%) + Inclusive Total CPU Time: 9.703 (100.0%) + Size: 0 + PC Address: 1:0x00000000 + Source File: (unknown) + Object File: (unknown) + Load Object: <Total> + Mangled Name: + Aliases: + +mxv_core + Exclusive Total CPU Time: 9.226 ( 95.1%) + Inclusive Total CPU Time: 9.226 ( 95.1%) + Size: 80 + PC Address: 2:0x00001d56 + Source File: <apath>/src/mxv.c + Object File: mxv.1.thr.er/archives/mxv-pthreads_ss_pf53V__5 + Load Object: <apath>/mxv-pthreads + Mangled Name: + Aliases: + + ... etc ... +@end verbatim +@end smallexample + +@c -- A new node -------------------------------------------------------------- +@node Support for Multithreading +@section Support for Multithreading +@c ---------------------------------------------------------------------------- + +In this chapter the support for multithreading is introduced and discussed. +As is shown below, nothing needs to be changed when collecting the performance +data. + +The difference is that additional commands are available to get more +information on the multithreading details, plus that several filters allow +the user to zoom in on specific threads. + +@c -- A new node -------------------------------------------------------------- +@node Creating a Multithreading Experiment +@subsection Creating a Multithreading Experiment +@c ---------------------------------------------------------------------------- + +We demonstrate the support for multithreading using the same code and settings +as before, but this time 2 threads are used: + +@cartouche +@smallexample +$ exe=mxv-pthreads +$ m=8000 +$ n=4000 +$ gprofng collect app -O mxv.2.thr.er ./$exe -m $m -n $n -t 2 +@end smallexample +@end cartouche + +First of all, in as far as @ProductName{} is concerned, no changes are needed. +Nothing special is needed to profile a multithreaded job when using @ToolName{}. + +The same is true when displaying the performance results. The same commands +that were used before work unmodified. For example, this is all that is needed to +get a function overview: + +@cartouche +@smallexample +$ gprofng display text -limit 5 -functions mxv.2.thr.er +@end smallexample +@end cartouche + +This produces the following familiar looking output: + +@smallexample +@verbatim +Print limit set to 5 +Functions sorted by metric: Exclusive Total CPU Time + +Excl. Total Incl. Total Name +CPU CPU + sec. % sec. % +9.464 100.00 9.464 100.00 <Total> +8.961 94.69 8.961 94.69 mxv_core +0.224 2.37 0.469 4.95 init_data +0.105 1.11 0.177 1.88 erand48_r +0.073 0.77 0.073 0.77 __drand48_iterate +@end verbatim +@end smallexample + +@c -- A new node -------------------------------------------------------------- +@node Commands Specific to Multithreading +@subsection Commands Specific to Multithreading +@c ---------------------------------------------------------------------------- + +The function overview shown above shows the results aggregated over all the +threads. The interesting new element is that we can also look at the +performance data for the individual threads. + +@IndexSubentry{Options, @code{-thread_list}} +@IndexSubentry{Commands, @code{thread_list}} +The @command{thread_list} command displays how many threads have been used: + +@cartouche +@smallexample +$ gprofng display text -thread_list mxv.2.thr.er +@end smallexample +@end cartouche + +This produces the following output, showing that three threads have +been used: + +@smallexample +@verbatim +Exp Sel Total +=== === ===== + 1 all 3 +@end verbatim +@end smallexample + +The output confirms there is one experiment and that by default all +threads are selected. + +It may seem surprising to see three threads here, since we used the +@code{-t 2} option, but it is common for a Pthreads program to use one +additional thread. +Typically, there is one main thread that runs from start to finish. +It handles the sequential portions of the code, as well as thread +management related tasks. +It is no different in the example code. At some point, the main thread +creates and activates the two threads that perform the multiplication +of the matrix with the vector. Upon completion of this computation, +the main thread continues. + +@IndexSubentry{Options, @code{-threads}} +@IndexSubentry{Commands, @code{threads}} +The @command{threads} command is simple, yet very powerful. It shows the +total value of the metrics for each thread. + +@cartouche +@smallexample +$ gprofng display text -threads mxv.2.thr.er +@end smallexample +@end cartouche + +@noindent +The command above produces the following overview: + +@smallexample +@verbatim +Objects sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +9.464 100.00 <Total> +4.547 48.05 Process 1, Thread 3 +4.414 46.64 Process 1, Thread 2 +0.502 5.31 Process 1, Thread 1 +@end verbatim +@end smallexample + +The first line gives the total CPU time accumulated over the threads +selected. This is followed by the metric value(s) for each thread. + +From this it is clear that the main thread is responsible for a +little over 5% of the total CPU time, while the other two threads +take 47-48% each. + +This view is ideally suited to verify if there are any load balancing +issues and also to find the most time consuming thread(s). + +@IndexSubentry{Filters, Thread selection} +While useful, often more information than this is needed. This is +@IndexSubentry{Options, @code{-thread_select}} +@IndexSubentry{Commands, @code{thread_select}} +where the thread selection filter comes in. Through the +@command{thread_select} +command, one or more threads may be selected. +@xref{The Selection List} how to define the selection list. + +Since it is most common to use this command in a script, we do so as +well here. Below the script we are using: + +@cartouche +@smallexample +# Define the metrics +metrics e.%totalcpu +# Limit the output to 5 lines +limit 5 +# Get the function overview for thread 1 +thread_select 1 +functions +# Get the function overview for thread 2 +thread_select 2 +functions +# Get the function overview for thread 3 +thread_select 3 +functions +@end smallexample +@end cartouche + +The definition of the metrics and the output limit have been shown and +explained earlier. The new command to focus on is @command{thread_select}. + +This command takes a list (@xref{The Selection List}) to select specific +threads. In this case, the individual thread numbers that were +obtained earlier with the @command{thread_list} command are selected. + +This restricts the output of the @command{functions} command to the thread +number(s) specified. This means that the script above shows which +function(s) each thread executes and how much CPU time they consumed. +Both the exclusive timings and their percentages are given. + +Note that technically this command is a filter and persistent. The +selection remains active until changed through another thread selection +command, or when it is reset with the @samp{all} selection list. + +@noindent +This is the relevant part of the output for the first thread: + +@smallexample +@verbatim +Exp Sel Total +=== === ===== + 1 1 3 +Functions sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +0.502 100.00 <Total> +0.224 44.64 init_data +0.105 20.83 erand48_r +0.073 14.48 __drand48_iterate +0.067 13.29 drand48 +@end verbatim +@end smallexample + +As usual, the comment lines are echoed. This is followed by a confirmation +of the selection. The first table shows that one experiment is loaded and +that thread 1 out of the three threads has been selected. What is +displayed next is the function overview for this particular thread. Due to +the @code{limit 5} command, there are only five functions in this list. + +Clearly, this thread handles the data initialization part and as we know +from the call tree output, function @code{init_data} executes the 3 other +functions shown in this profile. + +Below are the overviews for threads 2 and 3 respectively. It is seen that all +of the CPU time is spent in function @code{mxv_core} and that this time +is approximately the same for both threads. + +@smallexample +@verbatim +# Get the function overview for thread 2 +Exp Sel Total +=== === ===== + 1 2 3 +Functions sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +4.414 100.00 <Total> +4.414 100.00 mxv_core +0. 0. <static>@0x48630 (<libgp-collector.so>) +0. 0. driver_mxv +0. 0. start_thread + +# Get the function overview for thread 3 +Exp Sel Total +=== === ===== + 1 3 3 +Functions sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +4.547 100.00 <Total> +4.547 100.00 mxv_core +0. 0. <static>@0x48630 (<libgp-collector.so>) +0. 0. driver_mxv +0. 0. start_thread +@end verbatim +@end smallexample + +When analyzing the performance of a multithreaded application, it is sometimes +useful to know whether threads have mostly executed on the same core, say, or +if they have wandered across multiple cores. This sort of stickiness is usually +referred to as +@cindex Thread affinity +@emph{thread affinity}. + +Similar to the commands for the threads, there are several commands related +to the usage of the cores, or @emph{CPUs} as they are called in @ToolName{} +(@xref{The Concept of a CPU in @ProductName{}}). + +@IndexSubentry{Options, @code{-cpu_list}} +@IndexSubentry{Commands, @code{cpu_list}} +Similar to the @command{thread_list} command, the @command{cpu_list} command +displays how many CPUs have been used. +@IndexSubentry{Options, @code{-cpus}} +@IndexSubentry{Commands, @code{cpus}} +The equivalent of the @command{threads} threads command, is the @command{cpus} +command, which shows the numbers of the CPUs that were used and the metric values +for each one of them. Both commands are demonstrated below. + +@cartouche +@smallexample +$ gprofng display text -cpu_list -cpus mxv.2.thr.er +@end smallexample +@end cartouche + +@noindent +This command produces the following output: + +@smallexample +@verbatim ++ gprofng display text -cpu_list -cpus mxv.2.thr.er +Exp Sel Total +=== === ===== + 1 all 4 +Objects sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +9.464 100.00 <Total> +4.414 46.64 CPU 2 +2.696 28.49 CPU 0 +1.851 19.56 CPU 1 +0.502 5.31 CPU 3 +@end verbatim +@end smallexample + +The first table shows that there is only one experiment and that all of the +four CPUs have been selected. The second table shows the exclusive metrics +for each of the CPUs that have been used. + +As also echoed in the output, the data is sorted with respect to the +exclusive CPU time, but it is very easy to sort the data by the CPU id +@IndexSubentry{Options, -sort} +@IndexSubentry{Commands, sort} +by using the @command{sort} command: + +@cartouche +@smallexample +$ gprofng display text -cpu_list -sort name -cpus mxv.2.thr.er +@end smallexample +@end cartouche + +@noindent +With the @command{sort} added, the output is as follows: + +@smallexample +@verbatim +Exp Sel Total +=== === ===== + 1 all 4 +Current Sort Metric: Name ( name ) +Objects sorted by metric: Name + +Excl. Total Name +CPU + sec. % +9.464 100.00 <Total> +2.696 28.49 CPU 0 +1.851 19.56 CPU 1 +4.414 46.64 CPU 2 +0.502 5.31 CPU 3 +@end verbatim +@end smallexample + +While the table with thread times shown earlier may point at a load imbalance +in the application, this overview has a different purpose. + +For example, we see that 4 CPUs have been used, but we know that the +application uses 3 threads only. +We will now demonstrate how filters can be used to help answer the +question why 4 CPUs are used, while the application has 3 threads only. +This means that at least one thread has executed on more than one CPU. + +Recall the thread level timings: + +@smallexample +@verbatim +Excl. Total Name +CPU + sec. % +9.464 100.00 <Total> +4.547 48.05 Process 1, Thread 3 +4.414 46.64 Process 1, Thread 2 +0.502 5.31 Process 1, Thread 1 +@end verbatim +@end smallexample + +Compared to the CPU timings above, it seems very likely that thread 3 has +used more than one CPU, because the thread and CPU timings are the same +for both other threads. + +The command below selects thread number 3 and then requests the CPU +utilization for this thread: + +@cartouche +@smallexample +$ gprofng display text -thread_select 3 -sort name -cpus mxv.2.thr.er +@end smallexample +@end cartouche + +The output shown below confirms that thread 3 is selected and then displays +the CPU(s) that have been used by this thread: + +@smallexample +@verbatim +Exp Sel Total +=== === ===== + 1 3 3 + +Objects sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +4.547 100.00 <Total> +2.696 59.29 CPU 0 +1.851 40.71 CPU 1 +@end verbatim +@end smallexample + +The results show that this thread has used CPU 0 nearly 60% of the time +and CPU 1 for the remaining 40%. + +To confirm that this is the only thread that has used more than one CPU, the +same approach can be used for threads 1 and 2: + +@smallexample +@verbatim +$ gprofng display text -thread_select 1 -cpus mxv.2.thr.er +Exp Sel Total +=== === ===== + 1 1 3 +Objects sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +0.502 100.00 <Total> +0.502 100.00 CPU 3 +@end verbatim +@end smallexample + +@smallexample +@verbatim +$ gprofng display text -thread_select 2 -cpus mxv.2.thr.er +Exp Sel Total +=== === ===== + 1 2 3 +Objects sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +4.414 100.00 <Total> +4.414 100.00 CPU 2 +@end verbatim +@end smallexample + +@noindent +The output above shows that indeed threads 1 and 2 each have used a single +CPU only. + +@c -- A new node -------------------------------------------------------------- +@node View Multiple Experiments +@section View Multiple Experiments +@c ---------------------------------------------------------------------------- + +One thing we did not cover sofar is that @ToolName{} fully supports the analysis +of multiple experiments. The @DisplayText{} tool accepts a list of experiments. +The data can either be aggregated across the experiments, or used in a +comparison. + +The default is to aggregate the metric values across the experiments that have +been loaded. The @command{compare} command can be used to enable the +@IndexSubentry{Options, @code{-compare}} +@IndexSubentry{Commands, @code{compare}} +comparison of results. + +In this section both modes are illustrated with an example. + +@c -- A new node -------------------------------------------------------------- +@node Aggregation of Experiments +@subsection Aggregation of Experiments +@c ---------------------------------------------------------------------------- + +If the data for multiple experiments is aggregrated, the @DisplayText{} tool +shows the combined results. +For example, below is the script to show the function view for the data +aggregated over two experiments, drop the first experiment and then show +the function view fo the second experiment only. +We will call it @file{my-script-agg}. + +@cartouche +@smallexample +# Define the metrics +metrics e.%totalcpu +# Limit the output to 5 lines +limit 5 +# Get the list with experiments +experiment_list +# Get the function overview for all +functions +# Drop the first experiment +drop_exp mxv.2.thr.er +# Get the function overview for exp #2 +functions +@end smallexample +@end cartouche + +@IndexSubentry{Options, @code{-experiment_list}} +@IndexSubentry{Commands, @code{experiment_list}} +With the exception of the @command{experiment_list} command, all commands +used have been discussed earlier. + +The @command{experiment_list} command provides a list of the experiments +that have been loaded. This may be used to get the experiment IDs and +to verify the correct experiments are loaded for the aggregation. + +@noindent +Below is an example that loads two experiments and uses the above +script to display different function views. + +@cartouche +@smallexample +$ gprofng display text -script my-script-agg mxv.2.thr.er mxv.4.thr.er +@end smallexample +@end cartouche + +@noindent +This produces the following output: + +@smallexample +@verbatim +# Define the metrics +Current metrics: e.%totalcpu:name +Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu ) +# Limit the output to 5 lines +Print limit set to 5 +# Get the list with experiments +ID Sel PID Experiment +== === ======= ============ + 1 yes 1339450 mxv.2.thr.er + 2 yes 3579561 mxv.4.thr.er +# Get the function overview for all +Functions sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +20.567 100.00 <Total> +19.553 95.07 mxv_core + 0.474 2.30 init_data + 0.198 0.96 erand48_r + 0.149 0.72 drand48 + +# Drop the first experiment +Experiment mxv.2.thr.er has been dropped +# Get the function overview for exp #2 +Functions sorted by metric: Exclusive Total CPU Time + +Excl. Total Name +CPU + sec. % +11.104 100.00 <Total> +10.592 95.39 mxv_core + 0.249 2.24 init_data + 0.094 0.84 erand48_r + 0.082 0.74 drand48 +@end verbatim +@end smallexample + +The first five lines should look familiar. The five lines following echo +the comment line in the script and show the overview of the experiments. +This confirms two experiments have been loaded and that both are active. +This is followed by the function overview. The timings have been summed +up and the percentages are adjusted accordingly. + +@c -- A new node -------------------------------------------------------------- +@node Comparison of Experiments +@subsection Comparison of Experiments +@c ---------------------------------------------------------------------------- + +The support for multiple experiments really shines in comparison mode. +@cindex Compare experiments +In comparison mode, the data for the various experiments is shown side by +side, as illustrated below where we compare the results for the multithreaded +experiments using two and four threads respectively. + +This +feature is controlled through the +@IndexSubentry{Options, @code{-compare}} +@IndexSubentry{Commands, @code{compare}} +@code{compare} command. + +The comparison mode is enabled through @command{compare on} and with +@command{compare off} it is disabled again. +In addition to @samp{on}, or @samp{off}, this command also supports +the @samp{delta} and @samp{ratio} keywords. + +This is the script that will be used in our example. It sets the comparison +mode to @samp{on}: + +@smallexample +@verbatim +# Define the metrics +metrics e.%totalcpu +# Limit the output to 5 lines +limit 5 +# Set the comparison mode to differences +compare on +# Get the function overview +functions +@end verbatim +@end smallexample + +Assuming this script file is called @file{my-script-comp}, this is how +it is used to display the differences: + +@cartouche +@smallexample +$ gprofng display text -script my-script-comp mxv.2.thr.er mxv.4.thr.er +@end smallexample +@end cartouche + +@noindent +This produces the output shown below. The data for the first experiment +is shown as absolute numbers. The timings for the other experiment are +shown as a delta relative to these reference numbers: + +@smallexample +@verbatim + +mxv.2.thr.er mxv.4.thr.er +Excl. Total Excl. Total Name +CPU CPU + sec. % sec. % +9.464 100.00 11.104 100.00 <Total> +8.961 94.69 10.592 95.39 mxv_core +0.224 2.37 0.249 2.24 init_data +0.105 1.11 0.094 0.84 erand48_r +0.073 0.77 0.060 0.54 __drand48_iterate +@end verbatim +@end smallexample + +This table is already helpful to more easily compare (two) profiles, but +there is more that we can do here. + +By default, in comparison mode, all measured values are shown. Often +profiling is about comparing performance data. It is therefore +sometimes more useful to look at differences or ratios, using one +experiment as a reference. + +The values shown are relative to this difference. For example if a ratio +is below one, it means the reference value was higher. + +In the example below, we use the same two experiments used in the comparison +above. The script is also nearly identical. The only change is that we now +use the @samp{delta} keyword. + +As before, the number of lines is restricted to 5 and we focus on +the exclusive timings plus percentages. For the comparison part we are +interested in the differences. + +This is the script that produces such an overview: + +@smallexample +@verbatim +# Define the metrics +metrics e.%totalcpu +# Limit the output to 5 lines +limit 5 +# Set the comparison mode to differences +compare delta +# Get the function overview +functions +@end verbatim +@end smallexample + +Assuming this script file is called @file{my-script-comp2}, this is how we +get the table displayed on our screen: + +@cartouche +@smallexample +$ gprofng display text -script my-script-comp2 mxv.2.thr.er mxv.4.thr.er +@end smallexample +@end cartouche + +Leaving out some of the lines printed, but we have seen before, we get +the following table: + +@smallexample +@verbatim +mxv.2.thr.er mxv.4.thr.er +Excl. Total Excl. Total Name +CPU CPU + sec. % delta % +9.464 100.00 +1.640 100.00 <Total> +8.961 94.69 +1.631 95.39 mxv_core +0.224 2.37 +0.025 2.24 init_data +0.105 1.11 -0.011 0.84 erand48_r +0.073 0.77 -0.013 0.54 __drand48_iterate +@end verbatim +@end smallexample + +It is now easier to see that the CPU times for the most time consuming +functions in this code are practically the same. + +It is also possible to show ratio's through the @command{compare ratio} +@IndexSubentry{Options, @code{-compare}} +@IndexSubentry{Commands, @code{compare}} +command. The first colum is used as a reference and the values for +the other columns with metrics are derived by dividing the value by +the reference. The result for such a comparison is shown below: + +@smallexample +@verbatim +mxv.2.thr.er mxv.4.thr.er +Excl. Total Excl. Total CPU Name +CPU + sec. % ratio % +9.464 100.00 x 1.173 100.00 <Total> +8.961 94.69 x 1.182 95.39 mxv_core +0.224 2.37 x 1.111 2.24 init_data +0.105 1.11 x 0.895 0.84 erand48_r +0.073 0.77 x 0.822 0.54 __drand48_iterate +@end verbatim +@end smallexample + +Note that the comparison feature is supported at the function, source, and +disassembly level. There is no practical limit on the number of experiments +that can be used in a comparison. + +@c -- A new node -------------------------------------------------------------- +@node Profile Hardware Event Counters +@section Profile Hardware Event Counters +@c ---------------------------------------------------------------------------- + +Many processors provide a set of hardware event counters and @ToolName{} +provides support for this feature. +@xref{Hardware Event Counters Explained} for those readers that are not +familiar with such counters and like to learn more. + +In this section we explain how to get the details on the event counter +support for the processor used in the experiment(s), and show several +examples. + +@c -- A new node -------------------------------------------------------------- +@node Getting Information on the Counters Supported +@subsection Getting Information on the Counters Supported +@c ---------------------------------------------------------------------------- + +The first step is to check if the processor used for the experiments is +supported by @ToolName{}. +@IndexSubentry{Options, @code{-h}} +The @code{-h} option on @CollectApp{} will show the event counter +information: + +@cartouche +@smallexample +$ gprofng collect app -h +@end smallexample +@end cartouche + +In case the counters are supported, a list with the events is printed. +Otherwise, a warning message will be issued. + +For example, below we show this command and the output on an Intel Xeon +Platinum 8167M (aka ``Skylake'') processor. The output has been split +into several sections and each section is commented upon separately. + +@smallexample +@verbatim +Run "gprofng collect app --help" for a usage message. + +Specifying HW counters on `Intel Arch PerfMon v2 on Family 6 Model 85' +(cpuver=2499): + + -h {auto|lo|on|hi} + turn on default set of HW counters at the specified rate + -h <ctr_def> [-h <ctr_def>]... + -h <ctr_def>[,<ctr_def>]... + specify HW counter profiling for up to 4 HW counters +@end verbatim +@end smallexample + +The first line shows how to get a usage overview. This is followed by +some information on the target processor. +The next five lines explain in what ways the @code{-h} option can be +used to define the events to be monitored. + +The first version shown above enables a default set of counters. This +default depends on the processor this command is executed on. The +keyword following the @code{-h} option defines the sampling rate: + +@table @code + +@item auto +Match the sample rate of used by clock profiling. If the latter is disabled, +Use a per thread sampling rate of approximately 100 samples per second. +This setting is the default and preferred. + +@item on +Use a per thread sampling rate of approximately 100 samples per second. + +@item lo +Use a per thread sampling rate of approximately 10 samples per second. + +@item hi +Use a per thread sampling rate of approximately 1000 samples per second. + +@end table + +The second and third variant define the events to be monitored. Note +that the number of simultaneous events supported is printed. In this +case we can monitor four events in a single profiling job. + +It is a matter of preference whether you like to use the @code{-h} +option for each event, or use it once, followed by a comma separated +list. + +There is one slight catch though. The counter definition below has +mandatory comma (@code{,}) between the event and the rate. While a +default can be used for the rate, the comma cannot be omitted. +This may result in a somewhat awkward counter definition in case +the default sampling rate is used. + +For example, the following two commands are equivalent. Note +the double comma in the second command. This is not a typo. + +@cartouche +@smallexample +$ gprofng collect app -h cycles -h insts ... +$ gprofng collect app -h cycles,,insts ... +@end smallexample +@end cartouche + +In the first command this comma is not needed, because a +comma (``@code{,}'') immediately followed by white space may +be omitted. + +This is why we prefer the this syntax and in the remainder will +use the first version of this command. + +@IndexSubentry{Hardware event counters, counter definition} +The counter definition takes an event name, plus optionally one or +more attributes, followed by a comma, and optionally the sampling rate. +The output section below shows the formal definition. + +@cartouche +@smallexample + <ctr_def> == <ctr>[[~<attr>=<val>]...],[<rate>] +@end smallexample +@end cartouche + +The printed help then explains this syntax. Below we have summarized +and expanded this output: + +@table @code + +@item @var{<ctr>} +The counter name must be selected from the available counters listed +as part of the output printed with the @code{-h} option. +On most systems, if a counter is not listed, it may still be specified +by its numeric value. + +@item @var{~<attr>=<val>} +This is an optional attribute that depends on the processor. The list +of supported attributes is printed in the output. Examples of +attributes are ``user'', or ``system''. The value can given in decimal +or hexadecimal format. +Multiple attributes may be specified, and each must be preceded +by a ~. + +@item @var{<rate>} + +The sampling rate is one of the following: + +@table @code + +@item auto +This is the default and matches the rate used by clock profiling. +If clock profiling is disabled, use @samp{on}. + +@item on +Set the per thread maximum sampling rate to ~100 samples/second + +@item lo +Set the per thread maximum sampling rate to ~10 samples/second + +@item hi +Set the per thread maximum sampling rate to ~1000 samples/second + +@item @var{<interval>} +Define the sampling interval. +@xref{Control the Sampling Frequency} how to define this. + +@end table + +@end table + +After the section with the formal definition of events and counters, a +processor specific list is displayed. This part starts with an overview +of the default set of counters and the aliased names supported +@emph{on this specific processor}. + +@smallexample +@verbatim +Default set of HW counters: + + -h cycles,,insts,,llm + +Aliases for most useful HW counters: + + alias raw name type units regs description + + cycles unhalted-core-cycles CPU-cycles 0123 CPU Cycles + insts instruction-retired events 0123 Instructions Executed + llm llc-misses events 0123 Last-Level Cache Misses + br_msp branch-misses-retired events 0123 Branch Mispredict + br_ins branch-instruction-retired events 0123 Branch Instructions +@end verbatim +@end smallexample + +@noindent +The definitions given above may or may not be available on other processors. + +The table above shows the default set of counters defined for this processor, +and the aliases. For each alias the full ``raw'' name is given, plus the +unit of the number returned by the counter (CPU cycles, or a raw count), +the hardware counter the event is allowed to be mapped onto, and a short +description. + +The last part of the output contains all the events that can be monitored: + +@smallexample +@verbatim +Raw HW counters: + + name type units regs description + + unhalted-core-cycles CPU-cycles 0123 + unhalted-reference-cycles events 0123 + instruction-retired events 0123 + llc-reference events 0123 + llc-misses events 0123 + branch-instruction-retired events 0123 + branch-misses-retired events 0123 + ld_blocks.store_forward events 0123 + ld_blocks.no_sr events 0123 + ld_blocks_partial.address_alias events 0123 + dtlb_load_misses.miss_causes_a_walk events 0123 + dtlb_load_misses.walk_completed_4k events 0123 + + <many lines deleted> + + l2_lines_out.silent events 0123 + l2_lines_out.non_silent events 0123 + l2_lines_out.useless_hwpf events 0123 + sq_misc.split_lock events 0123 +@end verbatim +@end smallexample + +As can be seen, these names are not always easy to correlate to a specific +event of interest. The processor manual should provide more clarity on this. + +@c -- A new node -------------------------------------------------------------- +@node Examples Using Hardware Event Counters +@subsection Examples Using Hardware Event Counters +@c ---------------------------------------------------------------------------- + +The previous section may give the impression that these counters are hard to +use, but as we will show now, in practice it is quite simple. + +With the information from the @code{-h} option, we can easily set up our first +event counter experiment. + +We start by using the default set of counters defined for our processor and we +use 2 threads: + +@cartouche +@smallexample +$ exe=mxv-pthreads +$ m=8000 +$ n=4000 +$ exp=mxv.hwc.def.2.thr.er +$ gprofng collect app -O $exp -h auto ./$exe -m $m -n $n -t 2 +@end smallexample +@end cartouche + +@IndexSubentry{Options, @code{-h}} +@IndexSubentry{Hardware event counters, @code{auto} option} +The new option here is @code{-h auto}. The @code{auto} keyword enables +hardware event counter profiling and selects the default set of counters +defined for this processor. + +As before, we can display the information, but there is one practical hurdle +to take. Unless we like to view all metrics recorded, we would need to know +the names of the events that have been enabled. This is tedious and also not +portable in case we would like to repeat this experiment on another processor. + +@IndexSubentry{Hardware event counters, @code{hwc} metric} +This is where the special @code{hwc} metric comes very handy. It +automatically expands to the active set of events used. + +With this, it is very easy to display the event counter values. Note that +although the regular clock based profiling was enabled, we only want to see +the counter values. We also request to see the percentages and limit the +output to the first 5 lines: + +@cartouche +@smallexample +$ exp=mxv.hwc.def.2.thr.er +$ gprofng display text -metrics e.%hwc -limit 5 -functions $exp +@end smallexample +@end cartouche + +@smallexample +@verbatim +Current metrics: e.%cycles:e+%insts:e+%llm:name +Current Sort Metric: Exclusive CPU Cycles ( e.%cycles ) +Print limit set to 5 +Functions sorted by metric: Exclusive CPU Cycles + +Excl. CPU Excl. Instructions Excl. Last-Level Name +Cycles Executed Cache Misses + sec. % % % +2.691 100.00 7906475309 100.00 122658983 100.00 <Total> +2.598 96.54 7432724378 94.01 121745696 99.26 mxv_core +0.035 1.31 188860269 2.39 70084 0.06 erand48_r +0.026 0.95 73623396 0.93 763116 0.62 init_data +0.018 0.66 76824434 0.97 40040 0.03 drand48 +@end verbatim +@end smallexample + +As we have seen before, the first few lines echo the settings. +This includes a list with the hardware event counters used by +default. + +The table that follows makes it very easy to get an overview where the +time is spent and how many of the target events have occurred. + +As before, we can drill down deeper and see the same metrics at the source +line and instruction level. Other than using @code{hwc} in the metrics +definitions, nothing has changed compared to the previous examples: + +@cartouche +@smallexample +$ exp=mxv.hwc.def.2.thr.er +$ gprofng display text -metrics e.hwc -source mxv_core $exp +@end smallexample +@end cartouche + +This is the relevant part of the output. Since the lines get very long, +we have somewhat modified the lay-out: + +@smallexample +@verbatim + Excl. CPU Excl. Excl. + Cycles Instructions Last-Level + sec. Executed Cache Misses + <Function: mxv_core> + 0. 0 0 32. void __attribute__ ((noinline)) + mxv_core(...) + 0. 0 0 33. { + 0. 0 0 34. for (uint64_t i=...) { + 0. 0 0 35. double row_sum = 0.0; +## 1.872 7291879319 88150571 36. for (int64_t j=0; j<n; j++) + 0.725 140845059 33595125 37. row_sum += A[i][j]*b[j]; + 0. 0 0 38. c[i] = row_sum; + 39. } + 0. 0 0 40. } +@end verbatim +@end smallexample + +In a smiliar way we can display the event counter values at the instruction +level. Again we have modified the lay-out due to page width limitations: + +@cartouche +@smallexample +$ exp=mxv.hwc.def.2.thr.er +$ gprofng display text -metrics e.hwc -disasm mxv_core $exp +@end smallexample +@end cartouche + +@smallexample +@verbatim + Excl. CPU Excl. Excl. + Cycles Instructions Last-Level + sec. Executed Cache Misses + <Function: mxv_core> + 0. 0 0 [33] 4021ba: mov 0x8(%rsp),%r10 + 34. for (uint64_t i=...) { + 0. 0 0 [34] 4021bf: cmp %rsi,%rdi + 0. 0 0 [34] 4021c2: jbe 0x37 + 0. 0 0 [34] 4021c4: ret + 35. double row_sum = 0.0; + 36. for (int64_t j=0; j<n; j++) + 37. row_sum += A[i][j]*b[j]; + 0. 0 0 [37] 4021c5: mov (%r8,%rdi,8),%rdx + 0. 0 0 [36] 4021c9: mov $0x0,%eax + 0. 0 0 [35] 4021ce: pxor %xmm1,%xmm1 + 0.002 12804230 321394 [37] 4021d2: movsd (%rdx,%rax,8),%xmm0 + 0.141 60819025 3866677 [37] 4021d7: mulsd (%r9,%rax,8),%xmm0 + 0.582 67221804 29407054 [37] 4021dd: addsd %xmm0,%xmm1 +## 1.871 7279075109 87989870 [36] 4021e1: add $0x1,%rax + 0.002 12804210 80351 [36] 4021e5: cmp %rax,%rcx + 0. 0 0 [36] 4021e8: jne 0xffffffffffffffea + 38. c[i] = row_sum; + 0. 0 0 [38] 4021ea: movsd %xmm1,(%r10,%rdi,8) + 0. 0 0 [34] 4021f0: add $0x1,%rdi + 0. 0 0 [34] 4021f4: cmp %rdi,%rsi + 0. 0 0 [34] 4021f7: jb 0xd + 0. 0 0 [35] 4021f9: pxor %xmm1,%xmm1 + 0. 0 0 [36] 4021fd: test %rcx,%rcx + 0. 0 80350 [36] 402200: jne 0xffffffffffffffc5 + 0. 0 0 [36] 402202: jmp 0xffffffffffffffe8 + 39. } + 40. } + 0. 0 0 [40] 402204: ret +@end verbatim +@end smallexample + +So far we have used the default settings for the event counters. It is +quite straightforward to select specific counters. For sake of the +example, let's assume we would like to count how many branch instructions +and retired memory load instructions that missed in the L1 cache have been +executed. We also want to count these events with a high resolution. + +This is the command to do so: + +@cartouche +@smallexample +$ exe=mxv-pthreads +$ m=8000 +$ n=4000 +$ exp=mxv.hwc.sel.2.thr.er +$ hwc1=br_ins,hi +$ hwc2=mem_load_retired.l1_miss,hi +$ gprofng collect app -O $exp -h $hwc1 -h $hwc2 $exe -m $m -n $n -t 2 +@end smallexample +@end cartouche + +As before, we get a table with the event counts. Due to the very +long name for the second counter, we have somewhat modified the +output. + +@cartouche +@smallexample +$ gprofng display text -limit 10 -functions mxv.hwc.sel.2.thr.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +Functions sorted by metric: Exclusive Total CPU Time +Excl. Incl. Excl. Branch Excl. Name +Total Total Instructions mem_load_retired.l1_miss +CPU sec. CPU sec. Events +2.597 2.597 1305305319 4021340 <Total> +2.481 2.481 1233233242 3982327 mxv_core +0.040 0.107 19019012 9003 init_data +0.028 0.052 23023048 15006 erand48_r +0.024 0.024 19019008 9004 __drand48_iterate +0.015 0.067 11011009 2998 drand48 +0.008 0.010 0 3002 _int_malloc +0.001 0.001 0 0 brk +0.001 0.002 0 0 sysmalloc +0. 0.001 0 0 __default_morecore +@end verbatim +@end smallexample + +@IndexSubentry{Options, @code{-compare}} +@IndexSubentry{Commands, @code{compare}} +When using event counters, the values could be very large and it is not easy +to compare the numbers. As we will show next, the @code{ratio} feature is +very useful when comparing such profiles. + +To demonstrate this, we have set up another event counter experiment where +we would like to compare the number of last level cache miss and the number +of branch instructions executed when using a single thread, or two threads. + +These are the commands used to generate the experiment directories: + +@cartouche +@smallexample +$ exe=./mxv-pthreads +$ m=8000 +$ n=4000 +$ exp1=mxv.hwc.comp.1.thr.er +$ exp2=mxv.hwc.comp.2.thr.er +$ gprofng collect app -O $exp1 -h llm -h br_ins $exe -m $m -n $n -t 1 +$ gprofng collect app -O $exp2 -h llm -h br_ins $exe -m $m -n $n -t 2 +@end smallexample +@end cartouche + +The following script has been used to get the tables. Due to lay-out +restrictions, we have to create two tables, one for each counter. + +@cartouche +@smallexample +# Limit the output to 5 lines +limit 5 +# Define the metrics +metrics name:e.llm +# Set the comparison to ratio +compare ratio +functions +# Define the metrics +metrics name:e.br_ins +# Set the comparison to ratio +compare ratio +functions +@end smallexample +@end cartouche + +Note that we print the name of the function first, followed by the counter +data. +The new element is that we set the comparison mode to @code{ratio}. This +divides the data in a column by its counterpart in the reference experiment. + +This is the command using this script and the two experiment directories as +input: + +@cartouche +@smallexample +$ gprofng display text -script my-script-comp-counters \ + mxv.hwc.comp.1.thr.er \ + mxv.hwc.comp.2.thr.er +@end smallexample +@end cartouche + +By design, we get two tables, one for each counter: + +@smallexample +@verbatim +Functions sorted by metric: Exclusive Last-Level Cache Misses + + mxv.hwc.comp.1.thr.er mxv.hwc.comp.2.thr.er +Name Excl. Last-Level Excl. Last-Level + Cache Misses Cache Misses + ratio + <Total> 122709276 x 0.788 + mxv_core 121796001 x 0.787 + init_data 723064 x 1.055 + erand48_r 100111 x 0.500 + drand48 60065 x 1.167 + +Functions sorted by metric: Exclusive Branch Instructions + + mxv.hwc.comp.1.thr.er mxv.hwc.comp.2.thr.er +Name Excl. Branch Excl. Branch + Instructions Instructions + ratio + <Total> 1307307316 x 0.997 + mxv_core 1235235239 x 0.997 + erand48_r 23023033 x 0.957 + drand48 20020009 x 0.600 + __drand48_iterate 17017028 x 0.882 +@end verbatim +@end smallexample + +A ratio less than one in the second column, means that this counter +value was smaller than the value from the reference experiment shown +in the first column. + +This kind of presentation of the results makes it much easier to +quickly interpret the data. + +We conclude this section with thread-level event counter overviews, +but before we go into this, there is an important metric we need to +mention. + +@c -- TBD Explain <Total> for IPC + +@IndexSubentry{Hardware event counters, IPC} +In case it is known how many instructions and CPU cycles have been executed, +the value for the IPC (``Instructions Per Clockycle'') can be computed. +@xref{Hardware Event Counters Explained}. +This is a derived metric that gives an indication how well the processor +is utilized. The inverse of the IPC is called CPI. +@IndexSubentry{Hardware event counters, CPI} + +The @DisplayText{} command automatically computes the IPC and CPI values +if an experiment contains the event counter values for the instructions +and CPU cycles executed. These are part of the metric list and can be +displayed, just like any other metric. + +@IndexSubentry{Options, @code{-metric_list}} +@IndexSubentry{Commands, @code{metric_list}} +This can be verified through the @command{metric_list} command. If we go +back to our earlier experiment with the default event counters, we get +the following result. + +@cartouche +@smallexample +$ gprofng display text -metric_list mxv.hwc.def.2.thr.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +Current metrics: e.totalcpu:i.totalcpu:e.cycles:e+insts:e+llm:name +Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu ) +Available metrics: + Exclusive Total CPU Time: e.%totalcpu + Inclusive Total CPU Time: i.%totalcpu + Exclusive CPU Cycles: e.+%cycles + Inclusive CPU Cycles: i.+%cycles + Exclusive Instructions Executed: e+%insts + Inclusive Instructions Executed: i+%insts +Exclusive Last-Level Cache Misses: e+%llm +Inclusive Last-Level Cache Misses: i+%llm + Exclusive Instructions Per Cycle: e+IPC + Inclusive Instructions Per Cycle: i+IPC + Exclusive Cycles Per Instruction: e+CPI + Inclusive Cycles Per Instruction: i+CPI + Size: size + PC Address: address + Name: name +@end verbatim +@end smallexample + +Among the other metrics, we see the new metrics for the IPC and CPI +listed. + +In the script below, we use this information and add the IPC and CPI +to the metrics to be displayed. We also use a the thread filter to +display these values for the individual threads. + +This is the complete script we have used. Other than a different selection +of the metrics, there are no new features. + +@cartouche +@smallexample +# Define the metrics +metrics e.insts:e.%cycles:e.IPC:e.CPI +# Sort with respect to cycles +sort e.cycles +# Limit the output to 5 lines +limit 5 +# Get the function overview for all threads +functions +# Get the function overview for thread 1 +thread_select 1 +functions +# Get the function overview for thread 2 +thread_select 2 +functions +# Get the function overview for thread 3 +thread_select 3 +functions +@end smallexample +@end cartouche + +In the metrics definition on the second line, we explicitly request the +counter values for the instructions (@code{e.insts}) and CPU cycles +(@code{e.cycles}) executed. These names can be found in output from the +@IndexSubentry{Options, @code{-metric_list}} +@IndexSubentry{Commands, @code{metric_list}} +@command{metric_list} command above. +In addition to these metrics, we also request the IPC and CPI to be shown. + +@IndexSubentry{Options, @code{-limit}} +@IndexSubentry{Commands, @code{limit}} +As before, we used the @command{limit} command to control the number of +functions displayed. We then request an overview for all the threads, +followed by three sets of two commands to select a thread and display the +function overview. + +The script above is used as follows: + +@cartouche +@smallexample +$ gprofng display text -script my-script-ipc mxv.hwc.def.2.thr.er +@end smallexample +@end cartouche + +@noindent +This script produces four tables. We list them separately below, +and have left out the additional output. + +@noindent +The first table shows the accumulated values across the three +threads that have been active. + +@smallexample +@verbatim +Functions sorted by metric: Exclusive CPU Cycles + +Excl. Excl. CPU Excl. Excl. Name +Instructions Cycles IPC CPI +Executed sec. % +7906475309 2.691 100.00 1.473 0.679 <Total> +7432724378 2.598 96.54 1.434 0.697 mxv_core + 188860269 0.035 1.31 2.682 0.373 erand48_r + 73623396 0.026 0.95 1.438 0.696 init_data + 76824434 0.018 0.66 2.182 0.458 drand48 +@end verbatim +@end smallexample + +@noindent +This shows that IPC of this program is completely dominated +by function @code{mxv_core}. It has a fairly low IPC value +of 1.43. + +@noindent +The next table is for thread 1 and shows the values for the +main thread. + +@smallexample +@verbatim +Exp Sel Total +=== === ===== + 1 1 3 +Functions sorted by metric: Exclusive CPU Cycles + +Excl. Excl. CPU Excl. Excl. Name +Instructions Cycles IPC CPI +Executed sec. % +473750931 0.093 100.00 2.552 0.392 <Total> +188860269 0.035 37.93 2.682 0.373 erand48_r + 73623396 0.026 27.59 1.438 0.696 init_data + 76824434 0.018 18.97 2.182 0.458 drand48 +134442832 0.013 13.79 5.250 0.190 __drand48_iterate +@end verbatim +@end smallexample + +@noindent +Although this thread hardly uses any CPU cycles, the overall IPC +of 2.55 is not all that bad. + +@noindent +Last, we show the tables for threads 2 and 3: + +@smallexample +@verbatim +Exp Sel Total +=== === ===== + 1 2 3 +Functions sorted by metric: Exclusive CPU Cycles + +Excl. Excl. CPU Excl. Excl. Name +Instructions Cycles IPC CPI +Executed sec. % +3716362189 1.298 100.00 1.435 0.697 <Total> +3716362189 1.298 100.00 1.435 0.697 mxv_core + 0 0. 0. 0. 0. collector_root + 0 0. 0. 0. 0. driver_mxv + +Exp Sel Total +=== === ===== + 1 3 3 +Functions sorted by metric: Exclusive CPU Cycles + +Excl. Excl. CPU Excl. Excl. Name +Instructions Cycles IPC CPI +Executed sec. % +3716362189 1.300 100.00 1.433 0.698 <Total> +3716362189 1.300 100.00 1.433 0.698 mxv_core + 0 0. 0. 0. 0. collector_root + 0 0. 0. 0. 0. driver_mxv +@end verbatim +@end smallexample + +It is seen that both execute the same number of instructions and +take about the same number of CPU cycles. As a result, the IPC is +the same for both threads. + +@c -- A new node -------------------------------------------------------------- +@c TBD @node Additional Features +@c TBD @section Additional Features +@c ---------------------------------------------------------------------------- + +@c -- A new node -------------------------------------------------------------- +@c TBD @node More Filtering Capabilities +@c TBD @subsection More Filtering Capabilities +@c ---------------------------------------------------------------------------- + +@c TBD Cover @code{samples} and @code{seconds} + +@c -- A new node -------------------------------------------------------------- +@node Java Profiling +@section Java Profiling +@c ---------------------------------------------------------------------------- + +@IndexSubentry{Options, @code{-j}} +@IndexSubentry{Java profiling, @code{-j on/off}} +The @CollectApp{} command supports Java profiling. The @code{-j on} option +can be used for this, but since this feature is enabled by default, there is +no need to set this explicitly. Java profiling may be disabled through the +@code{-j off} option. + +The program is compiled as usual and the experiment directory is created +similar to what we have seen before. The only difference with a C/C++ +application is that the program has to be explicitly executed by java. + +For example, this is how to generate the experiment data for a Java +program that has the source code stored in file @code{Pi.java}: + +@cartouche +@smallexample +$ javac Pi.java +$ gprofng collect app -j on -O pi.demo.er java Pi < pi.in +@end smallexample +@end cartouche + +Regarding which java is selected to generate the data, @ToolName{} +first looks for the JDK in the path set in either the +@IndexSubentry{Java profiling, @code{JDK_HOME}} +@code{JDK_HOME} environment variable, or in the +@IndexSubentry{Java profiling, @code{JAVA_PATH}} +@code{JAVA_PATH} environment variable. If neither of these variables is +set, it checks for a JDK in the search path (set in the PATH +environment variable). If there is no JDK in this path, it checks for +the java executable in @code{/usr/java/bin/java}. + +In case additional options need to be passed on to the JVM, the +@IndexSubentry{Options, @code{-J}} +@IndexSubentry{Java profiling, @code{-J <string>}} +@code{-J <string>} option can be used. The string with the +option(s) has to be delimited by quotation marks in case +there is more than one argument. + +The @DisplayText{} command may be used to view the performance data. There is +no need for any special options and the same commands as previously discussed +are supported. + +@IndexSubentry{Options, @code{-viewmode}} +@IndexSubentry{Commands, @code{viewmode}} +@IndexSubentry{Java profiling, different view modes} +The @code{viewmode} command +@xref{The Viewmode} +is very useful to examine the call stacks. + +For example, this is how one can see the native call stacks. For +lay-out purposes we have restricted the list to the first five entries: + +@cartouche +@smallexample +$ gprofng display text -limit 5 -viewmode machine -calltree pi.demo.er +@end smallexample +@end cartouche + +@smallexample +@verbatim +Print limit set to 5 +Viewmode set to machine +Functions Call Tree. Metric: Attributed Total CPU Time + +Attr. Name +Total +CPU sec. +1.381 +-<Total> +1.171 +-Pi.calculatePi(double) +0.110 +-collector_root +0.110 | +-JavaMain +0.070 | +-jni_CallStaticVoidMethod +@end verbatim +@end smallexample + +@noindent +Note that the selection of the viewmode is echoed in the output. + +@c -- A new node -------------------------------------------------------------- +@node The gprofng Tools +@chapter The gprofng Tools +@c ---------------------------------------------------------------------------- + +Several tools are included in @ProductName{}. In subsequent chapters these +are discussed in detail. Below a brief description is given, followed by an +overview of the environment variables that are supported. + +@c -- A new node -------------------------------------------------------------- +@node Tools Overview +@section Tools Overview +@c ---------------------------------------------------------------------------- + +The following tools are supported by @ProductName{}: + +@table @code + +@item @CollectApp{} +@IndexSubentry{@code{gprofng}, @code{collect app}} + +Collects the performance data and stores the results in an experiment +directory. There are many options on this tool, but quite often the +defaults are sufficient. +An experiment directory is required for the subsequent analysis of +the results. + +@item @DisplayText{} +@IndexSubentry{@code{gprofng}, @code{display text}} +Generates performance reports in ASCII format. Commandline +options, and/or commands in a script file are used to control the contents +and lay-out of the generated report(s). + +@item @DisplayHTML{} +@IndexSubentry{@code{gprofng}, @code{display html}} +Takes one or more experiment directories and generates a directory with +HTML files. Starting from the index.html file, the performance data +may be examined in a browser. + +@item @DisplaySRC{} +@IndexSubentry{@code{gprofng}, @code{display src}} +Displays the source code, interleaved with the disassembled instructions. + +@item @Archive{} +@IndexSubentry{@code{gprofng}, @code{archive}} +Archives an experiment directory by (optionally) including source code and +object files, as well as the shared libraries that have been used. + +@end table + +@c -- A new section ----------------------------------------------------------- +@node The gprofng.rc file with default settings +@section The gprofng.rc file with default settings +@c ---------------------------------------------------------------------------- +The @file{gprofng.rc} +@cindex gprofng.rc +file is used to define default settings for the @DisplayText{} and +@DisplaySRC{} tools, but the user can override these defaults through local +configuration files. + +There are three files that are checked when the tool starts up. The first +file has pre-defined settings and comes with the installation, but through +a hidden file called @file{.gprofng.rc}, the user can (re)define the defaults: + +These are the locations and files that are checked upon starting the above +mentioned tools: + +@enumerate + +@item +The system-wide filename is called @file{gprofng.rc} and is located in +the top level @file{/etc} directory. + +If @ProductName{} has been built from the source, this file is in +subdirectory @file{etc} in the top level installation directory. + +@item +The user's home directory may have a hidden file called @file{.gprofng.rc}. + +@item +The directory where @DisplayText{} (or @DisplaySRC{}) is invoked from may +have a hidden file called @file{.gprofng.rc}. + +@end enumerate + +The settings of each file override the settings of the file(s) read before it. +Defaults in the system-wide file are overruled by the file in the user home +directory (if any) and any settings in the @file{.gprofng.rc} file in the +current directory override those. + +Note that the settings in these files only affect the defaults. Unlike +the commands used in a script file, they are not commands for the tools. + +@c -- TBD indxobj_define, +@c -- TBD object_show, +@c -- TBD object_hide, +@c -- TBD object_api, + +The @file{.gprofng.rc} configuration files can contain the +@command{addpath}, +@command{compare}, +@command{dthresh}, +@command{name}, +@command{pathmap}, +@command{printmode}, +@command{sthresh}, +and +@command{viewmode} +commands as described in this user guide. + +They can also contain the following commands, @emph{which cannot be used on +either the command line, or in a script file}: + +@table @code + +@item dmetrics @var{metric-spec} +@IndexSubentry{Commands, @code{dmetrics}} + +Specify the default metrics to be displayed or printed in the function list. +The syntax and use of the metric list is described in section +@ref{Metric Definitions}. +The order of the metric keywords in the list determines the order in which +the metrics are presented. + +Default metrics for the @code{callers-callees} list are derived from the +function list default metrics by adding the corresponding attributed metric +before the first occurrence of each metric name in the list. + +@item dsort @var{metric-spec} +@IndexSubentry{Commands, @code{dsort}} + +Specify the default metric by which the function list is sorted. The sort +metric is the first metric in this list that matches a metric in any loaded +experiment, subject to the following conditions: + +@itemize @bullet + +@item +If the entry in @var{metric-spec} has a visibility string of an exclamation +point (@samp{!}), the first metric whose name matches is used, regardless of +whether it is visible. + +@item +If the entry in @var{metric-spec} has any other visibility string, the first +visible metric whose name matches is used. + +@end itemize + +The syntax and use of the metric list is described in section +@ref{Metric Definitions}. +The default sort metric for the @code{callers-callees} list is the attributed +metric corresponding to the default sort metric for the function list. + +@item en_desc @{on | off | =@var{regex}@} +@IndexSubentry{Commands, @code{en_desc}} + +Set the mode for reading descendant experiments to @samp{on} (enable all +descendants) or @samp{off} to disable all descendants. If +@samp{=}@var{regex} is used, enable data from those experiments whose +executable name matches the regular expression. + +The default setting is @samp{on} to follow all descendants. In reading +experiments with descendants, any sub-experiments that contain little or +no performance data are ignored by @DisplayText{}. + +@end table + +@c -- A new section ----------------------------------------------------------- +@node Filters +@section Filters +@c ---------------------------------------------------------------------------- + +Various filter commands are supported by @DisplayText{}. +Thanks to the use of filters, the user can zoom in on a certain area of +interest. With filters, it is possible to select one or more threads to +focus on, define a window in time, select specific call stacks, etc. +@IndexSubentry{Filters, Intro} + +While already powerful by themselves, filters may be combined to further +narrow down the view into the data. + +@IndexSubentry{Filters, Persistence} +It is important to note that filters are @emph{persistent}. A filter is +active until it is reset. This means that successive filter commands +increasingly narrow down the view until one or more are reset. + +@noindent +An example is the following: + +@cartouche +@smallexample +$ gprofng display text -thread_select 1 -functions \ + -cpu_select 2 -functions @dots{} +@end smallexample +@end cartouche + +This command selects thread 1 and requests the function view for this thread. +The third (@command{cpu_select 2}) command @emph{adds} the +constraint that only the events on CPU 2 are to be selected. This means +that the next function view selects events that were executed by thread 1 and +have been running on CPU 2. + +@noindent +In contrast with this single command line, the two commands below look similar, +but behave very differently: + +@cartouche +@smallexample +$ gprofng display text -thread_select 1 -functions @dots{} +$ gprofng display text -cpu_select 2 -functions @dots{} +@end smallexample +@end cartouche + +The first command displays the function view for thread 1. The second +command shows the function view for CPU 2 for @emph{all} threads that have +been running on this CPU. + +As the following example demonstrates, things get a little more tricky in +case a script file is used. Consider the following script file: + +@smallexample +@verbatim +thread_select 1 +functions +cpu_select 2 +functions +@end verbatim +@end smallexample + +This script file displays the function view for thread 1 first. This is +followed by those functions that were executed by thread 1 @emph{and} have +been run on CPU 2. + +If however, the script should behave like the two command line invocations +shown above, the thread selection filter needs to be reset before CPU 2 is +selected: + +@smallexample +@verbatim +thread_select 1 +functions +# Reset the thread selection filter: +thread_select all +cpu_select 2 +functions +@end verbatim +@end smallexample + +In general, filters behave differently than commands or options. In +particular there may be an interaction between different filter definitions. + +For example, as explained above, in the first script file the +@command{thread_select} and @command{cpu_select} commands interact. + +For a list of all the predefined filters see @ref{Predefined Filters}. + +@c -- A new section ----------------------------------------------------------- +@node Supported Environment Variables +@section Supported Environment Variables +@c ---------------------------------------------------------------------------- + +Various environment variables are supported. We refer to the man page for +gprofng(1) for an overview and description +(@xref{Man page for gprofng}). + +@c -- A new chapter ----------------------------------------------------------- +@node Performance Data Collection +@chapter Performance Data Collection +@c ---------------------------------------------------------------------------- + +The @CollectApp{} command is used to gather the application performance data +while the application executes. +At regular intervals, program execution is halted and the required data is +recorded. +@cindex Experiment directory +An experiment directory is created when the tool starts. This directory is +used to store the relevant information and forms the basis for a subsequent +analysis with one of the viewing tools. + +@c -- A new section ----------------------------------------------------------- +@node The @CollectApp{} command +@section The @CollectApp{} command +@c ---------------------------------------------------------------------------- + +This is the command to collect the performance information for the target +application. The usage is as follows: + +@cartouche +@smallexample +$ gprofng collect app [OPTION(S)] TARGET [TARGET_ARGUMENTS] +@end smallexample +@end cartouche + +@noindent +Options to the command are passed in first. This is followed by the name of +the target, which is typically a binary executable or a script, followed by +any options that may be required by the target. + +@c -- A new section ----------------------------------------------------------- +@node View the Performance Information +@chapter View the Performance Information +@c ---------------------------------------------------------------------------- +Various tools to view the performance data stored in one or more experiment +directories are available. In this chapter, these will all be covered in +detail. + +@c -- A new section ----------------------------------------------------------- +@node The @code{gprofng display text} Tool +@section The @code{gprofng display text} Tool +@c ---------------------------------------------------------------------------- + +This tool displays the performance information in ASCII format. It supports +a variety of views into the data recorded. +These views can be specified in two ways and both may be used simultaneously: + +@itemize @bullet + +@item +Command line options start with a dash (@samp{-}) symbol and may take an +argument. + +@item +Options may also be included in a file, the ``script file''. In this case, +the dash symbol should @emph{not} be included. Multiple script files can +be used on the same command line. + +@end itemize + +While they may appear as an option, they are really commands and this is +why they will be referred to as @emph{commands} +@cindex Commands +in the documentation. + +As a general rule, @emph{the order of options matters} and if the same option, +or command, occurs multiple times, the rightmost setting is selected. + +@c -- A new sub section ------------------------------------------------------- +@node The @code{gprofng display text} Commands +@subsection The @code{gprofng display text} Commands +@c ---------------------------------------------------------------------------- + +The most commonly used commands are documented in the man page for this tool +(@xref{gprofng display text}). In this section we list and describe all other +commands that are supported. + +@c -- A new sub subsection ---------------------------------------------------- +@node Commands that List Experiment Details +@unnumberedsubsubsec Commands that List Experiment Details +@c ---------------------------------------------------------------------------- + +@table @code + +@item experiment_ids +@ifclear man +@IndexSubentry{Options, @code{-experiment_ids}} +@IndexSubentry{Commands, @code{experiment_ids}} +@end ifclear + +For each experiment that has been loaded, show the totals of the metrics +recorded, plus some other operational characteristics like the name of +the executable, PID, etc. The top line contains the accumulated totals +for the metrics. + +@item experiment_list +@ifclear man +@IndexSubentry{Options, @code{-experiment_list}} +@IndexSubentry{Commands, @code{experiment_list}} +@end ifclear + +Display the list of experiments that are loaded. Each experiment is listed +with an index, which is used when selecting samples, threads, or LWPs, and +a process id (PID), which can be used for advanced filtering. + +@item cpu_list +@IndexSubentry{Options, @code{-cpu_list}} +@IndexSubentry{Commands, @code{cpu_list}} + +Display the total number of CPUs that have been used during the experiment(s). + +@item cpus +@IndexSubentry{Options, @code{-cpus}} +@IndexSubentry{Commands, @code{cpus}} + +Show a list of CPUs that were used by the application, along with the metrics +that have been recorded. The CPUs are represented by a CPU number and show the +Total CPU time by default. + +Note that since the data is sorted with respect to the default metric, it may +be useful to use the @command{sort name} command to show the list sorted with +respect to the CPU id. + +@item GCEvents +@IndexSubentry{Options, @code{-GCEvents}} +@IndexSubentry{Commands, @code{GCEvents}} + +This commands is for Java applications only. It shows any Garbage Collection +(GC) events that have occurred while the application was executing.. + +@item lwp_list +@IndexSubentry{Options, @code{-lwp_list}} +@IndexSubentry{Commands, @code{lwp_list}} + +Displays the list of LWPs processed during the experiment(s). + +@item processes +@IndexSubentry{Options, @code{-processes}} +@IndexSubentry{Commands, @code{processes}} + +For each experiment that has been loaded, this command displays a list of +processes that were created by the application, along with their metrics. +The processes are represented by process ID (PID) numbers and show the +Total CPU time metric by default. If additional metrics are recorded in +an experiment, these are shown as well. + +@item samples +@IndexSubentry{Options, @code{-samples}} +@IndexSubentry{Commands, @code{samples}} + +Display a list of sample points and their metrics, which reflect the +microstates recorded at each sample point in the loaded experiment. +The samples are represented by sample numbers and show the Total CPU time +by default. Other metrics might also be displayed if enabled. + +@item sample_list +@IndexSubentry{Options, @code{-sample_list}} +@IndexSubentry{Commands, @code{sample_list}} + +For each experiment loaded, display the list of samples currently selected. + +@item seconds +@IndexSubentry{Options, @code{-seconds}} +@IndexSubentry{Commands, @code{seconds}} + +Show each second of the profiling run that was captured in the experiment, +along with the metrics collected in that second. The seconds view differs +from the samples view in that it shows periodic samples that occur every +second beginning at 0 and the interval cannot be changed. + +The seconds view lists the seconds of execution with the Total CPU time by +default. Other metrics might also be displayed if the metrics are present +in the loaded experiments. + +@item threads +@IndexSubentry{Options, @code{-threads}} +@IndexSubentry{Commands, @code{threads}} + +Show a list of threads and their metrics. The threads are represented +by a process and thread pair and show the Total CPU time by default. +Other metrics might also be displayed by default if the metrics are +present in the loaded experiment. + +@item thread_list +@IndexSubentry{Options, @code{-thread_list}} +@IndexSubentry{Commands, @code{thread_list}} + +Display the list of threads currently selected for the analysis. + +@end table + +@noindent +@emph{The commands below are for use in scripts and interactive mode only. +They are not allowed on the command line.} + +@table @code + +@item add_exp @var{exp-name} +@IndexSubentry{Commands, @code{add_exp}} + +Add the named experiment to the current session. + +@item drop_exp @var{exp-name} +@IndexSubentry{Commands, @code{drop_exp}} + +Drop the named experiment from the current session. + +@item open_exp @var{exp-name} +@IndexSubentry{Commands, @code{open_exp}} + +Drop all loaded experiments from the session, and then load the named +experiment. + +@end table + +@c -- A new sub subsection ---------------------------------------------------- +@node Commands that Affect Listings and Output +@unnumberedsubsubsec Commands that Affect Listings and Output +@c ---------------------------------------------------------------------------- + +@table @code + +@item dthresh @var{value} +@IndexSubentry{Options, @code{-dthresh}} +@IndexSubentry{Commands, @code{dthresh}} + +Specify the threshold percentage for highlighting metrics in the annotated +disassembly code. If the value of any metric is equal to or greater than +@var{value} as a percentage of the maximum value of that metric for any +instruction line in the file, the line on which the metrics occur has a +@samp{##} marker inserted at the beginning of the line. The default is 75. + +@item printmode @{text | html | @var{single-char}@} +@IndexSubentry{Options, @code{-printmode}} +@IndexSubentry{Commands, @code{printmode}} + +Set the print mode. If the keyword is @code{text}, printing will be done in +tabular form using plain text. In case the @code{html} keyword is selected, +the output is formatted as an HTML table. + +Alternatively, @var{single-char} may be used in a delimiter separated list, +with the single character @var{single-char} as the delimiter. + +The printmode setting is used only for those commands that generate tables, +such as @command{functions}. The setting is ignored for other printing +commands, including those showing source and disassembly listings. + +@item sthresh @var{value} +@IndexSubentry{Options, @code{-sthresh}} +@IndexSubentry{Commands, @code{sthresh}} + +Specify the threshold percentage for highlighting metrics in the annotated +source code. If the value of any metric is equal to or greater than +@var{value} (as a percentage) of the maximum value of that metric for any +source line in the file, the line on which the metrics occur has a @samp{##} +marker inserted at the beginning of the line. The default is 75. + +@end table + +@c -- A new sub subsection ---------------------------------------------------- +@node Predefined Filters +@unnumberedsubsubsec Predefined Filters +@c ---------------------------------------------------------------------------- + +The filters below use a list, the selection list, to define a sequence of +numbers. @xref{The Selection List}. +Note that this selection is persistent, but the filter can be reset by using +@samp{all} as the @var{selection-list}. + +@table @code + +@item cpu_select @var{selection-list} +@IndexSubentry{Options, @code{-cpu_select}} +@IndexSubentry{Commands, @code{cpu_select}} + +Select the CPU ids specified in the @var{selection-list}. + +@item lwp_select @var{selection-list} +@IndexSubentry{Options, @code{-lwp_select}} +@IndexSubentry{Commands, @code{lwp_select}} + +Select the LWPs specified in the @var{selection-list}. + +@item sample_select @var{selection-list} +@IndexSubentry{Options, @code{-sample-select}} +@IndexSubentry{Commands, @code{sample-select}} + +@item thread_select @var{selection-list} +@IndexSubentry{Options, @code{-thread_select}} +@IndexSubentry{Commands, @code{thread_select}} + +Select a series of threads, or just one, to be used in subsequent views. +The @var{selection-list} consists of a sequence of comma separated numbers. +This may include a range of the form @samp{n-m}. + +@end table + + +@c -- A new sub subsection ---------------------------------------------------- +@node Commands to Set and Change Search Paths +@unnumberedsubsubsec Commands to Set and Change Search Paths +@c ---------------------------------------------------------------------------- + +@table @code + +@item addpath @var{path-list} +@IndexSubentry{Options, @code{-addpath}} +@IndexSubentry{Commands, @code{addpath}} + +Append @var{path-list} to the current setpath settings. Note that multiple +@command{addpath} commands can be used in @file{.gprofng.rc} files, and will +be concatenated. + +@item pathmap @var{old-prefix} @var{new-prefix} +@IndexSubentry{Options, @code{-pathmap}} +@IndexSubentry{Commands, @code{pathmap}} + +If a file cannot be found using the path list set by @command{addpath}, or +the @command{setpath} command, one or more path remappings may be set with the +@command{pathmap} command. + +With path mapping, the user can specify how to replace the leading component +in a full path by a different string. + +With this command, any path name for a source file, object file, or shared +object that begins with the prefix specified with @var{old-prefix}, the +old prefix is replaced by the prefix specified with @var{new-prefix}. +The resulting path is used to find the file. + +For example, if a source file located in directory @file{/tmp} +is shown in the @DisplayText{} output, but should instead be taken from +@file{/home/demo}, the following @file{pathmap} command redefines the +path: + +@smallexample +$ gprofng diplay text -pathmap /tmp /home/demo -source ... +@end smallexample + +Note that multiple @command{pathmap} commands can be supplied, and each is +tried until the file is found. + +@item setpath @var{path-list} +@IndexSubentry{Options, @code{-setpath}} +@IndexSubentry{Commands, @code{setpath}} + +Set the path used to find source and object files. The path is defined +through the @var{path-list} keyword. It is a colon separated list of +directories, jar files, or zip files. +If any directory has a colon character in it, escape it with a +backslash (@samp{\}). + +The special directory name @code{$expts}, refers +to the set of current experiments in the order in which they were loaded. +You can abbreviate it with a single @samp{$} character. + +The default path is @samp{$expts:..} which is the directories of the +loaded experiments and the current working directory. + +Use @command{setpath} with no argument to display the current path. + +Note that @command{setpath} commands @emph{are not allowed .gprofng.rc +configuration files}. + +@end table + +@c -- A new subsection -------------------------------------------------------- +@c -- TBD @node Usage examples for @code{gprofng display text} +@c -- TBD @subsection Usage examples for @code{gprofng display text} +@c ---------------------------------------------------------------------------- + +@c -- TBD In this section we present usage examples. + +@c -- A new chapter ----------------------------------------------------------- +@c TBD @node The @code{gprofng display html} Tool +@c TBD @section The @code{gprofng display html} Tool +@c ---------------------------------------------------------------------------- +@c TBD The options are documented in the man page for this tool. In this section we +@c TBD present usage examples. + +@c -- A new chapter ----------------------------------------------------------- +@c TBD @node Display Source Code +@c TBD @chapter Display Source Code +@c ---------------------------------------------------------------------------- +@c TBD The options are documented in the man page for this tool. In this section we +@c TBD present usage examples. + + +@c -- A new chapter ----------------------------------------------------------- +@c TBD @node Archive Experiment Data +@c TBD @chapter Archive Experiment Data +@c ---------------------------------------------------------------------------- +@c TBD The options are documented in the man page for this tool. In this section we +@c TBD present usage examples. + +@c -- A new chapter ----------------------------------------------------------- +@node Terminology +@chapter Terminology +@c ---------------------------------------------------------------------------- + +Throughout this manual, certain terminology specific to profiling tools, +or @ToolName{}, or even to this document only, is used. In this chapter +this terminology is explained in detail. + +@menu +* The Program Counter:: What is a Program Counter? +* Inclusive and Exclusive Metrics:: An explanation of inclusive and exclusive metrics. +* Metric Definitions:: Definitions associated with metrics. +* The Viewmode:: Select the way call stacks are presented. +* The Selection List:: How to define a selection. +* Load Objects and Functions:: The components in an application. +* The Concept of a CPU in gprofng:: The definition of a CPU. +* Hardware Event Counters Explained:: What are event counters? +* apath:: Our generic definition of a path. +@end menu + +@c ---------------------------------------------------------------------------- +@node The Program Counter +@section The Program Counter +@c ---------------------------------------------------------------------------- + +@cindex PC +@cindex Program Counter +The @emph{Program Counter}, or PC for short, keeps track where program execution is. +The address of the next instruction to be executed is stored in a special +purpose register in the processor, or core. + +@cindex Instruction pointer +The PC is sometimes also referred to as the @emph{instruction pointer}, but +we will use Program Counter or PC throughout this document. + +@c ---------------------------------------------------------------------------- +@node Inclusive and Exclusive Metrics +@section Inclusive and Exclusive Metrics +@c ---------------------------------------------------------------------------- + +In the remainder, these two concepts occur quite often and for lack of a better +place, they are explained here. + +@cindex Inclusive metric +The @emph{inclusive} value for a metric includes all values that are part of +the dynamic extent of the target function. For example if function @code{A} +calls functions @code{B} and @code{C}, the inclusive CPU time for @code{A} +includes the CPU time spent in @code{B} and @code{C}. + +@cindex Exclusive metric +In contrast with this, the @emph{exclusive} value for a metric is computed +by excluding the metric values used by other functions called. In our imaginary +example, the exclusive CPU time for function @code{A} is the time spent outside +calling functions @code{B} and @code{C}. + +@cindex Leaf function +In case of a @emph{leaf function}, the inclusive and exclusive values for the +metric are the same since by definition, it is not calling any other +function(s). + +Why do we use these two different values? The inclusive metric shows the most +expensive path, in terms of this metric, in the application. For example, if +the metric is cache misses, the function with the highest inclusive metric +tells you where most of the cache misses come from. + +Within this branch of the application, the exclusive metric points to the +functions that contribute and help to identify which part(s) to consider +for further analysis. + +@c ---------------------------------------------------------------------------- +@node Metric Definitions +@section Metric Definitions +@c ---------------------------------------------------------------------------- +The metrics displayed in the various views are highly customizable. In this +section it is explained how to construct the metrics definition(s). + +@IndexSubentry{Options, @code{-metrics}} +@IndexSubentry{Commands, @code{metrics}} +The @command{metrics} command takes a colon (@samp{:}) separated list, where +each item in the list consists of the following three fields: +@var{<flavor>}@var{<visibility>}@var{<metric-name>}. + +@cindex Flavor field +@cindex Visibility field +@cindex Metric name field +@IndexSubentry{Metrics, Flavor field} +@IndexSubentry{Metrics, Visibility field} +@IndexSubentry{Metrics, Metric name field} +The @var{<flavor>} field is either @samp{e} for ``exclusive'', and/or +@samp{i} for ``inclusive''. The @var{<metric-name>} field is the name of +the metric and the @var{<visibility>} field consists of one ore more characters +from the following table: + +@table @code + +@item . +Show the metric as time. This applies to timing metrics and hardware event +counters that measure cycles. Interpret as @samp{+} for other metrics. + +@item % +Show the metric as a percentage of the total value for this metric. + +@item + +Show the metric as an absolute value. For hardware event counters this is +the event count. Interpret as @samp{.} for timing metrics. + +@item ! +Do not show any metric value. Cannot be used with other visibility characters. +This visibility is meant to be used in a @command{dmetrics} command to set +default metrics that override the built-in visibility defaults +for each type of metric. + +@end table + +Both the @var{<flavor>} and @var{<visibility>} strings may have more than one +character. If both strings have more than one character, the @var{<flavor>} +string is expanded first. For example, @code{ie.%user} is first expanded to +@code{i.%user:e.%user}, which is then expanded into +@code{i.user:i%user:e.user:e%user}. + +@c ---------------------------------------------------------------------------- +@node The Viewmode +@section The Viewmode + +@cindex Viewmode +@IndexSubentry{Options, @code{-viewmode}} +@IndexSubentry{Commands, @code{viewmode}} + +There are different ways to view a call stack in Java. In @ToolName{}, this +is called the @emph{viewmode} and the setting is controlled through a command +with the same name. + +The @code{viewmode} command takes one of the following keywords: + +@table @code + +@item user +This is the default and shows the Java call stacks for Java threads. +No call stacks for any housekeeping threads are shown. The function +list contains a function +@IndexSubentry{Java profiling, @code{<JVM-System>}} +@code{<JVM-System>} that represents the aggregated time from non-Java +threads. +When the JVM software does not report a Java call stack, time is reported +against the function +@IndexSubentry{Java profiling, @code{<no Java callstack recorded>}} +@code{<no Java callstack recorded>}. + +@item expert +Show the Java call stacks for Java threads when the Java code from the +user is executed and machine call stacks when JVM code is executed, or +when the JVM software does not report a Java call stack. +Show the machine call stacks for housekeeping threads. + +@item machine +Show the actual native call stacks for all threads. + +@end table + +@c ---------------------------------------------------------------------------- +@node The Selection List +@section The Selection List +@c ---------------------------------------------------------------------------- + +@cindex Selection list +@cindex List specification +Several commands allow the user to specify a sequence of numbers called the +@emph{selection list}. Such a list may for example be used to select specific +threads from all the threads that have been used when conducting the +experiment(s). + +A selection list (or ``list'' in the remainder of this section) can be a +single number, a contiguous range of numbers with the start and end numbers +separated by a hyphen (@samp{-}), a comma-separated list of numbers and +ranges, or the @code{all} keyword that resets the filter. +@IndexSubentry{Filters, Reset to default} +Lists must not contain spaces. + +Each list can optionally be preceded by an experiment list with a similar +format, separated from the list by a colon (:). +If no experiment list is included, the list applies to all experiments. + +Multiple lists can be concatenated by separating the individual lists +by a plus sign. + +These are some examples of various filters using a list: + +@table @code + +@item thread_select 1 +Select thread 1 from all experiments. + +@item thread_select all:1 +Select thread 1 from all experiments. + +@item thread_select 1:all +Select all the threads from the first experiment loaded. + +@item thread_select 1:2+3:4 +Select thread 2 from experiment 1 and thread 4 from experiment 3. + +@item cpu_select all:1,3,5 +Selects cores 1, 3, and 5 from all experiments. + +@item cpu_select 1,2:all +Select all cores from experiments 1 and 2. + +@end table + +Recall that there are several list commands that show the mapping between the +numbers and the targets. + +@IndexSubentry{Options, @code{-experiment_list}} +@IndexSubentry{Commands, @code{experiment_list}} +For example, the @command{experiment_list} command shows the name(s) of the +experiment(s) loaded and the associated number. In this example it is used +to get this information for a range of experiments: + +@cartouche +@smallexample +$ gprofng display text -experiment_list mxv.?.thr.er +@end smallexample +@end cartouche + +@noindent +This is the output, showing for each experiment the ID, the PID, and the name: + +@smallexample +@verbatim +ID Sel PID Experiment +== === ======= ============ + 1 yes 2750071 mxv.1.thr.er + 2 yes 1339450 mxv.2.thr.er + 3 yes 3579561 mxv.4.thr.er +@end verbatim +@end smallexample + +@c ---------------------------------------------------------------------------- +@node Load Objects and Functions +@section Load Objects and Functions +@c ---------------------------------------------------------------------------- + +An application consists of various components. The source code files are +compiled into object files. These are then glued together at link time to form +the executable. +During execution, the program may also dynamically load objects. + +@cindex Load object +A @emph{load object} is defined to be an executable, or shared object. A shared +library is an example of a load object in @ToolName{}. + +Each load object, contains a text section with the instructions generated by the +compiler, a data section for data, and various symbol tables. +All load objects must contain an +@cindex ELF +ELF +symbol table, which gives the names and addresses of all the globally known +functions in that object. + +Load objects compiled with the -g option contain additional symbolic information +that can augment the ELF symbol table and provide information about functions that +are not global, additional information about object modules from which the functions +came, and line number information relating addresses to source lines. + +The term +@cindex Function +@emph{function} +is used to describe a set of instructions that represent a high-level operation +described in the source code. The term also covers methods as used in C++ and in +the Java programming language. + +In the @ToolName{} context, functions are provided in source code format. +Normally their names appear in the symbol table representing a set of addresses. +@cindex Program Counter +@cindex PC +If the Program Counter (PC) is within that set, the program is executing within that function. + +In principle, any address within the text segment of a load object can be mapped to a +function. Exactly the same mapping is used for the leaf PC and all the other PCs on the +call stack. + +Most of the functions correspond directly to the source model of the program, but +there are exceptions. This topic is however outside of the scope of this guide. + +@c ---------------------------------------------------------------------------- +@node The Concept of a CPU in @ProductName{} +@section The Concept of a CPU in @ProductName{} +@c ---------------------------------------------------------------------------- + +@cindex CPU +In @ProductName{}, there is the concept of a CPU. Admittedly, this is not the +best word to describe what is meant here and may be replaced in the future. + +The word CPU is used in many of the displays. +In the context of @ProductName{}, it is meant to denote a part of the +processor that is capable of executing instructions and with its own state, +like the program counter. + +For example, on a contemporary processor, a CPU could be a core. In case +hardware threads are supported within a core, a CPU is one of those +hardware threads. + +To see which CPUs have been used in the experiment, use the @command{cpu} +command in @DisplayText{}. + +@c ---------------------------------------------------------------------------- +@node Hardware Event Counters Explained +@section Hardware Event Counters Explained +@c ---------------------------------------------------------------------------- + +@IndexSubentry{Hardware event counters, description} +For quite a number of years now, many microprocessors have supported hardware +event counters. + +On the hardware side, this means that in the processor there are one or more +registers dedicated to count certain activities, or ``events''. +Examples of such events are the number of instructions executed, or the number +of cache misses at level 2 in the memory hierarchy. + +While there is a limited set of such registers, the user can map events onto +them. In case more than one register is available, this allows for the +simultaenous measurement of various events. + +A simple, yet powerful, example is to simultaneously count the number of CPU +cycles and the number of instructions excuted. These two numbers can then be +used to compute the +@cindex IPC +@emph{IPC} value. IPC stands for ``Instructions Per Clockcycle'' and each processor +has a maximum. For example, if this maximum number is 2, it means the +processor is capable of executing two instructions every clock cycle. + +Whether this is actually achieved, depends on several factors, including the +instruction characteristics. +However, in case the IPC value is well below this maximum in a time critical +part of the application and this cannot be easily explained, further +investigation is probably warranted. + +@cindex CPI +A related metric is called @emph{CPI}, or ``Clockcycles Per Instruction''. +It is the inverse of the CPI and can be compared against the theoretical +value(s) of the target instruction(s). A significant difference may point +at a bottleneck. + +One thing to keep in mind is that the value returned by a counter can either +be the number of times the event occured, or a CPU cycle count. In case of +the latter it is possible to convert this number to time. + +@IndexSubentry{Hardware event counters, variable CPU frequency} +This is often easier to interpret than a simple count, but there is one +caveat to keep in mind. The CPU frequency may not have been constant while +the experimen was recorded and this impacts the time reported. + +These event counters, or ``counters'' for short, provide great insight into +what happens deep inside the processor. In case higher level information does +not provide the insight needed, the counters provide the information to get +to the bottom of a performance problem. + +There are some things to consider though. + +@itemize @bullet + +@item +The event definitions and names vary across processors and it may even happen +that some events change with an update. +Unfortunately and this is luckily rare, there are sometimes bugs causing the +wrong count to be returned. + +@IndexSubentry{Hardware event counters, alias name} +In @ToolName{}, some of the processor specific event names have an alias +name. For example @code{insts} measures the instructions executed. +These aliases not only makes it easier to identify the functionality, but also +provide portability of certain events across processors. + +@item +Another complexity is that there are typically many events one can monitor. +There may up to hundreds of events available and it could require several +experiments to zoom in on the root cause of a performance problem. + +@item +There may be restrictions regarding the mapping of event(s) onto the +counters. For example, certain events may be restricted to specific +counters only. As a result, one may have to conduct additional experiments +to cover all the events of interest. + +@item +The names of the events may also not be easy to interpret. In such cases, +the description can be found in the architecture manual for the processor. + +@end itemize + +Despite these drawbacks, hardware event counters are extremely useful and +may even turn out to be indispensable. + +@c ---------------------------------------------------------------------------- +@node apath +@section What is <apath>? +@c ---------------------------------------------------------------------------- + +In most cases, @ToolName{} shows the absolute pathnames of directories. These +tend to be rather long, causing display issues in this document. + +Instead of wrapping these long pathnames over multiple lines, we decided to +represent them by the @code{<apath>} symbol, which stands for ``an absolute +pathname''. + +Note that different occurrences of @code{<apath>} may represent different +absolute pathnames. + +@c -- A new node -------------------------------------------------------------- +@node Other Document Formats +@chapter Other Document Formats +@c ---------------------------------------------------------------------------- + +@emph{This chapter is applicable when building gprofng from the +binutils source.} + +This document is written in Texinfo and the source text is made available as +part of the binutils distribution. The file name is @code{gprofng.texi} and +can be found in subdirectory @code{gprofng/doc} of the top level binutils +directory. + +The default installation procedure creates a file in the @code{info} format and +stores it in the documentation section of binutils. +This source file can however also be used to generate the document in the +@code{html} and @code{pdf} formats. These may be easier to read and search. + +To generate this documentation file in a different format, go to the directory +that was used to build the tools. The make file to build the other formats is +in the @code{gprofng/doc} subdirectory. + +For example, if you have set the build directory to be @var{<my-build-dir>}, +go to subdirectory @var{<my-build-dir>/gprofng/doc}. + +This subdirectory has a single filed called @file{Makefile} that can be used to +build the documentation in various formats. We recommend to use these commands. + +There are four commands to generate the documentation in the @code{html} or +@code{pdf} format. It is assumed that you are in directory @code{gprofng/doc} +under the main directory @var{<my-build-dir>}. + +@table @code + +@item make html +Create the html file in the current directory. + +@item make pdf +Create the pdf file in the current directory. + +@item make install-html +Create and install the html file in the binutils documentation directory. + +@item make install-pdf +Creat and install the pdf file in the binutils documentation directory. + +@end table + +For example, to install this document in the binutils documentation directory, the +commands below may be executed. In this notation, @var{<format>} +is one of @code{html}, or @code{pdf}: + +@smallexample +@verbatim +$ cd <my-build-dir>/gprofng/doc +$ make install-<format> +@end verbatim +@end smallexample + +The binutils installation directory is either the default @code{/usr/local} or the one +that has been set with the @code{--prefix} option as part of the @code{configure} +command. In this example we symbolize this location with @code{<install>}. + +The documentation directory is @code{<install>/share/doc/gprofng} in case +@code{html} or @code{pdf} is selected and @code{<install>/share/info} for the +file in the @code{info} format. + +@noindent +Some things to note: + +@itemize + +@item +For the @code{pdf} file to be generated, the @code{texi2dvi} tool is required. +@cindex texi2dvi +It is for example available as part of the @code{texinfo-tex} package. + +@item +Instead of generating a single file in the @code{html} format, it is also +possible to create a directory with individual files for the various chapters. +To do so, remove the use of @code{--no-split} in variable @code{MAKEINFOHTML} +in the make file in the @code{<my-build-dir/gprofng/doc} directory. + +@end itemize + +@c -- An appendix ------------------------------------------------------------- +@node The @ProductName{} Man Pages +@appendix The @ProductName{} Man Pages +@c ---------------------------------------------------------------------------- + +In this appendix the man pages for the various @ProductName{} tools are listed. + +@c -- A new node -------------------------------------------------------------- +@c @node gprofng driver +@node Man page for gprofng +@section Man page for @command{gprofng} +@c ---------------------------------------------------------------------------- + +@include gprofng.texi + +@c -- A new node -------------------------------------------------------------- +@page +@node gprofng collect app +@section Man page for @command{gprofng collect app} +@c ---------------------------------------------------------------------------- + +@include gp-collect-app.texi + +@c -- A new node -------------------------------------------------------------- +@page +@node gprofng display text +@section Man page for @command{gprofng display text} +@c ---------------------------------------------------------------------------- + +@include gp-display-text.texi + +@c -- A new node -------------------------------------------------------------- +@page +@node gprofng display html +@section Man page for @command{gprofng display html} +@c ---------------------------------------------------------------------------- + +@include gp-display-html.texi + +@c -- A new node -------------------------------------------------------------- +@page +@node gprofng display src +@section Man page for @command{gprofng display src} +@c ---------------------------------------------------------------------------- + +@include gp-display-src.texi + +@c -- A new node -------------------------------------------------------------- +@page +@node gprofng archive +@section Man page for @command{gprofng archive} +@c ---------------------------------------------------------------------------- + +@include gp-archive.texi + +@ifnothtml +@node Index +@unnumbered Index +@printindex cp +@end ifnothtml + +@bye diff --git a/gprofng/doc/version.texi b/gprofng/doc/version.texi index 5bf2b08..18585d8 100644 --- a/gprofng/doc/version.texi +++ b/gprofng/doc/version.texi @@ -1,4 +1,4 @@ -@set UPDATED 28 September 2022 -@set UPDATED-MONTH September 2022 +@set UPDATED 14 April 2022 +@set UPDATED-MONTH April 2022 @set EDITION 2.40.50 -@set VERSION 2.40.50 +@set VERSION 2.0 diff --git a/gprofng/gp-display-html/Makefile.am b/gprofng/gp-display-html/Makefile.am index c79ecc8..f9b214e 100644 --- a/gprofng/gp-display-html/Makefile.am +++ b/gprofng/gp-display-html/Makefile.am @@ -27,35 +27,3 @@ do_subst = sed -e 's/BINUTILS_VERSION/$(VERSION)/' gp-display-html: gp-display-html.in Makefile $(do_subst) < $(srcdir)/gp-display-html.in > $@ chmod +x $@ - -if BUILD_MAN - -man_MANS = gp-display-html.1 -MAINTAINERCLEANFILES = $(man_MANS) - -# Use this if the man pages depend on the version number. -# common_mandeps = $(top_srcdir)/../bfd/version.m4 -# -# Also change the dependence line below to this: -# gp-display-html.1: $(common_mandeps) gp-display-html -# -# Currently, the version number shown in the man page is derived from -# the output printed with --version. - -# These variables are used by help2man to generate the man pages. - -INFO_PAGE = "gprofng" -MANUAL = "User Commands" -TEXT_GP_DISPLAY_HTML = "generate an HTML based directory structure to browse the profiles" - -HELP2MAN_OPT = --libtool --no-info --info-page=$(INFO_PAGE) --manual=$(MANUAL) -H2M_FILTER = | sed 's/\.TP/\.TP\n.B/' | sed 's/Commands:/\.SH COMMANDS/' \ - | sed 's/See also:/\.SH SEE ALSO/' | sed 's/Documentation:/.SH DOCUMENTATION/' \ - | sed 's/Limitations:/.SH LIMITATIONS/' - -gp-display-html.1: gp-display-html - $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ - --name=$(TEXT_GP_DISPLAY_HTML) ./gp-display-html $(H2M_FILTER) > $@ - -endif - diff --git a/gprofng/gp-display-html/Makefile.in b/gprofng/gp-display-html/Makefile.in index 42886f7..21cc1c6 100644 --- a/gprofng/gp-display-html/Makefile.in +++ b/gprofng/gp-display-html/Makefile.in @@ -150,7 +150,7 @@ am__uninstall_files_from_dir = { \ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ $(am__cd) "$$dir" && rm -f $$files; }; \ } -am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man1dir)" +am__installdirs = "$(DESTDIR)$(bindir)" SCRIPTS = $(bin_SCRIPTS) AM_V_P = $(am__v_P_@AM_V@) am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) @@ -171,9 +171,6 @@ am__can_run_installinfo = \ n|no|NO) false;; \ *) (install-info --version) >/dev/null 2>&1;; \ esac -man1dir = $(mandir)/man1 -NROFF = nroff -MANS = $(man_MANS) am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/../mkinstalldirs DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) @@ -328,27 +325,6 @@ ACLOCAL_AMFLAGS = -I . -I .. -I ../.. bin_SCRIPTS = gp-display-html CLEANFILES = $(bin_SCRIPTS) do_subst = sed -e 's/BINUTILS_VERSION/$(VERSION)/' -@BUILD_MAN_TRUE@man_MANS = gp-display-html.1 -@BUILD_MAN_TRUE@MAINTAINERCLEANFILES = $(man_MANS) - -# Use this if the man pages depend on the version number. -# common_mandeps = $(top_srcdir)/../bfd/version.m4 -# -# Also change the dependence line below to this: -# gp-display-html.1: $(common_mandeps) gp-display-html -# -# Currently, the version number shown in the man page is derived from -# the output printed with --version. - -# These variables are used by help2man to generate the man pages. -@BUILD_MAN_TRUE@INFO_PAGE = "gprofng" -@BUILD_MAN_TRUE@MANUAL = "User Commands" -@BUILD_MAN_TRUE@TEXT_GP_DISPLAY_HTML = "generate an HTML based directory structure to browse the profiles" -@BUILD_MAN_TRUE@HELP2MAN_OPT = --libtool --no-info --info-page=$(INFO_PAGE) --manual=$(MANUAL) -@BUILD_MAN_TRUE@H2M_FILTER = | sed 's/\.TP/\.TP\n.B/' | sed 's/Commands:/\.SH COMMANDS/' \ -@BUILD_MAN_TRUE@ | sed 's/See also:/\.SH SEE ALSO/' | sed 's/Documentation:/.SH DOCUMENTATION/' \ -@BUILD_MAN_TRUE@ | sed 's/Limitations:/.SH LIMITATIONS/' - all: all-am .SUFFIXES: @@ -422,49 +398,6 @@ mostlyclean-libtool: clean-libtool: -rm -rf .libs _libs -install-man1: $(man_MANS) - @$(NORMAL_INSTALL) - @list1=''; \ - list2='$(man_MANS)'; \ - test -n "$(man1dir)" \ - && test -n "`echo $$list1$$list2`" \ - || exit 0; \ - echo " $(MKDIR_P) '$(DESTDIR)$(man1dir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(man1dir)" || exit 1; \ - { for i in $$list1; do echo "$$i"; done; \ - if test -n "$$list2"; then \ - for i in $$list2; do echo "$$i"; done \ - | sed -n '/\.1[a-z]*$$/p'; \ - fi; \ - } | while read p; do \ - if test -f $$p; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; echo "$$p"; \ - done | \ - sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ - -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \ - sed 'N;N;s,\n, ,g' | { \ - list=; while read file base inst; do \ - if test "$$base" = "$$inst"; then list="$$list $$file"; else \ - echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \ - $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \ - fi; \ - done; \ - for i in $$list; do echo "$$i"; done | $(am__base_list) | \ - while read files; do \ - test -z "$$files" || { \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \ - $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \ - done; } - -uninstall-man1: - @$(NORMAL_UNINSTALL) - @list=''; test -n "$(man1dir)" || exit 0; \ - files=`{ for i in $$list; do echo "$$i"; done; \ - l2='$(man_MANS)'; for i in $$l2; do echo "$$i"; done | \ - sed -n '/\.1[a-z]*$$/p'; \ - } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ - -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \ - dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir) tags TAGS: ctags CTAGS: @@ -504,9 +437,9 @@ distdir: $(DISTFILES) done check-am: all-am check: check-am -all-am: Makefile $(SCRIPTS) $(MANS) +all-am: Makefile $(SCRIPTS) installdirs: - for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man1dir)"; do \ + for dir in "$(DESTDIR)$(bindir)"; do \ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ done install: install-am @@ -540,7 +473,6 @@ distclean-generic: maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." - -test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES) clean: clean-am clean-am: clean-generic clean-libtool mostlyclean-am @@ -561,7 +493,7 @@ info: info-am info-am: -install-data-am: install-man +install-data-am: install-dvi: install-dvi-am @@ -577,7 +509,7 @@ install-info: install-info-am install-info-am: -install-man: install-man1 +install-man: install-pdf: install-pdf-am @@ -605,9 +537,7 @@ ps: ps-am ps-am: -uninstall-am: uninstall-binSCRIPTS uninstall-man - -uninstall-man: uninstall-man1 +uninstall-am: uninstall-binSCRIPTS .MAKE: install-am install-strip @@ -617,12 +547,12 @@ uninstall-man: uninstall-man1 install install-am install-binSCRIPTS install-data \ install-data-am install-dvi install-dvi-am install-exec \ install-exec-am install-html install-html-am install-info \ - install-info-am install-man install-man1 install-pdf \ - install-pdf-am install-ps install-ps-am install-strip \ - installcheck installcheck-am installdirs maintainer-clean \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ maintainer-clean-generic mostlyclean mostlyclean-generic \ mostlyclean-libtool pdf pdf-am ps ps-am tags-am uninstall \ - uninstall-am uninstall-binSCRIPTS uninstall-man uninstall-man1 + uninstall-am uninstall-binSCRIPTS .PRECIOUS: Makefile @@ -631,10 +561,6 @@ gp-display-html: gp-display-html.in Makefile $(do_subst) < $(srcdir)/gp-display-html.in > $@ chmod +x $@ -@BUILD_MAN_TRUE@gp-display-html.1: gp-display-html -@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ -@BUILD_MAN_TRUE@ --name=$(TEXT_GP_DISPLAY_HTML) ./gp-display-html $(H2M_FILTER) > $@ - # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. .NOEXPORT: diff --git a/gprofng/src/Makefile.am b/gprofng/src/Makefile.am index ab90bb0..219367f 100644 --- a/gprofng/src/Makefile.am +++ b/gprofng/src/Makefile.am @@ -121,7 +121,7 @@ QLParser.tab.cc QLParser.tab.hh: QLParser.yy $(BISON) $^ BUILT_SOURCES = QLParser.tab.hh -EXTRA_DIST = QLParser.yy $(man_MANS) +EXTRA_DIST = QLParser.yy lib_LTLIBRARIES = $(LIBGPROFNG) @@ -157,60 +157,6 @@ gp_display_src_LDADD = $(LIBGPROFNG) $(CLOCK_GETTIME_LINK) $(ZLIB) gp_display_text_SOURCES = gp-display-text.cc ipc.cc ipcio.cc gp_display_text_LDADD = $(LIBGPROFNG) $(CLOCK_GETTIME_LINK) $(ZLIB) - -if BUILD_MAN - -man_MANS = \ - gp-archive.1 \ - gp-collect-app.1 \ - gp-display-src.1 \ - gp-display-text.1 - -MAINTAINERCLEANFILES = $(man_MANS) - -# The man pages depend on the version number and on a help2man include file. -common_mandeps = $(top_srcdir)/../bfd/version.m4 - -# Use -o so that the `missing' program can infer the output file. -# Embolden subcommand names in the output, and include a SEE ALSO. -# Arrange to regenerate the output if we have help2man, but leave the -# disted output there otherwise. -# Some extra annoying complexity is in place so that people without -# help2man dno't accidentally overwrite the manpage. - -INFO_PAGE = "gprofng" -MANUAL = "User Commands" -TEXT_GPROFNG = "the driver for the gprofng tool suite" -TEXT_GP_ARCHIVE = "archive gprofng experiment data" -TEXT_GP_COLLECT_APP = "collect performance data for the target application" -TEXT_GP_DISPLAY_SRC = "display the source code, optionally interleaved with the disassembly of the target object" -TEXT_GP_DISPLAY_TEXT = "display the performance data in plain text format" - -HELP2MAN_OPT = --libtool --no-info --info-page=$(INFO_PAGE) --manual=$(MANUAL) -H2M_FILTER = | sed 's/\.TP/\.TP\n.B/' | sed 's/Commands:/\.SH COMMANDS/' \ - | sed 's/See also:/\.SH SEE ALSO/' | sed 's/Documentation:/.SH DOCUMENTATION/' \ - | sed 's/Limitations:/.SH LIMITATIONS/' - -gp-archive.1: $(srcdir)/gp-archive.cc $(common_mandeps) | ./gp-archive$(EXEEXT) - $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ - --name=$(TEXT_GP_ARCHIVE) ./gp-archive$(EXEEXT) $(H2M_FILTER) > $@ - -gp-collect-app.1: $(srcdir)/gp-collect-app.cc $(common_mandeps) | ./gp-collect-app$(EXEEXT) - $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ - --name=$(TEXT_GP_COLLECT_APP) ./gp-collect-app$(EXEEXT) $(H2M_FILTER) > $@ - -gp-display-src.1: $(srcdir)/gp-display-src.cc $(srcdir)/Command.cc \ - $(common_mandeps) | ./gp-display-src$(EXEEXT) - $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ - --name=$(TEXT_GP_DISPLAY_SRC) ./gp-display-src$(EXEEXT) $(H2M_FILTER) > $@ - -gp-display-text.1: $(srcdir)/gp-display-text.cc $(srcdir)/Command.cc \ - $(common_mandeps) | ./gp-display-text$(EXEEXT) - $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ - --name=$(TEXT_GP_DISPLAY_TEXT) ./gp-display-text$(EXEEXT) $(H2M_FILTER) > $@ - -endif - # Distribution involves building the binaries to generate the manpage, # so ensure that the necessary libraries are built at dist time. dist-hook: $(LIBGPROFNG) diff --git a/gprofng/src/Makefile.in b/gprofng/src/Makefile.in index 605fa4f..b881268 100644 --- a/gprofng/src/Makefile.in +++ b/gprofng/src/Makefile.in @@ -156,7 +156,7 @@ am__uninstall_files_from_dir = { \ $(am__cd) "$$dir" && rm -f $$files; }; \ } am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" \ - "$(DESTDIR)$(man1dir)" "$(DESTDIR)$(dbedir)" + "$(DESTDIR)$(dbedir)" LTLIBRARIES = $(lib_LTLIBRARIES) am__DEPENDENCIES_1 = libgprofng_la_DEPENDENCIES = $(top_builddir)/../opcodes/libopcodes.la \ @@ -275,9 +275,6 @@ am__can_run_installinfo = \ n|no|NO) false;; \ *) (install-info --version) >/dev/null 2>&1;; \ esac -man1dir = $(mandir)/man1 -NROFF = nroff -MANS = $(man_MANS) DATA = $(dbe_DATA) am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) # Read a list of newline-separated strings from the standard input, @@ -547,7 +544,7 @@ AM_CFLAGS = $(GPROFNG_CFLAGS) $(PTHREAD_CFLAGS) \ AM_CXXFLAGS = $(AM_CFLAGS) BUILT_SOURCES = QLParser.tab.hh -EXTRA_DIST = QLParser.yy $(man_MANS) +EXTRA_DIST = QLParser.yy lib_LTLIBRARIES = $(LIBGPROFNG) libgprofng_la_SOURCES = $(CCSOURCES) $(CSOURCES) libgprofng_la_LDFLAGS = -version-info 0:0:0 @@ -572,35 +569,6 @@ gp_display_src_SOURCES = gp-display-src.cc gp_display_src_LDADD = $(LIBGPROFNG) $(CLOCK_GETTIME_LINK) $(ZLIB) gp_display_text_SOURCES = gp-display-text.cc ipc.cc ipcio.cc gp_display_text_LDADD = $(LIBGPROFNG) $(CLOCK_GETTIME_LINK) $(ZLIB) -@BUILD_MAN_TRUE@man_MANS = \ -@BUILD_MAN_TRUE@ gp-archive.1 \ -@BUILD_MAN_TRUE@ gp-collect-app.1 \ -@BUILD_MAN_TRUE@ gp-display-src.1 \ -@BUILD_MAN_TRUE@ gp-display-text.1 - -@BUILD_MAN_TRUE@MAINTAINERCLEANFILES = $(man_MANS) - -# The man pages depend on the version number and on a help2man include file. -@BUILD_MAN_TRUE@common_mandeps = $(top_srcdir)/../bfd/version.m4 - -# Use -o so that the `missing' program can infer the output file. -# Embolden subcommand names in the output, and include a SEE ALSO. -# Arrange to regenerate the output if we have help2man, but leave the -# disted output there otherwise. -# Some extra annoying complexity is in place so that people without -# help2man dno't accidentally overwrite the manpage. -@BUILD_MAN_TRUE@INFO_PAGE = "gprofng" -@BUILD_MAN_TRUE@MANUAL = "User Commands" -@BUILD_MAN_TRUE@TEXT_GPROFNG = "the driver for the gprofng tool suite" -@BUILD_MAN_TRUE@TEXT_GP_ARCHIVE = "archive gprofng experiment data" -@BUILD_MAN_TRUE@TEXT_GP_COLLECT_APP = "collect performance data for the target application" -@BUILD_MAN_TRUE@TEXT_GP_DISPLAY_SRC = "display the source code, optionally interleaved with the disassembly of the target object" -@BUILD_MAN_TRUE@TEXT_GP_DISPLAY_TEXT = "display the performance data in plain text format" -@BUILD_MAN_TRUE@HELP2MAN_OPT = --libtool --no-info --info-page=$(INFO_PAGE) --manual=$(MANUAL) -@BUILD_MAN_TRUE@H2M_FILTER = | sed 's/\.TP/\.TP\n.B/' | sed 's/Commands:/\.SH COMMANDS/' \ -@BUILD_MAN_TRUE@ | sed 's/See also:/\.SH SEE ALSO/' | sed 's/Documentation:/.SH DOCUMENTATION/' \ -@BUILD_MAN_TRUE@ | sed 's/Limitations:/.SH LIMITATIONS/' - all: $(BUILT_SOURCES) $(MAKE) $(AM_MAKEFLAGS) all-am @@ -885,49 +853,6 @@ mostlyclean-libtool: clean-libtool: -rm -rf .libs _libs -install-man1: $(man_MANS) - @$(NORMAL_INSTALL) - @list1=''; \ - list2='$(man_MANS)'; \ - test -n "$(man1dir)" \ - && test -n "`echo $$list1$$list2`" \ - || exit 0; \ - echo " $(MKDIR_P) '$(DESTDIR)$(man1dir)'"; \ - $(MKDIR_P) "$(DESTDIR)$(man1dir)" || exit 1; \ - { for i in $$list1; do echo "$$i"; done; \ - if test -n "$$list2"; then \ - for i in $$list2; do echo "$$i"; done \ - | sed -n '/\.1[a-z]*$$/p'; \ - fi; \ - } | while read p; do \ - if test -f $$p; then d=; else d="$(srcdir)/"; fi; \ - echo "$$d$$p"; echo "$$p"; \ - done | \ - sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ - -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \ - sed 'N;N;s,\n, ,g' | { \ - list=; while read file base inst; do \ - if test "$$base" = "$$inst"; then list="$$list $$file"; else \ - echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \ - $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \ - fi; \ - done; \ - for i in $$list; do echo "$$i"; done | $(am__base_list) | \ - while read files; do \ - test -z "$$files" || { \ - echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \ - $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \ - done; } - -uninstall-man1: - @$(NORMAL_UNINSTALL) - @list=''; test -n "$(man1dir)" || exit 0; \ - files=`{ for i in $$list; do echo "$$i"; done; \ - l2='$(man_MANS)'; for i in $$l2; do echo "$$i"; done | \ - sed -n '/\.1[a-z]*$$/p'; \ - } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \ - -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \ - dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir) install-dbeDATA: $(dbe_DATA) @$(NORMAL_INSTALL) @list='$(dbe_DATA)'; test -n "$(dbedir)" || list=; \ @@ -1038,11 +963,11 @@ distdir: $(DISTFILES) check-am: all-am check: $(BUILT_SOURCES) $(MAKE) $(AM_MAKEFLAGS) check-am -all-am: Makefile $(LTLIBRARIES) $(PROGRAMS) $(MANS) $(DATA) +all-am: Makefile $(LTLIBRARIES) $(PROGRAMS) $(DATA) install-binPROGRAMS: install-libLTLIBRARIES installdirs: - for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man1dir)" "$(DESTDIR)$(dbedir)"; do \ + for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(dbedir)"; do \ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ done install: $(BUILT_SOURCES) @@ -1077,7 +1002,6 @@ maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." -test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES) - -test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES) clean: clean-am clean-am: clean-binPROGRAMS clean-generic clean-libLTLIBRARIES \ @@ -1101,7 +1025,7 @@ info: info-am info-am: -install-data-am: install-dbeDATA install-man +install-data-am: install-dbeDATA install-dvi: install-dvi-am @@ -1117,7 +1041,7 @@ install-info: install-info-am install-info-am: -install-man: install-man1 +install-man: install-pdf: install-pdf-am @@ -1148,9 +1072,7 @@ ps: ps-am ps-am: uninstall-am: uninstall-binPROGRAMS uninstall-dbeDATA \ - uninstall-libLTLIBRARIES uninstall-man - -uninstall-man: uninstall-man1 + uninstall-libLTLIBRARIES .MAKE: all check install install-am install-strip @@ -1163,13 +1085,13 @@ uninstall-man: uninstall-man1 install-data-am install-dbeDATA install-dvi install-dvi-am \ install-exec install-exec-am install-html install-html-am \ install-info install-info-am install-libLTLIBRARIES \ - install-man install-man1 install-pdf install-pdf-am install-ps \ + install-man install-pdf install-pdf-am install-ps \ install-ps-am install-strip installcheck installcheck-am \ installdirs maintainer-clean maintainer-clean-generic \ mostlyclean mostlyclean-compile mostlyclean-generic \ mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \ uninstall-am uninstall-binPROGRAMS uninstall-dbeDATA \ - uninstall-libLTLIBRARIES uninstall-man uninstall-man1 + uninstall-libLTLIBRARIES .PRECIOUS: Makefile @@ -1177,24 +1099,6 @@ uninstall-man: uninstall-man1 QLParser.tab.cc QLParser.tab.hh: QLParser.yy $(BISON) $^ -@BUILD_MAN_TRUE@gp-archive.1: $(srcdir)/gp-archive.cc $(common_mandeps) | ./gp-archive$(EXEEXT) -@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ -@BUILD_MAN_TRUE@ --name=$(TEXT_GP_ARCHIVE) ./gp-archive$(EXEEXT) $(H2M_FILTER) > $@ - -@BUILD_MAN_TRUE@gp-collect-app.1: $(srcdir)/gp-collect-app.cc $(common_mandeps) | ./gp-collect-app$(EXEEXT) -@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ -@BUILD_MAN_TRUE@ --name=$(TEXT_GP_COLLECT_APP) ./gp-collect-app$(EXEEXT) $(H2M_FILTER) > $@ - -@BUILD_MAN_TRUE@gp-display-src.1: $(srcdir)/gp-display-src.cc $(srcdir)/Command.cc \ -@BUILD_MAN_TRUE@ $(common_mandeps) | ./gp-display-src$(EXEEXT) -@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ -@BUILD_MAN_TRUE@ --name=$(TEXT_GP_DISPLAY_SRC) ./gp-display-src$(EXEEXT) $(H2M_FILTER) > $@ - -@BUILD_MAN_TRUE@gp-display-text.1: $(srcdir)/gp-display-text.cc $(srcdir)/Command.cc \ -@BUILD_MAN_TRUE@ $(common_mandeps) | ./gp-display-text$(EXEEXT) -@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \ -@BUILD_MAN_TRUE@ --name=$(TEXT_GP_DISPLAY_TEXT) ./gp-display-text$(EXEEXT) $(H2M_FILTER) > $@ - # Distribution involves building the binaries to generate the manpage, # so ensure that the necessary libraries are built at dist time. dist-hook: $(LIBGPROFNG) |