llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792

//===-- BenchmarkRunner.cpp -------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "BenchmarkRunner.h"
#include "Assembler.h"
#include "Error.h"
#include "MCInstrDescView.h"
#include "MmapUtils.h"
#include "PerfHelper.h"
#include "SubprocessMemory.h"
#include "Target.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SystemZ/zOSSupport.h"
#include <cmath>
#include <memory>
#include <string>

#ifdef __linux__
#ifdef HAVE_LIBPFM
#include <perfmon/perf_event.h>
#endif
#include <sys/mman.h>
#include <sys/ptrace.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#if defined(__GLIBC__) && __has_include(<sys/rseq.h>) && defined(HAVE_BUILTIN_THREAD_POINTER)
#include <sys/rseq.h>
#if defined(RSEQ_SIG) && defined(SYS_rseq)
#define GLIBC_INITS_RSEQ
#endif
#endif
#endif // __linux__

namespace llvm {
namespace exegesis {

// Constructs a runner bound to State. Mode, BenchmarkPhaseSelector and
// ExecutionMode are stored as given, ValCounters is stored as the list of
// requested validation counters, and a fresh ScratchSpace is allocated and
// owned by the runner.
BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode,
                                 BenchmarkPhaseSelectorE BenchmarkPhaseSelector,
                                 ExecutionModeE ExecutionMode,
                                 ArrayRef<ValidationEvent> ValCounters)
    : State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector),
      ExecutionMode(ExecutionMode), ValidationCounters(ValCounters),
      Scratch(std::make_unique<ScratchSpace>()) {}

// Defaulted out of line in this translation unit.
BenchmarkRunner::~BenchmarkRunner() = default;

void BenchmarkRunner::FunctionExecutor::accumulateCounterValues(
    const SmallVectorImpl<int64_t> &NewValues,
    SmallVectorImpl<int64_t> *Result) {
  const size_t NumValues = std::max(NewValues.size(), Result->size());
  if (NumValues > Result->size())
    Result->resize(NumValues, 0);
  for (size_t I = 0, End = NewValues.size(); I < End; ++I)
    (*Result)[I] += NewValues[I];
}

// Runs the benchmark once per '+'-separated counter name found in Counters
// and returns the element-wise sum of the values read for each of them.
// Counts are summed because several counters may cover a single ProcRes
// (e.g. P23 on SandyBridge). The first failing run stops the sampling and its
// error is returned. ValidationCounterValues is handed to every run unchanged.
Expected<SmallVector<int64_t, 4>>
BenchmarkRunner::FunctionExecutor::runAndSample(
    const char *Counters, ArrayRef<const char *> ValidationCounters,
    SmallVectorImpl<int64_t> &ValidationCounterValues) const {
  SmallVector<StringRef, 2> SplitNames;
  StringRef(Counters).split(SplitNames, '+');

  SmallVector<int64_t, 4> Totals;
  for (const StringRef RawName : SplitNames) {
    // Whitespace surrounding the '+' separators is not part of the name.
    Expected<SmallVector<int64_t, 4>> SampleOrErr = runWithCounter(
        RawName.trim(), ValidationCounters, ValidationCounterValues);
    if (!SampleOrErr)
      return SampleOrErr.takeError();
    accumulateCounterValues(*SampleOrErr, &Totals);
  }
  return Totals;
}

namespace {
// Function executor that runs the assembled snippet directly inside the
// current process, handing it the runner's scratch space as its argument.
// The snippet is run under a CrashRecoveryContext; a crash is reported to the
// caller as a SnippetSignal error instead of taking the process down.
class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
public:
  // Builds an executor for the snippet contained in Obj. Returns the error
  // from ExecutableFunction::create if the object cannot be loaded.
  // BenchmarkProcessCPU is not used by this executor.
  static Expected<std::unique_ptr<InProcessFunctionExecutorImpl>>
  create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
         BenchmarkRunner::ScratchSpace *Scratch,
         std::optional<int> BenchmarkProcessCPU) {
    Expected<ExecutableFunction> EF =
        ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));

    if (!EF)
      return EF.takeError();

    // The constructor is private, so the object is created with a plain new
    // and immediately handed to a unique_ptr.
    return std::unique_ptr<InProcessFunctionExecutorImpl>(
        new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch));
  }

private:
  // Keeps references to State and Scratch (neither is owned) and takes
  // ownership of Function.
  InProcessFunctionExecutorImpl(const LLVMState &State,
                                ExecutableFunction Function,
                                BenchmarkRunner::ScratchSpace *Scratch)
      : State(State), Function(std::move(Function)), Scratch(Scratch) {}

  // Note: counter accumulation is provided by the base class
  // (FunctionExecutor::accumulateCounterValues); no private copy is kept here.

  // Runs the snippet once while CounterName is being measured.
  //
  // On success returns the values read from the counter, and stores the
  // validation counter readings into ValidationCounterValues[0..N), which
  // must already hold at least N elements (N being the number of readings
  // returned by the counter group). Returns an error if the counter cannot be
  // created, if the snippet crashes (SnippetSignal), or if reading any counter
  // fails.
  Expected<SmallVector<int64_t, 4>> runWithCounter(
      StringRef CounterName, ArrayRef<const char *> ValidationCounters,
      SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
    const ExegesisTarget &ET = State.getExegesisTarget();
    char *const ScratchPtr = Scratch->ptr();
    auto CounterOrError =
        ET.createCounter(CounterName, State, ValidationCounters);

    if (!CounterOrError)
      return CounterOrError.takeError();

    // Non-owning alias; the counter object stays held by CounterOrError.
    pfm::CounterGroup *Counter = CounterOrError.get().get();
    Scratch->clear();
    {
      auto PS = ET.withSavedState();
      CrashRecoveryContext CRC;
      CrashRecoveryContext::Enable();
      // Only the snippet call itself sits between start() and stop().
      const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() {
        Counter->start();
        this->Function(ScratchPtr);
        Counter->stop();
      });
      CrashRecoveryContext::Disable();
      PS.reset();
      if (Crashed) {
#ifdef LLVM_ON_UNIX
        // See "Exit Status for Commands":
        // https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html
        // RetCode is 128 + signal number, so subtract the offset to recover
        // the signal.
        constexpr const int kSigOffset = 128;
        return make_error<SnippetSignal>(CRC.RetCode - kSigOffset);
#else
        // The exit code of the process on windows is not meaningful as a
        // signal, so simply pass in -1 as the signal into the error.
        return make_error<SnippetSignal>(-1);
#endif // LLVM_ON_UNIX
      }
    }

    auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
    if (!ValidationValuesOrErr)
      return ValidationValuesOrErr.takeError();

    // Copy the readings into the caller-provided storage, slot by slot.
    ArrayRef RealValidationValues = *ValidationValuesOrErr;
    for (size_t I = 0; I < RealValidationValues.size(); ++I)
      ValidationCounterValues[I] = RealValidationValues[I];

    return Counter->readOrError(Function.getFunctionBytes());
  }

  const LLVMState &State;
  const ExecutableFunction Function;
  BenchmarkRunner::ScratchSpace *const Scratch;
};

#ifdef __linux__
// The following class implements a function executor that executes the
// benchmark code within a subprocess rather than within the main llvm-exegesis
// process. This allows for much more control over the execution context of the
// snippet, particularly with regard to memory. This class performs all the
// necessary functions to create the subprocess, execute the snippet in the
// subprocess, and report results/handle errors.
class SubProcessFunctionExecutorImpl
    : public BenchmarkRunner::FunctionExecutor {
public:
  // Creates an executor that runs the snippet contained in Obj in a
  // subprocess. Key and BenchmarkProcessCPU are forwarded to the constructor.
  // Returns the error from ExecutableFunction::create if the object cannot be
  // loaded.
  static Expected<std::unique_ptr<SubProcessFunctionExecutorImpl>>
  create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
         const BenchmarkKey &Key, std::optional<int> BenchmarkProcessCPU) {
    Expected<ExecutableFunction> FnOrErr =
        ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));
    if (Error LoadError = FnOrErr.takeError())
      return std::move(LoadError);

    // The constructor is private, hence the explicit new.
    return std::unique_ptr<SubProcessFunctionExecutorImpl>(
        new SubProcessFunctionExecutorImpl(State, std::move(*FnOrErr), Key,
                                           BenchmarkProcessCPU));
  }

private:
  // Stores references to State and Key (neither is copied, so both must stay
  // alive for as long as this executor is used), takes ownership of Function,
  // and records the optional CPU the benchmarking child should be pinned to.
  SubProcessFunctionExecutorImpl(const LLVMState &State,
                                 ExecutableFunction Function,
                                 const BenchmarkKey &Key,
                                 std::optional<int> BenchmarkCPU)
      : State(State), Function(std::move(Function)), Key(Key),
        BenchmarkProcessCPU(BenchmarkCPU) {}

  // Exit statuses the benchmarking child process passes to exit() when one of
  // its setup steps fails. Numbering starts at 1; the child exits with 0 after
  // the snippet has run to completion.
  enum ChildProcessExitCodeE {
    // The counter file descriptor could not be received from the parent.
    CounterFDReadFailed = 1,
    // The rseq unregistration syscall returned a non-zero value.
    RSeqDisableFailed,
    // mmap of the memory that holds the assembled snippet failed.
    FunctionDataMappingFailed,
    // SubprocessMemory::setupAuxiliaryMemoryInSubprocess returned an error.
    AuxiliaryMemorySetupFailed,
    // sched_setaffinity failed, or CPU pinning is not compiled in.
    SetCPUAffinityFailed
  };

  // Translates an exit status reported by the benchmarking child into a
  // message for error reporting. Any value that is not one of the
  // ChildProcessExitCodeE enumerators yields a generic "unknown" message.
  StringRef childProcessExitCodeToString(int ExitCode) const {
    if (ExitCode == ChildProcessExitCodeE::CounterFDReadFailed)
      return "Counter file descriptor read failed";
    if (ExitCode == ChildProcessExitCodeE::RSeqDisableFailed)
      return "Disabling restartable sequences failed";
    if (ExitCode == ChildProcessExitCodeE::FunctionDataMappingFailed)
      return "Failed to map memory for assembled snippet";
    if (ExitCode == ChildProcessExitCodeE::AuxiliaryMemorySetupFailed)
      return "Failed to setup auxiliary memory";
    if (ExitCode == ChildProcessExitCodeE::SetCPUAffinityFailed)
      return "Failed to set CPU affinity of the benchmarking process";
    return "Child process returned with unknown exit code";
  }

  // Sends the file descriptor FD over the socket SocketFD as SCM_RIGHTS
  // ancillary data. The message carries no regular payload. Returns a Failure
  // describing errno if sendmsg fails.
  Error sendFileDescriptorThroughSocket(int SocketFD, int FD) const {
    // Ancillary-data buffer with room for exactly one file descriptor.
    char ControlBuffer[CMSG_SPACE(sizeof(FD))];
    memset(ControlBuffer, 0, sizeof(ControlBuffer));

    struct msghdr Msg = {};
    Msg.msg_control = ControlBuffer;
    Msg.msg_controllen = sizeof(ControlBuffer);

    // Fill in the single control message header followed by the descriptor.
    struct cmsghdr *Header = CMSG_FIRSTHDR(&Msg);
    Header->cmsg_level = SOL_SOCKET;
    Header->cmsg_type = SCM_RIGHTS;
    Header->cmsg_len = CMSG_LEN(sizeof(FD));
    memcpy(CMSG_DATA(Header), &FD, sizeof(FD));

    Msg.msg_controllen = CMSG_SPACE(sizeof(FD));

    if (sendmsg(SocketFD, &Msg, 0) < 0)
      return make_error<Failure>("Failed to write FD to socket: " +
                                 Twine(strerror(errno)));

    return Error::success();
  }

  // Receives one file descriptor sent as SCM_RIGHTS ancillary data over the
  // socket SocketFD and returns it.
  //
  // Returns a Failure if recvmsg fails, if the received message carries no
  // control message at all, if the control message is not an SCM_RIGHTS
  // message, or if it does not hold exactly one descriptor.
  Expected<int> getFileDescriptorFromSocket(int SocketFD) const {
    struct msghdr Message = {};

    // The CMSG_* macros access this buffer through struct cmsghdr, so give it
    // that alignment, and start from a zeroed state.
    alignas(struct cmsghdr) char ControlBuffer[256] = {};
    Message.msg_control = ControlBuffer;
    Message.msg_controllen = sizeof(ControlBuffer);

    ssize_t BytesRead = recvmsg(SocketFD, &Message, 0);

    if (BytesRead < 0)
      return make_error<Failure>("Failed to read FD from socket: " +
                                 Twine(strerror(errno)));

    // CMSG_FIRSTHDR yields a null pointer when the message did not carry any
    // ancillary data; it must not be dereferenced in that case.
    struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
    if (ControlMessage == nullptr)
      return make_error<Failure>("Failed to get file descriptor from socket: "
                                 "no control message was received.");

    if (ControlMessage->cmsg_level != SOL_SOCKET ||
        ControlMessage->cmsg_type != SCM_RIGHTS)
      return make_error<Failure>("Failed to get file descriptor from socket: "
                                 "unexpected control message type.");

    int FD;

    if (ControlMessage->cmsg_len != CMSG_LEN(sizeof(FD)))
      return make_error<Failure>("Failed to get correct number of bytes for "
                                 "file descriptor from socket.");

    memcpy(&FD, CMSG_DATA(ControlMessage), sizeof(FD));

    return FD;
  }

  // Parent-side half of a subprocess benchmark run.
  //
  // Creates the counter for CounterName (passing ChildPID to createCounter),
  // attaches to the child with ptrace, sends the counter file descriptor to
  // the child through WriteFD, and waits for the child to change state.
  //
  // - If the child exited with status 0, the counter values are moved into
  //   CounterValues, the validation counter readings are written into
  //   ValidationCounterValues[0..N) (which must already be large enough), and
  //   success is returned.
  // - If the child exited with another status, a Failure carrying the message
  //   from childProcessExitCodeToString is returned.
  // - Otherwise the child is treated as having been stopped by a signal while
  //   running the snippet: its signal info is fetched, it is killed and
  //   reaped, and a SnippetSegmentationFault (for SIGSEGV) or SnippetSignal
  //   error is returned.
  //
  // WriteFD is closed on every return path.
  Error
  runParentProcess(pid_t ChildPID, int WriteFD, StringRef CounterName,
                   SmallVectorImpl<int64_t> &CounterValues,
                   ArrayRef<const char *> ValidationCounters,
                   SmallVectorImpl<int64_t> &ValidationCounterValues) const {
    // Close the socket end owned by this function no matter how it returns.
    auto WriteFDClose = make_scope_exit([WriteFD]() { close(WriteFD); });
    const ExegesisTarget &ET = State.getExegesisTarget();
    auto CounterOrError =
        ET.createCounter(CounterName, State, ValidationCounters, ChildPID);

    if (!CounterOrError)
      return CounterOrError.takeError();

    // Non-owning alias; the counter object stays held by CounterOrError.
    pfm::CounterGroup *Counter = CounterOrError.get().get();

    // Make sure to attach to the process (and wait for the sigstop to be
    // delivered and for the process to continue) before we write to the counter
    // file descriptor. Attaching to the process before writing to the socket
    // ensures that the subprocess at most has blocked on the read call. If we
    // attach afterwards, the subprocess might exit before we get to the attach
    // call due to effects like scheduler contention, introducing transient
    // failures.
    if (ptrace(PTRACE_ATTACH, ChildPID, NULL, NULL) != 0)
      return make_error<Failure>("Failed to attach to the child process: " +
                                 Twine(strerror(errno)));

    // Wait for the stop caused by the attach before resuming the child.
    if (waitpid(ChildPID, NULL, 0) == -1) {
      return make_error<Failure>(
          "Failed to wait for child process to stop after attaching: " +
          Twine(strerror(errno)));
    }

    if (ptrace(PTRACE_CONT, ChildPID, NULL, NULL) != 0)
      return make_error<Failure>(
          "Failed to continue execution of the child process: " +
          Twine(strerror(errno)));

    // Hand the counter's file descriptor to the child over the socket.
    int CounterFileDescriptor = Counter->getFileDescriptor();
    Error SendError =
        sendFileDescriptorThroughSocket(WriteFD, CounterFileDescriptor);

    if (SendError)
      return SendError;

    // Block until the child either exits or is stopped.
    int ChildStatus;
    if (waitpid(ChildPID, &ChildStatus, 0) == -1) {
      return make_error<Failure>(
          "Waiting for the child process to complete failed: " +
          Twine(strerror(errno)));
    }

    if (WIFEXITED(ChildStatus)) {
      int ChildExitCode = WEXITSTATUS(ChildStatus);
      if (ChildExitCode == 0) {
        // The child exited successfully, read counter values and return
        // success.
        auto CounterValueOrErr = Counter->readOrError();
        if (!CounterValueOrErr)
          return CounterValueOrErr.takeError();
        CounterValues = std::move(*CounterValueOrErr);

        auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
        if (!ValidationValuesOrErr)
          return ValidationValuesOrErr.takeError();

        // Copy the readings into the caller-provided storage, slot by slot.
        ArrayRef RealValidationValues = *ValidationValuesOrErr;
        for (size_t I = 0; I < RealValidationValues.size(); ++I)
          ValidationCounterValues[I] = RealValidationValues[I];

        return Error::success();
      }
      // The child exited, but not successfully.
      return make_error<Failure>(
          "Child benchmarking process exited with non-zero exit code: " +
          childProcessExitCodeToString(ChildExitCode));
    }

    // An error was encountered running the snippet, process it
    siginfo_t ChildSignalInfo;
    if (ptrace(PTRACE_GETSIGINFO, ChildPID, NULL, &ChildSignalInfo) == -1) {
      return make_error<Failure>("Getting signal info from the child failed: " +
                                 Twine(strerror(errno)));
    }

    // Send SIGKILL rather than SIGTERM as the child process has no SIGTERM
    // handlers to run, and calling SIGTERM would mean that ptrace will force
    // it to block in the signal-delivery-stop for the SIGSEGV/other signals,
    // and upon exit.
    if (kill(ChildPID, SIGKILL) == -1)
      return make_error<Failure>("Failed to kill child benchmarking proces: " +
                                 Twine(strerror(errno)));

    // Wait for the process to exit so that there are no zombie processes left
    // around.
    if (waitpid(ChildPID, NULL, 0) == -1)
      return make_error<Failure>("Failed to wait for process to die: " +
                                 Twine(strerror(errno)));

    // A segmentation fault is reported together with the faulting address.
    if (ChildSignalInfo.si_signo == SIGSEGV)
      return make_error<SnippetSegmentationFault>(
          reinterpret_cast<uintptr_t>(ChildSignalInfo.si_addr));

    return make_error<SnippetSignal>(ChildSignalInfo.si_signo);
  }

  // Pins the calling process to the CPU numbered CPUToUse. Exits the process
  // with SetCPUAffinityFailed if the affinity cannot be set, or if pinning
  // support is not compiled in (see the preprocessor condition below).
  static void setCPUAffinityIfRequested(int CPUToUse) {
// Pinning is only compiled in for x86_64 for now: some more esoteric platforms
// define the involved libc functions differently, which breaks the build, and
// the subprocess executor mode (the only mode offering pinning) currently only
// supports x86_64 anyway.

// SYS_getcpu must be defined as well, which tells us the syscall exists in the
// build environment. The raw syscall is used instead of the libc wrapper
// because the getcpu wrapper is only available in glibc 2.29 and later.
#if defined(__x86_64__) && defined(SYS_getcpu)
    // Restrict the affinity mask of this (child) process to the single CPU
    // the user asked for, so the benchmark is guaranteed to run there.
    cpu_set_t AllowedCPUs;
    CPU_ZERO(&AllowedCPUs);
    CPU_SET(CPUToUse, &AllowedCPUs);
    // TODO(boomanaiden154): Rewrite this to use LLVM primitives once they
    // are available.
    if (sched_setaffinity(0, sizeof(AllowedCPUs), &AllowedCPUs) == -1)
      exit(ChildProcessExitCodeE::SetCPUAffinityFailed);

    // With assertions enabled, double check that we now really are running
    // on the requested CPU. The getcpu call lives inside the assert, so it is
    // compiled out together with it.
    [[maybe_unused]] unsigned int ObservedCPU;
    assert(syscall(SYS_getcpu, &ObservedCPU, nullptr) == 0 &&
           "Expected getcpu call to succeed.");
    assert(static_cast<int>(ObservedCPU) == CPUToUse &&
           "Expected current CPU to equal the CPU requested by the user");
#else
    exit(ChildProcessExitCodeE::SetCPUAffinityFailed);
#endif // defined(__x86_64__) && defined(SYS_getcpu)
  }

  // Forks a benchmarking child process, runs the snippet in it while
  // CounterName is measured, and collects the results.
  //
  // A socket pair is created for parent/child communication (the code calls
  // it a "pipe"), and the subprocess memory described by Key.MemoryValues is
  // set up before forking. The child pins itself to BenchmarkProcessCPU when
  // one was given and then runs runChildSubprocess, which never returns. The
  // parent continues in runParentProcess, whose result is returned.
  //
  // Returns a Failure if the socket pair, the subprocess memory or the fork
  // cannot be set up. Both socket descriptors are closed on those paths.
  Error createSubProcessAndRunBenchmark(
      StringRef CounterName, SmallVectorImpl<int64_t> &CounterValues,
      ArrayRef<const char *> ValidationCounters,
      SmallVectorImpl<int64_t> &ValidationCounterValues) const {
    int PipeFiles[2];
    int PipeSuccessOrErr = socketpair(AF_UNIX, SOCK_DGRAM, 0, PipeFiles);
    if (PipeSuccessOrErr != 0) {
      return make_error<Failure>(
          "Failed to create a pipe for interprocess communication between "
          "llvm-exegesis and the benchmarking subprocess: " +
          Twine(strerror(errno)));
    }

    // Until the fork has succeeded both socket ends belong to this function,
    // so every early return below has to release them to avoid leaking file
    // descriptors.
    auto ClosePipeFiles = [&PipeFiles]() {
      close(PipeFiles[0]);
      close(PipeFiles[1]);
    };

    SubprocessMemory SPMemory;
    Error MemoryInitError = SPMemory.initializeSubprocessMemory(getpid());
    if (MemoryInitError) {
      ClosePipeFiles();
      return MemoryInitError;
    }

    Error AddMemDefError =
        SPMemory.addMemoryDefinition(Key.MemoryValues, getpid());
    if (AddMemDefError) {
      ClosePipeFiles();
      return AddMemDefError;
    }

    // Captured before forking so that the child can be told the parent's TID.
    long ParentTID = SubprocessMemory::getCurrentTID();
    pid_t ParentOrChildPID = fork();

    if (ParentOrChildPID == -1) {
      // Build the error first: close() is allowed to modify errno.
      Error ForkError = make_error<Failure>("Failed to create child process: " +
                                            Twine(strerror(errno)));
      ClosePipeFiles();
      return ForkError;
    }

    if (ParentOrChildPID == 0) {
      if (BenchmarkProcessCPU.has_value()) {
        setCPUAffinityIfRequested(*BenchmarkProcessCPU);
      }

      // We are in the child process, close the write end of the pipe.
      close(PipeFiles[1]);
      // Unregister handlers, signal handling is now handled through ptrace in
      // the host process.
      sys::unregisterHandlers();
      runChildSubprocess(PipeFiles[0], Key, ParentTID);
      // The child process terminates in the above function, so we should never
      // get to this point.
      llvm_unreachable("Child process didn't exit when expected.");
    }

    // Close the read end of the pipe as we only need to write to the subprocess
    // from the parent process. The write end is closed by runParentProcess.
    close(PipeFiles[0]);
    return runParentProcess(ParentOrChildPID, PipeFiles[1], CounterName,
                            CounterValues, ValidationCounters,
                            ValidationCounterValues);
  }

  // Sets the soft RLIMIT_CORE limit of the calling process to zero so that no
  // core file is written when the process crashes. Best effort: a failure of
  // setrlimit is not reported.
  void disableCoreDumps() const {
    struct rlimit CoreLimit = {};

    // setrlimit consumes both rlim_cur and rlim_max, so rlim_max must hold a
    // defined value. Keep the current hard limit when it can be queried;
    // otherwise fall back to zero, since lowering the hard limit is always
    // permitted.
    if (getrlimit(RLIMIT_CORE, &CoreLimit) != 0)
      CoreLimit.rlim_max = 0;

    CoreLimit.rlim_cur = 0;
    setrlimit(RLIMIT_CORE, &CoreLimit);
  }

  // Body of the benchmarking child process; never returns.
  //
  // Receives the counter file descriptor from the parent over Pipe,
  // unregisters glibc's rseq area when glibc set one up, copies the assembled
  // snippet into a fresh mapping (placed at Key.SnippetAddress when that is
  // non-zero) and makes it executable, sets up the auxiliary memory, calls
  // the snippet, and finally exits with status 0. Every setup failure exits
  // with the matching ChildProcessExitCodeE value instead.
  [[noreturn]] void runChildSubprocess(int Pipe, const BenchmarkKey &Key,
                                       long ParentTID) const {
    // Disable core dumps in the child process as otherwise everytime we
    // encounter an execution failure like a segmentation fault, we will create
    // a core dump. We report the information directly rather than require the
    // user inspect a core dump.
    disableCoreDumps();

    // The following occurs within the benchmarking subprocess.
    pid_t ParentPID = getppid();

    Expected<int> CounterFileDescriptorOrError =
        getFileDescriptorFromSocket(Pipe);

    if (!CounterFileDescriptorOrError)
      exit(ChildProcessExitCodeE::CounterFDReadFailed);

    int CounterFileDescriptor = *CounterFileDescriptorOrError;

// Glibc versions greater than 2.35 automatically call rseq during
// initialization. Unmapping the region that glibc sets up for this causes
// segfaults in the program. Unregister the rseq region so that we can safely
// unmap it later
#ifdef GLIBC_INITS_RSEQ
    unsigned int RseqStructSize = __rseq_size;

    // Glibc v2.40 (the change is also expected to be backported to v2.35)
    // changes the definition of __rseq_size to be the usable area of the struct
    // rather than the actual size of the struct. v2.35 uses only 20 bytes of
    // the 32 byte struct. For now, it should be safe to assume that if the
    // usable size is less than 32, the actual size of the struct will be 32
    // bytes given alignment requirements.
    if (__rseq_size < 32)
      RseqStructSize = 32;

    long RseqDisableOutput = syscall(
        SYS_rseq,
        reinterpret_cast<uintptr_t>(__builtin_thread_pointer()) + __rseq_offset,
        RseqStructSize, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
    if (RseqDisableOutput != 0)
      exit(ChildProcessExitCodeE::RSeqDisableFailed);
#endif // GLIBC_INITS_RSEQ

    // The frontend that generates the memory annotation structures should
    // validate that the address to map the snippet in at is a multiple of
    // the page size. Assert that this is true here.
    assert(Key.SnippetAddress % getpagesize() == 0 &&
           "The snippet address needs to be aligned to a page boundary.");

    size_t FunctionDataCopySize = this->Function.FunctionBytes.size();
    void *MapAddress = nullptr;
    int MapFlags = MAP_PRIVATE | MAP_ANONYMOUS;

    // A non-zero snippet address is a hard placement request: fail instead of
    // silently mapping somewhere else or over an existing mapping.
    if (Key.SnippetAddress != 0) {
      MapAddress = reinterpret_cast<void *>(Key.SnippetAddress);
      MapFlags |= MAP_FIXED_NOREPLACE;
    }

    // Anonymous mappings have no backing file; pass -1 as the descriptor as
    // recommended for MAP_ANONYMOUS.
    void *Mapping = mmap(MapAddress, FunctionDataCopySize,
                         PROT_READ | PROT_WRITE, MapFlags, -1, 0);
    if (Mapping == MAP_FAILED)
      exit(ChildProcessExitCodeE::FunctionDataMappingFailed);
    char *FunctionDataCopy = static_cast<char *>(Mapping);

    memcpy(FunctionDataCopy, this->Function.FunctionBytes.data(),
           this->Function.FunctionBytes.size());
    // The copy is complete: switch the mapping from writable to executable.
    // Without execute permission the call below could only fault, which would
    // be misreported as a crash of the snippet itself, so treat a failure
    // here as a mapping failure.
    if (mprotect(FunctionDataCopy, FunctionDataCopySize,
                 PROT_READ | PROT_EXEC) != 0)
      exit(ChildProcessExitCodeE::FunctionDataMappingFailed);

    Expected<int> AuxMemFDOrError =
        SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
            Key.MemoryValues, ParentPID, ParentTID, CounterFileDescriptor);
    if (!AuxMemFDOrError)
      exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);

    // Call the copied snippet, passing the size of the copy and the auxiliary
    // memory file descriptor as its two arguments.
    ((void (*)(size_t, int))(uintptr_t)FunctionDataCopy)(FunctionDataCopySize,
                                                         *AuxMemFDOrError);

    exit(0);
  }

  // Runs the snippet once in a subprocess while CounterName is measured and
  // returns the counter values read by the parent. The result starts out as a
  // single zero and is overwritten by the run. Validation counter readings
  // are written to ValidationCounterValues. Any error from the subprocess run
  // is returned as is.
  Expected<SmallVector<int64_t, 4>> runWithCounter(
      StringRef CounterName, ArrayRef<const char *> ValidationCounters,
      SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
    SmallVector<int64_t, 4> Samples(1, 0);
    if (Error RunError = createSubProcessAndRunBenchmark(
            CounterName, Samples, ValidationCounters, ValidationCounterValues))
      return std::move(RunError);

    return Samples;
  }

  // Not owned; used to reach the exegesis target when creating counters.
  const LLVMState &State;
  // The loaded snippet; its bytes are copied into the child's address space.
  const ExecutableFunction Function;
  // Not owned; held by reference, so the referenced key must outlive this
  // executor. Supplies the memory values and the snippet address.
  const BenchmarkKey &Key;
  // CPU the benchmarking child is pinned to, if any.
  const std::optional<int> BenchmarkProcessCPU;
};
#endif // __linux__
} // namespace

// Assembles the instructions of BC, repeated by Repetitor according to
// MinInstructions and LoopBodySize, and returns the bytes written by
// assembleToStream. GenerateMemoryInstructions is forwarded both to the
// repetitor and to the assembler. Returns the assembler's error on failure.
Expected<SmallString<0>> BenchmarkRunner::assembleSnippet(
    const BenchmarkCode &BC, const SnippetRepetitor &Repetitor,
    unsigned MinInstructions, unsigned LoopBodySize,
    bool GenerateMemoryInstructions) const {
  SmallString<0> ObjectBytes;
  raw_svector_ostream ObjectStream(ObjectBytes);

  Error AssembleError = assembleToStream(
      State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns,
      Repetitor.Repeat(BC.Key.Instructions, MinInstructions, LoopBodySize,
                       GenerateMemoryInstructions),
      ObjectStream, BC.Key, GenerateMemoryInstructions);
  if (AssembleError)
    return std::move(AssembleError);

  return ObjectBytes;
}

// Prepares everything needed to later run the benchmark described by BC.
//
// The returned configuration always carries the benchmark metadata (mode, CPU
// name, triple, MinInstructions, info and key). Depending on the selected
// benchmark phase it additionally carries:
//  - the bytes of a short assembled snippet for debug/analysis (phases past
//    PrepareSnippet), and
//  - the object file holding the full benchmark (phases past
//    PrepareAndAssembleSnippet).
// Returns the first assembly or byte-extraction error encountered.
Expected<BenchmarkRunner::RunnableConfiguration>
BenchmarkRunner::getRunnableConfiguration(
    const BenchmarkCode &BC, unsigned MinInstructions, unsigned LoopBodySize,
    const SnippetRepetitor &Repetitor) const {
  RunnableConfiguration RC;

  // Fill in the metadata that does not depend on assembling anything.
  Benchmark &Result = RC.BenchmarkResult;
  Result.Mode = Mode;
  Result.CpuName = std::string(State.getTargetMachine().getTargetCPU());
  Result.LLVMTriple = State.getTargetMachine().getTargetTriple().normalize();
  Result.MinInstructions = MinInstructions;
  Result.Info = BC.Info;
  Result.Key = BC.Key;

  const std::vector<MCInst> &Insts = BC.Key.Instructions;
  // Memory instructions are only generated for the subprocess execution mode.
  const bool GenerateMemoryInstructions =
      ExecutionMode == ExecutionModeE::SubProcess;

  // For debug/analysis, assemble a short version in which the snippet is
  // repeated a few times, so that the user clearly understands that the
  // inside instructions are repeated.
  if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) {
    const int MinInstructionsForSnippet = 4 * Insts.size();
    const int LoopBodySizeForSnippet = 2 * Insts.size();
    auto DebugSnippet =
        assembleSnippet(BC, Repetitor, MinInstructionsForSnippet,
                        LoopBodySizeForSnippet, GenerateMemoryInstructions);
    if (!DebugSnippet)
      return DebugSnippet.takeError();

    if (auto BytesError =
            getBenchmarkFunctionBytes(*DebugSnippet, Result.AssembledSnippet))
      return std::move(BytesError);
  }

  // Assemble enough repetitions of the snippet so that we have at least
  // MinInstructions instructions; this is the code that gets measured.
  if (BenchmarkPhaseSelector >
      BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
    auto FullSnippet = assembleSnippet(BC, Repetitor, Result.MinInstructions,
                                       LoopBodySize, GenerateMemoryInstructions);
    if (!FullSnippet)
      return FullSnippet.takeError();
    RC.ObjectFile = getObjectFromBuffer(*FullSnippet);
  }

  return std::move(RC);
}

// Creates the function executor matching the runner's execution mode for the
// snippet in ObjectFile.
//
// In-process execution does not support pinning, so requesting a CPU in that
// mode is an error. Subprocess execution is only available when building for
// Linux; elsewhere an error is returned. Errors raised while creating the
// executor are passed through.
Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>>
BenchmarkRunner::createFunctionExecutor(
    object::OwningBinary<object::ObjectFile> ObjectFile,
    const BenchmarkKey &Key, std::optional<int> BenchmarkProcessCPU) const {
  switch (ExecutionMode) {
  case ExecutionModeE::InProcess: {
    if (BenchmarkProcessCPU)
      return make_error<Failure>("The inprocess execution mode does not "
                                 "support benchmark core pinning.");

    auto ExecutorOrErr = InProcessFunctionExecutorImpl::create(
        State, std::move(ObjectFile), Scratch.get(), BenchmarkProcessCPU);
    if (Error CreateError = ExecutorOrErr.takeError())
      return std::move(CreateError);

    return std::move(*ExecutorOrErr);
  }
  case ExecutionModeE::SubProcess: {
#ifdef __linux__
    auto ExecutorOrErr = SubProcessFunctionExecutorImpl::create(
        State, std::move(ObjectFile), Key, BenchmarkProcessCPU);
    if (Error CreateError = ExecutorOrErr.takeError())
      return std::move(CreateError);

    return std::move(*ExecutorOrErr);
#else
    return make_error<Failure>(
        "The subprocess execution mode is only supported on Linux");
#endif
  }
  }
  llvm_unreachable("ExecutionMode is outside expected range");
}

// Runs the benchmark prepared in RC and returns the (possibly partial)
// benchmark result together with an error status.
//
// When DumpFile is set and the selected phase is past
// PrepareAndAssembleSnippet, the object file is first written out (to a
// temporary file if *DumpFile is empty) and its path is printed. If the
// selected phase stops before Measure, the result is returned with its Error
// field explaining that no measurement was taken. Otherwise the measurements
// are taken, scaled per instruction and per snippet repetition, and stored in
// the result. The benchmark result is moved out of RC on every path.
std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
    RunnableConfiguration &&RC, const std::optional<StringRef> &DumpFile,
    std::optional<int> BenchmarkProcessCPU) const {
  Benchmark &Result = RC.BenchmarkResult;
  object::OwningBinary<object::ObjectFile> &Obj = RC.ObjectFile;

  const bool PastAssemblePhase =
      BenchmarkPhaseSelector >
      BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet;
  if (DumpFile && PastAssemblePhase) {
    auto DumpPathOrErr = writeObjectFile(Obj.getBinary()->getData(), *DumpFile);
    if (!DumpPathOrErr)
      return {DumpPathOrErr.takeError(), std::move(Result)};
    outs() << "Check generated assembly with: /usr/bin/objdump -d "
           << *DumpPathOrErr << "\n";
  }

  if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) {
    Result.Error = "actual measurements skipped.";
    return {Error::success(), std::move(Result)};
  }

  Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> ExecutorOrErr =
      createFunctionExecutor(std::move(Obj), RC.BenchmarkResult.Key,
                             BenchmarkProcessCPU);
  if (!ExecutorOrErr)
    return {ExecutorOrErr.takeError(), std::move(Result)};

  auto MeasurementsOrErr = runMeasurements(**ExecutorOrErr);
  if (!MeasurementsOrErr)
    return {MeasurementsOrErr.takeError(), std::move(Result)};

  assert(Result.MinInstructions > 0 && "invalid MinInstructions");
  for (BenchmarkMeasure &Measure : *MeasurementsOrErr) {
    // Per-instruction value: divide by the number of instructions.
    Measure.PerInstructionValue /= Result.MinInstructions;
    // Per-snippet value: divide by the number of times the entire snippet is
    // repeated, rounded up to a whole number of repetitions.
    Measure.PerSnippetValue /=
        std::ceil(Result.MinInstructions /
                  static_cast<double>(Result.Key.Instructions.size()));
  }
  Result.Measurements = std::move(*MeasurementsOrErr);

  return {Error::success(), std::move(Result)};
}

// Writes Buffer to disk and returns the path of the written file.
//
// With an empty FileName a temporary file ("snippet" prefix, "o" suffix) is
// created and its path is returned; otherwise FileName is opened with
// CD_CreateAlways and returned as the path. Returns the error from creating
// or opening the file if that fails.
Expected<std::string>
BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const {
  int ResultFD = 0;
  SmallString<256> ResultPath = FileName;

  std::error_code OpenEC;
  if (FileName.empty())
    OpenEC =
        sys::fs::createTemporaryFile("snippet", "o", ResultFD, ResultPath);
  else
    OpenEC = sys::fs::openFileForReadWrite(
        FileName, ResultFD, sys::fs::CD_CreateAlways, sys::fs::OF_None);
  if (OpenEC)
    return errorCodeToError(OpenEC);

  // The stream takes over the descriptor and closes it on destruction.
  raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/);
  OFS.write(Buffer.data(), Buffer.size());
  OFS.flush();
  return std::string(ResultPath);
}

// Ordering predicate for looking up a ValidationEvent in an array of
// (event, counter name) pairs: compares the pair's event against RHS by
// their integer values.
static bool EventLessThan(const std::pair<ValidationEvent, const char *> LHS,
                          const ValidationEvent RHS) {
  const int LHSOrdinal = static_cast<int>(LHS.first);
  const int RHSOrdinal = static_cast<int>(RHS);
  return LHSOrdinal < RHSOrdinal;
}

// Appends to ValCountersToRun the counter name registered by the target for
// each requested validation event, in request order.
//
// The lookup is a binary search, so it relies on the target's array of
// validation events being sorted by event value. Returns a Failure as soon as
// a requested event has no entry for the target; names appended before that
// point stay in ValCountersToRun.
Error BenchmarkRunner::getValidationCountersToRun(
    SmallVector<const char *> &ValCountersToRun) const {
  const PfmCountersInfo &PCI = State.getPfmCounters();
  ValCountersToRun.reserve(ValidationCounters.size());

  ArrayRef TargetValidationEvents(PCI.ValidationEvents,
                                  PCI.NumValidationEvents);
  for (const ValidationEvent RequestedValEvent : ValidationCounters) {
    auto ValCounterIt =
        lower_bound(TargetValidationEvents, RequestedValEvent, EventLessThan);
    // lower_bound only yields the first entry that is not less than the
    // request, so an exact match still has to be confirmed.
    if (ValCounterIt == TargetValidationEvents.end() ||
        ValCounterIt->first != RequestedValEvent)
      return make_error<Failure>("Cannot create validation counter");

    ValCountersToRun.push_back(ValCounterIt->second);
  }

  return Error::success();
}

// Out-of-line definition of the executor base class destructor.
BenchmarkRunner::FunctionExecutor::~FunctionExecutor() = default;

} // namespace exegesis
} // namespace llvm