/*
 * Copyright Nick Thompson, 2024
 * Use, modification and distribution are subject to the
 * Boost Software License, Version 1.0. (See accompanying file
 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 */
#ifndef BOOST_MATH_OPTIMIZATION_CMA_ES_HPP
#define BOOST_MATH_OPTIMIZATION_CMA_ES_HPP
#include <atomic>
#include <cmath>
#include <iostream>
#include <limits>
#include <random>
#include <sstream>
#include <stdexcept>
#include <utility>
#include <vector>
#include <boost/math/optimization/detail/common.hpp>
#include <boost/math/tools/assert.hpp>
#if __has_include(<Eigen/Dense>)
#include <Eigen/Dense>
#else
#error "CMA-ES requires Eigen."
#endif
// Follows the notation in:
// https://arxiv.org/pdf/1604.00772.pdf
// This is a (hopefully) faithful reproduction of the pseudocode in the arxiv review
// by Nikolaus Hansen.
// Comments referring to equations all refer to this arxiv review.
// A slide deck by the same author is given here:
// http://www.cmap.polytechnique.fr/~nikolaus.hansen/CmaTutorialGecco2023-no-audio.pdf
// which is also a very useful reference.
#ifndef BOOST_MATH_DEBUG_CMA_ES
#define BOOST_MATH_DEBUG_CMA_ES 0
#endif
namespace boost::math::optimization {
// Tunable hyperparameters for the CMA-ES optimizer.
// Notation follows the arxiv review https://arxiv.org/pdf/1604.00772.pdf.
template <typename ArgumentContainer> struct cma_es_parameters {
  using Real = typename ArgumentContainer::value_type;
  // The ratio of two Reals strips any unit, supporting dimensioned types:
  using DimensionlessReal = decltype(Real()/Real());
  // Box constraints on the search space; validated to be the same size,
  // with lower_bounds[i] < upper_bounds[i] for all i:
  ArgumentContainer lower_bounds;
  ArgumentContainer upper_bounds;
  // Hard cap on the number of generations the strategy will run:
  size_t max_generations = 1000;
  // Optional starting mean; if provided it must lie within the bounds:
  ArgumentContainer const *initial_guess = nullptr;
  // In the reference, population size = \lambda.
  // If the population size is zero, it is set to equation (48) of the reference:
  size_t population_size = 0;
  // In the reference, learning_rate = c_m:
  DimensionlessReal learning_rate = 1;
};
template <typename ArgumentContainer>
void validate_cma_es_parameters(cma_es_parameters<ArgumentContainer> ¶ms) {
  using Real = typename ArgumentContainer::value_type;
  using DimensionlessReal = decltype(Real()/Real());
  using std::isfinite;
  using std::isnan;
  using std::log;
  using std::ceil;
  using std::floor;
  std::ostringstream oss;
  detail::validate_bounds(params.lower_bounds, params.upper_bounds);
  if (params.initial_guess) {
    detail::validate_initial_guess(*params.initial_guess, params.lower_bounds, params.upper_bounds);
  }
  const size_t n = params.upper_bounds.size();
  // Equation 48 of the arxiv review:
  if (params.population_size == 0) {
    //auto tmp = 4.0 + floor(3*log(n));
    // But round to the nearest multiple of the thread count:
    //auto k = static_cast<size_t>(std::ceil(tmp/params.threads));
    //params.population_size = k*params.threads;
    params.population_size = static_cast<size_t>(4 + floor(3*log(n)));
  }
  if (params.learning_rate <= DimensionlessReal(0) || !isfinite(params.learning_rate)) {
    oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
    oss << ": The learning rate must be > 0, but got " << params.learning_rate << ".";
    throw std::invalid_argument(oss.str());
  }
}
// Minimizes cost_function over the box [params.lower_bounds, params.upper_bounds]
// via the (\mu/\mu_W, \lambda)-aCMA evolutionary strategy of the arxiv review
// https://arxiv.org/pdf/1604.00772.pdf. Equation numbers in comments refer to
// that review.
//
// cost_function        : maps an ArgumentContainer to a cost; may return NaN to
//                        signal an invalid argument (triggers a resample).
// params               : hyperparameters; validated here, population_size may be defaulted.
// gen                  : uniform random bit generator driving all sampling.
// target_value         : iteration halts early once a cost < target_value is observed.
// cancellation         : if non-null, setting *cancellation = true stops the iteration.
// current_minimum_cost : if non-null, kept up to date with the best cost found.
// queries              : if non-null, every (argument, cost) evaluation is appended.
//
// Returns the argument vector with the lowest observed cost.
template <typename ArgumentContainer, class Func, class URBG>
ArgumentContainer cma_es(
    const Func cost_function,
    cma_es_parameters<ArgumentContainer> &params,
    URBG &gen,
    std::invoke_result_t<Func, ArgumentContainer> target_value = std::numeric_limits<std::invoke_result_t<Func, ArgumentContainer>>::quiet_NaN(),
    std::atomic<bool> *cancellation = nullptr,
    std::atomic<std::invoke_result_t<Func, ArgumentContainer>> *current_minimum_cost = nullptr,
    std::vector<std::pair<ArgumentContainer, std::invoke_result_t<Func, ArgumentContainer>>> *queries = nullptr)
 {
  using Real = typename ArgumentContainer::value_type;
  using DimensionlessReal = decltype(Real()/Real());
  using ResultType = std::invoke_result_t<Func, ArgumentContainer>;
  using std::abs;
  using std::log;
  using std::exp;
  using std::pow;
  using std::min;
  using std::max;
  using std::sqrt;
  using std::isnan;
  using std::isfinite;
  using std::uniform_real_distribution;
  using std::normal_distribution;
  validate_cma_es_parameters(params);
  // n = dimension of problem:
  const size_t n = params.lower_bounds.size();
  std::atomic<bool> target_attained = false;
  std::atomic<ResultType> lowest_cost = std::numeric_limits<ResultType>::infinity();
  ArgumentContainer best_vector;
  // p_{c} := evolution path, equation (24) of the arxiv review:
  Eigen::Vector<DimensionlessReal, Eigen::Dynamic> p_c(n);
  // p_{\sigma} := conjugate evolution path, equation (31) of the arxiv review:
  Eigen::Vector<DimensionlessReal, Eigen::Dynamic> p_sigma(n);
  if constexpr (detail::has_resize_v<ArgumentContainer>) {
    best_vector.resize(n, std::numeric_limits<Real>::quiet_NaN());
  }
  for (size_t i = 0; i < n; ++i) {
    p_c[i] = DimensionlessReal(0);
    p_sigma[i] = DimensionlessReal(0);
  }
  // Table 1, \mu = floor(\lambda/2):
  size_t mu = params.population_size/2;
  std::vector<DimensionlessReal> w_prime(params.population_size, std::numeric_limits<DimensionlessReal>::quiet_NaN());
  for (size_t i = 0; i < params.population_size; ++i) {
    // Equation (49), but 0-indexed:
    w_prime[i] = log(static_cast<DimensionlessReal>(params.population_size + 1)/(2*(i+1)));
  }
  // Table 1, notes at top:
  DimensionlessReal positive_weight_sum = 0;
  DimensionlessReal sq_weight_sum = 0;
  for (size_t i = 0; i < mu; ++i) {
    BOOST_MATH_ASSERT(w_prime[i] > 0);
    positive_weight_sum += w_prime[i];
    sq_weight_sum += w_prime[i]*w_prime[i];
  }
  // \mu_{eff} := variance effective selection mass; 1 <= mu_eff <= mu:
  DimensionlessReal mu_eff = positive_weight_sum*positive_weight_sum/sq_weight_sum;
  BOOST_MATH_ASSERT(1 <= mu_eff);
  BOOST_MATH_ASSERT(mu_eff <= mu);
  DimensionlessReal negative_weight_sum = 0;
  sq_weight_sum = 0;
  for (size_t i = mu; i < params.population_size; ++i) {
    BOOST_MATH_ASSERT(w_prime[i] <= 0);
    negative_weight_sum += w_prime[i];
    sq_weight_sum += w_prime[i]*w_prime[i];
  }
  DimensionlessReal mu_eff_m = negative_weight_sum*negative_weight_sum/sq_weight_sum;
  // Equation (54):
  DimensionlessReal c_m = params.learning_rate;
  // Equation (55):
  DimensionlessReal c_sigma = (mu_eff + 2)/(n + mu_eff + 5);
  BOOST_MATH_ASSERT(c_sigma < 1);
  DimensionlessReal d_sigma = 1 + 2*(max)(DimensionlessReal(0), sqrt(DimensionlessReal((mu_eff - 1)/(n + 1))) - DimensionlessReal(1)) + c_sigma;
  // Equation (56):
  DimensionlessReal c_c = (4 + mu_eff/n)/(n + 4 + 2*mu_eff/n);
  BOOST_MATH_ASSERT(c_c <= 1);
  // Equation (57):
  DimensionlessReal c_1 = DimensionlessReal(2)/(pow(n + 1.3, 2) + mu_eff);
  // Equation (58)
  DimensionlessReal c_mu = (min)(1 - c_1, 2*(DimensionlessReal(0.25)  + mu_eff  + 1/mu_eff - 2)/((n+2)*(n+2) + mu_eff));
  BOOST_MATH_ASSERT(c_1 + c_mu <= DimensionlessReal(1));
  // Equation (50):
  DimensionlessReal alpha_mu_m = 1 + c_1/c_mu;
  // Equation (51):
  DimensionlessReal alpha_mu_eff_m = 1 + 2*mu_eff_m/(mu_eff + 2);
  // Equation (52):
  DimensionlessReal alpha_m_pos_def = (1- c_1 - c_mu)/(n*c_mu);
  // Equation (53):
  std::vector<DimensionlessReal> weights(params.population_size, std::numeric_limits<DimensionlessReal>::quiet_NaN());
  for (size_t i = 0; i < mu; ++i) {
    weights[i] = w_prime[i]/positive_weight_sum;
  }
  DimensionlessReal min_alpha = (min)(alpha_mu_m, (min)(alpha_mu_eff_m, alpha_m_pos_def));
  for (size_t i = mu; i < params.population_size; ++i) {
    weights[i] = min_alpha*w_prime[i]/abs(negative_weight_sum);
  }
  // mu:= number of parents, lambda := number of offspring.
  Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic> C = Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic>::Identity(n, n);
  ArgumentContainer mean_vector;
  // See the footnote in Figure 6 of the arxiv review:
  // We should consider the more robust initialization described there. . . 
  Real sigma = DimensionlessReal(0.3)*(params.upper_bounds[0] - params.lower_bounds[0]);
  if (params.initial_guess) {
    mean_vector = *params.initial_guess;
  }
  else {
    mean_vector = detail::random_initial_population(params.lower_bounds, params.upper_bounds, 1, gen)[0];
  }
  auto initial_cost = cost_function(mean_vector);
  if (!isnan(initial_cost)) {
    best_vector = mean_vector;
    lowest_cost = initial_cost;
    if (current_minimum_cost) {
      *current_minimum_cost = initial_cost;
    }
  }
#if BOOST_MATH_DEBUG_CMA_ES
  {
    std::cout << __FILE__ << ":" << __LINE__ << ":" << __func__ << "\n";
    std::cout << "\tRunning a (" << params.population_size/2 << "/" << params.population_size/2 << "_W, " << params.population_size << ")-aCMA Evolutionary Strategy.\n";
    std::cout << "\tInitial mean vector: {";
    for (size_t i = 0; i < n - 1; ++i) {
      std::cout << mean_vector[i] << ", ";
    }
    std::cout << mean_vector[n - 1] << "}.\n";
    std::cout << "\tCost: " << lowest_cost << ".\n";
    std::cout << "\tInitial step length: " << sigma << ".\n";
    std::cout << "\tVariance effective selection mass: " << mu_eff << ".\n";
    std::cout << "\tLearning rate for rank-one update of covariance matrix: " << c_1 << ".\n";
    std::cout << "\tLearning rate for rank-mu update of covariance matrix: " << c_mu << ".\n";
    std::cout << "\tDecay rate for cumulation path for step-size control: " << c_sigma << ".\n";
    std::cout << "\tLearning rate for the mean: " << c_m << ".\n";
    std::cout << "\tDamping parameter for step-size update: " << d_sigma << ".\n";
  }
#endif
  size_t generation = 0;
  std::vector<Eigen::Vector<DimensionlessReal, Eigen::Dynamic>> ys(params.population_size);
  std::vector<ArgumentContainer> xs(params.population_size);
  std::vector<ResultType> costs(params.population_size, std::numeric_limits<ResultType>::quiet_NaN());
  Eigen::Vector<DimensionlessReal, Eigen::Dynamic> weighted_avg_y(n);
  Eigen::Vector<DimensionlessReal, Eigen::Dynamic> z(n);
  if constexpr (detail::has_resize_v<ArgumentContainer>) {
    for (auto & x : xs) {
      x.resize(n, std::numeric_limits<Real>::quiet_NaN());
    }
  }
  for (auto & y : ys) {
    y.resize(n);
  }
  normal_distribution<DimensionlessReal> dis(DimensionlessReal(0), DimensionlessReal(1));
  do {
    if (cancellation && *cancellation) {
      break;
    }
    // TODO: The reference contends the following in
    // Section B.2 "Strategy internal numerical effort":
    // "In practice, the re-calculation of B and D needs to be done not until about
    // max(1, floor(1/(10n(c_1+c_mu)))) generations."
    // Note that sigma can be dimensionless, in which case C carries the units.
    // This is a weird decision-we're not gonna do that!
    Eigen::SelfAdjointEigenSolver<Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic>> eigensolver(C);
    if (eigensolver.info() != Eigen::Success) {
      std::ostringstream oss;
      oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
      oss << ": Could not decompose the covariance matrix as BDB^{T}.";
      throw std::logic_error(oss.str());
    }
    Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic> B = eigensolver.eigenvectors();
    // Eigen returns D^2, in the notation of the survey:
    auto D = eigensolver.eigenvalues();
    // So take the square root to recover D:
    for (auto & d : D) {
      if (d <= 0 || isnan(d)) {
        std::ostringstream oss;
        oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
        oss << ": The covariance matrix is not positive definite. This breaks the evolution path computation downstream.\n";
        oss << "C=\n" << C << "\n";
        oss << "Eigenvalues: " << D;
        throw std::domain_error(oss.str());
      }
      d = sqrt(d);
    }
    for (size_t k = 0; k < params.population_size; ++k) {
      auto & y = ys[k];
      auto & x = xs[k];
      BOOST_MATH_ASSERT(static_cast<size_t>(x.size()) == n);
      BOOST_MATH_ASSERT(static_cast<size_t>(y.size()) == n);
      size_t resample_counter = 0;
      do {
        // equation (39) of Figure 6:
        // Also see equation (4):
        for (size_t i = 0; i < n; ++i) {
          z[i] = dis(gen);
        }
        Eigen::Vector<DimensionlessReal, Eigen::Dynamic> Dz(n);
        for (size_t i = 0; i < n; ++i) {
          Dz[i] = D[i]*z[i];
        }
        y = B*Dz;
        for (size_t i = 0; i < n; ++i) {
          BOOST_MATH_ASSERT(!isnan(mean_vector[i]));
          BOOST_MATH_ASSERT(!isnan(y[i]));
          x[i] = mean_vector[i] + sigma*y[i]; // equation (40) of Figure 6.
        }
        costs[k] = cost_function(x);
        if (resample_counter++ == 50) {
          std::ostringstream oss;
          oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
          oss << ": 50 resamples was not sufficient to find an argument to the cost function which did not return NaN.";
          oss << " Giving up.";
          throw std::domain_error(oss.str());
        }
      } while (isnan(costs[k]));
      if (queries) {
        queries->emplace_back(std::make_pair(x, costs[k]));
      }
      if (costs[k] < lowest_cost) {
        lowest_cost = costs[k];
        best_vector = x;
        if (current_minimum_cost && costs[k] < *current_minimum_cost) {
          *current_minimum_cost = costs[k];
        }
        if (lowest_cost < target_value) {
          target_attained = true;
          break;
        }
      }
    }
    if (target_attained) {
      break;
    }
    if (cancellation && *cancellation) {
      break;
    }
    auto indices = detail::best_indices(costs);
    // Equation (41), Figure 6:
    for (size_t j = 0; j < n; ++j) {
      weighted_avg_y[j] = 0;
      for (size_t i = 0; i < mu; ++i) {
        BOOST_MATH_ASSERT(!isnan(weights[i]));
        BOOST_MATH_ASSERT(!isnan(ys[indices[i]][j]));
        weighted_avg_y[j] += weights[i]*ys[indices[i]][j];
      }
    }
    // Equation (42), Figure 6:
    for (size_t j = 0; j < n; ++j) {
      mean_vector[j] = mean_vector[j] + c_m*sigma*weighted_avg_y[j];
    }
    // Equation (43), Figure 6: Start with C^{-1/2}<y>_{w}
    Eigen::Vector<DimensionlessReal, Eigen::Dynamic> inv_D_B_transpose_y = B.transpose()*weighted_avg_y;
    for (long j = 0; j < inv_D_B_transpose_y.size(); ++j) {
      inv_D_B_transpose_y[j] /= D[j];
    }
    Eigen::Vector<DimensionlessReal, Eigen::Dynamic> C_inv_sqrt_y_avg = B*inv_D_B_transpose_y;
    // Equation (43), Figure 6:
    DimensionlessReal p_sigma_norm = 0;
    for (size_t j = 0; j < n; ++j) {
      p_sigma[j] = (1-c_sigma)*p_sigma[j] + sqrt(c_sigma*(2-c_sigma)*mu_eff)*C_inv_sqrt_y_avg[j];
      p_sigma_norm += p_sigma[j]*p_sigma[j];
    }
    p_sigma_norm = sqrt(p_sigma_norm);
    // A: Algorithm Summary: E[||N(0,1)||]:
    const DimensionlessReal expectation_norm_0I = sqrt(static_cast<DimensionlessReal>(n))*(DimensionlessReal(1) - DimensionlessReal(1)/(4*n) + DimensionlessReal(1)/(21*n*n));
    // Equation (44), Figure 6:
    sigma = sigma*exp(c_sigma*(p_sigma_norm/expectation_norm_0I -1)/d_sigma);
    // A: Algorithm Summary:
    DimensionlessReal h_sigma = 0;
    DimensionlessReal rhs = (DimensionlessReal(1.4) + DimensionlessReal(2)/(n+1))*expectation_norm_0I*sqrt(1 - pow(1-c_sigma, 2*(generation+1)));
    if (p_sigma_norm < rhs) {
      h_sigma = 1;
    }
    // Equation (45), Figure 6:
    p_c = (1-c_c)*p_c + h_sigma*sqrt(c_c*(2-c_c)*mu_eff)*weighted_avg_y;
    DimensionlessReal delta_h_sigma = (1-h_sigma)*c_c*(2-c_c);
    DimensionlessReal weight_sum = 0;
    for (auto & w : weights) {
      weight_sum += w;
    }
    // Equation (47), Figure 6:
    DimensionlessReal K = (1 + c_1*delta_h_sigma - c_1 - c_mu*weight_sum);
    // Can these operations be sped up using `.selfadjointView<Eigen::Upper>`?
    // Maybe: A.selfadjointView<Eigen::Lower>().rankUpdate(p_c, c_1);?
    C = K*C + c_1*p_c*p_c.transpose();
    // Incorporate positive weights of Equation (46):
    for (size_t i = 0; i < params.population_size/2; ++i) {
      C += c_mu*weights[i]*ys[indices[i]]*ys[indices[i]].transpose();
    }
    // Negative weights of Equation (46) are scaled by n/||C^{-1/2}y||^2:
    for (size_t i = params.population_size/2; i < params.population_size; ++i) {
      Eigen::Vector<DimensionlessReal, Eigen::Dynamic> D_inv_BTy = B.transpose()*ys[indices[i]];
      for (size_t j = 0; j < n; ++j) {
        D_inv_BTy[j] /= D[j];
      }
      DimensionlessReal squared_norm = D_inv_BTy.squaredNorm();
      DimensionlessReal K2 = c_mu*weights[i]/squared_norm;
      C += K2*ys[indices[i]]*ys[indices[i]].transpose();
    }
  } while (generation++ < params.max_generations);
  return best_vector;
}
} // namespace boost::math::optimization
#endif