// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <cstring>    // strcmp, strstr, strchr
#include <string>
#include <vector>
#include <algorithm>  // sort, random_shuffle, max
#include <fstream>
#include <memory>
#include <cstdio>
#include <unistd.h>   // sleep, unlink (this benchmark is POSIX-only anyway)
bool eigen_use_specific_block_size;
int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
#include <Eigen/Core>

#include <bench/BenchTimer.h>

using namespace Eigen;
using namespace std;

static BenchTimer timer;

// How many times we repeat each measurement.
// Measurements are randomly shuffled - we're not doing
// all N identical measurements in a row.
const int measurement_repetitions = 3;

// Timings below this value are too short to be accurate;
// we'll repeat measurements with more iterations until
// we get a timing above that threshold.
const float min_accurate_time = 1e-2f;

// See the --min-working-set-size command line parameter.
size_t min_working_set_size = 0;

// Highest clock-speed proxy measured so far (see measure_clock_speed()).
// Values are only meaningful relative to each other.
float max_clock_speed = 0.0f;

// Range of sizes that we will benchmark (in all 3 K,M,N dimensions).
const size_t maxsize = 2048;
const size_t minsize = 16;

typedef MatrixXf MatrixType;
typedef MatrixType::Scalar Scalar;
typedef internal::packet_traits<Scalar>::type Packet;

static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
static_assert(maxsize > minsize, "maxsize must be larger than minsize");
static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
// Just a helper to store a triple of K,M,N sizes for a matrix product.
struct size_triple_t
{
  size_t k, m, n;
  size_triple_t() : k(0), m(0), n(0) {}
  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
  size_triple_t(uint16_t compact)
  {
    k = 1 << ((compact & 0xf00) >> 8);
    m = 1 << ((compact & 0x0f0) >> 4);
    n = 1 << ((compact & 0x00f) >> 0);
  }
};

uint8_t log2_pot(size_t x) {
  size_t l = 0;
  while (x >>= 1) l++;
  return l;
}

// Convert between size triples and a compact form fitting in 12 bits,
// where each size, which must be a POT, is encoded as its log2 in 4 bits,
// so the largest representable size is 2^15 == 32k ... big enough.
uint16_t compact_size_triple(size_t k, size_t m, size_t n)
{
  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
}

uint16_t compact_size_triple(const size_triple_t& t)
{
  return compact_size_triple(t.k, t.m, t.n);
}
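// For example, compact_size_triple(256, 1024, 64) packs the log2 values
// (8, 10, 6) into 0x8a6, and size_triple_t(0x8a6) decodes back to
// k = 256, m = 1024, n = 64.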

// A single benchmark. Initially only contains benchmark params.
// Then call run(), which stores the result in the gflops field.
struct benchmark_t
{
  uint16_t compact_product_size;
  uint16_t compact_block_size;
  bool use_default_block_size;
  float gflops;
  benchmark_t()
    : compact_product_size(0)
    , compact_block_size(0)
    , use_default_block_size(false)
    , gflops(0)
  {
  }
  benchmark_t(size_t pk, size_t pm, size_t pn,
              size_t bk, size_t bm, size_t bn)
    : compact_product_size(compact_size_triple(pk, pm, pn))
    , compact_block_size(compact_size_triple(bk, bm, bn))
    , use_default_block_size(false)
    , gflops(0)
  {}
  benchmark_t(size_t pk, size_t pm, size_t pn)
    : compact_product_size(compact_size_triple(pk, pm, pn))
    , compact_block_size(0)
    , use_default_block_size(true)
    , gflops(0)
  {}

  void run();
};

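// Each benchmark is printed as "<compact product size in hex> <compact block
// size in hex, or default(k, m, n)> <gflops>", e.g. "aaa 888 12.34"
// (hypothetical values).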
ostream& operator<<(ostream& s, const benchmark_t& b)
{
  s << hex << b.compact_product_size << dec;
  if (b.use_default_block_size) {
    size_triple_t t(b.compact_product_size);
    Index k = t.k, m = t.m, n = t.n;
    internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
    s << " default(" << k << ", " << m << ", " << n << ")";
  } else {
    s << " " << hex << b.compact_block_size << dec;
  }
  s << " " << b.gflops;
  return s;
}

// We sort first by increasing benchmark parameters,
// then by decreasing performance.
bool operator<(const benchmark_t& b1, const benchmark_t& b2)
{
  return b1.compact_product_size < b2.compact_product_size ||
           (b1.compact_product_size == b2.compact_product_size && (
             (b1.compact_block_size < b2.compact_block_size || (
               b1.compact_block_size == b2.compact_block_size &&
                 b1.gflops > b2.gflops))));
}

void benchmark_t::run()
{
  size_triple_t productsizes(compact_product_size);

  if (use_default_block_size) {
    eigen_use_specific_block_size = false;
  } else {
    // feed Eigen with our custom blocking params
    eigen_use_specific_block_size = true;
    size_triple_t blocksizes(compact_block_size);
    eigen_block_size_k = blocksizes.k;
    eigen_block_size_m = blocksizes.m;
    eigen_block_size_n = blocksizes.n;
  }

  // set up the matrix pool

  const size_t combined_three_matrices_sizes =
    sizeof(Scalar) *
      (productsizes.k * productsizes.m +
       productsizes.k * productsizes.n +
       productsizes.m * productsizes.n);

  // 64 MB is large enough that nobody has a cache bigger than that,
  // while still being small enough that everybody has this much RAM,
  // so conveniently we don't need to special-case platforms here.
  const size_t unlikely_large_cache_size = 64 << 20;

  const size_t working_set_size =
    min_working_set_size ? min_working_set_size : unlikely_large_cache_size;

  const size_t matrix_pool_size =
    1 + working_set_size / combined_three_matrices_sizes;

  MatrixType *lhs = new MatrixType[matrix_pool_size];
  MatrixType *rhs = new MatrixType[matrix_pool_size];
  MatrixType *dst = new MatrixType[matrix_pool_size];

  for (size_t i = 0; i < matrix_pool_size; i++) {
    lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
    rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
    dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
  }

  // main benchmark loop

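  // Cycle through the matrix pool so that consecutive products touch different
  // matrices; with the working set sized to outsize the caches (see above),
  // this lowers the chance of measuring with warm caches.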
  int iters_at_a_time = 1;
  float time_per_iter = 0.0f;
  size_t matrix_index = 0;
  while (true) {

    double starttime = timer.getCpuTime();
    for (int i = 0; i < iters_at_a_time; i++) {
      dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
      matrix_index++;
      if (matrix_index == matrix_pool_size) {
        matrix_index = 0;
      }
    }
    double endtime = timer.getCpuTime();

    const float timing = float(endtime - starttime);

    if (timing >= min_accurate_time) {
      time_per_iter = timing / iters_at_a_time;
      break;
    }

    // The timing was too short to be accurate: double the iteration count and retry.
    iters_at_a_time *= 2;
  }

  delete[] lhs;
  delete[] rhs;
  delete[] dst;

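  // Each of the k*m*n multiply-accumulate operations counts as 2 floating-point
  // ops, hence the 2e-9 factor converting ops per second to GFlop/s.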
  gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
}

void print_cpuinfo()
{
#ifdef __linux__
  cout << "contents of /proc/cpuinfo:" << endl;
  string line;
  ifstream cpuinfo("/proc/cpuinfo");
  if (cpuinfo.is_open()) {
    while (getline(cpuinfo, line)) {
      cout << line << endl;
    }
    cpuinfo.close();
  }
  cout << endl;
#elif defined __APPLE__
  cout << "output of sysctl hw:" << endl;
  system("sysctl hw");
  cout << endl;
#endif
}

template <typename T>
string type_name()
{
  return "unknown";
}

template<>
string type_name<float>()
{
  return "float";
}

template<>
string type_name<double>()
{
  return "double";
}

struct action_t
{
  virtual const char* invokation_name() const { abort(); return nullptr; }
  virtual void run() const { abort(); }
  virtual ~action_t() {}
};

void show_usage_and_exit(int /*argc*/, char* argv[],
                         const vector<unique_ptr<action_t>>& available_actions)
{
  cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
  cerr << "available actions:" << endl << endl;
  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
    cerr << "  " << (*it)->invokation_name() << endl;
  }
  cerr << endl;
  cerr << "options:" << endl << endl;
  cerr << "  --min-working-set-size=N:" << endl;
  cerr << "       Set the minimum working set size to N bytes." << endl;
  cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
  cerr << "       A larger working set lowers the chance of a warm cache." << endl;
  cerr << "       The default value 0 means use a large enough working" << endl;
  cerr << "       set to likely outsize caches." << endl;
  cerr << "       A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
  cerr << "       avoid warm caches." << endl;
  exit(1);
}
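// Example invocation (the binary name depends on how this file is built):
//   ./benchmark-blocking-sizes all-pot-sizes --min-working-set-size=1000000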

float measure_clock_speed()
{
  cerr << "Measuring clock speed...                              \r" << flush;

  vector<float> all_gflops;
  for (int i = 0; i < 8; i++) {
    benchmark_t b(1024, 1024, 1024);
    b.run();
    all_gflops.push_back(b.gflops);
  }

  // Keep only the middle 4 of the 8 sorted measurements, to reject outliers.
  sort(all_gflops.begin(), all_gflops.end());
  float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];

  // Multiply by an arbitrary constant to discourage doing anything with the
  // returned values besides just comparing them with each other.
  float result = stable_estimate * 123.456f;

  return result;
}

struct human_duration_t
{
  int seconds;
  human_duration_t(int s) : seconds(s) {}
};

ostream& operator<<(ostream& s, const human_duration_t& d)
{
  int remainder = d.seconds;
  if (remainder > 3600) {
    int hours = remainder / 3600;
    s << hours << " h ";
    remainder -= hours * 3600;
  }
  if (remainder > 60) {
    int minutes = remainder / 60;
    s << minutes << " min ";
    remainder -= minutes * 60;
  }
  if (d.seconds < 600) {
    s << remainder << " s";
  }
  return s;
}

const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";

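// A session file stores, in this order: max_clock_speed, the number of
// benchmarks, the index of the first benchmark still to run, and then the
// raw array of benchmark_t values.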
void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
{
  FILE* file = fopen(filename, "w");
  if (!file) {
    cerr << "Could not open file " << filename << " for writing." << endl;
    cerr << "Do you have write permissions on that location?" << endl;
    exit(1);
  }
  size_t benchmarks_vector_size = benchmarks.size();
  fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
  fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
  fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
  fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
  fclose(file);
}

bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
{
  FILE* file = fopen(filename, "r");
  if (!file) {
    return false;
  }
  if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
    fclose(file);
    return false;
  }
  size_t benchmarks_vector_size = 0;
  if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
    fclose(file);
    return false;
  }
  if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
    fclose(file);
    return false;
  }
  benchmarks.resize(benchmarks_vector_size);
  if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
    fclose(file);
    return false;
  }
  fclose(file);
  unlink(filename);
  return true;
}

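// Runs benchmarks starting at first_benchmark_to_run. Returns when all of them
// are done, or early when a clock-speed check detects throttling or a speed
// increase; in that case first_benchmark_to_run tells the caller's loop in
// run_benchmarks() where to resume.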
void try_run_some_benchmarks(
    vector<benchmark_t>& benchmarks,
    double time_start,
    size_t& first_benchmark_to_run)
{
  if (first_benchmark_to_run == benchmarks.size()) {
    return;
  }

  double time_last_progress_update = 0;
  double time_last_clock_speed_measurement = 0;
  double time_now = 0;

  size_t benchmark_index = first_benchmark_to_run;

  while (true) {
    float ratio_done = float(benchmark_index) / benchmarks.size();
    time_now = timer.getRealTime();

    // We check clock speed every minute and at the end.
    if (benchmark_index == benchmarks.size() ||
        time_now > time_last_clock_speed_measurement + 60.0f)
    {
      time_last_clock_speed_measurement = time_now;

      // Ensure that clock speed is as expected.
      float current_clock_speed = measure_clock_speed();

      // The tolerance needs to be smaller than the relative difference between
      // clock speeds that a device could operate under.
      // It seems unlikely that a device would be throttling clock speeds by
      // amounts smaller than 2%.
      // With a value of 1%, I was getting within noise on a Sandy Bridge.
      const float clock_speed_tolerance = 0.02f;

      if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
        // Clock speed is now higher than we previously measured.
        // Either our initial measurement was inaccurate, which won't happen
        // too many times as we are keeping the best clock speed value and
        // allowing some tolerance; or something really weird happened,
        // which invalidates all benchmark results collected so far.
        // Either way, we'd better restart all over again now.
        if (benchmark_index) {
          cerr << "Restarting at " << 100.0f * ratio_done
               << " % because clock speed increased.          " << endl;
        }
        max_clock_speed = current_clock_speed;
        first_benchmark_to_run = 0;
        return;
      }

      bool rerun_last_tests = false;

      if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
        cerr << "Measurements completed so far: "
             << 100.0f * ratio_done
             << " %                             " << endl;
        cerr << "Clock speed seems to be only "
             << current_clock_speed/max_clock_speed
             << " times what it used to be." << endl;

        unsigned int seconds_to_sleep_if_lower_clock_speed = 1;

        while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
          if (seconds_to_sleep_if_lower_clock_speed > 32) {
            cerr << "Sleeping longer probably won't make a difference." << endl;
            cerr << "Serializing benchmarks to " << session_filename << endl;
            serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
            cerr << "Now restart this benchmark, and it should pick up where we left off." << endl;
            exit(2);
          }
          rerun_last_tests = true;
          cerr << "Sleeping "
               << seconds_to_sleep_if_lower_clock_speed
               << " s...                                   \r" << endl;
          sleep(seconds_to_sleep_if_lower_clock_speed);
          current_clock_speed = measure_clock_speed();
          seconds_to_sleep_if_lower_clock_speed *= 2;
        }
      }

      if (rerun_last_tests) {
        cerr << "Redoing the last "
             << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
             << " % because clock speed had been low.   " << endl;
        return;
      }

      // Nothing was wrong with the clock speed so far, so benchmarks run up to
      // this point won't need to be rerun if we later encounter a lower clock
      // speed: advance the resume checkpoint.
      first_benchmark_to_run = benchmark_index;
    }

    if (benchmark_index == benchmarks.size()) {
      // We're done!
      first_benchmark_to_run = benchmarks.size();
      // Erase progress info
      cerr << "                                                            " << endl;
      return;
    }

    // Display progress info on stderr
    if (time_now > time_last_progress_update + 1.0f) {
      time_last_progress_update = time_now;
      cerr << "Measurements... " << 100.0f * ratio_done
           << " %, ETA "
           << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
           << "                          \r" << flush;
    }

    // This is where we actually run a benchmark!
    benchmarks[benchmark_index].run();
    benchmark_index++;
  }
}

void run_benchmarks(vector<benchmark_t>& benchmarks)
{
  size_t first_benchmark_to_run;
  vector<benchmark_t> deserialized_benchmarks;
  bool use_deserialized_benchmarks = false;
  if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
    cerr << "Found serialized session with "
         << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
         << " % already done" << endl;
    if (deserialized_benchmarks.size() == benchmarks.size() &&
        first_benchmark_to_run > 0 &&
        first_benchmark_to_run < benchmarks.size())
    {
      use_deserialized_benchmarks = true;
    }
  }

  if (use_deserialized_benchmarks) {
    benchmarks = deserialized_benchmarks;
  } else {
    // Not using deserialized benchmarks, starting from scratch.
    first_benchmark_to_run = 0;

    // Randomly shuffling benchmarks allows us to get accurate enough progress info,
    // as now the cheap/expensive benchmarks are randomly mixed so they average out.
    // It also means that if data is corrupted for some time span, the odds are that
    // not all repetitions of a given benchmark will be corrupted.
    random_shuffle(benchmarks.begin(), benchmarks.end());
  }

  for (int i = 0; i < 4; i++) {
    max_clock_speed = max(max_clock_speed, measure_clock_speed());
  }

  double time_start = 0.0;
  while (first_benchmark_to_run < benchmarks.size()) {
    if (first_benchmark_to_run == 0) {
      time_start = timer.getRealTime();
    }
    try_run_some_benchmarks(benchmarks,
                            time_start,
                            first_benchmark_to_run);
  }

  // Sort timings by increasing benchmark parameters, and decreasing gflops.
  // The latter is very important. It means that we can ignore all but the first
  // benchmark with given parameters.
  sort(benchmarks.begin(), benchmarks.end());

  // Collect the best (i.e. now first) result for each set of parameter values.
  vector<benchmark_t> best_benchmarks;
  for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
    if (best_benchmarks.empty() ||
        best_benchmarks.back().compact_product_size != it->compact_product_size ||
        best_benchmarks.back().compact_block_size != it->compact_block_size)
    {
      best_benchmarks.push_back(*it);
    }
  }

  // Keep and return only the best benchmarks.
  benchmarks = best_benchmarks;
}

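// With minsize = 16 and maxsize = 2048 there are 8 POT sizes per dimension, so
// "all-pot-sizes" enqueues (1 + 2 + ... + 8)^3 = 46656 product/block combinations
// per repetition, i.e. 139968 benchmarks with measurement_repetitions = 3.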
struct measure_all_pot_sizes_action_t : action_t
{
  virtual const char* invokation_name() const { return "all-pot-sizes"; }
  virtual void run() const
  {
    vector<benchmark_t> benchmarks;
    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
            for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
              for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
                for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
                  benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
                }
              }
            }
          }
        }
      }
    }

    run_benchmarks(benchmarks);

    cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
      cout << *it << endl;
    }
  }
};

struct measure_default_sizes_action_t : action_t
{
  virtual const char* invokation_name() const { return "default-sizes"; }
  virtual void run() const
  {
    vector<benchmark_t> benchmarks;
    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
            benchmarks.emplace_back(ksize, msize, nsize);
          }
        }
      }
    }

    run_benchmarks(benchmarks);

    cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
      cout << *it << endl;
    }
  }
};

int main(int argc, char* argv[])
{
  double time_start = timer.getRealTime();
  cout.precision(4);
  cerr.precision(4);

  vector<unique_ptr<action_t>> available_actions;
  available_actions.emplace_back(new measure_all_pot_sizes_action_t);
  available_actions.emplace_back(new measure_default_sizes_action_t);

  auto action = available_actions.end();

  if (argc <= 1) {
    show_usage_and_exit(argc, argv, available_actions);
  }
  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
    if (!strcmp(argv[1], (*it)->invokation_name())) {
      action = it;
      break;
    }
  }

  if (action == available_actions.end()) {
    show_usage_and_exit(argc, argv, available_actions);
  }

  for (int i = 2; i < argc; i++) {
    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
      const char* equals_sign = strchr(argv[i], '=');
      min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
    } else {
      cerr << "unrecognized option: " << argv[i] << endl << endl;
      show_usage_and_exit(argc, argv, available_actions);
    }
  }

  print_cpuinfo();

  cout << "benchmark parameters:" << endl;
  cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
  cout << "scalar type: " << type_name<Scalar>() << endl;
  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
  cout << "minsize = " << minsize << endl;
  cout << "maxsize = " << maxsize << endl;
  cout << "measurement_repetitions = " << measurement_repetitions << endl;
  cout << "min_accurate_time = " << min_accurate_time << endl;
  cout << "min_working_set_size = " << min_working_set_size;
  if (min_working_set_size == 0) {
    cout << " (try to outsize caches)";
  }
  cout << endl << endl;

  (*action)->run();

  double time_end = timer.getRealTime();
  cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
}