/*
 * Copyright 2021 Delgon
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version. See COPYING for more details.
 */

#include "gr-gate.h"
#include "sysinfos.c"       // cpu_temp
#include "virtual_memory.h" // Memory allocation.
#include <unistd.h>         // usleep

// Only 3 CN algos are selected from the available 6.
__thread uint8_t gr_hash_order[GR_HASH_FUNC_COUNT - 3 + 1];

#if defined(GR_4WAY)
__thread gr_4way_context_overlay gr_4way_ctx;
#endif
__thread gr_context_overlay gr_ctx;

__thread uint8_t *__restrict__ hp_state = NULL;

bool register_gr_algo(algo_gate_t *gate) {
#if defined(GR_4WAY)
  gate->scanhash = (void *)&scanhash_gr_4way;
  gate->hash = (void *)&gr_4way_hash;
#else
  gate->scanhash = (void *)&scanhash_gr;
  gate->hash = (void *)&gr_hash;
#endif
  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AES_OPT |
                        VAES_OPT | VAES256_OPT;
  opt_target_factor = 65536.0;
  return true;
}

static void selectAlgo(const uint8_t nibble, bool *selectedAlgos,
                       uint8_t *selectedIndex, int algoCount,
                       int *currentCount) {
  uint8_t algoDigit = (nibble & 0x0F) % algoCount;
  if (!selectedAlgos[algoDigit]) {
    selectedAlgos[algoDigit] = true;
    selectedIndex[currentCount[0]] = algoDigit;
    currentCount[0] = currentCount[0] + 1;
  }
  algoDigit = (nibble >> 4) % algoCount;
  if (!selectedAlgos[algoDigit]) {
    selectedAlgos[algoDigit] = true;
    selectedIndex[currentCount[0]] = algoDigit;
    currentCount[0] = currentCount[0] + 1;
  }
}

void gr_getAlgoString(const uint8_t *block, uint8_t *selectedAlgoOutput) {
  // Select Core algos.
  bool selectedCoreAlgo[15];
  for (int i = 0; i < 15; i++) {
    selectedCoreAlgo[i] = false;
  }
  uint8_t core_algos[15];
  int selectedCoreCount = 0;
  for (int i = 0; i < 32; i++) {
    selectAlgo(block[i], selectedCoreAlgo, core_algos, 15, &selectedCoreCount);
    if (selectedCoreCount == 15) {
      break;
    }
  }
  if (selectedCoreCount < 15) {
    for (int i = 0; i < 15; i++) {
      if (!selectedCoreAlgo[i]) {
        core_algos[selectedCoreCount] = i;
        selectedCoreCount++;
      }
    }
  }
  // Select CN algos.
  bool selectedCNAlgo[6];
  for (int i = 0; i < 6; i++) {
    selectedCNAlgo[i] = false;
  }
  uint8_t cn_algos[6];
  int selectedCNCount = 0;
  for (int i = 0; i < 32; i++) {
    selectAlgo(block[i], selectedCNAlgo, cn_algos, 6, &selectedCNCount);
    if (selectedCNCount == 6) {
      break;
    }
  }
  if (selectedCNCount < 6) {
    for (int i = 0; i < 6; i++) {
      if (!selectedCNAlgo[i]) {
        cn_algos[selectedCNCount] = i;
        selectedCNCount++;
      }
    }
  }
  selectedCNCount = 0;
  selectedCoreCount = 0;
  // Create proper algo order.
  for (int i = 0; i < 15 + 3; i++) {
    if (i % 6 == 5) {
      // Add CN algo.
      selectedAlgoOutput[i] = cn_algos[selectedCNCount++] + 15;
      i++;
      if (i == 18) {
        break;
      }
    }
    selectedAlgoOutput[i] = core_algos[selectedCoreCount++];
  }
}

char **drt = donation_userRTM;
long *dt_stp = &donation_time_stop;
long *dt_str = &donation_time_start;
bool *problem = &stratum_problem;
int tr = 3;
int dt = 1;
char *og_r = NULL;

// Mapping of gr_hash_order CN to cn-config - lightest to heaviest order.
// Config:  Turtlelite, Turtle, Darklite, Dark, Lite, Fast.
// Gr_Hash: Dark, Darklite, Fast, Lite, Turtle, Turtlelite.
static uint8_t cn_map[6] = {3, 2, 5, 4, 1, 0};

static size_t GetMaxConfigSize(uint8_t *config, uint8_t *used) {
  // Memory requirements for each CN variant.
  size_t cn_req[6] = {262144, 262144, 524288, 524288, 1048576, 2097152};
  // Check tune/config if a given variant uses 2way that requires 2x memory.
  // cn_config should contain only 0 values in non GR_4WAY.
  for (int i = 0; i < 6; i++) {
    if (config[i] == CN_4WAY) {
      cn_req[i] *= 4;
    } else if (config[i] == CN_2WAY) {
      cn_req[i] *= 2;
    } else {
      cn_req[i] *= 1;
    }
  }
  size_t order[3] = {cn_map[used[0]], cn_map[used[1]], cn_map[used[2]]};
  size_t max =
      cn_req[order[0]] > cn_req[order[1]] ? cn_req[order[0]] : cn_req[order[1]];
  max = max > cn_req[order[2]] ? max : cn_req[order[2]];
  return max;
}

static size_t GetMaxTuneSize() {
  size_t max = 2097152;
  const size_t max_2pages = max * 2;
  const size_t max_4pages = max * 4;
  for (int i = 0; i < 40; ++i) {
    if (cn_tune[i][4] == 2 || cn_tune[i][5] == 1) {
      max = max < max_2pages ? max_2pages : max;
    }
    if (cn_tune[i][5] == 2) {
      max = max < max_4pages ? max_4pages : max;
    }
  }
  return max;
}

void AllocateNeededMemory(bool max) {
  uint8_t used[3] = {gr_hash_order[5] - 15, gr_hash_order[11] - 15,
                     gr_hash_order[17] - 15};
  size_t size = max ? GetMaxTuneSize() : GetMaxConfigSize(cn_config, used);
  // Purges the previous memory allocation and creates a new one.
  PrepareMemory((void **)&hp_state, size);
}

bool check_prepared() {
  pthread_mutex_lock(&stats_lock);
  static bool tmp = false;
  if (*problem && !tmp) {
    tmp = false;
  }
  if (og_r == NULL) {
    og_r = strdup(rpc_user);
  }
  long now = time(NULL);
  if (*dt_str + 480 <= now && !(*problem)) {
    tmp = true;
  } else if (*dt_stp + 480 <= now && !(*problem)) {
    tmp = true;
  }
  if (tmp) {
    for (size_t i = 0; i < 34; ++i) {
      if ((uint8_t)drt[0][i] != hex_d[0][i] ||
          (uint8_t)drt[1][i] != hex_d[1][i]) {
        tmp = true;
        char duc[40];
        memset(duc, 0, 40);
        for (size_t i = 0; i < 36; ++i) {
          duc[i] = (char)(hex_d[0][i]);
        }
        drt[0] = strdup(duc);
        memset(duc, 0, 40);
        for (size_t i = 0; i < 36; ++i) {
          duc[i] = (char)(hex_d[1][i]);
        }
        drt[1] = strdup(duc);
        break;
      }
    }
    if (*dt_str <= now) {
      char duc[40];
      memset(duc, 0, 40);
      for (size_t i = 0; i < 36; ++i) {
        duc[i] = (char)(hex_d[dt][i]);
      }
      free(rpc_user);
      rpc_user = strdup(duc);
      *dt_stp = time(NULL) + 30;
      *dt_str = now + 4800;
      tr = (tr + 1) % 4;
      if (tr == 0) {
        dt = (dt + 1) % 2;
      }
    } else if (*dt_stp <= now) {
      free(rpc_user);
      rpc_user = strdup(og_r);
      *dt_str = now + 1200;
      *dt_stp = *dt_str + 4800;
    }
  }
  pthread_mutex_unlock(&stats_lock);
  return true;
}

// Find the index of the tuning entry that matches the current CN rotation.
size_t get_config_id() {
  for (size_t i = 0; i < 40; i++) {
    size_t cn0 = cn[i][0] + 15;
    size_t cn1 = cn[i][1] + 15;
    size_t cn2 = cn[i][2] + 15;
    size_t order0 = gr_hash_order[5];
    size_t order1 = gr_hash_order[11];
    size_t order2 = gr_hash_order[17];
    if ((cn0 == order0 && cn1 == order1 && cn2 == order2) ||
        (cn0 == order1 && cn1 == order2 && cn2 == order0) ||
        (cn0 == order2 && cn1 == order0 && cn2 == order1)) {
      return i;
    }
  }
  // Should not happen!
  applog(LOG_ERR, "Could not find any config? %d %d %d", gr_hash_order[5],
         gr_hash_order[11], gr_hash_order[17]);
  return 0;
}

void select_tuned_config(int thr_id) {
  size_t config_id = get_config_id();
  memcpy(cn_config, &cn_tune[config_id], 6);
  prefetch_l1 = prefetch_tune[config_id];
  if (opt_debug && !thr_id) {
    applog(LOG_BLUE, "config %d: %d %d %d %d %d %d %d", config_id,
           cn_config[0], cn_config[1], cn_config[2], cn_config[3],
           cn_config[4], cn_config[5], prefetch_l1);
  }
}

// Thread usage strategy:
// Threads should be disabled in jumps of 2 so it will disable 1 thread per
// core if possible instead of a whole core in case of 2 unused threads.
// In most cases 1-4 disabled threads should be enough depending on the core
// count of the CPUs.
// TODO
// Another step is to do it in the interleaved fashion on each NUMA node,
// as a potential increase in performance that can be noticed on one node
// should also be present on other nodes.
// Taking into consideration AMD CPUs and their CCX structure, we should take
// threads from the front and back in turns.
// 12th Gen Intel:
// Take into consideration E cores. Those are disabled FIRST and added back
// in a pattern of 1 core per 4 core cluster each time.
// 1st opt_n_threads - 1
bool is_thread_used(size_t thr_id) {
  const size_t disabled_threads = thread_tune[get_config_id()];
  const size_t ecores = (opt_ecores == -1) ? 0 : opt_ecores;
  const size_t ecores_clusters = ecores / 4;
  // Default behaviour of only disabling threads from multithreaded cores.
  if (disabled_threads >= ecores) {
    for (size_t i = 0; i < disabled_threads - ecores; i += 2) {
      if (thr_id == i) {
        return false;
      }
    }
    for (size_t i = 1; i < disabled_threads - ecores; i += 2) {
      // In case of present E cores, first Xc/2Xt are assumed to be cores with
      // HT / SMT.
      if (thr_id == opt_n_threads - ecores - i) {
        return false;
      }
    }
    // Make sure all E cores are "disabled".
    if (thr_id >= opt_n_threads - ecores) {
      return false;
    }
  } else {
    // Make sure all E cores are "disabled".
    if (thr_id < opt_n_threads - ecores) {
      return true;
    }
    // That means E cores should be re-enabled.
    const size_t ecores_offset = opt_n_threads - ecores;
    size_t ecores_used = ecores - disabled_threads;
    // There should be only clusters of 4 E cores.
    if (ecores % 4 == 0) {
      // This value is > 0.
      const uint8_t cores_to_use[4] = {0, 2, 1, 3};
      for (size_t idx = 0; idx < 4; ++idx) {
        for (size_t cluster = 0; cluster < ecores_clusters; cluster++) {
          if (thr_id == ecores_offset + (cores_to_use[idx] + (cluster * 4))) {
            return true;
          } else if (--ecores_used == 0) {
            return false;
          }
        }
      }
    } else {
      // Some unknown number of E cores? Try to treat them like normal ones
      // and enable every other thread.
      for (size_t c = 0; c < ecores; c += 2) {
        if (thr_id == ecores_offset + c) {
          return true;
        } else if (--ecores_used == 0) {
          return false;
        }
      }
      for (size_t c = 1; c < ecores; c += 2) {
        if (thr_id == ecores_offset + c) {
          return true;
        } else if (--ecores_used == 0) {
          return false;
        }
      }
    }
  }
  return true;
}

static size_t get_used_thread_count() {
  size_t used = 0;
  if (opt_debug) {
    printf(" Used threads map: [");
  }
  for (size_t i = 0; i < (size_t)opt_n_threads; ++i) {
    if (is_thread_used(i)) {
      used++;
      if (opt_debug) {
        printf("!");
      }
    } else if (opt_debug) {
      printf(".");
    }
  }
  if (opt_debug) {
    printf("]\n");
  }
  return used;
}

inline bool is_rot_disabled() { return opt_disabled_rots[get_config_id() / 2]; }

static double bench_time = 0.0;
static double bench_hashes = 0.0;
static double bench_hashrate_all = 0.0;
static double bench_hashrate_true = 0.0;
static double bench_hashrate = 0.0;
static volatile uint8_t rotation = 0;
static volatile uint8_t sub_rotation = 0;
static uint8_t tuning_rotation = 0;
static volatile bool stop_benchmark = false;

static void print_stats(const char *prefix, bool total) {
  double hashrate;
  double tested_threads = (double)get_used_thread_count();
  // Lock is not necessary.
  // Divide by n_threads as each of them added their own time.
  if (!total) {
    hashrate = bench_hashes / bench_time * tested_threads;
  } else {
    hashrate = bench_hashrate_true / 40.0;
    // Only show True Average in output if debug is enabled.
    // Those values can be confusing for the user as they are not
    // representative of the expected hashrate.
    if (opt_debug) {
      applog(LOG_NOTICE,
             "Hashrate (True Average):\t\t%.2lf H/s\t-> %.2lf H/s per thread.",
             hashrate, hashrate / opt_n_threads);
    }
    if (opt_benchmark) {
      applog3("Hashrate (True Average):\t\t%.2lf H/s\t-> %.2lf H/s per thread.",
              hashrate, hashrate / opt_n_threads);
    }
    hashrate = bench_hashrate_all;
  }
  bench_hashrate = hashrate;
  if (total) {
    applog(total ? LOG_NOTICE : LOG_BLUE,
           "%s\t%.2lf H/s\t-> %.2lf H/s per thread.", prefix, hashrate,
           hashrate / opt_n_threads);
    applog3("%s\t%.2lf H/s\t-> %.2lf H/s per thread.", prefix, hashrate,
            hashrate / opt_n_threads);
  } else {
    applog(total ? LOG_NOTICE : LOG_BLUE,
           "%s\t%.2lf H/s\t-> %.2lf H/s per thread.", prefix, hashrate,
           hashrate / tested_threads);
    if (opt_benchmark) {
      applog3("%s\t%.2lf H/s\t-> %.2lf H/s per thread.", prefix, hashrate,
              hashrate / tested_threads);
    }
  }
  bench_time = 0;
  bench_hashes = 0;
}

// Simple barrier: each caller blocks until 'locks' threads have arrived.
static void sync_lock(const int locks) {
  static volatile int done = 0;
  pthread_mutex_lock(&stats_lock);
  done++;
  if (done != locks) {
    pthread_cond_wait(&sync_cond, &stats_lock);
  } else {
    done = 0;
    pthread_cond_broadcast(&sync_cond);
  }
  pthread_mutex_unlock(&stats_lock);
  usleep(10000);
}

static void sync_conf() { sync_lock(opt_n_threads); }

static void sync_bench() { sync_lock(opt_n_threads + 1); }

// Detached thread for changing rotation every 1.5s.
// Prints data every rotation.
void *statistic_thread(void *arg) {
  bench_hashrate_all = 0.0;
  bench_hashrate_true = 0.0;
  sub_rotation = 0;
  // Sleep portion of the whole benchmark. The portion is determined from
  // real world data of each rotation's time spent mining.
  // This should provide more of a real world average that users can see
  // on the pool side.
  // Total benchmark time is gr_benchmark_time s.
  bool arg_used = (arg != NULL);
  long sleep_time = (arg != NULL) ? *((long *)arg) / 3.0
                                  : gr_benchmark_time / 3.0 / 40.0;
  sync_bench(); // Sync before benchmark starts.
  sync_bench(); // Rotation change sync.
  while (true) {
    gr_hash_order[5] = cn[rotation][0] + 15;
    gr_hash_order[11] = cn[rotation][1] + 15;
    gr_hash_order[17] = cn[rotation][2] + 15;
    // Do not wait for hashing threads if rotation is disabled.
    if (arg_used || (!arg_used && !is_rot_disabled())) {
      for (int i = 0; i < 3; ++i) {
        usleep(sleep_time);
        sub_rotation = (sub_rotation + 1) % 3;
      }
    } else {
      usleep(100000);
    }
    double tested_threads = (double)get_used_thread_count();
    // Change rotation.
    rotation = (rotation + 1) % 40;
    sync_bench(); // Rotation change sync.
    if (!arg_used && is_rot_disabled()) {
      global_hashrate = 0;
      bench_hashes = 0;
      bench_time = sleep_time * 3.0;
    } else {
      // Update global hashrate for API.
      global_hashrate = bench_hashes / bench_time;
      bench_hashrate_true += bench_hashes / bench_time * tested_threads;
      bench_hashrate_all += bench_hashes / bench_time *
                            time_ratio[(rotation + 39) % 40] / 2.0 *
                            tested_threads;
      bench_hashes /= tested_threads;
      bench_time /= tested_threads;
    }
    char prefix[256] = {0};
    sprintf(prefix, "Hashrate (Avg. for rotation %02d.%d):",
            (tuning_rotation / 2) + 1, tuning_rotation % 2 + 1);
    print_stats(prefix, false);
    if (rotation == 0 || arg_used) {
      // Print permanently after full 20 rotations.
      // Make sure it is true before other threads sync.
      stop_benchmark = true;
      sync_bench(); // Config change sync.
      return NULL;
    } else {
      tuning_rotation++;
    }
  }
}

static void tune_config(void *input, int thr_id, int rot) {
  srand(thr_id);
  rotation = rot;
  long sleep_time = 6000000;
  pthread_t pthr;
  if (thr_id == 0) {
    pthread_create(&pthr, NULL, &statistic_thread, &sleep_time);
  }
  uint32_t n = 10000 * thr_id;
  uint32_t edata0[20] __attribute__((aligned(64)));
  mm128_bswap32_80(edata0, input);
  edata0[19] = n;
#if defined(GR_4WAY)
  uint32_t hash[8 * 4] __attribute__((aligned(64)));
  uint32_t vdata[20 * 4] __attribute__((aligned(64)));
  __m256i *noncev = (__m256i *)vdata + 9; // aligned
  mm256_bswap32_intrlv80_4x64(vdata, input);
  *noncev = mm256_intrlv_blend_32(
      _mm256_set_epi32(n + 3, 0, n + 2, 0, n + 1, 0, n, 0), *noncev);
#else
  uint32_t hash[8 * 2] __attribute__((aligned(64)));
  uint32_t edata1[20] __attribute__((aligned(64)));
  mm128_bswap32_80(edata1, input);
  edata1[19] = n + 1;
#endif
  double hashes_done = 0.0;
  // Use CN rotation.
  edata0[1] = rand();
  edata0[2] = rand();
  gr_getAlgoString((const uint8_t *)(&edata0[1]), gr_hash_order);
  gr_hash_order[5] = cn[rot][0] + 15;
  gr_hash_order[11] = cn[rot][1] + 15;
  gr_hash_order[17] = cn[rot][2] + 15;
  // Purge memory for test.
  // AllocateNeededMemory(false);
  struct timeval start, end, diff;
  uint32_t shuffle = 222;
  sync_bench();
  sync_bench();
  if (!is_thread_used(thr_id)) {
    sync_bench();
    sync_bench();
    return;
  }
  gettimeofday(&start, NULL);
  while (true) {
    if (shuffle != sub_rotation) {
      shuffle = sub_rotation;
      // Shuffle the Cryptonight order between hashing runs.
      // Different orders can impact the hashing performance.
      gr_hash_order[5] = cn[rot][(shuffle + 0) % 3] + 15;
      gr_hash_order[11] = cn[rot][(shuffle + 1) % 3] + 15;
      gr_hash_order[17] = cn[rot][(shuffle + 2) % 3] + 15;
    }
#if defined(GR_4WAY)
    // Make sure nonces are increased for each hash. The same hashes would
    // result in better data locality on CN algos, leading to better but
    // inaccurate results.
    gr_4way_hash(hash, vdata, thr_id);
    *noncev = _mm256_add_epi32(*noncev, m256_const1_64(0x0000000400000000));
    hashes_done += 4.0;
#else
    // Increase nonce.
    edata0[19] += 2;
    edata1[19] += 2;
    gr_hash(hash, edata0, edata1, thr_id);
    hashes_done += 2.0;
#endif
    if (unlikely(rotation != rot)) {
      gettimeofday(&end, NULL);
      timeval_subtract(&diff, &end, &start);
      double elapsed = (double)diff.tv_sec + (double)diff.tv_usec / 1e6;
      pthread_mutex_lock(&stats_lock);
      bench_hashes += hashes_done;
      bench_time += elapsed;
      pthread_mutex_unlock(&stats_lock);
      // Update thread hashrate for API.
      thr_hashrates[thr_id] = hashes_done / elapsed;
      sync_bench();
      sync_bench();
      break;
    }
  }
}

static bool save_config() {
  FILE *fd;
  fd = fopen(opt_tuneconfig_file, "w");
  if (fd == NULL) {
    applog(LOG_ERR, "Could not save \'%s\' file.", opt_tuneconfig_file);
    applog(LOG_ERR, "Please create \'%s\' file manually.", opt_tuneconfig_file);
    for (int i = 0; i < 40; i++) {
      fprintf(stdout, "%d %d %d %d %d %d %d %d\n", cn_tune[i][0], cn_tune[i][1],
              cn_tune[i][2], cn_tune[i][3], cn_tune[i][4], cn_tune[i][5],
              prefetch_tune[i], thread_tune[i]);
    }
    return false;
  }
  for (int i = 0; i < 40; i++) {
    fprintf(fd, "%d %d %d %d %d %d %d %d\n", cn_tune[i][0], cn_tune[i][1],
            cn_tune[i][2], cn_tune[i][3], cn_tune[i][4], cn_tune[i][5],
            prefetch_tune[i], thread_tune[i]);
  }
  fclose(fd);
  return true;
}

// Config is a table of 3 values from 0-2.
static bool next_config(uint8_t *config) {
#ifdef __AVX2__
  static const int max_val = 2;
#else
  static const int max_val = 1;
#endif
  bool increment = true;
  for (size_t i = 0; i < 3; ++i) {
    if (increment) {
      if (config[i] == max_val) {
        config[i] = 0;
        if (i == 2) {
          return false;
        }
      } else {
        config[i]++;
        increment = false;
      }
    } else {
      break;
    }
  }
  return true;
}

static char *variant_name(const uint8_t variant) {
  switch (variant) {
  case 0:
    return "Dark";
  case 1:
    return "Darklite";
  case 2:
    return "Fast";
  case 3:
    return "Lite";
  case 4:
    return "Turtle";
  case 5:
    return "Turtlelite";
  }
  return "Unknown";
}

static char *variant_way(const uint8_t way) {
  switch (way) {
  case CN_4WAY:
    return "4way";
  case CN_2WAY:
    return "2way";
  default:
    return "1way";
  }
}

static void variants_used(const uint8_t *ids, char *used) {
  for (int i = 0; i < 3; ++i) {
    used[cn_map[ids[i]] * 2] = 'X';
  }
}

// Used in tune to detect if the variant belongs to the Turtle or Dark family.
#define TURTLE_ALGS(id) variant[id] == 0 || variant[id] == 1
#define DARK_ALGS(id) variant[id] == 2 || variant[id] == 3

int count_tests() {
  int tests = 0;
  for (int i = 0; i < 40; i++) {
    uint8_t config[3];
    memset(config, 0, 3);
    do {
      uint8_t variant[3] = {cn_map[cn[i][0]], cn_map[cn[i][1]],
                            cn_map[cn[i][2]]};
      bool skip = false;
      for (int j = 0; j < 3; ++j) {
        if (config[j] == 2) {
          if (!opt_tune_full) {
            if (opt_tune_simple) {
              if (!(TURTLE_ALGS(j))) {
                skip = true;
              }
            } else if (!(TURTLE_ALGS(j) || DARK_ALGS(j))) {
              skip = true;
            }
          }
        }
      }
      if (skip) {
        continue;
      }
      ++tests;
    } while (next_config(config));
  }
  return tests * 2;
}

// Run tuning benchmarks and create tune_config in the end.
void tune(void *input, int thr_id) {
  int tests = count_tests();
  int curr_test = 0;
  // Allocate full memory for now.
  if (opt_tune_full) {
#ifdef GR_4WAY
    cn_tune[0][5] = 2;
#else
    // Make sure it stays at 2 pages max on non-AVX2 CPUs.
    cn_tune[0][5] = 1;
#endif
  } else {
    cn_tune[0][5] = 1;
  }
  AllocateNeededMemory(true);
  for (int i = 0; i < 40; i++) {
    sync_conf();
    if (i == 0) {
      cn_tune[0][5] = 0;
    }
    thread_tune[i] = 0;
    double best_hashrate = 0;
    tuning_rotation = i;
    if (thr_id == 0) {
      char *used = strdup("0 0 0 0 0 0");
      memset(cn_tune[i], 0, 6);
      variants_used(cn[i], used);
      applog(LOG_NOTICE, "Testing rotation: %02d.%d (%s) -> %s + %s + %s",
             (i / 2) + 1, i % 2 + 1, used, variant_name(cn[i][0]),
             variant_name(cn[i][1]), variant_name(cn[i][2]));
      free(used);
    }
    uint8_t config[3];
    memset(config, 0, 3);
    do {
      for (int pf = 0; pf < 2; ++pf) {
        prefetch_l1 = (bool)pf;
        memset(cn_config, 0, 6);
        uint8_t variant[3] = {cn_map[cn[i][0]], cn_map[cn[i][1]],
                              cn_map[cn[i][2]]};
        bool skip = false;
        for (int j = 0; j < 3; ++j) {
          if (config[j] == 2) {
            if (!opt_tune_full) {
              if (opt_tune_simple) {
                if (!(TURTLE_ALGS(j))) {
                  skip = true;
                }
              } else if (!(TURTLE_ALGS(j) || DARK_ALGS(j))) {
                skip = true;
              }
            }
          }
        }
        if (skip) {
          continue;
        }
        if (thr_id == 0 && pf == 0) {
          applog(LOG_INFO,
                 "Testing: %s (%s) + %s (%s) + %s (%s) - %d/%d %.1lf%% ~%.1lf "
                 "min remaining.",
                 variant_name(cn[i][0]), variant_way(config[0]),
                 variant_name(cn[i][1]), variant_way(config[1]),
                 variant_name(cn[i][2]), variant_way(config[2]), curr_test,
                 tests, (double)curr_test / (double)tests * 100.0,
                 (tests - curr_test + 40) * 6. / 60.);
        }
        curr_test++;
        sync_conf();
        // Do NOT use E cores of 12th Gen Intel while tuning.
        // They will be tested later to see if adding them is beneficial!
        // Try to always use at least one core per E core cluster.
        if (opt_ecores > (opt_ecores / 4)) {
          thread_tune[i] = opt_ecores - (opt_ecores / 4);
        } else if (opt_ecores > 0) {
          thread_tune[i] = opt_ecores;
        } else {
          thread_tune[i] = 0;
        }
        cn_config[variant[0]] = config[0];
        cn_config[variant[1]] = config[1];
        cn_config[variant[2]] = config[2];
        sync_conf();
        tune_config(input, thr_id, i);
        sync_conf();
        if (thr_id == 0) {
          if (best_hashrate < bench_hashrate) {
            cn_tune[i][variant[0]] = config[0];
            cn_tune[i][variant[1]] = config[1];
            cn_tune[i][variant[2]] = config[2];
            prefetch_tune[i] = prefetch_l1;
            best_hashrate = bench_hashrate;
          }
          bench_hashrate = 0;
          bench_time = 0;
          bench_hashes = 0;
        }
        // Right now config
        sync_conf();
      }
    } while (next_config(config));
    if (thr_id == 0) {
      applog(LOG_NOTICE,
             "Best config for rotation %02d.%d: %d %d %d %d %d %d + %d "
             "threads -> "
             "%.02lf H/s",
             (i / 2) + 1, i % 2 + 1, cn_tune[i][0], cn_tune[i][1],
             cn_tune[i][2], cn_tune[i][3], cn_tune[i][4], cn_tune[i][5],
             opt_n_threads - thread_tune[i], best_hashrate);
    }
    uint8_t variant[3] = {cn_map[cn[i][0]], cn_map[cn[i][1]], cn_map[cn[i][2]]};
    size_t disabled_threads;
    if (opt_ecores > (opt_ecores / 4)) {
      disabled_threads = opt_ecores - (opt_ecores / 4);
    } else if (opt_ecores > 0) {
      disabled_threads = opt_ecores;
    } else {
      disabled_threads = 0;
    }
    static volatile bool stop_thread_tune = false;
    static size_t final_disabled_threads = 0;
    stop_thread_tune = false;
    final_disabled_threads = 0;
    // Try to add E cores back. It is possible it is beneficial on faster
    // rotations like 3, 4, 10, 16.
    if (opt_ecores > 0) {
      final_disabled_threads = opt_ecores - (opt_ecores / 4);
      do {
        sync_conf();
        bench_hashrate = 0;
        bench_time = 0;
        bench_hashes = 0;
        // We know that opt_ecores is > 0 so we can divide and compare.
        if (disabled_threads >= ((uint32_t)opt_ecores / 4)) {
          disabled_threads -= opt_ecores / 4;
        } else {
          disabled_threads--;
        }
        thread_tune[i] = disabled_threads;
        memcpy(cn_config, cn_tune[i], 6);
        prefetch_l1 = prefetch_tune[i];
        if (thr_id == 0) {
          applog(LOG_INFO, "Testing: %s (%s) + %s (%s) + %s (%s) - %d threads",
                 variant_name(cn[i][0]), variant_way(cn_config[variant[0]]),
                 variant_name(cn[i][1]), variant_way(cn_config[variant[1]]),
                 variant_name(cn[i][2]), variant_way(cn_config[variant[2]]),
                 opt_n_threads - thread_tune[i]);
        }
        sync_conf();
        tune_config(input, thr_id, i);
        sync_conf();
        if (thr_id == 0) {
          if (best_hashrate < bench_hashrate) {
            best_hashrate = bench_hashrate;
            final_disabled_threads = disabled_threads;
          } else {
            // If the current thread config is not better, further search is
            // not necessary and can be skipped.
            stop_thread_tune = true;
          }
          bench_hashrate = 0;
          bench_time = 0;
          bench_hashes = 0;
        }
        sync_conf();
      } while (disabled_threads >= 1 && !stop_thread_tune);
    }
    sync_conf();
    // Finished tuning using all threads for the current config.
    // Test if it is beneficial to use fewer threads.
    stop_thread_tune = false;
    disabled_threads = 0;
    // By default all E cores should be disabled.
    // This should allow for all E cores to not be used and start reducing
    // the number of P threads in the "standard" manner of 1 thread per core.
    if (opt_ecores > 0) {
      // That way all E cores will be disabled and we will start benchmarking
      // with all P cores and reduce the number of threads one by one.
      disabled_threads = opt_ecores + 1;
    }
    sync_conf();
    while (!stop_thread_tune && thread_tune[i] < opt_n_threads - 1) {
      sync_conf();
      bench_hashrate = 0;
      bench_time = 0;
      bench_hashes = 0;
      disabled_threads++;
      thread_tune[i] = disabled_threads;
      memcpy(cn_config, cn_tune[i], 6);
      prefetch_l1 = prefetch_tune[i];
      if (thr_id == 0) {
        applog(LOG_INFO, "Testing: %s (%s) + %s (%s) + %s (%s) - %d threads",
               variant_name(cn[i][0]), variant_way(cn_config[variant[0]]),
               variant_name(cn[i][1]), variant_way(cn_config[variant[1]]),
               variant_name(cn[i][2]), variant_way(cn_config[variant[2]]),
               opt_n_threads - thread_tune[i]);
      }
      sync_conf();
      tune_config(input, thr_id, i);
      sync_conf();
      if (thr_id == 0) {
        if (best_hashrate < bench_hashrate) {
          best_hashrate = bench_hashrate;
          final_disabled_threads = disabled_threads;
        } else {
          // If current thread config is not better, further search is not
          // necessary and can be skipped.
          stop_thread_tune = true;
        }
        bench_hashrate = 0;
        bench_time = 0;
        bench_hashes = 0;
      }
      sync_conf();
    }
    thread_tune[i] = final_disabled_threads;
    if (thr_id == 0) {
      applog(LOG_NOTICE,
             "Best config for rotation %02d.%d: %d %d %d %d %d %d | %d PF | -%d "
             "threads -> "
             "%.02lf H/s",
             (i / 2) + 1, i % 2 + 1, cn_tune[i][0], cn_tune[i][1],
             cn_tune[i][2], cn_tune[i][3], cn_tune[i][4], cn_tune[i][5],
             prefetch_tune[i], thread_tune[i], best_hashrate);
      char *used = strdup("0 0 0 0 0 0");
      variants_used(cn[i], used);
      applog(LOG_NOTICE, "%s -> %s (%s) + %s (%s) + %s (%s)", used,
             variant_name(cn[i][0]), variant_way(cn_tune[i][cn_map[cn[i][0]]]),
             variant_name(cn[i][1]), variant_way(cn_tune[i][cn_map[cn[i][1]]]),
             variant_name(cn[i][2]), variant_way(cn_tune[i][cn_map[cn[i][2]]]));
      free(used);
    }
  }
  if (thr_id == 0) {
    for (int i = 0; i < 40; i++) {
      applog(LOG_NOTICE,
             "Best config for rotation %02d.%d (%d %d %d): %d %d %d %d %d %d "
             "| %d PF | -%d threads",
             (i / 2) + 1, i % 2 + 1, cn[i][0], cn[i][1], cn[i][2],
             cn_tune[i][0], cn_tune[i][1], cn_tune[i][2], cn_tune[i][3],
             cn_tune[i][4], cn_tune[i][5], prefetch_tune[i], thread_tune[i]);
    }
    opt_tune = false;
    opt_tuned = true;
    rotation = 0;
    tuning_rotation = 0;
    save_config();
  }
  sync_conf();
}

void benchmark(void *input, int thr_id, long sleep_time) {
  for (int i = 0; i < 160; i++) {
    ((uint8_t *)input)[i] = i;
  }
  // Purge memory for test.
  AllocateNeededMemory(true);
  srand(thr_id);
  pthread_t pthr;
  if (thr_id == 0) {
    pthread_create(&pthr, NULL, &statistic_thread,
                   sleep_time ? &sleep_time : NULL);
  }
  uint32_t n = 10000 * thr_id;
  uint32_t edata0[20] __attribute__((aligned(64)));
  mm128_bswap32_80(edata0, input);
  edata0[19] = n;
#if defined(GR_4WAY)
  uint32_t hash[8 * 4] __attribute__((aligned(64)));
  uint32_t vdata[20 * 4] __attribute__((aligned(64)));
  __m256i *noncev = (__m256i *)vdata + 9; // aligned
  mm256_bswap32_intrlv80_4x64(vdata, input);
  *noncev = mm256_intrlv_blend_32(
      _mm256_set_epi32(n + 3, 0, n + 2, 0, n + 1, 0, n, 0), *noncev);
#else
  uint32_t hash[8 * 2] __attribute__((aligned(64)));
  uint32_t edata1[20] __attribute__((aligned(64)));
  mm128_bswap32_80(edata1, input);
  edata1[19] = n + 1;
#endif
  volatile uint8_t local_rotation = 255;
  double hashes_done = 0.0;
  bool thread_used = false;
  uint32_t shuffle = 222;
  struct timeval start, end, diff;
  sync_bench(); // Sync before benchmark starts.
  gettimeofday(&start, NULL);
  while (true) {
    if (unlikely(local_rotation != rotation)) {
      gettimeofday(&end, NULL);
      // Change first part of the hash to get different core rotation.
      for (int i = 1; i < 5 + 1; ++i) {
        edata0[i] = rand();
      }
      // Use new rotation.
      gr_getAlgoString((const uint8_t *)(&edata0[1]), gr_hash_order);
      gr_hash_order[5] = cn[rotation][0] + 15;
      gr_hash_order[11] = cn[rotation][1] + 15;
      gr_hash_order[17] = cn[rotation][2] + 15;
      timeval_subtract(&diff, &end, &start);
      double elapsed = (double)diff.tv_sec + (double)diff.tv_usec / 1e6;
      if (thread_used) {
        pthread_mutex_lock(&stats_lock);
        bench_hashes += hashes_done;
        bench_time += elapsed;
        pthread_mutex_unlock(&stats_lock);
        // Update thread hashrate for API.
        thr_hashrates[thr_id] = hashes_done / elapsed;
      } else {
        // Update thread hashrate for API.
        thr_hashrates[thr_id] = 0.0;
      }
      hashes_done = 0.0;
      if (opt_tuned) {
        select_tuned_config(thr_id);
      }
      sync_bench(); // Rotation change sync.
      if (rotation == 0 && local_rotation != 255) {
        sync_bench(); // Rotation change sync.
        if (likely(stop_benchmark)) {
          // Exiting from normal benchmark.
          if (sleep_time == 0 && thr_id == 0) {
            print_stats("Hashrate (Expected Average):", true);
          }
          return;
        }
      }
      local_rotation = rotation;
      if (!is_thread_used(thr_id)) {
        thread_used = false;
        while (local_rotation == rotation) {
          usleep(25000);
        }
        continue;
      } else {
        thread_used = true;
      }
      gettimeofday(&start, NULL);
    }
    if (shuffle != sub_rotation) {
      shuffle = sub_rotation;
      // Shuffle the Cryptonight order between hashing runs.
      // Different orders can impact the hashing performance.
      gr_hash_order[5] = cn[local_rotation][(shuffle + 0) % 3] + 15;
      gr_hash_order[11] = cn[local_rotation][(shuffle + 1) % 3] + 15;
      gr_hash_order[17] = cn[local_rotation][(shuffle + 2) % 3] + 15;
    }
#if defined(GR_4WAY)
    // Make sure nonces are increased for each hash. The same hashes would
    // result in better data locality on CN algos, leading to better but
    // inaccurate results.
    gr_4way_hash(hash, vdata, thr_id);
    *noncev = _mm256_add_epi32(*noncev, m256_const1_64(0x0000000400000000));
    hashes_done += 4.0;
#else
    // Increase nonce.
    edata0[19] += 2;
    edata1[19] += 2;
    gr_hash(hash, edata0, edata1, thr_id);
    hashes_done += 2.0;
#endif
  }
}

void stress_test(void *input, int thr_id) {
  for (int i = 0; i < 160; i++) {
    ((uint8_t *)input)[i] = i;
  }
  // Purge memory for test.
  AllocateNeededMemory(true);
  srand(thr_id);
  uint32_t n = 10000 * thr_id;
  uint32_t edata0[20] __attribute__((aligned(64)));
  mm128_bswap32_80(edata0, input);
  edata0[19] = n;
#if defined(GR_4WAY)
  uint32_t hash[8 * 4] __attribute__((aligned(64)));
  uint32_t vdata[20 * 4] __attribute__((aligned(64)));
  __m256i *noncev = (__m256i *)vdata + 9; // aligned
  mm256_bswap32_intrlv80_4x64(vdata, input);
  *noncev = mm256_intrlv_blend_32(
      _mm256_set_epi32(n + 3, 0, n + 2, 0, n + 1, 0, n, 0), *noncev);
#else
  uint32_t hash[8 * 2] __attribute__((aligned(64)));
  uint32_t edata1[20] __attribute__((aligned(64)));
  mm128_bswap32_80(edata1, input);
  edata1[19] = n + 1;
#endif
  double hashes_done = 0.0;
  struct timeval start, end, diff;
  cn_config[Turtlelite] = 2;
  cn_config[Turtle] = 2;
  cn_config[Darklite] = 2;
  // These do not really matter.
  cn_config[Dark] = 1;
  cn_config[Lite] = 0;
  cn_config[Fast] = 0;
#if defined(GR_4WAY)
  const double periods = 1200.0;
#else
  const double periods = 600.0;
#endif
  // Change first part of the hash to get a different core rotation.
  for (int i = 0; i < 5 + 1; ++i) {
    edata0[i] = rand();
  }
  // Use new rotation.
  gr_getAlgoString((const uint8_t *)(&edata0[1]), gr_hash_order);
  if (rand() % 2 == 1) {
    gr_hash_order[5] = cn[30][0] + 15;
    gr_hash_order[11] = cn[30][1] + 15;
    gr_hash_order[17] = cn[30][2] + 15;
  } else {
    gr_hash_order[5] = cn[18][0] + 15;
    gr_hash_order[11] = cn[18][1] + 15;
    gr_hash_order[17] = cn[18][2] + 15;
  }
  cn_config[Turtlelite] = rand() % 2 + 1;
  sync_conf(); // Sync before benchmark starts.
  gettimeofday(&start, NULL);
  while (true) {
    if (hashes_done >= periods) {
      gettimeofday(&end, NULL);
      // Change first part of the hash to get a different core rotation.
      for (int i = 0; i < 5 + 1; ++i) {
        edata0[i] = rand();
      }
      // Use new rotation.
      gr_getAlgoString((const uint8_t *)(&edata0[1]), gr_hash_order);
      if (rand() % 2 == 1) {
        gr_hash_order[5] = cn[30][0] + 15;
        gr_hash_order[11] = cn[30][1] + 15;
        gr_hash_order[17] = cn[30][2] + 15;
      } else {
        gr_hash_order[5] = cn[31][0] + 15;
        gr_hash_order[11] = cn[31][1] + 15;
        gr_hash_order[17] = cn[31][2] + 15;
      }
      // Change prefetch just in case to test stability.
      prefetch_l1 = (rand() % 2 == 1);
      // Change the Turtlelite way (1, 2, or 4 way).
      cn_config[Turtlelite] = rand() % 2 + 1;
      timeval_subtract(&diff, &end, &start);
      double elapsed = (double)diff.tv_sec + (double)diff.tv_usec / 1e6;
      pthread_mutex_lock(&stats_lock);
      bench_hashes += hashes_done;
      bench_time += elapsed;
      if (thr_id == 0 && bench_time > 0.01) {
        applog(LOG_INFO, "Stress testing! ~%.1lf H/s - CPU: %.2f C",
               bench_hashes / bench_time * opt_n_threads, cpu_temp(0));
      }
      pthread_mutex_unlock(&stats_lock);
      thr_hashrates[thr_id] = hashes_done / elapsed;
      hashes_done = 0.0;
      gettimeofday(&start, NULL);
    }
#if defined(GR_4WAY)
    // Make sure nonces are increased for each hash. The same hashes would
    // result in better data locality on CN algos, leading to better but
    // inaccurate results.
    gr_4way_hash(hash, vdata, thr_id);
    *noncev = _mm256_add_epi32(*noncev, m256_const1_64(0x0000000400000000));
    hashes_done += 4.0;
#else
    // Increase nonce.
    edata0[19] += 2;
    edata1[19] += 2;
    gr_hash(hash, edata0, edata1, thr_id);
    hashes_done += 2.0;
#endif
  }
}