/*
 * Copyright 2021 Delgon
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version. See COPYING for more details.
 */

#include "gr-gate.h"

#if defined(GR_4WAY)

#include "cryptonote/cryptonight.h"

/*
 * Runs one Cryptonight round, in place, over the four lanes hash0..hash3.
 *
 * Must be expanded where `vectorized`, `vhash` and `hash0`..`hash3` are in
 * scope. When the lanes are currently interleaved in vhash they are
 * de-interleaved first. The results are left in hash0..hash3, so `vectorized`
 * is cleared afterwards.
 *
 * `way` selects between one 4-way call, two 2-way calls or four 1-way calls.
 *
 * NOTE(review): `variant` is not referenced by the expansion, so every CN*
 * case dispatches to the same routines. A former `if (prefetch_l1)` test had
 * two byte-identical arms and was folded into a single path. Confirm whether
 * variant- or prefetch-specific routines were intended here.
 */
#define CRYPTONIGHT_HASH(variant, way) \
  do { \
    if (vectorized) { \
      dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash); \
    } \
    if ((way) == CN_4WAY) { \
      cryptonight_4way_hash(hash0, hash1, hash2, hash3, hash0, hash1, hash2, \
                            hash3); \
    } else if ((way) == CN_2WAY) { \
      cryptonight_2way_hash(hash0, hash1, hash0, hash1); \
      cryptonight_2way_hash(hash2, hash3, hash2, hash3); \
    } else { \
      cryptonight_hash(hash0, hash0); \
      cryptonight_hash(hash1, hash1); \
      cryptonight_hash(hash2, hash2); \
      cryptonight_hash(hash3, hash3); \
    } \
    vectorized = false; \
  } while (0)

/*
 * Hashes four lanes through the algorithm chain listed in gr_hash_order.
 *
 * output  receives 4 x 32 bytes: the first 32 bytes of each lane's final
 *         hash, lane 0 first.
 * input   4x64-interleaved block data; 80 bytes per lane (640 bytes total)
 *         are consumed by the first algorithm.
 * thr_id  index into work_restart used for the early-exit check.
 *
 * Returns 1 when output has been written. Returns 0, leaving output
 * untouched, when a restart was requested for this thread and neither
 * opt_benchmark nor opt_tune is set.
 *
 * Between rounds the intermediate hashes live either interleaved in vhash
 * (`vectorized == true`) or separately in hash0..hash3 (`vectorized ==
 * false`); each case converts to the layout its primitive needs.
 */
int gr_4way_hash(void *output, const void *input, const int thr_id) {
  uint64_t vhash[10 * 4] __attribute__((aligned(128)));
  uint64_t vhashA[10 * 2] __attribute__((aligned(128)));
  uint64_t vhashB[10 * 2] __attribute__((aligned(128)));
  uint64_t hash0[10] __attribute__((aligned(64)));
  uint64_t hash1[10] __attribute__((aligned(64)));
  uint64_t hash2[10] __attribute__((aligned(64)));
  uint64_t hash3[10] __attribute__((aligned(64)));
  gr_4way_context_overlay ctx;
  // Work on a private copy of the shared context.
  memcpy(&ctx, &gr_4way_ctx, sizeof(ctx));
  // Start as vectorized from input.
  bool vectorized = true;

  // First round: reads the 80-byte-per-lane block straight from input.
  switch (gr_hash_order[0]) {
  case BLAKE:
    blake512_4way_full(&ctx.blake, vhash, input, 80);
    vectorized = true;
    break;
  case BMW:
    bmw512_4way_init(&ctx.bmw);
    bmw512_4way_update(&ctx.bmw, input, 80);
    bmw512_4way_close(&ctx.bmw, vhash);
    vectorized = true;
    break;
  case GROESTL:
#if defined(__VAES__)
    rintrlv_4x64_2x128(vhashA, vhashB, input, 640);
    groestl512_2way_full(&ctx.groestl, vhashA, vhashA, 80);
    groestl512_2way_full(&ctx.groestl, vhashB, vhashB, 80);
    rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
    vectorized = true;
#else
    dintrlv_4x64(hash0, hash1, hash2, hash3, input, 640);
    groestl512_full(&ctx.groestl, hash0, hash0, 640);
    groestl512_full(&ctx.groestl, hash1, hash1, 640);
    groestl512_full(&ctx.groestl, hash2, hash2, 640);
    groestl512_full(&ctx.groestl, hash3, hash3, 640);
    vectorized = false;
#endif
    break;
  case SKEIN:
    skein512_4way_full(&ctx.skein, vhash, input, 80);
    vectorized = true;
    break;
  case JH:
    jh512_4way_init(&ctx.jh);
    jh512_4way_update(&ctx.jh, input, 80);
    jh512_4way_close(&ctx.jh, vhash);
    vectorized = true;
    break;
  case KECCAK:
    keccak512_4way_init(&ctx.keccak);
    keccak512_4way_update(&ctx.keccak, input, 80);
    keccak512_4way_close(&ctx.keccak, vhash);
    vectorized = true;
    break;
  case LUFFA:
    rintrlv_4x64_2x128(vhashA, vhashB, input, 640);
    luffa512_2way_full(&ctx.luffa, vhashA, vhashA, 80);
    luffa512_2way_full(&ctx.luffa, vhashB, vhashB, 80);
    rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
    vectorized = true;
    break;
  case CUBEHASH:
    rintrlv_4x64_2x128(vhashA, vhashB, input, 640);
    cube_2way_full(&ctx.cube, vhashA, 512, vhashA, 80);
    cube_2way_full(&ctx.cube, vhashB, 512, vhashB, 80);
    rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
    vectorized = true;
    break;
  case SHAVITE:
    rintrlv_4x64_2x128(vhashA, vhashB, input, 640);
    shavite512_2way_full(&ctx.shavite, vhashA, vhashA, 80);
    shavite512_2way_full(&ctx.shavite, vhashB, vhashB, 80);
    rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
    vectorized = true;
    break;
  case SIMD:
    rintrlv_4x64_2x128(vhashA, vhashB, input, 640);
    simd512_2way_full(&ctx.simd, vhashA, vhashA, 80);
    simd512_2way_full(&ctx.simd, vhashB, vhashB, 80);
    rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
    vectorized = true;
    break;
  case ECHO:
#if defined(__VAES__)
    rintrlv_4x64_2x128(vhashA, vhashB, input, 640);
    echo_2way_full(&ctx.echo, vhashA, 512, vhashA, 80);
    echo_2way_full(&ctx.echo, vhashB, 512, vhashB, 80);
    rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
    vectorized = true;
#else
    dintrlv_4x64(hash0, hash1, hash2, hash3, input, 640);
    echo_full(&ctx.echo, (BitSequence *)hash0, 512,
              (const BitSequence *)hash0, 80);
    echo_full(&ctx.echo, (BitSequence *)hash1, 512,
              (const BitSequence *)hash1, 80);
    echo_full(&ctx.echo, (BitSequence *)hash2, 512,
              (const BitSequence *)hash2, 80);
    echo_full(&ctx.echo, (BitSequence *)hash3, 512,
              (const BitSequence *)hash3, 80);
    vectorized = false;
#endif
    break;
  case HAMSI:
    hamsi512_4way_init(&ctx.hamsi);
    hamsi512_4way_update(&ctx.hamsi, input, 80);
    hamsi512_4way_close(&ctx.hamsi, vhash);
    vectorized = true;
    break;
  case FUGUE:
    dintrlv_4x64(hash0, hash1, hash2, hash3, input, 640);
    fugue512_full(&ctx.fugue, hash0, hash0, 80);
    fugue512_full(&ctx.fugue, hash1, hash1, 80);
    fugue512_full(&ctx.fugue, hash2, hash2, 80);
    fugue512_full(&ctx.fugue, hash3, hash3, 80);
    vectorized = false;
    break;
  case SHABAL:
    shabal512_4way_init(&ctx.shabal);
    // Shabal runs on a 4x32 layout; its result is split back into lanes.
    rintrlv_4x64_4x32(vhash, input, 640);
    shabal512_4way_update(&ctx.shabal, vhash, 80);
    shabal512_4way_close(&ctx.shabal, vhash);
    dintrlv_4x32_512(hash0, hash1, hash2, hash3, vhash);
    vectorized = false;
    break;
  case WHIRLPOOL:
    whirlpool_4way_init(&ctx.whirlpool);
    whirlpool_4way(&ctx.whirlpool, input, 80);
    whirlpool_4way_close(&ctx.whirlpool, vhash);
    vectorized = true;
    break;
  default:
    // No other algorithm is handled as the first round.
    break;
  }

  // Remaining rounds: each consumes the previous 64-byte-per-lane hash.
  for (int i = 1; i < 15 + 3; i++) {
    const uint8_t algo = gr_hash_order[i];
    switch (algo) {
    case BLAKE:
      if (!vectorized) {
        intrlv_4x64_512(vhash, hash0, hash1, hash2, hash3);
      }
      blake512_4way_full(&ctx.blake, vhash, vhash, 64);
      vectorized = true;
      break;
    case BMW:
      bmw512_4way_init(&ctx.bmw);
      if (!vectorized) {
        intrlv_4x64_512(vhash, hash0, hash1, hash2, hash3);
      }
      bmw512_4way_update(&ctx.bmw, vhash, 64);
      bmw512_4way_close(&ctx.bmw, vhash);
      vectorized = true;
      break;
    case GROESTL:
#if defined(__VAES__)
      if (vectorized) {
        rintrlv_4x64_2x128(vhashA, vhashB, vhash, 512);
      } else {
        intrlv_2x128_512(vhashA, hash0, hash1);
        intrlv_2x128_512(vhashB, hash2, hash3);
      }
      groestl512_2way_full(&ctx.groestl, vhashA, vhashA, 64);
      groestl512_2way_full(&ctx.groestl, vhashB, vhashB, 64);
      rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
      vectorized = true;
#else
      if (vectorized) {
        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
      }
      groestl512_full(&ctx.groestl, hash0, hash0, 512);
      groestl512_full(&ctx.groestl, hash1, hash1, 512);
      groestl512_full(&ctx.groestl, hash2, hash2, 512);
      groestl512_full(&ctx.groestl, hash3, hash3, 512);
      vectorized = false;
#endif
      break;
    case SKEIN:
      if (!vectorized) {
        intrlv_4x64_512(vhash, hash0, hash1, hash2, hash3);
      }
      skein512_4way_full(&ctx.skein, vhash, vhash, 64);
      vectorized = true;
      break;
    case JH:
      jh512_4way_init(&ctx.jh);
      if (!vectorized) {
        intrlv_4x64_512(vhash, hash0, hash1, hash2, hash3);
      }
      jh512_4way_update(&ctx.jh, vhash, 64);
      jh512_4way_close(&ctx.jh, vhash);
      vectorized = true;
      break;
    case KECCAK:
      keccak512_4way_init(&ctx.keccak);
      if (!vectorized) {
        intrlv_4x64_512(vhash, hash0, hash1, hash2, hash3);
      }
      keccak512_4way_update(&ctx.keccak, vhash, 64);
      keccak512_4way_close(&ctx.keccak, vhash);
      vectorized = true;
      break;
    case LUFFA:
      if (vectorized) {
        rintrlv_4x64_2x128(vhashA, vhashB, vhash, 512);
      } else {
        intrlv_2x128_512(vhashA, hash0, hash1);
        intrlv_2x128_512(vhashB, hash2, hash3);
      }
      luffa512_2way_full(&ctx.luffa, vhashA, vhashA, 64);
      luffa512_2way_full(&ctx.luffa, vhashB, vhashB, 64);
      rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
      vectorized = true;
      break;
    case CUBEHASH:
      if (vectorized) {
        rintrlv_4x64_2x128(vhashA, vhashB, vhash, 512);
      } else {
        intrlv_2x128_512(vhashA, hash0, hash1);
        intrlv_2x128_512(vhashB, hash2, hash3);
      }
      cube_2way_full(&ctx.cube, vhashA, 512, vhashA, 64);
      cube_2way_full(&ctx.cube, vhashB, 512, vhashB, 64);
      rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
      vectorized = true;
      break;
    case SHAVITE:
      if (vectorized) {
        rintrlv_4x64_2x128(vhashA, vhashB, vhash, 512);
      } else {
        intrlv_2x128_512(vhashA, hash0, hash1);
        intrlv_2x128_512(vhashB, hash2, hash3);
      }
      shavite512_2way_full(&ctx.shavite, vhashA, vhashA, 64);
      shavite512_2way_full(&ctx.shavite, vhashB, vhashB, 64);
      rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
      vectorized = true;
      break;
    case SIMD:
      if (vectorized) {
        rintrlv_4x64_2x128(vhashA, vhashB, vhash, 512);
      } else {
        intrlv_2x128_512(vhashA, hash0, hash1);
        intrlv_2x128_512(vhashB, hash2, hash3);
      }
      simd512_2way_full(&ctx.simd, vhashA, vhashA, 64);
      simd512_2way_full(&ctx.simd, vhashB, vhashB, 64);
      rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
      vectorized = true;
      break;
    case ECHO:
#if defined(__VAES__)
      if (vectorized) {
        rintrlv_4x64_2x128(vhashA, vhashB, vhash, 512);
      } else {
        intrlv_2x128_512(vhashA, hash0, hash1);
        intrlv_2x128_512(vhashB, hash2, hash3);
      }
      echo_2way_full(&ctx.echo, vhashA, 512, vhashA, 64);
      echo_2way_full(&ctx.echo, vhashB, 512, vhashB, 64);
      rintrlv_2x128_4x64(vhash, vhashA, vhashB, 512);
      vectorized = true;
#else
      if (vectorized) {
        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
      }
      echo_full(&ctx.echo, (BitSequence *)hash0, 512,
                (const BitSequence *)hash0, 64);
      echo_full(&ctx.echo, (BitSequence *)hash1, 512,
                (const BitSequence *)hash1, 64);
      echo_full(&ctx.echo, (BitSequence *)hash2, 512,
                (const BitSequence *)hash2, 64);
      echo_full(&ctx.echo, (BitSequence *)hash3, 512,
                (const BitSequence *)hash3, 64);
      vectorized = false;
#endif
      break;
    case HAMSI:
      hamsi512_4way_init(&ctx.hamsi);
      if (!vectorized) {
        intrlv_4x64_512(vhash, hash0, hash1, hash2, hash3);
      }
      hamsi512_4way_update(&ctx.hamsi, vhash, 64);
      hamsi512_4way_close(&ctx.hamsi, vhash);
      vectorized = true;
      break;
    case FUGUE:
      if (vectorized) {
        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
      }
      fugue512_full(&ctx.fugue, hash0, hash0, 64);
      fugue512_full(&ctx.fugue, hash1, hash1, 64);
      fugue512_full(&ctx.fugue, hash2, hash2, 64);
      fugue512_full(&ctx.fugue, hash3, hash3, 64);
      vectorized = false;
      break;
    case SHABAL:
      shabal512_4way_init(&ctx.shabal);
      if (vectorized) {
        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
      }
      // Re-interleave as 4x32 for Shabal, then split the result into lanes.
      intrlv_4x32_512(vhash, hash0, hash1, hash2, hash3);
      shabal512_4way_update(&ctx.shabal, vhash, 64);
      shabal512_4way_close(&ctx.shabal, vhash);
      dintrlv_4x32_512(hash0, hash1, hash2, hash3, vhash);
      vectorized = false;
      break;
    case WHIRLPOOL:
      whirlpool_4way_init(&ctx.whirlpool);
      if (!vectorized) {
        intrlv_4x64_512(vhash, hash0, hash1, hash2, hash3);
      }
      whirlpool_4way(&ctx.whirlpool, vhash, 64);
      whirlpool_4way_close(&ctx.whirlpool, vhash);
      vectorized = true;
      break;
    case CNTurtlelite:
      CRYPTONIGHT_HASH(TURTLELITE, cn_config[Turtlelite]);
      break;
    case CNTurtle:
      CRYPTONIGHT_HASH(TURTLE, cn_config[Turtle]);
      break;
    case CNDarklite:
      CRYPTONIGHT_HASH(DARKLITE, cn_config[Darklite]);
      break;
    case CNDark:
      CRYPTONIGHT_HASH(DARK, cn_config[Dark]);
      break;
    case CNLite:
      CRYPTONIGHT_HASH(LITE, cn_config[Lite]);
      break;
    case CNFast:
      CRYPTONIGHT_HASH(FAST, cn_config[Fast]);
      break;
    default:
      // Unknown algorithm id: the round is skipped, as before.
      break;
    }

    // Stop early. do not stop while benchmarking or tuning.
    if (work_restart[thr_id].restart && !(opt_benchmark || opt_tune)) {
      if (opt_debug && !thr_id) {
        applog(LOG_DEBUG, "Threads exit early.");
      }
      return 0;
    }
  }

  // This should not happen as CN should be last algorithm.
  if (vectorized) {
    dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
  }

  // Only the first 32 bytes of each lane's hash are returned.
  memcpy(output, hash0, 32);
  memcpy(&((uint8_t *)output)[32], hash1, 32);
  memcpy(&((uint8_t *)output)[64], hash2, 32);
  memcpy(&((uint8_t *)output)[96], hash3, 32);
  return 1;
}

/*
 * Scans nonces four at a time, starting at work->data[19], until last_nonce
 * (max_nonce - 4) is reached or a restart is flagged for this thread.
 *
 * work         block data (work->data) and target (work->target);
 *              work->data[19] is updated with the nonce on exit and with the
 *              byte-swapped nonce before each submission.
 * max_nonce    upper bound of the nonce range.
 * hashes_done  out: number of nonces covered by this call; set to 0 when the
 *              call only waited for a restart.
 * mythr        thread descriptor; mythr->id selects per-thread state.
 *
 * Always returns 0; solutions are reported through submit_solution().
 *
 * Before scanning, the function may run the stress test, the tuner (returns
 * immediately afterwards) or the benchmark (thread 0 exits the process).
 */
int scanhash_gr_4way(struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr) {
  uint32_t hash[8 * 4] __attribute__((aligned(64)));
  uint32_t vdata[20 * 4] __attribute__((aligned(64)));
  uint32_t edata[20] __attribute__((aligned(64)));
  uint32_t *pdata = work->data;
  uint32_t *ptarget = work->target;
  const uint32_t first_nonce = pdata[19];
  const uint32_t last_nonce = max_nonce - 4;
  const int thr_id = mythr->id;
  uint32_t n = first_nonce;
  uint32_t hashes = 1;
  __m256i *noncev = (__m256i *)vdata + 9; // aligned
  volatile uint8_t *restart = &(work_restart[thr_id].restart);

  if (opt_stress_test) {
    stress_test(pdata, thr_id);
  }

  if (!opt_tuned && opt_tune) {
    sleep(1);
    tune(pdata, thr_id);
    opt_tuned = true; // Tuned.
    opt_tune = false;
    return 0;
  }

  if (opt_benchmark) {
    sleep(1);
    if (thr_id == 0) {
      applog(LOG_BLUE, "Starting benchmark. Benchmark takes %.0lfs to complete",
             gr_benchmark_time / 1e6);
    }
    benchmark(pdata, thr_id, 0);
    if (thr_id == 0) {
      exit(0);
    }
  }

  mm256_bswap32_intrlv80_4x64(vdata, pdata);

  // Check if algorithm order changed.
  mm128_bswap32_80(edata, pdata);
  gr_getAlgoString((const uint8_t *)(&edata[1]), gr_hash_order);
  if (opt_debug && !thr_id) {
    char order[100];
    for (int i = 0; i < 15 + 3; i++) {
      // Each entry overwrites the previous terminator; bounded to the buffer.
      snprintf(order + (i * 3), sizeof(order) - (size_t)(i * 3), "%02d ",
               gr_hash_order[i]);
    }
    applog(LOG_DEBUG, "Hash order %s", order);
  }
  if (opt_tuned) {
    select_tuned_config(thr_id);
  }

  // Allocates hp_state for Cryptonight algorithms.
  // Needs to be run AFTER gr_hash_order is set!
  AllocateNeededMemory(true);

  // Seed the four lanes with nonces n .. n+3.
  *noncev = mm256_intrlv_blend_32(
      _mm256_set_epi32(n + 3, 0, n + 2, 0, n + 1, 0, n, 0), *noncev);

  // Check if current rotation is "disabled" by the user.
  if (is_rot_disabled()) {
    if (thr_id == 0) {
      applog(LOG_WARNING, "Detected disabled rotation %d. Waiting...",
             (get_config_id() / 2) + 1);
    }
    while (!(*restart)) {
      // sleep for 50ms
      // TODO
      // use pthread_cond instead.
      usleep(50000);
    }
    // Store through the pointer; assigning 0 to the parameter itself would
    // leave the caller's counter untouched.
    *hashes_done = 0;
    return 0;
  }

  if (!is_thread_used(thr_id)) {
    while (!(*restart)) {
      // sleep for 50ms
      // TODO
      // use pthread_cond instead.
      usleep(50000);
    }
    *hashes_done = 0;
    return 0;
  }

  while (likely((n < last_nonce) && !(*restart))) {
    if (gr_4way_hash(hash, vdata, thr_id)) {
      // NOTE(review): results are not checked on iterations where
      // `hashes % 50 == 0`; see the `hashes` update at the end of the loop.
      if (hashes % 50 != 0) {
        for (int i = 0; i < 4; i++) {
          if (unlikely(valid_hash(hash + (i << 3), ptarget))) {
            if (opt_debug) {
              applog(LOG_BLUE, "Solution found. Nonce: %u | Diff: %.10lf",
                     bswap_32(n + i), hash_to_diff(hash + (i << 3)));
            }
            pdata[19] = bswap_32(n + i);
            submit_solution(work, hash + (i << 3), mythr);
            check_prepared();
          }
        }
      }
    }
    // Advance every lane's nonce by 4.
    *noncev = _mm256_add_epi32(*noncev, m256_const1_64(0x0000000400000000));
    n += 4;
    // `hashes` stays fixed when donation is enabled at >= 1.75 percent and
    // otherwise counts iterations.
    hashes += (enable_donation && donation_percent >= 1.75) ? 0 : 1;
  }

  pdata[19] = n;
  *hashes_done = n - first_nonce;
  return 0;
}

#endif // GR_4WAY