/** * Copyright (c) 2016-present, Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #ifdef _WIN32 #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif #include #include #endif #include #include "caffe2/core/blob_serialization.h" #ifdef __CUDA_ARCH__ #include "caffe2/core/context_gpu.h" #endif #include "caffe2/core/init.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor_int8.h" #include "caffe2/utils/bench_utils.h" #include "caffe2/utils/string_utils.h" #include #include #include #if defined(TARGET_OS_MAC) || \ defined(TARGET_OS_IPHONE) || \ defined(TARGET_IPHONE_SIMULATOR) #include #else #include #endif void observerConfig() { caffe2::ClearGlobalNetObservers(); caffe2::AddGlobalNetObserverCreator([](caffe2::NetBase* subject) { return std::make_unique(subject); }); caffe2::ObserverConfig::setReporter( std::make_unique()); } bool backendCudaSet(const string& backend) { bool run_on_gpu = false; if (backend == "cuda") { #ifdef __CUDA_ARCH__ if (caffe2::HasCudaGPU()) { run_on_gpu = true; } else { CAFFE_THROW("NO GPU support on this host machine"); } #else CAFFE_THROW("NO GPU support"); #endif } return run_on_gpu; } void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) { for (int j = 0; j < net_def->op_size(); j++) { caffe2::OperatorDef* op = net_def->mutable_op(j); op->mutable_device_option()->set_device_type(caffe2::TypeToProto(run_dev)); } } void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) { if (backend != "builtin") { string engine = backend == "nnpack" ? "NNPACK" : backend == "eigen" ? "EIGEN" : backend == "mkl" ? "MKLDNN" : backend == "cuda" ? "CUDA" : backend == "dnnlowp" ? "DNNLOWP" : backend == "dnnlowp_acc16" ? "DNNLOWP_ACC16" : backend == "default" ? "" : "NONE"; CAFFE_ENFORCE(engine != "NONE", "Backend is not supported"); for (int i = 0; i < net_def->op_size(); i++) { caffe2::OperatorDef* op_def = net_def->mutable_op(i); op_def->set_engine(engine); } } } int loadInput( shared_ptr workspace, const bool run_on_gpu, map& tensor_protos_map, const string& input, const string& input_file, const string& input_dims, const string& input_type) { // How many input blobs are in the inputs int blob_num = 1; // Load input. if (input.size()) { vector input_names = caffe2::split(',', input); if (input_file.size()) { vector input_files = caffe2::split(',', input_file); CAFFE_ENFORCE_EQ( input_names.size(), input_files.size(), "Input name and file should have the same number."); for (int i = 0; i < input_names.size(); ++i) { caffe2::TensorProtos tensor_protos; CAFFE_ENFORCE( caffe2::ReadProtoFromFile(input_files[i], &tensor_protos)); workspace->CreateBlob(input_names[i]); tensor_protos_map.insert(std::make_pair(input_names[i], tensor_protos)); } // Check that all blobs have the same number of entries blob_num = tensor_protos_map[input_names[0]].protos_size(); for (int i = 1; i < input_names.size(); ++i) { int bnum = tensor_protos_map[input_names[i]].protos_size(); CAFFE_ENFORCE_EQ( blob_num, bnum, "Number of blobs are not the same for all inputs"); } } else if (input_dims.size() || input_type.size()) { CAFFE_ENFORCE_GE( input_dims.size(), 0, "Input dims must be specified when input tensors are used."); CAFFE_ENFORCE_GE( input_type.size(), 0, "Input type must be specified when input tensors are used."); vector input_dims_list = caffe2::split(';', input_dims); CAFFE_ENFORCE_EQ( input_names.size(), input_dims_list.size(), "Input name and dims should have the same number of items."); vector input_type_list = caffe2::split(';', input_type); CAFFE_ENFORCE_EQ( input_names.size(), input_type_list.size(), "Input name and type should have the same number of items."); for (size_t i = 0; i < input_names.size(); ++i) { vector input_dims_str = caffe2::split(',', input_dims_list[i]); vector input_dims; for (const string& s : input_dims_str) { input_dims.push_back(c10::stoi(s)); } caffe2::Blob* blob = workspace->GetBlob(input_names[i]); if (blob == nullptr) { blob = workspace->CreateBlob(input_names[i]); } if (run_on_gpu) { LOG(INFO) << "Running on GPU."; #ifdef __CUDA_ARCH__ caffe2::TensorCUDA* tensor = blob->GetMutable(); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); if (input_type_list[i] == "uint8_t") { tensor->mutable_data(); } else if (input_type_list[i] == "float") { tensor->mutable_data(); } else { CAFFE_THROW("Unsupported input type: ", input_type_list[i]); } #else CAFFE_THROW("Not support GPU on mobile."); #endif } else { if (input_type_list[i] == "uint8_t") { caffe2::int8::Int8TensorCPU* tensor = blob->GetMutable(); CHECK_NOTNULL(tensor); tensor->t.Resize(input_dims); tensor->t.mutable_data(); } else if (input_type_list[i] == "float") { caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); tensor->mutable_data(); } else if (input_type_list[i] == "int") { caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); tensor->mutable_data(); } else { CAFFE_THROW("Unsupported input type: ", input_type_list[i]); } } } } else { CAFFE_THROW( "You requested input tensors, but neither input_file nor " "input_dims is set."); } } return blob_num; } void fillInputBlob( shared_ptr workspace, map& tensor_protos_map, int iteration) { if (tensor_protos_map.empty()) { return; } static caffe2::TensorDeserializer deserializer; for (auto& tensor_kv : tensor_protos_map) { caffe2::Blob* blob = workspace->GetBlob(tensor_kv.first); if (blob == nullptr) { blob = workspace->CreateBlob(tensor_kv.first); } // todo: support gpu and make this function a template int protos_size = tensor_kv.second.protos_size(); if (protos_size == 1 && iteration > 0) { // Do not override the input data if there is only one input data, // since it will clear all caches. Rely on wipe_cache to // clear caches continue; } caffe2::TensorProto* tensor_proto = tensor_kv.second.mutable_protos(iteration % protos_size); BlobSetTensor(blob, deserializer.Deserialize(*tensor_proto)); // todo: for other types } } void runNetwork( shared_ptr workspace, caffe2::NetBase* net, map& tensor_protos_map, const bool wipe_cache, const bool run_individual, const bool run_on_gpu, const bool text_output, const int warmup, const int iter, const int num_blobs, const int sleep_before_run, const int sleep_between_iteration, const int sleep_between_net_and_operator, const std::string& output, const std::string& output_folder) { LOG(INFO) << "Starting benchmark."; caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup); LOG(INFO) << "Running warmup runs."; for (int i = 0; i < warmup; ++i) { fillInputBlob(workspace, tensor_protos_map, i); CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed."); } if (wipe_cache) { caffe2::wipe_cache(); } if (sleep_before_run > 0) { std::this_thread::sleep_for(std::chrono::seconds(sleep_before_run)); } LOG(INFO) << "Main runs."; CAFFE_ENFORCE( iter >= 0, "Number of main runs should be non negative, provided ", iter, "."); LOG(INFO) << "net runs."; long long duration_sum = 0; for (int i = 0; i < iter; ++i) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup); fillInputBlob(workspace, tensor_protos_map, i); if (wipe_cache) { caffe2::wipe_cache(); } auto start = std::chrono::high_resolution_clock::now(); CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed."); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(stop - start); duration_sum += duration.count(); // Write the output for the first num_blobs times writeOutput( workspace, run_on_gpu, output, output_folder, text_output, i, num_blobs); if (wipe_cache) { caffe2::wipe_cache(); } if (sleep_between_iteration > 0) { std::this_thread::sleep_for( std::chrono::seconds(sleep_between_iteration)); } } std::cout << "Average Duration: " << (duration_sum/iter) << " us" << std::endl; if (run_individual) { LOG(INFO) << "operator runs."; if (sleep_between_net_and_operator > 0) { std::this_thread::sleep_for( std::chrono::seconds(sleep_between_net_and_operator)); } for (int i = 0; i < iter; ++i) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup); fillInputBlob(workspace, tensor_protos_map, i); CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed."); if (wipe_cache) { caffe2::wipe_cache(); } if (sleep_between_iteration > 0) { std::this_thread::sleep_for( std::chrono::seconds(sleep_between_iteration)); } } } } void writeOutput( shared_ptr workspace, const bool run_on_gpu, const string& output, const string& output_folder, const bool text_output, const int index, const int num_blobs) { if (output.size() == 0) { return; } string output_prefix = output_folder.size() ? output_folder + "/" : ""; vector output_names = caffe2::split(',', output); if (output == "*") { output_names = workspace->Blobs(); } for (const string& name : output_names) { CAFFE_ENFORCE( workspace->HasBlob(name), "You requested a non-existing blob: ", name); if (text_output) { if (run_on_gpu) { #ifdef __CUDA_ARCH__ writeTextOutput( workspace->GetBlob(name)->GetMutable(), output_prefix, name, index, num_blobs); #else CAFFE_THROW("Not support GPU."); #endif } else { writeTextOutput( BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU), output_prefix, name, index, num_blobs); } } else { // Do not support multiple entries per blob. CAFFE_ENFORCE( index == 0, "Binary file only support one output."); string serialized = SerializeBlob(*workspace->GetBlob(name), name); string output_filename = output_prefix + name; caffe2::WriteStringToFile(serialized, output_filename.c_str()); } } } void logBenchmarkResult( const std::string& type, const std::string& metric, const std::string& unit, const int value) { LOG(INFO) << caffe2::NetObserverReporterPrint::IDENTIFIER << "{" << "\"type\": \"" << type << "\", " << "\"metric\": \"" << metric << "\", " << "\"unit\": \"" << unit << "\", " << "\"value\": " << c10::to_string(value) << "}\n"; } long getVirtualMemoryIfOptionEnabled(bool FLAGS_measure_memory) { if (FLAGS_measure_memory) { #if defined(TARGET_OS_IPHONE) || \ defined(TARGET_OS_MAC) || \ defined(TARGET_IPHONE_SIMULATOR) malloc_statistics_t stats = {0}; malloc_zone_statistics(nullptr, &stats); return stats.size_allocated; #elif defined(_WIN32) PROCESS_MEMORY_COUNTERS_EX pmc; GetProcessMemoryInfo( GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS*)&pmc, sizeof(pmc)); return pmc.PrivateUsage; #else struct mallinfo info = mallinfo(); return info.uordblks; #endif } return 0; } int benchmark( int argc, char* argv[], const string& FLAGS_backend, const string& FLAGS_init_net, const string& FLAGS_input, const string& FLAGS_input_dims, const string& FLAGS_input_file, const string& FLAGS_input_type, int FLAGS_iter, bool FLAGS_measure_memory, const string& FLAGS_net, const string& FLAGS_output, const string& FLAGS_output_folder, bool FLAGS_run_individual, int FLAGS_sleep_before_run, int FLAGS_sleep_between_iteration, int FLAGS_sleep_between_net_and_operator, bool FLAGS_text_output, int FLAGS_warmup, bool FLAGS_wipe_cache) { // Check arguments to be correct { // Need to check whether file exists, as the file reader does not assert if // file does not exist std::ifstream net_file(FLAGS_net); CAFFE_ENFORCE(net_file.good()); net_file.close(); std::ifstream init_net_file(FLAGS_init_net); CAFFE_ENFORCE(init_net_file.good()); init_net_file.close(); if (FLAGS_input_file.size() > 0) { vector input_files = caffe2::split(',', FLAGS_input_file); for (auto input_file : input_files) { std::ifstream ifile(input_file); CAFFE_ENFORCE(ifile.good()); ifile.close(); } } } observerConfig(); caffe2::ShowLogInfoToStderr(); auto workspace = std::make_shared(new caffe2::Workspace()); bool run_on_gpu = backendCudaSet(FLAGS_backend); // Run initialization network, measure resources used. long init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory); caffe2::NetDef init_net_def; CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def)); setOperatorEngine(&init_net_def, FLAGS_backend); CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - init_vmem; map tensor_protos_map; int num_blobs = loadInput( workspace, run_on_gpu, tensor_protos_map, FLAGS_input, FLAGS_input_file, FLAGS_input_dims, FLAGS_input_type); // Run main network. long predict_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory); caffe2::NetDef net_def; CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def)); setOperatorEngine(&net_def, FLAGS_backend); if (!net_def.has_name()) { net_def.set_name("benchmark"); } caffe2::NetBase* net = workspace->CreateNet(net_def); CHECK_NOTNULL(net); runNetwork( workspace, net, tensor_protos_map, FLAGS_wipe_cache, FLAGS_run_individual, run_on_gpu, FLAGS_text_output, FLAGS_warmup, FLAGS_iter, num_blobs, FLAGS_sleep_before_run, FLAGS_sleep_between_iteration, FLAGS_sleep_between_net_and_operator, FLAGS_output, FLAGS_output_folder); predict_vmem = getVirtualMemoryIfOptionEnabled( FLAGS_measure_memory) - predict_vmem; if (FLAGS_measure_memory) { logBenchmarkResult( "NET_", "memory", "kB", (init_vmem + predict_vmem) / 1024); } return 0; }