-
-
Save YashasSamaga/48bdb167303e10f4d07b754888ddbdcf to your computer and use it in GitHub Desktop.
#include <iostream> | |
#include <algorithm> | |
#include <vector> | |
#include <chrono> | |
#include <numeric> | |
#include <opencv2/dnn.hpp> | |
#include <opencv2/dnn/all_layers.hpp> | |
#include <opencv2/highgui.hpp> | |
#include "benchmark.hpp" | |
/*
 * Input selection
 *
 * OPTION I  (USE_RANDOM_IMAGES defined):
 *     benchmark on randomly generated images.
 *
 * OPTION II (USE_RANDOM_IMAGES not defined):
 *     benchmark on the images in "data/images/img_n.jpg" where `n` varies from 0, 1, 2, 3, ....
 */
#define USE_RANDOM_IMAGES

// Batch size used when a benchmark does not request a specific image count.
constexpr int default_batch_size = 1;
// Describes a backend/target combination that a benchmark should skip.
// bench_network treats -1 in one field as "any value" for that field.
struct mask_type
{
    int backend; // backend id to match, or -1
    int target;  // target id to match, or -1
};
// A named backend/target pair that the benchmarks iterate over.
struct config_type
{
    std::string name; // label printed in square brackets above the results
    int backend;      // backend id passed on to the network
    int target;       // target id passed on to the network
};
// select backend target combinations that you want to test | |
std::vector<config_type> backends = { | |
//{"OCV CPU", cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_CPU}, | |
//{"OCV OpenCL", cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_OPENCL}, | |
//{"OCV OpenCL FP16", cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_OPENCL_FP16}, | |
//{"IE CPU", cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, cv::dnn::DNN_TARGET_CPU}, | |
{"CUDA FP32", cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA}, | |
{"CUDA FP16", cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16} | |
}; | |
std::vector<cv::Mat> image_samples; | |
// Converts any std::chrono duration to std::chrono::milliseconds via duration_cast.
template <class Duration>
auto to_milliseconds(const Duration& duration) {
    using std::chrono::milliseconds;
    return std::chrono::duration_cast<milliseconds>(duration);
}
// Converts any std::chrono duration to std::chrono::microseconds via duration_cast.
template <class Duration>
auto to_microseconds(const Duration& duration) {
    using std::chrono::microseconds;
    return std::chrono::duration_cast<microseconds>(duration);
}
// Timing results for one network on one backend/target combination.
struct perf_result_t
{
    using duration = std::chrono::microseconds;

    // Time of the first (initializing) run. Value-initialized so that a
    // default-constructed result reads as zero instead of an indeterminate value.
    duration init_time{};

    // One entry per measured benchmark run.
    std::vector<duration> runtimes;
};
template <std::size_t BENCHMARK_RUNS, std::size_t WARMUP_RUNS> | |
auto run_network( | |
const std::string& model, const std::string& config, | |
const cv::Mat& blob, | |
const std::vector<std::string>& output_names_, | |
int backend, int target) | |
{ | |
auto net = cv::dnn::readNet(model, config); | |
net.setPreferableBackend(backend); | |
net.setPreferableTarget(target); | |
auto output_names = output_names_; | |
if (output_names.empty()) | |
output_names = net.getUnconnectedOutLayersNames(); | |
std::vector<cv::Mat> output_mats; | |
auto init_time = benchmark([&] { | |
net.setInput(blob); | |
net.forward(output_mats, output_names); | |
}); | |
for(int i = 0; i < WARMUP_RUNS; i++) | |
{ | |
net.setInput(blob); | |
net.forward(output_mats, output_names); | |
} | |
perf_result_t result; | |
result.init_time = init_time; | |
result.runtimes.reserve(BENCHMARK_RUNS); | |
for(int i = 0; i < BENCHMARK_RUNS; i++) | |
{ | |
net.setInput(blob); | |
auto inference_time = benchmark([&] { | |
net.forward(output_mats, output_names); | |
}); | |
result.runtimes.push_back(inference_time); | |
} | |
return result; | |
} | |
void bench_network( | |
const std::string& model, const std::string& config, | |
cv::Size input_size, | |
const std::vector<std::string>& output_names = {}, | |
int count = default_batch_size, | |
std::vector<mask_type> mask = {}) | |
{ | |
#ifndef USE_RANDOM_IMAGES | |
assert(count <= image_samples.size()); | |
#endif | |
std::vector<cv::Mat> images; | |
for (int i = 0; i < count; i++) | |
{ | |
#ifdef USE_RANDOM_IMAGES | |
cv::Mat image(input_size, CV_32FC3); | |
cv::randu(image, cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255)); | |
images.push_back(image); | |
#else | |
images.push_back(image_samples[i]); | |
#endif | |
} | |
cv::Mat blob = cv::dnn::blobFromImages(images, 1.0f, input_size, 0.0f); | |
for (auto c : backends) { | |
auto backend = c.backend; | |
auto target = c.target; | |
bool skip = [backend, target, mask] { | |
for (auto m : mask) { | |
if (m.backend == backend && m.target == target) | |
return true; | |
if (m.backend == backend && m.target == -1) | |
return true; | |
if (m.backend == -1 && m.target == target) | |
return true; | |
} | |
return false; | |
} (); | |
if(skip) | |
continue; | |
try { | |
constexpr int WARMUP_RUNS = 10; | |
constexpr int BENCHMARK_RUNS = 100; | |
auto result = run_network<BENCHMARK_RUNS, WARMUP_RUNS>(model, config, blob, output_names, backend, target); | |
float init_time = to_microseconds(result.init_time).count() / 1000.0; | |
std::vector<float> runtimes; | |
for (auto r : result.runtimes) | |
runtimes.push_back(to_microseconds(r).count() / 1000.0); | |
auto sum = std::accumulate(std::begin(runtimes), std::end(runtimes), 0.0f); | |
auto squared_sum = std::inner_product(std::begin(runtimes), std::end(runtimes), std::begin(runtimes), 0.0f); | |
auto min = *std::min_element(std::begin(runtimes), std::end(runtimes)); | |
auto max = *std::max_element(std::begin(runtimes), std::end(runtimes)); | |
auto mean = sum / runtimes.size(); | |
auto stddev = std::sqrt(squared_sum / runtimes.size() - mean * mean); | |
std::cout << '[' << c.name << "]" << '\n' | |
<< "\tinit >> " << init_time << "ms" << '\n' | |
<< "\tinference >> " << "min = " << min << "ms, max = " << max << "ms, mean = " << mean << "ms, stddev = " << stddev << "ms" << std::endl; | |
} catch(const std::exception& ex) { | |
std::cout << ex.what() << std::endl; | |
return; | |
} | |
} | |
std::cout << std::endl; | |
} | |
void bench_alexnet() | |
{ | |
std::cout << "BVLC AlexNet\n"; | |
bench_network("data/alexnet/deploy.prototxt", "data/alexnet/bvlc_alexnet.caffemodel", cv::Size(227, 227)); | |
std::cout << std::endl; | |
} | |
void bench_densenet121() | |
{ | |
std::cout << "DenseNet 121\n"; | |
bench_network("data/densenet121/DenseNet_121.prototxt", "data/densenet121/DenseNet_121.caffemodel", cv::Size(224, 224)); | |
std::cout << std::endl; | |
} | |
void bench_east_text_detection() | |
{ | |
std::cout << "East Text Detection\n"; | |
bench_network("data/east_text_detection/frozen_east_text_detection.pb", "", cv::Size(320, 320)); | |
std::cout << std::endl; | |
} | |
void bench_enet() | |
{ | |
std::cout << "ENet Cityscapes\n"; | |
bench_network("data/enet/model-cityscapes.net", "", cv::Size(512, 256), {}, 1); | |
std::cout << std::endl; | |
} | |
void bench_fns_stary_night() | |
{ | |
std::cout << "FastNeuralStyle Stary Night\n"; | |
bench_network("data/fns_stary_night/fast_neural_style_eccv16_starry_night.t7", "", cv::Size(320, 240)); | |
std::cout << std::endl; | |
} | |
void bench_googlenet() | |
{ | |
std::cout << "BVLC GoogleNet\n"; | |
bench_network("data/googlenet/deploy.prototxt", "data/googlenet/bvlc_googlenet.caffemodel", cv::Size(224, 224)); | |
std::cout << std::endl; | |
} | |
void bench_inception_v2_faster_rcnn() | |
{ | |
std::cout << "Inception v2 Faster RCNN\n"; | |
bench_network("data/inception_v2_faster_rcnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", "data/inception_v2_faster_rcnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", cv::Size(800, 600), {}, default_batch_size, | |
{ | |
{cv::dnn::DNN_BACKEND_INFERENCE_ENGINE, -1} | |
}); | |
std::cout << std::endl; | |
} | |
void bench_inception_v2_mask_rcnn() | |
{ | |
std::cout << "Inception v2 Mask RCNN\n"; | |
bench_network("data/inception_v2_mask_rcnn/mask_rcnn_inception_v2_coco_2018_01_28.pbtxt", "data/inception_v2_mask_rcnn/mask_rcnn_inception_v2_coco_2018_01_28.pb", cv::Size(1024, 1024), { "detection_out_final", "detection_masks"}); | |
std::cout << std::endl; | |
} | |
void bench_mobilenet_ssd() | |
{ | |
std::cout << "MobileNet SSD\n"; | |
bench_network("data/mobilenet_ssd/MobileNetSSD_deploy.prototxt", "data/mobilenet_ssd/MobileNetSSD_deploy.caffemodel", cv::Size(300, 300)); | |
std::cout << std::endl; | |
} | |
void bench_mobilenet_ssd_v1_coco() | |
{ | |
std::cout << "MobileNet SSD v1 Coco\n"; | |
bench_network("data/mobilenet_ssd_v1_coco_2017_11_17/ssd_mobilenet_v1_coco_2017_11_17.pb", "data/mobilenet_ssd_v1_coco_2017_11_17/ssd_mobilenet_v1_coco_2017_11_17.pbtxt", cv::Size(300, 300)); | |
std::cout << std::endl; | |
} | |
void bench_mobilenet_ssd_v2_coco() | |
{ | |
std::cout << "MobileNet SSD v2 Coco\n"; | |
bench_network("data/mobilenet_ssd_v2_coco_2018_03_29/ssd_mobilenet_v2_coco_2018_03_29.pb", "data/mobilenet_ssd_v2_coco_2018_03_29/ssd_mobilenet_v2_coco_2018_03_29.pbtxt", cv::Size(300, 300)); | |
std::cout << std::endl; | |
} | |
void bench_opencv_face_detector() | |
{ | |
std::cout << "OpenCV Face Detector\n"; | |
bench_network("data/opencv_face_detector/deploy.prototxt", "data/opencv_face_detector/res10_300x300_ssd_iter_140000_fp16.caffemodel", cv::Size(300, 300)); | |
std::cout << std::endl; | |
} | |
void bench_openface_nn4_small2_v1() | |
{ | |
std::cout << "OpenFace nn4 small2 v1\n"; | |
bench_network("data/openface_nn4_small2_v1/nn4.small2.v1.t7", "", cv::Size(96, 96)); | |
std::cout << std::endl; | |
} | |
void bench_openpose_pose_mpi() | |
{ | |
std::cout << "OpenPose pose MPI\n"; | |
bench_network("data/openpose_pose_mpi/openpose_pose_mpi_faster_4_stages.prototxt", "data/openpose_pose_mpi/pose_iter_160000.caffemodel", cv::Size(368, 368)); | |
std::cout << std::endl; | |
} | |
void bench_resnet50() | |
{ | |
std::cout << "ResNet 50\n"; | |
bench_network("data/resnet50/ResNet-50-deploy.prototxt", "data/resnet50/ResNet-50-model.caffemodel", cv::Size(224, 224)); | |
std::cout << std::endl; | |
} | |
void bench_resnet50_faster_rcnn() | |
{ | |
std::cout << "ResNet50 Faster RCNN\n"; | |
bench_network("data/resnet50_faster_rcnn/faster_rcnn_resnet50_coco_2018_01_28.pbtxt", "data/resnet50_faster_rcnn/faster_rcnn_resnet50_coco_2018_01_28.pb", cv::Size(224, 224)); | |
std::cout << std::endl; | |
} | |
void bench_resnet101() | |
{ | |
std::cout << "ResNet 101\n"; | |
bench_network("data/resnet101/ResNet-101-deploy.prototxt", "data/resnet101/ResNet-101-model.caffemodel", cv::Size(224, 224)); | |
std::cout << std::endl; | |
} | |
void bench_squeezenet() | |
{ | |
std::cout << "SqueezeNet v1.1\n"; | |
bench_network("data/squeezenet/squeezenet_v1.1.prototxt", "data/squeezenet/squeezenet_v1.1.caffemodel", cv::Size(227, 227)); | |
std::cout << std::endl; | |
} | |
void bench_inception_v2_coco() | |
{ | |
std::cout << "Inception v2 Coco\n"; | |
bench_network("data/ssd_inception_v2_coco_2017_11_17/ssd_inception_v2_coco_2017_11_17.pb", "data/ssd_inception_v2_coco_2017_11_17/ssd_inception_v2_coco_2017_11_17.pbtxt", cv::Size(300, 300)); | |
std::cout << std::endl; | |
} | |
void bench_tensorflow_inception_5h() | |
{ | |
std::cout << "TensorFlow Inception 5h\n"; | |
bench_network("data/tensorflow_inception_5h/tensorflow_inception_graph.pb", "", cv::Size(224, 224)); | |
std::cout << std::endl; | |
} | |
void bench_vgg16() | |
{ | |
std::cout << "VGG16 SSD\n"; | |
bench_network("data/vgg16/ssd_vgg16.prototxt", "data/vgg16/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", cv::Size(300, 300)); | |
std::cout << std::endl; | |
} | |
void bench_vgg16_faster_rcnn() | |
{ | |
std::cout << "VGG16 Faster RCNN\n"; | |
bench_network("data/vgg16_faster_rcnn/faster_rcnn_vgg16.prototxt", "data/vgg16_faster_rcnn/VGG16_faster_rcnn_final.caffemodel", cv::Size(224, 224)); | |
std::cout << std::endl; | |
} | |
void bench_vgg_ssd512() | |
{ | |
std::cout << "VGG SSD512\n"; | |
bench_network("data/vgg512/deploy.prototxt", "data/vgg512/VGG_coco_SSD_512x512_iter_360000.caffemodel", cv::Size(512, 512)); | |
std::cout << std::endl; | |
} | |
void bench_yolo_v2() | |
{ | |
std::cout << "YOLO v2\n"; | |
bench_network("data/yolov2/yolov2.cfg", "data/yolov2/yolov2.weights", cv::Size(608, 608)); | |
std::cout << std::endl; | |
} | |
void bench_yolo_v3() | |
{ | |
std::cout << "YOLO v3\n"; | |
bench_network("data/yolov3/yolov3.cfg", "data/yolov3/yolov3.weights", cv::Size(608, 608)); | |
std::cout << std::endl; | |
} | |
void bench_yolo_v3_spp() | |
{ | |
std::cout << "YOLO v3 SPP\n"; | |
bench_network("data/yolov3_spp/yolov3-spp.cfg", "data/yolov3_spp/yolov3-spp.weights", cv::Size(608, 608)); | |
std::cout << std::endl; | |
} | |
void bench_yolov3_enet_b0() | |
{ | |
std::cout << "EfficientNet B0 YOLOv3\n"; | |
bench_network("data/yolov3-enet-b0/enet-coco.cfg", "data/yolov3-enet-b0/enetb0-coco_final.weights", cv::Size(416, 416)); | |
std::cout << std::endl; | |
} | |
void bench_yolo_v3_tiny() | |
{ | |
std::cout << "YOLO v3 Tiny\n"; | |
bench_network("data/yolov3-tiny/yolov3-tiny.cfg", "data/yolov3-tiny/yolov3-tiny.weights", cv::Size(416, 416)); | |
std::cout << std::endl; | |
} | |
void bench_yolo_v3_tiny_prn() | |
{ | |
std::cout << "YOLO v3 Tiny PRN\n"; | |
bench_network("data/yolov3-tiny-prn/yolov3-tiny-prn.cfg", "data/yolov3-tiny-prn/yolov3-tiny-prn.weights", cv::Size(416, 416)); | |
std::cout << std::endl; | |
} | |
void bench_yolo_v4() | |
{ | |
std::cout << "YOLO v4\n"; | |
bench_network("data/yolov4/yolov4.cfg", "data/yolov4/yolov4.weights", cv::Size(608, 608)); | |
std::cout << std::endl; | |
} | |
void bench_yolo_v4_tiny() | |
{ | |
std::cout << "YOLO v4 Tiny\n"; | |
bench_network("data/yolov4-tiny/yolov4-tiny.cfg", "data/yolov4-tiny/yolov4-tiny.weights", cv::Size(416, 416)); | |
std::cout << std::endl; | |
} | |
void bench_zf_faster_rcnn() | |
{ | |
std::cout << "ZF Faster RCNN\n"; | |
bench_network("data/zf_faster_rcnn/faster_rcnn_zf.prototxt", "data/zf_faster_rcnn/ZF_faster_rcnn_final.caffemodel", cv::Size(224, 224)); | |
std::cout << std::endl; | |
} | |
int main(int argc, char *argv[]) | |
{ | |
constexpr auto total_images = 10; | |
auto prefix = std::string("data/images/img_"), | |
suffix = std::string(".jpg"); | |
/* populate sample images */ | |
for (int i = 0; i < total_images; i++) { | |
auto file = prefix + std::to_string(i) + suffix; | |
auto image = cv::imread(file); | |
image_samples.push_back(image); | |
} | |
bench_yolo_v4(); | |
bench_yolo_v4_tiny(); | |
return 0; | |
bench_alexnet(); | |
bench_densenet121(); | |
bench_east_text_detection(); | |
bench_enet(); | |
bench_fns_stary_night(); | |
bench_googlenet(); | |
bench_inception_v2_faster_rcnn(); | |
bench_inception_v2_mask_rcnn(); | |
bench_mobilenet_ssd(); | |
bench_mobilenet_ssd_v1_coco(); | |
bench_mobilenet_ssd_v2_coco(); | |
bench_opencv_face_detector(); | |
bench_openface_nn4_small2_v1(); | |
bench_openpose_pose_mpi(); | |
bench_resnet50(); | |
bench_resnet50_faster_rcnn(); | |
bench_resnet101(); | |
bench_squeezenet(); | |
bench_inception_v2_coco(); | |
bench_tensorflow_inception_5h(); | |
bench_vgg16(); | |
bench_vgg_ssd512(); | |
bench_vgg16_faster_rcnn(); | |
bench_yolo_v2(); | |
bench_yolo_v3_tiny(); | |
bench_yolo_v3_tiny_prn(); | |
bench_yolo_v3(); | |
bench_yolo_v3_spp(); | |
bench_yolov3_enet_b0(); | |
bench_yolo_v4(); | |
bench_yolo_v4_tiny(); | |
bench_zf_faster_rcnn(); | |
return 0; | |
} |
#ifndef BENCHMARK_HPP | |
#define BENCHMARK_HPP | |
#include <chrono> | |
/*
 * Calls function(args...) once and returns the elapsed time, measured with
 * std::chrono::steady_clock and converted to std::chrono::microseconds.
 */
template <class Function, typename ...Args>
auto benchmark(Function function, Args&& ...args) {
    using clock = std::chrono::steady_clock;

    const clock::time_point started = clock::now();
    function(std::forward<Args>(args)...);
    const clock::time_point finished = clock::now();

    return std::chrono::duration_cast<std::chrono::microseconds>(finished - started);
}
/*
 * doNotOptimizeAway from https://stackoverflow.com/a/36781982/1935009
 *
 * One definition is selected per compiler; each takes a value by forwarding
 * reference and is written so that the compiler is discouraged from
 * discarding the computation that produced it.
 */
#if defined(_MSC_VER)

/* MSVC: a self-assignment compiled with optimizations switched off. */
#pragma optimize("", off)
template <class Value>
void doNotOptimizeAway(Value&& datum) {
    datum = datum;
}
#pragma optimize("", on)

#elif defined(__clang__)

/* Clang: an empty function carrying the optnone attribute. */
template <class Value>
__attribute__((__optnone__)) void doNotOptimizeAway(Value&& /* datum */) {}

#else

/* Other compilers: an empty asm statement with `datum` as a read-write register operand. */
template <class Value>
void doNotOptimizeAway(Value&& datum) {
    asm volatile("" : "+r" (datum));
}

#endif
#endif /* BENCHMARK_HPP */ |
g++ -I/usr/local/include/opencv4/ benchmark.cpp -lopencv_core -lopencv_imgproc -lopencv_dnn -lopencv_imgcodecs -O3 -std=c++17 |
The statistics reported are for the `net.forward()` call. You have got 70.73ms for a batch size of four. That's the total time for the inference. To calculate the time per image, divide 70.73 by 4, which comes to ~17.68ms per image — slightly faster than with a batch size of one. The decrease you see in FP32 is presumably because of NMS.
You can get a significant speedup by disabling NMS. You have to set `nms_threshold=0` in all `[yolo]` blocks in `yolov3.cfg`. Check this for information on doing it from the code.
Thanks for your quick reply.
When I copy the code to C++, it cannot find the region layer:
auto layer = net.getLayer(layerId).dynamicCast<cv::dnn::RegionLayer>();
My OpenCV is 4.3.0, built with CUDA. Which version is your code using?
@xjsxujingsong You need the master branch (the `nmsThreshold` field in the region layer was exposed recently). You can use YOLOv4 if you use the master branch. YOLOv4 is nearly as fast as YOLOv3 and gives higher performance.
Thanks. I am rebuilding the latest OpenCV.
I found your page through https://github.com/AlexeyAB/darknet
From the table there, take the last line for example
OpenCV FP16, FPS : 100
OpenCV FP16 batch=4, FPS: 133
It looks like batching won't speed things up much. Is this using this benchmark.cpp, or did you disable the NMS?
@xjsxujingsong I updated benchmark.cpp to disable NMS yesterday. The table there used the old `benchmark.cpp` code with NMS disabled manually by setting `nms_threshold=0` in all `[yolo]` blocks in `yolov4.cfg`. The new `benchmark.cpp` does the same in the code.
Hi @YashasSamaga, I want to implement YOLO detection on multiple cameras.
So, my strategy right now is spawning a YOLO model for each camera, which is not a good idea since the GPU memory used will become very large (if one YOLO model needs ~2GB, then an RTX 2080 Ti can only handle 4-5 cameras at once).
I have an idea of utilizing batch size to solve this problem.
> The statistics reported are for the `net.forward()` call. You have got 70.73ms for a batch size of four. That's the total time for the inference. To calculate the time per image, divide 70.73 by 4, which comes to ~17.68ms per image — slightly faster than with a batch size of one. The decrease you see in FP32 is presumably because of NMS.
>
> You can get a significant speedup by disabling NMS. You have to set `nms_threshold=0` in all `[yolo]` blocks in `yolov3.cfg`. Check this for information on doing it from the code.
Based on your answer above, if I want to use batching to handle multiple cameras, I have to consider how many cameras will be handled by one YOLO model so that the FPS won't drop.
Have I come to the right conclusion? Or do you have any suggestions regarding this solution?
best regards,
Albert Christianto
I would recommend having a fixed number of networks (one network on each thread) — you can have a constant parameter, say `NUM_NETWORK_THREADS`, that you can tune by trial and error. You can have a task queue which your cameras will populate with work items. One of the idle networks will pick up a pending work item and dump its outputs to an output queue. If the queue has multiple items pending, you can do batch inference to improve throughput.
The most important point you have to remember is that both latency and throughput increase with batch size. Small batch sizes will report results faster but overall throughput will be lower (time per image is higher); large batch sizes will take longer to report results but overall throughput will be higher (time per image is lower). You need to balance the two factors for optimal performance (and maybe power consumption).
@YashasSamaga.
Ok, I get the point.
Thank you for your reply and your insight. It is very helpful for me.
best regards,
Albert Christianto
Hi, thanks for your code.
I am testing the speed on yolov3. It looks like increasing batch_size won't speed up the process. See the log for batch=1, 2, and 4. The running time increases when the batch size increases. Is this normal?
YOLO v3
BATCH=1
[CUDA FP32]
init >> 724.753ms
inference >> min = 36.77ms, max = 46.585ms, mean = 37.8626ms, stddev = 1.09954ms
[CUDA FP16]
init >> 469.271ms
inference >> min = 19.636ms, max = 22.132ms, mean = 20.1432ms, stddev = 0.410955ms
BATCH=2
[CUDA FP32]
init >> 987.252ms
inference >> min = 65.468ms, max = 71.962ms, mean = 67.4592ms, stddev = 0.933586ms
[CUDA FP16]
init >> 510.797ms
inference >> min = 35.917ms, max = 38.802ms, mean = 36.991ms, stddev = 0.525474ms
BATCH=4
[CUDA FP32]
init >> 947.806ms
inference >> min = 118.306ms, max = 139.88ms, mean = 133.521ms, stddev = 2.06297ms
[CUDA FP16]
init >> 632.171ms
inference >> min = 69.454ms, max = 75.5ms, mean = 70.737ms, stddev = 1.05118ms