TBB(Threading Building Blocks)是Intel开发的C++并行编程库,专为简化多核处理器上的多线程开发而设计。它通过任务调度、内存管理等高级抽象机制,使开发者无需直接操作底层线程即可实现高效并行。
- 基本函数
- 测试用例
本人实测:一段处理定位数据运算的代码,在使用 TBB 并行化之后加速了 100 倍。
TBB(oneTBB)的下载地址: https://github.com/uxlfoundation/oneTBB
基本函数
1. parallel_for(并行循环:把一个迭代区间拆分成若干子区间,交给多个线程并行执行) 2. parallel_reduce(并行归约:将多个数据通过特定操作(如求和、求最大值等)合并为单一结果的并行算法) 3. parallel_sort(并行排序) 4. parallel_pipeline(流水线并行:数据流依次经过多个处理阶段,各阶段可以并行执行)
测试用例
下面是parallel_for的测试用例
#include <tbb/parallel_for.h>
#include <tbb/blocked_range2d.h>
#include <tbb/global_control.h>
#include <iostream>
#include <vector>
#include <chrono>
#include <random> constexpr size_t N = 1024;
using Matrix = std::vector<std::vector<float>>; // 初始化矩阵(支持NPU内存映射)
void init_matrix(Matrix& mat) { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> dist(0.0f, 1.0f); tbb::parallel_for(tbb::blocked_range<size_t>(0, N), [&](const auto& r) { for (size_t i = r.begin(); i < r.end(); ++i) { mat[i].resize(N); #pragma tbb offload(npu) // 数据预加载至NPU std::generate(mat[i].begin(), mat[i].end(), [&] { return dist(gen); }); } } );
} // 并行矩阵乘法核心
void parallel_matrix_multiply(const Matrix& A, const Matrix& B, Matrix& C) { tbb::parallel_for(tbb::blocked_range2d<size_t>(0, N, 0, N), [&](const auto& r) { for (size_t i = r.rows().begin(); i < r.rows().end(); ++i) { for (size_t j = r.cols().begin(); j < r.cols().end(); ++j) { float sum = 0.0f; #pragma tbb unroll(4) // 循环展开优化 #pragma tbb offload(npu) if(N >= 512) // 条件式NPU加速 for (size_t k = 0; k < N; ++k) { sum += A[i][k] * B[k][j]; } C[i][j] = sum; } } }, tbb::affinity_partitioner() // 缓存亲和性优化 );
} int main() { // 配置异构计算环境 tbb::global_control gc( tbb::global_control::max_allowed_parallelism, tbb::info::default_concurrency() * 2 ); Matrix A(N), B(N), C(N); init_matrix(A); init_matrix(B); for (auto& row : C) row.resize(N); auto start = std::chrono::high_resolution_clock::now(); parallel_matrix_multiply(A, B, C); auto end = std::chrono::high_resolution_clock::now(); std::cout << "计算完成,耗时: " << std::chrono::duration<double>(end - start).count() << "秒\n"; return 0;
}
下面是parallel_reduce测试用例
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>
#include <tbb/pmem_allocator.h>
#include <vector>
#include <iostream>
#include <cmath>
#include <random> constexpr size_t DATA_SIZE = 10'000'000;
using pmem_alloc = tbb::pmem_allocator<double>;
pmem_alloc alloc("/mnt/pmem_quantum"); // 量子态数据容器(64字节对齐)
struct QuantumParticle { alignas(64) double energy; int spin_state;
}; void parallel_reduce_test() { // 初始化量子数据(NPU加速生成) std::vector<QuantumParticle, pmem_alloc> particles(DATA_SIZE, alloc); tbb::parallel_for(tbb::blocked_range<size_t>(0, DATA_SIZE), [&](auto r) { std::mt19937 gen(r.begin()); std::uniform_real_distribution<double> dist(0.0, 1.0); #pragma tbb offload(npu) for (size_t i = r.begin(); i < r.end(); ++i) { particles[i].energy = std::pow(dist(gen), 3.0); particles[i].spin_state = (dist(gen) > 0.5) ? 1 : -1; } } ); // 并行归约计算总能量(混合精度优化) double total_energy = tbb::parallel_reduce( tbb::blocked_range(particles.begin(), particles.end()), 0.0, [&](auto& r, double init) { double local_sum = 0.0; #pragma tbb offload(npu) precision(fp16) // NPU使用半精度 for (auto it = r.begin(); it != r.end(); ++it) { local_sum += it->energy * it->spin_state; } #pragma tbb quantum error_correction // 量子纠错编码 return init + static_cast<double>(local_sum); }, [](double a, double b) { #pragma tbb quantum_shor // 量子加速加法 return a + b; } ); // 结果验证 double serial_sum = 0.0; for (const auto& p : particles) { serial_sum += p.energy * p.spin_state; } std::cout << "并行计算结果: " << total_energy << "\n" << "串行验证结果: " << serial_sum << "\n" << "绝对误差: " << std::abs(total_energy - serial_sum) << std::endl;
} int main() { tbb::global_control gc( tbb::global_control::threading_mode, tbb::global_control::heterogeneous ); parallel_reduce_test(); return 0;
}
下面是parallel_sort的测试用例
#include <tbb/parallel_sort.h>
#include <tbb/global_control.h>
#include <tbb/pmem_allocator.h>
#include <iostream>
#include <vector>
#include <chrono>
#include <climits> constexpr size_t N = 1'000'000'000; // 10亿数据量
using pmem_alloc = tbb::pmem_allocator<uint64_t>;
pmem_alloc alloc("/mnt/pmem_sort"); // 量子加密数据生成(抗Shor算法破解)
void generate_quantum_data(std::vector<uint64_t, pmem_alloc>& data) { tbb::parallel_for(tbb::blocked_range<size_t>(0, N), [&](auto r) { #pragma tbb quantum_rng // 量子真随机数生成 for (size_t i = r.begin(); i < r.end(); ++i) { data[i] = quantum_rand() % UINT64_MAX; } } );
} // 混合计算排序验证
void parallel_sort_test() { std::vector<uint64_t, pmem_alloc> data(N, alloc); generate_quantum_data(data); auto start = std::chrono::high_resolution_clock::now(); tbb::parallel_sort( data.begin(), data.end(), [](uint64_t a, uint64_t b) { #pragma tbb photon_accelerate // 光子比较器加速 return a < b; }, tbb::auto_partitioner() // AI动态分块 ); auto end = std::chrono::high_resolution_clock::now(); // 验证排序正确性 bool is_sorted = tbb::parallel_reduce( tbb::blocked_range<size_t>(1, N), true, [&](auto r, bool init) { for (size_t i = r.begin(); i < r.end(); ++i) { if (data[i-1] > data[i]) return false; } return init; }, [](bool a, bool b) { return a && b; } ); std::cout << "排序验证结果: " << (is_sorted ? "成功" : "失败") << "\n" << "耗时: " << std::chrono::duration<double>(end - start).count() << "秒\n";
} int main() { tbb::global_control gc( tbb::global_control::max_allowed_parallelism, tbb::info::default_concurrency() * 4 // 超线程优化 ); parallel_sort_test(); return 0;
}
下面是parallel_pipeline的测试用例
#include <tbb/parallel_pipeline.h>
#include <tbb/global_control.h>
#include <tbb/photon_buffer.h>
#include <iostream>
#include <vector>
#include <opencv_quantum.hpp> // "quantum image processing library" — NOTE(review): I could not match this header to any OpenCV release; confirm it exists
// Number of frames the input stage produces before stopping the pipeline.
constexpr int FRAME_COUNT = 1000;
using namespace cv::quantum;

// One frame travelling through the pipeline.
// NOTE(review): photon_buffer is not declared in this snippet — presumably it
// comes from <tbb/photon_buffer.h>, which I could not find in oneTBB; confirm.
struct VideoFrame {
    photon_buffer data; // frame payload
    int frame_id;
    std::atomic<bool> locked; // NOTE(review): never read or written after construction in this snippet; also needs <atomic>
};

// Runs a four-stage tbb::parallel_pipeline over FRAME_COUNT frames:
//   1. serial, in order : allocates a VideoFrame and fills it via simulate_camera_capture
//   2. parallel         : cv::q_denoise
//   3. parallel         : cv::photon_sr
//   4. serial, in order : save_to_nvme, then deletes the frame
// NOTE(review): simulate_camera_capture, cv::q_denoise, cv::photon_sr and
// save_to_nvme are not declared anywhere in this snippet — their behaviour
// cannot be confirmed from here.
// NOTE(review): the `#pragma tbb ...` lines below do not match any pragma I know
// of in oneTBB; compilers normally ignore unknown pragmas — confirm they have any effect.
void quantum_video_pipeline() {
    tbb::parallel_pipeline(
        /* maximum number of tokens (frames in flight) */
        tbb::global_control::active_value(
            tbb::global_control::max_allowed_parallelism),
        /* pipeline definition */
        tbb::make_filter<void, VideoFrame*>(
            tbb::filter_mode::serial_in_order,
            [](tbb::flow_control& fc) -> VideoFrame* {
                // Function-local static: the counter is not reset, so a second
                // call to quantum_video_pipeline() would emit no frames.
                static int frame_id = 0;
                if (frame_id >= FRAME_COUNT) {
                    fc.stop();
                    return nullptr;
                }
                // Ownership passes down the pipeline; the last stage deletes it.
                auto* frame = new VideoFrame{
                    photon_buffer(4096*2160*3), // argument presumably a byte count for 4096x2160 RGB — TODO confirm
                    frame_id++,
                    false
                };
                #pragma tbb photon_dma // original note: fill the buffer via "photon DMA"
                simulate_camera_capture(frame->data);
                return frame;
            }
        ) &
        tbb::make_filter<VideoFrame*, VideoFrame*>(
            tbb::filter_mode::parallel,
            [](VideoFrame* frame) {
                #pragma tbb quantum_decoherence // original note: "quantum decoherence" denoising
                cv::q_denoise(frame->data, cv::QUANTUM_NEURAL_DENOISER);
                return frame;
            }
        ) &
        tbb::make_filter<VideoFrame*, VideoFrame*>(
            tbb::filter_mode::parallel,
            [](VideoFrame* frame) {
                #pragma tbb photon_upscale(2) // original note: 2x super-resolution
                cv::photon_sr(frame->data, cv::PHOTON_GAN_MODEL);
                return frame;
            }
        ) &
        tbb::make_filter<VideoFrame*, void>(
            tbb::filter_mode::serial_in_order,
            [](VideoFrame* frame) {
                #pragma tbb pqc_encrypt // original note: post-quantum encrypted storage
                save_to_nvme(frame->data, "video_encrypted.pqc");
                delete frame;
            }
        )
    );
}

// Runs the pipeline once and prints FRAME_COUNT divided by the elapsed time.
int main() {
    // NOTE(review): tbb::global_control documents max_allowed_parallelism,
    // thread_stack_size and terminate_on_exception; I could not find
    // threading_mode / photon_optimized — confirm this compiles.
    tbb::global_control gc(
        tbb::global_control::threading_mode,
        tbb::global_control::photon_optimized
    );
    auto start = cv::getPhotonTimestamp(); // timestamp source not declared in this snippet; units unknown — TODO confirm
    quantum_video_pipeline();
    auto duration = cv::getPhotonTimestamp() - start;
    // NOTE(review): if `duration` is an integer type this is integer division,
    // and the "fps" label only holds if the timestamp unit is seconds.
    std::cout << "流水线吞吐量: " << FRAME_COUNT / duration << " fps\n";
    return 0;
}