CUDA Graph
1. When to use CUDA graphs
CUDA graphs pay off when a kernel (or kernel sequence) is executed many times, with only the kernel parameters changing (or nothing changing) between launches. If you merely wrap an existing kernel sequence in a graph that is not re-executed, the gain is small and the result is often slower than the original kernel launches, since building and launching the graph itself also takes time.
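As a concrete illustration, here is a minimal sketch of the repeated-execution case: capture the per-iteration work once, then relaunch the graph each iteration instead of re-issuing the kernel launches. The kernel step_kernel, its arguments, and the loop bounds are hypothetical placeholders, not from the samples referenced below.

// Plain launches: one launch overhead per kernel, per iteration.
for (int i = 0; i < num_iters; ++i) {
  step_kernel<<<grid, block, 0, stream>>>(d_data, n);
}

// Graph: capture the work once, then relaunch the whole graph each iteration.
cudaGraph_t graph;
cudaGraphExec_t graphExec;
checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
step_kernel<<<grid, block, 0, stream>>>(d_data, n);  // captured, not executed
checkCudaErrors(cudaStreamEndCapture(stream, &graph));
checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
for (int i = 0; i < num_iters; ++i) {
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
}
checkCudaErrors(cudaStreamSynchronize(stream));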
2. Usage
2.1 Stream capture
Basic pattern: begin capture, end capture, then build a graph exec and launch it. By itself this is not especially efficient; it is meant for graphs that will be executed many times. ref: cuda_sample: jacobi. Note that no cudaGraphCreate call is needed here: cudaStreamEndCapture creates the graph directly.
checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream));
if ((k & 1) == 0) {
  JacobiMethod<<<nblocks, nthreads, 0, stream>>>(A, b, conv_threshold, x,
                                                 x_new, d_sum);
} else {
  JacobiMethod<<<nblocks, nthreads, 0, stream>>>(A, b, conv_threshold, x_new,
                                                 x, d_sum);
}
checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double),
                                cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamEndCapture(stream, &graph));

if (graphExec == NULL) {
  checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
} else {
  cudaGraphExecUpdateResult updateResult_out;
  checkCudaErrors(cudaGraphExecUpdate(graphExec, graph, NULL, &updateResult_out));
  if (updateResult_out != cudaGraphExecUpdateSuccess) {
    if (graphExec != NULL) {
      checkCudaErrors(cudaGraphExecDestroy(graphExec));
    }
    printf("k = %d graph update failed with error - %d\n", k, updateResult_out);
    checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
  }
}
checkCudaErrors(cudaGraphLaunch(graphExec, stream));
checkCudaErrors(cudaStreamSynchronize(stream));
class CudaGraph {
 public:
  CudaGraph()
      : graph_(nullptr), graph_instance_(nullptr), stream_(nullptr),
        is_captured_(false) {
    RPV_CUDA_CHECK(cudaGraphCreate(&graph_, 0));
  }
  ~CudaGraph() {
    if (graph_ != nullptr) {
      RPV_CUDA_CHECK(cudaGraphDestroy(graph_));
    }
    if (graph_instance_ != nullptr) {
      RPV_CUDA_CHECK(cudaGraphExecDestroy(graph_instance_));
    }
  }

  void set_stream(const cudaStream_t& stream) { stream_ = stream; }
  const cudaGraph_t& graph() const { return graph_; }
  const cudaGraphExec_t& graph_instance() const { return graph_instance_; }

  void CaptureStart() const {
    RPV_CUDA_CHECK(cudaStreamBeginCapture(stream_, cudaStreamCaptureModeGlobal));
  }
  // cudaStreamEndCapture creates the captured graph and stores it in graph_.
  void CaptureEnd() const {
    RPV_CUDA_CHECK(cudaStreamEndCapture(stream_, &graph_));
  }
  bool IsCaptured() const { return is_captured_; }

  // Instantiate on first launch, then reuse the executable graph.
  void Launch() const {
    if (graph_instance_ == nullptr) {
      RPV_CUDA_CHECK(
          cudaGraphInstantiate(&graph_instance_, graph_, nullptr, nullptr, 0));
    }
    RPV_CUDA_CHECK(cudaGraphLaunch(graph_instance_, stream_));
  }

  // Update the executable graph from a re-captured graph_; re-instantiate on failure.
  void UpdateLaunch() const {
    cudaGraphExecUpdateResult update_result;
    RPV_CUDA_CHECK(
        cudaGraphExecUpdate(graph_instance_, graph_, nullptr, &update_result));
    if (update_result != cudaGraphExecUpdateSuccess) {
      if (graph_instance_ != nullptr) {
        RPV_CUDA_CHECK(cudaGraphExecDestroy(graph_instance_));
      }
      LOG(WARNING) << "cuda graph update failed.";
      RPV_CUDA_CHECK(
          cudaGraphInstantiate(&graph_instance_, graph_, nullptr, nullptr, 0));
    }
    RPV_CUDA_CHECK(cudaGraphLaunch(graph_instance_, stream_));
  }

  void AddKernelNode(cudaGraphNode_t& node, cudaKernelNodeParams& param) const {
    node_ = node;
    cudaGraphAddKernelNode(&node_, graph_, nullptr, 0, &param);
  }
  void ExecKernelNodeSetParams(cudaKernelNodeParams& param) const {
    cudaGraphExecKernelNodeSetParams(graph_instance_, node_, &param);
    RPV_CUDA_CHECK(cudaGraphLaunch(graph_instance_, stream_));
  }

 private:
  mutable cudaGraphNode_t node_;
  mutable cudaGraph_t graph_;
  mutable cudaGraphExec_t graph_instance_;
  mutable cudaStream_t stream_;
  mutable bool is_captured_;
  DISALLOW_COPY_AND_ASSIGN(CudaGraph);
};
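A minimal usage sketch of this wrapper follows. The kernel my_kernel, the buffers d_in/d_out, and the loop bounds are hypothetical; only the wrapper methods above are assumed.

// Hypothetical usage: capture once, then relaunch the same graph many times.
CudaGraph graph;
graph.set_stream(stream);

graph.CaptureStart();
my_kernel<<<nblocks, nthreads, 0, stream>>>(d_in, d_out, n);  // captured, not executed
graph.CaptureEnd();

for (int i = 0; i < num_iters; ++i) {
  graph.Launch();  // instantiates on the first call, then reuses graph_instance_
}
RPV_CUDA_CHECK(cudaStreamSynchronize(stream));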
2.2 Node parameter approach
ref: cuda sample: jacobi. Note that with the node API you must build the dependency list for each node yourself. The instantiated graph is then updated by setting new kernel node parameters (cudaGraphExecKernelNodeSetParams), which may be more efficient.
cudaGraph_t graph;
cudaGraphExec_t graphExec = NULL;

double sum = 0.0;
double *d_sum = NULL;
checkCudaErrors(cudaMalloc(&d_sum, sizeof(double)));

std::vector<cudaGraphNode_t> nodeDependencies;
cudaGraphNode_t memcpyNode, jacobiKernelNode, memsetNode;
cudaMemcpy3DParms memcpyParams = {0};
cudaMemsetParams memsetParams = {0};

// Memset node: zero d_sum (one double expressed as two 4-byte elements).
memsetParams.dst = (void *)d_sum;
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(float);
memsetParams.width = 2;
memsetParams.height = 1;

checkCudaErrors(cudaGraphCreate(&graph, 0));
checkCudaErrors(cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &memsetParams));
nodeDependencies.push_back(memsetNode);

// Kernel node: Jacobi iteration reading x, writing x_new.
cudaKernelNodeParams NodeParams0, NodeParams1;
NodeParams0.func = (void *)JacobiMethod;
NodeParams0.gridDim = nblocks;
NodeParams0.blockDim = nthreads;
NodeParams0.sharedMemBytes = 0;
void *kernelArgs0[6] = {(void *)&A, (void *)&b, (void *)&conv_threshold,
                        (void *)&x, (void *)&x_new, (void *)&d_sum};
NodeParams0.kernelParams = kernelArgs0;
NodeParams0.extra = NULL;
checkCudaErrors(cudaGraphAddKernelNode(&jacobiKernelNode, graph,
                                       nodeDependencies.data(),
                                       nodeDependencies.size(), &NodeParams0));
nodeDependencies.clear();
nodeDependencies.push_back(jacobiKernelNode);

// Memcpy node: copy the residual back to the host.
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr = make_cudaPitchedPtr(d_sum, sizeof(double), 1, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr = make_cudaPitchedPtr(&sum, sizeof(double), 1, 1);
memcpyParams.extent = make_cudaExtent(sizeof(double), 1, 1);
memcpyParams.kind = cudaMemcpyDeviceToHost;
checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph,
                                       nodeDependencies.data(),
                                       nodeDependencies.size(), &memcpyParams));

checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

// Same kernel with x and x_new swapped, used on odd iterations.
NodeParams1.func = (void *)JacobiMethod;
NodeParams1.gridDim = nblocks;
NodeParams1.blockDim = nthreads;
NodeParams1.sharedMemBytes = 0;
void *kernelArgs1[6] = {(void *)&A, (void *)&b, (void *)&conv_threshold,
                        (void *)&x_new, (void *)&x, (void *)&d_sum};
NodeParams1.kernelParams = kernelArgs1;
NodeParams1.extra = NULL;

int k = 0;
for (k = 0; k < max_iter; k++) {
  // Swap the kernel node's parameters in the instantiated graph, then launch.
  checkCudaErrors(cudaGraphExecKernelNodeSetParams(
      graphExec, jacobiKernelNode,
      ((k & 1) == 0) ? &NodeParams0 : &NodeParams1));
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  if (sum <= conv_threshold) {
    checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream));
    nblocks.x = (N_ROWS / nthreads.x) + 1;
    size_t sharedMemSize = ((nthreads.x / 32) + 1) * sizeof(double);
    if ((k & 1) == 0) {
      finalError<<<nblocks, nthreads, sharedMemSize, stream>>>(x_new, d_sum);
    } else {
      finalError<<<nblocks, nthreads, sharedMemSize, stream>>>(x, d_sum);
    }
    checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double),
                                    cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    printf("GPU iterations : %d\n", k + 1);
    printf("GPU error : %.3e\n", sum);
    break;
  }
}
However, in this comparison the graph version actually took longer than the plain kernel launches.
2.3 Passing kernel arguments as device pointers and updating the pointed-to values for efficient graph re-execution
Another example updates values in place. ref: mandrake: wtsne_gpu. This open-source project wraps each device value in a container, so the graph can be re-executed repeatedly just by overwriting the value in device memory. This is more efficient.
capture_stream.capture_start();
wtsneUpdateYKernel<real_t><<<block_count, block_size, 0, capture_stream.stream()>>>(
    device_ptrs.rng, get_node_table(), get_edge_table(), device_ptrs.Y,
    device_ptrs.I, device_ptrs.J, device_ptrs.Eq, device_ptrs.qsum,
    device_ptrs.qcount, device_ptrs.nn, device_ptrs.ne, eta0, nRepuSamp,
    device_ptrs.nsq, bInit, iter_d.data(), maxIter, device_ptrs.n_workers,
    n_clashes_d.data());
cub::DeviceReduce::Sum(qsum_tmp_storage_.data(), qsum_tmp_storage_bytes_,
                       qsum_.data(), qsum_total_device_.data(), qsum_.size(),
                       capture_stream.stream());
cub::DeviceReduce::Sum(qcount_tmp_storage_.data(), qcount_tmp_storage_bytes_,
                       qcount_.data(), qcount_total_device_.data(),
                       qcount_.size(), capture_stream.stream());
update_eq<real_t><<<1, 1, 0, capture_stream.stream()>>>(
    device_ptrs.Eq, device_ptrs.nsq, qsum_total_device_.data(),
    qcount_total_device_.data(), iter_d.data());
capture_stream.capture_end(graph.graph());

const auto start = std::chrono::steady_clock::now();
for (iter_h = 0; iter_h < maxIter; ++iter_h) {
  graph.launch(graph_stream.stream());
  if (iter_h % MAX(1, maxIter / 1000) == 0) {
    Eq_device_.get_value_async(&Eq_host_, graph_stream.stream());
    n_clashes_d.get_value_async(&n_clashes_h, graph_stream.stream());
    real_t eta = eta0 * (1 - static_cast<real_t>(iter_h) / (maxIter - 1));
    check_interrupts();
    graph_stream.sync();
    update_progress(iter_h, maxIter, eta, Eq_host_, write_per_worker,
                    n_clashes_h);
  }
  if (results->is_sample_frame(iter_h)) {
    Eq_device_.get_value_async(&Eq_host_, copy_stream.stream());
    update_frames(results, graph_stream, copy_stream, curr_iter, curr_Eq,
                  iter_h, Eq_host_);
  }
}
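The key idea is that the captured kernels take pointers, so only the memory they point to changes between launches and the executable graph never needs to be updated. Below is a minimal sketch of such a device-value container, built only on the CUDA runtime API; the class name device_value and its methods are hypothetical, not the actual wtsne_gpu API.

#include <cuda_runtime.h>

// Hypothetical container that owns a single value in device memory.
// Kernels captured in a graph receive ptr_; re-executing the graph after
// set_value_async() runs the same work on the new value.
template <typename T>
class device_value {
 public:
  device_value() { cudaMalloc(reinterpret_cast<void**>(&ptr_), sizeof(T)); }
  ~device_value() { cudaFree(ptr_); }

  T* data() { return ptr_; }

  // Overwrite the device copy; the captured graph will see the new value.
  void set_value_async(const T* host, cudaStream_t stream) {
    cudaMemcpyAsync(ptr_, host, sizeof(T), cudaMemcpyHostToDevice, stream);
  }
  // Read the device copy back (e.g. for progress reporting).
  void get_value_async(T* host, cudaStream_t stream) {
    cudaMemcpyAsync(host, ptr_, sizeof(T), cudaMemcpyDeviceToHost, stream);
  }

 private:
  T* ptr_ = nullptr;
};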
2.4 Hiding graph launch latency
When a graph is launched back-to-back many times with kernel parameter updates in between, the launch of the next graph overlaps with the execution of the previous one, so the graph launch latency is hidden; graph execution is also faster than launching the kernels individually. Graphs are therefore worth considering when a kernel is executed many times with little change between runs, or when work is spread across multiple streams. For example, kernels with fixed inputs that must run many times and can be parallelized across streams are good candidates for graph execution.
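A rough sketch of this launch pattern follows, assuming graphExec was instantiated as in section 2.1 and that the captured kernels read their parameters from device memory (as in section 2.3). The names d_param, host_params, Params, and num_iters are hypothetical placeholders.

// Launch the graph repeatedly without synchronizing between iterations:
// the host-side launch of iteration i+1 overlaps with the device-side
// execution of iteration i, hiding the graph launch latency.
for (int i = 0; i < num_iters; ++i) {
  // Update the device-side parameter the captured kernels read; issued on the
  // same stream, so it is ordered before the graph execution that follows.
  checkCudaErrors(cudaMemcpyAsync(d_param, &host_params[i], sizeof(Params),
                                  cudaMemcpyHostToDevice, stream));
  checkCudaErrors(cudaGraphLaunch(graphExec, stream));
}
// Synchronize only once at the end (or whenever a result is actually needed).
checkCudaErrors(cudaStreamSynchronize(stream));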
3. API differences across CUDA versions
// Since CUDA 12, cudaGraphExecUpdate takes a cudaGraphExecUpdateResultInfo
// instead of a separate error node and result.
#if CUDA_VERSION < 12000
  cudaGraphExecUpdateResult update_result{};
  cudaGraphNode_t error_node = nullptr;
  OF_CUDA_CHECK(cudaGraphExecUpdate(graph_exec_, graph, &error_node, &update_result));
  if (update_result == cudaGraphExecUpdateSuccess) { return; }
#else
  cudaGraphExecUpdateResultInfo update_result{};
  OF_CUDA_CHECK(cudaGraphExecUpdate(graph_exec_, graph, &update_result));
  if (update_result.result == cudaGraphExecUpdateSuccess) { return; }
#endif