网页无法访问qq可以登陆_微盟微商城官网_seo网络推广哪家专业_百度广告商

常量内存其实只是全局内存的一种虚拟地址形式。2个特性：

高速缓存
支持单个值广播到线程束中的每个线程。

声明关键字：

__constant__

对比使用常量内存和确切的变量：

#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>#define KERNEL_LOOP 65536__constant__ static const int const_data_01 = 0x55555555;
__constant__ static const int const_data_02 = 0x77777777;
__constant__ static const int const_data_03 = 0x33333333;
__constant__ static const int const_data_04 = 0x11111111;__global__ void warmup(int* const data, const int num_elements){const int tid = blockIdx.x * blockDim.x + threadIdx.x;if (tid < num_elements){int d = 0x55555555;for (int i = 0; i < KERNEL_LOOP; i++){d ^= 0x55555555;d |= 0x77777777;d &= 0x33333333;d |= 0x11111111;}data[tid] = d;}
}__global__ void const_test_gpu_literal(int* const data, const int num_elements){const int tid = blockIdx.x * blockDim.x + threadIdx.x;if (tid < num_elements){int d = 0x55555555;for (int i = 0; i < KERNEL_LOOP; i++){d ^= 0x55555555;d |= 0x77777777;d &= 0x33333333;d |= 0x11111111;}data[tid] = d;}
}__global__ void const_test_gpu_const(int* const data, const int num_elements){const int tid = blockIdx.x * blockDim.x + threadIdx.x;if (tid < num_elements){int d = 0x55555555;for (int i = 0; i < KERNEL_LOOP; i++){d ^= const_data_01;d |= const_data_02;d &= const_data_03;d |= const_data_04;}data[tid] = d;}
}int main(int argc , char **argv)
{printf("%s starting\n", argv[0]);int dev = 0;cudaSetDevice(dev);cudaDeviceProp deviceprop;CHECK(cudaGetDeviceProperties(&deviceprop,dev));printf("Using Device %d : %s\n", dev, deviceprop.name);const int num_elements = 128 * 1024;const int num_threads = 256;const int num_blocks = (num_elements + num_threads - 1)/num_threads;const int num_bytes = num_elements * sizeof(int);int *data;cudaMalloc((int**)&data, num_bytes);dim3 block(num_threads, 1);dim3 grid(num_blocks,1);Timer timer;timer.start();warmup<<<grid,block>>>(data, num_elements);cudaDeviceSynchronize();timer.stop();float elapsedTime = timer.elapsedms();printf("warmup <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);timer.start();const_test_gpu_literal<<<grid,block>>>(data, num_elements);cudaDeviceSynchronize();timer.stop();elapsedTime = timer.elapsedms();printf("const_test_gpu_literal <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);timer.start();const_test_gpu_const<<<grid,block>>>(data, num_elements);cudaDeviceSynchronize();timer.stop();elapsedTime = timer.elapsedms();printf("const_test_gpu_const <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);cudaFree(data);cudaDeviceReset();return 0;}

两个内核的执行时间不相上下：

 Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                 Name--------  ---------------  ---------  --------  --------  --------  --------  -----------  ----------------------------------38.1             1376          1    1376.0    1376.0      1376      1376          0.0  warmup(int *, int)31.0             1120          1    1120.0    1120.0      1120      1120          0.0  const_test_gpu_const(int *, int)31.0             1120          1    1120.0    1120.0      1120      1120          0.0  const_test_gpu_literal(int *, int)

进而与全局内存对比：
添加核函数

__device__ static  int data_01 = 0x55555555;
__device__ static  int data_02 = 0x77777777;
__device__ static  int data_03 = 0x33333333;
__device__ static  int data_04 = 0x11111111;//
__global__ void const_test_gpu_gmem(int* const data, const int num_elements){const int tid = blockIdx.x * blockDim.x + threadIdx.x;if (tid < num_elements){int d = 0x55555555;for (int i = 0; i < KERNEL_LOOP; i++){d ^= data_01;d |= data_02;d &= data_03;d |= data_04;}data[tid] = d;}
}

以及调用：

timer.start();const_test_gpu_gmem<<<grid,block>>>(data, num_elements);cudaDeviceSynchronize();timer.stop();elapsedTime = timer.elapsedms();printf("const_test_gpu_gmem <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);

 Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                 Name--------  ---------------  ---------  --------  --------  --------  --------  -----------  ----------------------------------99.6           867427          1  867427.0  867427.0    867427    867427          0.0  const_test_gpu_gmem(int *, int)0.2             1408          1    1408.0    1408.0      1408      1408          0.0  warmup(int *, int)0.1             1152          1    1152.0    1152.0      1152      1152          0.0  const_test_gpu_const(int *, int)0.1             1152          1    1152.0    1152.0      1152      1152          0.0  const_test_gpu_literal(int *, int)

非常夸张的加速比。
进一步验证，查看生成的PTX（虚拟汇编）代码。为了看到PTX，在编译的时候需要使用-keep编译器选项：
nvcc constant_mem.cu -keep -o constant_mem.exe
会生成： constant_mem.ptx

以warmup函数为例学习PTX:

// 可见的 CUDA 内核入口函数：函数名经过 Name Mangling（实际对应 warmup(int*, int)）
.visible .entry _Z6warmupPii(.param .u64 _Z6warmupPii_param_0,  // 参数1：u64类型的指针（对应 int* 数组）.param .u32 _Z6warmupPii_param_1   // 参数2：u32类型的整数（对应数组长度 N）
)
{// 寄存器声明.reg .pred  %p<2>;  // 谓词寄存器（用于条件判断），定义2个：%p0, %p1.reg .b32   %r<7>;  // 32位通用寄存器，定义7个：%r0-%r6.reg .b64   %rd<5>; // 64位通用寄存器，定义5个：%rd0-%rd4// --- 代码逻辑开始 ---ld.param.u64   %rd1, [_Z6warmupPii_param_0]; // 加载参数1（指针）到 %rd1ld.param.u32   %r2, [_Z6warmupPii_param_1];  // 加载参数2（数组长度）到 %r2// 计算全局线程索引：blockIdx.x * blockDim.x + threadIdx.xmov.u32   %r3, %ctaid.x;    // %r3 = blockIdx.x（块索引）mov.u32   %r4, %ntid.x;     // %r4 = blockDim.x（每块线程数）mov.u32   %r5, %tid.x;      // %r5 = threadIdx.x（线程索引）mad.lo.s32 %r1, %r3, %r4, %r5; // %r1 = %r3 * %r4 + %r5 → 全局线程索引// 检查是否越界（%r1 >= %r2 → 数组越界？）setp.ge.s32 %p1, %r1, %r2;  // 若 %r1 >= %r2，则设置谓词寄存器 %p1 为真@%p1 bra $L__BB0_2;         // 如果 %p1 为真，跳转到标签 $L__BB0_2（直接返回）// 未越界时，向数组写入值 858993459（0x33333333）cvta.to.global.u64 %rd2, %rd1;      // 将指针 %rd1 转换为全局内存地址 %rd2mul.wide.s32 %rd3, %r1, 4;          // 计算偏移：%rd3 = %r1 * 4（int类型占4字节）add.s64      %rd4, %rd2, %rd3;      // 计算目标地址：%rd4 = %rd2（基址） + %rd3（偏移）mov.u32      %r6, 858993459;        // 858993459 的十六进制是 0x33333333st.global.u32 [%rd4], %r6;          // 将 0x33333333 写入地址 %rd4$L__BB0_2:ret;  // 返回（所有线程执行完毕）
}

网页无法访问qq可以登陆_微盟微商城官网_seo网络推广哪家专业_百度广告商

最新新闻

热搜词