Source: https://blog.csdn.net/weixin_44231807/article/details/142983244
CUDA C Programming Notes

  • Chapter 2: The CUDA Programming Model
    • 2.3 Organizing Parallel Threads
      • 2.3.1 Building Matrix Indices from Blocks and Threads
        • Run results
        • Problems encountered with code 2-6
      • 2.3.2 Summing Matrices with a 2D Grid and 2D Blocks
        • Correct code 2-7.1: 2D-grid-2D-block
        • Run results
        • Problems encountered with code 2-7
      • 2.3.3 Summing Matrices with a 1D Grid and 1D Blocks
        • Correct code 2-7.1: 1D-grid-1D-block
        • Run results
      • 2.3.4 Summing Matrices with a 2D Grid and 1D Blocks
        • Correct code 2-7.1: 2D-grid-1D-block
        • Run results

Chapter 2: The CUDA Programming Model

2.3 Organizing Parallel Threads

2.3.1 Building Matrix Indices from Blocks and Threads

For two-dimensional data, three kinds of index are needed:

  • ① the thread and block indices
  • ② the coordinate of a given point in the matrix
  • ③ the offset into linear global memory

a given thread → its thread and block indices → the matrix coordinates (ix, iy) → the linear global-memory location idx

  1. Thread and block indices → matrix coordinates:
    ix = threadIdx.x + blockIdx.x * blockDim.x;
    iy = threadIdx.y + blockIdx.y * blockDim.y;
  2. Matrix coordinates → linear global-memory location:
    idx = iy * nx + ix;
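A quick worked example with the configuration used in code 2-6 below (blockDim = (4,2), nx = 8): thread (3,1) of block (1,2) computes

```
ix  = 3 + 1*4 = 7
iy  = 1 + 2*2 = 5
idx = 5*8 + 7 = 47
```

which matches the line `thread_id (3,1) block_id (1,2) coordinate(7,5) global index 47` in the run output below.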

Correct code 2-6

```cpp
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include "../common/common.h"

// #define CHECK(call){
//     const cudaError_t error = call;
//     if(error != cudaSuccess){
//         printf("Error:%s:%d,",__FILE__,__LINE__);
//         printf("code:%d, reason:%s\n",error,cudaGetErrorString(error));
//         exit(-10*error);  // terminate and return -10x the CUDA error code (a custom convention)
//     }
// }

void initialInt(int *ip, int size){
    for(int i = 0; i < size; i++){
        ip[i] = i;
    }
}

void printMatrix(int *C, const int nx, const int ny){
    int *ic = C;  // pointer into the array
    printf("\nMatrix:(%d.%d)\n", nx, ny);
    for(int iy = 0; iy < ny; iy++){
        for(int ix = 0; ix < nx; ix++){
            printf("%3d", ic[ix]);
        }
        ic += nx;  // after printing one row, advance the pointer by nx
        printf("\n");
    }
    printf("\n");
}

__global__ void printThreadIndex(int *A, const int nx, const int ny){
    int ix = threadIdx.x + blockDim.x * blockIdx.x;
    int iy = threadIdx.y + blockDim.y * blockIdx.y;
    unsigned int idx = iy*nx + ix;
    printf("thread_id (%d,%d) block_id (%d,%d) coordinate(%d,%d) global index %2d ival %2d\n",
           threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, ix, iy, idx, A[idx]);
}

int main(int argc, char **argv){
    printf("%s Starting...\n", argv[0]);

    // get device information
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d:%s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set matrix dimensions
    int nx = 8;
    int ny = 6;
    int nxy = nx*ny;
    int nBytes = nxy*sizeof(int);  // int matrix, so sizeof(int)

    // allocate host memory
    int *h_A;
    h_A = (int *)malloc(nBytes);

    // initialize the host matrix
    initialInt(h_A, nxy);
    printMatrix(h_A, nx, ny);

    // allocate device memory
    int *d_MatA;
    cudaMalloc((void **)&d_MatA, nBytes);  // address-of operator plus a cast to void**

    // transfer data from host to device
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);

    // set up the execution configuration
    dim3 block(4, 2);
    dim3 grid((nx+block.x-1)/block.x, (ny+block.y-1)/block.y);  // computed separately for x and y

    // launch the kernel
    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);
    cudaDeviceSynchronize();

    // free device and host memory
    cudaFree(d_MatA);
    free(h_A);

    // reset the device
    cudaDeviceReset();
    return (0);
}
```
Run results

```
$ ./hello
./hello Starting...
Using Device 0:NVIDIA GeForce RTX 3090

Matrix:(8.6)
  0  1  2  3  4  5  6  7
  8  9 10 11 12 13 14 15
 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31
 32 33 34 35 36 37 38 39
 40 41 42 43 44 45 46 47

thread_id (0,0) block_id (1,0) coordinate(4,0) global index  4 ival  4
thread_id (1,0) block_id (1,0) coordinate(5,0) global index  5 ival  5
thread_id (2,0) block_id (1,0) coordinate(6,0) global index  6 ival  6
thread_id (3,0) block_id (1,0) coordinate(7,0) global index  7 ival  7
thread_id (0,1) block_id (1,0) coordinate(4,1) global index 12 ival 12
thread_id (1,1) block_id (1,0) coordinate(5,1) global index 13 ival 13
thread_id (2,1) block_id (1,0) coordinate(6,1) global index 14 ival 14
thread_id (3,1) block_id (1,0) coordinate(7,1) global index 15 ival 15
thread_id (0,0) block_id (0,1) coordinate(0,2) global index 16 ival 16
thread_id (1,0) block_id (0,1) coordinate(1,2) global index 17 ival 17
thread_id (2,0) block_id (0,1) coordinate(2,2) global index 18 ival 18
thread_id (3,0) block_id (0,1) coordinate(3,2) global index 19 ival 19
thread_id (0,1) block_id (0,1) coordinate(0,3) global index 24 ival 24
thread_id (1,1) block_id (0,1) coordinate(1,3) global index 25 ival 25
thread_id (2,1) block_id (0,1) coordinate(2,3) global index 26 ival 26
thread_id (3,1) block_id (0,1) coordinate(3,3) global index 27 ival 27
thread_id (0,0) block_id (1,1) coordinate(4,2) global index 20 ival 20
thread_id (1,0) block_id (1,1) coordinate(5,2) global index 21 ival 21
thread_id (2,0) block_id (1,1) coordinate(6,2) global index 22 ival 22
thread_id (3,0) block_id (1,1) coordinate(7,2) global index 23 ival 23
thread_id (0,1) block_id (1,1) coordinate(4,3) global index 28 ival 28
thread_id (1,1) block_id (1,1) coordinate(5,3) global index 29 ival 29
thread_id (2,1) block_id (1,1) coordinate(6,3) global index 30 ival 30
thread_id (3,1) block_id (1,1) coordinate(7,3) global index 31 ival 31
thread_id (0,0) block_id (0,0) coordinate(0,0) global index  0 ival  0
thread_id (1,0) block_id (0,0) coordinate(1,0) global index  1 ival  1
thread_id (2,0) block_id (0,0) coordinate(2,0) global index  2 ival  2
thread_id (3,0) block_id (0,0) coordinate(3,0) global index  3 ival  3
thread_id (0,1) block_id (0,0) coordinate(0,1) global index  8 ival  8
thread_id (1,1) block_id (0,0) coordinate(1,1) global index  9 ival  9
thread_id (2,1) block_id (0,0) coordinate(2,1) global index 10 ival 10
thread_id (3,1) block_id (0,0) coordinate(3,1) global index 11 ival 11
thread_id (0,0) block_id (1,2) coordinate(4,4) global index 36 ival 36
thread_id (1,0) block_id (1,2) coordinate(5,4) global index 37 ival 37
thread_id (2,0) block_id (1,2) coordinate(6,4) global index 38 ival 38
thread_id (3,0) block_id (1,2) coordinate(7,4) global index 39 ival 39
thread_id (0,1) block_id (1,2) coordinate(4,5) global index 44 ival 44
thread_id (1,1) block_id (1,2) coordinate(5,5) global index 45 ival 45
thread_id (2,1) block_id (1,2) coordinate(6,5) global index 46 ival 46
thread_id (3,1) block_id (1,2) coordinate(7,5) global index 47 ival 47
thread_id (0,0) block_id (0,2) coordinate(0,4) global index 32 ival 32
thread_id (1,0) block_id (0,2) coordinate(1,4) global index 33 ival 33
thread_id (2,0) block_id (0,2) coordinate(2,4) global index 34 ival 34
thread_id (3,0) block_id (0,2) coordinate(3,4) global index 35 ival 35
thread_id (0,1) block_id (0,2) coordinate(0,5) global index 40 ival 40
thread_id (1,1) block_id (0,2) coordinate(1,5) global index 41 ival 41
thread_id (2,1) block_id (0,2) coordinate(2,5) global index 42 ival 42
thread_id (3,1) block_id (0,2) coordinate(3,5) global index 43 ival 43
```
Problems encountered with code 2-6

Error 1: CHECK redefined

```
$ nvcc -arch=sm_86 2-6.1checkThreadIndex.cu -o hello
2-6.1checkThreadIndex.cu:5: warning: "CHECK" redefined
    5 | #define CHECK(call){
In file included from 2-6.1checkThreadIndex.cu:3:
../common/common.h:6: note: this is the location of the previous definition
    6 | #define CHECK(call)
```

Fix: CHECK was defined both in the common.h header and again in 2-6.1; commenting out the definition in 2-6.1 resolves the warning.
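Alternatively, if you want to keep a local fallback definition, a minimal sketch is to guard it so it only takes effect when common.h has not already defined CHECK:

```cpp
// Sketch: define a local CHECK only when common.h has not already
// provided one, which avoids the redefinition warning.
#ifndef CHECK
#define CHECK(call)                                                       \
{                                                                         \
    const cudaError_t error = (call);                                     \
    if (error != cudaSuccess) {                                           \
        printf("Error:%s:%d,", __FILE__, __LINE__);                       \
        printf("code:%d, reason:%s\n", error, cudaGetErrorString(error)); \
        exit(-10 * error);                                                \
    }                                                                     \
}
#endif
```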

Error 2: a misspelled function name
Fix: change it to the correct name as suggested by the compiler message.

2.3.2 Summing Matrices with a 2D Grid and 2D Blocks


Write the matrix-addition kernel using a 2D grid and 2D blocks.
Host-side verification function, used to check whether the GPU result is correct:

```cpp
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny){
    float *ia = A;
    float *ib = B;
    float *ic = C;
    for(int iy = 0; iy < ny; iy++){       // row-major order: y in the outer loop, x in the inner loop
        for(int ix = 0; ix < nx; ix++){
            ic[ix] = ia[ix] + ib[ix];
        }
        // after finishing one row, move each pointer to the start of the next row
        ia += nx; ib += nx; ic += nx;
    }
}
```

The kernel on the GPU:

```cpp
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny){
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;
    if(ix < nx && iy < ny)   // each thread checks whether its own (ix, iy) is in range
        MatC[idx] = MatA[idx] + MatB[idx];
}
```
Correct code 2-7.1: 2D-grid-2D-block
```cpp
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include "../common/common.h"

// definitions of cpuSecond and initialData need to be added
void initialData(float *ip, int size){
    time_t t;
    srand((unsigned) time(&t));
    for(int i = 0; i < size; i++){
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}

double cpuSecond(){  // host wall-clock timer; times measured with it tend to run long
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return (double)tp.tv_sec + (double)tp.tv_usec * 1.e-6;
}

void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny){
    float *ia = A;
    float *ib = B;
    float *ic = C;
    for(int iy = 0; iy < ny; iy++){
        for(int ix = 0; ix < nx; ix++){
            ic[ix] = ia[ix] + ib[ix];
        }
        ia += nx; ib += nx; ic += nx;
    }
}

__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny){
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;
    if(ix < nx && iy < ny)  // each thread checks whether its own ix and iy are in range
        MatC[idx] = MatA[idx] + MatB[idx];
}

void checkResult(float *hostRef, float *gpuRef, const int N){
    double epsilon = 1.0E-8;
    bool match = 1;
    for(int i = 0; i < N; i++){
        if(abs(hostRef[i] - gpuRef[i]) > epsilon){
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if(match) printf("Arrays match.\n");
    return;
}

int main(int argc, char **argv){
    printf("%s Starting...\n", argv[0]);

    // set up the device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d:%s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up the data size of the matrix
    int nx = 1 << 14;  // 2^14 = 16384 elements in each dimension
    int ny = 1 << 14;
    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);
    printf("Matrix size:nx %d ny %d\n", nx, ny);

    // allocate host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    // initialize data on the host side
    double iStart = cpuSecond();  // uses the cpuSecond defined above
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    double iElaps = cpuSecond() - iStart;
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add the matrices on the host side for result checking
    iStart = cpuSecond();
    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
    iElaps = cpuSecond() - iStart;

    // allocate device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMalloc((void **)&d_MatB, nBytes);
    cudaMalloc((void **)&d_MatC, nBytes);

    // transfer data from host to device
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);

    // launch the kernel from the host side
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);                                     // 2D block
    dim3 grid((nx+block.x-1)/block.x, (ny+block.y-1)/block.y);  // 2D grid
    iStart = cpuSecond();
    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    cudaDeviceSynchronize();
    iElaps = cpuSecond() - iStart;
    printf("sumMatrixOnGPU2D <<<(%d,%d),(%d,%d)>>> elapsed %f sec\n",
           grid.x, grid.y, block.x, block.y, iElaps);

    // copy the kernel result back to the host
    cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    cudaFree(d_MatA);
    cudaFree(d_MatB);
    cudaFree(d_MatC);

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset the device
    cudaDeviceReset();
    return (0);
}
```
Run results

First run with a 32×32 block: 0.004417 sec.
Second run with a 32×16 block: 0.004395 sec.
Third run with a 16×16 block: 0.004395 sec.
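The grid dimensions in the three launches follow directly from grid = ((nx+block.x-1)/block.x, (ny+block.y-1)/block.y); since 16384 is divisible by both 16 and 32, this reduces to plain division:

```
block (32,32): grid = (16384/32, 16384/32) = (512, 512)
block (32,16): grid = (16384/32, 16384/16) = (512, 1024)
block (16,16): grid = (16384/16, 16384/16) = (1024, 1024)
```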

```
$ ./hello
./hello Starting...
Using Device 0:NVIDIA GeForce RTX 3090
Matrix size:nx 16384 ny 16384
sumMatrixOnGPU2D <<<(512,512),(32,32)>>> elapsed 0.004417 sec
Arrays match.

$ ./hello
./hello Starting...
Using Device 0:NVIDIA GeForce RTX 3090
Matrix size:nx 16384 ny 16384
sumMatrixOnGPU2D <<<(512,1024),(32,16)>>> elapsed 0.004395 sec
Arrays match.

$ ./hello
./hello Starting...
Using Device 0:NVIDIA GeForce RTX 3090
Matrix size:nx 16384 ny 16384
sumMatrixOnGPU2D <<<(1024,1024),(16,16)>>> elapsed 0.004395 sec
Arrays match.
```
Problems encountered with code 2-7

Error 1: gettimeofday undefined

```
2-7.1sumMatrixOnGPU-2D-grid-2D-block.cu(7): error: identifier "gettimeofday" is undefined
      gettimeofday(&tp,
```

Fix: add the header #include <sys/time.h>.
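Note that <sys/time.h> is POSIX-only. A portable alternative, sketched here rather than taken from the book's code, is to time the kernel with CUDA events:

```cpp
// Sketch: timing a kernel launch with CUDA events instead of gettimeofday.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
cudaEventRecord(stop);

cudaEventSynchronize(stop);              // block until the kernel has finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
printf("kernel elapsed %f sec\n", ms / 1000.0f);

cudaEventDestroy(start);
cudaEventDestroy(stop);
```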

2.3.3 Summing Matrices with a 1D Grid and 1D Blocks


Key idea: each thread processes one column of the matrix (ny elements).
Compared with the 2D version, only threadIdx.x is used.

```cpp
__global__ void sumMatrixOnGPU1D(float *MatA, float *MatB, float *MatC, int nx, int ny){
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    if(ix < nx){  // each thread checks whether its own ix is < nx
        for(int iy = 0; iy < ny; iy++){
            int idx = iy * nx + ix;
            MatC[idx] = MatA[idx] + MatB[idx];
        }
    }
}
```
Correct code 2-7.1: 1D-grid-1D-block
```cpp
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include "../common/common.h"

// definitions of cpuSecond and initialData need to be added
void initialData(float *ip, int size){
    time_t t;
    srand((unsigned) time(&t));
    for(int i = 0; i < size; i++){
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}

double cpuSecond(){  // host wall-clock timer; times measured with it tend to run long
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return (double)tp.tv_sec + (double)tp.tv_usec * 1.e-6;
}

void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny){
    float *ia = A;
    float *ib = B;
    float *ic = C;
    for(int iy = 0; iy < ny; iy++){
        for(int ix = 0; ix < nx; ix++){
            ic[ix] = ia[ix] + ib[ix];
        }
        ia += nx; ib += nx; ic += nx;
    }
}

__global__ void sumMatrixOnGPU1D(float *MatA, float *MatB, float *MatC, int nx, int ny){
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    if(ix < nx){
        for(int iy = 0; iy < ny; iy++){
            int idx = iy * nx + ix;
            MatC[idx] = MatA[idx] + MatB[idx];  // MatA etc. are the formal parameters, so the addition uses them
        }
    }
}

void checkResult(float *hostRef, float *gpuRef, const int N){
    double epsilon = 1.0E-8;
    bool match = 1;
    for(int i = 0; i < N; i++){
        if(abs(hostRef[i] - gpuRef[i]) > epsilon){
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if(match) printf("Arrays match.\n");
    return;
}

int main(int argc, char **argv){
    printf("%s Starting...\n", argv[0]);

    // set up the device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d:%s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up the data size of the matrix
    int nx = 1 << 14;  // 2^14 = 16384 elements in each dimension
    int ny = 1 << 14;
    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);
    printf("Matrix size:nx %d ny %d\n", nx, ny);

    // allocate host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    // initialize data on the host side
    double iStart = cpuSecond();
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    double iElaps = cpuSecond() - iStart;
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add the matrices on the host side for result checking
    iStart = cpuSecond();
    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
    iElaps = cpuSecond() - iStart;

    // allocate device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMalloc((void **)&d_MatB, nBytes);
    cudaMalloc((void **)&d_MatC, nBytes);

    // transfer data from host to device
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);

    // launch the kernel from the host side
    int dimx = 32;
    int dimy = 1;
    dim3 block(dimx, dimy);                // the second dimension is 1, so effectively a 1D block
    dim3 grid((nx+block.x-1)/block.x, 1);  // 1D grid
    iStart = cpuSecond();
    sumMatrixOnGPU1D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    cudaDeviceSynchronize();
    iElaps = cpuSecond() - iStart;
    printf("sumMatrixOnGPU1D <<<(%d,%d),(%d,%d)>>> elapsed %f sec\n",
           grid.x, grid.y, block.x, block.y, iElaps);

    // copy the kernel result back to the host
    cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    cudaFree(d_MatA);
    cudaFree(d_MatB);
    cudaFree(d_MatC);

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset the device
    cudaDeviceReset();
    return (0);
}
```
Run results

First run with a 32×1 block: 0.007075 sec.
Second run with a 128×1 block: 0.007160 sec.

```
$ nvcc -arch=sm_86 2-7.1sumMatrixOnGPU-1D-grid-1D-block.cu -o hello
$ ./hello
./hello Starting...
Using Device 0:NVIDIA GeForce RTX 3090
Matrix size:nx 16384 ny 16384
sumMatrixOnGPU1D <<<(512,1),(32,1)>>> elapsed 0.007075 sec
Arrays match.

$ ./hello
./hello Starting...
Using Device 0:NVIDIA GeForce RTX 3090
Matrix size:nx 16384 ny 16384
sumMatrixOnGPU1D <<<(128,1),(128,1)>>> elapsed 0.007160 sec
Arrays match.
```

2.3.4 Summing Matrices with a 2D Grid and 1D Blocks


Key idea: this can be seen as a special case of the 2D-grid/2D-block layout in which the block's second dimension is 1; since blockDim.y is 1 and threadIdx.y is always 0, iy reduces to blockIdx.y.

```cpp
__global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC, int nx, int ny){
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = blockIdx.y;  // blockDim.y == 1 and threadIdx.y == 0, so iy is simply blockIdx.y
    unsigned int idx = iy * nx + ix;
    if(ix < nx && iy < ny){
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}
```
Correct code 2-7.1: 2D-grid-1D-block
```cpp
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include "../common/common.h"

// definitions of cpuSecond and initialData need to be added
void initialData(float *ip, int size){
    time_t t;
    srand((unsigned) time(&t));
    for(int i = 0; i < size; i++){
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}

double cpuSecond(){  // host wall-clock timer; times measured with it tend to run long
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return (double)tp.tv_sec + (double)tp.tv_usec * 1.e-6;
}

void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny){
    float *ia = A;
    float *ib = B;
    float *ic = C;
    for(int iy = 0; iy < ny; iy++){
        for(int ix = 0; ix < nx; ix++){
            ic[ix] = ia[ix] + ib[ix];
        }
        ia += nx; ib += nx; ic += nx;
    }
}

__global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC, int nx, int ny){
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = blockIdx.y;
    unsigned int idx = iy * nx + ix;
    if(ix < nx && iy < ny){
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}

void checkResult(float *hostRef, float *gpuRef, const int N){
    double epsilon = 1.0E-8;
    bool match = 1;
    for(int i = 0; i < N; i++){
        if(abs(hostRef[i] - gpuRef[i]) > epsilon){
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if(match) printf("Arrays match.\n");
    return;
}

int main(int argc, char **argv){
    printf("%s Starting...\n", argv[0]);

    // set up the device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d:%s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up the data size of the matrix
    int nx = 1 << 14;  // 2^14 = 16384 elements in each dimension
    int ny = 1 << 14;
    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);
    printf("Matrix size:nx %d ny %d\n", nx, ny);

    // allocate host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    // initialize data on the host side
    double iStart = cpuSecond();
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    double iElaps = cpuSecond() - iStart;
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add the matrices on the host side for result checking
    iStart = cpuSecond();
    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
    iElaps = cpuSecond() - iStart;

    // allocate device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMalloc((void **)&d_MatB, nBytes);
    cudaMalloc((void **)&d_MatC, nBytes);

    // transfer data from host to device
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);

    // launch the kernel from the host side
    // int dimx = 128;
    // int dimy = 1;
    dim3 block(32);                         // 1D block: unspecified dimensions default to 1
    dim3 grid((nx+block.x-1)/block.x, ny);  // nx and ny were set above (16384)
    iStart = cpuSecond();
    sumMatrixOnGPUMix<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    cudaDeviceSynchronize();
    iElaps = cpuSecond() - iStart;
    printf("sumMatrixOnGPUMix <<<(%d,%d),(%d,%d)>>> elapsed %f sec\n",
           grid.x, grid.y, block.x, block.y, iElaps);

    // copy the kernel result back to the host
    cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    cudaFree(d_MatA);
    cudaFree(d_MatB);
    cudaFree(d_MatC);

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset the device
    cudaDeviceReset();
    return (0);
}
```
Run results

The results are better than the 1D-grid/1D-block version (where each thread serially loops over a whole column), but not as good as the 2D-grid/2D-block version.
First run with a block size of 32: 0.005981 sec.
Second run with a block size of 256: 0.004372 sec.

```
$ ./hello
./hello Starting...
Using Device 0:NVIDIA GeForce RTX 3090
Matrix size:nx 16384 ny 16384
sumMatrixOnGPUMix <<<(512,16384),(32,1)>>> elapsed 0.005981 sec
Arrays match.

$ ./hello
./hello Starting...
Using Device 0:NVIDIA GeForce RTX 3090
Matrix size:nx 16384 ny 16384
sumMatrixOnGPUMix <<<(64,16384),(256,1)>>> elapsed 0.004372 sec
Arrays match.
```
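Collecting the timings reported above for the 16384×16384 matrix:

| Thread layout | Launch configuration | Elapsed time |
| --- | --- | --- |
| 2D grid, 2D block | `<<<(512,512),(32,32)>>>` | 0.004417 sec |
| 2D grid, 2D block | `<<<(512,1024),(32,16)>>>` | 0.004395 sec |
| 2D grid, 2D block | `<<<(1024,1024),(16,16)>>>` | 0.004395 sec |
| 1D grid, 1D block | `<<<(512,1),(32,1)>>>` | 0.007075 sec |
| 1D grid, 1D block | `<<<(128,1),(128,1)>>>` | 0.007160 sec |
| 2D grid, 1D block | `<<<(512,16384),(32,1)>>>` | 0.005981 sec |
| 2D grid, 1D block | `<<<(64,16384),(256,1)>>>` | 0.004372 sec |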
