Executing large-scale data-parallel computations on the GPU outperforms executing them on the CPU. In addition, NVIDIA GPUs support another kind of parallelism, similar to the task parallelism (Task Parallelism) found in multithreaded CPU applications: executing two or more different tasks in parallel, rather than executing the same task over a large amount of data.
In a parallel environment, a task can be any operation; for example, one thread redraws the GUI while another thread downloads an update over the network. This chapter introduces CUDA streams and shows how to use them to execute multiple tasks on the GPU at the same time.
The difference: malloc() allocates standard, pageable (Pageable) host memory, while cudaHostAlloc() allocates page-locked (pinned) host memory.
Because the GPU knows the physical address of page-locked memory, it can use Direct Memory Access (DMA) to copy data between the GPU and the host. DMA executes the copy without CPU involvement, which means the CPU could very well swap the target memory out to disk, or relocate its physical address by updating the operating system's page tables, while the DMA transfer is in flight. Since the CPU may move pageable data, a DMA operation on it can be stalled; this is why pinned memory matters.
Pinned memory is a double-edged sword, so the recommendation is: use page-locked memory only for buffers that serve as the source or destination of cudaMemcpy() calls, and free it as soon as it is no longer needed.
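The program below, taken from the book's bandwidth test (hence the ../common/book.h header with its HANDLE_ERROR and HANDLE_NULL helpers), measures this difference directly: it times 100 copies in each direction with a malloc() buffer and then with a cudaHostAlloc() buffer, and reports the resulting throughput in MB/s.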
#include "../common/book.h"

#define SIZE (64*1024*1024)

float cuda_malloc_test( int size, bool up ) {
    cudaEvent_t start, stop;
    int *a, *dev_a;
    float elapsedTime;

    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    a = (int*)malloc( size * sizeof( *a ) );
    HANDLE_NULL( a );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              size * sizeof( *dev_a ) ) );

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    for (int i=0; i<100; i++) {
        if (up)
            HANDLE_ERROR( cudaMemcpy( dev_a, a,
                                      size * sizeof( *dev_a ),
                                      cudaMemcpyHostToDevice ) );
        else
            HANDLE_ERROR( cudaMemcpy( a, dev_a,
                                      size * sizeof( *dev_a ),
                                      cudaMemcpyDeviceToHost ) );
    }
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );

    free( a );
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    return elapsedTime;
}
float cuda_host_alloc_test( int size, bool up ) {
    cudaEvent_t start, stop;
    int *a, *dev_a;
    float elapsedTime;

    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    HANDLE_ERROR( cudaHostAlloc( (void**)&a,
                                 size * sizeof( *a ),
                                 cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              size * sizeof( *dev_a ) ) );

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    for (int i=0; i<100; i++) {
        if (up)
            HANDLE_ERROR( cudaMemcpy( dev_a, a,
                                      size * sizeof( *a ),
                                      cudaMemcpyHostToDevice ) );
        else
            HANDLE_ERROR( cudaMemcpy( a, dev_a,
                                      size * sizeof( *a ),
                                      cudaMemcpyDeviceToHost ) );
    }
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );

    HANDLE_ERROR( cudaFreeHost( a ) );
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    return elapsedTime;
}
int main( void ) {
    float elapsedTime;
    float MB = (float)100*SIZE*sizeof(int)/1024/1024;

    // try it with cudaMalloc
    elapsedTime = cuda_malloc_test( SIZE, true );
    printf( "Time using cudaMalloc: %3.1f ms\n",
            elapsedTime );
    printf( "\tMB/s during copy up: %3.1f\n",
            MB/(elapsedTime/1000) );

    elapsedTime = cuda_malloc_test( SIZE, false );
    printf( "Time using cudaMalloc: %3.1f ms\n",
            elapsedTime );
    printf( "\tMB/s during copy down: %3.1f\n",
            MB/(elapsedTime/1000) );

    // now try it with cudaHostAlloc
    elapsedTime = cuda_host_alloc_test( SIZE, true );
    printf( "Time using cudaHostAlloc: %3.1f ms\n",
            elapsedTime );
    printf( "\tMB/s during copy up: %3.1f\n",
            MB/(elapsedTime/1000) );

    elapsedTime = cuda_host_alloc_test( SIZE, false );
    printf( "Time using cudaHostAlloc: %3.1f ms\n",
            elapsedTime );
    printf( "\tMB/s during copy down: %3.1f\n",
            MB/(elapsedTime/1000) );
    return 0;
}
Note the pairing: memory allocated with cudaHostAlloc() must be freed with cudaFreeHost(), not free().
10.4 Using a Single CUDA Stream
#include "../common/book.h"

#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)

__global__ void kernel( int *a, int *b, int *c ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

int main( void ) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (!prop.deviceOverlap) {
        printf( "Device will not handle overlaps, so no speed up from streams\n" );
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaStream_t stream;
    int *host_a, *host_b, *host_c;
    int *dev_a, *dev_b, *dev_c;

    // start the timers
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // initialize the stream
    HANDLE_ERROR( cudaStreamCreate( &stream ) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c,
                              N * sizeof(int) ) );

    // allocate host locked memory, used to stream
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_a,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_b,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_c,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );

    for (int i=0; i<FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    // now loop over full data, in bite-sized chunks
    for (int i=0; i<FULL_DATA_SIZE; i+=N) {
        // copy the locked memory to the device, async
        HANDLE_ERROR( cudaMemcpyAsync( dev_a, host_a+i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream ) );
        HANDLE_ERROR( cudaMemcpyAsync( dev_b, host_b+i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream ) );
        kernel<<<N/256,256,0,stream>>>( dev_a, dev_b, dev_c );
        // copy the data from device to locked memory
        HANDLE_ERROR( cudaMemcpyAsync( host_c+i, dev_c,
                                       N * sizeof(int),
                                       cudaMemcpyDeviceToHost,
                                       stream ) );
    }
    // wait until all the work queued in the stream has completed
    HANDLE_ERROR( cudaStreamSynchronize( stream ) );

    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time taken: %3.1f ms\n", elapsedTime );

    // cleanup the streams and memory
    HANDLE_ERROR( cudaFreeHost( host_a ) );
    HANDLE_ERROR( cudaFreeHost( host_b ) );
    HANDLE_ERROR( cudaFreeHost( host_c ) );
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_c ) );
    HANDLE_ERROR( cudaStreamDestroy( stream ) );
    return 0;
}
This example uses a single stream to illustrate how streams are used; the main function is the part to focus on.
The first thing it does is choose a device that supports device overlap (Device Overlap). A GPU with this capability can execute a CUDA C kernel while simultaneously performing a copy between the device and the host. As stated earlier, we will later use multiple streams to achieve this overlap of computation and data transfer.
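A side note not in the original listing: on current CUDA toolkits, deviceOverlap is deprecated in favor of cudaDeviceProp::asyncEngineCount, which reports the number of asynchronous copy engines. A sketch of the equivalent check:

// asyncEngineCount: 0 = copies cannot overlap kernels,
// 1 = one copy can overlap a kernel, 2 = an upload and a
// download can both overlap a kernel at the same time
if (prop.asyncEngineCount == 0) {
    printf( "Device will not handle overlaps, so no speed up from streams\n" );
    return 0;
}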
(1) Create the stream:
cudaStream_t stream;
// initialize the stream
HANDLE_ERROR( cudaStreamCreate( &stream ) );
(2) Allocate the data:
int *host_a, *host_b, *host_c;
int *dev_a, *dev_b, *dev_c;

// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                          N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                          N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_c,
                          N * sizeof(int) ) );

// allocate host locked memory, used to stream
HANDLE_ERROR( cudaHostAlloc( (void**)&host_a,
                             FULL_DATA_SIZE * sizeof(int),
                             cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_b,
                             FULL_DATA_SIZE * sizeof(int),
                             cudaHostAllocDefault ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&host_c,
                             FULL_DATA_SIZE * sizeof(int),
                             cudaHostAllocDefault ) );

for (int i=0; i<FULL_DATA_SIZE; i++) {
    host_a[i] = rand();
    host_b[i] = rand();
}
cudaHostAlloc() gives us pinned host memory here. Copies from it are faster, and it has a further benefit that we will rely on in a moment.
Normally we would copy both input buffers to the GPU in full, launch the kernel, and then copy the output buffer back to the host; this time the approach is different:
1. divide the input buffers into smaller chunks, and perform the following three-step process on each chunk;
2. copy one portion of the input buffers to the GPU, and run the kernel on that portion;
3. copy the corresponding portion of the output buffer back to the host.
Use case: when the GPU has far less memory than the host and the entire buffer cannot reside on the GPU at once, the computation has to proceed chunk by chunk.
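The chunk loop from the listing above carries out exactly these three steps, enqueuing all of them in the one stream: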
// now loop over full data, in bite-sized chunks
for (int i=0; i<FULL_DATA_SIZE; i+=N) {
    // copy the locked memory to the device, async
    HANDLE_ERROR( cudaMemcpyAsync( dev_a, host_a+i,
                                   N * sizeof(int),
                                   cudaMemcpyHostToDevice,
                                   stream ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_b, host_b+i,
                                   N * sizeof(int),
                                   cudaMemcpyHostToDevice,
                                   stream ) );
    kernel<<<N/256,256,0,stream>>>( dev_a, dev_b, dev_c );
    // copy the data from device to locked memory
    HANDLE_ERROR( cudaMemcpyAsync( host_c+i, dev_c,
                                   N * sizeof(int),
                                   cudaMemcpyDeviceToHost,
                                   stream ) );
}
cudaMemcpy() executes synchronously: when the function returns, the copy has completed and the destination buffer contains the copied data.
cudaMemcpyAsync() is asynchronous: it merely places a request to perform a memory copy into the stream specified by the stream argument, and returns before the copy has necessarily even started. Any host pointer passed to cudaMemcpyAsync() must have been allocated with cudaHostAlloc(); only page-locked memory can be copied asynchronously.
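A minimal sketch of the contrast, using hypothetical names (a device pointer dev, a pinned host pointer host, and a byte count bytes):

// synchronous: when cudaMemcpy() returns, the copy has finished
HANDLE_ERROR( cudaMemcpy( dev, host, bytes,
                          cudaMemcpyHostToDevice ) );

// asynchronous: cudaMemcpyAsync() merely enqueues the copy in
// `stream` and returns at once; `host` must have been allocated
// with cudaHostAlloc(), and the host code must synchronize
// before it reuses or reads the buffers
HANDLE_ERROR( cudaMemcpyAsync( dev, host, bytes,
                               cudaMemcpyHostToDevice, stream ) );
HANDLE_ERROR( cudaStreamSynchronize( stream ) );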
10.5 Using Multiple CUDA Streams
On any device that supports overlapping memory copies with kernel execution, using multiple streams should improve the application's overall performance.
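The two-stream version is shown below. The loop now advances by N*2 per iteration: stream0 processes the chunk at offset i and stream1 the chunk at offset i+N, each stream working with its own set of device buffers.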
#include "../common/book.h"

#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)

__global__ void kernel( int *a, int *b, int *c ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

int main( void ) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (!prop.deviceOverlap) {
        printf( "Device will not handle overlaps, so no speed up from streams\n" );
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaStream_t stream0, stream1;
    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;
    int *dev_a1, *dev_b1, *dev_c1;

    // start the timers
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // initialize the streams
    HANDLE_ERROR( cudaStreamCreate( &stream0 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream1 ) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a0,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b0,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c0,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a1,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b1,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c1,
                              N * sizeof(int) ) );

    // allocate host locked memory, used to stream
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_a,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_b,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_c,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );

    for (int i=0; i<FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    // now loop over full data, in bite-sized chunks
    for (int i=0; i<FULL_DATA_SIZE; i+=N*2) {
        // copy the locked memory to the device, async
        HANDLE_ERROR( cudaMemcpyAsync( dev_a0, host_a+i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream0 ) );
        HANDLE_ERROR( cudaMemcpyAsync( dev_b0, host_b+i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream0 ) );
        kernel<<<N/256,256,0,stream0>>>( dev_a0, dev_b0, dev_c0 );
        // copy the data from device to locked memory
        HANDLE_ERROR( cudaMemcpyAsync( host_c+i, dev_c0,
                                       N * sizeof(int),
                                       cudaMemcpyDeviceToHost,
                                       stream0 ) );
        // copy the locked memory to the device, async
        HANDLE_ERROR( cudaMemcpyAsync( dev_a1, host_a+i+N,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream1 ) );
        HANDLE_ERROR( cudaMemcpyAsync( dev_b1, host_b+i+N,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream1 ) );
        kernel<<<N/256,256,0,stream1>>>( dev_a1, dev_b1, dev_c1 );
        // copy the data from device to locked memory
        HANDLE_ERROR( cudaMemcpyAsync( host_c+i+N, dev_c1,
                                       N * sizeof(int),
                                       cudaMemcpyDeviceToHost,
                                       stream1 ) );
    }
    HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream1 ) );

    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time taken: %3.1f ms\n", elapsedTime );

    // cleanup the streams and memory
    HANDLE_ERROR( cudaFreeHost( host_a ) );
    HANDLE_ERROR( cudaFreeHost( host_b ) );
    HANDLE_ERROR( cudaFreeHost( host_c ) );
    HANDLE_ERROR( cudaFree( dev_a0 ) );
    HANDLE_ERROR( cudaFree( dev_b0 ) );
    HANDLE_ERROR( cudaFree( dev_c0 ) );
    HANDLE_ERROR( cudaFree( dev_a1 ) );
    HANDLE_ERROR( cudaFree( dev_b1 ) );
    HANDLE_ERROR( cudaFree( dev_c1 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream0 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream1 ) );
    return 0;
}
10.6 GPU Work Scheduling
A stream can be thought of as an ordered sequence of operations, containing memory copies and kernel invocations. The hardware, however, has no notion of streams: it contains one or more engines that perform memory copies and an engine that executes kernels, and these engines queue up operations independently of one another. Because each engine works through its queue in the order operations were enqueued, the order in which the host enqueues work across streams determines how much overlap the hardware can actually achieve.
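As a sketch of the consequence (assuming one copy engine and one kernel engine), the depth-first loop of Section 10.5 lands in the engine queues in this order:

copy engine:   a0 up, b0 up, c0 down, a1 up, b1 up, c1 down
kernel engine: kernel (stream0), kernel (stream1)

The download of c0 cannot start until stream0's kernel finishes, and the uploads of a1 and b1 sit behind c0 in the same copy queue, so stream1 cannot even begin copying until stream0's chunk is essentially complete.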
10.7 Using Multiple CUDA Streams Effectively
If we enqueue all of one stream's operations at once, it is easy to inadvertently block the copies or kernel execution of another stream. The fix is to enqueue operations breadth-first rather than depth-first: that is, interleave the operations of the two streams.
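In the reworked loop below, the copies of a are enqueued for both streams first, then the copies of b, then both kernel launches, then the downloads of c, so neither engine queue strands one stream's work behind the other's: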
#include "../common/book.h"

#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)

__global__ void kernel( int *a, int *b, int *c ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

int main( void ) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (!prop.deviceOverlap) {
        printf( "Device will not handle overlaps, so no speed up from streams\n" );
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaStream_t stream0, stream1;
    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;
    int *dev_a1, *dev_b1, *dev_c1;

    // start the timers
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // initialize the streams
    HANDLE_ERROR( cudaStreamCreate( &stream0 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream1 ) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a0,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b0,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c0,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a1,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b1,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c1,
                              N * sizeof(int) ) );

    // allocate host locked memory, used to stream
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_a,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_b,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_c,
                                 FULL_DATA_SIZE * sizeof(int),
                                 cudaHostAllocDefault ) );

    for (int i=0; i<FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    // now loop over full data, in bite-sized chunks
    for (int i=0; i<FULL_DATA_SIZE; i+=N*2) {
        // enqueue copies of a in stream0 and stream1
        HANDLE_ERROR( cudaMemcpyAsync( dev_a0, host_a+i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream0 ) );
        HANDLE_ERROR( cudaMemcpyAsync( dev_a1, host_a+i+N,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream1 ) );
        // enqueue copies of b in stream0 and stream1
        HANDLE_ERROR( cudaMemcpyAsync( dev_b0, host_b+i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream0 ) );
        HANDLE_ERROR( cudaMemcpyAsync( dev_b1, host_b+i+N,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream1 ) );
        // enqueue kernels in stream0 and stream1
        kernel<<<N/256,256,0,stream0>>>( dev_a0, dev_b0, dev_c0 );
        kernel<<<N/256,256,0,stream1>>>( dev_a1, dev_b1, dev_c1 );
        // enqueue copies of c from device to locked memory
        HANDLE_ERROR( cudaMemcpyAsync( host_c+i, dev_c0,
                                       N * sizeof(int),
                                       cudaMemcpyDeviceToHost,
                                       stream0 ) );
        HANDLE_ERROR( cudaMemcpyAsync( host_c+i+N, dev_c1,
                                       N * sizeof(int),
                                       cudaMemcpyDeviceToHost,
                                       stream1 ) );
    }
    HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream1 ) );

    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time taken: %3.1f ms\n", elapsedTime );

    // cleanup the streams and memory
    HANDLE_ERROR( cudaFreeHost( host_a ) );
    HANDLE_ERROR( cudaFreeHost( host_b ) );
    HANDLE_ERROR( cudaFreeHost( host_c ) );
    HANDLE_ERROR( cudaFree( dev_a0 ) );
    HANDLE_ERROR( cudaFree( dev_b0 ) );
    HANDLE_ERROR( cudaFree( dev_c0 ) );
    HANDLE_ERROR( cudaFree( dev_a1 ) );
    HANDLE_ERROR( cudaFree( dev_b1 ) );
    HANDLE_ERROR( cudaFree( dev_c1 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream0 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream1 ) );
    return 0;
}
This version runs faster. With the breadth-first ordering, stream1's uploads no longer sit behind stream0's download of c in the copy-engine queue, so the copies and kernels of the two streams can actually overlap.