1

ITCS 6/8010 CUDA Programming, UNC-Charlotte, B. Wilkinson, Feb 14, 2011

Streams.pptx

CUDA Streams

These notes introduce the use of multiple CUDA streams to overlap memory transfers with kernel computations.

Also introduced is page-locked memory.

These materials come from Chapter 10 of “CUDA by Example” by Jason Sanders and Edward Kandrot.


2

Page-locked host memory
(also called pinned host memory)

Page-locked memory is not paged in and out of main memory by the OS but remains resident.

Allows:

- Concurrent host/device memory transfers with kernel operations (compute capability 2.x) - see next
- Host memory to be mapped into the device address space (compute capability > 1.0) - see the sketch below
- Higher memory bandwidth, because page-locked memory:
  - uses real (physical) addresses rather than virtual addresses
  - does not need intermediate copy buffering
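A minimal sketch of mapping pinned host memory into the device address space (zero-copy memory); cudaHostAllocMapped and cudaHostGetDevicePointer are the relevant runtime calls, and the 1 MB buffer size here is only for illustration:

cudaSetDeviceFlags(cudaDeviceMapHost);        // enable mapping; must precede context creation
int *host_ptr, *dev_ptr;
cudaHostAlloc((void**)&host_ptr, 1024*1024, cudaHostAllocMapped);  // pinned and mapped
cudaHostGetDevicePointer((void**)&dev_ptr, host_ptr, 0);           // device-visible alias
// dev_ptr can now be passed to a kernel, which reads/writes host memory directly
cudaFreeHost(host_ptr);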

3

Note on using page-locked memory

Using page-locked memory reduces the memory available to the OS for paging, so be careful when allocating it.

4

Allocating page-locked memory

cudaMallocHost( void **ptr, size_t size )

Allocates page-locked host memory that is accessible to the device.

cudaHostAlloc( void **ptr, size_t size, unsigned int flags )

Also allocates page-locked host memory that is accessible to the device, but provides more options through its flags argument (e.g. cudaHostAllocMapped, cudaHostAllocWriteCombined).
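A minimal allocation/free sketch (buffer size is illustrative); note that memory from either call is released with cudaFreeHost, not free:

int *a;
cudaMallocHost((void**)&a, 1024*1024);                       // page-locked, default behavior
cudaFreeHost(a);                                             // release pinned memory

cudaHostAlloc((void**)&a, 1024*1024, cudaHostAllocDefault);  // equivalent to cudaMallocHost
cudaFreeHost(a);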

5

// Pinned memory test written by Barry Wilkinson, UNC-Charlotte. Feb 10, 2011.

#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>

#define SIZE (10*1024*1024)     // number of bytes in arrays, 10 MBytes

int main(int argc, char *argv[]) {

    int i;                                   // loop counter
    int *a;
    int *dev_a;

    cudaEvent_t start, stop;                 // using cuda events to measure time
    cudaEventCreate(&start);                 // create events
    cudaEventCreate(&stop);

    float elapsed_time_ms1, elapsed_time_ms3;

    /* -------------------- ENTER INPUT PARAMETERS AND DATA ----------------------- */

    cudaMalloc((void**)&dev_a, SIZE);        // allocate memory on device

    /* ---------------- COPY USING PINNED MEMORY -------------------- */

    cudaHostAlloc((void**)&a, SIZE, cudaHostAllocDefault);   // allocate page-locked memory on host

    cudaEventRecord(start, 0);

    for (i = 0; i < 100; i++) {
        cudaMemcpy(dev_a, a, SIZE, cudaMemcpyHostToDevice);  // copy to device
        cudaMemcpy(a, dev_a, SIZE, cudaMemcpyDeviceToHost);  // copy back to host
    }

    cudaEventRecord(stop, 0);                // instrument code to measure end time
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time_ms1, start, stop);

    printf("Time to copy %d bytes of data 100 times on GPU, pinned memory: %f ms\n", SIZE, elapsed_time_ms1);  // exec. time

Test of Pinned Memory

6


    /* ---------------- COPY USING REGULAR MEMORY -------------------- */

    cudaFreeHost(a);               // release the pinned buffer before reusing the pointer
    a = (int*)malloc(SIZE);        // allocate regular (pageable) memory on host

    cudaEventRecord(start, 0);

    for (i = 0; i < 100; i++) {
        cudaMemcpy(dev_a, a, SIZE, cudaMemcpyHostToDevice);  // copy to device
        cudaMemcpy(a, dev_a, SIZE, cudaMemcpyDeviceToHost);  // copy back to host
    }

    cudaEventRecord(stop, 0);      // instrument code to measure end time
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time_ms3, start, stop);

    printf("Time to copy %d bytes of data 100 times on GPU: %f ms\n", SIZE, elapsed_time_ms3);  // exec. time

    /* -------------------------- SPEEDUP --------------------------------- */

    printf("Speedup of using pinned memory = %f\n", (float) elapsed_time_ms3 / (float) elapsed_time_ms1);

    /* -------------- clean up --------------------------------------- */

    free(a);
    cudaFree(dev_a);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}

7

My code

8

./bandwidthTest Starting...

 Running on...

 Device 0: Tesla C2050
 Quick Mode

 Host to Device Bandwidth, 1 Device(s), Paged memory
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     1026.7

 Device to Host Bandwidth, 1 Device(s), Paged memory
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     1108.1

 Device to Device Bandwidth, 1 Device(s)
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     84097.6

[bandwidthTest] - Test results: PASSED

Press <Enter> to Quit...
-----------------------------------------------------------

Using NVIDIA sample code for bandwidth on coit-grid06
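The run above uses pageable host memory; the sample also has a pinned-memory mode (option name per the NVIDIA sample's usage text), which shows the higher page-locked transfer bandwidth:

./bandwidthTest --memory=pinned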

9

CUDA Streams

A CUDA stream is a sequence of operations (commands) that are executed in order.

Multiple CUDA streams can be created and executed together, interleaved with one another, although the “program order” is always maintained within each stream.

Streams provide a mechanism to overlap memory transfers and kernel computations in different streams for increased performance, if sufficient resources are available.

10

Creating a stream

Done by creating a stream object and associating it with a series of CUDA commands, which then become the stream. The CUDA commands take the stream as an argument:

cudaStream_t stream1;
cudaStreamCreate(&stream1);

cudaMemcpyAsync(…, stream1);
MyKernel<<<grid, block, 0, stream1>>>(…);   // stream is the fourth launch parameter; the third is the shared memory size
cudaMemcpyAsync(…, stream1);

Regular cudaMemcpy cannot be used with streams; the asynchronous commands are needed for concurrent operation - see next.
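When a stream is no longer needed it should be released; a minimal sketch, continuing from stream1 above:

cudaStreamDestroy(stream1);   // release the stream once all its work has completed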

11

cudaMemcpyAsync( …, stream )

Asynchronous version of cudaMemcpy that copies data to/from the host and the device.

May return before the copy is complete.

A stream argument is specified.

Needs “page-locked” host memory.
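For reference, its full prototype in the CUDA runtime API, and an example host-to-device copy issued into a stream (dev_a, a, SIZE, and stream1 as in the earlier examples):

cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count,
                            cudaMemcpyKind kind, cudaStream_t stream);

cudaMemcpyAsync(dev_a, a, SIZE, cudaMemcpyHostToDevice, stream1);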

12

#define N (1024*1024)   // elements processed per chunk (value as in the book)
#define SIZE (N*20)     // total number of elements

int main(void) {
    cudaStream_t stream;
    cudaStreamCreate(&stream);           // create the stream

    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    cudaMalloc( (void**)&dev_c, N * sizeof(int) );

    cudaHostAlloc((void**)&a, SIZE*sizeof(int), cudaHostAllocDefault);  // page-locked
    cudaHostAlloc((void**)&b, SIZE*sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&c, SIZE*sizeof(int), cudaHostAllocDefault);

    for (int i = 0; i < SIZE; i++) {     // load data
        a[i] = rand();
        b[i] = rand();
    }

    for (int i = 0; i < SIZE; i += N) {  // loop over data in chunks
        cudaMemcpyAsync(dev_a, a+i, N*sizeof(int), cudaMemcpyHostToDevice, stream);
        cudaMemcpyAsync(dev_b, b+i, N*sizeof(int), cudaMemcpyHostToDevice, stream);
        kernel<<<N/256, 256, 0, stream>>>(dev_a, dev_b, dev_c);
        cudaMemcpyAsync(c+i, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost, stream);
    }
    cudaStreamSynchronize(stream);       // wait for stream to finish

    return 0;
}

Code Example

Pages 194-95 of CUDA by Example, without error detection macros

One stream
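The kernel itself is not shown on the slide. A minimal placeholder consistent with the launch configuration above; the name kernel matches the launch, but the element-wise average is an assumption for illustration, not the book's exact kernel:

__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;   // one thread per element of the chunk
    if (idx < N)
        c[idx] = (a[idx] + b[idx]) / 2;                // e.g., average the two inputs
}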

13

Multiple streams

Assuming the device can support it (this can be checked in code if needed - see the sketch below), create two streams with:

cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);

and then duplicate the stream code for each stream
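A minimal sketch of that check using cudaGetDeviceProperties; the deviceOverlap field reports whether the device can overlap a kernel with a memory copy:

cudaDeviceProp prop;
int dev;
cudaGetDevice(&dev);
cudaGetDeviceProperties(&prop, dev);
if (!prop.deviceOverlap)
    printf("Device cannot overlap copies with kernels, so streams give no speedup\n");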

14


int *dev_a1, *dev_b1, *dev_c1;   // stream 1 mem ptrs
int *dev_a2, *dev_b2, *dev_c2;   // stream 2 mem ptrs

// stream 1
cudaMalloc( (void**)&dev_a1, N * sizeof(int) );
cudaMalloc( (void**)&dev_b1, N * sizeof(int) );
cudaMalloc( (void**)&dev_c1, N * sizeof(int) );

// stream 2
cudaMalloc( (void**)&dev_a2, N * sizeof(int) );
cudaMalloc( (void**)&dev_b2, N * sizeof(int) );
cudaMalloc( (void**)&dev_c2, N * sizeof(int) );

for (int i = 0; i < SIZE; i += N*2) {   // loop over data in chunks of 2N
    // stream 1: first chunk
    cudaMemcpyAsync(dev_a1, a+i, N*sizeof(int), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(dev_b1, b+i, N*sizeof(int), cudaMemcpyHostToDevice, stream1);
    kernel<<<N/256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
    cudaMemcpyAsync(c+i, dev_c1, N*sizeof(int), cudaMemcpyDeviceToHost, stream1);

    // stream 2: next chunk, offset by N
    cudaMemcpyAsync(dev_a2, a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream2);
    cudaMemcpyAsync(dev_b2, b+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream2);
    kernel<<<N/256, 256, 0, stream2>>>(dev_a2, dev_b2, dev_c2);
    cudaMemcpyAsync(c+i+N, dev_c2, N*sizeof(int), cudaMemcpyDeviceToHost, stream2);
}
cudaStreamSynchronize(stream1);   // wait for stream to finish
cudaStreamSynchronize(stream2);   // wait for stream to finish

First attempt described in book: concatenate the statements of each stream

15

Simply concatenating the statements of each stream does not work well because of the way the GPU schedules work: the copy engine and the kernel engine are fed from separate hardware queues, and issuing all of one stream's commands before the other's creates dependencies that block the engines from overlapping.

Page 206 CUDA by Example

16

Page 207 CUDA by Example

17

Page 208 CUDA by Example

18

for (int i = 0; i < SIZE; i += N*2) {   // loop over data in chunks of 2N
    // interleave stream 1 and stream 2
    cudaMemcpyAsync(dev_a1, a+i,   N*sizeof(int), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(dev_a2, a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream2);
    cudaMemcpyAsync(dev_b1, b+i,   N*sizeof(int), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(dev_b2, b+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream2);

    kernel<<<N/256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
    kernel<<<N/256, 256, 0, stream2>>>(dev_a2, dev_b2, dev_c2);

    cudaMemcpyAsync(c+i,   dev_c1, N*sizeof(int), cudaMemcpyDeviceToHost, stream1);
    cudaMemcpyAsync(c+i+N, dev_c2, N*sizeof(int), cudaMemcpyDeviceToHost, stream2);
}
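As in the first attempt, both streams must still be drained after the loop:

cudaStreamSynchronize(stream1);   // wait for both streams to finish
cudaStreamSynchronize(stream2);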

Second attempt described in book: interleave the statements of each stream

19

Page 210 CUDA by Example

Questions