Parallelization of molecular

yellvillepotatocreekΛογισμικό & κατασκευή λογ/κού

2 Δεκ 2013 (πριν από 3 χρόνια και 6 μήνες)

58 εμφανίσεις

Parallelization

of molecular
dynamics

Danijel

Novakovi
ć
,

Faculty of Electronic Engineering
-

Niš
,

Serbia

dakidaki@gmail.com




10. December, TU
Ilmenau



IAESTE LC
Ilmenau



IAESTE LC Ni
š

Plan of speech


The representation of the problem based on
physics


Description of the C code


Description of the Cuda code


Optimization and suggestions for future
development


Conclusion

Plan of speech


The representation of the problem based on
physics


Description of the C code


Description of the
Cuda

code


Optimization and suggestions for future
development


Conclusion

Definition of problem 1

Definition of problem 2


Definition of problem 3

Plan of speech


The representation of the problem based on
physics


Description of the C code


Description of the
Cuda

code


Optimization and suggestions for future
development


Conclusion

C code

Fortran F95 C prog. language



constants.c
;




trajektorievars.c
;



inputoutput.c
;




trajektoriem.c
;




efef.c
;



main.c




C code 2


trajektoriem.c


int relation(float spy)


void doit_line()



C code 3

Why
Cuda
, not C or Fortran?


Cuda program should be faster


Parallel calculations


Less time for complicated calculations


Better performance


What is
Cuda


Compute Unified Device Architecture


C programming language on GPUs


Requires no knowledge of graphics APIs or GPU
programming


Stable, available (for free), documented and
supported


For both Windows and Linux


Designed and developed by NVIDIA

»
Requires an NVIDIA GPU (GeForce 8xxx/Tesla/Quadro)

SIMD


P
-

pipe


Different data in each P


Unique control unit



SIMD

+

A B

C D

A+B

C+D

Grid of thread blocks

-

The computational grid consists of a


grid of
thread blocks

-

Each
thread
executes the kernel

-

The application specifies the grid and


block dimensions

-

The grid layouts can be 1, 2, or


3
-
dimensional

-

The maximal sizes are determined by


GPU memory and kernel complexity

-

Each block has an unique
block ID

-

Each thread has an unique
thread ID


(within the block)

Special variables


Special variables for thread identification in
kernel


dim3
threadIdx


dim3
blockIdx


dim3
blockDim

outdata[blockIdx.x*blockDim.x+threadIdx.x] =

relation(indata[blockIdx.x*blockDim.x+threadIdx.x],


);



Memory model

Cuda

code


setup_rand.cu


run.cu


kernel.cu


main.c


inputoutput.c


trajektorievars.c


definicije.h



CUDA programs have a basic flow


The host initializes an array with data.


The array is copied from the host to the memory
on the CUDA device.


The CUDA device operates on the data in the
array.


The array is copied back to the host.




Cuda

code

setup_rand.cu

// Bookkeeping for one float array mirrored on the host and the CUDA device.
struct DevMemData
{
    float* orig_data;             // original data (owned by the caller)
    float* h_data;                // copy of original data, stored on the host
    float* d_data;                // copy of original data, stored on the CUDA device
    int n;                        // number of elements in the arrays
    unsigned int memsize;         // size of each array in bytes (n * sizeof(float))
    unsigned int sharedDeviceMem; // set to 0 in allocateDeviceMem(); purpose unconfirmed
    int init;                     // non-zero once allocateDeviceMem() has completed
};  // BUG FIX: the trailing ';' after the struct body was missing on the slide

DevMemData* GPUMemoryData;  // two entries: [0] input array, [1] output array
dim3 grid;                  // kernel launch configuration, set in allocateDeviceMem()
dim3 threads;

The host initializes an array with data

// Refresh the host-side staging copy (h_data) from the caller's original
// array.  Prints an error and does nothing if memData was never initialised.
void memDataCopyOrigHost(DevMemData* memData)
{
    if (memData->init == 0) {
        printf("Error: memDataCopyOrigHost: memData not initialised.\n");
        return;
    }
    for (int idx = 0; idx < memData->n; idx++)
        memData->h_data[idx] = memData->orig_data[idx];
}

// Copy the host staging array (h_data) back into the caller's original array.
// Prints an error and does nothing if memData was never initialised.
// BUG FIX: the error message previously named memDataCopyOrigHost — a
// copy-paste slip that would misdirect anyone debugging this path.
void memDataCopyHostOrig(DevMemData* memData)
{
    int i;
    if (memData->init == 0)
        printf("Error: memDataCopyHostOrig: memData not initialised.\n");
    else
        for (i = 0; i < memData->n; i++)
            memData->orig_data[i] = memData->h_data[i];
    return;
}


// Push the host staging array (h_data) to the device buffer (d_data).
// Synchronous copy; CUDA_SAFE_CALL aborts on a CUDA runtime error.
void memDataCopyHostDevice(DevMemData* memData)
{
    float* dst = memData->d_data;
    float* src = memData->h_data;
    CUDA_SAFE_CALL(cudaMemcpy(dst, src, memData->memsize, cudaMemcpyHostToDevice));
}

T
he array is copied from the host to the memory on the

CUDA device.



setup_rand.cu

// Pull the device buffer (d_data) back into the host staging array (h_data).
// Synchronous copy; CUDA_SAFE_CALL aborts on a CUDA runtime error.
void memDataCopyDeviceHost(DevMemData* memData)
{
    float* dst = memData->h_data;
    float* src = memData->d_data;
    CUDA_SAFE_CALL(cudaMemcpy(dst, src, memData->memsize, cudaMemcpyDeviceToHost));
}

a transfer of the data from the array in the CUDA device memory

back to the array in the host memory



setup_rand.cu

// Set up one DevMemData entry: record the caller's array of n floats,
// allocate matching host and device buffers, choose the launch
// configuration, and upload the initial data to the device.
//
// BUG FIX: the slide set grid.x = n AND threads.x = n, which launches n*n
// threads for an n-element array — every thread with global index >= n read
// and wrote out of bounds, and any n > 1024 would fail to launch at all.
// We now launch ceil(n/256) blocks of 256 threads; run_kernel must guard
// its global index with (i < n) since the grid is rounded up.
// Also: the host malloc result is now checked.
void allocateDeviceMem(float* original_data, int n, DevMemData* memData)
{
    threads.x = 256;
    threads.y = 1;
    threads.z = 1;
    grid.x = (n + threads.x - 1) / threads.x;  // ceil-div: cover all n elements
    grid.y = 1;
    grid.z = 1;

    memData->orig_data = original_data;
    memData->n = n;
    memData->memsize = memData->n * sizeof(float);
    memData->sharedDeviceMem = 0;

    // allocate array on host
    memData->h_data = (float*) malloc(memData->memsize);
    if (memData->h_data == NULL) {
        printf("Error: allocateDeviceMem: host malloc failed.\n");
        memData->init = 0;
        return;
    }

    // allocate array on device
    CUDA_SAFE_CALL(cudaMalloc((void**) &memData->d_data, memData->memsize));

    memData->init = 1;
    memDataCopyOrigHost(memData);
    memDataCopyHostDevice(memData);
}


setup_rand.cu

// Initialise CUDA: verify that device `deviceid` exists, select it, and
// allocate paired host/device buffers for the input (h_data) and output
// (hh_data) arrays of n floats each, stored in GPUMemoryData[0] and [1].
//
// FIXES: the device-count check now runs BEFORE GPUMemoryData is allocated
// (the slide leaked the malloc'ed block on the not-enough-GPUs early
// return); the malloc result is checked; the failure message gets a newline.
void CudaInit(int deviceid, float* h_data, float* hh_data, int n)
{
    printf("CUDA Init Start\n");

    int gpunum;
    // returns the number of compute-capable devices
    CUDA_SAFE_CALL(cudaGetDeviceCount(&gpunum));
    printf("GPU Num: %i\n", gpunum);
    if (gpunum < deviceid + 1) {
        printf("Not enough GPUs available!!\n");
        return;
    }

    // sets device to be used for GPU executions
    CUDA_SAFE_CALL(cudaSetDevice(deviceid));

    GPUMemoryData = (struct DevMemData*) malloc(2 * sizeof(struct DevMemData));
    if (GPUMemoryData == NULL) {
        printf("Error: CudaInit: malloc failed.\n");
        return;
    }

    allocateDeviceMem(h_data, n, &GPUMemoryData[0]);
    allocateDeviceMem(hh_data, n, &GPUMemoryData[1]);
}

setup_rand.cu

Function Type Qualifiers

__device__

The
__device__
qualifier declares a function that is:


Executed on the device,
__device__ int relation(float spy,


)


Callable from the device only.

__global__

The
__global__
qualifier declares a function as being a kernel.


Executed on the device,
__global__
void

run_kernel(float* indata,


)


Callable from the host only.

__host__

The
__host__
qualifier declares a function that is:


Executed on the host,

__host__
int

CudaInit
(
int

deviceid
, …)


Callable from the host only.


Kernels functions


CUDA extends C by allowing the programmer
to define C functions, called kernels, that,
when called, are executed N times in parallel
by N different CUDA threads, as opposed to
only once like regular C functions.


__global__ void kernelName <<< a,b >>> ( … )



a
-

the number of blocks,



b
-

the number of threads in each block

ran.cu

// Launch run_kernel over the two pre-allocated arrays (input in
// GPUMemoryData[0], output in GPUMemoryData[1]), wait for completion, and
// copy both back to the caller's original arrays.
//
// BUG FIX: the launch was written "run_kernel<<<grid,threads>>(...)" with
// only two closing '>', which does not compile; the execution configuration
// must be <<<grid, threads>>>.
// Also: cudaThreadSynchronize() is deprecated (replaced by
// cudaDeviceSynchronize()); the CUDA error string is now printed only when
// an error actually occurred (the explicit check also replaces the cutil
// CUT_CHECK_ERROR macro); the unused local `int i` is removed.
void CudaRun(float b, float UdK, float kappa, float dt)
{
    // kernel execution directive
    run_kernel<<<grid, threads>>>(GPUMemoryData[0].d_data,
                                  GPUMemoryData[1].d_data,
                                  GPUMemoryData[0].n, b, UdK, kappa, dt);

    cudaDeviceSynchronize();  // wait for the compute device to finish

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("ERROR - CUDA: %s\n", cudaGetErrorString(err));

    memDataCopyDeviceHost(&GPUMemoryData[0]);
    memDataCopyHostOrig(&GPUMemoryData[0]);
    memDataCopyDeviceHost(&GPUMemoryData[1]);
    memDataCopyHostOrig(&GPUMemoryData[1]);
}

// Kernel: one thread per initial position.  Each thread classifies its
// particle trajectory via relation() and stores the 0/1 result into outdata.
//
// BUG FIX: the slide had no bounds guard (and left the n parameter unused),
// so any thread with global index >= n read and wrote out of bounds.  The
// grid may be rounded up to whole blocks, so the (i < n) guard is required.
__global__ void run_kernel(float* indata, float* outdata, int n,
                           float b, float UdK, float kappa, float dt)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        outdata[i] = relation(indata[i], b, UdK, kappa, dt);
}

kernel.cu

// Integrate one particle trajectory with RK4, starting at impact parameter
// spy on the circle of radius b, until the particle is either trapped
// (distance from the origin <= rP + a) or leaves the domain (r > b).
// Returns 1 if trapped, 0 if it escaped.
//
// FIX: use single-precision sqrtf() instead of double-precision sqrt() —
// the entire state is float, and double math is far slower on GPUs (the
// (float) casts around sqrt() are then unnecessary).
// NOTE(review): if neither exit condition is ever met the loop never
// terminates; the "future development" slides add an iteration cap.
__device__ int relation(float spy, float b, float UdK, float kappa, float dt)
{
    float r, rpa, ti = 0;
    float4 x, dxdt;
    int trapped = 0, outside = 0;  // 0 - false; 1 - true

    // Initial state: position on the circle of radius b, x-velocity Um.
    x.y = spy;
    x.x = -sqrtf(b * b - x.y * x.y);
    x.z = Um;
    x.w = 0.0f;
    rpa = rP + a;  // capture radius

    while (trapped == 0 && outside == 0)
    {
        dxdt = derivs(ti, x, UdK, kappa);
        x = rk4(x, dxdt, ti, dt, UdK, kappa);  // Runge-Kutta method
        ti += dt;

        r = sqrtf(x.x * x.x + x.y * x.y);  // distance from the origin
        if (r <= rpa)
            trapped = 1;
        if (r > b)
            outside = 1;
    }

    return trapped;
}

kernel.cu

float4,float3,float2,int4,int3,int2


float4 temp;



temp.x



temp.y



temp.z



temp.w

// Right-hand side of the equation of motion: the derivative of the state
// (x, y, vx, vy) is (vx, vy, Fx, Fy), with F from the Stokes force.
// (ti is accepted for the ODE-solver interface but not used here.)
__device__ float4 derivs(float ti, float4 x, float UdK, float kappa)
{
    float2 force = FStokes(x, UdK, kappa);  // computes the force F from state x
    return make_float4(x.z, x.w, force.x, force.y);
}

kernel.cu

// Program entry point: set constants, build the input array, and initialise
// the CUDA device buffers.
//
// FIXES: the slide was missing the ';' after "inarray = input(5)" (it does
// not compile), returned 1 from a successful run, kept an unused
// cudaInitCheck local, and never checked the malloc result.
// NOTE(review): CudaRun() is never invoked here — presumably the launch
// happened in a part of main() not shown on the slide; confirm.
int main()
{
    int deviceid = 0;
    float dt;            // time step from initTime(); would be passed to CudaRun()
    float* inarray;
    float* outarray;

    setconst();
    initrP();
    dt = initTime();

    inarray = input(5);
    outarray = (float*) malloc(num * sizeof(float));
    if (outarray == NULL)
        return 1;

    CudaInit(deviceid, inarray, outarray, num);

    return 0;
}


kernel.cu

Plan of speech


The representation of the problem based on
physics


Description of the C code


Description of the
Cuda

code


Optimization and suggestions for future
development


Conclusion

Future development


// definicije.h — physical constants and simulation parameters.
#define Pi 3.1415926535897932384626433832795
#define kB 1.380650424e-23  // J/K, Boltzmann constant
#define a 11e-6             // m — presumably a collector/obstacle radius; confirm
#define alpha 0.18          // dimensionless coefficient; meaning unconfirmed
#define rhoP 2.0            // presumably particle density; units unconfirmed
#define T 300.0             // presumably temperature in K; confirm
#define lambda 6.8e-8       // m, mean free path ("mittlere freie Weglange")
#define eta 2.0e-5          // dynamic viscosity ("dynamische Viskositat")
#define vquer 400.0         // m/s, mean velocity ("mittlere Geschwindigkeit")
#define t 0.0               // initial time
#define Um 0.1              // initial x-velocity used in relation()
#define rP 1e-8             // presumably particle radius; confirm

definicije.h

// (Slide duplicate of relation() from kernel.cu, repeated for the
// optimisation discussion.)  Integrates one trajectory with RK4 until the
// particle is trapped (r <= rP + a) or leaves the domain (r > b);
// returns 1 if trapped, 0 if it escaped.
__device__ int relation(float spy, float b, float UdK, float kappa, float dt)
{
    float r, rpa, ti = 0;
    float4 x, dxdt;
    int trapped = 0, outside = 0;  // 0 - false; 1 - true

    // Initial state: position on the circle of radius b, x-velocity Um.
    x.y = spy;
    x.x = (float) (-sqrt(b * b - x.y * x.y));
    x.z = Um;
    x.w = 0.0;
    rpa = rP + a;  // capture radius

    while (trapped == 0 && outside == 0)
    {
        dxdt = derivs(ti, x, UdK, kappa);
        x = rk4(x, dxdt, ti, dt, UdK, kappa);  // Runge Kutta method
        ti += dt;

        r = (float) (sqrt(x.x * x.x + x.y * x.y));
        if (r <= rpa)
            trapped = 1;
        if (r > b)
            outside = 1;
    }

    return trapped;
}

Future development

// ("Future development" variant 1 of relation().)  Same trajectory
// integration as before, but the loop now carries a `safe` iteration
// counter capped at 100 000 000 so a trajectory that never satisfies either
// exit condition cannot hang the kernel forever.
__device__ int relation(float spy, float b, float UdK, float kappa, float dt)
{
    float r, rpa, ti = 0;
    float4 x, dxdt;
    int trapped = 0, outside = 0, safe = 0;  // safe: iteration cap counter

    x.y = spy;
    x.x = (float) (-sqrt(b * b - x.y * x.y));
    x.z = Um;
    x.w = 0.0;
    rpa = rP + a;

    // Note: escaping via the iteration cap still returns trapped == 0.
    while (trapped == 0 && outside == 0 && (++safe) < 100000001)
    {
        dxdt = derivs(ti, x, UdK, kappa);
        x = rk4(x, dxdt, ti, dt, UdK, kappa);
        ti += dt;

        r = (float) (sqrt(x.x * x.x + x.y * x.y));
        if (r <= rpa)
            trapped = 1;
        if (r > b)
            outside = 1;
    }

    return trapped;
}


Future development

// ("Future development" variant 2 of relation().)  Adds a placeholder call
// to a diffusion step after the RK4 update, on top of the iteration cap.
// NOTE(review): "x = difusion( );" is a slide placeholder — difusion is not
// defined anywhere in this file and the empty argument list would not
// compile as-is; the intended signature must be supplied.
__device__ int relation(float spy, float b, float UdK, float kappa, float dt)
{
    float r, rpa, ti = 0;
    float4 x, dxdt;
    int trapped = 0, outside = 0, safe = 0;  // safe: iteration cap counter

    x.y = spy;
    x.x = (float) (-sqrt(b * b - x.y * x.y));
    x.z = Um;
    x.w = 0.0;
    rpa = rP + a;

    while (trapped == 0 && outside == 0 && (++safe) < 100000001)
    {
        dxdt = derivs(ti, x, UdK, kappa);
        x = rk4(x, dxdt, ti, dt, UdK, kappa);

        x = difusion( );  // placeholder for the future diffusion step

        ti += dt;

        r = (float) (sqrt(x.x * x.x + x.y * x.y));
        if (r <= rpa)
            trapped = 1;
        if (r > b)
            outside = 1;
    }

    return trapped;
}


Future development

Plan of speech


The representation of the problem based on
physics


Description of the C code


Description of the
Cuda

code


Optimization and suggestions for future
development


Conclusion

Conclusion


Parallel calculations


Better performance


Available resources


NVIDIA CUDA Homepage


http://www.nvidia.com/cuda


Contains downloads, documentation, examples and
links


CUDA Forums


http://forums.nvidia.com


Supercomputing 2007 CUDA
Tutorials


http://www.gpgpu.org/sc2007/





Thank you for listening !!!




especially thanks to






Dr.
Hartmut

Grille






Christian M
üller






IAESTE