Programming the GPU

trexpeeverSoftware and s/w Development

Dec 13, 2013 (3 years and 9 months ago)

164 views

Programming the GPU

on Cg


Szirmay-Kalos László

email: szirmay@iit.bme.hu

Web: http://www.iit.bme.hu/~szirmay


Hardware

GPU

Frame

buffer

display

CPU

Memory

I/O

Graphics card

Program

OpenGL API

OpenGL API

glLightfv(GL_LIGHT0, GL_DIFFUSE, I);

glMaterialfv( GL_FRONT, GL_DIFFUSE, kd);

glViewport( 0, 0, width, height);

gluLookAt(ex, ey, ez, lax, lay, laz, upx, upy, upz);

glScalef(sx, sy, sz);

glTranslatef(px, py, pz);

glRotatef(ang, axisx, axisy, axisz);


glBegin(GL_TRIANGLES);



glNormal3f(nx1, ny1, nz1);

glColor3f(r1, g1, b1);

glTexCoord2f(u1, v1);

glVertex3f(x1, y1, z1);




glEnd( );

CPU

GPU

State

Uniform

variables

Geometry

Vertex properties



Vertices

PASS

Rendering Pipeline

Virtual world

Camera space,

illumination

Perspective

transformation +

Clipping

+

Homogeneous div.

1.

2.

Viewport transf
+
Rasterization+interpolation

display

color

depth

MODELVIEW

PROJECTION

Texture mapping

(u1, v1)

(u2, v2)

(u3, v3)

x1,y1,z1

x2,y2,z2

x3,y3,z3

color

Texturing hardware

(u3, v3)

(u1, v1)

(u2, v2)

Linear interpolation:

(u, v)

Texture object in

GPU memory

Why is linear

interpolation our friend?

X

Y

I

I(X,Y) = aX + bY + c

I(X,Y)

I(X+1,Y) = I(X,Y) + a

$(X_1, Y_1, I_1)$

$(X_2, Y_2, I_2)$

$(X_3, Y_3, I_3)$

I
(X,Y)

X counter

I

register

a

X

S

CLK

GPU hardware architecture

Interface

Transform+

Illumination

Clipping + Hom.division

+
Viewport transform

Projection +
Rasterization

+

Linear interpolation

Texturing

Compositing (Z-buffer, transparency)

Texture

memory

Early Z-cull

vertices

triangles

fragments

Vertex

Shader

Fragment

Shader

Why is it fast? Stream processing

Proc 2

Proc 1

Proc 1

Proc
21

Proc
22

Pipelining

Parallelism

Elements are processed INDEPENDENTLY



No internal storage



Parallel execution without synchronization

Vertex shader and its neighborhood




Clipping: $-w < X < w$, $-w < Y < w$, $-w < Z < w$, $0 < \text{color} < 1$

State

Transforms

Lightsources

Materials

POSITION, NORMAL, COLOR0, TEXCOORD0,…

glVertex

glNormal

glColor

glTexCoord

glBegin(GL_TRIANGLES)

glEnd( )

POSITION, COLOR0, TEXCOORD0, … for triangle vertices

Homogeneous division: $x = X/w$, $y = Y/w$, $z = Z/w$

POSITION, COLOR0, TEXCOORD0,… for triangle vertices

*
MVP

*
MV

* MVIT (inverse transpose of MV)

Illumination

Vertex shader

Viewport transform:

xv = center.x + viewsize.x * x / 2

CPU

GPU

Standard

vertex shader (Cg)

struct ins {


float4 position

: POSITION;

// glVertex


float3 normal

: NORMAL;

// glNormal


float4 color

: COLOR0
; // glColor


float2 texcoord

: TEXCOORD0;

// glTexCoord

};


struct outs {


float4 hposition

: POSITION;


float4
color

: COLOR0;




float2 texcoord

: TEXCOORD0;

};


outs main( in
s

IN,


uniform float4x4
MVP : state.matrix.mvp
) {


outs OUT
;


OUT.hposition = mul(
MVP
, IN.position);


OUT.texcoord = IN.texcoord;


OUT.
color

= IN.
color
;



return OUT;

}

glDisable(GL_LIGHTING );

Positional light source


outputs main( ins IN,


uniform float4x4
MV,



uniform float4x4 MV
IT,




uniform float4x4 MVP
,



uniform float3 light
pos,


uniform float4 Idiff, Iamb, Ispec,



uniform float4 em, ka, kd, ks,


uniform float shininess ) {


outs OUT;


OUT.hposition = mul(
MVP
, IN.position);




float3 N = mul(MVIT, IN.normal).xyz;


N =
n
ormalize(
N
);



// glEnable(GL_NORMALIZE)


float3 cpos = mul(MV, IN.position).xyz;


float3 L = normalize(lightpos - cpos);


float3 H = normalize(L + V);


OUT.
color

=
em +
Iamb
* ka +


Idiff * kd *
saturate
(dot(N, L)) +


Ispec * ks * pow(saturate(dot(N, H)),shininess);


return OUT;

}

glEnable(GL_LIGHTING );

N

L

V

Fragment

shader and its neighborhood

State

Texture id,

texturing
environment

POSITION, COLOR0, TEXCOORD0,… for triangle vertices

POSITION, COLOR

Compositing: blending, z-buffering

Projection,
Rasterization and
linear interpolation

Fragment

shader

Texturing: tex2D(u,v) * color0

POSITION, COLOR0, TEXCOORD0 for fragments

Texture memory

Frame buffer

Z-cull

Standard

fragment shader

gl
Disable
(GL_
TEXTURE_2D
);


gl
Enable
(GL_
TEXTURE_2D
);




with
GL_REPLACE

mode

float
4

main(


in float3 color

: COLOR0) : COLOR

{


return color;

}


float
4

main(


in float2 texcoord

: TEXCOORD0,


in float3 color


: COLOR0,


uniform sampler2D
texture
_map ) : COLOR

{


return tex2D(texture_map, texcoord);

}

What can we
do with it
?


Vertex shader:


General
BRDF model
s


Spec. transformations, smooth binding


Waving, procedural animation


Fragment

shader:


Phong shading, shadows


bump/parallax/displacement/reflection


mapping


Both:


General purpose computation

Example 1: Phong shading
instead of Gouraud shading

ambient

diffuse

specular

Gouraud

versus Phong
shading

Gouraud

Phong

Phong

Gouraud

Gouraud shading




CPU

program





Vertex
shader




Pixel

shader



Position

Normal


Transformations

Materials

Lights

Transformed position

Color


Rasterization

Interpolation


Illumination

Interpolated

color

Phong shading




CPU

program





Vertex
shader




Pixel

shader



Position

Normal


Transformations

Light position

Transf.position

Transf.normal

View

Light


Rasterization

Interpolation


Illumination

Interpolated

Normal

View

Light

Materials

Light

intensity

Programs


.cpp CPU program:


Capability query of the GPU (profile)


Definition of the Shader environment


Vertex/fragment program

load from file and compile: CREATE


Vertex/fragment program upload to the GPU: LOAD


Selection of the current Vertex/fragment program: BIND


Uniform vertex/fragment variable definition


Uniform vertex/fragment variable setting


Non-uniform variables set (glVertex, glColor, glTexCoord…)


.cg vertex program


Fragment program’s non-uniform variables + homogeneous position


.cg fragment program


Color
output

Initialization

Display

CPU program
-

Initialization

#include <Cg/cgGL.h>




//
cg
functions


CGparameter Lightpos, Shine, Ks, Kd;

//
uniform pars


main( ) {


CGprofile vertexProf, fragmentProf;
//
profiles


vertexProf = cgGLGetLatestProfile(CG_GL_VERTEX);


fragmentProf = cgGLGetLatestProfile(CG_GL_FRAGMENT);



cgGLEnableProfile(vertexProf);


cgGLEnableProfile(fragmentProf);





CGcontext shaderContext = cgCreateContext();

Vertex program loading

CGprogram vertexProg
ram

= cgCreateProgramFromFile(








shaderContext,



CG_SOURCE,








"vertex.cg",



vertexProf,








NULL, NULL);


cgGLLoadProgram(vertexProg
ram
);


//
upload to the GPU

cgGLBindProgram(vertexProgram);


//
this program is to run






// vertex program uniform parameters

Light
pos

= DefineCGParameter(VertexProgram, "lightcam");

Fragment program
loading

CGprogram fragmentProgram = cgCreateProgramFromFile(








shaderContext,


CG_SOURCE,









"fragment.cg",


fragmentProf,








NULL, NULL);


cgGLLoadProgram(
fragmentProgram
);

//
upload to the GPU

cgGLBindProgram(fragmentProgram);

//
this program is to run






//
fragment

program uniform parameters

Shin
e

= DefineCGParameter(
fragment
Program, "shininess");

Kd = DefineCGParameter(
fragment
Program, "kd");

Ks = DefineCGParameter(
fragment
Program, "ks");


… OpenGL initialization

CPU program
-

OpenGL display

void Display( ) {





// state (
uniform
)

parameter setting


glLoadIdentity();


gluLookAt(0, 0,
-
10, 0, 0, 0, 0, 1, 0);


glRotatef(angle, 0, 1, 0);






//
uniform
parameter setting


cgGLSetParameter3f(Light
pos
, 10,
2
0
,

30
);


cgGLSetParameter1f(Shin
e
, 40);


cgGLSetParameter3f(Kd, 1, 0.8, 0.2);


cgGLSetParameter3f(Ks, 2, 2, 2);






//
n
on

uniform param
eters


glBegin( GL_TRIANGLES );


for
( … ) {


glNormal3f(nx, ny, nz);


//
NORMAL register


glVertex3f(x, y, z);


//
POSITION register


}


glEnd();

}

Phong shading
: vertex shader

struct outs {

float4 hposition

: POSITION;




float3 normal

: TEXCOORD0;




float3 view


: TEXCOORD1;




float3 light



: TEXCOORD2;

};

outs main(



in
float4 position


: POSITION;


in
float4 normal


: NORMAL;


uniform float4x4 M
VP


: state.matrix.mvp,


uniform float4x4 M
V


: state.matrix.modelview,


uniform float4x4 M
VIT

: state.matrix.modelview.invtrans,


uniform float3 lightcam


)

{



outs OUT;


OUT.hposition = mul(M
VP
, IN.position);


float3 poscam = mul(M
V
, IN.position).xyz;


OUT.normal = mul(M
VIT
, IN.normal).xyz;


OUT.light = lightcam
-

poscam;


OUT.view =
-
poscam;




return OUT;

}

Vertex

Shader

N

L

V

Phong shading
:
fragment

shader

float3 main(

in
float3 normal

: TEXCOORD0,



in
float3 view

: TEXCOORD1,



in
float3 light

: TEXCOORD2,



uniform float shininess,



uniform float3 kd,



uniform float3 ks

)

: COLOR


{


normal = normalize(normal);


view = normalize(view);


light = normalize(light);




float3 half = normalize(view + light);


float3 color =

kd * saturate(dot(normal, light)) +


ks * pow( saturate(dot(normal,

half)), shininess );


return color;

}

fragment

shader

Example 2
:

Refraction

Example 2
:

Refraction

Result

Refraction computation




CPU

program





Vertex
shader




Pixel

shader



Po
sition

Norm
al


Transform
s

Index of refraction

Transf.
pos

Refraction

direction

Environment map id


Rasterization

Interpolation



Env.Map

texels


Interpolated

Refraction

direction

Env.map

lookup

Refraction
: vertex shader

struct outs {

float4 hPosition : POSITION;



float3 refract
dir

: TEXCOORD0;};



outs main(

in float4
p
osition : POSITION,



in float4
n
ormal

: NORMAL,



uniform float4x4 M
VP
,



uniform float4x4 M
V
,



uniform float4x4 M
VIT
,



uniform float n


) {




outs OUT;


OUT.hPosition = mul(M
VP
,
p
osition);



float3 view = normalize( mul(M
V
,
p
osition).xyz );



float3 norm
cam

= normalize( mul(M
VIT
,
n
ormal).xyz );



OUT.refract
dir

= refract(view, norm
cam
, n);


return OUT;

}

Vertex

Shader

Refraction
:
fragment

shader

float
3

main(

in float3 refract
dir

: TEXCOORD0,



uniform samplerCUBE envMap )

: COLOR

{


return texCUBE(envMap, refract
dir
).rgb;

}

fragment

shader

Pixel

color

Keyframe character animation

Mesh
morphing
:

t= 0

t= 1

Two enclosing
keys

Time
:
t

vertices

Linear
interpolation of
the vertices

Mesh deform
ation

Bone animation

Complete animation

Example 3:

Bone animation

rigid and smooth
binding

Rigid Smooth

Smooth binding
: vertex shader

outputs main(in float4 pos

: POSITION
,




in float4 ind
ices : COLOR0
,




in float4 weights

: NORMAL
,


uniform float4x4
MVP
,


uniform float3x4 bones[30] ) {




outs OUT;


float4

tpos = float4(
0, 0, 0, 0)
;




for (float i = 0; i < 4; i++) {


t
pos += weight
s
.x * mul(bones[ind
ices
.x], pos);


ind
ices

= ind
ices
.yzwx;


weight
s

= weight
s
.yzwx;


}


OUT.hPosition = mul(
MVP
,
t
pos);


return OUT;

}

Stream processing


Proc. 1



Proc. 2



Elements are processed INDEPENDENTLY


Pipelining


Parallelization


No internal storages

Stream processor types

Map

Amplify

Reduce

Sum

GPGPU stream programming

Clipping

Triangle setup +
rasterization
+

Linear interpolation

Compositing

Texture memory

Vertex

Shader

Pixel

Shader

Mapping:

Change of stream element data

Mapping


Framebuffer



CPU


Vertices + properties:

Input stream of elements 13 x 4 floats

Conditional reduction

Amplification

Sum + min + reduction

Input/Output and coupling


Input


stream of vertices and properties


Texture memory


Output


Frame buffer


Texture memory

feedback

Mapping algorithms onto the GPU

Problem 1

Globals globals;

for(int i = 0; i < N; i++) {


oarray[i] = Computation( iarray[i], globals );

}

2D array (texture) is available :




u = (float)(i / M) / M;


v = (float)(i % M) / M;


oarray[u][v] = Computation( iarray[u][v], globals );

Globals are uniform parameters


Output array goes to a texture or to the frame buffer


Input array is either a texture or vertex data



Solution 1: Input array is vertex data

Globals globals;

for(int i = 0; i < N; i++) {


oarray[i] = Computation( iarray[i], globals );

}

CPU program:


GlobalPar

= DefineCGParameter(
vertex
Pro
g
, “
globals
");

cgGLSetParameter
4
f(
GlobalPar
, 10,
2
0
,

30, 40
);


glViewport(0, 0, M, M);

glBegin(GL_POINTS);

for(int i = 0; i < N; i++) { // M * M > N


float x = (float)(i / M) / M * 2
-

1; //
-
1..1


float y = (float)(i % M) / M * 2
-

1; //
-
1..1


glColor4fv( &iarray[i] );


glVertex2f(x, y); // POSITION

}

glEnd( );

void main( in float2 index : POSITION,


in float4 iarray : COLOR0,


out float4 hpos : POSITION,


out float4 oarray : TEXCOORD0,


uniform float4 globals ) {


hpos = float4(index, 0, 1);


oarray = Computation( iarray, globals );

}

Solution 1: Vertex shader computing

Globals globals;

for(int i = 0; i < N; i++) {


oarray[i] = Computation( iarray[i], globals );

}

Vertex shader

float4 main( in float4 oarray : TEXCOORD0 ) : COLOR {


return oarray;

}

Fragment


shader

Solution 2: Fragment shader computing

Globals globals;

for(int i = 0; i < N; i++) {


oarray[i] = Computation( iarray[i], globals );

}

Vertex shader

float4 main( in float4 iarray : TEXCOORD0,


uniform float4 globals ) : COLOR {


return Computation( iarray, globals );

}

Fragment


shader

void main( in float2 index : POSITION,


in float4 iarray : COLOR0,


out float4 hpos : POSITION,


out float4 array : TEXCOORD0) {


hpos = float4(index, 0, 1);


array = iarray;

}

Solution 3: Input array is in texture

Globals globals;

for(int i = 0; i < N; i++) {


oarray[i] = Computation( iarray[i], globals );

}

CPU program:


glViewport(0, 0, M, M);

cgGLSetParameter
4
f(
GlobalPar
, 10,
2
0
,

30, 40
);


glBegin(GL_QUADS);


glTexCoord2f(0, 0); glVertex2f(
-
1,
-
1);


glTexCoord2f(0, 1); glVertex2f(
-
1, 1);


glTexCoord2f(1, 1); glVertex2f( 1, 1);


glTexCoord2f(1, 0); glVertex2f( 1,
-
1);

glEnd( );

Solution 3: Input array is in texture

Globals globals;

for(int i = 0; i < N; i++) {


oarray[i] = Computation( iarray[i], globals );

}

Vertex

shader

float4 main( in float4 iindex : TEXCOORD0,


uniform float4 globals,


uniform sampler2D iarraytex ) : COLOR {


float4 iarray = tex2D(iarraytex, iindex);


return Computation( iarray, globals );

}

Fragment


shader

void main( in float2 oindex : POSITION,


in float2 iindex : TEXCOORD0,


out float4 hpos : POSITION,


out float2 index : TEXCOORD0 ) {


hpos = float4(oindex, 0, 1);


index = iindex;

}

Problem 2

Globals globals;

for(int i = 0; i < N; i++) {


int j = IarrayIdx( iarray, i, globals);



oarray[i] = Computation( iarray[j], globals );

}

Vertex

shader

float4 main( in float4 iindex : TEXCOORD0,


uniform float4 globals,


uniform sampler2D iarraytex ) : COLOR {


float2 j = IarrayIdx(iarraytex, iindex, globals);


float4 iarray = tex2D(iarraytex, j);


return Computation( iarray, globals );

}

Fragment


shader

void main( in float2 oindex : POSITION,


in float2 iindex : TEXCOORD0,


out float4 hpos : POSITION,


out float2 index : TEXCOORD0 ) {


hpos = float4(oindex, 0, 1);


index = iindex;

}

Problem 3

Globals globals1, globals2;

for(int i = 0; i < N; i++) {


int j = OarrayIdx(i, globals1);



oarray[j] = Computation( iarray[i], globals2 );

}

Vertex

shader

float4 main( in float4 iindex : TEXCOORD0,


uniform float4 globals2,


uniform sampler2D iarraytex ) : COLOR {


float4 iarray = tex2D(iarraytex, iindex);


return Computation( iarray, globals2 );

}

Fragment


shader

void main( in float2 oindex : POSITION,


in float2 iindex : TEXCOORD0,


out float4 hpos : POSITION,


out float2 index : TEXCOORD0,


uniform float4 globals1 ) {


float2 newoindex = OarrayIdx(iindex, globals1);


hpos = float4(newoindex * 2 - float2(1,1), 0, 1);


index = iindex;

}

Other problems

Globals globals;

float sum = 0;

for(int i = 0; i < N; i++) {


sum += Computation( iarray[i] );

}

Globals globals;

float min = MAX;

for(int i = 0; i < N; i++) {


c = Computation( iarray[i] );


if (min > c) min = c;

}

Ray tracing on the GPU

Ray tracing:


for each ray do


t = infinity


for each triangle do


tnew = Intersect(triangle, ray)


if (tnew < t) t = tnew


endfor


hit[ray] = ray.o + ray.dir * t

endfor

Problems:



two loops - all elements with all elements



t

is a global variable

Input stream of

geometry

Textures

z-buffer

Ray engine

Input texture: rays

Combination:

a triangle with

each ray

pixels

A
)

a triangle is a point,


and pixel shader loops

B
)

a

triangle is a


full screen quad, pixel shader


intersects a triangle with a ray

Input stream: triangles

Output texture: hits

Ray engine




CPU

program





Vertex
shader




Pixel

shader




“Triangles”

as full screen

quads

“Triangles”

as full screen

quads

Ray texture ids


Rasterization

Interpolation



Rays in

Texture
maps


Intersection

between one

triangle and

a ray

Triangles

as many times

as pixels

the quad

has

CPU: triangles as full screen quads

Triangle triang[ntriangles];


void Display( ) {


...


glBegin( GL_QUADS );


for (int i = 0; i < ntriangles; i++) {


glMultiTexCoord3fARB(GL_TEXTURE1_ARB, // TEXCOORD1
                     triang[i].v1.x, triang[i].v1.y, triang[i].v1.z);

glMultiTexCoord3fARB(GL_TEXTURE2_ARB, // TEXCOORD2
                     triang[i].v2.x, triang[i].v2.y, triang[i].v2.z);

glMultiTexCoord3fARB(GL_TEXTURE3_ARB, // TEXCOORD3
                     triang[i].v3.x, triang[i].v3.y, triang[i].v3.z);



glTexCoord2f(0,0); glVertex3f(
-
1,
-
1,0); // TEXCOORD0,
POSITION


glTexCoord2f(0,1); glVertex3f(
-
1, 1,0); // TEXCOORD0,
POSITION


glTexCoord2f(1,1); glVertex3f( 1, 1,0); // TEXCOORD0,
POSITION


glTexCoord2f(1,0); glVertex3f( 1,
-
1,0);

// TEXCOORD0,
POSITION



}


glEnd();

}

Vertex shader does “nothing”

struct outs {


float3 hposition : POSITION,


float2 rayuv : TEXCOORD0,


float3 r1 : TEXCOORD1,


float3 r2 : TEXCOORD2,


float3 r3 : TEXCOORD3

};


outs main( in float3 position : POSITION,




in float2 rayuv : TEXCOORD0,




in float3 r1 : TEXCOORD1,




in float3 r2 : TEXCOORD2,




in float3 r3 : TEXCOORD3 ) {


outs OUT;


OUT.r1 = IN.r1; OUT.r2 = IN.r2; OUT.r3 = IN.r3;


OUT.rayuv = IN.rayuv;


OUT.hposition = float4(IN.position, 1);


return OUT;

}

Triangle-ray intersection

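1. Plane intersection: $\mathbf{p} = \mathbf{rayo} + \mathbf{raydir} \cdot t$, $t > 0$

$(\mathbf{p} - \mathbf{r}_1) \cdot \mathbf{n} = 0$, normal: $\mathbf{n} = (\mathbf{r}_2 - \mathbf{r}_1) \times (\mathbf{r}_3 - \mathbf{r}_1)$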


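2. Is the intersection inside the triangle?

$((\mathbf{r}_2 - \mathbf{r}_1) \times (\mathbf{p} - \mathbf{r}_1)) \cdot \mathbf{n} > 0$

$((\mathbf{r}_3 - \mathbf{r}_2) \times (\mathbf{p} - \mathbf{r}_2)) \cdot \mathbf{n} > 0$

$((\mathbf{r}_1 - \mathbf{r}_3) \times (\mathbf{p} - \mathbf{r}_3)) \cdot \mathbf{n} > 0$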

r1

r1

r2

p

r3

$t = \dfrac{(\mathbf{r}_1 - \mathbf{rayo}) \cdot \mathbf{n}}{\mathbf{raydir} \cdot \mathbf{n}}$

Pixel shader: ray-triangle intersection

void main(in float2 rayuv : TEXCOORD0, // ray index


in float3 r1 : TEXCOORD1, // vertex 1


in float3 r2 : TEXCOORD2, // vertex 2


in float3 r3 : TEXCOORD3, // vertex 3 of triang



out float3 p : COLOR, // hit point to texture


out float t : DEPTH, // z buffer finds the min



uniform sampler2D rayo
rg
s, // array of rays


uniform sampler2D raydirs,


uniform float maxdepth) {



float3 rayo = tex2D(rayo
rg
s, rayuv); // ray pars


float3 raydir = tex2D(raydirs, rayuv);


float3 normal = cross(r2 - r1, r3 - r1);


t = dot(r1 - rayo, normal) / dot(raydir, normal);


p = rayo + raydir * t;


if (dot(cross(r2 - r1, p - r1), normal) < 0 ||
    dot(cross(r3 - r2, p - r2), normal) < 0 ||
    dot(cross(r1 - r3, p - r3), normal) < 0) t = 2; // ignore


else t /= maxdepth;

}