# Programming the GPU

Software and s/w Development

Dec 13, 2013 (4 years and 10 months ago)

194 views

Programming the GPU

on Cg

Szirmay-Kalos László

email: szirmay@iit.bme.hu

Web: http://www.iit.bme.hu/~szirmay

Hardware

GPU

Frame

buffer

display

CPU

Memory

I/O

Graphics card

Program

OpenGL API

OpenGL API

glLightfv(GL_LIGHT0, GL_DIFFUSE, I);

glMaterialfv( GL_FRONT, GL_DIFFUSE, kd);

glViewport( 0, 0, width, height);

gluLookAt(ex, ey, ez, lax, lay, laz, upx, upy, upz);

glScalef(sx, sy, sz);

glTranslatef(px, py, pz);

glRotatef(ang, axisx, axisy, axisz);

glBegin(GL_TRIANGLES);

glNormal3f(nx1, ny1, nz1);

glColor3f(r1, g1, b1);

glTexCoord2f(u1, v1);

glVertex3f(x1, y1, z1);

glEnd( );

CPU

GPU

State

Uniform

variables

Geometry

Vertex properties

Vertices

PASS

Rendering Pipeline

Virtual world

Camera space,

illumination

Perspective

transformation +

Clipping

+

Homogeneous div.

1.

2.

Viewport transform + Rasterization + interpolation

display

color

depth

MODELVIEW

PROJECTION

Texture mapping

(u1, v1)

(u2, v2)

(u3, v3)

x1,y1,z1

x2,y2,z2

x3,y3,z3

color

Texturing hardware

(u3, v3)

(u1, v1)

(u2, v2)

Linear interpolation:

(u, v)

Texture object in

GPU memory

Why is linear

interpolation our friend?

X

Y

I

I(X,Y) = aX + bY + c

I(X,Y)

I(X+1,Y) = I(X,Y) + a

$(X_1, Y_1, I_1)$

$(X_2, Y_2, I_2)$

$(X_3, Y_3, I_3)$

$I(X,Y)$

X counter

I

register

a

X

S

CLK

GPU hardware architecture

Interface

Transform+

Illumination

Clipping + Hom. division + Viewport transform

Projection + Rasterization + Linear interpolation

Texturing

Compositing (Z-buffer, transparency)

Texture

memory

Early Z-cull

vertices

triangles

fragments

Vertex

Fragment

Why is it fast? Stream processing

Proc 2

Proc 1

Proc 1

Proc
21

Proc
22

Pipelining

Parallelism

Elements are processed INDEPENDENTLY

No internal storage

Parallel execution without synchronization

Clipping: $-w < X < w,\; -w < Y < w,\; -w < Z < w,\; 0 < \text{color} < 1$

State

Transforms

Lightsources

Materials

POSITION, NORMAL, COLOR0, TEXCOORD0,…

glVertex

glNormal

glColor

glTexCoord

glBegin(GL_TRIANGLES)

glEnd( )

POSITION, COLOR0, TEXCOORD0, … for triangle vertices

Homogeneous division: $x = X / w,\; y = Y / w,\; z = Z / w$

POSITION, COLOR0, TEXCOORD0,… for triangle vertices

$\cdot\ \mathrm{MVP}$

$\cdot\ \mathrm{MV}$

$\cdot\ \mathrm{MVIT}$

Illumination

Viewport transform:

xv = center.x + viewsize.x * x / 2

CPU

GPU

Standard

struct ins {
    float4 position : POSITION;   // glVertex
    float3 normal   : NORMAL;     // glNormal
    float4 color    : COLOR0;     // glColor
    float2 texcoord : TEXCOORD0;  // glTexCoord
};

struct outs {
    float4 hposition : POSITION;
    float4 color     : COLOR0;
    float2 texcoord  : TEXCOORD0;
};

outs main( ins IN,
           uniform float4x4 MVP : state.matrix.mvp ) {
    outs OUT;
    OUT.hposition = mul(MVP, IN.position);
    OUT.texcoord = IN.texcoord;
    OUT.color = IN.color;
    return OUT;
}

glDisable(GL_LIGHTING );

Positional light source

outs main( ins IN,
           uniform float4x4 MV,
           uniform float4x4 MVIT,
           uniform float4x4 MVP,
           uniform float3 lightpos,
           uniform float4 Idiff, Iamb, Ispec,
           uniform float4 em, ka, kd, ks,
           uniform float shininess ) {
    outs OUT;
    OUT.hposition = mul(MVP, IN.position);
    float3 N = mul(MVIT, float4(IN.normal, 0)).xyz;
    N = normalize(N);                        // glEnable(GL_NORMALIZE)
    float3 cpos = mul(MV, IN.position).xyz;
    float3 L = normalize(lightpos - cpos);
    float3 V = normalize(-cpos);             // the eye is at the origin of camera space
    float3 H = normalize(L + V);
    OUT.color = em + Iamb * ka +
                Idiff * kd * saturate(dot(N, L)) +
                Ispec * ks * pow(saturate(dot(N, H)), shininess);
    return OUT;
}

glEnable(GL_LIGHTING );

N

L

V

Fragment

State

Texture id,

texturing
environment

POSITION, COLOR0, TEXCOORD0,… for triangle vertices

POSITION, COLOR

Compositing: blending, z-buffering

Projection, Rasterization and linear interpolation

Fragment

Texturing: tex2D(u,v)*color0

POSITION, COLOR0, TEXCOORD0 for fragments

Texture memory

Frame buffer

Z-cull

Standard

glDisable(GL_TEXTURE_2D);

glEnable(GL_TEXTURE_2D);

with GL_REPLACE mode

float4 main( in float4 color : COLOR0 ) : COLOR
{
    return color;
}

float4 main( in float2 texcoord : TEXCOORD0,
             in float4 color : COLOR0,
             uniform sampler2D texture_map ) : COLOR
{
    return tex2D(texture_map, texcoord);
}

What can we do with it?

General BRDF models

Spec. transformations, smooth binding

Waving, procedural animation

Fragment

bump/parallax/displacement/reflection

mapping

Both:

General purpose computation

ambient

diffuse

specular

Gouraud

versus Phong

Gouraud

Phong

Phong

Gouraud

CPU

program

Vertex

Pixel

Position

Normal

Transformations

Materials

Lights

Transformed position

Color

Rasterization

Interpolation

Illumination

Interpolated

color

CPU

program

Vertex

Pixel

Position

Normal

Transformations

Light position

Transf.position

Transf.normal

View

Light

Rasterization

Interpolation

Illumination

Interpolated

Normal

View

Light

Materials

Light

intensity

Programs

.cpp CPU program:

Capability query of the GPU (profile)

Vertex/fragment program

load from file and compile: CREATE

Selection of the current Vertex/fragment program: BIND

Uniform vertex/fragment variable definition

Uniform vertex/fragment variable setting

Non-uniform variables set (glVertex, glColor, glTexCoord…)

.cg vertex program

Fragment program’s non-uniform variables + homogeneous position

.cg fragment program

Color
output

Initialization

Display

CPU program - Initialization

#include <Cg/cgGL.h>   // cg functions

CGparameter Lightpos, Shine, Ks, Kd;   // uniform pars

main( ) {

CGprofile vertexProf, fragmentProf;   // profiles

vertexProf = cgGLGetLatestProfile(CG_GL_VERTEX);

fragmentProf = cgGLGetLatestProfile(CG_GL_FRAGMENT);

cgGLEnableProfile(vertexProf);

cgGLEnableProfile(fragmentProf);

CGprogram vertexProgram = cgCreateProgramFromFile(
    CG_SOURCE, "vertex.cg", vertexProf, NULL, NULL);

cgGLLoadProgram(vertexProgram);   // load the compiled program to the GPU

cgGLBindProgram(vertexProgram);   // this program is to run

// vertex program uniform parameters

Lightpos = DefineCGParameter(vertexProgram, "lightcam");

Fragment program

CGprogram fragmentProgram = cgCreateProgramFromFile(
    CG_SOURCE, "fragment.cg", fragmentProf, NULL, NULL);

cgGLLoadProgram(fragmentProgram);   // load the compiled program to the GPU

cgGLBindProgram(fragmentProgram);   // this program is to run

// fragment program uniform parameters

Shine = DefineCGParameter(fragmentProgram, "shininess");

Kd = DefineCGParameter(fragmentProgram, "kd");

Ks = DefineCGParameter(fragmentProgram, "ks");

… OpenGL initialization

CPU program - OpenGL display

void Display( ) {
    // state (uniform) parameter setting
    gluLookAt(0, 0, -10, 0, 0, 0, 0, 1, 0);
    glRotatef(angle, 0, 1, 0);

    // uniform parameter setting
    cgGLSetParameter3f(Lightpos, 10, 20, 30);
    cgGLSetParameter1f(Shine, 40);
    cgGLSetParameter3f(Kd, 1, 0.8, 0.2);
    cgGLSetParameter3f(Ks, 2, 2, 2);

    // non-uniform parameters
    glBegin( GL_TRIANGLES );
    for ( … ) {
        glNormal3f(nx, ny, nz);   // NORMAL register
        glVertex3f(x, y, z);      // POSITION register
    }
    glEnd();
}

struct outs {

float4 hposition

: POSITION;

float3 normal

: TEXCOORD0;

float3 view

: TEXCOORD1;

float3 light

: TEXCOORD2;

};

outs main(
    in float4 position : POSITION,
    in float4 normal   : NORMAL,
    uniform float4x4 MVP  : state.matrix.mvp,
    uniform float4x4 MV   : state.matrix.modelview,
    uniform float4x4 MVIT : state.matrix.modelview.invtrans,
    uniform float3 lightcam
)
{
    outs OUT;
    OUT.hposition = mul(MVP, position);
    float3 poscam = mul(MV, position).xyz;
    OUT.normal = mul(MVIT, normal).xyz;
    OUT.light = lightcam - poscam;
    OUT.view = -poscam;
    return OUT;
}

Vertex

N

L

V

:
fragment

float3 main(
    in float3 normal : TEXCOORD0,
    in float3 view   : TEXCOORD1,
    in float3 light  : TEXCOORD2,
    uniform float shininess,
    uniform float3 kd,
    uniform float3 ks
) : COLOR
{
    normal = normalize(normal);
    view = normalize(view);
    light = normalize(light);
    float3 halfway = normalize(view + light);   // "half" is a Cg type name, so it cannot be a variable
    float3 color =
        kd * saturate(dot(normal, light)) +
        ks * pow( saturate(dot(normal, halfway)), shininess );
    return color;
}

fragment

Example 2: Refraction

Example 2: Refraction

Result

Refraction computation

CPU

program

Vertex

Pixel

Position

Normal

Transforms

Index of refraction

Transf. pos

Refraction

direction

Environment map id

Rasterization

Interpolation

Env.Map

texels

Interpolated

Refraction

direction

Env.map

lookup

Refraction

struct outs {
    float4 hPosition  : POSITION;
    float3 refractdir : TEXCOORD0;
};

outs main( in float4 position : POSITION,
           in float4 normal : NORMAL,
           uniform float4x4 MVP,
           uniform float4x4 MV,
           uniform float4x4 MVIT,
           uniform float n ) {
    outs OUT;
    OUT.hPosition = mul(MVP, position);
    float3 view = normalize( mul(MV, position).xyz );
    float3 normcam = normalize( mul(MVIT, normal).xyz );
    OUT.refractdir = refract(view, normcam, n);
    return OUT;
}

Vertex

Refraction
:
fragment

float3 main( in float3 refractdir : TEXCOORD0,
             uniform samplerCUBE envMap ) : COLOR
{
    return texCUBE(envMap, refractdir).rgb;
}

fragment

Pixel

color

Keyframe character animation

Mesh morphing:

t= 0

t= 1

Two enclosing keys

Time: t

vertices

Linear interpolation of the vertices

Mesh deformation

Bone animation

Complete animation

Example 3:

Bone animation

rigid and smooth
binding

Rigid Smooth

Smooth binding

outs main( in float4 pos : POSITION,
           in float4 indices : COLOR0,
           in float4 weights : NORMAL,
           uniform float4x4 MVP,
           uniform float3x4 bones[30] ) {
    outs OUT;
    float4 tpos = float4(0, 0, 0, 0);
    for (float i = 0; i < 4; i++) {
        tpos += weights.x * mul(bones[indices.x], pos);
        indices = indices.yzwx;
        weights = weights.yzwx;
    }
    OUT.hPosition = mul(MVP, tpos);
    return OUT;
}

Stream processing

Proc. 1

Proc. 2

Elements are processed INDEPENDENTLY

Pipelining

Parallelization

No internal storages

Stream processor types

Map

Amplify

Reduce

Sum

GPGPU stream programming

Clipping

Triangle setup + rasterization + Linear interpolation

Compositing

Texture memory

Vertex

Pixel

Mapping:

Change of stream element data

Mapping

Framebuffer

CPU

Vertices + properties:

Input stream of elements 13 x 4 floats

Conditional reduction

Amplification

Sum + min + reduction

Input/Output and coupling

Input

stream of vertices and properties

Texture memory

Output

Frame buffer

Texture memory

feedback

Mapping algorithms onto the GPU

Problem 1

Globals globals;

for(int i = 0; i < N; i++) {

oarray[i] = Computation( iarray[i], globals );

}

2D array (texture) is available :

u = (float)(i / M) / M;

v = (float)(i % M) / M;

oarray[u][v] = Computation( iarray[u][v], globals );

Globals are uniform parameters

Output array goes to a texture or to the frame buffer

Input array is either a texture or vertex data

Solution 1: Input array is vertex data

Globals globals;

for(int i = 0; i < N; i++) {

oarray[i] = Computation( iarray[i], globals );

}

CPU program:

GlobalPar = DefineCGParameter(vertexProg, "globals");

cgGLSetParameter4f(GlobalPar, 10, 20, 30, 40);

glViewport(0, 0, M, M);

glBegin(GL_POINTS);

for(int i = 0; i < N; i++) { // M * M > N

float x = (float)(i / M) / M * 2 - 1; // -1..1

float y = (float)(i % M) / M * 2 - 1; // -1..1

glColor4fv( &iarray[i] );

glVertex2f(x, y); // POSITION

}

glEnd( );

void main( in float2 index : POSITION,

in float4 iarray : COLOR0,

out float4 hpos : POSITION,

out float4 oarray : TEXCOORD0,

uniform float4 globals ) {

hpos = float4(index, 0, 1);

oarray = Computation( iarray, globals );

}

Globals globals;

for(int i = 0; i < N; i++) {

oarray[i] = Computation( iarray[i], globals );

}

float4 main( in float4 oarray : TEXCOORD0 ) : COLOR {

return oarray;

}

Fragment

Globals globals;

for(int i = 0; i < N; i++) {

oarray[i] = Computation( iarray[i], globals );

}

float4 main( in float4 iarray : TEXCOORD0,

uniform float4 globals ) : COLOR {

return Computation( iarray, globals );

}

Fragment

void main( in float2 index : POSITION,

in float4 iarray : COLOR0,

out float4 hpos : POSITION,

out float4 array : TEXCOORD0) {

hpos = float4(index, 0, 1);

array = iarray;

}

Solution 3: Input array is in texture

Globals globals;

for(int i = 0; i < N; i++) {

oarray[i] = Computation( iarray[i], globals );

}

CPU program:

glViewport(0, 0, M, M);

cgGLSetParameter4f(GlobalPar, 10, 20, 30, 40);

glBegin(GL_QUADS);

glTexCoord2f(0, 0); glVertex2f(-1, -1);

glTexCoord2f(0, 1); glVertex2f(-1, 1);

glTexCoord2f(1, 1); glVertex2f( 1, 1);

glTexCoord2f(1, 0); glVertex2f( 1, -1);

glEnd( );

Solution 3: Input array is in texture

Globals globals;

for(int i = 0; i < N; i++) {

oarray[i] = Computation( iarray[i], globals );

}

Vertex

float4 main( in float4 iindex : TEXCOORD0,

uniform float4 globals,

uniform sampler2D iarraytex ) : COLOR {

float4 iarray = tex2D(iarraytex, iindex);

return Computation( iarray, globals );

}

Fragment

void main( in float2 oindex : POSITION,

in float2 iindex : TEXCOORD0,

out float4 hpos : POSITION,

out float2 index : TEXCOORD0 ) {

hpos = float4(oindex, 0, 1);

index = iindex;

}

Problem 2

Globals globals;

for(int i = 0; i < N; i++) {

int j = IarrayIdx( iarray, i, globals);

oarray[i] = Computation( iarray[j], globals );

}

Vertex

float4 main( in float4 iindex : TEXCOORD0,

uniform float4 globals,

uniform sampler2D iarraytex ) : COLOR {

float2 j = IarrayIdx(iarraytex, iindex, globals);

float4 iarray = tex2D(iarraytex, j);

return Computation( iarray, globals );

}

Fragment

void main( in float2 oindex : POSITION,

in float2 iindex : TEXCOORD0,

out float4 hpos : POSITION,

out float2 index : TEXCOORD0 ) {

hpos = float4(oindex, 0, 1);

index = iindex;

}

Problem 3

Globals globals1, globals2;

for(int i = 0; i < N; i++) {

int j = OarrayIdx(i, globals1);

oarray[j] = Computation( iarray[i], globals2 );

}

Vertex

float4 main( in float4 iindex : TEXCOORD0,

uniform float4 globals2,

uniform sampler2D iarraytex ) : COLOR {

float4 iarray = tex2D(iarraytex, iindex);

return Computation( iarray, globals2 );

}

Fragment

void main( in float2 oindex : POSITION,

in float2 iindex : TEXCOORD0,

out float4 hpos : POSITION,

out float2 index : TEXCOORD0,

uniform float4 globals1 ) {

float2 newoindex = OarrayIdx(iindex, globals1);

hpos = float4(newoindex * 2 - float2(1,1), 0, 1);

index = iindex;

}

Other problems

Globals globals;

float sum = 0;

for(int i = 0; i < N; i++) {

sum += Computation( iarray[i] );

}

Globals globals;

float min = MAX;

for(int i = 0; i < N; i++) {

c = Computation( iarray[i] );

if (min > c) min = c;

}

Ray tracing on the GPU

Ray tracing:

for each ray do

t = infinity

for each triangle do

tnew = Intersect(triangle, ray)

if (tnew < t) t = tnew

endfor

hit[ray] = ray.o + ray.dir * t

endfor

Problems:

two loops - all elements with all elements

t is a global variable

Input stream of

geometry

Textures

z-buffer

Ray engine

Input texture: rays

Combination:

a triangle with

each ray

pixels

A
)

a triangle is a point,

B
)

a

triangle is a

intersects a triangle with a ray

Input stream: triangles

Output texture: hits

Ray engine

CPU

program

Vertex

Pixel

“Triangles”

as full screen

“Triangles”

as full screen

Ray texture ids

Rasterization

Interpolation

Rays in

Texture
maps

Intersection

between one

triangle and

a ray

Triangles

as many times

as pixels

has

CPU: triangles as full screen quads

Triangle triang[ntriangles];

void Display( ) {

...

for (int i = 0; i < ntriangles; i++) {

glMultiTexCoord3fARB(GL_TEXTURE1_ARB, // TEXCOORD1
    triang[i].v1.x, triang[i].v1.y, triang[i].v1.z);

glMultiTexCoord3fARB(GL_TEXTURE2_ARB, // TEXCOORD2
    triang[i].v2.x, triang[i].v2.y, triang[i].v2.z);

glMultiTexCoord3fARB(GL_TEXTURE3_ARB, // TEXCOORD3
    triang[i].v3.x, triang[i].v3.y, triang[i].v3.z);

glTexCoord2f(0,0); glVertex3f(-1, -1, 0); // TEXCOORD0, POSITION

glTexCoord2f(0,1); glVertex3f(-1, 1, 0); // TEXCOORD0, POSITION

glTexCoord2f(1,1); glVertex3f( 1, 1, 0); // TEXCOORD0, POSITION

glTexCoord2f(1,0); glVertex3f( 1, -1, 0); // TEXCOORD0, POSITION

}

glEnd();

}

struct outs {
    float4 hposition : POSITION;
    float2 rayuv : TEXCOORD0;
    float3 r1 : TEXCOORD1;
    float3 r2 : TEXCOORD2;
    float3 r3 : TEXCOORD3;
};

outs main( in float3 position : POSITION,
           in float2 rayuv : TEXCOORD0,
           in float3 r1 : TEXCOORD1,
           in float3 r2 : TEXCOORD2,
           in float3 r3 : TEXCOORD3 ) {
    outs OUT;
    OUT.r1 = r1; OUT.r2 = r2; OUT.r3 = r3;
    OUT.rayuv = rayuv;
    OUT.hposition = float4(position, 1);
    return OUT;
}

Triangle-ray intersection

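1. Plane intersection: $\mathbf{p} = \mathbf{rayo} + \mathbf{raydir}\cdot t,\quad t > 0$

$(\mathbf{p} - \mathbf{r}_1)\cdot\mathbf{n} = 0$, normal: $\mathbf{n} = (\mathbf{r}_2 - \mathbf{r}_1)\times(\mathbf{r}_3 - \mathbf{r}_1)$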

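2. Is the intersection inside the triangle?

$((\mathbf{r}_2 - \mathbf{r}_1)\times(\mathbf{p} - \mathbf{r}_1))\cdot\mathbf{n} > 0$

$((\mathbf{r}_3 - \mathbf{r}_2)\times(\mathbf{p} - \mathbf{r}_2))\cdot\mathbf{n} > 0$

$((\mathbf{r}_1 - \mathbf{r}_3)\times(\mathbf{p} - \mathbf{r}_3))\cdot\mathbf{n} > 0$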

r1

r1

r2

p

r3

$t = \dfrac{(\mathbf{r}_1 - \mathbf{rayo})\cdot\mathbf{n}}{\mathbf{raydir}\cdot\mathbf{n}}$

Ray-triangle intersection

void main( in float2 rayuv : TEXCOORD0,   // ray index
           in float3 r1 : TEXCOORD1,      // vertex 1
           in float3 r2 : TEXCOORD2,      // vertex 2
           in float3 r3 : TEXCOORD3,      // vertex 3 of triang
           out float3 p : COLOR,          // hit point to texture
           out float t : DEPTH,           // z buffer finds the min
           uniform sampler2D rayorgs,     // array of rays
           uniform sampler2D raydirs,
           uniform float maxdepth ) {
    float3 rayo = tex2D(rayorgs, rayuv);   // ray pars
    float3 raydir = tex2D(raydirs, rayuv);
    float3 normal = cross(r2 - r1, r3 - r1);
    t = dot(r1 - rayo, normal) / dot(raydir, normal);
    p = rayo + raydir * t;
    if (dot(cross(r2 - r1, p - r1), normal) < 0 ||
        dot(cross(r3 - r2, p - r2), normal) < 0 ||
        dot(cross(r1 - r3, p - r3), normal) < 0) t = 2;   // ignore
    else t /= maxdepth;
}