# Mini Tutorial - UCLA

Biotechnology

Oct 1, 2013 (4 years and 7 months ago)

113 views

1

Mini tutorial for the randomGLM R package

Lin Song, Steve Horvath

In this mini tutorial, we briefly show how to fit an RGLM predictor. We illustrate it
using the brain cancer gene expression data used in [1].

1. Binary outcome prediction

library(randomGLM)

data(mini)

# training data set whose columns contain the features (here genes)

x = mini$x

# Since some of the column names are not unique we change them

colnames(x) = make.names(colnames(x), unique = TRUE)

# outcome of the training data set

y = mini$yB

table(y)

Output:

y

0 1

28 27

# test data set whose columns must equal those of the training data set

xtest = mini$xtest

# outcome of the test data set

ytest = mini$yBtest

table(ytest)

Output:

ytest

0 1

33 32

# Fit the RGLM predictor assuming that your computer has only 1 core (nThreads=1).
# Here we use the default values of all RGLM parameters. To learn how to choose
# RGLM parameters, consider the tutorial RGLMparameterTuningTutorial.docx which is
# posted on our webpage.

http://labs.genetics.ucla.edu/horvath/RGLM/

RGLM = randomGLM(x, y, xtest, classify=TRUE, keepModels=TRUE, nThreads=1)

# out-of-bag prediction on the training data at the level of the response

predictedOOB = RGLM$predictedOOB

table(y, predictedOOB)

Output:

predictedOOB

y 0 1

0 24 4

1 6 21

Message: The OOB prediction is wrong for 4+6=10 observations.

Thus, the OOB estimate of the error rate is (4+6)/length(y)= 0.181818

The OOB estimate of the accuracy is 1-0.181818=0.818181

# This is the test set prediction

predictedTest = RGLM$predictedTest

2

table(ytest, predictedTest)

Output:

predictedTest

ytest 0 1

0 28 5

1 3 29

Message: The test set prediction is wrong for 3+5=8 observations.

Thus, the test set error rate is (3+5)/length(ytest)= 0.1230769

The test set estimate of the accuracy is 1-0.1230769=0.8769231

# Class probabilities in the test set.

predictedTest.response = RGLM$predictedTest.response

predictedTest.response

Output:

0 1

dat65_21484_21474 4.206930e-01 0.57930700

dat65_21486_21475 2.898402e-01 0.71015982

dat65_21488_21476 3.814986e-01 0.61850144

dat65_21490_21477 6.999995e-02 0.93000005

..............ETC

Message:

For each test set observation (rows) the output reports the probability of being
class 0 or class 1. By thresholding these values, one obtains the predicted class
outcome reported in RGLM$predictedTest. To choose a different threshold in the
randomGLM function, consider the RGLM parameter thresholdClassProb (default
value 0.5).

# variable importance measures

varImp = RGLM$timesSelectedByForwardRegression

# Create a data frame that reports the variable importance measure of each feature.

datvarImp=data.frame(Feature=as.character(dimnames(RGLM$timesSelectedByForwardRegression)[[2]]),
timesSelectedByForwardRegression=as.numeric(RGLM$timesSelectedByForwardRegression))

# Report the 20 most significant features

datvarImp[rank(-datvarImp[,2],ties.method="first")<=20,]

Output:

Feature timesSelectedByForwardRegression

214 200839_s_at 10

299 200986_at 3

452 201280_s_at 4
973 202291_s_at 9

1141 202625_at 7

1224 202803_s_at 5

1285 202953_at 3

1711 204174_at 3

1860 204829_s_at 4

1903 205105_at 3

2210 208306_x_at 6

2631 209619_at 5

3000 212203_x_at 4

3622 215049_x_at 4

3781 217362_x_at 9

4134 218217_at 5

4145 218232_at 9

4607 220856_x_at 5

3

4626 220980_s_at 5

4914 38487_at 5

# Barplot of the importance measures for the 20 most important features

datVarImpSelected=datvarImp[rank(-datvarImp[,2],ties.method="first")<=20, ]

par(mfrow=c(1,1), mar=c(4,8,3,1))

barplot(datVarImpSelected[,2],horiz=TRUE,names.arg=datVarImpSelected[,1],
xlab="Feature Importance",las=2,main="Most significant features for the RGLM",
cex.axis=1,cex.main=1.5,cex.lab=1.5)

# indices of features selected into the model in bag 1

RGLM$featuresInForwardRegression[[1]]

Output

X200660_at X202291_s_at X212145_at

Feature.1 119 973 2974

# Model coefficients in bag 1.

coef(RGLM$models[[1]])

Output

(Intercept) X200660_at X202291_s_at X212145_at

7979.738216 2.002009 5.220940 -36.515803

2. Quantitative, continuous outcome prediction

library(randomGLM)

# Load the data (they are part of the randomGLM package).

data(mini)

4

# prepare
the training
data

x = mini$x

# Since some of the column names are not unique we change them

colnames(x) = make.names(colnames(x), unique = TRUE)

y = mini$yC

# prepare the test data

xtest = mini$xtest

colnames(xtest) = make.names(colnames(xtest), unique = TRUE)

ytest = mini$yCtest

# Fit the RGLM predictor

RGLM = randomGLM(x, y, xtest, classify=FALSE, keepModels=TRUE)

# out-of-bag prediction at the level of response

predictedOOB.response = RGLM$predictedOOB.response

# test set prediction

predictedTest.response = RGLM$predictedTest.response

# variable importance measures

varImp = RGLM$timesSelectedByForwardRegression

# Barplot of the importance measures for the 20 most important features

datvarImp=data.frame(Feature=as.character(dimnames(RGLM$timesSelectedByForwardRegression)[[2]]),
timesSelectedByForwardRegression=as.numeric(RGLM$timesSelectedByForwardRegression))

datVarImpSelected=datvarImp[rank(-datvarImp[,2],ties.method="first")<=20, ]

par(mfrow=c(1,1), mar=c(4,8,3,1))

barplot(datVarImpSelected[,2],horiz=TRUE,names.arg=datVarImpSelected[,1],
xlab="Feature Importance",las=2,main="Most significant features for the RGLM",
cex.axis=1,cex.main=1.5,cex.lab=1.5)

# indices of features selected into the model of bag 1

RGLM$featuresInForwardRegression[[1]]

Output

X218232_at X203175_at X200986_at X216913_s_at X32128_at X208451_s_at
X216231_s_at X208885_at

Feature.1 4145 1378 299 3757 4865 2227 3713 2353

5

X220856_x_at X212588_at X202625_at X218831_s_at X208961_s_at
X201041_s_at X201850_at X212119_at

Feature.1 4607 3155

ETC

# Model coefficients of bag 1

coef(RGLM$models[[1]])

Output

(Intercept) X218232_at X203175_at X200986_at X216913_s_at
X32128_at X208451_s_at

-4.038310e+02 1.350274e-01 8.108334e-02 -3.471803e-02 2.447514e-01
8.666125e-01 7.484849e-02

X216231_s_at X208885_at X220856_x_at X212588_at X202625_at
X218831_s_at X208961_s_at

-4.420324e-02 -3.588560e-02 1.852245e-02 -4.221644e-01 2.107064e-01
-2.589818e-01 1.694786e-01

X201041_s_at X201850_at X212119_at X201954_at

X204682_at
X204053_x_at X201887_at

-3.752358e-02 6.106237e-02 1.290596e-01 -2.070730e-01 -2.556034e-01
3.274343e-01 -3.154036e-01

X200660_at X214836_x_at X217947_at X211990_at X200620_at
X202253_s_at X202237_at

1.046733e-01 -3.279971e-02 -8.911381e-02 9.909101e-02 2.114678e-01
4.784094e-01 1.220026e-01

X211963_s_at X213975_s_at X202833_s_at X201438_at X218473_s_at
X208894_at X219059_s_at

-9.914986e-02 3.551280e-02 -1.968306e-01 3.784942e-02 -7.550249e-02
2.204820e-02 -1.191214e-01

X219505_at X202353_s_at X203882_at X217748_at X215121_x_at

9.010647e-02 -6.963570e-02 -8.129762e-03 3.327119e-02 2.239568e-03

References

1. Song L, Langfelder P, Horvath S (2013) Random generalized linear model: a
highly accurate and interpretable ensemble predictor. BMC Bioinformatics 14:5
PMID: 23323760 DOI: 10.1186/1471-2105-14-5.