# Lab 8 - Computing

Διαχείριση Δεδομένων

29 Νοε 2012 (πριν από 5 χρόνια και 5 μήνες)

291 εμφανίσεις

1

Lab 8

(Optional)

(17 April
-
2012
)

Using R with

Postgre
SQL

Spatial Statistics & Knowledge Discovery

DT786

In this lab we will cover

1)

Some more R Programs
:
Installing PL/R

and u
sing R functions in PostgreSQL

2)

Access
ing

PostgreSQL tables from R

3)

Getting Iris
h data from PostgreSQL

It
is not necessary
to use PostgreSQL
for the assignment.

1)

Some more R Programs

Start R from All P
rograms | R. Paste the following into R , press return and note results.

The lines starting with # are comments

#list packages

libr
ary()

getwd()

# set working dir, note
\
\

for Windows

#
setwd("C:
\
\
My
-
R
-
Dir
")

#

Plotting Xs and Ys

y<
-
c(11,12,9,7,5,8,4,4,5,3)

x<
-
c(11,12,9,7,5,8,4,4,5,3)

plot(x,y)

x<
-

1:10

# type x

x

y<
-
c(11,12,9,7,5,8,4,4,5,3)

plot(x,y
,ylab="Response varia
ble",xlab="Explanatory
variable”
)

# Note the range of the axes.

# lm stands for linear model
, which was coverd in Lab1.

# T
he following creates a model and draw
s

the line

# Note, plot

(in previous step)

must be called before abline,

# When plot is called we get
fresh axi
s & lose previous plots
.

abline(lm(y~x))

# The lines(
a
n existing
graph.

# It can not produce a graph on its own.

# Usually
it follows plot(x, y)
that produces a graph.

2

# User
-
specified line,

with given start and end points

# lty stands for line type, in this case number 2.

lines(c(1,10),c(1,10)
,type="l"
,lty=2)

#Note axes.

#Set the range of the axes.

xl <
-

c(0,ceiling(max(x
)))

yl <
-

c(0,ceiling(max(y
)))

plot(x,y,xlim=xl,ylim=yl)

lines(c(0,10
),c(0,10)
,lty=2)

#Note first argument contains
the x’s and y’s are in the
second argument.

# The following points are added to the scatterplot

#
using plotting character pch=3 (plus sign +).

v<
-
c(2,4,6,8,10)

w<
-
c(8,5,6,6,2)

points(v,w,pch=3)

# correlat
ion

x = c(45,43,46,48,51,46,50,47,46,45)

y = c(42,43,40,44,53,48,50,49,41,44)

cor(x,y)

# plotting

two

functions

x<
-
seq(0,10,0.1)

y1 <
-

2 + 3 * x
-

0.25 * x ^ 2

y2 <
-

3 + 3.3 * x
-

0.3 * x ^ 2

par(bg="ghostwhite")

plot(x,y2,type="n",ylab="")

lines(x,y2,
col="red")

lines(x,y1,col="blue")

#
Make a pie chart

pie(rep(1, 30), col = rainbow(30), radius = 0.9)

# kilometres travelled

kilometres_
travelled = c(65311, 65624, 65908, 66219, 66499, 66821, 67145, 67447)

x = diff(
kilometres_
travelled)

# print x

x

#

Describe the R classes

data.frame.

?
data.frame

#or

3

help(data.frame)

# make a data.frame.

# Childs age in months

age <
-

18:29

height <
-

c(76.1,77,78.1,78.2,78.8,79.7,79.9,81.1,81.2,81.8,82.8,83.5)

#Construct a data frame object

height.age <
-

da
ta.frame(ag
e,height
)

names(height.age)

slotNames(height.age)

#Set the range of the axes.

xl <
-

c(0,ceiling(max(age)))

yl <
-

c(0,ceiling(max(height)))

p
lot(age,height,xlim=xl,ylim=yl)

#
Draw the regression line

abline(lm(height~age))

#
What would you expect the heig
ht of a 25.5 month old child to be?

#
What value would you expect the following R expression to produce?

#Can you explain the result?

cor(age,height) == cor(height,age)

#

Describe the R classe

SpatialPointsDataFrame
.

#What does the following code do
?

library(spdep)

temperature <
-

c(16.1,17,18.1,18.2,19.7,19.9,19.1,18.2,19.8)

x <
-

c(1,1,1,2,2,2,3,3,3)

y <
-

c(1,2,3,1,2,3,1,2,3)

xy <
-

cbind(x,y)

d <
-

data.frame(temperature)

sdf <
-

SpatialPointsDataFrame(xy,d)

tmp <
-

as.character(sdf\$temperature)

vals <
-

list("sp.text", coordinates(sdf), tmp, cex=0.7, pos=1)

spplot(sdf, sp.layout = vals, main="Temp. Gird")

3
.
Installing PL/R

4

PL/R is a PostgreSQL language extension that allows you to write PostgreSQL func
tions
and aggregate functions using

the R statis
tical computing language.

This note covers Windows, for non
-
Windows users see
http://www.joeconway.com/plr/doc/plr
-
install.html
.

http://www.bostongis.com/PrinterFriendly.aspx?content_name=postgresql_plr_tut01

This note is more or less exactly as;

http:
//www.bostongis.com/PrinterFriendly.aspx?content_name=postgresql_plr_tut01

3.
1. Copy PL/R from CD or d
nstallation file from;
http://www.joeconway.com/web/guest/pl/r

3.
2. Extract
the files plr.dll and plr.sql from
plr
-
8.3.0.9
-
pg84.win32
.zip to a temporary
folder. You can extract the entire set of files if you wish.

3.
3. You will find plr.dll in a subfolder your temporary folder. Y
ou need to copy the
plr.dll into;

C:
\
Program File
s
\
PostgreSQL
\
8.4
\
lib

.

3.4
A
lso copy
the plr.sql into

C:
\
Program Files
\
PostgreSQL
\
8.4
\
share
\
contrib
\
PLR

You will have to make the appropriate folder structure.

5

3.
5 Make and set

the environment var
iable R_HOME.

Do this via the control
panel:

Control Panel
-
> System
-
-
> Environment Variables

Add an R_HOME system variable and the R_HOME location of your R install.

If you accept the defaults this is usually
C:
\
Program Files
\
R
\
R
-
2.10.1

3.6
Also in the System

Variables section e
dit
Path

system variable and add the R
bin

folder

to the end of it.

;
C:
\
Program Files
\
R
\
R
-
2.10.1
\
bin

Do not r
emove existing entries in your
P
ath
, just add this to the end
.

Note on Windows
a semicolon is used as a sepa
rator in
Path
.
.

6

3.7
.
Restar
t your PostgreSQL service from Control P
anel
-
> Services. On rare
circumstances, you may need to restart the computer for changes to take effect.

3.8
In order to start using PL/R in a database, you need to load the help functions in the
dat
abase. This is similar to setting up pgRou
ting or PostGiS.

Do the following.

3.8.
-

select the database you want to enable with PL/R

(e.g. testdb
or postgis
)

and then click the SQL icon to get to the query window.

3.8.
2. Choose
-
>
File
-
> Open
-
> path/to/PostgreSQL/8.4/contrib/plr.sql

NOTE: on Windows the default location is

C:
\
Program Files
\
PostgreSQL
\
8.4
\
contrib
\
plr.sql

3.8

3. Click the Green arrow to execute

Now R is installed in PostgreSQL

:

Sometime
s
plr.sql
,

may be

down to the fact

that Windows
uses a carriage return/linefeed pair (CR/LF), whilst Unix only uses the LF character.

R
was ported from Linux.
Evidently when copying from IE
-

IE puts in carriage returns
of Unix line breaks.

When creating PL/R functions make sure to u
se Unix line
indows carriage returns by using an edit
will allow you to specify Unix line breaks.

So you could us
e;

File | Save As |
plrn
.sql
. using Unix format

Alternatively you can
plr.sql
directly

from a command prompt.

7

Copy
plr.sql to

C:
\
Program Files
\
PostgreSQL
\
8.4
\
bin

Open a command prompt to the same directory and type

psql
-
d postgis
-
U postgres
-
f plr.sql

This will i
nstall PLR into the
postgis

database

(you could use
testdb

or any other
db)
.

By default, PostgreSQL converts the CR/LF to LF cha
racter when inserting and updat
ing
records. This means that insertions into the database get added with just an LF.

For more
de
tail see:

http://www.bostongis.com/PrinterFriendly.aspx?content_name=postgresql_plr_tut01

3.
9
.
Test R. R
un the follo
wing commands from SQL shell

to test
out R

SELECT * FROM plr_environ();

SELECT * FROM r_typenames();

SELECT plr_array_accum('{23,35}', 42);

3.
The usually way to use PL/R in
PostgreSQL is to include an R function i
n an SQL function, using the CREATE
FUNCTION statement

CREATE OR REPLACE FUNCTION plr_array (text, text)

RETURNS text[]

AS '\$libdir/plr','plr_array'

LANGUAGE 'C' WITH (isstrict);

--

not test

select plr_array('hello','world');

We are not too concerned wi
th the deta
ils, we just need to know that
the statistical power
of R is

available in PostgreSQL.

create or replace function r_median(_float8)

returns float as 'median(arg1)' language 'plr';

In PostgreSQL an aggregate function is a function that

computes a single value from a
collection of records (e.g. sum). Here separate aggregates are created for each R function.

CREATE AGGREGATE median (

8

sfunc = plr_array_accum,

basetype = float8,

stype = _float8,

finalfunc = r_median

);

Now we mak
e a table to test the function.

create table foo(f0 int, f1 text, f2 float8);

insert into foo values(1,'cat1',1.21);

insert into foo values(2,'cat1',1.24);

insert into foo values(3,'cat1',1.18);

insert into foo values(4,'cat1',1.26);

insert into foo value
s(5,'cat1',1.15);

insert into foo values(6,'cat2',1.15);

insert into foo values(7,'cat2',1.26);

insert into foo values(8,'cat2',1.32);

insert into foo values(9,'cat2',1.30);

select f1, median(f2) from foo group by f1 order by f1;

We can check the details

of the median function

\
df median

3.11
Create standard deviation
function

This is an important function for your assignment.

CREATE AGGREGATE my_sd (

sfunc = plr_array_accum,

basetype = float8,

stype = _float8,

finalfunc = my_sd

);

CREATE OR
REPLACE FUNCTION my_sd(double precision[])

RETURNS double precision AS

--

This is a call to R’s standard deviation function.

'sd(arg1)'

LANGUAGE 'plr' VOLATILE

COST 100;

ALTER FUNCTION my_sd(double precision[]) OWNER TO postgres;

3.
12

Using R func
tions in PostgreSQL

--

SELECT count(ed.gid) as total_ed, avg(ed.male1_1 + ed.female1_1) as
avg_bg_pop,

median(ed.male1_1 + ed.female1_1) as median_pop, sum(ed.male1_1 +
ed.female1_1) as totalpop,
my_
sd(ed.male1_1 + ed.female1_1) a
s stdev

FROM dublin_eds ed

WHERE (ed.male1_1 + ed.female1_1) > 0;

9

--

include areas,

SELECT area(ed.the_geom) as areas, avg(ed.male1_1 + ed.female1_1) as
avg_bg_pop,

median(ed.male1_1 + ed.female1_1) as median_pop, sum(ed.male1_1 +
e
d.female1_1) as totalpop,
my_
sd(ed.male1_1 + ed.female1_1) as stdev

FROM dublin_eds ed

WHERE (ed.male1_1 + ed.female1_1) > 0

GROUP
by ed.the_geom;

See
http://ww
w.bostongis.com/PrinterFriendly.aspx?content_name=postgresql_plr_tut0

4.
Access
ing

PostgreSQL tables from R

Not necessary for assignment

First you must configure windows ODBC and make PostgreSQL available as a data
source. You do this via the control

panel.

Control Panel |

| Data Sources (ODBC) | Add

10

You can test the connection by pressing the Test button.

If all goes well t
he new PostgreSQL ODBC source is now available.

# Paste each of the following lines i
n
to

R
,

and observe the
result

library(RODBC)

ch <
-

odbcConnect("PostgreSQL30")

require(utils)

data()

data(package = "base")

#Connecting R to postgreSQL

# Control Panel | Administrative Tools | Data Sources
(ODBC)

channel <
-

od
bcConnect("PostgreSQL30")

# Find tables in testdb

sqlTables(channel)

# If you have the county table loaded

in the PostgreSQL
database connected to R then you can perform SQL in R.

sqlQuery(channel, "select name from county")

# Store an R data set as a
PostgreSQL table

sqlSave(channel, USArrests, rownames = "state", addPK=TRUE)

# or more simply

sqlSave(channel, USArrests)

11

# when you are done with it,

close(channel)

# or if you prefer

odbcClose(channel)

5)

Getting Irish data from PostgreSQL

library(RODBC
)

channel <
-

odbcConnect("PostgreSQL30")

# Find tables in postgis

sqlTables(channel)

# If you have the county table loaded

in the PostgreSQL
database connected to R then you can perform SQL in R.

sqlQuery(channel, "select name from county")

#Constructin
g a neighbourhood matrix in R.

channel <
-

odbcConnect("PostgreSQL30")

# retrieve table from data base:

table = sqlQuery(chann
el, "select gid

from county;")

# select and sort the count
y codes:

ids = factor(sort(as.character(table
\$gid
)))

# county has
the co
untry borders an
d was formed by an SQL command

neighb = matrix(0, length(ids), length(ids))

dimnames(neighb) = list(ids, ids)

for (id in levels(ids)) {

# form the SQL command:

sqlCommand <
-

paste( "SELECT a.gid FROM county
a, county b WHERE
b.gid='", id
,

"'

AND touch
es(a.the_geom , b.the_geom) ",
"GROUP BY a.gid
ORDER BY a.gid;", sep = "")

ret <
-

sqlQuery(channel, sqlCommand)

# do the SQL query:

ret = sql
Query(channel, sqlCommand)\$g
id

# fill the relevant matrix entries:

neighb[match(id, ids), match(ret
, ids)] = 1

}

odbcClose(channel)

options(width=110)

neighb