Aldi Kraja
October 17, 2008
A.
where is the location of my work?
mylogin@dsgcluster.dsg.wustl.edu
/dsguser/mylogin
Where do I go from here?
For example:
/dsgmnt/dsg200/genetics/
users
/jim/impfhs/
/dsgmnt/dsg200/genetics/
data
/jim/impfhs/
Let’s assume that the user created a lot of
programs and data (or a few of them)
pgms
data
B.
After work the results will go in a central directory
, maybe in the
/
dsgmnt
/dsg200/genetics/data/
fhsanalysis
/impute/
`
Who is in charge of that directory? Is that directory related with my
project? What are the permissions of that directory? Do I need to
change permissions of the directory, or files to read only? Do I need to
notify anybody?
Examples:
drwx
------
2 aldi genetics 4096 Feb 4 2008 mail
cd
/
dsgmnt
/dsg200/genetics
[aldi@blade6
-
3
-
1 genetics]$
ls
-
al
total 16
drwxrwsr
-
x 4 aldi genetics 160 Sep 15 11:38 .
drwxr
-
xr
-
x 4 root
root
128 Oct 14 11:38 ..
drwxrwsr
-
x 12 aldi genetics 312 Oct 15 12:00 data
-
rw
-
rw
-
r
--
1
warwick
genetics 0 Aug 26 10:50
loki.head
-
rw
-
rw
-
r
--
1
warwick
genetics 0 Aug 26 10:50
loki.nohead
drwxrwsr
-
x 16 aldi genetics 1824 Oct 16 14:26 users
Line command editors:
emacs
,
xemacs
,
vi
,
less
,
etc
sas
&
`
Data:
sas
viewer
(free software),
sas
,
jmp
,
through samba (
windows explorer
, go to
network
,
mercury5
, login and
passwd
, /
dsg1
;
/
dsg2
; /
dsg3
; /
dsg4
; /
dsg100
; /
dsg200
;
/
dsgweb
Batch work through LSF:
bsub
sas
pgm.sas
proc print data=
mydata
(
obs
=10); title “my
data”; run;
proc contents data=
mydata
; title “contents”
run;
My SAS output will be called “results”. Do I need
to add anything else in the name of the set?
Do I need to keep the output all in one directory?
Does a colleague has the right to change
your
data?
Do I have the right to delete
your
data?
Do I have the right to delete
your
tmp
data?
Can I open
your
data in a
sas
viewer?
Can I delete
your
program run?
Assume I am reading text data with sas, but
they are very large
SAS viewer does not open the created sas
data, because they have more than 37,000
columns. What do you do?
JMP can go beyond that limit; proc print by
keeping a few selected columns, compare
with the original
If my variables are genotypes of this kind
1/3 2/2 3/3 1/4 1/1 what do I need to do
to save space in our servers?
markname
chrom
pos
M1
1 10001
M2
1 10011
M3
1 10015
M4
1 10021
M5
2
20001
markname
M1
M2
M3
M4
markname
alele
percent
M1
1
10.0
M1 3
90.0
M2
2
25.0
M2
4
75.0
M3
2
0.10
M3
4
99.9
M4
1
0.10
M4
2
99.9
Subject M1 M2 M3 M4…
Mn
1
1/1 2/4 2/2 2/2 … 1/1
2
1/3 2/2 2/4 1/1 … 1/1
3
3/3 4/4 2/4 2/2 … 1/1
4
1/3 2/4 4/4 1/1 … 1/1
…
map1
locdes1
genefreq1
ganon1
*
--------------------------------------
;
* split.sas ;
* split data based on a decided number ;
*
magicNumber
=, ;
* By: Aldi
Kraja
;
* October 3, 2008 ;
*
--------------------------------------
;
options
mprint
mlogic
symbolgen
;
%macro split(
magicNumber
=200,
chroms
=
1,
chrome=
1);
%do j=&
chroms
%to &chrome ;
libname
ind&j
"/
dsgmnt
/dsg200/genetics/data/
mydata
/impute/
c&j
";
libname
out&j
"/
dsgmnt
/dsg200/genetics/data/
mydata
/split/
c&j
";
data
genefreqc&j
;
set
ind&j
..
genefreqc&j
;
run;
data
mlgeno&j
;
set
ind&j
..
mlgenoc&j
;
run;
data _null_;
file "./myscript.sh";
put "
cd
/
dsgmnt
/dsg200/genetics/data/
mydata
/split/
c&j
" ";
put " find .
-
name '*sas7bdat' |
xargs
/bin/
rm
-
f ";
put "
cd
/
dsgmnt
/dsg200/genetics/users/
mydir
/split/c&&j ";
run;
%
sysexec
chmod
+x
./myscript.sh ;
%
sysexec
./myscript.sh ;
data _null_;
set
ind&j
.
.
mapc&j
end=
eod
;
if
eod
then call
symput
("
totm",trim
(left(_n_)));
run;
%let counter=0;
%do m=
1 %to &
totm
%by &
magicNumber
;
%let counter=%
eval
(&counter +1);
%let s=&m ;
%let e=%
eval
(&s + &
magicNumber
-
1);
%if &m = %
eval
((&
totm
/&
magicNumber
)* &
magicNumber
+
1) %then %do;
%let
stringcomp&counter
=%quote( &s <= _n_ <= &
totm
) ;
%end;
%else %do;
%let
stringcomp&counter
=%quote( &s <= _n_ <= &e );
%end;
%put &&
stringcomp&counter
;
%end;
%do k=
1 %to &counter;
data
out&j
.
.
mapc&j.p&k
;
format markname $12.; length markname $
12;
set
ind&j
.
.
mapc&j
;
markname
=compress(SNP);
if &&
stringcomp&k
then output; run;
data _null_;
set
out&j
.
.
mapc&j.p&k
end=
eod
;
call
symput
("m"||trim(left(_n_)),trim(left(
markname
)));
if
eod
then call
symput
("
ptotm",trim
(left(_n_))); run;
markname
chrom
pos
M1
1 10001
M2
1 10011
M3
1 10015
M4
1 10021
M5
2
20001
mapc&j
data
out&j
.
.
mlgenoc&j.p&k
;
set
mlgenoc&j
(keep=subject %do ii=
1 %to &
ptotm
; &&
m&ii
%end;) ;
run;
data
out&j
.
.
genefreqc&j.p&k
;
set
genefreqc&j
;
%do ii=
1 %to &
ptotm
;
if
markname
="&&
m&ii
" then output;
%end;
run;
%end; %* k counter loop;
%end; %* j chromosome loop;
%mend split;
%
split
markname
alele
percent
M1
1
10.0
M1 3
90.0
M2
2
25.0
M2
4
75.0
M3
2
0.10
M3
4
99.9
M4
1
0.10
M4
2
99.9
genefreqc&j
Subject M1 M2 M3 M4…
Mn
1
1/1 2/4 2/2 2/2 … 1/1
2
1/3 2/2 2/4 1/1 … 1/1
3
3/3 4/4 2/4 2/2 … 1/1
4
1/3 2/4 4/4 1/1 … 1/1
…
mlgenoc&j
*
------------------------------------------------------------
;
* program: parallel
mixed.interface.sas
;
*
------------------------------------------------------------
;
* Purpose: run mixed model ;
* ;
* By: Aldi
Kraja
;
* February 17, 2008 ;
* ;
;
*
------------------------------------------------------------
;
%macro parallel(v=,
dsplit
=,
pheno
=,
type=,
rotation=,
subject=,
pedid
=,
fid=,
mid=,
sex=,
dirout
=,
genlabel
=,
markname
=);
data _null_;
file "./rmscript.sh";
put "
cd
&
dirout
";
put " find .
-
name '*mixed*sas7bdat' |
xargs
/bin/
rm
-
f ";
put " find .
-
name '*freq*.sas7bdat' |
xargs
/bin/
rm
-
f ";
put "
cd
/
dsgmnt
/dsg200/genetics/users/
mydir
/split/
c&v
./";
run;
%
sysexec
chmod
+x ./rmscript.sh ;
%
sysexec
./rmscript.sh ;
%do y=1 %to &
dsplit
;
%
sysexec
rm
-
f ./
test&y
..
sas
;
%*
---------------------------------
;
%* let start to program in parallel;
%*
---------------------------------
;
data _null_;
file "./
test&y
..
sas
"
lrecl
=36000;
put "options
nofmterr
; options
nomprint
nomlogic
nosymbolgen
; ";
put '%include "/
dsgmnt
/dsg200/genetics/users/mydir1/macroutil.sas"; ';
put '%include "/
dsgmnt
/dsg200//genetics/users/mydir2/mixed.sas"; ';
put "
libname
ph22 '/
dsgmnt
/dsg200/genetics/data/
mydir
/
pheno
'; ";
put "
libname
trip '/
dsgmnt
/dsg200/genetics/data/
mydir
/
geno
/'; ";
put "
libname
snpc&v
'/
dsgmnt
/dsg200/genetics/data/
mydir
/split/
c&v
'; ";
put " *
-------------
map
------------------
; ";
put " data
cmap&v.p&y
; ";
put " format &
markname
$12.; length &
markname
$12 ; ";
put " set
snpc&v
..
mapc&v.p&y
(where=(
chrom
=&v )); ";
put " run; ";
put "%
sortit
(
cmap&v.p&y,cmap&v.p&y
, &
markname,nodupkey
) ";
put " *
-------------
locdes
------------------
; ";
put "%
sortit
(
cmap&v.p&y,clocdes&v.p&y,&markname,nodupkey
) ";
put " data
cmap&v.p&y
; ";
put " merge
cmap&v.p&y
(in=in1)
clocdes&v.p&y
(in=in2); ";
put " by &
markname
; if in1 and in2; ";
put " run; ";
put " %
sortit
(
cmap&v.p&y
,
cmap&v.p&y
,&
markname
) ";
put " *
-------------
genefreq
------------------
; ";
put " data
genefreq&v.p&y
; ";
put " format &
markname
$12.; length &
markname
$12 ; ";
put " set
snpc&v
..
genefreqc&v.p&y
; ";
put "run; ";
put " %
sortit
(
genefreq&v.p&y,genefreq&v.p&y,&markname
) ";
put " data
genefreq&v.p&y
; ";
put " merge
genefreq&v.p&y
(in=in1)
clocdes&v.p&y
(in=in2 )
cmap&v.p&y
; ";
put " by &
markname
; if in1 and in2; run; ";
put " *
-------------
ganon
------------------
; ";
put " data
aganon&v.p&y
; set
snpc&v
..
mlgenoc&v.p&y
; run; ";
put " *
-------------
triplet
------------------
; ";
put " data trip; set
trip.phenos
(keep=&
pedid
&subject &fid &mid &sex ); run; ";
put " %
sortit
(
trip,trip,&subject
,
nodupkey
) ";
put " *
-------------
phenodata
------------------
;
put " %
sortit
(ph22.fanomiss,phenodata (keep=&subject &
pheno
),&subject) ";
put " data
phenodata
; merge
phenodata
(in=in1) trip (in=in2); by &subject; ";
put " if in1 and in2; ";
put " if &subject =. then do; ";
put " put '
-------------------------------------------------------------------
'; ";
put " put 'This is a note for showing that you have missing &subject in the data';";
put " put '
-------------------------------------------------------------------
'; ";
put " put _all_ ; abort
abend
; ";
put " end; ";
put " run; ";
put " %
sortit
(
phenodata,phenodata,&subject
) ";
put " %
sortit
(
aganon&v.p&y,aganon&v.p&y,&subject
) ";
put "data
aganon&v.p&y
; merge
aganon&v.p&y
(in=in1)
phenodata
(in=in2); by &subject ;";
put " if in1 ; ";
put " run; ";
put " proc datasets lib=work; delete
phenodata
; run; ";
put " *
-------------
finally set up the macro
------------------
; ";
put " %
alleled
( ";
put "
datain
=
aganon&v.p&y
, “ ;
put "
clocdes
=
clocdes&v.p&y
, ";
put "
cmap
=
cmap&v.p&y
, ";
put " dist=position, ";
put "
cgenefreq
=
genefreq&v.p&y
, ";
put " triplet=trip, ";
put "
chrom
=
chrom
, ";
put "
marshvar
=
cmap
, ";
put "
markvar
=
markname
, ";
put "
pedid
=&
pedid
, ";
put " subject=&subject, ";
put " fid=&fid, ";
put " mid=&mid, ";
put " sex=&sex, ";
put "
qualaff
=, ";
put "
phenodata
=
aganon&v.p&y
, ";
put "
pheno
=&
pheno
, ";
put "
qualtrait
=, ";
put "
qualcovars
=&sex, ";
put "
quantcovars
=, ";
put "
correctForMean
=NO, ";
put "
bymark
=500, ";
put "
programd
=MIXED, ";
put " model=a, ";
put " time=, ";
put "
genlabel
=&
genlabel&y
, ";
put "
barplot
=0, ";
put "
whohasmarkers
=NO) ";
run;
%end ; %* end of y loop for each split dataset ;
data _null_;
file "./mixedscript.sh ";
%do j=1 %to &
dsplit
;
put "
bsub
nohup
sas
-
memsize
1G ./
test&j
..
sas
";
put "sleep 5 ";
%end;
put "echo Finished the submission for
chrom
: &v a total of &
dsplit
jobs";
run;
X '
chmod
+x ./mixedscript.sh ';
%
sysexec
./mixedscript.sh ;
%mend parallel;
%parallel(v=22,
dsplit
=170,
pheno
=Factor1mleod,
type=allf1,
rotation=
varimax
,
subject=subject,
pedid
=
pedid
,
fid=
dadsubj
,
mid=
momsubj
,
sex=sex,
dirout
=/
dsgmnt
/dsg200/genetics/data/
mydir
/results/w/
c&v
./&type./&rot./,
genlabel
=f1fhs,
markname
=
markname
)
*
------------------------------------------------------------
;
* summarize.sas ;
* Purpose: summarize the results of Mixed model ;
*
------------------------------------------------------------
;
%global
nobs
;
%let
nobs
=0;
%include "/
dsgmnt
/dsg200/genetics/users/
mydir
/macroutil.sas";
%let study=
mystudy
;
libname
inncbi
"/
dsgmnt
/dsg200/genetics/data/
generaldir
/
annot
/";
%macro test(
totloop
=968 1105 872 816 841 912 717 738 611 693 651 625 521 420 362 357 293 385 186 318 170 170,
chroms
=1,
chrome=22,
fact=1,
dirout
=/
dsgmnt
/dsg200/genetics/data/
mydir
/results/
final&study
./
f&fact
./);
libname
out&study
"&
dirout
";
%let count=1;
%let tloop1=%scan(&
totloop,&count
);
%do %while(%scan(&
totloop,&count
) ^= );
%let count=%
eval
(&count+1);
%let
tloop&count
=%scan(&
totloop,&count
);
%end;
%let count=%
eval
(&count
-
1);
%do k=&
chroms
%to &chrome;
%* delete what you will append to;
proc datasets lib=
out&study
;
delete
mixed&study.c&k
&
study.c&k
; run;
libname
freq&k
"/
dsgmnt
/dsg200/genetics/data/
mydir
/split/
c&k
";
%do f=&fact %to &fact;
libname
fhs&k.f&f
"/
dsgmnt
/dsg200/genetics/data/
mydir
/results/w/
c&k
./
allf&f
./
var
";
%do j=1 %to &&
tloop&k
;
%
isempty
(data=
fhs&k.f&f
..
amixedf&f.&study&j
)
%if &
nobs
=0 %then %do; %*
--------------
work on empty set check
--------------
;
data a; factor=&f ;
notempty
=.; empty=&j ;output; run;
%if &j=1 %then %do;
data
out&study
..&
study.c&k
;
set a; run;
proc datasets lib=work; delete a; run;
%end; %* loop when j is 1;
%else %do;
data
out&study
..&
study.c&k
;
set
out&study
..&
study.c&k
a; run;
proc datasets lib=work; delete a; run;
%end; %* end of loop j is not 1;
%end; %*
-----------------------------
;
%else %do; %* the
nobs
is not 0 =======================;
data a;
factor=&f ;
notempty
=&j; empty=. ;output; run;
%*
--------------------------------------------
;
%* capture the frequencies present in the data;
%*
--------------------------------------------
;
data b (drop=
markname
);
format
markn
$12.; length
markn
$12;
set
fhs&k.f&f
..
ameanfreqpermarkf&f.&study&j
;
markn
=
substr
(markname,2);
markn
=compress(
markn
||"XXXXXXXXX"); run;
data b; set b (rename=(
markn
=
markname
)) ; run;
%
sortit
(
b,b
,
markname
model)
proc transpose data=b out=c (rename=(COL1=AA COL2=AB COL3=BB));
by
markname
;
var
freq; run;
%*
-----------
work on freq ********;
data p;
set
freq&k
..
genefreqc&k.p&j
; run;
proc sort data=p; by
markname
percent allele; run;
proc transpose data=p out=q (rename=(COL1=a1 COL2=a2));
by
markname
;
var
allele; run;
proc transpose data=p out=r (rename=(COL1=MAF COL2=MOA));
by
markname
;
var
percent; run;
proc sort data=q; by
markname
; run;
proc sort data=r; by
markname
; run;
data s (drop=_NAME_);
merge q (in=in1) r (in=in2); by
markname
; if in1 and in2; run;
%
sortit
(
s,s,markname
)
proc datasets lib=work; delete p q r ; run;
%*
--------------------------------------------
;
%* work on the mixed model output ;
%*
--------------------------------------------
;
%if &j=1 %then %do; data
out&study
..&
study.c&k
; set a; run;
%
sortit
(
fhs&k.f&f
..
amixedf&f.&study&j,d,markname
)
%
sortit
(
fhs&k.f&f
..
ameanfreqpermarkf&f.&study&j,b,markname
model)
%
sortit
(
c,c
(drop=_NAME_),
markname
)
data d;
merge c (in=in1) d (in=in2) s;
by
markname
;
if in1 and in2;
factor=&f ;
run;
data
out&study
..
mixed&study.c&k
(drop=model analysis _LABEL_); set d ;run;
proc datasets lib=work; delete a c d s; run;
%end; %* loop when j is 1;
%else %do;
data
out&study
..&
study.c&k
;
set
out&study
..&
study.c&k
a; run;
%
sortit
(
fhs&k.f&f
..
amixedf&f.&study&j,d,markname
)
%
sortit
(
c,c
(drop=_NAME_),
markname
)
data d;
merge c (in=in1) d (in=in2) s;
by
markname
;
if in1 or in2; run;
data
out&study
..
mixed&study.c&k
(drop=model analysis _LABEL_); set
out&study
..
mixed&study.c&k
d ;
factor=&f ;
run;
proc datasets lib=work; delete a c d s; run;
%end; %* end of loop j is not 1;
%end; %* end of condition the
nobs
is not 0;
%end; %* end of j loop (splits of the same chromosome) ;
%end; %* end of f loop (factors) ;
%*
--------------------------------------------------------------
;
%* finally annotate based on the newest version of NCBI
dbSNP
;
%*
--------------------------------------------------------------
;
%
sortit
(inncbi.ncbisnpb128_c&k,annot&k,markname,nodupkey)
%
sortit
(
out&study
..
mixed&study.c&k,out&study
..
mixed&study.c&k,markname
)
data
out&study
..
mixed&study.c&k
;
merge
out&study
..
mixed&study.c&k
(in=in1)
annot&k
(in=in2);
by
markname
; if in1; run;
proc datasets lib=work; delete
annot&k
; run;
%end; %* end of k loop (chromosome)
;
%mend test;
%test
Remove all split data
Do NOT remove the original source data
gzip unused results (you can gunzip results
at the time you need them again)
Multi
-
threaded systems and parallel
programming are common tools in modern
software applications, and are used to
enhance the scalability and performance of
large jobs
The statistical tasks included but were not
limited to:
Drafting your analysis objectives
Creating specifications for analysis datasets
Creating specifications for tables, figures, and
listings
Identifying first look analyses
Performing data checking
Parallel programming analysis datasets
Checking data results and parallel
programming tables, figures, and listings
Removal of the un
-
necessary data
Enter the password to open this PDF file:
File name:
-
File size:
-
Title:
-
Author:
-
Subject:
-
Keywords:
-
Creation Date:
-
Modification Date:
-
Creator:
-
PDF Producer:
-
PDF Version:
-
Page Count:
-
Preparing document for printing…
0%
Σχόλια 0
Συνδεθείτε για να κοινοποιήσετε σχόλιο