SAS Code for the Import Individual Text, CSV, or Excel Files Process

Let’s consider the SAS code for the Import Individual Text, CSV, or Excel Files process.

This code is contained in the DataImportEngine.sas file located in your JMP installation directory.

8      Navigate to the DataImportEngine.sas file.

Typically, when JMP Genomics is loaded on a Windows machine, the default path to this file is C:\Program Files\Program Files\SASHome\JMPGenomics\13\Genomics\ProcessLibrary\.

8      Open the DataImportEngine.sas file with a text editing program.

The SAS code for the Import Individual Text, CSV, or Excel Files process should appear as shown in the following sections. Instead of showing the SAS code as one continuous entity, the code has been divided up into basic sections described in the Getting Started with JMP Genomics chapter of this guide.

Preamble

As in the Column Contents code example, the preamble section of the Import Individual Text, CSV, or Excel Files code contains a simple history section for tracking changes. The concluding Description: block describes a brief overview of the program. This same description is used at the beginning of the XML file, which will be discussed later.

The SAS code for the preamble is shown below:

 

/*

JMP Genomics Data Import Engine

 

Name: DataImportEngine

Author: Tzu-Ming Chu, SAS Institute Inc.

Support: Tzu-Ming Chu, SAS Institute Inc.

History:

tzuchu 03Mar04 initial coding

tzuchu 13Jan05 drop ProjectID, FileType

tzuchu 22Sep05 add FileType for extension not covered

tzuchu 10May06 activate nolabel SAS option

tzuchu 08Jun06 add InputStmt to import extrem wide data

tzuchu 20Jul06 add DataStepStatements option to modify output

tzuchu 30Nov06 extent to import multiple files

tzuchu 18Jul07 add ScanColumns to import extrem wide data

tzuchu 17Oct07 remove nolabel SAS option

 

Description:

 

Data Import Engine imports files in specified folder

into SAS data seta by calling Proc Import.

The files are assumed to be one of the

following types:

1. Tab delimited with extension .txt.

2. Comma-separated with extension .csv.

3. Excel with extension .xls.

4. SAS with extension .sas7bdat.

If the file extension is not one of the four

above, File Type specification will be needed.

*/

 

/*

-----------------------------------------------------------

Copyright (C) 2004 SAS Institute, Inc. All rights reserved.

 

Notice:

The above copyright notice and this notice must appear in

any whole or partial copy of this code and any related

documentation. Permission to use, copy, and modify the

source code contained in this file is hereby granted, but

is limited to customers of SAS with a valid license for the

SAS software product with which this file was provided.

Except as expressly set forth herein, the terms of such

license apply.

-----------------------------------------------------------

*/

 

OPTIONS nospool;

Process Variable Definitions

The Import Individual Text, CSV, or Excel Files process has a variety of global process variables in the section of code that begins with *ProcessVariables; including input file-associated parameters InFiles, VarNameList, DataStartRow, ScanRows and output data-associated parameters OutData and OutPath.

InFiles defines what files to import, and the remaining parameters enable modifications to the PROC IMPORT run. OutData and OutPath specify the name and path of the output SAS data set.

The SAS code outlining the process variable definitions is shown below:

*ProcessVariables;

 

%global InPath FileFilter InFiles FileType VarNameRow

InputStmt LengthStmt VarNameList DataStartRow UniqueVarNameFlag

ApplyOriginalName DataStepStatements ScanRows ScanColumns

OutDataPrefix Compress MaxColLen OutPath MacroPath;

Initialization and Argument Checking

Immediately after *ProcessBody; is a section similar to the initialization section from the Column Contents example. The initialization section defines a few variables and also does the following:

%include's Utility Macros.sas and InputEngineMacros.sas
checks for various file types according to the extension on the input file
determines the logical record length according to the operating system
creates a LIBNAME corresponding to OutPath.

The SAS code specifying initialization and argument checking is shown below:

*ProcessBody;

 

%macro DataImport;

 

*initialization;

%if ^%symexist(ProcessName) %then %let ProcessName = DataImportEngine;

%put ProcessName = &ProcessName;

 

%include "&MacroPath./UtilityMacros.sas" / nosource;

 

%include "&MacroPath./InputEngineMacros.sas" / nosource;

 

%CheckEndSlash(InPath);

%let f = 1;

%let fLen = 0;

%let FileList = %trim(%left(&InFiles));

%*put Infiles=&FileList;

%do %while(%length(&FileList));

* extract one file name;

%let flag = 0;

%let name = ;

%do %While(^&flag);

%let e = %index(&FileList,%str(|));

%if ^&e %then %do;

%let name = %left(&name &FileList);

%let flag = 1;

%let FileList = ;

%end;

%else %do;

%let name = %left(&name %substr(&FileList,1,&e-1));

%if %index(&name,.) %then %let flag = 1;

%let FileList = %substr(&FileList,&e+1);

%end;

%end;

%let InFile&f = &name;

%if %index(&name,.) %then %do;

%let ext = %scan(&name,-1);

%let InName&f = %substr(&name,1,%length(&name)-%length(&ext)-1);

%end;

%else %do;

%let InName&f = &name;

%end;

%let OutName&f = &OutDataPrefix.&&InName&f;

%CheckDataName(_name=InName&f);

%CheckDataName(_name=OutName&f);

%if %length(&&OutName&f)>&fLen %then %let fLen = %length(&&OutName&f);

%put InFile&f = &&InFile&f;

%put InName&f = &&InName&f;

%put OutName&f = &&OutName&f;

%let f = %eval(&f+1);

%end;

%let nFiles = %eval(&f-1);

 

%if &nFiles = 0 %then %do;

%put ERROR: There is no file selected to be imported.;

%goto exit;

%end;

%else %do;

* check duplication for OutNames;

data a;

length OutName $&fLen.;

%do i = 1 %to &nFiles;

OutName = "&&OutName&i";

output;

%end;

run;

%let RowDupFlag =;

%naming(_Dset=a,_AllVars=OutName,_UniqueRow=1,_DupFlag=RowDupFlag);

data _null_;

set a;

name="OutName"||left(trim(_n_));

call symput(name,OutName);

run;

%end;

 

%put There are &nfiles file(s) selected to import.;

 

%if &nFiles = 1 and %symexist(OutData) %then %do;

%if %length(&OutData) %then %do;

%let OutName1 = &OutData;

%CheckDataName(_name=OutName1);

%end;

%end;

 

%if %length(&FileType) %then %do;

%if %upcase(&FileType) = COMMA SEPARATED %then %let FType = CSV;

%else %if %upcase(&FileType) = SPACE DELIMITED %then %let FType = DLM;

%else %if %upcase(&FileType) = TAB DELIMITED %then %let FType = TAB;

%end;

%else %do;

%let FType = ;

%CheckFileType(&InPath.&InFile1,FType,_ErrorFlag=0);

%end;

 

%if &sysscp=WIN %then %do;

%if %upcase(&FType) = CSV %then %let Dilimiter = '2C'x;

%else %if %upcase(&FType) = DLM %then %let Dilimiter = '20'x;

%else %if %upcase(&FType) = TAB %then %let Dilimiter = '09'x;

%let CR = '0D0A'x;

%end;

%else %do;

%if %upcase(&FType) = CSV %then %let Dilimiter = '2C0D'x;

%else %if %upcase(&FType) = DLM %then %let Dilimiter = '200D'x;

%else %if %upcase(&FType) = TAB %then %let Dilimiter = '090D'x;

%let CR = '0A'x;

%end;

* Check Excel file and OS system;

%CheckExcel(&FType,The input file);

%if &exiterror %then %goto exit;

 

%*if &DataStartRow < 1 %then %let DataStartRow = 1;

%if &DataStartRow < 1 %then %do;

%put ERROR: Please specify a value for Data Start Row.;

%let exiterror = 1;

%goto exit;

%end;

%if &VarNameRow < 1 %then %let VarNameRow = 0;

 

* compression;

%if (&Compress = Yes) %then %let dsoptions = (compress=yes);

%else %let dsoptions = ;

 

* define libnames;

libname InLib "&InPath";

libname OutLib "&OutPath";

 

* loop through all files;

%do nf = 1 %to &nFiles;

 

%if %sysfunc(fileexist(&&InPath.&&InFile&nf))=0 %then %do;

%put ERROR: File &&InPath.&&InFile&nf does not exist.;

%let exiterror = 1;

%goto exit;

%end;

 

%if %upcase(&FType) = CSV %then %do;

%let DLT = '2C0D'x;

%let DLT2 = '2C'x;

%end;

%else %if %upcase(&FType) = DLM %then %do;

%let DLT = '200D'x;

%let DLT2 = '20'x;

%end;

%else %if %upcase(&FType) = TAB %then %do;

%let DLT = '090D'x;

%let DLT2 = '09'x;

%end;

 

* check file type;

%if %upcase(&FType) = SAS %then %do;

data OutLib.&&OutName&nf;

set Inlib.&&InName&nf;

%if %length(&DataStepStatements) %then %do;

%unquote(&DataStepStatements);

%end;

run;

%end;

%else %do;

%if %index(upcase(&FType),EXCEL) %then %do;

%if &ScanColumns %then %do;

%put WARNING: Parameter "Minimum Number of Columns to Scan" is only available for importing Comma Separated, Space Delimited, or Tab Delimited files.;

%end;

%if %length(&InputStmt) %then %do;

%put WARNING: Parameter "List of Variable Names and Types" is only available for importing Comma Separated, Space Delimited, or Tab Delimited files.;

%end;

%if &ScanColumns | %length(&InputStmt) %then %do;

%put WARNING: Excel file type is detected. The above parameter will not be applied.;

%end;

%goto excel;

%end;

SAS Data Step and Procedure Code

This section of the code for the Import Individual Text, CSV, or Excel Files process carries out two functions:

The %skip_header_import_2 macro, which is defined in InputEngineMacros.sas, imports raw data file by using the IMPORT procedure (PROC IMPORT) and the DATA step. This macro is used because it allows you to read in the data with a specified row that contains variable names and a specified data-starting row.
The list of variable names is parsed, using complex SAS macro coding, and renamed, using PROC DATASETS, to names provided by the user.

The SAS data step and procedure code is shown below:

* step1: extract variable names and locate the 1st data line;

* read in data as binary format to bypass the LRECL limitation;

%let skipbyte = ;

%if &VarNameRow %then %do;

%if &VarNameRow = 1 %then %do;

%let skipbyte = 0;

%end;

%else %do;

%GetLineLoc(&&InPath.&&InFile&nf,&VarNameRow,skipbyte);

%if &skipbyte %then %let skipbyte = %eval(&skipbyte-1);

%end;

%put skipbyte to scan column name line: &skipbyte;

 

data _vName_;

infile "&&InPath.&&InFile&nf" RECFM=N delimiter=&DLT col=columnpt DSD;

length VarName $500 vTmp $32;

flag = 1;

%if &skipbyte %then if _n_ = 1 then input +&skipbyte @;;

do while(flag);

input VarName $@;

if index(varname,'0A'x) then do; * the last variable name;

flag = 0;

end;

else do;

vTmp = dequote(VarName);

end;

if flag then do;

output;

end;

end;

if ^flag then do;

stop;

end;

keep vTmp flag;

run;

%if &syserr>2 %then %goto exit;

%end;

 

* get loc for 1st data line;

%GetLineLoc(&&InPath.&&InFile&nf,&DataStartRow,skipbyte);

%if &skipbyte %then %let skipbyte = %eval(&skipbyte-1);

%put skipbyte to the 1st data line: &skipbyte;

 

 

/* add this part for skiping list-style parameters when importing wide snp data 07/16/07 */

/* need to cover (&ScanColumn) but (^&VarNameRow) */

%if &ScanColumns & ^%index(upcase(&FType),EXCEL) %then %do;

* step 2 create a short file for scan;

%let did = %sysfunc(open(_vName_));

%let nv = %sysfunc(attrn(&did,NOBS));

%let rc = %sysfunc(close(&did));

%if &ScanColumns > &nv %then %let ScanColumns = &nv;

%let MaxLen = 2000;

%let workpath=%sysfunc(pathname(work));

filename tmpfile "&workpath.&_slash_.tmpfile071607.txt";

%*if %sysfunc(fexist(tmpfile)) %then %let frc=%sysfunc(fdelete(tmpfile));

 

data _null_;

infile "&&InPath.&&InFile&nf" delimiter=&DLT MISSOVER TRUNCOVER DSD lrecl=&MaxLen;

file tmpfile recfm=n lrecl=&MaxLen;

length x $&MaxLen..;

%if &skipbyte %then %do;

if _n_ = 1 then input +&skipbyte @;

%end;

input x 1-&MaxLen;

/* v3.1 code

Note: scan function count two adjcent tabs as one tab

do i = 1 to &ScanColumns;

y=scan(x,i,&DLT);

put y @;

if i < &ScanColumns then put +(-1) &Dilimiter @;

else put +(-1) &CR;

end;

*/

c = 1;

s = 1;

do i = 1 to &MaxLen;

if i > length(x) then do;

y = substr(x,s);

put y +(-1) &CR;

i = &MaxLen+20;

end;

else do;

if index(substr(x,i,1),&DLT2) then do;

e = i - 1;

if e>=s then do;

y = substr(x,s,e-s+1);

put y +(-1) @;

end;

if c < &ScanColumns then do;

put &Dilimiter @;

end;

else do;

put &CR;

i = &MaxLen;

end;

s = i + 1;

c=c+1;

end;

end;

end;

if _n_ > &ScanRows then stop;

run;

 

* step 3 call Proc Import for detecting variable attributes;

PROC IMPORT OUT= WORK._tmp1

DATAFILE= "&workpath.&_slash_.tmpfile071607.txt"

DBMS=&FType REPLACE;

GETNAMES=YES;

DATAROW=%if &VarNameRow>1 %then %eval(&DataStartRow-&VarNameRow+1);

%else &DataStartRow;;

RUN;

%let frc=%sysfunc(fdelete(tmpfile));

 

* step 4 check naming convention for all vars;

%Naming(_Dset=_vName_,_AllVars=vTmp,_UniqueRow=1);

%let did1 = %sysfunc(open(_tmp1));

%let did2 = %sysfunc(open(_vName_));

%let nv = %sysfunc(attrn(&did2,NOBS));

%let vNum = %sysfunc(VARNUM(&did2,vTmp));

 

* step 5 read full data;

data OutLib.&&OutName&nf.&dsoptions;

infile "&InPath.&&InFile&nf" RECFM=N delimiter=&DLT DSD LRECL=32767;

length

%let j = &ScanColumns;

%do i = 1 %to &nv;

%if &i < &ScanColumns %then %do;

%if (%nrbquote(%upcase(%sysfunc(VARNAME(&did1,&i))))=PROBE_SET_ID |

%nrbquote(%upcase(%sysfunc(VARNAME(&did1,&i))))=PROBESET_ID) &

(%sysfunc(VARTYPE(&did1,&i)) = C) %then %do;

%sysfunc(VARNAME(&did1,&i)) $32.

%end;

%else %if %nrbquote(%upcase(%sysfunc(VARNAME(&did1,&i))))=CHROMOSOME %then %do;

%sysfunc(VARNAME(&did1,&i)) $2.

%end;

%else %do;

%sysfunc(VARNAME(&did1,&i)) %if %sysfunc(VARTYPE(&did1,&i))=N %then %sysfunc(VARLEN(&did1,&i)).; %else $%sysfunc(VARLEN(&did1,&i)).;

%end;

%end;

%else %do;

%if %sysfunc(VARTYPE(&did1,&j))=N %then %let Len = %sysfunc(VARLEN(&did1,&j)).;

%else %let Len = $%sysfunc(VARLEN(&did1,&j)).;

%if &i = &ScanColumns %then %let rc=%sysfunc(fetchobs(&did2,&ScanColumns));

%sysfunc(getvarc(&did2,&vNum)) &Len

%let rc=%sysfunc(fetch(&did2));

%end;

%end;

;

if _n_ = 1 then input +&skipbyte @;

input

%do i = 1 %to &nv;

%if &i = 1 %then %let rc=%sysfunc(fetchobs(&did2,1));

%sysfunc(getvarc(&did2,&vNum))

%let rc=%sysfunc(fetch(&did2));

%end;

+1 @;

run;

%let rc = %sysfunc(close(&did2));

%let rc = %sysfunc(close(&did1));

%end;

/* end 07/16/07 */

 

%else %if %length(&InputStmt) %then %do;

%if %length(&FType) > 3 %then %do;

%put WARNING: "List of Variable Names and Types" is only available for importing Comma Separated, Space Delimited, or Tab Delimited files.;

%put WARNING: Excel file type is detected. The above parameter will not be applied.;

%goto excel;

%end;

%else %do;

/*

* read in data as binary format to bypass the LRECL limitation;

* step1: extract variable names;

%if &VarNameRow %then %do;

data _vName_;

infile "&&InPath.&&InFile&nf" RECFM=N delimiter=&DLT col=columnpt DSD lrecl=32767;

length VarName $32767 vTmp $ 32;

retain nVars 0 crLoc 0 nrows 1;

flag = 0;

do while(nrows <= &VarNameRow);

*do while(nrows <= &DataStartRow);

crLoc = columnpt;

input VarName $@;put '1' varname=;

idx = index(VarName,'0A'x); put '2' idx=;

if (idx & nrows) then do;

do while(idx); * keep track of carriage returns as number of rows;

* output;

vTmp = dequote(substr(VarName,1,idx-1));

crLoc = crLoc + idx;put nrows= idx= varname=;

VarName = substr(VarName,idx+1,lengthn(VarName)-idx);

idx = index(VarName,'0A'x);

nrows = nrows + 1;

end;

end;

if nrows = &VarNameRow then do;

if nrows = &VarNameRow then vTmp = dequote(VarName);

output;

end;

end;

call symputx('crEndLoc',crLoc-1); * location of the last carriage return before data;

stop;

* keep vTmp flag;

run;

%if &syserr>2 %then %goto exit;

%end;

%else %do;

%let crEndLoc = 0;

%end;

%*put Data starts at &crEndLoc;

%put crEndLoc = &crEndLoc;

%put Delimiter = &DLT;

%*goto exit;*/

/*

proc print data=_vName_;

run;

*/

* determine name of last variable;

%let i = 1;

%do %while(%length(%scan(&InputStmt,&i,' ()$-')));

%let lastvar = %scan(&InputStmt,&i,' ()$-');

%let i = %eval(&i+1);

%end;

%put lastvar = &lastvar;

* input the file;

data OutLib.&&OutName&nf.&dsoptions;

infile "&InPath.&&InFile&nf" RECFM=N delimiter=&DLT MISSOVER DSD LRECL=32767;

%if %length(&LengthStmt) %then %do;

length &LengthStmt;

%end;

if _n_ = 1 then input +&skipbyte @;

input &InputStmt +1;

* patch to adjust processing when last var contains the end of line character;

_idx = index(&lastvar,'0A'x);

if (_idx) then do;

_len = lengthn(&lastvar);

* assume reading has gone too far, so back up the input pointer;

&lastvar = dequote(substr(&lastvar,1,_idx-1));

_mvback = _idx - _len - 2;

input +_mvback @;

end;

drop _idx _len _mvback;

run;

%if &syserr>2 %then %goto exit;

* rename back to original column names;

%if %upcase(&ApplyOriginalName) = YES & &VarNameRow %then %do;

* check naming convention;

%Naming(_Dset=_vName_,_AllVars=vTmp,_UniqueRow=1);

data ov;

set OutLib.&&OutName&nf;

if (0);

run;

proc transpose data=_vName_ out=nv(drop=_name_);

var flag;

id vTmp;

run;

 

%let did = %sysfunc(open(ov));

%let did2 = %sysfunc(open(nv));

%let nv = %sysfunc(attrn(&did,NVARS));

data OutLib.&&OutName&nf.&dsoptions;

set OutLib.&&OutName&nf;

%do i = 1 %to &nv;

%if %upcase(%sysfunc(varname(&did2,&i))) ne %upcase(%sysfunc(varname(&did,&i))) %then %do;

%sysfunc(varname(&did2,&i)) = %sysfunc(varname(&did,&i));

drop %sysfunc(varname(&did,&i));

%end;

%end;

/*drop %do i = 1 %to &nv; %sysfunc(varname(&did,&i)) %end; ;*/

run;

/* proc datasets rename takes a long time for wide data sets, so use above approach for now

* turn off notes to avoid large log;

option nonotes;

proc datasets library=OutLib nolist;

modify &&OutName&nf;

%do i = 1 %to &nv;

rename %sysfunc(varname(&did,&i)) = %sysfunc(varname(&did2,&i));

%end;

quit;

option notes;

*/

%let rc = %sysfunc(close(&did2));

%let rc = %sysfunc(close(&did));

%end;

%end;

%end;

%else %do;

 

%excel:; * label for excel file;

* import raw data file;

%let VarList = ;

%if %upcase(&UniqueVarNameFlag)=YES %then %let uFlag = 1;

%else %let uFlag = 0;

%skip_header_import_2(DataFile=%quote(&InPath.&&InFile&nf),DataType=&FType,

OutSet=OutLib.&&OutName&nf,FirstRow=&DataStartRow,VarNameRow=&VarNameRow,

VarLabelRow=&VarNameRow,GuessRows=&ScanRows,_varlist=VarList,

UniqueVarNameFlag=&uFlag);

%TrimStringVariabe(_Ds=OutLib.&&OutName&nf,_MaxLen=&MaxColLen);

* apply dsoptions, e.g. compression;

%if %length(&dsoptions) %then %do;

data OutLib.&&OutName&nf.&dsoptions;

set OutLib.&&OutName&nf;

run;

%end;

 

* rename if needed;

%if %length(&VarNameList) %then %do;

* extract each name from VarNameList which lists names to be replaced;

%let i = 1;

%let j = 1;

%do %while(%length(%scan(&VarNameList,&i,%str( ))));

%let tmp&i = %scan(&VarNameList,&i,%str( ));

%let hyp=%index(&&tmp&i,%str(-));

%if &hyp>0 %then %do;

%let fnumloc=%sysfunc(indexc(&&tmp&i,'0123456789'));

%let prefix=%substr(&&tmp&i,1,%eval(&fnumloc-1));

%let snumloc=%eval(%sysfunc(indexc(%substr(&&tmp&i,%eval(&hyp+1)),'0123456789'))+&hyp);

%let num1=%substr(&&tmp&i,&fnumloc,%eval(&hyp-&fnumloc));

%let num2=%substr(&&tmp&i,&snumloc);

%do k = &num1 %to &num2;

%let TmpName&j = &prefix.&k;

%let j = %eval(&j+1);

%end;

%end;

%else %do;

%let TmpName&j = &&tmp&i;

%let j = %eval(&j+1);

%end;

%let i = %eval(&i+1);

%end;

%nObsVars(OutLib.&&OutName&nf);

proc datasets lib=OutLib nolist;

modify &&OutName&nf;

%do i = 1 %to %sysfunc(min(%eval(&j-1),&nVars));

%if %scan(&VarList,&i) ne &&TmpName&i & &&TmpName&i ne . %then

rename %scan(&VarList,&i) = &&TmpName&i;;

%end;

quit;

%end;

%end;

 

%if %length(&DataStepStatements) %then %do;

data OutLib.&&OutName&nf.&dsoptions;

set OutLib.&&OutName&nf;

%unquote(&DataStepStatements);

run;

%end;

%end;

 

%NObsVars(OutLib.&&OutName&nf);

%if (&nobs=0 | &nvars=0) %then %do;

data OutLib.exitedwitherrors;

error=1;

run;

%put ERROR: Error(s) were encountered creating output data set.;

%let exiterror = 1;

%goto exit;

%end;

JSL File Generation

The Import Individual Text, CSV, or Excel Files process does not generate any JMP analyses or graphics. Because of this, there is no code for generating a JSL file. See Creating a JSL File for Dynamic Graphics and Analyses for a detailed discussion of JSL file generation.

Package Publication

This section is the same as the section in the Column Contents example, except that the %PackageInsertDataset command is used to publish the output data set.

The SAS code for the package publication is shown below:

* Packaging;

%PackageInsertDataset(data from &ProcessName,outlib,&&OutName&nf,No);

 

%end; * end of nf loop;

 

%let InData = &InName1; * to be used for the suffix in setting file;

%exit:;

%ExitSDSProcess(&OutPath, &ProcessName, &syserr, &exiterror, &clockrunning);

 

%mend DataImport;

Macro Conclusion and Invocation

The final line of the code, %DataImport, which invokes the macro that was just defined is shown below:

%DataImport;

Next: XML Code for the Import Individual Text, CSV, or Excel Files Process