read_dbf.sas
%macro read_dbf (filename, ds, keep_del=0, showflds=0);
/* Author: Richard A. DeVenezia
* Date: 18 May 1993
*
* 02/23/96 RAD change date variables to default to mmddyy10.
* format and informat;
* 10/24/96 RAD change method used to skip header portion
* (use cut -c -<hdrLen+1>)
* 4/ 9/98 RAD ensure variable names are valid SAS names
* 1/ 4/00 RAD file descriptor date of last update year item is
* 100+ for year 2000+. Mod with 100 and rely on SAS
* YEARCUTOFF option
* 11/13/00 RAD rewrite data input portion to remove reliance on UNIX cut
*/
/* Read a DBASE III(+) file into a SAS dataset
* filename - file in operating system assumed to be in .dbf format
* ds - destination SAS dataset
* keep_del - 0 to ignore deleted records, 1 to retain deleted records
*
* DBASE III structures
*
* File Descriptor:
* Ofs Len Field
* 0 1 dBASE III version number (3 ls bits)
* 1 3 date of last update (YY MM DD in pib1. format)
* 4 4 number of records (longint - pib4.)
* 8 2 length of header structure (int - pib2.)
* [header + field descriptions ]
* 10 2 length of each record (int - pib2.) [deleted flag + fields]
* 12 20 reserved
*
* Field Descriptors (32 chars each) immediately follow header
* (number of fields implied from length of header structure)
* Ofs Len Field
* 0 11 field name (variable) zero terminated
* 11 1 field type (C N F L D M)
* C - Character: Ascii
* N/F - Numeric: -.0123456789
* L - Logical: ?YyTtNnFf
* D - Date: yyyymmdd
* M - Memo: 10 digit pointer for .dbt file
* 12 4 field data address (do not care)
* 16 1 field length (short - pib1.)
* 17 1 decimals (digits to right of decimal point) (short - pib1.)
* 18 14 reserved
*
* immediately following field descriptors
* '00'x'0D'x <DATA> - dBASE III
* '0D'x <DATA> - dBASE III+
* header length field should take care of skipping over these positions
*
* <DATA>
* all data is stored in character format (i.e. no binary integer or
* floating point formats), numerics are stored in ascii representation
*/
%local notes mprint symgen;
%let notes = %sysfunc (getoption(NOTES));
%let mprint = %sysfunc (getoption(MPRINT));
%let symgen = %sysfunc (getoption(SYMBOLGEN));
options nonotes nomprint nosymbolgen;
%local filename ds keep_del showflds;
%local i type;
%if (%quote(&filename)=) or (%quote(&ds)=) %then %do;
%put usage: read_dbf(filename, ds);
%goto ByeBye;
%end;
%*
* Read File Descriptor and each Field Descriptor
*;
%local hdrLen nFields recLen numRex;
%local maxLenV; %* maximum length of SAS variable name in output dataset;
%if (&sysver >= 7)
%then %let maxLenV = 11; %* dBase field names can be 11 characters at most;
%else %let maxLenV = 8; %* SAS variable names can be 8 characters at most;
data &DS (keep=i name varname type length decimals ) ;
infile "&filename" unbuffered recfm=n;
* dBASE file descriptor (32 bytes);
input
version pib1.
u1 pib1. %* YY - Note: value is 100+ for year 2000+ ;
u2 pib1. %* MM ;
u3 pib1. %* DD ;
%*numRex pib4. (works only on Intel based SAS);
rx1 pib1.
rx2 pib1.
rx3 pib1.
rx4 pib1.
%*hdrLen pib2. (works only on Intel based SAS);
hd1 pib1.
hd2 pib1.
%*recLen pib2. (works only on Intel based SAS);
rl1 pib1.
rl2 pib1.
reserved $char20.
;
version = mod (version, 8);
if version ^= 3 then stop;
numRex = rx1 + 256 * (rx2 + 256 * (rx3 + 256 * rx4));
hdrLen = hd1 + 256 * (hd2);
recLen = rl1 + 256 * (rl2);
* u1 = mod (u1, 100);
* updated = mdy (u2, u3, u1);
* format updated date7.;
numFlds = (hdrLen - 33) / 32;
* fileSize = hdrLen + recLen*numRex + 1;
call symput ('nFields',trim(left(put(numFlds,8.))));
call symput ('hdrLen', trim(left(put(hdrLen,8.))));
call symput ('recLen', trim(left(put(recLen,8.))));
call symput ('numRex', trim(left(put(numRex,8.))));
put 'NOTE:' hdrLen= recLen= numFlds= numRex=;
* read each dBASE field descriptor (32 bytes each);
* field name will be mapped to a SAS eight character variable name;
array _var [2000] $&maxLenV _temporary_;
length varname $&maxLenV;
do i = 1 to numFlds;
input
name $CHARZB11. %* convert null chars to blanks;
type $CHAR1.
reserve1 $CHAR4.
length pib1.
decimals pib1.
reserve2 $CHAR14.
;
if type='F' then type='N';
* map first word of field name to variable name;
* change any non alphanumeric to underscore;
varname = upcase(name);
varname = scan (varname, 1, ' ');
if not ('A' <= substr (varname,1,1) <= 'Z') then
varname = '_' || varname;
do j = 1 to length (varname);
if not ('A' <= substr (varname,j,1) <= 'Z' or
'0' <= substr (varname,j,1) <= '9')
then
substr (varname,j,1) = '_';
end;
* check if this SAS varname has been used already;
found = 0;
do j = 1 to 2000 while (_var[j] ^= "" and not found);
found = ( _var[j] = varname );
end;
if not found then
_var[j]=varname;
else do;
do k = 1 to 2000 while (found);
n = left(put(k,4.));
varname = substr(varname,1,8-length(n)) || n;
found = 0;
do j = 1 to 2000 while (_var[j] ^= "" and not found);
found = ( _var[j] = varname );
end;
end;
_var[j] = varname;
end;
output;
end;
stop;
run;
%if (&syserr ^= 0) %then %goto ByeBye;
%*
* localize all macro variables that describe each variable in .dbf file;
*;
%do i = 1 %to &nFields;
%local
var&i
label&i
type&i
len&i
dec&i
;
%end;
%*
* now populate those macro variables with values
*;
data _null_;
set &ds;
n = left(put(i,8.));
call symput ('var' ||n, trim(varname));
call symput ('label'||n, trim(name));
call symput ('type' ||n, trim(type));
call symput ('len' ||n, trim(left(put(length,8.))));
call symput ('dec' ||n, trim(left(put(decimals,8.))));
run;
%if (&syserr ^= 0) %then %goto ByeBye;
%if (&showflds) %then %do;
%* report file structure;
%put &filename;
%put hdrLen=&hdrLen recLen=&recLen numRex=&numRex;
%do i=1 %to &nfields;
%put %substr(&i..%str( ),1,3) &&var&i &&label&i &&type&i &&len&i &&dec&i;
%end;
%end;
%*
* read data part of .dbf file
*;
%local type;
data &ds;
infile "&filename" unbuffered recfm=n;
attrib deleted length=$1;
%do i=1 %to &nFields;
%let type=&&type&i;
ATTRIB &&var&i label="&&label&i"
%if (&type=D) %then length=4 format=mmddyy10. informat=mmddyy10.; %else
%if (&type=L) %then length=3 format=1. ; %else
%if (&type=C) %then length=$&&len&i; %else
%if (&type=N) %then format=&&len&i...&&dec&i;
;
%end;
attrib source length=$150 label='Data source';
retain source "&filename";
input @&hdrLen deleted $CHAR1.;
do i = 1 to &numRex;
input
deleted $CHAR1.
;
%if (not &keep_del) %then %do;
if deleted ne ' ' then do;
%* since this record is tagged as deleted in the .dbf and the macro
* was invoked indicating deleted records are not to be saved,
* skip over the rest of the record;
input +(&recLen-2) dummy $CHAR1.;
drop dummy;
goto NextRcrd;
end;
%end;
input
%do i=1 %to &nFields;
%let type=&&type&i;
%if (&type=D) %then &&var&i yymmdd8. ; %else
%if (&type=L) %then l&i $CHAR1. ; %else
%if (&type=C) %then &&var&i $CHAR&&len&i...; %else
%if (&type=N) %then &&var&i &&len&i...&&dec&i; %else
%if (&type=M) %then &&var&i $CHAR&&len&i...;
%end;
;
%do i=1 %to &nfields;
%let type=&&type&i;
%if (&type=L) %then %do;
if (upcase(l&i) in ("T" "Y")) then &&var&i=1; else
if (upcase(l&i) in ("F" "N")) then &&var&i=0;
drop l&i;
%end;
%else
%if (&type=M) %then %do;
drop &&var&i;
%end;
%end;
NextRcrd:
%if (&keep_del) %then %do;
output;
%end;
%else %do;
if deleted = ' ' then output;
%end;
end;
stop;
drop deleted;
run;
%ByeBye:
options &symgen &mprint ¬es;
%mend read_dbf;