dm 'clear log;log off'; /* Richard A. DeVenezia * www.devenezia.com * 5/17/2004 * * Sample problem of * http://www.devenezia.com/downloads/sas/macros/index.php?m=hash-macros * adapted to hash object * * mod * 3JUL2004 SAS birdie provides a sublime solution * RAD many more comments, add array hashing for comparison */ /* * From a SESUG 2003 coffee table discussion. * * Environment: * Each transaction has an id (tid), a prior transaction id (pid) and an amount * Each id is a unique 15 digit character string * A transaction can be the prior transaction of only one transaction * A path in the transaction data is the set of transactions x,y,z,... such that * x.tid = y.pid and y.tid = z.pid and ... * Some transactions are in paths of length 1 * * Task: * Find M (unknown) path totals amongst N (known) transactions * * Fake data construction: * Set N to the number of transactions * Create M (a random number of) paths in the transaction data */ options notes fullstimer; %* number of transactions; %let N = 20; %let N = 1E6; %* probability of transaction being a prior transaction; %let pPrior = 0.65; %* make fake data; data _null_ (keep=tid pid amount); retain seed 36911001; length tid pid ptid $15; declare hash t(hashexp:10); t.defineKey ('tid'); t.defineData ('tid', 'pid', 'amount'); t.defineDone (); call missing (tid, pid, amount); do row = 1 to &N; * select a new unique random tid; do until (t.find() ne 0); tid = put (int (1e15 * ranuni(seed)), z15.); end; * select a new random non-zero amount; do until (amount); amount = 25 + int(12 * rannor(seed)); end; * add the new tid and amount to the hash; pid = ' '; t.add (); * with probability pPrior, determine if this transaction * should be a prior transaction in a trail; if ranuni(seed) < &pPrior and row > 1 then do; * create a 'prior' link; * make this transaction prior to the previously added tid; t.replace (key:ptid, data:ptid, data:tid, data:pamount); * track the longest trail; len + 1; if len > maxlen then maxlen = len; end; else len = .; * info needed when setting up a 'prior' link; ptid = tid; pamount = amount; end; t.output (dataset:'transactions'); put maxlen=; run ; * disorder the transactions; proc sort data=transactions; by amount; run ; ********************************************************************; * Richards hash solution adapted from original version of * hashing using arrays example at: * - http://www.devenezia.com/downloads/sas/macros/index.php?m=hash-macros * Original array example has been updated to a solution similar in * design to the 'little birdies' solution shown below this solution * * In essence: * - A paved goat path is still a goat path * Does not hurt to understand this data step, but dont duplicate it ********************************************************************; /* * Finding and summing ~350,000 paths amongst 1,000,000 transactions * takes about 12 seconds. (x years later... "Man, that was slow") */ data trail_sums_lame_algorithm (keep=tid ntids total_amount ) ; length tid pid $15 ; call missing (tid, pid, amount); declare hash t (dataset:'transactions', hashexp:10, ordered:'A'); t.defineKey ('tid'); t.defineData ('tid', 'amount', 'pid'); t.defineDone (); declare hash f (hashexp:9); f.defineKey ('pid'); f.defineData ('pid', 'tid'); f.defineDone (); declare hiter ti('t'); declare hiter fi('f'); * populate a forward reference hash; * ti iterates t which was automatically loaded from dataset transactions, * the keys are iterated in ascending order; * a blank pid means a tid has no prior pid; * the f hash is configured to reverse the role of key and data with respect to h; * in f the key is the pid and the data the tid; * thus, later, a find for a specific tid in f will return the tid it is prior to; do rc = ti.first() by 0 while (rc eq 0); * pid is the key of f; * dont add null valued keys; if pid ne '' then f.add(); rc = ti.next(); end; * summarize by trail; do rc = ti.first() by 0 while (rc eq 0); * a transaction will have been tagged as processed * if it occured in a trail that was previously traversed; if pid ne 'PROCESSED' then do; * follow pids to first tid in trail; * each transactions data is the key of a prior transaction; do rc = t.find() by 0 while (pid ne ' '); tid = pid; t.find (); end; * now at first transaction in a trail, start the summary and * traverse trail using forward hash; * attempting to find the next transaction in a trail * when at the last transaction will cause f.find() to be non-zero; * put / tid= amount=; * debug; ntids = 1; total_amount = amount ; pid = 'PROCESSED'; t.replace(); pid = tid; do while (f.find() eq 0); ntids + 1; t.find (); total_amount + amount; * put tid= amount= total_amount=; * debug; pid = 'PROCESSED'; t.replace(); pid = tid; end; output; end; rc = ti.next(); end; run ; ********************************************************************; * A little birdies hash (JS); * Sublime, sweet and oh so fast. A balanced separation of content and context ********************************************************************; /* * Finding and summing ~350,000 paths amongst 1,000,000 transactions * takes about 4 seconds. */ /* Different program to compute trail_sums */ data trail_sums_fast_algorithm; keep tid ntids total_amount; /* Create START and TRANS hash tables */ dcl hash start(hashexp:9); start.defineKey('tid'); start.defineData('tid', 'amount'); start.defineDone(); dcl hash trans(hashexp:10); trans.defineKey('pid'); trans.defineData('tid', 'pid', 'amount'); trans.defineDone(); /* Load a transaction into one of the hash tables. * If a starting transaction, add to START. * Otherwise, add to TRANS. */ /* RAD: * TRANS key and data reverses role of tid and pid, * making it a forward linking structure. * START is a list of head of trail nodes */ do while (not done); set transactions end=done; if missing(pid) then start.add(); else trans.add(); end; /* Iterate through START and * follow the path through TRANS. Build an output * data set that contains trail summary information. */ dcl hiter startIter('start'); rc = startIter.first(); do while (rc = 0); total_amount = amount; ntids = 1; do while (trans.find(key:tid) = 0); total_amount + amount; ntids + 1; end; output; rc = startIter.next(); end; stop; run; ********************************************************************; * A little birdies hash adapted to hashing arrays; * And oh, Oh, OH, so fast ********************************************************************; /* * Finding and summing ~350,000 paths amongst 1,000,000 transactions * takes about 2.5 seconds. */ * good security means dont run code from the net blindly; * remove the \\extreme\macros filename * - if you trust my source (that is run from the internet) * to not hose you with a worm or virus or worse; * - if you dont trust me, good! * download hash-macros.sas to a local path, study it and * alter \\extreme\macros to match the download path; options nosource2; filename trusted "http://www.devenezia.com:80/downloads/sas/macros/download.php?file=hash-macros.sas"; filename trusted "\\extreme\macros\hash-macros.sas"; %include trusted; filename trusted; %global hash_size; %let hash_size = %hashSize (load=0.75, data=transactions); data trail_sums_arry_algorithm (keep=t_id ntids total_amount rename=(t_id=tid)); %* two hashes will allow forward lookup given only backlinks; %* different hashing policies only for testing purposes; %declareHash (start, &hash_size, policy=CL) %declareHashKey (start, tid) %declareHashData (start, amount) %declareHash (next, &hash_size, policy=LP) %declareHashKey (next, pid) %declareHashData (next, tid) %declareHashData (next, amount) do until (endOfTransactions); set transactions ( keep=tid pid amount rename=(tid=t_id pid=p_id) ) end=endOfTransactions; %* transform key variables from character to numeric; tid = input (t_id, 15.); pid = input (p_id, 15.); if p_id = '' then do; %hashAddKey (start, rc=rc) if (rc < 0) then do; put "ERROR: " tid= "is not unique"; stop; end; %hashSetData (start, amount) end; else do; %* hash pid to allow forward lookup; %hashAddKey (next, rc=rc) if (rc < 0) then do; put "ERROR: " pid= "is not unique"; stop; end; %hashSetData (next, tid) %hashSetData (next, amount) end; end; %*-----; * summarize by trail; do key_addr = 1 to &hash_size; tid = %keyArray (start)(key_addr); * keyArray(tid)(key_addr) = . means no tid was ever hashed to key_addr; * but every tid was hashed, so . means there is * no tid that corresponds to key_addr and thus key_addr can be skipped; if tid eq . then continue; * sum amount while following the trail; total_amount = %dataArray (start,amount)(key_addr); ntids = 1; do until (0); * check if tid is also pid; pid = tid; %hashFetch (next, rc=rc) if rc < 0 then leave; total_amount + %hashGetData (next, amount); ntids + 1; tid = %hashGetData (next, tid); end; t_id = put (tid,z15.); output; end; stop; format tid pid z15.; run; ********************************************************************; * Do all the algorithms agree ? ********************************************************************; ods html file="%sysfunc(pathname(WORK))/trail_sums.html" style=sasweb; %let by = descending ntids descending total_amount tid; proc sort data=trail_sums_lame_algorithm; by &by; proc sort data=trail_sums_fast_algorithm; by &by; proc sort data=trail_sums_arry_algorithm; by &by; run; proc compare briefsummary compare=trail_sums_fast_algorithm base=trail_sums_lame_algorithm ; title "Comparison of two trail summarizing techniques"; run; proc compare briefsummary compare=trail_sums_fast_algorithm base=trail_sums_arry_algorithm ; title "Comparison of two trail summarizing techniques"; run; proc print label data=trail_sums_fast_algorithm (obs=10); title "Summary of some trails"; label tid='Most recent tid of trail' ntids='Number of transactions in trail' ; run; ods html close; dm 'log on';