dm 'clear log;log off';

/* Richard A. DeVenezia
 * www.devenezia.com
 * 5/17/2004
 *
 * Sample problem of
 * http://www.devenezia.com/downloads/sas/macros/index.php?m=hash-macros
 * adapted to hash object
 *
 * mod
 *  3JUL2004 SAS birdie provides a sublime solution
 *           RAD many more comments, add array hashing for comparison
 */

/*
 * From a SESUG 2003 coffee table discussion.
 *
 * Environment:
 * Each transaction has an id (tid), a prior transaction id (pid) and an amount
 * Each id is a unique 15 digit character string
 * A transaction can be the prior transaction of only one transaction
 * A path in the transaction data is the set of transactions x,y,z,... such that
 * x.tid = y.pid and y.tid = z.pid and ...
 * Some transactions are in paths of length 1
 *
 * Task:
 * Find M (unknown) path totals amongst N (known) transactions
 *
 * Fake data construction:
 * Set N to the number of transactions
 * Create M (a random number of) paths in the transaction data
 */

options notes fullstimer;

%* number of transactions;

%let N = 20;
%let N = 1E6;

%* probability of transaction being a prior transaction;

%let pPrior = 0.65;

%* make fake data;

data _null_ (keep=tid pid amount);

  retain seed 36911001;

  length tid pid ptid $15;

  declare hash t(hashexp:10);
  t.defineKey ('tid');
  t.defineData ('tid', 'pid', 'amount');
  t.defineDone ();
  call missing (tid, pid, amount);

  do row = 1 to &N;
    * select a new unique random tid;

    do until (t.find() ne 0);
      tid = put (int (1e15 * ranuni(seed)), z15.);
    end;

    * select a new random non-zero amount;

    do until (amount);
      amount = 25 + int(12 * rannor(seed));
    end;

    * add the new tid and amount to the hash;

    pid = ' ';
    t.add ();

    * with probability pPrior, determine if this transaction
    * should be a prior transaction in a trail;

    if ranuni(seed) < &pPrior and row > 1 then do;

      * create a 'prior' link;
      * make this transaction prior to the previously added tid;
      t.replace (key:ptid, data:ptid, data:tid, data:pamount);

      * track the longest trail;
      len + 1;
      if len > maxlen then maxlen = len;
    end;
    else
      len = .;

    * info needed when setting up a 'prior' link;
    ptid = tid;
    pamount = amount;
  end;

  t.output (dataset:'transactions');

  put maxlen=;
run ;

* disorder the transactions;

proc sort data=transactions;
  by amount;
run ;

********************************************************************;
* Richards hash solution adapted from original version of
* hashing using arrays example at:
* - http://www.devenezia.com/downloads/sas/macros/index.php?m=hash-macros
* Original array example has been updated to a solution similar in
* design to the 'little birdies' solution shown below this solution
*
* In essence:
* - A paved goat path is still a goat path
* Does not hurt to understand this data step, but dont duplicate it
********************************************************************;

/*
 * Finding and summing ~350,000 paths amongst 1,000,000 transactions
 * takes about 12 seconds. (x years later... "Man, that was slow")
 */

data trail_sums_lame_algorithm (keep=tid ntids total_amount ) ;

  length tid pid $15 ;

  call missing (tid, pid, amount);

  declare hash t (dataset:'transactions', hashexp:10, ordered:'A');
  t.defineKey ('tid');
  t.defineData ('tid', 'amount', 'pid');
  t.defineDone ();

  declare hash f (hashexp:9);
  f.defineKey ('pid');
  f.defineData ('pid', 'tid');
  f.defineDone ();

  declare hiter ti('t');
  declare hiter fi('f');

  * populate a forward reference hash;
  * ti iterates t which was automatically loaded from dataset transactions,
  * the keys are iterated in ascending order;

  * a blank pid means a tid has no prior pid;
  * the f hash is configured to reverse the role of key and data with respect to h;
  * in f the key is the pid and the data the tid;
  * thus, later, a find for a specific tid in f will return the tid it is prior to;

  do rc = ti.first() by 0 while (rc eq 0);
    * pid is the key of f;
    * dont add null valued keys;
    if pid ne '' then f.add();
    rc = ti.next();
  end;

  * summarize by trail;

  do rc = ti.first() by 0 while (rc eq 0);

    * a transaction will have been tagged as processed
    * if it occured in a trail that was previously traversed;

    if pid ne 'PROCESSED' then do;

      * follow pids to first tid in trail;
      * each transactions data is the key of a prior transaction;

      do rc = t.find() by 0 while (pid ne ' ');
        tid = pid;
        t.find ();
      end;

      * now at first transaction in a trail, start the summary and
      * traverse trail using forward hash;
      * attempting to find the next transaction in a trail
      * when at the last transaction will cause f.find() to be non-zero;

      * put / tid= amount=; * debug;

      ntids = 1;
      total_amount = amount ;

      pid = 'PROCESSED';
      t.replace();

      pid = tid;

      do while (f.find() eq 0);
        ntids + 1;

        t.find ();
        total_amount + amount;

        * put tid= amount= total_amount=; * debug;

        pid = 'PROCESSED';
        t.replace();

        pid = tid;
      end;

      output;

    end;

    rc = ti.next();
  end;
run ;



********************************************************************;
* A little birdies hash (JS);
* Sublime, sweet and oh so fast.  A balanced separation of content and context
********************************************************************;

/*
 * Finding and summing ~350,000 paths amongst 1,000,000 transactions
 * takes about 4 seconds.
 */

/* Different program to compute trail_sums */
data trail_sums_fast_algorithm;
  keep tid ntids total_amount;

  /* Create START and TRANS hash tables */
  dcl hash start(hashexp:9);
  start.defineKey('tid');
  start.defineData('tid', 'amount');
  start.defineDone();

  dcl hash trans(hashexp:10);
  trans.defineKey('pid');
  trans.defineData('tid', 'pid', 'amount');
  trans.defineDone();

  /* Load a transaction into one of the hash tables.
   * If a starting transaction, add to START.
   * Otherwise, add to TRANS.
   */

  /* RAD:
   * TRANS key and data reverses role of tid and pid,
   * making it a forward linking structure.
   * START is a list of head of trail nodes
   */

  do while (not done);
    set transactions end=done;
    if missing(pid)
      then start.add();
      else trans.add();
  end;

  /* Iterate through START and
   * follow the path through TRANS.  Build an output
   * data set that contains trail summary information.
   */
  dcl hiter startIter('start');

  rc = startIter.first();
  do while (rc = 0);
    total_amount = amount;
    ntids = 1;

    do while (trans.find(key:tid) = 0);
      total_amount + amount;
      ntids + 1;
    end;

    output;
    rc = startIter.next();
  end;

  stop;
run;


********************************************************************;
* A little birdies hash adapted to hashing arrays;
* And oh, Oh, OH, so fast
********************************************************************;

/*
 * Finding and summing ~350,000 paths amongst 1,000,000 transactions
 * takes about 2.5 seconds.
 */

* good security means dont run code from the net blindly;
* remove the \\extreme\macros filename
* - if you trust my source (that is run from the internet)
*   to not hose you with a worm or virus or worse;
* - if you dont trust me, good!
*   download hash-macros.sas to a local path, study it and
*   alter \\extreme\macros to match the download path;

options nosource2;
filename trusted "http://www.devenezia.com:80/downloads/sas/macros/download.php?file=hash-macros.sas";
filename trusted "\\extreme\macros\hash-macros.sas";
%include trusted;
filename trusted;


%global hash_size;

%let hash_size = %hashSize (load=0.75, data=transactions);

data trail_sums_arry_algorithm (keep=t_id ntids total_amount rename=(t_id=tid));

 %* two hashes will allow forward lookup given only backlinks;
 %* different hashing policies only for testing purposes;

  %declareHash (start, &hash_size, policy=CL)
  %declareHashKey  (start, tid)
  %declareHashData (start, amount)

  %declareHash (next, &hash_size, policy=LP)
  %declareHashKey  (next, pid)
  %declareHashData (next, tid)
  %declareHashData (next, amount)

  do until (endOfTransactions);
    set transactions
        ( keep=tid pid amount
          rename=(tid=t_id pid=p_id)
        )
        end=endOfTransactions;

    %* transform key variables from character to numeric;

    tid = input (t_id, 15.);
    pid = input (p_id, 15.);

    if p_id = '' then do;
      %hashAddKey (start, rc=rc)

      if (rc < 0) then do;
        put "ERROR: " tid= "is not unique";
        stop;
      end;

      %hashSetData (start, amount)
    end;
    else do;
      %* hash pid to allow forward lookup;

      %hashAddKey (next, rc=rc)

      if (rc < 0) then do;
        put "ERROR: " pid= "is not unique";
        stop;
      end;

      %hashSetData (next, tid)
      %hashSetData (next, amount)
    end;
  end;

%*-----;

  * summarize by trail;

  do key_addr = 1 to &hash_size;

    tid = %keyArray (start)(key_addr);

    * keyArray(tid)(key_addr) = . means no tid was ever hashed to key_addr;
    * but every tid was hashed, so . means there is
    * no tid that corresponds to key_addr and thus key_addr can be skipped;

    if tid eq . then continue;

    * sum amount while following the trail;

    total_amount = %dataArray (start,amount)(key_addr);
    ntids = 1;

    do until (0);
      * check if tid is also pid;
      pid = tid;
      %hashFetch (next, rc=rc)
      if rc < 0 then leave;

      total_amount + %hashGetData (next, amount);
      ntids + 1;
      tid = %hashGetData (next, tid);
    end;

    t_id = put (tid,z15.);

    output;

  end;

  stop;

  format tid pid z15.;
run;



********************************************************************;
* Do all the algorithms agree ?
********************************************************************;

ods html file="%sysfunc(pathname(WORK))/trail_sums.html" style=sasweb;

%let by = descending ntids descending total_amount tid;

proc sort data=trail_sums_lame_algorithm; by &by;
proc sort data=trail_sums_fast_algorithm; by &by;
proc sort data=trail_sums_arry_algorithm; by &by;
run;

proc compare
  briefsummary
  compare=trail_sums_fast_algorithm
     base=trail_sums_lame_algorithm
;
  title "Comparison of two trail summarizing techniques";
run;

proc compare
  briefsummary
  compare=trail_sums_fast_algorithm
     base=trail_sums_arry_algorithm
;
  title "Comparison of two trail summarizing techniques";
run;



proc print label data=trail_sums_fast_algorithm (obs=10);
  title "Summary of some trails";

  label
    tid='Most recent tid of trail'
    ntids='Number of transactions in trail'
  ;
run;

ods html close;

dm 'log on';