# iffstats.awk
#
#/*****************************************************************************
#                Copyright Carnegie Mellon University 1992
#
#                      All Rights Reserved
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose and without fee is hereby granted,
# provided that the above copyright notice appear in all copies and that
# both that copyright notice and this permission notice appear in
# supporting documentation, and that the name of CMU not be
# used in advertising or publicity pertaining to distribution of the
# software without specific, written prior permission.
#
# CMU DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
# CMU BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
# ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
# ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.
#*****************************************************************************/
#
#
# Gawk(1) program to print statistics of various properties of an IFF file.
#
# $Header: iffstats.awk,v 1.9 91/10/24 17:03:07 heydon Exp $
#
# Written by Allan Heydon for the Miro project at Carnegie Mellon
#
# SYNTAX
#   gawk -f iffstats.awk [ +a | +A ] [ +g | +G ] typefile donetypes=1 file
#
# SYNOPSIS
#   This awk script processes the IFF file 'file' (or the standard input if
#   'file' is given as "-") representing an instance picture and computes
#   various statistics about it, including:
#      * total number of boxes
#      * number of boxes of each role (subj, obj) and each type
#      * total number of arrows
#      * number of arrows of each type and each parity
#   These statistics are printed in an easy-to-read format once the file has
#   been processed.
#
#   If the "+a" (arrow incidence) flag is specified, then additional
#   information regarding the incidence of arrows on subj and obj boxes is
#   also produced.
#
#   If the "+g" (geometry) flag is specified, then additional information
#   regarding the geometry of the boxes is also produced. These include:
#      * average indegree, outdegree for each role (and overall)
#      * average box depth for each role (and overall)
#
#   The capitalized versions "+A" and "+G" additionally cause histograms of
#   the respective statistics to be reported.
#
# BUGS
#   The options cannot be specified with a "-" prefix because bawk
#   mysteriously gobbles any of its command-line arguments starting with "-".
#
#   There are restrictions on the input iff file.
#
# FILES
#   ~miro/bin/iffstats		script using this awk program
#
# SEE ALSO
#   fs2iff(1), iff2ciff(1), iffincidences(1)

# GLOBAL VARIABLES ============================================================
#
#   String role[type]		role (either 'subj' or 'obj') of type 'type'
#   int subj_box_total		total number of boxes with role = subj
#   int obj_box_total		total number of boxes with role = obj
#   int arrow_total		total number of arrows
#   int subj_box_cnt[type]	number of subj boxes of type 'type'
#   String subj_list		SEP_CHAR separated list of subj box sysnames
#   int obj_box_cnt[type]	number of obj boxes of type 'type'
#   String obj_list		SEP_CHAR separated list of obj box sysnames
#   void arrow_perms[type]	array indexed by all arrow types
#   int pos_arrow_cnt[type]	number of positive arrows of type 'type'
#   int neg_arrow_cnt[type]	number of negative arrows of type 'type'
#   int inc_cnt[sysname,type]	number of arrows of type 'type' incident on
#				  box 'sysname'
#   int in_degree[sysname]	in_degree of box with sysname 'sysname'
#   int out_degree[sysname]	out_degree of box with sysname 'sysname'
#   String parents[sysname]	SEP_CHAR separated list of sysnames of parent
#				  box(es) of box with sysname 'sysname'
#   int max_in_deg		max in_degree found by compute_inout_cnts()
#   int max_out_deg		max out_degree found by compute_inout_cnts()
#   int in_cnt[0-max_in_deg]	number of boxes having indegree 'i', where 'i'
#				  is the array index (see compute_inout_cnts())
#   int out_cnt[0-max_out_deg]	number of boxes having outdegree 'i', where 'i'
#				  is the array index (see compute_inout_cnts())
#   int depth[sysname]		depth of box with sysname 'sysname'; the value
#				  NO_DEPTH is used if it has not been computed

# MIN(m,n) ====================================================================
#
# RETURNS 'm' if 'm' compares less than 'n'; otherwise 'n'.
#
function min(m,n) {
  if (m < n) return m;
  return n;
}

# MAX(m,n) ====================================================================
#
# RETURNS 'm' if 'm' compares greater than 'n'; otherwise 'n'.
#
function max(m,n) {
  if (m > n) return m;
  return n;
}

# SHIFT_ARGV() ================================================================
#
# Removes ARGV[1] from the argument list: ARGV[2] through ARGV[ARGC-1] are
# each moved down one slot, ARGC is decremented, and the vacated last slot is
# deleted. Does nothing when there is no ARGV[1] (ARGC <= 1).
#
# A sole argument (ARGC == 2) is removed as well; a caller that loops while
# ARGV[1] holds an option would otherwise see the same option forever.
#
# LOCAL VARIABLES
#   int i			index of the ARGV slot being filled
#
function shift_argv(i) {
  if (ARGC > 1) {
    ARGC--;
    for (i=1; i<ARGC; i++) { ARGV[i] = ARGV[i+1]; }
    # ARGV[ARGC] now duplicates ARGV[ARGC-1] (or is the removed argument)
    delete ARGV[ARGC];
  }
}

# MARK_TYPE(type,role_name) ===================================================
#
# Records 'role_name' in role[] for 'type', then does the same, recursively,
# for every type listed in child_types['type'] (a SEP_CHAR separated list).
#
# LOCAL VARIABLES
#   int kid_cnt			number of entries in child_types['type']
#   String kids[1-kid_cnt]	those entries
#   int k			index into kids[]
#
function mark_type(type,role_name,   kid_cnt,kids,k) {
  role[type] = role_name;
  kid_cnt = split(child_types[type],kids,SEP_CHAR);
  k = 1;
  while (k <= kid_cnt) {
    mark_type(kids[k],role_name);
    k++;
  }
}

# PRINT_BOXES() ===============================================================
#
# Prints the BOXES table: a heading row of type names, one row each for subj
# and obj boxes, and a totals row. The columns are the types in subj_box_cnt[]
# followed by the types in obj_box_cnt[], then a "Total" column.
#
# The subj row prints 0 in every obj column and the obj row prints 0 in every
# subj column, so it is assumed that each box type corresponds to one role or
# the other, but not both. If there are boxes of some type for both roles, two
# columns for that type are printed, the first for subj boxes of that type
# and the second for obj boxes of that type.
#
# LOCAL VARIABLES
#   String type			current box type; declared local so that the
#				  global 'type' set by the BOX rule is not
#				  overwritten
#
function print_boxes(  type) {
  # heading row
  printf("BOXES\n\n      ");
  for (type in subj_box_cnt) { printf("  " S_Fmt,type); }
  for (type in obj_box_cnt) { printf("  " S_Fmt,type); }
  printf("  " S_Fmt "\n","Total");

  # subj row
  printf(" subj:");
  for (type in subj_box_cnt) {
    printf("  " D_Fmt,subj_box_cnt[type]);
  }
  for (type in obj_box_cnt) { printf("  " D_Fmt,0); }
  printf("  " D_Fmt "\n",subj_box_total);

  # obj row
  printf("  obj:");
  for (type in subj_box_cnt) { printf("  " D_Fmt,0); }
  for (type in obj_box_cnt) {
    printf("  " D_Fmt,obj_box_cnt[type]);
  }
  printf("  " D_Fmt "\n",obj_box_total);

  # totals row
  printf("Total:");
  for (type in subj_box_cnt) { printf("  " D_Fmt,subj_box_cnt[type]); }
  for (type in obj_box_cnt) { printf("  " D_Fmt,obj_box_cnt[type]); }
  printf("  " D_Fmt "\n",(subj_box_total+obj_box_total));
}

# PRINT_ARROW_PERMS() =========================================================
#
# Prints the heading row shared by the arrow tables: six blanks, one S_Fmt
# column per index of arrow_perms[], and a final "Total" column.
#
# LOCAL VARIABLES
#   String perm			current index of arrow_perms[]
#   String row			the heading row assembled so far
#
function print_arrow_perms(  perm,row) {
  row = "      ";
  for (perm in arrow_perms) {
    row = row "  " sprintf(S_Fmt,perm);
  }
  row = row "  " sprintf(S_Fmt,"Total");
  printf("%s\n",row);
}

# PRINT_ARROWS() ==============================================================
#
# Prints the ARROWS table: a heading row, a row of positive counts, a row of
# negative counts and a totals row, one column per index of arrow_perms[].
# Also stores the grand total in the global arrow_total.
#
# LOCAL VARIABLES
#   int n_pos			sum of pos_arrow_cnt[] over all columns
#   int n_neg			sum of neg_arrow_cnt[] over all columns
#   int by_perm[p]		pos + neg count for column 'p'
#   String p			current index of arrow_perms[]
#
function print_arrows(  n_pos,n_neg,by_perm,p) {
  printf("\nARROWS\n\n");
  print_arrow_perms();

  # positive arrows; this pass also starts the per-column totals
  printf("  pos:");
  for (p in arrow_perms) {
    printf("  " D_Fmt,pos_arrow_cnt[p]);
    by_perm[p] = pos_arrow_cnt[p];
    n_pos += pos_arrow_cnt[p];
  }
  printf("  " D_Fmt "\n",n_pos);

  # negative arrows; this pass completes the per-column totals
  printf("  neg:");
  for (p in arrow_perms) {
    printf("  " D_Fmt,neg_arrow_cnt[p]);
    by_perm[p] += neg_arrow_cnt[p];
    n_neg += neg_arrow_cnt[p];
  }
  printf("  " D_Fmt "\n",n_neg);

  # totals row
  printf("Total:");
  for (p in arrow_perms) {
    printf("  " D_Fmt,by_perm[p]);
  }
  arrow_total = n_pos + n_neg;
  printf("  " D_Fmt "\n",arrow_total);
}

# PRINT_INCIDENCE_TYPE(int flag, String box_list, int total_boxes) ============
#
# Prints the arrow-incidence table for one group of boxes: the rest of the
# title line, the heading row, the histogram rows (only if 'flag' is nonzero)
# and the "Avg" row.
#
# Each "Avg" entry is (pos_arrow_cnt[perm] + neg_arrow_cnt[perm]) divided by
# 'total_boxes', and the last entry is the global arrow_total divided by
# 'total_boxes'; arrow_total is assigned by print_arrows(). When 'total_boxes'
# is 0 there is nothing to divide by, so "none" is printed in each column
# instead of attempting the division.
#
# PARAMETERS
#   int flag				nonzero if the histogram rows are wanted
#   String box_list			SEP_CHAR separated list of box sysnames
#   int total_boxes			number of boxes to average over
#
# LOCAL VARIABLES
#   cnt					number of sysnames in box_list
#   names[1-cnt]			array of sysnames in box_list
#   max_inc				maximum incidence of boxes in names[]
#   inc					incidence of current box
#   inc_cnt_total[0-max_inc,perm]	number of boxes having incidence of
#					  first array index for arrow of 'perm'
#   box_inc_total			total arrows incident on curr box
#   any_total[inc]			total boxes whose total number of
#					  incident arrows is the array index
#   perm_total				sum of inc_cnt_total[inc,perm] over all
#					  'perm' for the current 'inc'
#   i					temporary variable
#   perm				current index of arrow_perms[]
#
function print_incidence_type(flag,box_list,total_boxes,  cnt,names,max_inc,
                              inc,inc_cnt_total,box_inc_total,any_total,
                              perm_total,i,perm) {
  printf("%s:\n\n", flag ? " Histogram" : "");
  print_arrow_perms();
  if (flag) {
    cnt = split(box_list,names,SEP_CHAR);
    max_inc = 0;
    for (i=1; i<=cnt; i++) {
      box_inc_total = 0;
      for (perm in arrow_perms) {
        inc = 0 + inc_cnt[names[i],perm];	# force cast to integer
        inc_cnt_total[inc,perm]++;
        max_inc = max(max_inc,inc);
        box_inc_total += inc;
      }
      any_total[box_inc_total]++;
      max_inc = max(max_inc,box_inc_total);
    }
    for (inc=0; inc<=max_inc; inc++) {
      perm_total = 0;
      for (perm in arrow_perms) { perm_total += inc_cnt_total[inc,perm]; }
      # skip incidence values that no box has
      if (any_total[inc] > 0 || perm_total > 0) {
        printf(" " Cnt_Fmt ":",inc);
        for (perm in arrow_perms) {
          printf("  " D_Fmt,inc_cnt_total[inc,perm]);
        }
        printf("  " D_Fmt "\n",any_total[inc]);
      }
    }
  }
  printf("  Avg:");
  for (perm in arrow_perms) {
    if (total_boxes != 0) {
      printf("  " F_Fmt, (pos_arrow_cnt[perm]+neg_arrow_cnt[perm])/total_boxes);
    } else {
      printf("  " S_Fmt, "none");
    }
  }
  if (total_boxes != 0) {
    printf("  " F_Fmt "\n\n",arrow_total/total_boxes);
  } else {
    printf("  " S_Fmt "\n\n","none");
  }
}

# PRINT_INCIDENCE(flag) =======================================================
#
# Prints the ARROW INCIDENCE section: one table for the boxes in subj_list and
# one for the boxes in obj_list. 'flag' is handed unchanged to
# print_incidence_type() for both tables.
#
function print_incidence(flag) {
  # subject table; print_incidence_type() finishes the title line
  printf("%s","\nARROW INCIDENCE\n\nSubject Boxes");
  print_incidence_type(flag,subj_list,subj_box_total);

  # object table
  printf("%s","Object Boxes");
  print_incidence_type(flag,obj_list,obj_box_total);
}

# MAKE_INT(s) =================================================================
#
# RETURNS the numeric value of 's' if 's' consists entirely of one or more
# decimal digits; 0 otherwise (including when 's' is empty or unset).
#
# The pattern is anchored at both ends: a string that merely contains a digit
# somewhere (for example "a1") is not a string of digits and yields 0.
#
function make_int(s) {
  if (s ~ /^[0-9]+$/) return s + 0;
  return 0;
}

# PRINT_QUOTIENT(a,b) =========================================================
#
# Prints a/b in F_Fmt followed by a newline, or the line "none" if 'b' is 0.
#
function print_quotient(a,b) {
  if (b == 0) {
    print "none";
    return;
  }
  printf(F_Fmt "\n",(a/b));
}

# COMPUTE_INOUT_CNTS(box_cnt,sysname) =========================================
#
# Rebuilds the global degree histograms for the given boxes:
#   max_in_deg, max_out_deg	largest in_degree[] / out_degree[] value among
#				  the boxes (-1 if 'box_cnt' is 0)
#   in_cnt[d], out_cnt[d]	number of the boxes whose in-/out-degree is 'd'
#
# Both histograms are emptied first, and index 0 is always set, so nothing
# counted by an earlier call (for a different list of boxes) remains visible
# to callers that read in_cnt[0] or out_cnt[0] -- in particular when
# 'box_cnt' is 0 and the zeroing loops below do not run at all.
#
# PARAMETERS
#   box_cnt			number of sysnames in sysname[]
#   sysname[1-box_cnt]		array of sysnames of boxes to consider
#
# LOCAL VARIABLES
#   int i			loop index
#   int val			degree of the current box as an integer
#
function compute_inout_cnts(box_cnt,sysname,  i,val) {
  # split() of an empty string deletes every element of the array
  split("",in_cnt);
  split("",out_cnt);
  in_cnt[0] = 0;
  out_cnt[0] = 0;

  max_in_deg = -1;
  max_out_deg = -1;
  for (i=1; i<=box_cnt; i++) {
    max_in_deg = max(max_in_deg,in_degree[sysname[i]]);
    max_out_deg = max(max_out_deg,out_degree[sysname[i]]);
  }
  for (i=0; i<=max_in_deg; i++)  { in_cnt[i]  = 0; }
  for (i=0; i<=max_out_deg; i++) { out_cnt[i] = 0; }
  for (i=1; i<=box_cnt; i++) {
    val = make_int(in_degree[sysname[i]]);  in_cnt[val]++;
    val = make_int(out_degree[sysname[i]]); out_cnt[val]++;
  }
}

# PRINT_DEGREES(flag,deg_type,max_deg,cnt) ====================================
#
# Print the in-/out-degree statistics (depending on 'deg_type'). If 'flag' is
# 1, a histogram line is printed for every degree with a nonzero count;
# otherwise a single "Leaves" (for IN) or "Roots" (for OUT) line giving
# cnt[0] is printed. In both cases the average follows: the weighted degree
# divided by the number of boxes whose degree is not 0 ("none" if there are
# no such boxes).
#
# RETURNS the weighted in-/out-degree, i.e. the sum of i * cnt[i].
#
# PARAMETERS
#   int flag			1 if histogram should be printed; 0 otherwise
#   int deg_type		IN or OUT
#   int max_deg			highest in-/out-degree. (Not called 'max':
#				  that is the name of a function in this file,
#				  and POSIX does not allow a parameter to share
#				  a name with a function.)
#   int cnt[0-max_deg]		number of boxes with in-/out-degree 'i', where
#				  'i' is the array index
#
# LOCAL VARIABLES
#   int i			current degree
#   int box_cnt			number of boxes counted so far
#   int result			weighted degree accumulated so far
#
function print_degrees(flag,deg_type,max_deg,cnt,  i,box_cnt,result) {
  box_cnt = 0;
  result = 0;
  if (flag==1) {
    printf("  %s-Degree Histogram\n",(deg_type==IN ? "In" : "Out"));
  }
  for (i=0; i<=max_deg; i++) {
    if (cnt[i] != 0) {
      box_cnt += cnt[i];
      result += (i * cnt[i]);
      if (flag==1) { printf(Histogram_Fmt,i,cnt[i]); }
    }
  }
  if (flag != 1) {
    printf("  %-14s = " D_Fmt "\n",(deg_type==IN ? "Leaves" : "Roots"),cnt[0]);
  }
  # boxes of degree 0 do not take part in the average
  box_cnt -= cnt[0];
  if (flag==1) {
    printf("   Avg: ");
  } else {
    printf("  %-14s = ",("Avg " (deg_type==IN ? "in" : "out") "-degree"));
  }
  print_quotient(result,box_cnt);
  if (flag==1) { printf("\n"); }
  return(result);
}

# COMPUTE_DEPTH(b) ============================================================
#
# RETURNS depth['b'], computing and storing it first if it still holds
# NO_DEPTH. A box with an empty parents[] entry gets depth 0; any other box
# gets one more than the largest depth among the boxes listed in its
# parents[] entry, each of which is computed by a recursive call.
#
# PARAMETERS
#   int b			sysname of box whose depth is to be computed
#
# LOCAL VARIABLES
#   int n_par			number of parents of this box
#   int par[1-n_par]		array of sysnames of parents of this box
#   int k			index into par[]
#   int deepest			largest parent depth seen so far
#
function compute_depth(b,  n_par,par,k,deepest) {
  # already known
  if (depth[b] != NO_DEPTH) {
    return(depth[b]);
  }

  # base case: no parents
  if (parents[b] == "") {
    depth[b] = 0;
    return(0);
  }

  # recursive case
  deepest = -1;
  n_par = split(parents[b],par,SEP_CHAR);
  for (k=1; k<=n_par; k++) {
    deepest = max(deepest,compute_depth(par[k]));
  }
  depth[b] = deepest + 1;
  return(depth[b]);
}

# PRINT_DEPTHS(flag,box_cnt,sysname) ==========================================
#
# Recomputes depth[] for the boxes in sysname[] and prints their average
# depth, preceded by a histogram of the depths if 'flag' is 1.
#
# RETURNS the weighted sum of the depths of these boxes.
#
# PARAMETERS
#   int flag			1 if histogram is to be printed; 0 otherwise
#   int box_cnt			number of boxes in sysname[]
#   int sysname[1-box_cnt]	array of box sysnames to consider
#
# LOCAL VARIABLES
#   int k			index into sysname[]
#   int d			a depth value
#   int deepest			maximum depth value found
#   int hist[0-deepest]		number of boxes of depth 'd', where 'd' is the
#				  array index
#   int weighted		weighted sum of the depths
#
function print_depths(flag,box_cnt,sysname,  k,d,deepest,hist,weighted) {
  # pass 1: mark the depth of every listed box as not yet computed
  for (k=1; k<=box_cnt; k++) { depth[sysname[k]] = NO_DEPTH; }

  # pass 2: start a depth computation from each box whose in_degree is 0
  for (k=1; k<=box_cnt; k++) {
    if (in_degree[sysname[k]] == 0) { compute_depth(sysname[k]); }
  }

  # pass 3: tally how many boxes have each depth
  deepest = -1;
  for (k=1; k<=box_cnt; k++) {
    d = depth[sysname[k]];
    hist[d]++;
    deepest = max(deepest,d);
  }

  # weighted sum, with one histogram line per depth that occurs
  if (flag==1) { printf("  Depth Histogram\n"); }
  for (d=0; d<=deepest; d++) {
    if (hist[d] == 0) { continue; }
    weighted += (d * hist[d]);
    if (flag==1) { printf(Histogram_Fmt,d,hist[d]); }
  }

  # average over all listed boxes
  if (flag != 1) {
    printf("  %-14s = ","Avg depth");
  } else {
    printf("   Avg: ");
  }
  print_quotient(weighted,box_cnt);
  printf("\n");
  return(weighted);
}

# PRINT_GEOMETRY(flag) ========================================================
#
# Prints geometry statistics; also prints histograms if flag is 1.
#
# The subject boxes are processed first and the object boxes second. Both
# passes reuse the global in_cnt[]/out_cnt[] histograms filled in by
# compute_inout_cnts(), so in_cnt[0] and out_cnt[0] are read immediately after
# each call. The "Totals" section combines the figures of the two passes.
#
# LOCAL VARIABLES
#   int subj_cnt		number of subj boxes
#   int obj_cnt			number of obj boxes
#   int names[1-_cnt]		array of subj sysnames (or obj sysnames)
#   int in_deg			weighted sum of in_degrees
#   int out_deg			weighted sum of out_degrees
#   int leaves			total number of boxes with in-degree 0
#   int roots			total number of boxes with out-degree 0
#   int depth_total		weighted sum of box depths
#
function print_geometry(flag,  subj_cnt,obj_cnt,names,in_deg,out_deg,
                        leaves,roots,depth_total) {
  printf("\nGEOMETRY\n\n");

  # subject boxes
  printf("Subject Boxes:\n\n");
  subj_cnt = split(subj_list,names,SEP_CHAR);
  compute_inout_cnts(subj_cnt,names);
  leaves = in_cnt[0];
  roots = out_cnt[0];
  in_deg = print_degrees(flag,IN,max_in_deg,in_cnt);
  out_deg = print_degrees(flag,OUT,max_out_deg,out_cnt);
  depth_total = print_depths(flag,subj_cnt,names);

  # object boxes; each figure is added to the subject figure
  printf("Object Boxes:\n\n");
  obj_cnt = split(obj_list,names,SEP_CHAR);
  compute_inout_cnts(obj_cnt,names);
  leaves += in_cnt[0];
  roots += out_cnt[0];
  in_deg += print_degrees(flag,IN,max_in_deg,in_cnt);
  out_deg += print_degrees(flag,OUT,max_out_deg,out_cnt);
  depth_total += print_depths(flag,obj_cnt,names);

  # combined figures; the degree averages leave out boxes of degree 0
  printf("Totals:\n\n");
  printf("  Leaves         = " D_Fmt "\n",leaves);
  printf("  Avg in-degree  = ");
  print_quotient(in_deg,(subj_cnt+obj_cnt-leaves));
  printf("  Roots          = " D_Fmt "\n",roots);
  printf("  Avg out-degree = ");
  print_quotient(out_deg,(subj_cnt+obj_cnt-roots));
  printf("  Avg depth      = ");
  print_quotient(depth_total,(subj_cnt+obj_cnt));
}

# BEGIN =======================================================================

# Initializes the globals, builds the output formats and strips the leading
# option flags (+a, +A, +g, +G) from ARGV so that only the file names and the
# variable assignment remain for awk to process.
BEGIN {
  # set field separator so "=", ";" separate fields
  FS = "[ \t]*[;=][ \t]*"

  # initialize global variables when possible
  donetypes = 0;
  subj_box_total = 0;
  obj_box_total = 0;
  arrow_total = 0;
  subj_list = "";
  obj_list = "";
  arrow_type_list = "";

  # define parameters to print_degrees
  IN = 2;
  OUT = 3;

  # define value to use if a box's depth has not yet been computed
  NO_DEPTH = -1;

  # define globals for output print widths
  Print_Width = 6;
  Cnt_Width = 4;
  Average_Degree_Precision = 2;
  S_Fmt = "%" Print_Width "." Print_Width "s";
  D_Fmt = "%" Print_Width "d"
  Cnt_Fmt = "%" Cnt_Width "d"
  F_Fmt = "%" Print_Width "." Average_Degree_Precision "f";
  Histogram_Fmt = "  " Cnt_Fmt ": " D_Fmt "\n";

  # define separator character for lists
  SEP_CHAR = " ";
  PERM_SEP_CHAR = "[ \t]*,[ \t]*"

  # parse command-line arguments
  #
  # An option is consumed only while another argument follows it (ARGC > 2),
  # so every pass through the loop removes one argument and the loop is
  # guaranteed to end. An option that is the last argument is left in ARGV[1]
  # and is rejected by the syntax check below, since no file follows it.
  error_found = 0;
  a_flag = 0; a_hist = 0;
  g_flag = 0; g_hist = 0;
  while (ARGC > 2) {
    if      (ARGV[1] == "+a") { a_flag = 1; }
    else if (ARGV[1] == "+A") { a_flag = 1; a_hist = 1; }
    else if (ARGV[1] == "+g") { g_flag = 1; }
    else if (ARGV[1] == "+G") { g_flag = 1; g_hist = 1; }
    else break;
    shift_argv();
  }

  # anything else that looks like an option is an error; a lone "-" is not
  # matched because at least one character must follow the "+" or "-"
  if (ARGV[1] ~ /^((\+)|(-)).+/) {
    print "SYNTAX: awk -f iffstats.awk [ +a | +A ] [ +g | +G ] file";
    error_found = 1;	# keeps the END rule from printing statistics
    exit 1;
  }
}

# BOXTYPE =====================================================================

# While donetypes is 0, each ">BOXTYPE" line contributes one type: the type is
# prepended to the child list of its supertype (if it names one), and the
# types "subject" and "object" are noted as found.
donetypes==0 && /^[ \t]*>[ \t]*BOXTYPE/ {
  supertype = "";

  # cut the ">BOXTYPE" keyword off the front of the first field
  match($1,"^[ \t]*>[ \t]*BOXTYPE[ \t]*");
  $1 = substr($1,RLENGTH+1);

  # the fields are read as pairs: attribute name, then attribute value
  i = 1;
  while (i < NF) {
    if      ($i == "supertype") { supertype = $(i+1); }
    else if ($i == "type-name") { typename = $(i+1); }
    i = i+2;
  }

  if (typename == "object") object_found = 1;
  if (typename == "subject") subject_found = 1;
  if (supertype != "") {
    child_types[supertype] = typename SEP_CHAR child_types[supertype];
  }
  next;
}

# DONETYPES ===================================================================

# Runs for the first record read while donetypes is 1 (the command line sets
# donetypes=1 between the type file and the instance file) and then disables
# itself by setting donetypes to 2. It checks that the types "subject" and
# "object" were found and assigns a role to each of them and to all of their
# descendants. There is no 'next': the record that triggered this rule is
# still offered to the rules that follow.
#
# An 'exit' inside a rule still runs the END rule, so error_found is set
# before exiting to keep END from printing statistics after the error message.
donetypes==1 {
  donetypes = 2
  if ( subject_found != 1) {
    print "iffstats: did not find type 'subject' in type file";
    error_found = 1;
    exit 1;
  }
  if ( object_found != 1) {
    print "iffstats: did not find type 'object' in type file";
    error_found = 1;
    exit 1;
  }
  mark_type("subject","subj");
  mark_type("object","obj");
}

# BOX =========================================================================

# Counts one box per ">BOX" line: the box is added to the subj or obj totals
# according to role[] of its type, and its sysname is appended to the matching
# list when arrow-incidence or geometry output was requested. A box whose type
# has no role is ignored.
#
# ">BOXTYPE" lines are excluded explicitly: they also begin with ">BOX" and
# would otherwise be counted here as a box. 'type' and 'sysname' are cleared
# first so that a line lacking one of them cannot pick up the value left by
# the previous box.
/^[ \t]*>[ \t]*BOX/ && !/^[ \t]*>[ \t]*BOXTYPE/ {
  type = "";
  sysname = "";

  # cut the ">BOX" keyword off the front of the first field
  match($1,"^[ \t]*>[ \t]*BOX[ \t]*");
  $1 = substr($1,RLENGTH+1);

  # the fields are read as pairs: attribute name, then attribute value
  for (i=1; i < NF; i = i+2) {
    if      ($i == "type")    { type = $(i+1); }
    else if ($i == "sysname") { sysname = $(i+1); }
  }
  if (role[type] == "subj") {
    subj_box_total++;
    subj_box_cnt[type]++;
    if (a_flag || g_flag) { subj_list = subj_list SEP_CHAR sysname; }
  } else if (role[type] == "obj") {
    obj_box_total++;
    obj_box_cnt[type]++;
    if (a_flag || g_flag) { obj_list = obj_list SEP_CHAR sysname; }
  }
  next;
}

# ARROW =======================================================================

# Processes one ">ARROW" line. The permissions attribute must be a list
# enclosed in braces; for every permission in it, the incidence counts of the
# 'from' and 'to' boxes are incremented, the permission is recorded in
# arrow_perms[], and the pos or neg count of that permission is incremented
# according to the parity attribute.
#
# An 'exit' inside a rule still runs the END rule, so error_found is set
# before exiting to keep END from printing statistics after the error message.
/^[ \t]*>[ \t]*ARROW/ {
  # cut the ">ARROW" keyword off the front of the first field
  match($1,"^[ \t]*>[ \t]*ARROW[ \t]*");
  $1 = substr($1,RLENGTH+1);

  # the fields are read as pairs: attribute name, then attribute value
  for (i=1; i < NF; i = i+2) {
    if      ($i == "from")        { from = $(i+1);    }
    else if ($i == "to")          { to = $(i+1);      }
    else if ($i == "parity")      { parity = $(i+1);  }
    else if ($i == "permissions") { permset = $(i+1); }
  }
  if (substr(permset,1,1) == "{" && substr(permset,length(permset)) == "}") {
    # strip the braces, then split on commas
    permset = substr(permset,2,length(permset)-2);
    perm_cnt = split(permset,perms,PERM_SEP_CHAR);
    for (i=1; i<=perm_cnt; i++) {
      perm = perms[i];
      inc_cnt[from,perm]++;
      inc_cnt[to,perm]++;
      arrow_perms[perm] = "";
      if (parity == "pos") {
        pos_arrow_cnt[perm]++;
      } else if (parity == "neg") {
        neg_arrow_cnt[perm]++;
      }
    }
  } else {
    print "Permissions string '" permset "' of improper form.";
    error_found = 1;
    exit(1);
  }
  next;
}

# INSIDE ======================================================================

# Processes one ">INSIDE" line (only when geometry output was requested). The
# children attribute must be a list enclosed in braces. The parent's in_degree
# grows by the number of children; each child's out_degree grows by one and
# the parent is appended to the child's parents[] list.
#
# The children are split on commas together with any blanks or tabs around
# them, as is done for arrow permissions, so that "{1, 2}" yields the sysnames
# "1" and "2" rather than "1" and " 2".
#
# An 'exit' inside a rule still runs the END rule, so error_found is set
# before exiting to keep END from printing statistics after the error message.
g_flag==1 && /^[ \t]*>[ \t]*INSIDE/ {
  # cut the ">INSIDE" keyword off the front of the first field
  match($1,"^[ \t]*>[ \t]*INSIDE[ \t]*");
  $1 = substr($1,RLENGTH+1);

  # the fields are read as pairs: attribute name, then attribute value
  for (i=1; i < NF; i = i+2) {
    if ($i == "parent")        { pindex = $(i+1); }
    else if ($i == "children") { clist = $(i+1);  }
  }
  if (substr(clist,1,1) == "{" && substr(clist,length(clist)) == "}") {
    clist = substr(clist,2,length(clist)-2);
    child_cnt = split(clist,children,/[ \t]*,[ \t]*/);
    in_degree[pindex] += child_cnt;
    for (i=1; i <= child_cnt; i++) {
      cindex = children[i];
      out_degree[cindex]++;
      parents[cindex] = parents[cindex] SEP_CHAR pindex;
    }
  } else {
    print "Children string '" clist "' of improper form.";
    error_found = 1;
    exit(1);
  }
  next;
}

# END =========================================================================

# Prints the report, unless error_found says that the run has failed. The box
# and arrow tables always appear; the arrow-incidence and geometry sections
# appear only if a_flag / g_flag is set, with a_hist / g_hist passed on to
# request histograms.
END {
  if (!error_found) {
    # always printed
    print_boxes();
    print_arrows();

    # printed on request
    if (a_flag) { print_incidence(a_hist); }
    if (g_flag) { print_geometry(g_hist); }
  }
}
