#!/usr/bin/env bash
#
# Test awk vs Python speed.
#
# On this hash table benchmark, Python is maybe 10% slower than gawk.  mawk is
# twice is fast as gawk (and bwk).
#
# Python has much more functionality, so it's not exactly a fair comparison,
# but it's instructive.
#
# Update: simply adding tolower() makes gawk much slower than Python (555 ms
# vs. 280 ms), and mawk is still much faster at 138 ms.
#
# Mawk is known to be fast?  Faster than Java on this benchmark.
# https://brenocon.com/blog/2009/09/dont-mawk-awk-the-fastest-and-most-elegant-big-data-munging-language/
#
# Usage:
#   ./awk-python.sh <function name>

set -o nounset
set -o pipefail
set -o errexit

readonly FILES=(../*.sh ../*/*.sh ../*.py ../*/*.py ../*/*/*.py)

# Test out hash table implementations
# mawk is faster: 77ms vs 155ms for 10 iterations.
proc test-awk {
  for awk in [gawk mawk ~/git/bwk/bwk] {
    echo ---
    echo $awk
    echo ---
    time for i in []{1..10} {
      $awk '
      { 
        line = tolower($0)
        num_lines += 1

        # NOTE: gawk has length(); mawk does not
        if (!(line in unique)) {
          num_unique += 1
        }
        unique[line] += 1
      }
      END {
        print "unique lines: " num_unique
        print "total lines: " num_lines
      }
      ' $(FILES[@])

    }
  }
}

# Python VM is slower: 160-170 ms.  Oops.
#
# Well Python has more general dictionaries -- they take more than strings.
proc test-python {
  time for i in []{1..10} {
    python -S -c '
import collections
import sys

num_lines = 0
num_unique = 0
unique = collections.defaultdict(int)

for path in sys.argv[1:]:
  with open(path) as f:
    for line in f:
      line = line.lower()
      num_lines += 1

      if line not in unique:
        num_unique += 1
      unique[line] += 1

print "unique lines: ", num_unique
print "total lines: ", num_lines
      ' $(FILES[@])

  }
}

# Only 10-30 ms.  We are doing real work.
proc test-wc {
  time for i in []{1..10} {
    cat $(FILES[@]) | wc -c
  }
}

proc files {
  echo $(FILES[@])
  echo "$(#FILES[@]) files"
}

@Argv