Document AWK source

author Johannes Truschnigg <johannes@truschnigg.info>

Fri, 18 Mar 2022 17:15:26 +0000 (18:15 +0100)

committer Johannes Truschnigg <johannes@truschnigg.info>

Fri, 18 Mar 2022 17:15:26 +0000 (18:15 +0100)
author Johannes Truschnigg <johannes@truschnigg.info>
Fri, 18 Mar 2022 17:15:26 +0000 (18:15 +0100)
committer Johannes Truschnigg <johannes@truschnigg.info>
Fri, 18 Mar 2022 17:15:26 +0000 (18:15 +0100)
diff --git a/__lagdetect.awk b/__lagdetect.awk

index 1366bead431549a6b7af36aa0c61d7c04dbd7f6e..4eb6530eaf69410ca9c8d5dce7f0e7307c9725e6 100644 (file)
--- a/__lagdetect.awk
+++ b/__lagdetect.awk
@@ -18,7 +18,7 @@
  # latency reported by fping(8). It is useless on its own, and should be started
  # by the accompanying shellscript wrapper found in the same directory.
  #
-# This is work in progresa and has severe limitations. Latency upswings that
+# This is work in progress and has severe limitations. Latency upswings that
  # are NOT caused by bufferbloat cannot be told apart from those that are caused
  # by it, but hopefully will be in a future version. Presently, only downstream
  # bandwidth is being shaped.
@@ -84,6 +84,9 @@ BEGIN {
  }
  
  
+# Compute a sensible number of steps between the MIN and MAX bandwidth
+# boundaries configured.
+# XXX TODO impl. tunable for acting more fine-grained?
  function slice_bw_window_rx(bw_lower_bound_rx, bw_upper_bound_rx) {
    bw_bound_delta_rx = (bw_upper_bound_rx - bw_lower_bound_rx)
    bw_steps_rx = 1+int((log(bw_bound_delta_rx)/log(10)) ^ 1.6)
@@ -122,8 +125,11 @@ function get_time() {
  }
  
  
+# Update bandwidth estimates from interface statistics. Ticks on each latency
+# record, but will throttle if these flow in too fast.
  function update_bw() {
    # If the last update happened rather recently, do not compute new bw stats.
+  # XXX TODO do we need to snapshot ts here?
    if (! (update_ts())) {
      #print "# " ts " too fast, skiping bw updates " ts - ts_old
      return
@@ -138,10 +144,13 @@ function update_bw() {
    getline rx < STATS_RX
    close(STATS_RX)
  
+  # Convert bytes to Kbit, compute rates over ts_delta (given in seconds)
    rx_delta=int(((rx - rx_old) / 128))
    rx_rate=int((rx_delta / ts_delta))
    tx_delta=int(((tx - tx_old) / 128))
    tx_rate=int((tx_delta / ts_delta))
+
+  # Record max. rates in each direction
    if (tx_rate_max < tx_rate) {
      tx_rate_max = tx_rate
      print "# " ts " new peak tx_rate=" tx_rate_max
@@ -153,16 +162,25 @@ function update_bw() {
  }
  
  
+# Called after parsing each latency record in the fping stream.
  function process_record(pn, bytes, lat) {
+  # If fping happens to pass in a latency record with a peername that was not
+  # announced upon startup, we exit here. Should never happen, but provides
+  # SOME protection against fping changing record format in a potential future
+  # version.
    if (! (pn in slotindex)) {
      print "FATAL: BOGUS PEER: " pn
      exit 1
    }
  
+  # Timeouts shall not affect latency averages, but will trigger their own kind
+  # of event if need be (SQ bw collapse).
    if (! (bytes == 0 && lat == 9999)) {
      update_pingstats(pn, lat)
    }
  
+  # Only factor peer latency data into SQM decisions once enough data has been
+  # collected.
    if (have_baseline[pn]) {
      adjust_sqm(pn, lat)
    }
@@ -172,16 +190,19 @@ function process_record(pn, bytes, lat) {
  }
  
  
+# The main policy engine that decides in which direction cake-shaped bandwidth
+# shall be adjusted.
  function adjust_sqm(peername, latency) {
    adjust_ts_delta = (ts - adjust_old)
  
-  # do not try to set bw more than once a second
+  # Do not try to adjust bw more than once a second.
    if (adjust_ts_delta < 1.0) {
      # print "# " ts " looping too fast, skipping SQM update adjust_ts_delta=" adjust_ts_delta
      return
    }
  
-  # Assume links are clogged, so we short-circuit a bandwidth collapse
+  # Having seen too many timeouts in a row we assume links are clogged, so we
+  # short-circuit a bandwidth collapse to the lowest step.
    if (consec_timeouts > ((1.5 * peer_count) + 2)) {
      print "# " ts " --- TIMEOUT-caused bw collapse triggered"
      # print "# " ts " too many TIMEOUTS in a row, decreasing SQM bw"
@@ -237,17 +258,21 @@ function adjust_sqm(peername, latency) {
      }
      not_increasing_count[peername]++
    } else {
+    # Here, we are undecided about any latency trends, and just stay at the
+    # current bw step.
      # peer_stats_print(peername, "# adjst_sqm noop")
    }
  }
  
  
+# Helper function to print "interesting" per-peer data
  function peer_stats_print(pn, affix) {
    printf("# %-12.1f %12s lat=%4.1f plat=%4.1f pplat=%4.1f min=%4.1f avg=%04.1f %s\n",
           ts, pn, lat, ping_prev[pn], ping_pprev[pn], ping_min[pn], ping_avgs[pn], affix)
  }
  
  
+# Update and conditionally compute averages over per-peer latency records.
  function update_pingstats(peername, latency) {
    slotindex[peername]++
    pingslot = peername ":" slotindex[peername]
@@ -258,16 +283,17 @@ function update_pingstats(peername, latency) {
        print "# READY PEER: " peername
      }
      slotindex[peername] = 0
+    # Compute a new average whenever we complete a cycle over all configured
+    # PINGSLOTS. XXX TODO maybe change this to a better moving average?
      update_ping_avgs(peername)
-    # print peername " avg " ping_avgs[peername] " cur " latency
    }
-  if (latency < ping_min[peername] && latency > FUDGE) {
+  if (latency < ping_min[peername] && latency > FUDGE) { # XXX TODO what about too high static FUDGE here?
      ping_min[peername] = latency
-    # print peername " lowest " latency
    }
  }
  
  
+# Simple helper function to compute the average over a peer's PINGSLOTS.
  function update_ping_avgs(peername) {
    ping_sum = 0
    for (k = 0; k <= PINGSLOTS; ++k) {
@@ -279,6 +305,7 @@ function update_ping_avgs(peername) {
  }
  
  
+# Shell out to `tc` to update cake bandwidth settings
  function set_bw(dev, kbit) {
    system("set -x; tc qdisc change root dev " dev " cake bandwidth " kbit "Kbit")
  }
author	Johannes Truschnigg <johannes@truschnigg.info>
	Fri, 18 Mar 2022 17:15:26 +0000 (18:15 +0100)
committer	Johannes Truschnigg <johannes@truschnigg.info>
	Fri, 18 Mar 2022 17:15:26 +0000 (18:15 +0100)