From f4a1c18dd1d134d0be9364ccbab8948ea40d0736 Mon Sep 17 00:00:00 2001
From: Johannes Truschnigg <johannes@truschnigg.info>
Date: Fri, 18 Mar 2022 18:15:26 +0100
Subject: [PATCH] Document AWK source

---
 __lagdetect.awk | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/__lagdetect.awk b/__lagdetect.awk
index 1366bea..4eb6530 100644
--- a/__lagdetect.awk
+++ b/__lagdetect.awk
@@ -18,7 +18,7 @@
 # latency reported by fping(8). It is useless on its own, and should be started
 # by the accompanying shellscript wrapper found in the same directory.
 #
-# This is work in progresa and has severe limitations. Latency upswings that
+# This is work in progress and has severe limitations. Latency upswings that
 # are NOT caused by bufferbloat cannot be told apart from those that are caused
 # by it, but hopefully will be in a future version. Presently, only downstream
 # bandwidth is being shaped.
@@ -84,6 +84,9 @@ BEGIN {
 }
 
 
+# Compute a sensible number of steps between the MIN and MAX bandwidth
+# boundaries configured.
+# XXX TODO impl. tunable for acting more fine-grained?
 function slice_bw_window_rx(bw_lower_bound_rx, bw_upper_bound_rx) {
   bw_bound_delta_rx = (bw_upper_bound_rx - bw_lower_bound_rx)
   bw_steps_rx = 1+int((log(bw_bound_delta_rx)/log(10)) ^ 1.6)
@@ -122,8 +125,11 @@ function get_time() {
 }
 
 
+# Update bandwidth estimates from interface statistics. Ticks on each latency
+# record, but will throttle if these flow in too fast.
 function update_bw() {
   # If the last update happened rather recently, do not compute new bw stats.
+  # XXX TODO do we need to snapshot ts here?
   if (! (update_ts())) {
     #print "# " ts " too fast, skiping bw updates " ts - ts_old
     return
@@ -138,10 +144,13 @@ function update_bw() {
   getline rx < STATS_RX
   close(STATS_RX)
 
+  # Convert bytes to Kbit, compute rates over ts_delta (given in seconds)
   rx_delta=int(((rx - rx_old) / 128))
   rx_rate=int((rx_delta / ts_delta))
   tx_delta=int(((tx - tx_old) / 128))
   tx_rate=int((tx_delta / ts_delta))
+
+  # Record max. rates in each direction
   if (tx_rate_max < tx_rate) {
     tx_rate_max = tx_rate
     print "# " ts " new peak tx_rate=" tx_rate_max
@@ -153,16 +162,25 @@ function update_bw() {
 }
 
 
+# Called after parsing each latency record in the fping stream.
 function process_record(pn, bytes, lat) {
+  # If fping happens to pass in a latency record with a peername that was not
+  # announced upon startup, we exit here. Should never happen, but provides
+  # SOME protection against fping changing record format in a potential future
+  # version.
   if (! (pn in slotindex)) {
     print "FATAL: BOGUS PEER: " pn
     exit 1
   }
 
+  # Timeouts shall not affect latency averages, but will trigger their own kind
+  # of event if need be (SQ bw collapse).
   if (! (bytes == 0 && lat == 9999)) {
     update_pingstats(pn, lat)
   }
 
+  # Only factor peer latency data into SQM decisions once enough data has been
+  # collected.
   if (have_baseline[pn]) {
     adjust_sqm(pn, lat)
   }
@@ -172,16 +190,19 @@ function process_record(pn, bytes, lat) {
 }
 
 
+# The main policy engine that decides which direction cake-shaped bandwidth
+# shall be adjusted.
 function adjust_sqm(peername, latency) {
   adjust_ts_delta = (ts - adjust_old)
 
-  # do not try to set bw more than once a second
+  # Do not try to adjust bw more than once a second.
   if (adjust_ts_delta < 1.0) {
     # print "# " ts " looping too fast, skipping SQM update adjust_ts_delta=" adjust_ts_delta
     return
   }
 
-  # Assume links are clogged, so we short-circuit a bandwidth collapse
+  # Having seen too many timeouts in a row we assume links are clogged, so we
+  # short-circuit a bandwidth collapse to the lowest step.
   if (consec_timeouts > ((1.5 * peer_count) + 2)) {
     print "# " ts " --- TIMEOUT-caused bw collapse triggered"
     # print "# " ts " too many TIMEOUTS in a row, decreasing SQM bw"
@@ -237,17 +258,21 @@ function adjust_sqm(peername, latency) {
     }
     not_increasing_count[peername]++
   } else {
+    # Here, we are undecided about any latency trends, and just stay at the
+    # current bw step.
     # peer_stats_print(peername, "# adjst_sqm noop")
   }
 }
 
 
+# Helper function to print "interesting" per-peer data
 function peer_stats_print(pn, affix) {
   printf("# %-12.1f %12s lat=%4.1f plat=%4.1f pplat=%4.1f min=%4.1f avg=%04.1f %s\n",
          ts, pn, lat, ping_prev[pn], ping_pprev[pn], ping_min[pn], ping_avgs[pn], affix)
 }
 
 
+# Update and conditionally compute averages over per-peer latency records.
 function update_pingstats(peername, latency) {
   slotindex[peername]++
   pingslot = peername ":" slotindex[peername]
@@ -258,16 +283,17 @@ function update_pingstats(peername, latency) {
       print "# READY PEER: " peername
     }
     slotindex[peername] = 0
+    # Compute a new average whenever we complete a cycle over all configured
+    # PINGSLOTS. XXX TODO maybe change this to a better moving average?
     update_ping_avgs(peername)
-    # print peername " avg " ping_avgs[peername] " cur " latency
   }
-  if (latency < ping_min[peername] && latency > FUDGE) {
+  if (latency < ping_min[peername] && latency > FUDGE) { # XXX TODO what about too high static FUDGE here?
     ping_min[peername] = latency
-    # print peername " lowest " latency
   }
 }
 
 
+# Simple helper function to compute the average over a peer's PINGSLOTS.
 function update_ping_avgs(peername) {
   ping_sum = 0
   for (k = 0; k <= PINGSLOTS; ++k) {
@@ -279,6 +305,7 @@ function update_ping_avgs(peername) {
 }
 
 
+# Shell out to `tc` to update cake bandwidth settings
 function set_bw(dev, kbit) {
   system("set -x; tc qdisc change root dev " dev " cake bandwidth " kbit "Kbit")
 }
-- 
2.39.5