From f4a1c18dd1d134d0be9364ccbab8948ea40d0736 Mon Sep 17 00:00:00 2001 From: Johannes Truschnigg Date: Fri, 18 Mar 2022 18:15:26 +0100 Subject: [PATCH] Document AWK source --- __lagdetect.awk | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/__lagdetect.awk b/__lagdetect.awk index 1366bea..4eb6530 100644 --- a/__lagdetect.awk +++ b/__lagdetect.awk @@ -18,7 +18,7 @@ # latency reported by fping(8). It is useless on its own, and should be started # by the accompanying shellscript wrapper found in the same directory. # -# This is work in progresa and has severe limitations. Latency upswings that +# This is work in progress and has severe limitations. Latency upswings that # are NOT caused by bufferbloat cannot be told apart from those that are caused # by it, but hopefully will be in a future version. Presently, only downstream # bandwidth is being shaped. @@ -84,6 +84,9 @@ BEGIN { } +# Compute a sensible number of steps between the MIN and MAX bandwidth +# boundaries configured. +# XXX TODO impl. tunable for acting more fine-grained? function slice_bw_window_rx(bw_lower_bound_rx, bw_upper_bound_rx) { bw_bound_delta_rx = (bw_upper_bound_rx - bw_lower_bound_rx) bw_steps_rx = 1+int((log(bw_bound_delta_rx)/log(10)) ^ 1.6) @@ -122,8 +125,11 @@ function get_time() { } +# Update bandwidth estimates from interface statistics. Ticks on each latency +# record, but will throttle if these flow in too fast. function update_bw() { # If the last update happened rather recently, do not compute new bw stats. + # XXX TODO do we need to snapshot ts here? if (! 
(update_ts())) { #print "# " ts " too fast, skiping bw updates " ts - ts_old return @@ -138,10 +144,13 @@ function update_bw() { getline rx < STATS_RX close(STATS_RX) + # Convert bytes to Kbit, compute rates over ts_delta (given in seconds) rx_delta=int(((rx - rx_old) / 128)) rx_rate=int((rx_delta / ts_delta)) tx_delta=int(((tx - tx_old) / 128)) tx_rate=int((tx_delta / ts_delta)) + + # Record max. rates in each direction if (tx_rate_max < tx_rate) { tx_rate_max = tx_rate print "# " ts " new peak tx_rate=" tx_rate_max @@ -153,16 +162,25 @@ function update_bw() { } +# Called after parsing each latency record in the fping stream. function process_record(pn, bytes, lat) { + # If fping happens to pass in a latency record with a peername that was not + # announced upon startup, we exit here. Should never happen, but provides + # SOME protection against fping changing record format in a potential future + # version. if (! (pn in slotindex)) { print "FATAL: BOGUS PEER: " pn exit 1 } + # Timeouts shall not affect latency averages, but will trigger their own kind + # of event if need be (SQ bw collapse). if (! (bytes == 0 && lat == 9999)) { update_pingstats(pn, lat) } + # Only factor peer latency data into SQM decisions once enough data has been + # collected. if (have_baseline[pn]) { adjust_sqm(pn, lat) } @@ -172,16 +190,19 @@ function process_record(pn, bytes, lat) { } +# The main policy engine that decides which direction cake-shaped bandwidth +# shall be adjusted. function adjust_sqm(peername, latency) { adjust_ts_delta = (ts - adjust_old) - # do not try to set bw more than once a second + # Do not try to adjust bw more than once a second. if (adjust_ts_delta < 1.0) { # print "# " ts " looping too fast, skipping SQM update adjust_ts_delta=" adjust_ts_delta return } - # Assume links are clogged, so we short-circuit a bandwidth collapse + # Having seen too many timeouts in a row we assume links are clogged, so we + # short-circuit a bandwidth collapse to the lowest step. 
if (consec_timeouts > ((1.5 * peer_count) + 2)) { print "# " ts " --- TIMEOUT-caused bw collapse triggered" # print "# " ts " too many TIMEOUTS in a row, decreasing SQM bw" @@ -237,17 +258,21 @@ function adjust_sqm(peername, latency) { } not_increasing_count[peername]++ } else { + # Here, we are undecided about any latency trends, and just stay at the + # current bw step. # peer_stats_print(peername, "# adjst_sqm noop") } } +# Helper function to print "interesting" per-peer data function peer_stats_print(pn, affix) { printf("# %-12.1f %12s lat=%4.1f plat=%4.1f pplat=%4.1f min=%4.1f avg=%04.1f %s\n", ts, pn, lat, ping_prev[pn], ping_pprev[pn], ping_min[pn], ping_avgs[pn], affix) } +# Update and conditionally compute averages over per-peer latency records. function update_pingstats(peername, latency) { slotindex[peername]++ pingslot = peername ":" slotindex[peername] @@ -258,16 +283,17 @@ function update_pingstats(peername, latency) { print "# READY PEER: " peername } slotindex[peername] = 0 + # Compute a new average whenever we complete a cycle over all configured + # PINGSLOTS. XXX TODO maybe change this to a better moving average? update_ping_avgs(peername) - # print peername " avg " ping_avgs[peername] " cur " latency } - if (latency < ping_min[peername] && latency > FUDGE) { + if (latency < ping_min[peername] && latency > FUDGE) { # XXX TODO what about too high static FUDGE here? ping_min[peername] = latency - # print peername " lowest " latency } } +# Simple helper function to compute the average over a peer's PINGSLOTS. function update_ping_avgs(peername) { ping_sum = 0 for (k = 0; k <= PINGSLOTS; ++k) { @@ -279,6 +305,7 @@ function update_ping_avgs(peername) { } +# Shell out to `tc` to update cake bandwidth settings function set_bw(dev, kbit) { system("set -x; tc qdisc change root dev " dev " cake bandwidth " kbit "Kbit") } -- 2.39.5