From: Johannes Truschnigg Date: Thu, 17 Mar 2022 18:16:19 +0000 (+0100) Subject: Fix up comments and TIMEOUT collapse X-Git-Url: https://johannes.truschnigg.info/gitweb/?a=commitdiff_plain;h=e028e1b79ee1934cfda1a7f728957e2616e0b943;p=sqm_lagthrottle Fix up comments and TIMEOUT collapse --- diff --git a/__lagdetect.awk b/__lagdetect.awk index 31bd12a..e0531e6 100644 --- a/__lagdetect.awk +++ b/__lagdetect.awk @@ -44,7 +44,7 @@ BEGIN { ping_min[pn] = 65535 # minimum latency observed for PEER ping_avgs[pn] = -1 # mean of last PINGSLOTS recorded latencies per PEER ping_prev[pn] = -1 # latency recorded in the previous cycle - ping_pprev[pn] = -1 # latency recorded in the previous cycle + ping_pprev[pn] = -1 # latency recorded in the pre-previous cycle not_increasing_count[pn] = 0 # number of consecutive cycles with no (fudged) latency increase } ts = get_time() @@ -64,12 +64,11 @@ BEGIN { pn=$1 seq=$3 bytes=0 - lat=9999 + lat=9999 # XXX TODO this could be a bad idea, revise update_bw() - # XXX TODO what now? + # XXX TODO what now? (how) can we do better in case of an apparent outage? consec_timeouts++ process_record(pn, bytes, lat) - #print "TIMEOUT for " pn } @@ -94,7 +93,6 @@ function slice_bw_window_rx(bw_lower_bound_rx, bw_upper_bound_rx) { print "# RX step #" k " = " STEPS_RX[k] "Kbps " } k-- - print "# MAX INDEX " k bw_maxindex_rx = k bw_next_step_rx = k bw_cur_step_rx = k @@ -127,7 +125,7 @@ function get_time() { function update_bw() { # If the last update happened rather recently, do not compute new bw stats. if (! (update_ts())) { - #print "too fast, skiping bw updates " ts - ts_old + #print "# " ts " too fast, skiping bw updates " ts - ts_old return } @@ -158,18 +156,14 @@ function update_bw() { function process_record(pn, bytes, lat) { if (! (pn in slotindex)) { - print "FATAL: BOGUS PEER: " pn # XXX really? + print "FATAL: BOGUS PEER: " pn exit 1 } - if (bytes == 0 && lat == 9999) { - print "# WARN: TIMEOUT: " pn - # XXX TODO: what to do now? + if (! (bytes == 0 && lat == 9999)) { + update_pingstats(pn, lat) } - #printf("%s %db in %.2fms\n", pn, bytes, lat) - update_pingstats(pn, lat) - if (have_baseline[pn]) { adjust_sqm(pn, lat) } @@ -182,39 +176,44 @@ function process_record(pn, bytes, lat) { function adjust_sqm(peername, latency) { adjust_ts_delta = (ts - adjust_old) - # Assume links are clogged, so we short-circuit a bandwidth decrease + # Assume links are clogged, so we short-circuit a bandwidth collapse if (consec_timeouts > ((1.5 * peer_count) + 2)) { + print "# " ts " --- TIMEOUT-caused bw collapse triggered" # print "# " ts " too many TIMEOUTS in a row, decreasing SQM bw" bw_cur_step_rx = 0 - set_bw(IFACE_RX, STEPS_RX[bw_cur_step_rx]) + bw_next_step_rx = 0 + set_bw(IFACE_RX, STEPS_RX[bw_next_step_rx]) return } # do not try to set bw more than once a second if (adjust_ts_delta < 1.0) { - # print "looping too fast, skipping SQM update adjust_ts_delta=" adjust_ts_delta + # print "# " ts " looping too fast, skipping SQM update adjust_ts_delta=" adjust_ts_delta return } # XXX TODO - this needs to get MUCH better! # 1.) FUDGE needs to be properly accounted in BW increase path # 2.) Try to dynamically compute/adapt FUDGE in no-load condition? - # Try to determine the latency trend over the last few samples + # 3.) Does considering last2avg really make sense? + # 4.) Does THRESHOLD really make sense? (at least the name kinda does not...) + # + # Try to determine a latency trend over the last few samples last2avg = (((ping_prev[peername] + latency) / 2.0)) thresh_fudged = (FUDGE + (ping_avgs[peername] * THRESHOLD)) if (last2avg > thresh_fudged ) { if (latency > ping_prev[peername] && latency > ping_pprev[peername]) { if ( rx_rate < (rx_rate_max * 0.1)) { # XXX TODO is this correct? - print "# " ts " line does not appear to be loaded, skipping SQM bw downgrade" + print "# " ts " line does not appear to be loaded, skipping futile SQM bw downgrade" } else { - # print ts " " adjust_ts_delta " CONSIDER BW DECREASE # " peername " lat=" latency " prev=" ping_prev[peername] " pprev=" ping_pprev[peername] " avg=" ping_avgs[peername] " min=" ping_min[peername] + # print "# " ts " " adjust_ts_delta " BW DECREASE # " peername " lat=" latency " prev=" ping_prev[peername] " pprev=" ping_pprev[peername] " avg=" ping_avgs[peername] " min=" ping_min[peername] if (bw_cur_step_rx > 0) { bw_next_step_rx = bw_cur_step_rx - 1 - print "# --- " ts " choosing bw rx step " bw_next_step_rx " := " STEPS_RX[bw_next_step_rx] "Kbps" + print "--- " ts " choosing bw rx step " bw_next_step_rx " := " STEPS_RX[bw_next_step_rx] "Kbps rx=" rx_rate " tx=" tx_rate set_bw(IFACE_RX, STEPS_RX[bw_next_step_rx]) bw_cur_step_rx = bw_next_step_rx } else { - print "# already at lowest bw rx step " bw_cur_step_rx + print "--- " ts " already at lowest bw rx step " bw_cur_step_rx " rx=" rx_rate " tx=" tx_rate } } # print "set adjust_old=" ts @@ -223,27 +222,28 @@ function adjust_sqm(peername, latency) { not_increasing_count[peername]=0 } else if (latency < (ping_prev[peername] + FUDGE/2) && latency < (ping_pprev[peername] + FUDGE / 2) && ping_prev[peername] < (ping_pprev[peername] + FUDGE/2)) { if (not_increasing_count[peername] > (5 * PINGSLOTS)) { - # print ts " " adjust_ts_delta " CONSIDER BW INCREASE # " peername " lat=" latency " prev=" ping_prev[peername] " pprev=" ping_pprev[peername] " avg=" ping_avgs[peername] " min=" ping_min[peername] + # print "# " ts " " adjust_ts_delta " BW INCREASE # " peername " lat=" latency " prev=" ping_prev[peername] " pprev=" ping_pprev[peername] " avg=" ping_avgs[peername] " min=" ping_min[peername] if (bw_cur_step_rx < bw_maxindex_rx) { bw_next_step_rx = bw_cur_step_rx + 1 - print "# +++ choosing " ts " bw rx step " bw_next_step_rx " := " STEPS_RX[bw_next_step_rx] "Kbps" + print "+++ " ts " choosing bw rx step " bw_next_step_rx " := " STEPS_RX[bw_next_step_rx] "Kbps rx=" rx_rate " tx=" tx_rate set_bw(IFACE_RX, STEPS_RX[bw_next_step_rx]) bw_cur_step_rx = bw_next_step_rx } else { - print "# already at highest bw rx step " bw_cur_step_rx + print "+++ " ts " already at highest bw rx step " bw_cur_step_rx " rx=" rx_rate " tx=" tx_rate } not_increasing_count[peername]=0 adjust_old = ts } not_increasing_count[peername]++ } else { - peer_stats_print(peername, "# adjst_sqm noop") + # peer_stats_print(peername, "# adjst_sqm noop") } } function peer_stats_print(pn, affix) { - printf("%-12.1f %12s lat=%04.1f plat=%04.1f pplat=%04.1f avg=%04.1f %s\n", ts, pn, lat, ping_prev[pn], ping_pprev[pn], ping_avgs[pn], affix) + printf("# %-12.1f %12s lat=%4.1f plat=%4.1f pplat=%4.1f min=%4.1f avg=%04.1f %s\n", + ts, pn, lat, ping_prev[pn], ping_pprev[pn], ping_min[pn], ping_avgs[pn], affix) } @@ -274,17 +274,16 @@ function update_ping_avgs(peername) { ping_sum += pingstats[ps] } ping_avg = ping_sum / PINGSLOTS - # print "AVG over " PINGSLOTS " samples for peer " peername " sum=" ping_sum " avg=" ping_avg ping_avgs[pn] = ping_avg } function set_bw(dev, kbit) { - #print "# BW CHANGE dev=" dev " kbit=" kbit - system("set -x; : " ts "; tc qdisc change root dev " dev " cake bandwidth " kbit "Kbit") + system("set -x; tc qdisc change root dev " dev " cake bandwidth " kbit "Kbit") } END { + print "# " ts " BYE" # Usually not reached. }