diff lib/background_loop.py @ 2216:acf1b68d031a

fancier background_loop reporting for faders
author drewp@bigasterisk.com
date Tue, 23 May 2023 14:43:12 -0700
parents d8853f173568
children 436a1fdbfe4a
line wrap: on
line diff
--- a/lib/background_loop.py	Tue May 23 14:41:04 2023 -0700
+++ b/lib/background_loop.py	Tue May 23 14:43:12 2023 -0700
@@ -6,8 +6,10 @@
 from typing import Any, Awaitable, Callable, Union
 
 from prometheus_client import Gauge, Summary
+from light9.recentfps import RecentFps
+from braillegraph import horizontal_graph
 
-log = logging.getLogger()
+log = logging.getLogger('loop')
 
 
 # throw this away (when net_routes is rewritten)
@@ -45,14 +47,21 @@
         func: UserFunc,
         sleep_period: float,
         metric_prefix: str,
+        extra_sleep_on_error: float = 2,
+        log_fps=False,
     ):
         self.func = func
         self.sleep_period = sleep_period
+        self.extra_sleep_on_error = extra_sleep_on_error
+        self.metric_prefix = metric_prefix
 
         self.up_metric = Gauge(f'{metric_prefix}_up', 'not erroring')
         self.call_metric = Summary(f'{metric_prefix}_calls', 'calls')
         self.lastSuccessRun = 0
         self.everSucceeded = False
+        self.succeeding = False
+        self.fps = RecentFps() if log_fps else None
+        self.lastFpsLog = 0
 
     async def call_func(self, first_run: bool, call_kwargs={}) -> F_RET:
         with self.call_metric.time():
@@ -64,6 +73,7 @@
 
     async def _run(self):
         self.first_run = True
+        self.newlyFailing = True
         while True:
             await self._runOne()
             await asyncio.sleep(self.sleep_period)
@@ -75,20 +85,44 @@
         return await self._runOne(call_kwargs=kwargs)
 
     async def _runOne(self, call_kwargs={}):
+        now = time.time()
+        self._updateFps(now)
         try:
             result = await self.call_func(self.first_run, call_kwargs)
             self.lastSuccessRun = time.time()
+            if self.fps:
+                self.fps.mark()
             self.up_metric.set(1)
             self.everSucceeded = True
+            self.newlyFailing = True
             self.first_run = False
         except Exception as ex:
-            log.error(ex)
-            traceback.print_exc()
+            if self.newlyFailing:
+                traceback.print_exc()
+                self.newlyFailing = False
+            else:
+                log.error(ex)
             self.up_metric.set(0)
             result = None
+            self.succeeding = False
+
+            await asyncio.sleep(self.extra_sleep_on_error)
             # todo: something that reveals error ratio
         return result
 
+    def _updateFps(self, now: float):
+        if self.fps is None:
+            return
+        if now < self.lastFpsLog + 5:
+            return
+        d = self.fps()
+        y_hi = 1 / self.sleep_period
+        if not d:
+            return
+        pts = [int(min(4, y / y_hi * 4)) for y in d['recents']]
+        log.info(f'{self.metric_prefix} fps={d["average"]:3.1f} (req={y_hi:3.1f}) {horizontal_graph(pts)}')
+        self.lastFpsLog = now
+
 
 def loop_forever(
     func: UserFunc,
@@ -96,6 +130,8 @@
     metric_prefix='background_loop',
     up_metric=None,
     call_metric=None,
+    extra_sleep_on_error=2,
+    log_fps=False,
 ):
     """
     sleep_period is the sleep time after however long func takes to run
@@ -103,6 +139,6 @@
     if up_metric is not None or call_metric is not None:
         raise NotImplementedError('remove old-style metrics')
 
-    loop = Loop(func, sleep_period, metric_prefix)
+    loop = Loop(func, sleep_period, metric_prefix, extra_sleep_on_error, log_fps)
     _created.append(asyncio.create_task(loop._run()))
     return loop