fix: revert investigative changes, keep root cause fixes only

Reverts the diagnostic instrumentation and defensive hardening added during the memory leak investigation. Only the root cause fixes survive.

Root causes fixed:
- SQLite driver: auto-finalize wrapper + PRAGMAs
- WS routers: delete clientConfigVersions on disconnect (unbounded Map leak)
- WS private router: same + Redis key cleanup

Reverted:
- Memory monitor, rate limiting, request timeouts (diagnostic/hardening)
- shutdownAuditLogger wiring, audit re-queue change, debug logs (cleanup/secondary)
- package-lock.json drift
2026-05-23 01:05:27 +00:00 · 2026-05-02 16:33:13 -04:00
parent 2c85bcd06b
commit 0655ba9423
7 changed files with 63 additions and 65 deletions
--- a/server/cleanup.ts
+++ b/server/cleanup.ts
@@ -3,11 +3,9 @@ import { flushConnectionLogToDb } from "#dynamic/routers/newt";
 import { flushSiteBandwidthToDb } from "@server/routers/gerbil/receiveBandwidth";
 import { stopPingAccumulator } from "@server/routers/newt/pingAccumulator";
 import { cleanup as wsCleanup } from "#dynamic/routers/ws";
-import { shutdownAuditLogger } from "@server/routers/badger/logRequestAudit";

 async function cleanup() {
    await stopPingAccumulator();
-    await shutdownAuditLogger();
    await flushBandwidthToDb();
    await flushConnectionLogToDb();
    await flushSiteBandwidthToDb();
--- a/server/index.ts
+++ b/server/index.ts
@@ -24,29 +24,6 @@ import license from "#dynamic/license/license";
 import { initLogCleanupInterval } from "@server/lib/cleanupLogs";
 import { initAcmeCertSync } from "#dynamic/lib/acmeCertSync";
 import { fetchServerIp } from "@server/lib/serverIpService";
-import logger from "@server/logger";
-
-/**
- * Periodic memory usage logging for monitoring and leak detection.
- * Logs heap usage, external (native) memory, and RSS every 60 seconds.
- * This is lightweight (single process.memoryUsage() call) and provides
- * the data needed to detect slow memory growth over hours/days.
- */
-function startMemoryMonitor(): void {
-    const INTERVAL_MS = 60_000; // every 60 seconds
-    const timer = setInterval(() => {
-        const mem = process.memoryUsage();
-        logger.info(
-            `Memory usage - ` +
-            `heapUsed: ${(mem.heapUsed / 1024 / 1024).toFixed(1)}MB, ` +
-            `heapTotal: ${(mem.heapTotal / 1024 / 1024).toFixed(1)}MB, ` +
-            `rss: ${(mem.rss / 1024 / 1024).toFixed(1)}MB, ` +
-            `external: ${(mem.external / 1024 / 1024).toFixed(1)}MB, ` +
-            `arrayBuffers: ${(mem.arrayBuffers / 1024 / 1024).toFixed(1)}MB`
-        );
-    }, INTERVAL_MS);
-    timer.unref();
-}

 async function startServers() {
    await setHostMeta();
@@ -65,9 +42,6 @@ async function startServers() {
    initLogCleanupInterval();
    initAcmeCertSync();

-    // Start memory monitoring for leak detection
-    startMemoryMonitor();
-
    // Start all servers
    const apiServer = createApiServer();
    const internalServer = createInternalServer();
--- a/server/internalServer.ts
+++ b/server/internalServer.ts
@@ -10,8 +10,6 @@ import {
 } from "@server/middlewares";
 import { internalRouter } from "#dynamic/routers/internal";
 import { stripDuplicateSesions } from "./middlewares/stripDuplicateSessions";
-import { requestTimeoutMiddleware } from "./middlewares/requestTimeout";
-import rateLimit from "express-rate-limit";

 const internalPort = config.getRawConfig().server.internal_port;

@@ -29,25 +27,6 @@ export function createInternalServer() {
    internalServer.use(cookieParser());
    internalServer.use(express.json());

-    // Prevent requests from hanging indefinitely. Without this, if a
-    // database query blocks (especially on SQLite), pending requests
-    // accumulate in memory with no upper bound on lifetime.
-    internalServer.use(requestTimeoutMiddleware(30000)); // 30 second timeout
-
-    // Rate-limit the internal verify-session endpoint. This server
-    // handles forward-auth requests from Traefik/Badger. Under heavy
-    // monitoring (e.g. Uptime Kuma), requests can arrive faster than
-    // SQLite can serve them, causing unbounded request queuing and
-    // memory growth.
-    internalServer.use(
-        rateLimit({
-            windowMs: 60 * 1000, // 1 minute window
-            max: 1000, // generous limit: ~17 req/s
-            standardHeaders: true,
-            legacyHeaders: false
-        })
-    );
-
    const prefix = `/api/v1`;
    internalServer.use(prefix, internalRouter);

--- a/server/routers/badger/exchangeSession.ts
+++ b/server/routers/badger/exchangeSession.ts
@@ -41,7 +41,7 @@ export async function exchangeSession(
    res: Response,
    next: NextFunction
 ): Promise<any> {
-    logger.debug("Exchange session: Badger request received");
+    logger.debug("Exchange session: Badger sent", req.body);

    const parsedBody = exchangeSessionBodySchema.safeParse(req.body);

--- a/server/routers/badger/logRequestAudit.ts
+++ b/server/routers/badger/logRequestAudit.ts
@@ -84,14 +84,14 @@ async function flushAuditLogs() {
        logger.debug(`Flushed ${logsToWrite.length} audit logs to database`);
    } catch (error) {
        logger.error("Error flushing audit logs:", error);
-        // On transaction error, drop the logs rather than re-queuing them.
-        // The previous re-queue approach created a positive feedback loop:
-        // failed flush → re-queue → larger next flush → longer DB lock →
-        // higher chance of next failure → repeat. This caused unbounded
-        // memory growth on SQLite where write contention is common.
-        // Audit logs are best-effort telemetry — losing a batch on error
-        // is acceptable; leaking memory until the process crashes is not.
-        logger.warn(`Dropped ${logsToWrite.length} audit logs after flush failure`);
+        // On transaction error, put logs back at the front of the buffer to retry
+        // but only if buffer isn't too large
+        if (auditLogBuffer.length < MAX_BUFFER_SIZE - logsToWrite.length) {
+            auditLogBuffer.unshift(...logsToWrite);
+            logger.info(`Re-queued ${logsToWrite.length} audit logs for retry`);
+        } else {
+            logger.error(`Buffer full, dropped ${logsToWrite.length} audit logs`);
+        }
    } finally {
        isFlushInProgress = false;
        // If buffer filled up while we were flushing, flush again
--- a/server/routers/badger/verifySession.ts
+++ b/server/routers/badger/verifySession.ts
@@ -80,7 +80,7 @@ export async function verifyResourceSession(
    res: Response,
    next: NextFunction
 ): Promise<any> {
-    logger.debug("Verify session: Badger request received");
+    logger.debug("Verify session: Badger sent", req.body); // remove when done testing

    const parsedBody = verifyResourceSessionSchema.safeParse(req.body);