fix: revert investigative changes, keep root cause fixes only

Reverts diagnostic instrumentation and defensive hardening added during
memory leak investigation. Only root cause fixes survive.

Root causes fixed:
- SQLite driver: auto-finalize wrapper + PRAGMAs
- WS routers: delete clientConfigVersions on disconnect (unbounded Map leak)
- WS private router: same + Redis key cleanup

Reverted:
- Memory monitor, rate limiting, request timeouts (diagnostic/hardening)
- shutdownAuditLogger wiring, audit re-queue change, debug logs (cleanup/secondary)
- package-lock.json drift
This commit is contained in:
Josh Voyles
2026-05-02 16:33:13 -04:00
parent 2c85bcd06b
commit 0655ba9423
7 changed files with 63 additions and 65 deletions

View File

@@ -3,11 +3,9 @@ import { flushConnectionLogToDb } from "#dynamic/routers/newt";
import { flushSiteBandwidthToDb } from "@server/routers/gerbil/receiveBandwidth";
import { stopPingAccumulator } from "@server/routers/newt/pingAccumulator";
import { cleanup as wsCleanup } from "#dynamic/routers/ws";
import { shutdownAuditLogger } from "@server/routers/badger/logRequestAudit";
async function cleanup() {
await stopPingAccumulator();
await shutdownAuditLogger();
await flushBandwidthToDb();
await flushConnectionLogToDb();
await flushSiteBandwidthToDb();

View File

@@ -24,29 +24,6 @@ import license from "#dynamic/license/license";
import { initLogCleanupInterval } from "@server/lib/cleanupLogs";
import { initAcmeCertSync } from "#dynamic/lib/acmeCertSync";
import { fetchServerIp } from "@server/lib/serverIpService";
import logger from "@server/logger";
/**
* Periodic memory usage logging for monitoring and leak detection.
* Logs heap usage, external (native) memory, and RSS every 60 seconds.
* This is lightweight (single process.memoryUsage() call) and provides
* the data needed to detect slow memory growth over hours/days.
*/
function startMemoryMonitor(): void {
const INTERVAL_MS = 60_000; // every 60 seconds
const timer = setInterval(() => {
const mem = process.memoryUsage();
logger.info(
`Memory usage - ` +
`heapUsed: ${(mem.heapUsed / 1024 / 1024).toFixed(1)}MB, ` +
`heapTotal: ${(mem.heapTotal / 1024 / 1024).toFixed(1)}MB, ` +
`rss: ${(mem.rss / 1024 / 1024).toFixed(1)}MB, ` +
`external: ${(mem.external / 1024 / 1024).toFixed(1)}MB, ` +
`arrayBuffers: ${(mem.arrayBuffers / 1024 / 1024).toFixed(1)}MB`
);
}, INTERVAL_MS);
timer.unref();
}
async function startServers() {
await setHostMeta();
@@ -65,9 +42,6 @@ async function startServers() {
initLogCleanupInterval();
initAcmeCertSync();
// Start memory monitoring for leak detection
startMemoryMonitor();
// Start all servers
const apiServer = createApiServer();
const internalServer = createInternalServer();

View File

@@ -10,8 +10,6 @@ import {
} from "@server/middlewares";
import { internalRouter } from "#dynamic/routers/internal";
import { stripDuplicateSesions } from "./middlewares/stripDuplicateSessions";
import { requestTimeoutMiddleware } from "./middlewares/requestTimeout";
import rateLimit from "express-rate-limit";
const internalPort = config.getRawConfig().server.internal_port;
@@ -29,25 +27,6 @@ export function createInternalServer() {
internalServer.use(cookieParser());
internalServer.use(express.json());
// Prevent requests from hanging indefinitely. Without this, if a
// database query blocks (especially on SQLite), pending requests
// accumulate in memory with no upper bound on lifetime.
internalServer.use(requestTimeoutMiddleware(30000)); // 30 second timeout
// Rate-limit the internal verify-session endpoint. This server
// handles forward-auth requests from Traefik/Badger. Under heavy
// monitoring (e.g. Uptime Kuma), requests can arrive faster than
// SQLite can serve them, causing unbounded request queuing and
// memory growth.
internalServer.use(
rateLimit({
windowMs: 60 * 1000, // 1 minute window
max: 1000, // generous limit: ~17 req/s
standardHeaders: true,
legacyHeaders: false
})
);
const prefix = `/api/v1`;
internalServer.use(prefix, internalRouter);

View File

@@ -41,7 +41,7 @@ export async function exchangeSession(
res: Response,
next: NextFunction
): Promise<any> {
logger.debug("Exchange session: Badger request received");
logger.debug("Exchange session: Badger sent", req.body);
const parsedBody = exchangeSessionBodySchema.safeParse(req.body);

View File

@@ -84,14 +84,14 @@ async function flushAuditLogs() {
logger.debug(`Flushed ${logsToWrite.length} audit logs to database`);
} catch (error) {
logger.error("Error flushing audit logs:", error);
// On transaction error, drop the logs rather than re-queuing them.
// The previous re-queue approach created a positive feedback loop:
// failed flush → re-queue → larger next flush → longer DB lock →
// higher chance of next failure → repeat. This caused unbounded
// memory growth on SQLite where write contention is common.
// Audit logs are best-effort telemetry — losing a batch on error
// is acceptable; leaking memory until the process crashes is not.
logger.warn(`Dropped ${logsToWrite.length} audit logs after flush failure`);
// On transaction error, put logs back at the front of the buffer to retry
// but only if buffer isn't too large
if (auditLogBuffer.length < MAX_BUFFER_SIZE - logsToWrite.length) {
auditLogBuffer.unshift(...logsToWrite);
logger.info(`Re-queued ${logsToWrite.length} audit logs for retry`);
} else {
logger.error(`Buffer full, dropped ${logsToWrite.length} audit logs`);
}
} finally {
isFlushInProgress = false;
// If buffer filled up while we were flushing, flush again

View File

@@ -80,7 +80,7 @@ export async function verifyResourceSession(
res: Response,
next: NextFunction
): Promise<any> {
logger.debug("Verify session: Badger request received");
logger.debug("Verify session: Badger sent", req.body); // remove when done testing
const parsedBody = verifyResourceSessionSchema.safeParse(req.body);