// vibetunnel/mac/VibeTunnel/Core/Services/ServerManager.swift
//
// 546 lines
// 19 KiB
// Swift

import Foundation
import Observation
import OSLog
import SwiftUI
/// Errors that can occur during server operations
/// Errors that can occur during server operations.
///
/// Conforms to `LocalizedError` so alerts can surface a description,
/// failure reason, and recovery suggestion directly from the error value.
enum ServerError: LocalizedError {
    case repeatedCrashes(count: Int)
    case portInUse(port: Int)
    case startupFailed(String)

    /// User-facing summary of what went wrong.
    var errorDescription: String? {
        switch self {
        case .repeatedCrashes:
            return "Server keeps crashing"
        case let .portInUse(port):
            return "Port \(port) is already in use"
        case let .startupFailed(reason):
            return "Server startup failed: \(reason)"
        }
    }

    /// Underlying cause, when one can be stated; `nil` for generic startup failures.
    var failureReason: String? {
        switch self {
        case let .repeatedCrashes(count):
            return "The server crashed \(count) times in a row"
        case let .portInUse(port):
            return "Another process is using port \(port)"
        case .startupFailed:
            return nil
        }
    }

    /// Actionable next step the user can take.
    var recoverySuggestion: String? {
        switch self {
        case .repeatedCrashes:
            return "Check the logs for errors or try a different port"
        case .portInUse:
            return "Stop the other process or choose a different port"
        case .startupFailed:
            return "Check the server configuration and try again"
        }
    }
}
/// Manages the VibeTunnel server lifecycle.
///
/// `ServerManager` is the central coordinator for server lifecycle management in VibeTunnel.
/// It handles starting, stopping, and restarting the Go server, manages server configuration,
/// and provides logging capabilities.
@MainActor
@Observable
class ServerManager {
    /// Process-wide singleton; `init` is private so all server lifecycle
    /// management funnels through this one instance.
    static let shared = ServerManager()

    /// Server port as a string, persisted under the "serverPort" default;
    /// falls back to "4020" when nothing has been stored.
    var port: String {
        get { UserDefaults.standard.string(forKey: "serverPort") ?? "4020" }
        set { UserDefaults.standard.set(newValue, forKey: "serverPort") }
    }

    /// Bind address derived from the persisted dashboard access mode.
    /// Reading maps the stored "dashboardAccessMode" raw value to a
    /// `DashboardAccessMode` (defaulting to `.localhost`); writing does the
    /// reverse lookup and persists the matching mode's raw value.
    var bindAddress: String {
        get {
            let mode = DashboardAccessMode(rawValue: UserDefaults.standard.string(forKey: "dashboardAccessMode") ?? "") ?? .localhost
            return mode.bindAddress
        }
        set {
            // Find the mode that matches this bind address.
            // Unknown addresses are silently ignored (no matching mode to persist).
            if let mode = DashboardAccessMode.allCases.first(where: { $0.bindAddress == newValue }) {
                UserDefaults.standard.set(mode.rawValue, forKey: "dashboardAccessMode")
            }
        }
    }

    /// Whether `triggerInitialCleanup()` should run after a successful start,
    /// persisted under "cleanupOnStartup".
    private var cleanupOnStartup: Bool {
        get { UserDefaults.standard.bool(forKey: "cleanupOnStartup") }
        set { UserDefaults.standard.set(newValue, forKey: "cleanupOnStartup") }
    }

    /// The currently managed server process wrapper, if any.
    private(set) var bunServer: BunServer?
    /// Cached "server is up" flag, kept in sync with `bunServer`'s state.
    private(set) var isRunning = false
    /// True while `restart()` is in flight, so UI can avoid a transient
    /// "stopped" appearance between stop and start.
    private(set) var isRestarting = false
    /// Most recent start/stop/crash error, surfaced to the UI.
    private(set) var lastError: Error?
    /// Track if we're in the middle of handling a crash to prevent multiple restarts
    private var isHandlingCrash = false
    /// Number of consecutive crashes for backoff
    private var consecutiveCrashes = 0
    /// Last crash time for crash rate detection
    private var lastCrashTime: Date?
    private let logger = Logger(subsystem: "sh.vibetunnel.vibetunnel", category: "ServerManager")

    private init() {
        // Skip observer setup and monitoring during tests.
        // Several markers are probed because different XCTest hosts expose
        // different environment variables/arguments.
        let isRunningInTests = ProcessInfo.processInfo.environment["XCTestConfigurationFilePath"] != nil ||
            ProcessInfo.processInfo.environment["XCTestBundlePath"] != nil ||
            ProcessInfo.processInfo.environment["XCTestSessionIdentifier"] != nil ||
            ProcessInfo.processInfo.arguments.contains("-XCTest") ||
            NSClassFromString("XCTestCase") != nil
        if !isRunningInTests {
            setupObservers()
            // Start health monitoring
            startHealthMonitoring()
        }
    }

    deinit {
        // Balance the addObserver in setupObservers().
        NotificationCenter.default.removeObserver(self)
    }

    /// Register for UserDefaults change notifications.
    private func setupObservers() {
        // Watch for server mode changes when the value actually changes
        NotificationCenter.default.addObserver(
            self,
            selector: #selector(userDefaultsDidChange),
            name: UserDefaults.didChangeNotification,
            object: nil
        )
    }

    @objc
    private nonisolated func userDefaultsDidChange() {
        // No server-related defaults to monitor
        // (intentionally empty; kept so the observer registration stays valid).
    }

    /// Start the server with current configuration.
    ///
    /// Flow: reuse or clean up any existing `BunServer` based on its state,
    /// probe whether the configured port can be bound, resolve known port
    /// conflicts, then launch a new server and wire its crash handler.
    /// Errors are reported via `lastError` rather than thrown.
    func start() async {
        // Check if we already have a running server
        if let existingServer = bunServer {
            let state = existingServer.getState()
            switch state {
            case .running:
                logger.info("Server already running on port \(existingServer.port)")
                // Ensure our state is synced
                isRunning = true
                lastError = nil
                return
            case .starting:
                logger.info("Server is already starting")
                return
            case .stopping:
                logger.warning("Cannot start server while it's stopping")
                lastError = BunServerError.invalidState
                return
            case .crashed, .idle:
                // Clean up and proceed with start
                bunServer = nil
                isRunning = false
            }
        }
        // First check if port is truly available by trying to bind to it
        let portNumber = Int(self.port) ?? 4_020
        let canBind = await PortConflictResolver.shared.canBindToPort(portNumber)
        if !canBind {
            // Not fatal by itself; the conflict detector below decides what to do.
            logger.warning("Cannot bind to port \(portNumber), checking for conflicts...")
        }
        // Check for port conflicts before starting
        if let conflict = await PortConflictResolver.shared.detectConflict(on: portNumber) {
            logger.warning("Port \(self.port) is in use by \(conflict.process.name) (PID: \(conflict.process.pid))")
            // Handle based on conflict type
            switch conflict.suggestedAction {
            case .killOurInstance(let pid, let processName):
                // The port is held by one of our own processes; try to kill it.
                logger.info("Attempting to kill conflicting process: \(processName) (PID: \(pid))")
                do {
                    try await PortConflictResolver.shared.resolveConflict(conflict)
                    // resolveConflict now includes exponential backoff
                } catch {
                    logger.error("Failed to resolve port conflict: \(error)")
                    lastError = PortConflictError.failedToKillProcess(pid: pid)
                    return
                }
            case .reportExternalApp(let appName):
                // A third-party app owns the port; surface the error along
                // with alternative ports for the user to pick from.
                logger.error("Port \(self.port) is used by external app: \(appName)")
                lastError = ServerManagerError.portInUseByApp(
                    appName: appName,
                    port: Int(self.port) ?? 4_020,
                    alternatives: conflict.alternativePorts
                )
                return
            case .suggestAlternativePort:
                // This shouldn't happen in our case
                logger.warning("Port conflict requires alternative port")
            }
        }
        do {
            let server = BunServer()
            server.port = port
            server.bindAddress = bindAddress
            // Set up crash handler
            server.onCrash = { [weak self] exitCode in
                Task { @MainActor in
                    await self?.handleServerCrash(exitCode: exitCode)
                }
            }
            try await server.start()
            bunServer = server
            // Check server state to ensure it's actually running
            if server.getState() == .running {
                isRunning = true
                lastError = nil
                // Reset crash counter on successful start
                consecutiveCrashes = 0
            } else {
                logger.error("Server started but not in running state")
                isRunning = false
                bunServer = nil
                lastError = BunServerError.processFailedToStart
                return
            }
            logger.info("Started server on port \(self.port)")
            // Pass the local auth token to SessionMonitor
            SessionMonitor.shared.setLocalAuthToken(server.localToken)
            // Trigger cleanup of old sessions after server starts
            await triggerInitialCleanup()
        } catch {
            logger.error("Failed to start server: \(error.localizedDescription)")
            lastError = error
            // Always clean up on error
            isRunning = false
            bunServer = nil
        }
    }

    /// Stop the current server.
    ///
    /// Clears the crash handler first so stopping never triggers the
    /// auto-restart path, and resets crash-backoff bookkeeping.
    func stop() async {
        guard let server = bunServer else {
            logger.warning("No server running")
            isRunning = false // Ensure state is synced
            return
        }
        logger.info("Stopping server")
        // Clear crash handler to prevent auto-restart
        server.onCrash = nil
        await server.stop()
        bunServer = nil
        isRunning = false
        // Clear the auth token from SessionMonitor
        SessionMonitor.shared.setLocalAuthToken(nil)
        // Reset crash tracking when manually stopped
        consecutiveCrashes = 0
        lastCrashTime = nil
    }

    /// Restart the current server.
    ///
    /// Stops the server, waits with exponential backoff (up to 5 attempts)
    /// for the port to become bindable, then starts again. If the port never
    /// frees up, sets `lastError` and leaves the server stopped.
    func restart() async {
        // Set restarting flag to prevent UI from showing "stopped" state
        isRestarting = true
        defer { isRestarting = false }
        await stop()
        // Wait with exponential backoff for port to be available
        let portNumber = Int(self.port) ?? 4_020
        var retries = 0
        let maxRetries = 5
        while retries < maxRetries {
            let delay = 1.0 * pow(2.0, Double(retries)) // 1, 2, 4, 8, 16 seconds
            logger.info("Waiting \(delay) seconds for port to be released (attempt \(retries + 1)/\(maxRetries))...")
            try? await Task.sleep(for: .seconds(delay))
            if await PortConflictResolver.shared.canBindToPort(portNumber) {
                logger.info("Port \(portNumber) is now available")
                break
            }
            retries += 1
        }
        // retries only reaches maxRetries when every bind check failed.
        if retries == maxRetries {
            logger.error("Port \(portNumber) still unavailable after \(maxRetries) attempts")
            lastError = PortConflictError.portStillInUse(port: portNumber)
            return
        }
        await start()
    }

    /// Trigger cleanup of exited sessions after server startup.
    ///
    /// Best-effort POST to the server's /api/cleanup-exited endpoint;
    /// failures are logged but never fail startup. No-op when the
    /// "cleanupOnStartup" preference is disabled.
    private func triggerInitialCleanup() async {
        // Check if cleanup on startup is enabled
        guard cleanupOnStartup else {
            logger.info("Cleanup on startup is disabled in settings")
            return
        }
        logger.info("Triggering initial cleanup of exited sessions")
        // Delay to ensure server is fully ready
        // NOTE(review): 10_000 ms is a 10-second pause — confirm this duration
        // is intentional (vs. an intended 10 ms or 1 second).
        try? await Task.sleep(for: .milliseconds(10_000))
        do {
            // Create URL for cleanup endpoint
            guard let url = URL(string: "http://localhost:\(self.port)/api/cleanup-exited") else {
                logger.warning("Failed to create cleanup URL")
                return
            }
            var request = URLRequest(url: url)
            request.httpMethod = "POST"
            request.timeoutInterval = 10
            // Add local auth token if available
            if let server = bunServer {
                request.setValue(server.localToken, forHTTPHeaderField: "X-VibeTunnel-Local")
            }
            // Make the cleanup request
            let (data, response) = try await URLSession.shared.data(for: request)
            if let httpResponse = response as? HTTPURLResponse {
                if httpResponse.statusCode == 200 {
                    // Try to parse the response
                    if let jsonData = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
                       let cleanedCount = jsonData["cleaned_count"] as? Int
                    {
                        logger.info("Initial cleanup completed: cleaned \(cleanedCount) exited sessions")
                    } else {
                        // 200 without a parsable count still means success.
                        logger.info("Initial cleanup completed successfully")
                    }
                } else {
                    logger.warning("Initial cleanup returned status code: \(httpResponse.statusCode)")
                }
            }
        } catch {
            // Log the error but don't fail startup
            logger.warning("Failed to trigger initial cleanup: \(error.localizedDescription)")
        }
    }

    /// Manually trigger a server restart (for UI button).
    func manualRestart() async {
        await restart()
    }

    /// Clear the authentication cache (e.g., when password is changed or cleared).
    func clearAuthCache() async {
        // Authentication cache clearing is no longer needed as external servers handle their own auth
        logger.info("Authentication cache clearing requested - handled by external server")
    }

    // MARK: - Server Management

    /// Handle server crash with automatic restart logic.
    ///
    /// Crashes within 60 seconds of each other count as consecutive; after 3
    /// consecutive crashes auto-restart is abandoned with
    /// `ServerError.repeatedCrashes`. Exit code 9 (port in use, per the
    /// handling below) gets dedicated conflict resolution before restarting;
    /// other exit codes wait with exponential backoff.
    /// - Parameter exitCode: The server process exit code; the health monitor
    ///   passes -1 for a failed health check.
    private func handleServerCrash(exitCode: Int32) async {
        // Special handling for exit code 9 (port in use)
        if exitCode == 9 {
            logger.error("Server failed to start: Port \(self.port) is already in use")
        } else {
            logger.error("Server crashed with exit code: \(exitCode)")
        }
        // Update state immediately
        isRunning = false
        bunServer = nil
        // Prevent multiple simultaneous crash handlers
        guard !isHandlingCrash else {
            logger.warning("Already handling a crash, skipping duplicate handler")
            return
        }
        isHandlingCrash = true
        defer { isHandlingCrash = false }
        // Check crash rate
        let now = Date()
        if let lastCrash = lastCrashTime {
            let timeSinceLastCrash = now.timeIntervalSince(lastCrash)
            if timeSinceLastCrash < 60 { // Less than 1 minute since last crash
                consecutiveCrashes += 1
            } else {
                // Reset counter if it's been a while
                consecutiveCrashes = 1
            }
        } else {
            consecutiveCrashes = 1
        }
        lastCrashTime = now
        // Implement exponential backoff for crashes
        let maxRetries = 3
        guard consecutiveCrashes <= maxRetries else {
            logger.error("Server crashed \(self.consecutiveCrashes) times in a row, giving up on auto-restart")
            lastError = ServerError.repeatedCrashes(count: consecutiveCrashes)
            return
        }
        // Special handling for exit code 9 (port already in use)
        if exitCode == 9 {
            logger.info("Port \(self.port) is in use, checking for conflicts...")
            // Check for port conflicts
            if let conflict = await PortConflictResolver.shared.detectConflict(on: Int(self.port) ?? 4_020) {
                logger.warning("Found port conflict: \(conflict.process.name) (PID: \(conflict.process.pid))")
                // Try to resolve the conflict
                if case .killOurInstance(let pid, let processName) = conflict.suggestedAction {
                    logger.info("Attempting to kill conflicting process: \(processName) (PID: \(pid))")
                    do {
                        try await PortConflictResolver.shared.resolveConflict(conflict)
                        // resolveConflict now includes exponential backoff
                    } catch {
                        logger.error("Failed to resolve port conflict: \(error)")
                        lastError = PortConflictError.failedToKillProcess(pid: pid)
                        return
                    }
                } else {
                    // The conflicting process is not ours to kill; give up.
                    logger.error("Cannot auto-resolve port conflict")
                    return
                }
            } else {
                // Port might still be in TIME_WAIT state, wait with backoff
                logger.info("Port may be in TIME_WAIT state, checking availability...")
                let portNumber = Int(self.port) ?? 4_020
                var retries = 0
                // NOTE: shadows the crash-retry limit above; this one bounds bind checks only.
                let maxRetries = 5
                while retries < maxRetries {
                    let delay = 2.0 * pow(2.0, Double(retries)) // 2, 4, 8, 16, 32 seconds
                    logger.info("Waiting \(delay) seconds for port to clear (attempt \(retries + 1)/\(maxRetries))...")
                    try? await Task.sleep(for: .seconds(delay))
                    if await PortConflictResolver.shared.canBindToPort(portNumber) {
                        logger.info("Port \(portNumber) is now available")
                        break
                    }
                    retries += 1
                }
                if retries == maxRetries {
                    logger.error("Port \(portNumber) still in TIME_WAIT after \(maxRetries) attempts")
                    lastError = PortConflictError.portStillInUse(port: portNumber)
                    return
                }
            }
        } else {
            // Normal crash handling with exponential backoff
            let baseDelay: TimeInterval = 2.0
            let delay = baseDelay * pow(2.0, Double(consecutiveCrashes - 1))
            logger
                .info("Will restart server after \(delay) seconds (attempt \(self.consecutiveCrashes) of \(maxRetries))"
                )
            // Wait with exponential backoff
            try? await Task.sleep(for: .seconds(delay))
        }
        // Only restart if we haven't been manually stopped in the meantime
        // (a non-nil bunServer here means someone started a server while we slept).
        guard bunServer == nil else {
            logger.info("Server was manually restarted during crash recovery, skipping auto-restart")
            return
        }
        // Restart with full port conflict detection
        logger.info("Auto-restarting server after crash...")
        await start()
    }

    /// Monitor server health periodically.
    ///
    /// Spawns an unstructured task that polls every 30 seconds for the life
    /// of the process; when the server reports unhealthy or crashed while we
    /// still believe it's running, state is synced and crash handling runs.
    func startHealthMonitoring() {
        Task {
            while true {
                try? await Task.sleep(for: .seconds(30))
                guard let server = bunServer else { continue }
                // Check server state and process health
                let state = server.getState()
                let health = await server.checkHealth()
                if (!health || state == .crashed) && isRunning {
                    logger.warning("Server health check failed but state shows running, syncing state")
                    isRunning = false
                    bunServer = nil
                    // Only trigger restart if not already handling a crash
                    if !isHandlingCrash {
                        await handleServerCrash(exitCode: -1)
                    }
                }
            }
        }
    }
}
// MARK: - Server Manager Error
/// Errors raised by `ServerManager` itself (as opposed to the server process).
enum ServerManagerError: LocalizedError {
    case portInUseByApp(appName: String, port: Int, alternatives: [Int])

    /// User-facing summary naming the port and the offending app.
    var errorDescription: String? {
        switch self {
        case let .portInUseByApp(appName, port, _):
            return "Port \(port) is in use by \(appName)"
        }
    }

    /// Why the operation failed.
    var failureReason: String? {
        switch self {
        case .portInUseByApp:
            return "The port is being used by another application"
        }
    }

    /// Suggests free ports the user could switch to.
    var recoverySuggestion: String? {
        switch self {
        case let .portInUseByApp(_, _, alternatives):
            let portList = alternatives.map(String.init).joined(separator: ", ")
            return "Try one of these ports: \(portList)"
        }
    }

    /// Help-book anchor for the port-conflict documentation page.
    var helpAnchor: String? {
        switch self {
        case .portInUseByApp:
            return "port-conflict"
        }
    }
}