Add PID-based application targeting (#14)

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Peter Steinberger 2025-06-09 00:30:10 +01:00 committed by GitHub
parent 2b5c03697c
commit eb6bd60f20
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 553 additions and 5 deletions

View file

@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- **PID-based application targeting**: You can now target applications by their Process ID using the `PID:XXXX` syntax
- Works with both `image` and `list` tools
- Example: `app_target: "PID:663"` to capture windows from process 663
- Provides clear error messages for invalid PIDs or non-existent processes
- Useful for targeting specific instances when multiple copies of an app are running
## [1.0.1] - 2025-01-08
### Fixed

View file

@ -319,6 +319,12 @@ await use_mcp_tool("peekaboo", "image", {
app_target: "frontmost",
format: "png"
});
// Capture by Process ID (useful for multiple instances)
await use_mcp_tool("peekaboo", "image", {
app_target: "PID:663",
path: "~/Desktop/process.png"
});
```
#### Browser Helper Filtering
@ -360,6 +366,12 @@ await use_mcp_tool("peekaboo", "list", {
app: "Preview"
});
// List windows by Process ID
await use_mcp_tool("peekaboo", "list", {
item_type: "application_windows",
app: "PID:663"
});
// Check server status
await use_mcp_tool("peekaboo", "list", {
item_type: "server_status"
@ -564,8 +576,9 @@ Captures macOS screen content and optionally analyzes it. Window shadows/frames
* `app_target` (string, optional): Specifies the capture target. If omitted or empty, captures all screens.
* Examples:
* `"screen:INDEX"`: Captures the screen at the specified zero-based index (e.g., `"screen:0"`). (Note: Index selection from multiple screens is planned for full support in the Swift CLI).
* `"frontmost"`: Aims to capture all windows of the current foreground application. (Note: This is a complex scenario; current implementation may default to screen capture if the exact foreground app cannot be reliably determined by the Node.js layer alone).
* `"frontmost"`: Captures the frontmost window of the currently active application.
* `"AppName"`: Captures all windows of the application named `AppName` (e.g., `"Safari"`, `"com.apple.Safari"`). Fuzzy matching is used.
* `"PID:ProcessID"`: Captures all windows of the application with the specified process ID (e.g., `"PID:663"`). Useful when multiple instances of the same app are running.
* `"AppName:WINDOW_TITLE:Title"`: Captures the window of `AppName` that has the specified `Title` (e.g., `"Notes:WINDOW_TITLE:My Important Note"`).
* `"AppName:WINDOW_INDEX:Index"`: Captures the window of `AppName` at the specified zero-based `Index` (e.g., `"Preview:WINDOW_INDEX:0"` for the frontmost window of Preview).
* `path` (string, optional): Base absolute path for saving the captured image(s). If `format` is `"data"` and `path` is also provided, the image is saved to this path (as a PNG) AND Base64 data is returned. If a `question` is provided and `path` is omitted, a temporary path is used for capture, and the file is deleted after analysis.
@ -604,9 +617,10 @@ For detailed parameter documentation, see [docs/spec.md](./docs/spec.md).
- **Permission checking**: Automatic verification of required permissions
### Window Management
- **Application listing**: Complete list of running applications
- **Application listing**: Complete list of running applications with PIDs
- **Window enumeration**: List all windows for specific apps
- **Flexible matching**: Find apps by partial name, bundle ID, or PID
- **Flexible matching**: Find apps by partial name, bundle ID, or Process ID
- **PID targeting**: Target specific processes using the `PID:XXXX` syntax (e.g., `PID:663`)
- **Status monitoring**: Active/inactive status, window counts
### AI Integration

View file

@ -132,6 +132,7 @@ Configured AI Providers (from PEEKABOO_AI_PROVIDERS ENV): <parsed list or 'None
"- 'screen:INDEX': Specific display (e.g., 'screen:0').\\n" +
"- 'frontmost': All windows of the current foreground app.\\n" +
"- 'AppName': All windows of 'AppName'.\\n" +
"- 'PID:ProcessID': All windows of the application with the specified process ID (e.g., 'PID:663').\\n" +
"- 'AppName:WINDOW_TITLE:Title': Window of 'AppName' with 'Title'.\\n" +
"- 'AppName:WINDOW_INDEX:Index': Window of 'AppName' at 'Index'."
),
@ -161,6 +162,7 @@ Configured AI Providers (from PEEKABOO_AI_PROVIDERS ENV): <parsed list or 'None
* `"screen:INDEX"`: maps to Swift CLI `--mode screen --screen-index INDEX` (custom Swift CLI flag might be needed or logic to select from multi-screen capture).
* `"frontmost"`: maps to Swift CLI `--mode frontmost` which uses `NSWorkspace.shared.frontmostApplication` to detect the currently active application and captures its frontmost window.
* `"AppName"`: maps to Swift CLI `--app AppName --mode multi`.
* `"PID:ProcessID"`: maps to Swift CLI `--app PID:ProcessID --mode multi` (the Swift CLI's ApplicationFinder handles PID parsing).
* `"AppName:WINDOW_TITLE:Title"`: maps to Swift CLI `--app AppName --mode window --window-title Title`.
* `"AppName:WINDOW_INDEX:Index"`: maps to Swift CLI `--app AppName --mode window --window-index Index`.
* **Browser Helper Filtering:** The Swift CLI automatically filters out browser helper processes when searching for common browsers (chrome, safari, firefox, edge, brave, arc, opera). This prevents matching helper processes like "Google Chrome Helper (Renderer)" instead of the main browser application, which would result in confusing "no capturable windows" errors. The filtering:

View file

@ -18,6 +18,21 @@ final class ApplicationFinder: Sendable {
let runningApps = NSWorkspace.shared.runningApplications
// Check if identifier is a PID
if identifier.hasPrefix("PID:") {
let pidString = String(identifier.dropFirst(4))
guard let pid = Int32(pidString) else {
throw ApplicationError.notFound("Invalid PID format: \(identifier)")
}
if let app = runningApps.first(where: { $0.processIdentifier == pid }) {
// Logger.shared.debug("Found application by PID: \(app.localizedName ?? "Unknown") (PID: \(pid))")
return app
} else {
throw ApplicationError.notFound("No application found with PID: \(pid)")
}
}
// Check for exact bundle ID match first
if let exactMatch = runningApps.first(where: { $0.bundleIdentifier == identifier }) {
// Logger.shared.debug("Found exact bundle ID match: \(exactMatch.localizedName ?? "Unknown")")

View file

@ -50,6 +50,17 @@ struct ImageCommandTests {
#expect(command.app == "Finder")
}
@Test("Command with PID specifier", .tags(.fast))
func imageCommandWithPIDSpecifier() throws {
    // Parse a command line whose only option is a `PID:<n>` application identifier.
    let arguments = ["--app", "PID:1234"]
    let parsed = try ImageCommand.parse(arguments)

    // The identifier is stored as given, and `mode` stays unset because it is optional.
    #expect(parsed.app == "PID:1234")
    #expect(parsed.mode == nil)
}
@Test("Command with window title", .tags(.fast))
func imageCommandWithWindowTitle() throws {
// Test window title capture

View file

@ -0,0 +1,179 @@
import Foundation
import AppKit
import Testing
@testable import peekaboo
@Suite("PID Image Capture Tests")
struct PIDImageCaptureTests {
    /// Simulates a multi-window capture of a running, non-active application addressed by PID.
    ///
    /// Disabled when the `CI` environment variable is set, using the same `.enabled(if:)`
    /// condition as the PID targeting suite, so the test is skipped instead of passing vacuously.
    @Test(
        "Capture windows by PID - valid PID",
        .enabled(if: ProcessInfo.processInfo.environment["CI"] == nil)
    )
    func captureWindowsByValidPID() async throws {
        // Pick any named, bundled application that is not currently active, to avoid
        // interfering with whatever is in the foreground. Note that this predicate does
        // not check whether the application actually owns windows.
        let runningApps = NSWorkspace.shared.runningApplications
        guard let candidate = runningApps.first(where: { app in
            app.localizedName != nil && !app.isActive && app.bundleIdentifier != nil
        }) else {
            Issue.record("No suitable application found for PID capture testing")
            return
        }

        let pid = candidate.processIdentifier
        let command = makeMultiCaptureCommand(pid: pid)

        // The capture is simulated by `captureWithPID`; a thrown error fails the test directly.
        let result = try await captureWithPID(command: command, targetPID: pid)
        #expect(result.success == true)
        #expect(result.data != nil)
    }

    /// Simulates a capture of one specific instance when several running applications
    /// share the same bundle identifier.
    ///
    /// Disabled when the `CI` environment variable is set. Returns without asserting when
    /// no bundle identifier currently has more than one running instance.
    @Test(
        "Capture windows by PID - multiple app instances",
        .enabled(if: ProcessInfo.processInfo.environment["CI"] == nil)
    )
    func captureWindowsByPIDMultipleInstances() async throws {
        // Only group applications that really have a bundle identifier. Lumping every
        // identifier-less process under one placeholder key would make unrelated
        // processes look like "multiple instances" of a single application.
        let bundledApps = NSWorkspace.shared.runningApplications.filter { $0.bundleIdentifier != nil }
        let appsByBundleID = Dictionary(grouping: bundledApps, by: \.bundleIdentifier)

        guard
            let instances = appsByBundleID.values.first(where: { $0.count > 1 }),
            let targetApp = instances.first
        else {
            // No application is running more than once right now; nothing to verify.
            return
        }

        let pid = targetApp.processIdentifier
        let command = makeMultiCaptureCommand(pid: pid)

        let result = try await captureWithPID(command: command, targetPID: pid)
        #expect(result.success == true)
        #expect(result.data != nil)
    }

    /// Verifies that malformed or implausible `PID:` identifiers are stored on the command as-is.
    ///
    /// Nothing is resolved here; this test only checks that `ImageCommand` holds the raw string.
    @Test("Invalid PID formats in image capture")
    func invalidPIDFormatsInImageCapture() throws {
        let invalidPIDs = [
            "PID:",          // Missing PID number
            "PID:abc",       // Non-numeric PID
            "PID:-123",      // Negative PID
            "PID:12.34",     // Decimal PID
            "PID:0",         // Zero PID
            "PID:999999999"  // Very large PID
        ]

        for invalidPID in invalidPIDs {
            var command = ImageCommand()
            command.app = invalidPID
            command.mode = .window
            command.format = .png
            command.jsonOutput = true

            // The identifier is kept verbatim on the command.
            #expect(command.app == invalidPID)
        }
    }

    /// Verifies that a `PID:` identifier can be set alongside a window index or a window title.
    @Test("PID targeting with window specifiers")
    func pidTargetingWithWindowSpecifiers() throws {
        // PID combined with a window index.
        var indexedCommand = ImageCommand()
        indexedCommand.app = "PID:1234"
        indexedCommand.windowIndex = 0
        indexedCommand.mode = .window

        #expect(indexedCommand.app == "PID:1234")
        #expect(indexedCommand.windowIndex == 0)

        // PID combined with a window title.
        var titledCommand = ImageCommand()
        titledCommand.app = "PID:5678"
        titledCommand.windowTitle = "Document"
        titledCommand.mode = .window

        #expect(titledCommand.app == "PID:5678")
        #expect(titledCommand.windowTitle == "Document")
    }

    /// Checks the shape of the filename pattern expected for PID captures.
    ///
    /// NOTE(review): the filename is assembled inside this test, so no production
    /// filename-generation code is exercised here.
    @Test("PID targeting filename generation")
    func pidTargetingFilenameGeneration() throws {
        let pid: pid_t = 1234
        let appName = "TestApp"
        let timestamp = "20250608_120000"

        // Expected filename format for a PID capture.
        let expectedFilename = "\(appName)_PID_\(pid)_\(timestamp).png"

        #expect(expectedFilename.contains("PID"))
        #expect(expectedFilename.contains(String(pid)))
        #expect(expectedFilename.contains(appName))
    }

    // MARK: - Helpers

    /// Builds the command shared by the simulated capture tests: multi-window mode,
    /// PNG format, output into the temporary directory, JSON output enabled.
    ///
    /// - Parameter pid: Process identifier to embed in the `PID:<n>` application identifier.
    private func makeMultiCaptureCommand(pid: pid_t) -> ImageCommand {
        var command = ImageCommand()
        command.app = "PID:\(pid)"
        command.mode = .multi
        command.format = .png
        command.path = NSTemporaryDirectory()
        command.jsonOutput = true
        return command
    }

    /// Simulates a PID-based capture without taking a screenshot.
    ///
    /// - Parameters:
    ///   - command: Command whose `path` is used as the output directory; falls back to
    ///     the temporary directory when `path` is `nil`.
    ///   - targetPID: Process identifier that must belong to a running application.
    /// - Returns: A successful `JSONResponse` carrying one synthetic saved-file entry.
    /// - Throws: `ApplicationError.notFound` when no running application has `targetPID`.
    private func captureWithPID(command: ImageCommand, targetPID: pid_t) async throws -> JSONResponse {
        guard let app = NSRunningApplication(processIdentifier: targetPID) else {
            throw ApplicationError.notFound("No application found with PID: \(targetPID)")
        }

        let appName = app.localizedName ?? "Unknown"

        // Join via URL rather than string interpolation: `NSTemporaryDirectory()` already
        // ends in a path separator, so interpolating "/" after it produced a double slash.
        let outputDirectory = URL(fileURLWithPath: command.path ?? NSTemporaryDirectory(), isDirectory: true)
        let outputPath = outputDirectory
            .appendingPathComponent("\(appName)_PID_\(targetPID).png")
            .path

        let savedFile = SavedFile(
            path: outputPath,
            item_label: appName,
            window_title: nil,
            window_id: nil,
            window_index: nil,
            mime_type: "image/png"
        )

        let captureData = ImageCaptureData(saved_files: [savedFile])

        return JSONResponse(
            success: true,
            data: captureData,
            messages: ["Captured windows for PID: \(targetPID)"],
            debugLogs: [],
            error: nil
        )
    }
}

View file

@ -0,0 +1,64 @@
import Foundation
import AppKit
import Testing
@testable import peekaboo
@Suite("PID Targeting Tests")
struct PIDTargetingTests {
    /// Resolves a currently running application through its `PID:<n>` identifier and
    /// checks that the finder returns that same process. Disabled when `CI` is set.
    @Test("Find application by valid PID", .enabled(if: ProcessInfo.processInfo.environment["CI"] == nil))
    func findByValidPID() throws {
        // Any running application with a localized name serves as the lookup target.
        let candidates = NSWorkspace.shared.runningApplications
        guard let expected = candidates.first(where: { $0.localizedName != nil }) else {
            Issue.record("No running applications found for testing")
            return
        }

        let expectedPID = expected.processIdentifier

        do {
            let resolved = try ApplicationFinder.findApplication(identifier: "PID:\(expectedPID)")
            #expect(resolved.processIdentifier == expectedPID)
            #expect(resolved.bundleIdentifier == expected.bundleIdentifier)
        } catch {
            Issue.record("Failed to find application by PID: \(error)")
        }
    }

    /// Every identifier below must make the finder throw an `ApplicationError`.
    @Test("Invalid PID format throws error")
    func invalidPIDFormat() throws {
        let rejectedIdentifiers = [
            "PID:",          // Missing PID number
            "PID:abc",       // Non-numeric PID
            "PID:-123",      // Negative PID
            "PID:12.34",     // Decimal PID
            "PID:999999999"  // Very large PID (likely non-existent)
        ]

        for rejected in rejectedIdentifiers {
            #expect(throws: ApplicationError.self) {
                _ = try ApplicationFinder.findApplication(identifier: rejected)
            }
        }
    }

    /// A well-formed identifier for a PID that is not expected to be running must
    /// surface as `ApplicationError.notFound` with a message that mentions the PID.
    @Test("Non-existent PID throws notFound error")
    func nonExistentPID() throws {
        // A high PID number that is unlikely to belong to a running process.
        let identifier = "PID:99999"

        do {
            _ = try ApplicationFinder.findApplication(identifier: identifier)
            Issue.record("Expected error for non-existent PID")
        } catch let ApplicationError.notFound(message) {
            // The message should carry the PID (or be the identifier itself).
            #expect(message.contains("99999") || message == identifier,
                    "Error message '\(message)' should mention PID 99999")
        } catch {
            Issue.record("Unexpected error: \(error)")
        }
    }
}

View file

@ -0,0 +1,117 @@
import Foundation
import AppKit
import Testing
import ArgumentParser
@testable import peekaboo
@Suite("PID Windows Subcommand Tests")
struct PIDWindowsSubcommandTests {
    /// `--app PID:<n>` together with `--json-output` parses into the matching fields.
    @Test("Parse windows subcommand with PID")
    func parseWindowsSubcommandWithPID() throws {
        let arguments = ["--app", "PID:1234", "--json-output"]
        let parsed = try WindowsSubcommand.parse(arguments)

        #expect(parsed.app == "PID:1234")
        #expect(parsed.jsonOutput == true)
    }

    /// A PID identifier can be combined with an `--include-details` list.
    @Test("Parse windows subcommand with PID and details")
    func parseWindowsSubcommandWithPIDAndDetails() throws {
        let arguments = [
            "--app", "PID:5678",
            "--include-details", "ids,bounds,off_screen",
            "--json-output"
        ]
        let parsed = try WindowsSubcommand.parse(arguments)

        #expect(parsed.app == "PID:5678")
        #expect(parsed.includeDetails == "ids,bounds,off_screen")
        #expect(parsed.jsonOutput == true)
    }

    /// PID identifiers of different digit counts are all stored verbatim in `app`.
    @Test("Various PID formats in windows subcommand")
    func variousPIDFormatsInWindowsSubcommand() throws {
        let identifiers = [
            "PID:1",      // Single digit
            "PID:123",    // Three digits
            "PID:99999",  // Large PID
        ]

        for identifier in identifiers {
            let parsed = try WindowsSubcommand.parse(["--app", identifier])
            #expect(parsed.app == identifier)
        }
    }

    /// `ApplicationInfo` exposes its PID as a property and in its JSON encoding.
    @Test("ApplicationInfo includes PID")
    func applicationInfoIncludesPID() throws {
        let info = ApplicationInfo(
            app_name: "TestApp",
            bundle_id: "com.test.app",
            pid: 1234,
            is_active: false,
            window_count: 2
        )

        #expect(info.pid == 1234)
        #expect(info.app_name == "TestApp")

        // The encoded JSON must contain the PID key/value pair.
        let json = try encodedJSON(info)
        #expect(json.contains("\"pid\":1234"))
    }

    /// `TargetApplicationInfo` exposes its PID as a property and in its JSON encoding.
    @Test("TargetApplicationInfo includes PID")
    func targetApplicationInfoIncludesPID() throws {
        let target = TargetApplicationInfo(
            app_name: "Safari",
            bundle_id: "com.apple.Safari",
            pid: 5678
        )

        #expect(target.pid == 5678)

        let json = try encodedJSON(target)
        #expect(json.contains("\"pid\":5678"))
    }

    /// `WindowListData` keeps the target application's PID next to its window list.
    @Test("WindowListData structure with PID")
    func windowListDataStructureWithPID() throws {
        let target = TargetApplicationInfo(
            app_name: "Terminal",
            bundle_id: "com.apple.Terminal",
            pid: 9999
        )
        let window = WindowInfo(
            window_title: "~/Projects",
            window_id: 456,
            window_index: 0,
            bounds: nil,
            is_on_screen: true
        )

        let listData = WindowListData(windows: [window], target_application_info: target)

        #expect(listData.target_application_info.pid == 9999)
        #expect(listData.windows.count == 1)
    }

    // MARK: - Helpers

    /// Encodes `value` with a default `JSONEncoder` and returns the output as a UTF-8 string.
    private func encodedJSON<Value: Encodable>(_ value: Value) throws -> String {
        let data = try JSONEncoder().encode(value)
        return String(decoding: data, as: UTF8.self)
    }
}

View file

@ -43,8 +43,8 @@ export const listToolSchema = z
.optional()
.describe(
"Required when `item_type` is `application_windows`. " +
"Specifies the target application by its name (e.g., \"Safari\", \"TextEdit\") or bundle ID. " +
"Fuzzy matching is used, so partial names may work.",
"Specifies the target application by its name (e.g., \"Safari\", \"TextEdit\"), bundle ID, or process ID (e.g., \"PID:663\"). " +
"Fuzzy matching is used for names, so partial names may work.",
),
include_window_details: z.preprocess(
(val) => {

View file

@ -121,6 +121,7 @@ export const imageToolSchema = z.object({
"Use `'screen:INDEX'` (e.g., `'screen:0'`) for a specific display.\n" +
"Use `'frontmost'` for all windows of the current foreground application.\n" +
"Use `'AppName'` (e.g., `'Safari'`) for all windows of that application.\n" +
"Use `'PID:PROCESS_ID'` (e.g., `'PID:663'`) to target a specific process by its PID.\n" +
"Use `'AppName:WINDOW_TITLE:Title'` (e.g., `'TextEdit:WINDOW_TITLE:My Notes'`) for a window of 'AppName' matching that title.\n" +
"Use `'AppName:WINDOW_INDEX:Index'` (e.g., `'Preview:WINDOW_INDEX:0'`) for a window of 'AppName' at that index.\n" +
"Ensure components are correctly colon-separated.",

View file

@ -0,0 +1,138 @@
import { describe, it, expect, beforeEach, vi } from "vitest";
import { imageToolHandler } from "../../../src/tools/image";
import * as peekabooCliModule from "../../../src/utils/peekaboo-cli";
import type { SwiftCliResponse } from "../../../src/types";
import type { ToolContext } from "@modelcontextprotocol/sdk/types";
import pino from "pino";
// Auto-mock the peekaboo-cli module so each test decides what `executeSwiftCli` resolves to.
vi.mock("../../../src/utils/peekaboo-cli");

// Minimal tool context: the handler only receives a silenced logger.
const mockContext: ToolContext = {
  logger: pino({ level: "silent" }),
};

/**
 * Tests for `PID:<n>` application targeting through `imageToolHandler`.
 *
 * The Swift CLI is mocked throughout. These tests cover how the handler forwards
 * the target and how it surfaces the CLI's success and error payloads.
 */
describe("PID Targeting Tests", () => {
  beforeEach(() => {
    // Reset recorded calls and stubbed results so tests do not leak into each other.
    vi.clearAllMocks();
  });

  it("should handle PID targeting correctly", async () => {
    const mockResponse: SwiftCliResponse = {
      success: true,
      data: {
        saved_files: [
          {
            path: "/tmp/test_PID_663.png",
            item_label: "Ghostty",
            mime_type: "image/png",
          },
        ],
      },
    };

    vi.mocked(peekabooCliModule.executeSwiftCli).mockResolvedValue(mockResponse);

    const result = await imageToolHandler(
      {
        app_target: "PID:663",
        path: "/tmp/test.png",
      },
      mockContext,
    );

    // The summary text reports the capture, and the saved file is passed through.
    expect(result.content).toEqual(
      expect.arrayContaining([
        expect.objectContaining({
          type: "text",
          text: expect.stringContaining("Captured 1 image"),
        }),
      ]),
    );
    expect(result.saved_files).toHaveLength(1);
    expect(result.saved_files![0].path).toBe("/tmp/test_PID_663.png");
  });

  it("should handle invalid PID format", async () => {
    const mockResponse: SwiftCliResponse = {
      success: false,
      error: {
        code: "APP_NOT_FOUND",
        message: "Invalid PID format: PID:abc",
      },
    };

    vi.mocked(peekabooCliModule.executeSwiftCli).mockResolvedValue(mockResponse);

    const result = await imageToolHandler(
      {
        app_target: "PID:abc",
      },
      mockContext,
    );

    // The CLI's error message must reach the caller as an error result.
    expect(result.isError).toBe(true);
    expect(result.content[0]).toMatchObject({
      type: "text",
      text: expect.stringContaining("Invalid PID format"),
    });
  });

  it("should handle non-existent PID", async () => {
    const mockResponse: SwiftCliResponse = {
      success: false,
      error: {
        code: "APP_NOT_FOUND",
        message: "No application found with PID: 99999",
      },
    };

    vi.mocked(peekabooCliModule.executeSwiftCli).mockResolvedValue(mockResponse);

    const result = await imageToolHandler(
      {
        app_target: "PID:99999",
      },
      mockContext,
    );

    expect(result.isError).toBe(true);
    expect(result.content[0]).toMatchObject({
      type: "text",
      text: expect.stringContaining("No application found with PID"),
    });
  });

  it("should pass PID targeting to Swift CLI correctly", async () => {
    // Use the same `saved_files` payload key as the successful capture test above.
    // The previous `images` key did not match that shape, so the "success" fixture
    // was not a well-formed capture response.
    const mockResponse: SwiftCliResponse = {
      success: true,
      data: {
        saved_files: [
          {
            path: "/tmp/test.png",
            item_label: "Some App",
            mime_type: "image/png",
          },
        ],
      },
    };

    vi.mocked(peekabooCliModule.executeSwiftCli).mockResolvedValue(mockResponse);

    await imageToolHandler(
      {
        app_target: "PID:1234",
        path: "/tmp/test.png",
      },
      mockContext,
    );

    // Verify the Swift CLI was called with the PID target forwarded unchanged via `--app`.
    expect(peekabooCliModule.executeSwiftCli).toHaveBeenCalledWith(
      expect.arrayContaining(["image", "--app", "PID:1234"]),
      expect.anything(),
      expect.anything(),
    );
  });
});