diff --git a/CHANGELOG.md b/CHANGELOG.md index 03efa2d..827ef4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,7 +30,7 @@ Peekaboo MCP is now production-ready! This release marks the culmination of exte - Robust permission handling for Screen Recording and Accessibility ### Requirements -- macOS 15.0 or later (Sequoia) +- macOS 14.0 or later (Sonoma) - Node.js 18 or later - Screen Recording permission (for capture features) - Accessibility permission (optional, for foreground window detection) diff --git a/README.md b/README.md index 0fe7e47..0122bf4 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![npm version](https://badge.fury.io/js/%40steipete%2Fpeekaboo-mcp.svg)](https://www.npmjs.com/package/@steipete/peekaboo-mcp) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![macOS](https://img.shields.io/badge/macOS-15.0%2B-blue.svg)](https://www.apple.com/macos/) +[![macOS](https://img.shields.io/badge/macOS-14.0%2B-blue.svg)](https://www.apple.com/macos/) [![Node.js](https://img.shields.io/badge/node-%3E%3D20.0.0-brightgreen.svg)](https://nodejs.org/) Peekaboo is a macOS-only MCP server that enables AI agents to capture screenshots of applications, windows, or the entire system, with optional visual question answering through local or remote AI models. @@ -33,7 +33,7 @@ Read more about the design philosophy and implementation details in the [blog po ### Requirements -- **macOS 15.0+** (Sequoia or later) +- **macOS 14.0+** (Sonoma or later) - **Node.js 20.0+** - **Screen Recording Permission** (you'll be prompted on first use) diff --git a/peekaboo-cli/Package.swift b/peekaboo-cli/Package.swift index dc17558..9959bbc 100644 --- a/peekaboo-cli/Package.swift +++ b/peekaboo-cli/Package.swift @@ -4,7 +4,7 @@ import PackageDescription let package = Package( name: "peekaboo", platforms: [ - .macOS(.v15) + .macOS(.v14) ], products: [ .executable( diff --git a/peekaboo-cli/Sources/peekaboo/main.swift b/peekaboo-cli/Sources/peekaboo/main.swift index d0d60e5..88bb133 100644 --- a/peekaboo-cli/Sources/peekaboo/main.swift +++ b/peekaboo-cli/Sources/peekaboo/main.swift @@ -2,7 +2,7 @@ import ArgumentParser import Foundation @main -@available(macOS 10.15, *) +@available(macOS 14.0, *) struct PeekabooCommand: AsyncParsableCommand { static let configuration = CommandConfiguration( commandName: "peekaboo", diff --git a/src/utils/ai-providers.ts b/src/utils/ai-providers.ts index 4f12060..3afb1d6 100644 --- a/src/utils/ai-providers.ts +++ b/src/utils/ai-providers.ts @@ -106,6 +106,9 @@ async function analyzeWithOllama( logger.debug({ model, baseUrl }, "Analyzing image with Ollama"); + // Default to describing the image if no question is provided + const prompt = question.trim() || "Please describe what you see in this image."; + const response = await fetch(`${baseUrl}/api/generate`, { method: "POST", headers: { @@ -113,7 +116,7 @@ async function analyzeWithOllama( }, body: JSON.stringify({ model, - prompt: question, + prompt, images: [imageBase64], stream: false, }), @@ -147,13 +150,16 @@ async function analyzeWithOpenAI( const openai = new OpenAI({ apiKey }); + // Default to describing the image if no question is provided + const prompt = question.trim() || "Please describe what you see in this image."; + const response = await openai.chat.completions.create({ model: model || "gpt-4o", messages: [ { role: "user", content: [ - { type: "text", text: question }, + { type: "text", text: prompt }, { type: "image_url", image_url: { diff --git a/tests/unit/tools/analyze-edge-cases.test.ts b/tests/unit/tools/analyze-edge-cases.test.ts index 81db6c4..d7a267a 100644 --- a/tests/unit/tools/analyze-edge-cases.test.ts +++ b/tests/unit/tools/analyze-edge-cases.test.ts @@ -37,7 +37,7 @@ describe("Analyze Tool - Edge Cases", () => { }); describe("Empty question handling", () => { - it("should handle empty string question", async () => { + it("should handle empty string question with default prompt", async () => { mockParseAIProviders.mockReturnValue([ { provider: "ollama", model: "llava:latest" } ]); @@ -49,8 +49,8 @@ describe("Analyze Tool - Edge Cases", () => { mockReadImageAsBase64.mockResolvedValue("base64imagedata"); - // Mock Ollama returning "No response from Ollama" for empty question - mockAnalyzeImageWithProvider.mockResolvedValue("No response from Ollama"); + // Mock Ollama returning a description of the image + mockAnalyzeImageWithProvider.mockResolvedValue("This image shows a desktop window with various UI elements."); const result = await analyzeToolHandler( { @@ -69,11 +69,11 @@ describe("Analyze Tool - Edge Cases", () => { ); expect(result.content[0].type).toBe("text"); - expect(result.content[0].text).toBe("No response from Ollama"); + expect(result.content[0].text).toBe("This image shows a desktop window with various UI elements."); expect(result.model_used).toBe("ollama/llava:latest"); }); - it("should handle whitespace-only question", async () => { + it("should handle whitespace-only question with default prompt", async () => { mockParseAIProviders.mockReturnValue([ { provider: "ollama", model: "llava:latest" } ]); @@ -84,7 +84,7 @@ describe("Analyze Tool - Edge Cases", () => { }); mockReadImageAsBase64.mockResolvedValue("base64imagedata"); - mockAnalyzeImageWithProvider.mockResolvedValue("No response from Ollama"); + mockAnalyzeImageWithProvider.mockResolvedValue("This image shows a screenshot of an application."); const result = await analyzeToolHandler( { @@ -102,10 +102,10 @@ describe("Analyze Tool - Edge Cases", () => { mockLogger, ); - expect(result.content[0].text).toBe("No response from Ollama"); + expect(result.content[0].text).toBe("This image shows a screenshot of an application."); }); - it("should handle question with only newlines", async () => { + it("should handle question with only newlines with default prompt", async () => { mockParseAIProviders.mockReturnValue([ { provider: "ollama", model: "llava:latest" } ]); @@ -116,7 +116,7 @@ describe("Analyze Tool - Edge Cases", () => { }); mockReadImageAsBase64.mockResolvedValue("base64imagedata"); - mockAnalyzeImageWithProvider.mockResolvedValue("No response from Ollama"); + mockAnalyzeImageWithProvider.mockResolvedValue("This image displays a user interface with multiple sections."); const result = await analyzeToolHandler( { @@ -134,7 +134,7 @@ describe("Analyze Tool - Edge Cases", () => { mockLogger, ); - expect(result.content[0].text).toBe("No response from Ollama"); + expect(result.content[0].text).toBe("This image displays a user interface with multiple sections."); }); }); diff --git a/tests/unit/utils/ai-providers.test.ts b/tests/unit/utils/ai-providers.test.ts index 353b5c0..2bbeb39 100644 --- a/tests/unit/utils/ai-providers.test.ts +++ b/tests/unit/utils/ai-providers.test.ts @@ -335,6 +335,51 @@ describe("AI Providers Utility", () => { expect(result).toBe("No response from Ollama"); }); + it("should use default prompt for empty question with Ollama", async () => { + (global.fetch as vi.Mock).mockResolvedValueOnce({ + ok: true, + json: async () => ({ response: "This image shows a window with text content." }), + }); + const result = await analyzeImageWithProvider( + { provider: "ollama", model: "llava" }, + "path/img.png", + imageBase64, + "", // Empty question + mockLogger, + ); + expect(result).toBe("This image shows a window with text content."); + const fetchCall = (global.fetch as vi.Mock).mock.calls[0]; + const body = JSON.parse(fetchCall[1].body); + expect(body.prompt).toBe("Please describe what you see in this image."); + }); + + it("should use default prompt for whitespace-only question with OpenAI", async () => { + process.env.OPENAI_API_KEY = "test-key"; + mockChatCompletionsCreate.mockResolvedValueOnce({ + choices: [{ message: { content: "This image displays a user interface." } }], + }); + + const result = await analyzeImageWithProvider( + { provider: "openai", model: "gpt-4o" }, + "path/img.png", + imageBase64, + " ", // Whitespace-only question + mockLogger, + ); + expect(result).toBe("This image displays a user interface."); + expect(mockChatCompletionsCreate).toHaveBeenCalledWith( + expect.objectContaining({ + messages: expect.arrayContaining([ + expect.objectContaining({ + content: expect.arrayContaining([ + { type: "text", text: "Please describe what you see in this image." }, + ]), + }), + ]), + }), + ); + }); + it("should throw error for anthropic provider (not implemented)", async () => { await expect( analyzeImageWithProvider(