diff --git a/CLAUDE.md b/CLAUDE.md index e49172d..5ecc9e7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -72,6 +72,25 @@ The evaluation framework tests different combinations of: ## Current Focus Based on git history, the project has narrowed from broad testing to: -- **Models**: llava:7b (quality) and qwen2.5vl:3b (speed) -- **Sizes**: 768px and 1024px (balance of quality and performance) -- **Goal**: Optimal tag extraction for video diary search functionality \ No newline at end of file +- **Models**: llava:7b, qwen2.5vl:7b, and minicpm-v:8b +- **Sizes**: 768px (optimal balance of quality and performance) +- **Prompts**: Simplified to 01, 03, and 05 (complex prompts removed) +- **Goal**: Optimal tag extraction for video diary search functionality + +## Evaluation Priorities + +When evaluating model performance, our priorities are (in order): +1. **People detection** - Detecting human presence, emotions, expressions, moods, activities, and interactions +2. **Overall mood/atmosphere** - Capturing the feeling and emotional tone of scenes +3. **Objects** - Important items that provide context +4. **Scene details** - Colors, lighting, setting/location, time of day +5. 
**Camera perspective** - Identifying selfies and POV (first-person) shots + +### Key Insights from Testing +- **Emotion focus**: We prioritize understanding how people feel over precisely counting them +- **Background matters**: Details like "bicycles in distance" enable memory-based searches +- **Simple prompts win**: Complex prompts cause repetition without adding value +- **Model strengths vary**: + - Qwen2.5VL: Best for emotion keywords + - MiniCPM-V: Best for comprehensive scene understanding + - LLaVA:7b: Most reliable with minimal repetition diff --git a/extract_tags.rb b/extract_tags.rb index c559fe7..8c61514 100755 --- a/extract_tags.rb +++ b/extract_tags.rb @@ -11,7 +11,8 @@ require 'time' class TagExtractor OLLAMA_URL = 'http://localhost:11434/api/generate' - DEFAULT_MODELS = ['llava:7b', 'qwen2.5vl:7b', 'minicpm-v:8b'] + # DEFAULT_MODELS = ['llava:7b', 'qwen2.5vl:7b', 'minicpm-v:8b'] + DEFAULT_MODELS = ['gemma3:4b', 'gemma3:12b', 'gemma3:27b'] VALID_EXTENSIONS = %w[.jpg .jpeg .png .gif .bmp .tiff .tif].freeze def initialize(options = {}) @@ -61,15 +62,20 @@ class TagExtractor # Check if model exists and pull if needed unless model_exists?(model) - puts " 📦 Model not found locally. Pulling #{model}..." + puts " 📦 Model #{model} not found locally. Attempting to pull..." + puts " ⏳ This may take a while for large models..." + pull_success = system("ollama pull #{model}") unless pull_success - puts " ❌ Failed to pull #{model}. Skipping..." + puts " ❌ Failed to pull #{model}. Skipping this model." + puts " Try running manually: ollama pull #{model}" next end puts " ✓ Successfully pulled #{model}" + else + puts " ✓ Model #{model} already available" end # Ensure model is loaded @@ -205,8 +211,8 @@ class TagExtractor def model_exists?(model) list_output = `ollama list 2>&1` - model_name = model.split(':').first - list_output.include?(model_name) + # The model name appears at the start of each line in the output + list_output.lines.any? 
{ |line| line.strip.start_with?("#{model} ") || line.strip.start_with?("#{model}\t") } end def unload_model(model)