From 296bf8752275a94bac6d39e834dc7cd7d136d7cc Mon Sep 17 00:00:00 2001
From: Sami Samhuri <sami@samhuri.net>
Date: Wed, 25 Jun 2025 09:24:18 -0400
Subject: [PATCH] Tweak prompts to emphasize people's emotions and activities
 rather than counting them

---
 extract_tags.rb                         |  8 ++++----
 prompts/01-structured-comprehensive.txt |  2 +-
 prompts/03-single-list.txt              |  2 +-
 prompts/05-detailed-elements.txt        |  2 +-
 prompts/08-memory-search-optimizer.txt  | 13 -------------
 prompts/11-smart-scene-decoder.txt      | 13 -------------
 6 files changed, 7 insertions(+), 33 deletions(-)
 delete mode 100644 prompts/08-memory-search-optimizer.txt
 delete mode 100644 prompts/11-smart-scene-decoder.txt

diff --git a/extract_tags.rb b/extract_tags.rb
index 2fb1fd0..c559fe7 100755
--- a/extract_tags.rb
+++ b/extract_tags.rb
@@ -11,7 +11,7 @@ require 'time'
 
 class TagExtractor
   OLLAMA_URL = 'http://localhost:11434/api/generate'
-  DEFAULT_MODELS = ['llava:7b', 'qwen2.5vl:7b', 'bakllava:7b', 'minicpm-v:8b', 'llama3.2-vision:11b', 'llava:13b']
+  DEFAULT_MODELS = ['llava:7b', 'qwen2.5vl:7b', 'minicpm-v:8b']
   VALID_EXTENSIONS = %w[.jpg .jpeg .png .gif .bmp .tiff .tif].freeze
 
   def initialize(options = {})
@@ -21,7 +21,7 @@ class TagExtractor
     @max_images = options[:max_images] || nil
     @no_unload = options[:no_unload] || false
     @single_prompt = options[:single_prompt] || nil
-    @system_prompt = options[:system_prompt] || "You are an image-keyword assistant. After analyzing each picture, output one line containing concise, lowercase English keywords separated by commas. Include scene type, activities, emotions, dominant colours, time-of-day, objects in foreground, objects in background. For people: only include 'people' as a keyword if humans are actually visible in the image, followed by descriptive count like '3-people' or 'group'. If no people are present, do not include any people-related keywords. Do not repeat synonyms. Do not output anything except the comma-separated keyword list."
+    @system_prompt = options[:system_prompt] || "You are an image-keyword assistant. After analyzing each picture, output one line containing concise, lowercase English keywords separated by commas. Focus on people's emotions, expressions, moods, and activities if present. Include overall atmosphere, key objects, dominant colors, lighting quality, and setting. For people: include 'people' if humans are visible, with descriptors like 'couple', 'group', or 'crowd'. If the image appears to be a selfie or POV (point-of-view/first-person perspective), include 'selfie' or 'pov' as appropriate. Prioritize emotional and mood keywords. Do not repeat synonyms. Do not output anything except the comma-separated keyword list."
   end
 
   def run
@@ -122,8 +122,8 @@ class TagExtractor
   def collect_images
     images = []
 
-    # Only process 768 and 1024 sizes
-    allowed_sizes = [768, 1024]
+    # Only process 768 size
+    allowed_sizes = [768]
 
     Dir.glob('photo-*').select { |d| File.directory?(d) }.each do |dir|
       size_match = dir.match(/photo-(\d+)/)
diff --git a/prompts/01-structured-comprehensive.txt b/prompts/01-structured-comprehensive.txt
index 59e797c..4aff518 100644
--- a/prompts/01-structured-comprehensive.txt
+++ b/prompts/01-structured-comprehensive.txt
@@ -1 +1 @@
-List comma-separated keywords only. For this image include: dominant and accent colors; all visible objects; people presence (if any humans visible, include 'people' followed by count descriptor like '3-people' or 'couple'); their actions, interactions, and emotions; setting type (indoors/outdoors, location); weather; time of day; and overall mood or ambiance.
\ No newline at end of file
+List comma-separated keywords only. For this image include: people presence (if humans visible, include 'people' and describe as 'couple', 'group', or 'crowd'); their emotions, expressions, and mood (happy, relaxed, contemplative, excited, etc.); what they're doing and how they're interacting; camera perspective (include 'selfie' if self-portrait or 'pov' if first-person view); dominant colors; key objects; overall atmosphere and mood; lighting quality; and setting/location.
\ No newline at end of file
diff --git a/prompts/03-single-list.txt b/prompts/03-single-list.txt
index 69a4e93..5e96e18 100644
--- a/prompts/03-single-list.txt
+++ b/prompts/03-single-list.txt
@@ -1 +1 @@
-Provide a single comma-separated list of keywords. Capture colors, objects, count of people, activities and interactions, emotional tone, setting (e.g. beach/urban/indoor), weather, time of day, and overall scene mood—nothing else.
\ No newline at end of file
+Provide a single comma-separated list of keywords. If people are present, capture their emotions, expressions, moods, and what they're doing. Note if it's a selfie or POV shot. Include overall atmosphere, key objects, dominant colors, lighting, and setting/location—nothing else.
\ No newline at end of file
diff --git a/prompts/05-detailed-elements.txt b/prompts/05-detailed-elements.txt
index b15348d..677994b 100644
--- a/prompts/05-detailed-elements.txt
+++ b/prompts/05-detailed-elements.txt
@@ -1 +1 @@
-Create keywords as comma-separated values only. Keyword visible colors (dominant and secondary), every object, people with count, their activities and expressions, location details, environmental conditions, time indicators, and the emotional atmosphere of the scene.
\ No newline at end of file
+Create keywords as comma-separated values only. Focus on: people's emotions, expressions, moods, and activities if present; camera perspective (selfie or pov if applicable); the emotional atmosphere and overall mood; important objects; dominant colors; lighting quality; and location/setting details.
\ No newline at end of file
diff --git a/prompts/08-memory-search-optimizer.txt b/prompts/08-memory-search-optimizer.txt
deleted file mode 100644
index 74dca44..0000000
--- a/prompts/08-memory-search-optimizer.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Create detailed keywords for video diary search. Users might search for: "happy moments", "food experiences", "family time", "adventures", "quiet moments", "celebrations", "daily life", "travel memories". 
-
-Keyword everything visible including:
-- People: if present include 'people' with count descriptor (e.g. '3-people'), approximate ages, emotions on faces, what they're doing, how they're interacting
-- Scene type: where this is happening, indoor/outdoor, public/private space
-- Time: morning light, afternoon, golden hour, evening, night time
-- Mood: the feeling of the moment (joyful, peaceful, exciting, intimate, festive, contemplative)
-- Activities: eating, playing, working, relaxing, traveling, celebrating, exploring
-- Details: specific foods visible, drinks, decorations, clothing styles, weather, season
-- Colors: main colors that define the scene
-- Special moments: laughter, hugs, cheers, surprises, achievements
-
-Format: comma-separated keywords only, be specific rather than generic.
\ No newline at end of file
diff --git a/prompts/11-smart-scene-decoder.txt b/prompts/11-smart-scene-decoder.txt
deleted file mode 100644
index 17bd4a9..0000000
--- a/prompts/11-smart-scene-decoder.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-You're analyzing frames for an AI-powered video diary search. Users search with natural language like "dinner with friends", "kids playing", "sunset at the beach", "birthday celebrations", "quiet morning coffee".
-
-Extract and keyword:
-HUMANS: if humans visible include 'people' and descriptive count (e.g. '4-people', 'couple', 'crowd'), estimated ages in decades (20s/30s/etc), primary emotion per person, body language, attire style. Skip if no humans present
-ACTIONS: primary action, secondary actions, interactions, gestures
-LOCATION: venue type, indoor/outdoor, architectural style, geographic region if evident
-TEMPORAL: exact time if visible, otherwise: dawn/morning/noon/afternoon/dusk/night, season indicators
-AMBIANCE: energy level 1-10, mood descriptors, lighting quality, color temperature
-OBJECTS: enumerate all significant objects, food with cuisine type, beverages, decorations
-CONTEXT: occasion type, relationship dynamics, cultural indicators
-TECHNICAL: image quality descriptors, composition style
-
-Output as comma-separated keywords. Prioritize specific over generic (e.g., "pepperoni pizza" not just "food").
\ No newline at end of file