Simplify, focus on llava:7b and qwen2.5vl:3b and 768px and 1024px images

This commit is contained in:
Sami Samhuri 2025-06-24 23:05:19 -04:00
parent 9c32f2d04c
commit 437a4a3284
No known key found for this signature in database
15 changed files with 169 additions and 592 deletions

View file

@ -1,117 +0,0 @@
#!/usr/bin/env ruby
# Benchmark extract_tags.rb throughput at several parallel-request settings
# for one model, averaging multiple runs per setting, printing a summary
# table, and saving a JSON report to benchmark_results.json.
require 'fileutils'
require 'json'
require 'time' # Time#iso8601 (used for the JSON report) is only defined after this require

# Configuration
test_model = ARGV[0] || 'moondream:1.8b'
test_parallel_sizes = [1, 2, 4, 8]
test_prompt = '01-structured-comprehensive'
num_images = 8
num_runs = 3 # Multiple runs for averaging

puts "=" * 60
puts "PARALLELISM BENCHMARK"
puts "=" * 60
puts "Model: #{test_model}"
puts "Images: #{num_images}"
puts "Prompt: #{test_prompt}"
puts "Runs per setting: #{num_runs}"
puts

# Ensure we have test images
unless Dir.exist?('photo-512')
  puts "Creating test images (one-time setup)..."
  system('./resize_images.rb')
end

# Pull model if needed
puts "Ensuring model is available..."
unless `ollama list`.include?(test_model)
  system("ollama pull #{test_model}")
end

results = {}
test_parallel_sizes.each do |parallel|
  puts "\n" + "-" * 40
  puts "Testing #{parallel} parallel requests..."
  times = []
  num_runs.times do |run|
    print " Run #{run + 1}/#{num_runs}... "
    # Clean up previous results
    FileUtils.rm_rf('results')
    # Monotonic clock: wall-clock (Time.now) can jump under NTP adjustments
    # and would skew duration measurements.
    start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    # Run the extraction with specific parameters
    cmd = [
      "./extract_tags.rb",
      "-p #{parallel}",
      "-m #{test_model}",
      "--max-images #{num_images}",
      "--single-prompt #{test_prompt}",
      "--no-unload" # Keep model loaded between runs
    ].join(" ")
    success = system(cmd, out: File::NULL, err: File::NULL)
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
    times << elapsed
    # Surface failed runs instead of silently averaging bogus timings
    # (previously `success` was captured but never checked).
    status = success ? "" : " (⚠️ extract_tags.rb exited non-zero)"
    puts "#{elapsed.round(2)}s#{status}"
  end
  avg_time = times.sum / times.length
  # Population standard deviation across the runs for this setting.
  std_dev = Math.sqrt(times.map { |t| (t - avg_time)**2 }.sum / times.length)
  results[parallel] = {
    times: times,
    average: avg_time,
    std_dev: std_dev,
    per_image: avg_time / num_images
  }
end

# Analysis: speedup is measured relative to the serial (parallel = 1) run.
puts "\n" + "=" * 60
puts "RESULTS SUMMARY"
puts "=" * 60
baseline = results[1][:average]
puts "\n%-10s %-12s %-12s %-12s %-12s" % ["Parallel", "Avg Time", "Std Dev", "Speedup", "Per Image"]
puts "-" * 60
results.each do |parallel, data|
  speedup = baseline / data[:average]
  puts "%-10d %-12.2f %-12.2f %-12.2f %-12.3f" % [
    parallel,
    data[:average],
    data[:std_dev],
    speedup,
    data[:per_image]
  ]
end

# Find the setting with the lowest average total time
optimal = results.min_by { |_, data| data[:average] }
puts "\n✅ Optimal parallelism: #{optimal[0]} (#{optimal[1][:average].round(2)}s average)"

# Save detailed results
File.write('benchmark_results.json', JSON.pretty_generate({
  model: test_model,
  num_images: num_images,
  prompt: test_prompt,
  timestamp: Time.now.iso8601,
  results: results
}))
puts "\n📊 Detailed results saved to: benchmark_results.json"

# Cleanup: unload the model so it doesn't hold memory after the benchmark
puts "\n🧹 Cleaning up..."
system("ollama stop #{test_model}", out: File::NULL, err: File::NULL)

View file

@ -8,28 +8,20 @@ require 'fileutils'
require 'csv'
require 'optparse'
require 'time'
require 'concurrent'
class TagExtractor
OLLAMA_URL = 'http://localhost:11434/api/generate'
DEFAULT_MODELS = {
'qwen2.5vl:3b' => 2,
'moondream:1.8b' => 8, # doesn't help a lot but doesn't hurt either
'llava:7b' => 2,
# 'llava:13b' => 2,
# 'llama3.2-vision:11b' => 1, # super slow, 3+ minutes for 8 photos
'llava-phi3:3.8b' => 4
}
DEFAULT_MODELS = ['llava:7b', 'qwen2.5vl:3b']
VALID_EXTENSIONS = %w[.jpg .jpeg .png .gif .bmp .tiff .tif].freeze
def initialize(options = {})
@global_parallel = options[:parallel] # Global override if specified
@models = options[:models] || DEFAULT_MODELS
@timeout = options[:timeout] || 120
@verbose = options[:verbose] || false
@max_images = options[:max_images] || nil
@no_unload = options[:no_unload] || false
@single_prompt = options[:single_prompt] || nil
@system_prompt = options[:system_prompt] || "You are an image-keyword assistant. After analyzing each picture, output one line containing concise, lowercase English keywords separated by commas. Include scene type, activities, number of people (e.g. '3-people'), emotions, dominant colours, time-of-day, objects in foreground, objects in background. Do not repeat synonyms. Do not output anything except the comma-separated keyword list."
end
def run
@ -54,29 +46,17 @@ class TagExtractor
puts
total_tasks = images.length * prompts.length * @models.length
completed = Concurrent::AtomicFixnum.new(0)
completed = 0
start_time = Time.now
# Create master CSV
master_csv = CSV.open('results/master.csv', 'w')
master_csv << %w[model image_size prompt_name image_filename tags raw_output timestamp success]
# Process in batches by model to allow proper cleanup
model_list = @models.is_a?(Hash) ? @models.keys : @models
model_list.each_with_index do |model, model_index|
# Determine parallelism for this model
parallel = if @global_parallel
@global_parallel # Use global override if specified
elsif @models.is_a?(Hash)
@models[model] || 2 # Use model-specific or default to 2
else
2 # Default parallelism
end
# Process each model sequentially
@models.each_with_index do |model, model_index|
puts "\n" + "=" * 60
puts "📊 Model #{model_index + 1}/#{model_list.length}: #{model}"
puts " Parallelism: #{parallel}"
puts "📊 Model #{model_index + 1}/#{@models.length}: #{model}"
puts "=" * 60
# Check if model exists and pull if needed
@ -95,34 +75,30 @@ class TagExtractor
# Ensure model is loaded
ensure_model_loaded(model)
# Create thread pool for this model
pool = Concurrent::FixedThreadPool.new(parallel)
# Process each prompt/image combination
prompts.each do |prompt_file, prompt_content|
prompt_name = File.basename(prompt_file, '.*')
images.each do |image_info|
pool.post do
process_single_image(
model: model,
image_info: image_info,
prompt_name: prompt_name,
prompt_content: prompt_content,
master_csv: master_csv,
completed: completed,
total: total_tasks
)
end
completed += 1
progress = (completed.to_f / total_tasks * 100).round(1)
# Clear the line with spaces to prevent leftover characters
print "\r%-80s" % " "
print "\r Progress: #{progress}% (#{completed}/#{total_tasks}) - Processing #{image_info[:filename]}"
process_single_image(
model: model,
image_info: image_info,
prompt_name: prompt_name,
prompt_content: prompt_content,
master_csv: master_csv
)
end
end
# Wait for all tasks for this model to complete
pool.shutdown
pool.wait_for_termination
# Unload model to free memory (unless disabled)
unless @no_unload
puts " 🧹 Unloading model #{model}..."
puts "\n 🧹 Unloading model #{model}..."
unload_model(model)
end
end
@ -145,9 +121,18 @@ class TagExtractor
def collect_images
images = []
# Only process 768 and 1024 sizes
allowed_sizes = [768, 1024]
Dir.glob('photo-*').select { |d| File.directory?(d) }.each do |dir|
size = dir.match(/photo-(\d+)/)[1]
size_match = dir.match(/photo-(\d+)/)
next unless size_match
size = size_match[1].to_i
# Skip sizes we don't want
next unless allowed_sizes.include?(size)
Dir.entries(dir).each do |file|
next unless valid_image?(file)
@ -155,7 +140,7 @@ class TagExtractor
images << {
path: File.join(dir, file),
filename: file,
size: size.to_i
size: size
}
end
end
@ -238,7 +223,7 @@ class TagExtractor
end
end
def process_single_image(model:, image_info:, prompt_name:, prompt_content:, master_csv:, completed:, total:)
def process_single_image(model:, image_info:, prompt_name:, prompt_content:, master_csv:)
start = Time.now
# Read and encode image
@ -276,46 +261,31 @@ class TagExtractor
success: success
)
# Save to master CSV (thread-safe)
@mutex ||= Mutex.new
@mutex.synchronize do
master_csv << [
model,
image_info[:size],
prompt_name,
image_info[:filename],
tags,
raw_output.gsub("\n", " "),
Time.now.iso8601,
success
]
master_csv.flush
end
# Update progress
count = completed.increment
progress = (count.to_f / total * 100).round(1)
elapsed = Time.now - start
if @verbose || count % 10 == 0
print "\r Overall progress: #{progress}% (#{count}/#{total})"
end
# Save to master CSV (no longer need thread safety)
master_csv << [
model,
image_info[:size],
prompt_name,
image_info[:filename],
tags,
raw_output.gsub("\n", " "),
Time.now.iso8601,
success
]
master_csv.flush
rescue => e
puts "\n ❌ Error processing #{image_info[:filename]}: #{e.message}"
@mutex ||= Mutex.new
@mutex.synchronize do
master_csv << [
model,
image_info[:size],
prompt_name,
image_info[:filename],
"",
"Error: #{e.message}",
Time.now.iso8601,
false
]
end
master_csv << [
model,
image_info[:size],
prompt_name,
image_info[:filename],
"",
"Error: #{e.message}",
Time.now.iso8601,
false
]
end
def query_ollama(model:, image_base64:, prompt:)
@ -327,12 +297,13 @@ class TagExtractor
request['Content-Type'] = 'application/json'
request.body = {
model: model,
system: @system_prompt,
prompt: prompt,
images: [image_base64],
stream: false,
options: {
temperature: 0.1,
num_predict: 500
temperature: 0.2,
num_predict: 300
}
}.to_json
@ -352,18 +323,29 @@ class TagExtractor
end
def extract_tags(raw_output)
# Clean up the output to extract just the tags
# Clean up the output to extract just the keywords
cleaned = raw_output.strip
# Remove any explanatory text before or after tags
# Remove any explanatory text before or after keywords
lines = cleaned.split("\n")
tag_line = lines.find { |line| line.include?(',') } || cleaned
# Clean up common patterns
tag_line
# Clean up common patterns and remove hashtags
cleaned_line = tag_line
.gsub(/^(tags:|keywords:|output:)/i, '')
.gsub(/["\[\]{}]/, '')
.gsub(/["\[\]{}#]/, '') # Added # to remove hashtags
.strip
# Split, clean, deduplicate, sort, and rejoin keywords
keywords = cleaned_line.split(',')
.map(&:strip)
.map(&:downcase)
.reject(&:empty?)
.uniq
.sort
.join(', ')
keywords
end
def save_individual_result(model:, size:, prompt_name:, filename:, tags:, raw_output:, success:)
@ -420,7 +402,6 @@ end
# CLI interface
if __FILE__ == $0
options = {
parallel: 8,
models: TagExtractor::DEFAULT_MODELS,
timeout: 120,
verbose: false,
@ -432,32 +413,8 @@ if __FILE__ == $0
OptionParser.new do |opts|
opts.banner = "Usage: #{$0} [options]"
opts.on("-p", "--parallel NUM", Integer, "Number of parallel requests (default: 8)") do |n|
options[:parallel] = n
end
opts.on("-m", "--models MODELS", "Comma-separated list of models or model:parallel pairs") do |models|
model_list = models.split(',').map(&:strip)
# Check if any model has parallelism specified
if model_list.any? { |m| m.include?(':') && m.split(':').length > 2 }
# Parse model:parallel format
model_hash = {}
model_list.each do |entry|
parts = entry.split(':')
if parts.length > 2 # Has parallelism (e.g., llava:7b:4)
model_name = parts[0..-2].join(':')
parallel = parts.last.to_i
model_hash[model_name] = parallel > 0 ? parallel : 2
else # Just model name
model_hash[entry] = 2
end
end
options[:models] = model_hash
else
# Simple list of models
options[:models] = model_list
end
opts.on("-m", "--models MODELS", "Comma-separated list of models") do |models|
options[:models] = models.split(',').map(&:strip)
end
opts.on("-t", "--timeout SECONDS", Integer, "Request timeout in seconds (default: 120)") do |t|

View file

@ -1,201 +0,0 @@
#!/usr/bin/env ruby
require 'json'
require 'base64'
require 'net/http'
require 'uri'
require 'fileutils'
require 'csv'
require 'optparse'
require 'time'
# Simplified worker that processes a specific model/size/prompt combination:
# reads images from photo-<size>/, sends each to a local Ollama instance with
# the named prompt, and appends one CSV row per image under results/.
class TagExtractorWorker
  OLLAMA_URL = 'http://localhost:11434/api/generate'

  # model:   Ollama model name, e.g. "llava:7b"
  # size:    image size; images are read from photo-<size>/
  # prompt:  prompt name (without .txt) looked up under prompts/
  # timeout: per-request HTTP read timeout in seconds
  #
  # Raises a RuntimeError when the prompt file does not exist.
  def initialize(model:, size:, prompt:, timeout: 120)
    @model = model
    @size = size
    @prompt_name = prompt
    @prompt_file = "prompts/#{prompt}.txt"
    @timeout = timeout
    unless File.exist?(@prompt_file)
      raise "Prompt file not found: #{@prompt_file}"
    end
    @prompt_content = File.read(@prompt_file).strip
  end

  # Process every matching image, appending results to
  # results/<model>/<size>/<prompt>.csv. Rows already present in the CSV are
  # skipped, so an interrupted run can resume where it left off.
  def run
    output_dir = "results/#{@model.gsub(':', '-')}/#{@size}"
    FileUtils.mkdir_p(output_dir)
    csv_path = File.join(output_dir, "#{@prompt_name}.csv")

    # Check if already processed
    if File.exist?(csv_path)
      existing_count = CSV.read(csv_path).length - 1 # Minus header
      total_images = Dir["photo-#{@size}/*.{jpg,jpeg,png}"].length
      if existing_count >= total_images
        puts "✓ Already complete: #{@model}/#{@size}/#{@prompt_name} (#{existing_count} images)"
        return
      else
        puts "⚠️ Resuming: #{@model}/#{@size}/#{@prompt_name} (#{existing_count}/#{total_images} done)"
      end
    end

    puts "🚀 Processing: #{@model} / size=#{@size} / prompt=#{@prompt_name}"

    # Collect images
    images = Dir["photo-#{@size}/*"].select { |f| f.match?(/\.(jpg|jpeg|png)$/i) }.sort
    if images.empty?
      puts "❌ No images found in photo-#{@size}/"
      return
    end

    # Load existing results to avoid reprocessing. A plain Hash is used as the
    # membership set so the script does not depend on `require 'set'`, which
    # was previously missing (Set is only autoloaded on Ruby >= 3.2).
    processed = {}
    if File.exist?(csv_path)
      CSV.foreach(csv_path, headers: true) do |row|
        processed[row['image_filename']] = true
      end
    end

    # Open CSV for appending; write the header only for a brand-new file
    is_new = !File.exist?(csv_path)
    csv = CSV.open(csv_path, 'a')
    csv << %w[image_filename tags raw_output timestamp success] if is_new

    begin
      # Process images
      images.each_with_index do |image_path, idx|
        filename = File.basename(image_path)
        next if processed[filename]
        print "\r Progress: #{idx + 1}/#{images.length} - #{filename}"
        # Process image
        result = process_image(image_path)
        # Save result; flush after each row so progress survives a crash
        csv << [
          filename,
          result[:tags],
          result[:raw_output].gsub("\n", " "),
          Time.now.iso8601,
          result[:success]
        ]
        csv.flush
      end
    ensure
      csv.close # release the file handle even if processing raised
    end
    puts "\n✅ Complete: #{images.length} images processed"

    # Save metadata describing this run alongside the CSV
    metadata_path = File.join(output_dir, 'run.json')
    File.write(metadata_path, JSON.pretty_generate({
      model: @model,
      image_size: @size,
      prompt_name: @prompt_name,
      timestamp: Time.now.iso8601,
      images_processed: images.length
    }))
  end

  private

  # POST one base64-encoded image to Ollama and return a result hash of the
  # form { success:, tags:, raw_output: }. Never raises: HTTP errors,
  # timeouts, and unexpected exceptions all come back as success: false.
  def process_image(image_path)
    # Read and encode image
    image_data = File.read(image_path, mode: 'rb')
    image_base64 = Base64.strict_encode64(image_data)

    # Query Ollama
    uri = URI.parse(OLLAMA_URL)
    http = Net::HTTP.new(uri.host, uri.port)
    http.read_timeout = @timeout

    request = Net::HTTP::Post.new(uri.path)
    request['Content-Type'] = 'application/json'
    request.body = {
      model: @model,
      prompt: @prompt_content,
      images: [image_base64],
      stream: false,
      options: {
        temperature: 0.1,
        num_predict: 500
      }
    }.to_json

    response = http.request(request)
    if response.code == '200'
      data = JSON.parse(response.body)
      raw_output = data['response']
      { success: true, tags: extract_tags(raw_output), raw_output: raw_output }
    else
      { success: false, tags: '', raw_output: "HTTP #{response.code}: #{response.message}" }
    end
  rescue Net::ReadTimeout
    { success: false, tags: '', raw_output: "Timeout after #{@timeout}s" }
  rescue => e
    { success: false, tags: '', raw_output: "Error: #{e.message}" }
  end

  # Pull the first comma-separated line out of the model's raw output and
  # strip leading labels ("tags:", "keywords:", "output:"), quotes, and
  # brackets. Falls back to the whole output when no line contains a comma.
  def extract_tags(raw_output)
    cleaned = raw_output.strip
    lines = cleaned.split("\n")
    tag_line = lines.find { |line| line.include?(',') } || cleaned
    tag_line
      .gsub(/^(tags:|keywords:|output:)/i, '')
      .gsub(/["\[\]{}]/, '')
      .strip
  end
end
# CLI entry point: parse the required -m/-s/-p options (plus optional
# timeout) and hand them to TagExtractorWorker.
if __FILE__ == $0
  options = {}
  parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{$0} -m MODEL -s SIZE -p PROMPT [options]"
    opts.on("-m", "--model MODEL", "Model to use (required)") { |m| options[:model] = m }
    opts.on("-s", "--size SIZE", Integer, "Image size (required)") { |s| options[:size] = s }
    opts.on("-p", "--prompt PROMPT", "Prompt name without .txt (required)") { |p| options[:prompt] = p }
    opts.on("-t", "--timeout SECONDS", Integer, "Request timeout (default: 120)") { |t| options[:timeout] = t }
    opts.on("-h", "--help", "Show this help") do
      puts opts
      exit
    end
  end
  parser.parse!

  # All three of model/size/prompt must be present to construct a worker.
  required_missing = options[:model].nil? || options[:size].nil? || options[:prompt].nil?
  if required_missing
    puts "Error: Missing required arguments"
    puts "Run with -h for help"
    exit 1
  end

  TagExtractorWorker.new(**options).run
end

View file

@ -1 +1 @@
List comma-separated tags only. For this image include: dominant and accent colors; all visible objects; number of people; their actions, interactions, and emotions; setting type (indoors/outdoors, location); weather; time of day; and overall mood or ambiance.
List comma-separated keywords only. For this image include: dominant and accent colors; all visible objects; number of people; their actions, interactions, and emotions; setting type (indoors/outdoors, location); weather; time of day; and overall mood or ambiance.

View file

@ -1 +1 @@
Output only comma-separated tags that cover everything in the scene: colors, objects, people count, what they're doing and feeling, the environment (location type, weather, lighting, time), and the general mood or vibe.
Output only comma-separated keywords that cover everything in the scene: colors, objects, people count, what they're doing and feeling, the environment (location type, weather, lighting, time), and the general mood or vibe.

View file

@ -1 +1 @@
Provide a single comma-separated list of tags. Capture colors, objects, count of people, activities and interactions, emotional tone, setting (e.g. beach/urban/indoor), weather, time of day, and overall scene mood—nothing else.
Provide a single comma-separated list of keywords. Capture colors, objects, count of people, activities and interactions, emotional tone, setting (e.g. beach/urban/indoor), weather, time of day, and overall scene mood—nothing else.

View file

@ -1 +1 @@
Generate comma-separated tags only: include all colors, objects, number of people, actions, facial expressions or emotions, the environment (location type, weather, lighting, time), and the scene's mood or atmosphere.
Generate comma-separated keywords only: include all colors, objects, number of people, actions, facial expressions or emotions, the environment (location type, weather, lighting, time), and the scene's mood or atmosphere.

View file

@ -1 +1 @@
Create tags as comma-separated values only. Tag visible colors (dominant and secondary), every object, people with count, their activities and expressions, location details, environmental conditions, time indicators, and the emotional atmosphere of the scene.
Create keywords as comma-separated values only. Keyword visible colors (dominant and secondary), every object, people with count, their activities and expressions, location details, environmental conditions, time indicators, and the emotional atmosphere of the scene.

View file

@ -1 +1 @@
Comma-separated tags covering: colors, all objects, people count, activities, emotions, location type, weather/lighting, time of day, mood. Tags only.
Comma-separated keywords covering: colors, all objects, people count, activities, emotions, location type, weather/lighting, time of day, mood. Keywords only.

View file

@ -0,0 +1,21 @@
Generate comprehensive search keywords for this image as comma-separated values. Analyze and keyword:
PEOPLE: exact count, apparent ages (baby/child/teen/adult/elderly), genders if clear, relationships (couple/family/friends/strangers), body language, facial expressions (smiling/laughing/crying/serious/surprised), activities (eating/walking/sitting/playing/working)
MOOD & ATMOSPHERE: overall emotional tone (joyful/peaceful/tense/romantic/nostalgic/energetic), energy level (calm/lively/chaotic), formality (casual/formal/ceremonial)
SETTING: indoor/outdoor, specific location type (beach/mountain/city/restaurant/home/office/park), country/region if identifiable, venue type (public/private)
TIME & LIGHTING: time of day (early morning/morning/midday/afternoon/golden hour/evening/night), lighting quality (bright/soft/harsh/dim/dramatic), light source (natural/artificial/mixed)
COLORS: dominant colors (top 3), accent colors, color mood (warm/cool/neutral/vibrant/muted)
OBJECTS: food types if visible (cuisine type, dishes, drinks), vehicles, furniture, technology, decorations, sports equipment, musical instruments, art
NATURE: weather conditions, season indicators, landscapes, water bodies, vegetation, animals
BACKGROUND: what's behind the main subjects, architectural elements, crowds, signage
EVENTS: if applicable (wedding/birthday/concert/sports/holiday/vacation)
Output only comma-separated keywords, no explanations.

View file

@ -0,0 +1,13 @@
Create detailed keywords for video diary search. Users might search for: "happy moments", "food experiences", "family time", "adventures", "quiet moments", "celebrations", "daily life", "travel memories".
Keyword everything visible including:
- People: count, approximate ages, emotions on faces, what they're doing, how they're interacting
- Scene type: where this is happening, indoor/outdoor, public/private space
- Time: morning light, afternoon, golden hour, evening, night time
- Mood: the feeling of the moment (joyful, peaceful, exciting, intimate, festive, contemplative)
- Activities: eating, playing, working, relaxing, traveling, celebrating, exploring
- Details: specific foods visible, drinks, decorations, clothing styles, weather, season
- Colors: main colors that define the scene
- Special moments: laughter, hugs, cheers, surprises, achievements
Format: comma-separated keywords only, be specific rather than generic.

View file

@ -0,0 +1,19 @@
Analyze this image as a moment in someone's life story. Generate specific, searchable keywords covering:
WHO: number of people, their likely relationships, emotions showing on faces, age groups
WHAT: main activity, secondary activities, interactions between people, gestures
WHERE: type of location, indoor/outdoor, specific venue (restaurant/home/beach/etc), geographic hints
WHEN: time of day, season clues, weather conditions, lighting quality
WHY: occasion if apparent (meal/celebration/vacation/work/leisure), mood of gathering
Include sensory details:
- Visual: dominant colors, lighting (harsh/soft/golden), contrast
- Implied: likely sounds (quiet/loud/music), temperature (hot/cold), atmosphere
Keyword specific items:
- Food: cuisine type, specific dishes if identifiable, drinks
- Objects: technology, vehicles, sports equipment, decorations
- Nature: landscapes, water, sky conditions, plants, animals
- Clothing: formal/casual, weather-appropriate
Output format: detailed comma-separated keywords only.

View file

@ -0,0 +1,21 @@
Generate exhaustive search keywords. Think like someone searching their memories:
EMOTION KEYWORDS: happy, laughing, smiling, serious, contemplative, excited, surprised, loving, playful, tired, focused, relaxed, celebratory
PEOPLE KEYWORDS: exact count, "group of friends", "couple", "family", "solo", ages if clear, interactions
ACTIVITY KEYWORDS: eating, drinking, cooking, playing, sports, reading, working, dancing, hugging, talking, walking, sitting, lying down
PLACE KEYWORDS: home, restaurant, cafe, beach, mountain, city, countryside, indoor, outdoor, kitchen, living room, bedroom, office, street, park, venue name if visible
TIME KEYWORDS: sunrise, morning, midday, afternoon, sunset, golden hour, blue hour, evening, night, specific time if visible
OBJECT KEYWORDS: list everything visible - food items, drinks, furniture, electronics, vehicles, decorations, plants, books, instruments
ATMOSPHERE KEYWORDS: cozy, energetic, romantic, professional, casual, festive, quiet, busy, crowded, intimate
COLOR KEYWORDS: list prominent colors
WEATHER/SEASON KEYWORDS: sunny, cloudy, rainy, snowy, fog, clear, spring, summer, fall, winter indicators
Comma-separated only, no duplicates.

View file

@ -0,0 +1,13 @@
You're analyzing frames for an AI-powered video diary search. Users search with natural language like "dinner with friends", "kids playing", "sunset at the beach", "birthday celebrations", "quiet morning coffee".
Extract and keyword:
HUMANS: precise count, estimated ages in decades (20s/30s/etc), primary emotion per person, body language, attire style
ACTIONS: primary action, secondary actions, interactions, gestures
LOCATION: venue type, indoor/outdoor, architectural style, geographic region if evident
TEMPORAL: exact time if visible, otherwise: dawn/morning/noon/afternoon/dusk/night, season indicators
AMBIANCE: energy level 1-10, mood descriptors, lighting quality, color temperature
OBJECTS: enumerate all significant objects, food with cuisine type, beverages, decorations
CONTEXT: occasion type, relationship dynamics, cultural indicators
TECHNICAL: image quality descriptors, composition style
Output as comma-separated keywords. Prioritize specific over generic (e.g., "pepperoni pizza" not just "food").

View file

@ -1,149 +0,0 @@
#!/usr/bin/env ruby
require 'csv'
require 'optparse'
require 'fileutils'
# Build the full cartesian product of model x image-size x prompt jobs.
#
# Image sizes come from photo-<size> directories in the working directory and
# prompt names from prompts/*.txt. Directories that merely start with
# "photo-" but carry no numeric suffix (e.g. "photo-backup") are skipped
# instead of crashing — previously `match(...)[1]` raised NoMethodError on
# the nil match, the same bug fixed in extract_tags.rb's collect_images.
#
# Returns an array of { model:, size:, prompt: } hashes, ordered by model,
# then size, then prompt.
def get_all_jobs
  models = [
    'qwen2.5vl:3b',
    'moondream:1.8b',
    'llava:7b',
    'llava:13b',
    # 'llama3.2-vision:11b',
    'llava-phi3:3.8b'
  ]
  sizes = Dir.glob('photo-*')
             .select { |d| File.directory?(d) }
             .filter_map { |d| d.match(/photo-(\d+)/)&.[](1)&.to_i }
             .sort
  prompts = Dir.glob('prompts/*.txt')
               .map { |f| File.basename(f, '.txt') }
               .sort
  # Same ordering as the original triple-nested each loops.
  models.product(sizes, prompts).map do |model, size, prompt|
    { model: model, size: size, prompt: prompt }
  end
end
# A job counts as complete when its results CSV exists and holds at least
# one data row per source image in photo-<size>/.
def job_complete?(job)
  csv_path = format('results/%s/%s/%s.csv',
                    job[:model].gsub(':', '-'), job[:size], job[:prompt])
  return false unless File.exist?(csv_path)

  rows_done = CSV.read(csv_path).length - 1 # exclude the header row
  source_images = Dir["photo-#{job[:size]}/*.{jpg,jpeg,png}"].length
  rows_done >= source_images
end
# Main execution
# Parse CLI options: -j worker count per batch, -m restricts to a subset of
# models, --no-skip forces reprocessing of jobs whose CSVs look complete.
options = {
parallel: 2,
models: nil,
skip_complete: true
}
OptionParser.new do |opts|
opts.banner = "Usage: #{$0} [options]"
opts.on("-j", "--parallel NUM", Integer, "Number of parallel workers (default: 2)") do |n|
options[:parallel] = n
end
opts.on("-m", "--models MODELS", "Comma-separated list of models to process") do |m|
options[:models] = m.split(',').map(&:strip)
end
opts.on("--no-skip", "Don't skip completed jobs") do
options[:skip_complete] = false
end
opts.on("-h", "--help", "Show this help") do
puts opts
exit
end
end.parse!
# Get all jobs (cartesian product of models x sizes x prompts)
all_jobs = get_all_jobs
# Filter by models if specified on the command line
if options[:models]
all_jobs.select! { |job| options[:models].include?(job[:model]) }
end
# Filter completed jobs (those whose CSV already covers every image),
# unless --no-skip was given
if options[:skip_complete]
remaining_jobs = all_jobs.reject { |job| job_complete?(job) }
completed = all_jobs.length - remaining_jobs.length
if completed > 0
puts "✓ Skipping #{completed} completed jobs"
end
all_jobs = remaining_jobs
end
if all_jobs.empty?
puts "✅ All jobs complete!"
exit
end
puts "📊 Jobs to process: #{all_jobs.length}"
puts "🚀 Running with #{options[:parallel]} parallel workers"
puts
# Group jobs by model to minimize model switching (loading/unloading a
# model is the expensive part)
jobs_by_model = all_jobs.group_by { |job| job[:model] }
# Process each model's jobs
jobs_by_model.each do |model, model_jobs|
puts "\n" + "=" * 60
puts "Processing #{model} (#{model_jobs.length} jobs)"
puts "=" * 60
# Pull the model first if `ollama list` doesn't mention it.
# NOTE(review): matching on the name before ":" means any tag of the same
# base model satisfies this check — verify that is intended.
unless `ollama list`.include?(model.split(':').first)
puts "📦 Pulling #{model}..."
system("ollama pull #{model}")
end
# Process jobs in batches of `parallel` concurrent worker subprocesses;
# each thread just blocks on its subprocess, so the GVL is not a concern
model_jobs.each_slice(options[:parallel]) do |batch|
threads = batch.map do |job|
Thread.new do
cmd = [
"./extract_tags_worker.rb",
"-m '#{job[:model]}'",
"-s #{job[:size]}",
"-p '#{job[:prompt]}'"
].join(" ")
system(cmd)
end
end
# Wait for batch to complete before starting the next slice
threads.each(&:join)
end
# Unload model to free memory before moving on to the next model
puts "🧹 Unloading #{model}..."
system("ollama stop #{model}", out: File::NULL, err: File::NULL)
end
puts "\n✅ All jobs complete!"
# Offer to aggregate results
puts "\nRun ./aggregate_results.rb to create the master CSV"