mirror of
https://github.com/1SecondEveryday/image-analysis-eval.git
synced 2026-03-25 09:05:49 +00:00
Simplify, focus on llava:7b and qwen2.5vl:3b and 768px and 1024px images
This commit is contained in:
parent
9c32f2d04c
commit
437a4a3284
15 changed files with 169 additions and 592 deletions
|
|
@ -1,117 +0,0 @@
|
|||
#!/usr/bin/env ruby
# frozen_string_literal: true
#
# Parallelism benchmark: runs ./extract_tags.rb at several -p settings and
# reports average wall-clock time, std dev, speedup vs. -p 1, and per-image
# time. Detailed numbers are also written to benchmark_results.json.

require 'fileutils'
require 'json'
require 'time' # BUGFIX: needed for Time#iso8601 used below; was missing

# Configuration
test_model = ARGV[0] || 'moondream:1.8b'
test_parallel_sizes = [1, 2, 4, 8]
test_prompt = '01-structured-comprehensive'
num_images = 8
num_runs = 3 # Multiple runs for averaging

puts "=" * 60
puts "PARALLELISM BENCHMARK"
puts "=" * 60
puts "Model: #{test_model}"
puts "Images: #{num_images}"
puts "Prompt: #{test_prompt}"
puts "Runs per setting: #{num_runs}"
puts

# Ensure we have test images (one-time setup).
unless Dir.exist?('photo-512')
  puts "Creating test images (one-time setup)..."
  system('./resize_images.rb')
end

# Pull model if needed.
puts "Ensuring model is available..."
unless `ollama list`.include?(test_model)
  system("ollama pull #{test_model}")
end

results = {}

test_parallel_sizes.each do |parallel|
  puts "\n" + "-" * 40
  puts "Testing #{parallel} parallel requests..."

  times = []

  num_runs.times do |run|
    print "  Run #{run + 1}/#{num_runs}... "

    # Clean up previous results so every run starts from a cold cache.
    FileUtils.rm_rf('results')

    start_time = Time.now

    # Run the extraction with specific parameters.
    cmd = [
      "./extract_tags.rb",
      "-p #{parallel}",
      "-m #{test_model}",
      "--max-images #{num_images}",
      "--single-prompt #{test_prompt}",
      "--no-unload" # Keep model loaded between runs
    ].join(" ")

    success = system(cmd, out: File::NULL, err: File::NULL)

    elapsed = Time.now - start_time
    times << elapsed

    # BUGFIX: the exit status used to be ignored, so a failed extraction was
    # silently averaged in as a valid timing. Flag it so the numbers can be
    # trusted (the timing is still recorded, but visibly marked).
    if success
      puts "#{elapsed.round(2)}s"
    else
      puts "#{elapsed.round(2)}s (⚠️ extract_tags.rb exited non-zero)"
    end
  end

  avg_time = times.sum / times.length
  std_dev = Math.sqrt(times.map { |t| (t - avg_time)**2 }.sum / times.length)

  results[parallel] = {
    times: times,
    average: avg_time,
    std_dev: std_dev,
    per_image: avg_time / num_images
  }
end

# Analysis
puts "\n" + "=" * 60
puts "RESULTS SUMMARY"
puts "=" * 60

baseline = results[1][:average]

puts "\n%-10s %-12s %-12s %-12s %-12s" % ["Parallel", "Avg Time", "Std Dev", "Speedup", "Per Image"]
puts "-" * 60

results.each do |parallel, data|
  speedup = baseline / data[:average]
  puts "%-10d %-12.2f %-12.2f %-12.2f %-12.3f" % [
    parallel,
    data[:average],
    data[:std_dev],
    speedup,
    data[:per_image]
  ]
end

# Find the optimal setting (lowest average wall-clock time).
optimal = results.min_by { |_, data| data[:average] }
puts "\n✅ Optimal parallelism: #{optimal[0]} (#{optimal[1][:average].round(2)}s average)"

# Save detailed results for later comparison.
File.write('benchmark_results.json', JSON.pretty_generate({
  model: test_model,
  num_images: num_images,
  prompt: test_prompt,
  timestamp: Time.now.iso8601,
  results: results
}))

puts "\n📊 Detailed results saved to: benchmark_results.json"

# Cleanup
puts "\n🧹 Cleaning up..."
system("ollama stop #{test_model}", out: File::NULL, err: File::NULL)
|
||||
195
extract_tags.rb
195
extract_tags.rb
|
|
@ -8,28 +8,20 @@ require 'fileutils'
|
|||
require 'csv'
|
||||
require 'optparse'
|
||||
require 'time'
|
||||
require 'concurrent'
|
||||
|
||||
class TagExtractor
|
||||
OLLAMA_URL = 'http://localhost:11434/api/generate'
|
||||
DEFAULT_MODELS = {
|
||||
'qwen2.5vl:3b' => 2,
|
||||
'moondream:1.8b' => 8, # doesn't help a lot but doesn't hurt either
|
||||
'llava:7b' => 2,
|
||||
# 'llava:13b' => 2,
|
||||
# 'llama3.2-vision:11b' => 1, # super slow, 3+ minutes for 8 photos
|
||||
'llava-phi3:3.8b' => 4
|
||||
}
|
||||
DEFAULT_MODELS = ['llava:7b', 'qwen2.5vl:3b']
|
||||
VALID_EXTENSIONS = %w[.jpg .jpeg .png .gif .bmp .tiff .tif].freeze
|
||||
|
||||
def initialize(options = {})
|
||||
@global_parallel = options[:parallel] # Global override if specified
|
||||
@models = options[:models] || DEFAULT_MODELS
|
||||
@timeout = options[:timeout] || 120
|
||||
@verbose = options[:verbose] || false
|
||||
@max_images = options[:max_images] || nil
|
||||
@no_unload = options[:no_unload] || false
|
||||
@single_prompt = options[:single_prompt] || nil
|
||||
@system_prompt = options[:system_prompt] || "You are an image-keyword assistant. After analyzing each picture, output one line containing concise, lowercase English keywords separated by commas. Include scene type, activities, number of people (e.g. '3-people'), emotions, dominant colours, time-of-day, objects in foreground, objects in background. Do not repeat synonyms. Do not output anything except the comma-separated keyword list."
|
||||
end
|
||||
|
||||
def run
|
||||
|
|
@ -54,29 +46,17 @@ class TagExtractor
|
|||
puts
|
||||
|
||||
total_tasks = images.length * prompts.length * @models.length
|
||||
completed = Concurrent::AtomicFixnum.new(0)
|
||||
completed = 0
|
||||
start_time = Time.now
|
||||
|
||||
# Create master CSV
|
||||
master_csv = CSV.open('results/master.csv', 'w')
|
||||
master_csv << %w[model image_size prompt_name image_filename tags raw_output timestamp success]
|
||||
|
||||
# Process in batches by model to allow proper cleanup
|
||||
model_list = @models.is_a?(Hash) ? @models.keys : @models
|
||||
|
||||
model_list.each_with_index do |model, model_index|
|
||||
# Determine parallelism for this model
|
||||
parallel = if @global_parallel
|
||||
@global_parallel # Use global override if specified
|
||||
elsif @models.is_a?(Hash)
|
||||
@models[model] || 2 # Use model-specific or default to 2
|
||||
else
|
||||
2 # Default parallelism
|
||||
end
|
||||
|
||||
# Process each model sequentially
|
||||
@models.each_with_index do |model, model_index|
|
||||
puts "\n" + "=" * 60
|
||||
puts "📊 Model #{model_index + 1}/#{model_list.length}: #{model}"
|
||||
puts " Parallelism: #{parallel}"
|
||||
puts "📊 Model #{model_index + 1}/#{@models.length}: #{model}"
|
||||
puts "=" * 60
|
||||
|
||||
# Check if model exists and pull if needed
|
||||
|
|
@ -95,34 +75,30 @@ class TagExtractor
|
|||
# Ensure model is loaded
|
||||
ensure_model_loaded(model)
|
||||
|
||||
# Create thread pool for this model
|
||||
pool = Concurrent::FixedThreadPool.new(parallel)
|
||||
|
||||
# Process each prompt/image combination
|
||||
prompts.each do |prompt_file, prompt_content|
|
||||
prompt_name = File.basename(prompt_file, '.*')
|
||||
|
||||
images.each do |image_info|
|
||||
pool.post do
|
||||
process_single_image(
|
||||
model: model,
|
||||
image_info: image_info,
|
||||
prompt_name: prompt_name,
|
||||
prompt_content: prompt_content,
|
||||
master_csv: master_csv,
|
||||
completed: completed,
|
||||
total: total_tasks
|
||||
)
|
||||
end
|
||||
completed += 1
|
||||
progress = (completed.to_f / total_tasks * 100).round(1)
|
||||
# Clear the line with spaces to prevent leftover characters
|
||||
print "\r%-80s" % " "
|
||||
print "\r Progress: #{progress}% (#{completed}/#{total_tasks}) - Processing #{image_info[:filename]}"
|
||||
|
||||
process_single_image(
|
||||
model: model,
|
||||
image_info: image_info,
|
||||
prompt_name: prompt_name,
|
||||
prompt_content: prompt_content,
|
||||
master_csv: master_csv
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
# Wait for all tasks for this model to complete
|
||||
pool.shutdown
|
||||
pool.wait_for_termination
|
||||
|
||||
# Unload model to free memory (unless disabled)
|
||||
unless @no_unload
|
||||
puts " 🧹 Unloading model #{model}..."
|
||||
puts "\n 🧹 Unloading model #{model}..."
|
||||
unload_model(model)
|
||||
end
|
||||
end
|
||||
|
|
@ -145,9 +121,18 @@ class TagExtractor
|
|||
|
||||
def collect_images
|
||||
images = []
|
||||
|
||||
# Only process 768 and 1024 sizes
|
||||
allowed_sizes = [768, 1024]
|
||||
|
||||
Dir.glob('photo-*').select { |d| File.directory?(d) }.each do |dir|
|
||||
size = dir.match(/photo-(\d+)/)[1]
|
||||
size_match = dir.match(/photo-(\d+)/)
|
||||
next unless size_match
|
||||
|
||||
size = size_match[1].to_i
|
||||
|
||||
# Skip sizes we don't want
|
||||
next unless allowed_sizes.include?(size)
|
||||
|
||||
Dir.entries(dir).each do |file|
|
||||
next unless valid_image?(file)
|
||||
|
|
@ -155,7 +140,7 @@ class TagExtractor
|
|||
images << {
|
||||
path: File.join(dir, file),
|
||||
filename: file,
|
||||
size: size.to_i
|
||||
size: size
|
||||
}
|
||||
end
|
||||
end
|
||||
|
|
@ -238,7 +223,7 @@ class TagExtractor
|
|||
end
|
||||
end
|
||||
|
||||
def process_single_image(model:, image_info:, prompt_name:, prompt_content:, master_csv:, completed:, total:)
|
||||
def process_single_image(model:, image_info:, prompt_name:, prompt_content:, master_csv:)
|
||||
start = Time.now
|
||||
|
||||
# Read and encode image
|
||||
|
|
@ -276,46 +261,31 @@ class TagExtractor
|
|||
success: success
|
||||
)
|
||||
|
||||
# Save to master CSV (thread-safe)
|
||||
@mutex ||= Mutex.new
|
||||
@mutex.synchronize do
|
||||
master_csv << [
|
||||
model,
|
||||
image_info[:size],
|
||||
prompt_name,
|
||||
image_info[:filename],
|
||||
tags,
|
||||
raw_output.gsub("\n", " "),
|
||||
Time.now.iso8601,
|
||||
success
|
||||
]
|
||||
master_csv.flush
|
||||
end
|
||||
|
||||
# Update progress
|
||||
count = completed.increment
|
||||
progress = (count.to_f / total * 100).round(1)
|
||||
elapsed = Time.now - start
|
||||
|
||||
if @verbose || count % 10 == 0
|
||||
print "\r Overall progress: #{progress}% (#{count}/#{total})"
|
||||
end
|
||||
# Save to master CSV (no longer need thread safety)
|
||||
master_csv << [
|
||||
model,
|
||||
image_info[:size],
|
||||
prompt_name,
|
||||
image_info[:filename],
|
||||
tags,
|
||||
raw_output.gsub("\n", " "),
|
||||
Time.now.iso8601,
|
||||
success
|
||||
]
|
||||
master_csv.flush
|
||||
|
||||
rescue => e
|
||||
puts "\n ❌ Error processing #{image_info[:filename]}: #{e.message}"
|
||||
@mutex ||= Mutex.new
|
||||
@mutex.synchronize do
|
||||
master_csv << [
|
||||
model,
|
||||
image_info[:size],
|
||||
prompt_name,
|
||||
image_info[:filename],
|
||||
"",
|
||||
"Error: #{e.message}",
|
||||
Time.now.iso8601,
|
||||
false
|
||||
]
|
||||
end
|
||||
master_csv << [
|
||||
model,
|
||||
image_info[:size],
|
||||
prompt_name,
|
||||
image_info[:filename],
|
||||
"",
|
||||
"Error: #{e.message}",
|
||||
Time.now.iso8601,
|
||||
false
|
||||
]
|
||||
end
|
||||
|
||||
def query_ollama(model:, image_base64:, prompt:)
|
||||
|
|
@ -327,12 +297,13 @@ class TagExtractor
|
|||
request['Content-Type'] = 'application/json'
|
||||
request.body = {
|
||||
model: model,
|
||||
system: @system_prompt,
|
||||
prompt: prompt,
|
||||
images: [image_base64],
|
||||
stream: false,
|
||||
options: {
|
||||
temperature: 0.1,
|
||||
num_predict: 500
|
||||
temperature: 0.2,
|
||||
num_predict: 300
|
||||
}
|
||||
}.to_json
|
||||
|
||||
|
|
@ -352,18 +323,29 @@ class TagExtractor
|
|||
end
|
||||
|
||||
def extract_tags(raw_output)
|
||||
# Clean up the output to extract just the tags
|
||||
# Clean up the output to extract just the keywords
|
||||
cleaned = raw_output.strip
|
||||
|
||||
# Remove any explanatory text before or after tags
|
||||
# Remove any explanatory text before or after keywords
|
||||
lines = cleaned.split("\n")
|
||||
tag_line = lines.find { |line| line.include?(',') } || cleaned
|
||||
|
||||
# Clean up common patterns
|
||||
tag_line
|
||||
# Clean up common patterns and remove hashtags
|
||||
cleaned_line = tag_line
|
||||
.gsub(/^(tags:|keywords:|output:)/i, '')
|
||||
.gsub(/["\[\]{}]/, '')
|
||||
.gsub(/["\[\]{}#]/, '') # Added # to remove hashtags
|
||||
.strip
|
||||
|
||||
# Split, clean, deduplicate, sort, and rejoin keywords
|
||||
keywords = cleaned_line.split(',')
|
||||
.map(&:strip)
|
||||
.map(&:downcase)
|
||||
.reject(&:empty?)
|
||||
.uniq
|
||||
.sort
|
||||
.join(', ')
|
||||
|
||||
keywords
|
||||
end
|
||||
|
||||
def save_individual_result(model:, size:, prompt_name:, filename:, tags:, raw_output:, success:)
|
||||
|
|
@ -420,7 +402,6 @@ end
|
|||
# CLI interface
|
||||
if __FILE__ == $0
|
||||
options = {
|
||||
parallel: 8,
|
||||
models: TagExtractor::DEFAULT_MODELS,
|
||||
timeout: 120,
|
||||
verbose: false,
|
||||
|
|
@ -432,32 +413,8 @@ if __FILE__ == $0
|
|||
OptionParser.new do |opts|
|
||||
opts.banner = "Usage: #{$0} [options]"
|
||||
|
||||
opts.on("-p", "--parallel NUM", Integer, "Number of parallel requests (default: 8)") do |n|
|
||||
options[:parallel] = n
|
||||
end
|
||||
|
||||
opts.on("-m", "--models MODELS", "Comma-separated list of models or model:parallel pairs") do |models|
|
||||
model_list = models.split(',').map(&:strip)
|
||||
|
||||
# Check if any model has parallelism specified
|
||||
if model_list.any? { |m| m.include?(':') && m.split(':').length > 2 }
|
||||
# Parse model:parallel format
|
||||
model_hash = {}
|
||||
model_list.each do |entry|
|
||||
parts = entry.split(':')
|
||||
if parts.length > 2 # Has parallelism (e.g., llava:7b:4)
|
||||
model_name = parts[0..-2].join(':')
|
||||
parallel = parts.last.to_i
|
||||
model_hash[model_name] = parallel > 0 ? parallel : 2
|
||||
else # Just model name
|
||||
model_hash[entry] = 2
|
||||
end
|
||||
end
|
||||
options[:models] = model_hash
|
||||
else
|
||||
# Simple list of models
|
||||
options[:models] = model_list
|
||||
end
|
||||
opts.on("-m", "--models MODELS", "Comma-separated list of models") do |models|
|
||||
options[:models] = models.split(',').map(&:strip)
|
||||
end
|
||||
|
||||
opts.on("-t", "--timeout SECONDS", Integer, "Request timeout in seconds (default: 120)") do |t|
|
||||
|
|
|
|||
|
|
@ -1,201 +0,0 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
require 'json'
|
||||
require 'base64'
|
||||
require 'net/http'
|
||||
require 'uri'
|
||||
require 'fileutils'
|
||||
require 'csv'
|
||||
require 'optparse'
|
||||
require 'time'
|
||||
|
||||
# Simplified worker that processes a specific model/size/prompt combination
|
||||
require 'set' # BUGFIX: Set is only autoloaded on Ruby >= 3.2; require it explicitly

# Simplified worker that processes a specific model/size/prompt combination.
class TagExtractorWorker
  OLLAMA_URL = 'http://localhost:11434/api/generate'

  # model:   Ollama model tag, e.g. "llava:7b" (required)
  # size:    image edge size; images are read from photo-<size>/ (required)
  # prompt:  prompt name without .txt, looked up under prompts/ (required)
  # timeout: HTTP read timeout in seconds
  #
  # Raises RuntimeError if the prompt file does not exist.
  def initialize(model:, size:, prompt:, timeout: 120)
    @model = model
    @size = size
    @prompt_name = prompt
    @prompt_file = "prompts/#{prompt}.txt"
    @timeout = timeout

    unless File.exist?(@prompt_file)
      raise "Prompt file not found: #{@prompt_file}"
    end

    @prompt_content = File.read(@prompt_file).strip
  end

  # Processes every image in photo-<size>/, appending one row per image to
  # results/<model>/<size>/<prompt>.csv and writing run.json metadata.
  # Resumable: images already present in the CSV are skipped, and a fully
  # complete run returns early.
  def run
    output_dir = "results/#{@model.gsub(':', '-')}/#{@size}"
    FileUtils.mkdir_p(output_dir)

    csv_path = File.join(output_dir, "#{@prompt_name}.csv")

    # Check if already processed
    if File.exist?(csv_path)
      existing_count = CSV.read(csv_path).length - 1 # Minus header
      total_images = Dir["photo-#{@size}/*.{jpg,jpeg,png}"].length

      if existing_count >= total_images
        puts "✓ Already complete: #{@model}/#{@size}/#{@prompt_name} (#{existing_count} images)"
        return
      else
        puts "⚠️  Resuming: #{@model}/#{@size}/#{@prompt_name} (#{existing_count}/#{total_images} done)"
      end
    end

    puts "🚀 Processing: #{@model} / size=#{@size} / prompt=#{@prompt_name}"

    # Collect images
    images = Dir["photo-#{@size}/*"].select { |f| f.match?(/\.(jpg|jpeg|png)$/i) }.sort

    if images.empty?
      puts "❌ No images found in photo-#{@size}/"
      return
    end

    # Load existing results to avoid reprocessing
    processed = Set.new
    if File.exist?(csv_path)
      CSV.foreach(csv_path, headers: true) do |row|
        processed << row['image_filename']
      end
    end

    # Open CSV for appending; write the header only on first creation.
    is_new = !File.exist?(csv_path)
    csv = CSV.open(csv_path, 'a')
    csv << %w[image_filename tags raw_output timestamp success] if is_new

    begin
      # Process images
      images.each_with_index do |image_path, idx|
        filename = File.basename(image_path)

        next if processed.include?(filename)

        # NOTE(review): the interpolation here was garbled in the source
        # ("#(unknown)"); showing the current filename is the obvious intent.
        print "\r  Progress: #{idx + 1}/#{images.length} - #{filename}"

        result = process_image(image_path)

        # Save result; flush so a crash loses at most the in-flight image.
        csv << [
          filename,
          result[:tags],
          result[:raw_output].gsub("\n", " "),
          Time.now.iso8601,
          result[:success]
        ]
        csv.flush
      end
    ensure
      # BUGFIX: close the CSV even when processing raises, so the handle is
      # not leaked and buffered rows are written out.
      csv.close
    end

    puts "\n✅ Complete: #{images.length} images processed"

    # Save metadata about this run alongside the CSV.
    metadata_path = File.join(output_dir, 'run.json')
    File.write(metadata_path, JSON.pretty_generate({
      model: @model,
      image_size: @size,
      prompt_name: @prompt_name,
      timestamp: Time.now.iso8601,
      images_processed: images.length
    }))
  end

  private

  # Sends one image to Ollama and returns {success:, tags:, raw_output:}.
  # Never raises: timeouts and other errors are reported in the result hash.
  def process_image(image_path)
    # Read and encode image
    image_data = File.read(image_path, mode: 'rb')
    image_base64 = Base64.strict_encode64(image_data)

    # Query Ollama
    uri = URI.parse(OLLAMA_URL)
    http = Net::HTTP.new(uri.host, uri.port)
    http.read_timeout = @timeout

    request = Net::HTTP::Post.new(uri.path)
    request['Content-Type'] = 'application/json'
    request.body = {
      model: @model,
      prompt: @prompt_content,
      images: [image_base64],
      stream: false,
      options: {
        temperature: 0.1,
        num_predict: 500
      }
    }.to_json

    response = http.request(request)

    if response.code == '200'
      data = JSON.parse(response.body)
      raw_output = data['response']
      tags = extract_tags(raw_output)
      { success: true, tags: tags, raw_output: raw_output }
    else
      { success: false, tags: '', raw_output: "HTTP #{response.code}: #{response.message}" }
    end

  rescue Net::ReadTimeout
    { success: false, tags: '', raw_output: "Timeout after #{@timeout}s" }
  rescue => e
    { success: false, tags: '', raw_output: "Error: #{e.message}" }
  end

  # Pulls the first comma-separated line out of the model output and strips
  # common wrapper text ("tags:"/"keywords:"/"output:" prefixes, quotes,
  # brackets). Falls back to the whole output if no line contains a comma.
  def extract_tags(raw_output)
    cleaned = raw_output.strip
    lines = cleaned.split("\n")
    tag_line = lines.find { |line| line.include?(',') } || cleaned

    tag_line
      .gsub(/^(tags:|keywords:|output:)/i, '')
      .gsub(/["\[\]{}]/, '')
      .strip
  end
end
|
||||
|
||||
# Command-line entry point: builds a TagExtractorWorker from the required
# -m/-s/-p flags (plus optional -t timeout) and runs it.
if __FILE__ == $0
  cli_options = {}

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: #{$0} -m MODEL -s SIZE -p PROMPT [options]"

    opts.on("-m", "--model MODEL", "Model to use (required)") do |value|
      cli_options[:model] = value
    end

    opts.on("-s", "--size SIZE", Integer, "Image size (required)") do |value|
      cli_options[:size] = value
    end

    opts.on("-p", "--prompt PROMPT", "Prompt name without .txt (required)") do |value|
      cli_options[:prompt] = value
    end

    opts.on("-t", "--timeout SECONDS", Integer, "Request timeout (default: 120)") do |value|
      cli_options[:timeout] = value
    end

    opts.on("-h", "--help", "Show this help") do
      puts opts
      exit
    end
  end
  parser.parse!

  # Model, size, and prompt are all mandatory.
  if %i[model size prompt].any? { |key| cli_options[key].nil? }
    puts "Error: Missing required arguments"
    puts "Run with -h for help"
    exit 1
  end

  TagExtractorWorker.new(**cli_options).run
end
|
||||
|
|
@ -1 +1 @@
|
|||
List comma-separated tags only. For this image include: dominant and accent colors; all visible objects; number of people; their actions, interactions, and emotions; setting type (indoors/outdoors, location); weather; time of day; and overall mood or ambiance.
|
||||
List comma-separated keywords only. For this image include: dominant and accent colors; all visible objects; number of people; their actions, interactions, and emotions; setting type (indoors/outdoors, location); weather; time of day; and overall mood or ambiance.
|
||||
|
|
@ -1 +1 @@
|
|||
Output only comma-separated tags that cover everything in the scene: colors, objects, people count, what they're doing and feeling, the environment (location type, weather, lighting, time), and the general mood or vibe.
|
||||
Output only comma-separated keywords that cover everything in the scene: colors, objects, people count, what they're doing and feeling, the environment (location type, weather, lighting, time), and the general mood or vibe.
|
||||
|
|
@ -1 +1 @@
|
|||
Provide a single comma-separated list of tags. Capture colors, objects, count of people, activities and interactions, emotional tone, setting (e.g. beach/urban/indoor), weather, time of day, and overall scene mood—nothing else.
|
||||
Provide a single comma-separated list of keywords. Capture colors, objects, count of people, activities and interactions, emotional tone, setting (e.g. beach/urban/indoor), weather, time of day, and overall scene mood—nothing else.
|
||||
|
|
@ -1 +1 @@
|
|||
Generate comma-separated tags only: include all colors, objects, number of people, actions, facial expressions or emotions, the environment (location type, weather, lighting, time), and the scene's mood or atmosphere.
|
||||
Generate comma-separated keywords only: include all colors, objects, number of people, actions, facial expressions or emotions, the environment (location type, weather, lighting, time), and the scene's mood or atmosphere.
|
||||
|
|
@ -1 +1 @@
|
|||
Create tags as comma-separated values only. Tag visible colors (dominant and secondary), every object, people with count, their activities and expressions, location details, environmental conditions, time indicators, and the emotional atmosphere of the scene.
|
||||
Create keywords as comma-separated values only. Provide keywords for visible colors (dominant and secondary), every object, people with count, their activities and expressions, location details, environmental conditions, time indicators, and the emotional atmosphere of the scene.
|
||||
|
|
@ -1 +1 @@
|
|||
Comma-separated tags covering: colors, all objects, people count, activities, emotions, location type, weather/lighting, time of day, mood. Tags only.
|
||||
Comma-separated keywords covering: colors, all objects, people count, activities, emotions, location type, weather/lighting, time of day, mood. Keywords only.
|
||||
21
prompts/07-ultra-detailed-scene.txt
Normal file
21
prompts/07-ultra-detailed-scene.txt
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
Generate comprehensive search keywords for this image as comma-separated values. Analyze and keyword:
|
||||
|
||||
PEOPLE: exact count, apparent ages (baby/child/teen/adult/elderly), genders if clear, relationships (couple/family/friends/strangers), body language, facial expressions (smiling/laughing/crying/serious/surprised), activities (eating/walking/sitting/playing/working)
|
||||
|
||||
MOOD & ATMOSPHERE: overall emotional tone (joyful/peaceful/tense/romantic/nostalgic/energetic), energy level (calm/lively/chaotic), formality (casual/formal/ceremonial)
|
||||
|
||||
SETTING: indoor/outdoor, specific location type (beach/mountain/city/restaurant/home/office/park), country/region if identifiable, venue type (public/private)
|
||||
|
||||
TIME & LIGHTING: time of day (early morning/morning/midday/afternoon/golden hour/evening/night), lighting quality (bright/soft/harsh/dim/dramatic), light source (natural/artificial/mixed)
|
||||
|
||||
COLORS: dominant colors (top 3), accent colors, color mood (warm/cool/neutral/vibrant/muted)
|
||||
|
||||
OBJECTS: food types if visible (cuisine type, dishes, drinks), vehicles, furniture, technology, decorations, sports equipment, musical instruments, art
|
||||
|
||||
NATURE: weather conditions, season indicators, landscapes, water bodies, vegetation, animals
|
||||
|
||||
BACKGROUND: what's behind the main subjects, architectural elements, crowds, signage
|
||||
|
||||
EVENTS: if applicable (wedding/birthday/concert/sports/holiday/vacation)
|
||||
|
||||
Output only comma-separated keywords, no explanations.
|
||||
13
prompts/08-memory-search-optimizer.txt
Normal file
13
prompts/08-memory-search-optimizer.txt
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
Create detailed keywords for video diary search. Users might search for: "happy moments", "food experiences", "family time", "adventures", "quiet moments", "celebrations", "daily life", "travel memories".
|
||||
|
||||
Keyword everything visible including:
|
||||
- People: count, approximate ages, emotions on faces, what they're doing, how they're interacting
|
||||
- Scene type: where this is happening, indoor/outdoor, public/private space
|
||||
- Time: morning light, afternoon, golden hour, evening, night time
|
||||
- Mood: the feeling of the moment (joyful, peaceful, exciting, intimate, festive, contemplative)
|
||||
- Activities: eating, playing, working, relaxing, traveling, celebrating, exploring
|
||||
- Details: specific foods visible, drinks, decorations, clothing styles, weather, season
|
||||
- Colors: main colors that define the scene
|
||||
- Special moments: laughter, hugs, cheers, surprises, achievements
|
||||
|
||||
Format: comma-separated keywords only, be specific rather than generic.
|
||||
19
prompts/09-contextual-story-tagger.txt
Normal file
19
prompts/09-contextual-story-tagger.txt
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
Analyze this image as a moment in someone's life story. Generate specific, searchable keywords covering:
|
||||
|
||||
WHO: number of people, their likely relationships, emotions showing on faces, age groups
|
||||
WHAT: main activity, secondary activities, interactions between people, gestures
|
||||
WHERE: type of location, indoor/outdoor, specific venue (restaurant/home/beach/etc), geographic hints
|
||||
WHEN: time of day, season clues, weather conditions, lighting quality
|
||||
WHY: occasion if apparent (meal/celebration/vacation/work/leisure), mood of gathering
|
||||
|
||||
Include sensory details:
|
||||
- Visual: dominant colors, lighting (harsh/soft/golden), contrast
|
||||
- Implied: likely sounds (quiet/loud/music), temperature (hot/cold), atmosphere
|
||||
|
||||
Keyword specific items:
|
||||
- Food: cuisine type, specific dishes if identifiable, drinks
|
||||
- Objects: technology, vehicles, sports equipment, decorations
|
||||
- Nature: landscapes, water, sky conditions, plants, animals
|
||||
- Clothing: formal/casual, weather-appropriate
|
||||
|
||||
Output format: detailed comma-separated keywords only.
|
||||
21
prompts/10-moment-finder-pro.txt
Normal file
21
prompts/10-moment-finder-pro.txt
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
Generate exhaustive search keywords. Think like someone searching their memories:
|
||||
|
||||
EMOTION KEYWORDS: happy, laughing, smiling, serious, contemplative, excited, surprised, loving, playful, tired, focused, relaxed, celebratory
|
||||
|
||||
PEOPLE KEYWORDS: exact count, "group of friends", "couple", "family", "solo", ages if clear, interactions
|
||||
|
||||
ACTIVITY KEYWORDS: eating, drinking, cooking, playing, sports, reading, working, dancing, hugging, talking, walking, sitting, lying down
|
||||
|
||||
PLACE KEYWORDS: home, restaurant, cafe, beach, mountain, city, countryside, indoor, outdoor, kitchen, living room, bedroom, office, street, park, venue name if visible
|
||||
|
||||
TIME KEYWORDS: sunrise, morning, midday, afternoon, sunset, golden hour, blue hour, evening, night, specific time if visible
|
||||
|
||||
OBJECT KEYWORDS: list everything visible - food items, drinks, furniture, electronics, vehicles, decorations, plants, books, instruments
|
||||
|
||||
ATMOSPHERE KEYWORDS: cozy, energetic, romantic, professional, casual, festive, quiet, busy, crowded, intimate
|
||||
|
||||
COLOR KEYWORDS: list prominent colors
|
||||
|
||||
WEATHER/SEASON KEYWORDS: sunny, cloudy, rainy, snowy, fog, clear, spring, summer, fall, winter indicators
|
||||
|
||||
Comma-separated only, no duplicates.
|
||||
13
prompts/11-smart-scene-decoder.txt
Normal file
13
prompts/11-smart-scene-decoder.txt
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
You're analyzing frames for an AI-powered video diary search. Users search with natural language like "dinner with friends", "kids playing", "sunset at the beach", "birthday celebrations", "quiet morning coffee".
|
||||
|
||||
Extract and keyword:
|
||||
HUMANS: precise count, estimated ages in decades (20s/30s/etc), primary emotion per person, body language, attire style
|
||||
ACTIONS: primary action, secondary actions, interactions, gestures
|
||||
LOCATION: venue type, indoor/outdoor, architectural style, geographic region if evident
|
||||
TEMPORAL: exact time if visible, otherwise: dawn/morning/noon/afternoon/dusk/night, season indicators
|
||||
AMBIANCE: energy level 1-10, mood descriptors, lighting quality, color temperature
|
||||
OBJECTS: enumerate all significant objects, food with cuisine type, beverages, decorations
|
||||
CONTEXT: occasion type, relationship dynamics, cultural indicators
|
||||
TECHNICAL: image quality descriptors, composition style
|
||||
|
||||
Output as comma-separated keywords. Prioritize specific over generic (e.g., "pepperoni pizza" not just "food").
|
||||
|
|
@ -1,149 +0,0 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
require 'csv'
|
||||
require 'optparse'
|
||||
require 'fileutils'
|
||||
|
||||
# Get all combinations of model, size, and prompt
|
||||
# Enumerates every model × image-size × prompt combination as a job hash
# ({model:, size:, prompt:}). Sizes are discovered from photo-<N>/
# directories and prompts from prompts/*.txt.
def get_all_jobs
  models = [
    'qwen2.5vl:3b',
    'moondream:1.8b',
    'llava:7b',
    'llava:13b',
    # 'llama3.2-vision:11b',
    'llava-phi3:3.8b'
  ]

  # BUGFIX: a directory such as "photo-backup" used to crash with
  # NoMethodError on match()[1]; non-numeric suffixes are now skipped.
  sizes = Dir.glob('photo-*')
             .select { |d| File.directory?(d) }
             .filter_map { |d| d[/photo-(\d+)/, 1]&.to_i }
             .sort

  prompts = Dir.glob('prompts/*.txt')
               .map { |f| File.basename(f, '.txt') }
               .sort

  # Cartesian product, ordered model-major so jobs for one model are adjacent.
  models.flat_map do |model|
    sizes.flat_map do |size|
      prompts.map { |prompt| { model: model, size: size, prompt: prompt } }
    end
  end
end
|
||||
|
||||
# Check if a job is already complete
|
||||
# True when the job's results CSV already contains one row per image of the
# corresponding photo-<size> directory (the header row is not counted).
def job_complete?(job)
  model_dir = job[:model].gsub(':', '-')
  csv_path = File.join('results', model_dir, job[:size].to_s, "#{job[:prompt]}.csv")
  return false unless File.exist?(csv_path)

  rows_written = CSV.read(csv_path).length - 1 # header is not a result row
  images_expected = Dir["photo-#{job[:size]}/*.{jpg,jpeg,png}"].length
  rows_written >= images_expected
end
|
||||
|
||||
# Main execution: parse CLI flags, work out which jobs still need running,
# then run them grouped by model so each model is loaded and unloaded once.
options = {
  parallel: 2,
  models: nil,
  skip_complete: true
}

OptionParser.new do |opts|
  opts.banner = "Usage: #{$0} [options]"

  opts.on("-j", "--parallel NUM", Integer, "Number of parallel workers (default: 2)") do |n|
    options[:parallel] = n
  end

  opts.on("-m", "--models MODELS", "Comma-separated list of models to process") do |m|
    options[:models] = m.split(',').map(&:strip)
  end

  opts.on("--no-skip", "Don't skip completed jobs") do
    options[:skip_complete] = false
  end

  opts.on("-h", "--help", "Show this help") do
    puts opts
    exit
  end
end.parse!

all_jobs = get_all_jobs

# Restrict to the requested models, if any were given on the command line.
all_jobs.select! { |job| options[:models].include?(job[:model]) } if options[:models]

# Drop jobs whose results CSV is already complete (unless --no-skip).
if options[:skip_complete]
  pending = all_jobs.reject { |job| job_complete?(job) }
  skipped = all_jobs.length - pending.length

  puts "✓ Skipping #{skipped} completed jobs" if skipped > 0

  all_jobs = pending
end

if all_jobs.empty?
  puts "✅ All jobs complete!"
  exit
end

puts "📊 Jobs to process: #{all_jobs.length}"
puts "🚀 Running with #{options[:parallel]} parallel workers"
puts

# Group jobs by model to minimize model switching.
all_jobs.group_by { |job| job[:model] }.each do |model, model_jobs|
  puts "\n" + "=" * 60
  puts "Processing #{model} (#{model_jobs.length} jobs)"
  puts "=" * 60

  # Pull the model first if it is not available locally.
  unless `ollama list`.include?(model.split(':').first)
    puts "📦 Pulling #{model}..."
    system("ollama pull #{model}")
  end

  # Launch the worker script for each job, <parallel> at a time; each batch
  # must finish before the next one starts.
  model_jobs.each_slice(options[:parallel]) do |batch|
    workers = batch.map do |job|
      Thread.new do
        worker_cmd = [
          "./extract_tags_worker.rb",
          "-m '#{job[:model]}'",
          "-s #{job[:size]}",
          "-p '#{job[:prompt]}'"
        ].join(" ")

        system(worker_cmd)
      end
    end

    workers.each(&:join)
  end

  # Unload model to free memory.
  puts "🧹 Unloading #{model}..."
  system("ollama stop #{model}", out: File::NULL, err: File::NULL)
end

puts "\n✅ All jobs complete!"

# Offer to aggregate results
puts "\nRun ./aggregate_results.rb to create the master CSV"
|
||||
Loading…
Reference in a new issue