[scrub] scrub files daily, but only files not scrubbed in 30 days

This commit is contained in:
Sami Samhuri 2012-10-05 23:20:21 -07:00
parent f317bccdd6
commit 97e62c56fb

63
scrub
View file

@ -23,17 +23,17 @@ class Scrubber
end end
def scrub(dir = @root_dir) def scrub(dir = @root_dir)
hash_file = hashes_filename(dir) scrub_file = scrub_filename(dir)
if File.exist?(File.join(dir, 'noscrub')) if File.exist?(File.join(dir, 'noscrub'))
if File.exists?(hash_file) if File.exists?(scrub_file)
File.unlink(hash_file) File.unlink(scrub_file)
end end
return self return self
end end
# restore hashes if already scrubbed # restore file records if already scrubbed
expected_hashes = hashes(dir) existing_file_records = file_records(dir)
new_hashes = {} new_file_records = {}
# walk the directory # walk the directory
Dir[File.join(dir, '*')].each do |file| Dir[File.join(dir, '*')].each do |file|
@ -48,36 +48,47 @@ class Scrubber
# scrub this file # scrub this file
else else
basename = File.basename(file) basename = File.basename(file)
expected_hash = expected_hashes[basename]
next if basename == 'scrub.json' next if basename == 'scrub.json'
if options.skip_existing && expected_hash file_record = existing_file_records[basename] || {}
new_hashes[basename] = expected_hash # convert old scrub.json records
if file_record.is_a?(String)
puts ">>> converting #{file} in #{scrub_filename(dir)} to new disk format" if options.verbose
# even out the scrubbing load throughout the month
random_timestamp = Time.now.to_i - (rand(30) * 86400)
file_record = { 'hash' => file_record, 'timestamp' => random_timestamp }
end
last_scrubbed = file_record['timestamp'] || 0
# skip files scrubbed in the last 30 days, unless --all was given
if !options.all && last_scrubbed >= Time.now.to_i - (30 * 86400)
puts ">>> skipping #{file} as it has been scrubbed recently (#{Time.at(last_scrubbed)})" if options.verbose
new_file_records[basename] = file_record
next next
end end
relative_filename = file.sub(@root_dir + '/', '') relative_filename = file.sub(@root_dir + '/', '')
result, hash = scrub_file(file, expected_hash) result, hash = scrub_file(file, file_record['hash'])
case result case result
when :ok when :ok
new_hashes[basename] = hash file_record['hash'] = hash
puts "[ok] #{hash} - #{relative_filename}" if options.verbose puts "[ok] #{hash} - #{relative_filename}" if options.verbose
when :new when :new
new_hashes[basename] = hash file_record['hash'] = hash
puts "[new] #{hash} - #{relative_filename}" if options.verbose puts "[new] #{hash} - #{relative_filename}" if options.verbose
when :fail when :fail
# no change in scrub.json, just report the new sha # no change in scrub.json, just report the new sha
new_hashes[basename] = expected_hash
@failures << { @failures << {
:filename => relative_filename, :filename => relative_filename,
:hash => hash, :hash => hash,
:expected_hash => expected_hash :expected_hash => file_record['hash']
} }
@status = :fail @status = :fail
puts "[FAIL] #{hash} - #{relative_filename} (previously had sha #{expected_hash})" puts "[FAIL] #{hash} - #{relative_filename} (previously had sha #{file_record['hash']})"
end end
file_record['timestamp'] = Time.now.to_i
new_file_records[basename] = file_record
end end
end end
write_hashes(dir, new_hashes) write_file_records(dir, new_file_records)
self self
end end
@ -109,12 +120,12 @@ class Scrubber
`shasum "#{filename.gsub(/(\$)/, '\\\\\\1')}"`.split.first `shasum "#{filename.gsub(/(\$)/, '\\\\\\1')}"`.split.first
end end
def hashes_filename(dir) def scrub_filename(dir)
File.join(dir, 'scrub.json') File.join(dir, 'scrub.json')
end end
def hashes(dir) def file_records(dir)
f = hashes_filename(dir) f = scrub_filename(dir)
if File.exist?(f) if File.exist?(f)
JSON.parse(File.read(f)) JSON.parse(File.read(f))
else else
@ -122,11 +133,11 @@ class Scrubber
end end
end end
def write_hashes(dir, hashes) def write_file_records(dir, records)
return if options.phantom return if options.phantom
f = hashes_filename(dir) f = scrub_filename(dir)
if hashes.size > 0 if records.size > 0
File.open(f, 'w') { |f| f.puts(JSON.fast_generate(hashes)) } File.open(f, 'w') { |f| f.puts(JSON.fast_generate(records)) }
elsif File.exists?(f) elsif File.exists?(f)
File.unlink(f) File.unlink(f)
end end
@ -136,12 +147,15 @@ end
def main def main
options = OpenStruct.new options = OpenStruct.new
options.all = false
options.phantom = false options.phantom = false
options.skip_existing = false
options.verbose = false options.verbose = false
OptionParser.new do |opts| OptionParser.new do |opts|
opts.banner = 'Usage: scrub [options] <root-directory>' opts.banner = 'Usage: scrub [options] <root-directory>'
opts.on('-a', '--all', 'Scrub all files no matter when they were last scrubbed.') do
options.all = true
end
opts.on('-h', '--help', 'Show this help') do opts.on('-h', '--help', 'Show this help') do
puts opts puts opts
exit exit
@ -149,9 +163,6 @@ def main
opts.on('-p', '--phantom', 'Do everything except write scrub.json files. Useful for testing.') do opts.on('-p', '--phantom', 'Do everything except write scrub.json files. Useful for testing.') do
options.phantom = true options.phantom = true
end end
opts.on('-s', '--skip-existing', 'Only calculate new checksums, skipping files with existing hashes') do
options.skip_existing = true
end
opts.on('-v', '--verbose', 'Log every file that is checked') do opts.on('-v', '--verbose', 'Log every file that is checked') do
options.verbose = true options.verbose = true
end end