samhuri.net/wayback/scrape-mephisto-page.js

57 lines
2.1 KiB
JavaScript
Executable file

#!/usr/bin/env node
var fs = require('fs')
, jsdom = require('jsdom')
, strftime = require('strftime').strftime
// Entry point: read the saved page named by the first CLI argument (or the
// default file name below), then let jsdom build a DOM for it with jQuery
// injected. onLoad receives the result.
fs.readFile(process.argv[2] || 'sjs 301 moved permanently.html', 'utf8', function(err, html) {
  if (err) {
    // Without this guard an unreadable file would hand `undefined` to jsdom
    // and fail later with a far less helpful message.
    console.error('could not read input page: ' + err.message)
    process.exit(1)
    return
  }
  jsdom.env({ html: html
            , scripts: [ 'http://code.jquery.com/jquery-1.6.min.js' ]
            }, onLoad)
})
/**
 * jsdom callback: summarises every blog entry (`div.hentry`) on the loaded
 * page and reports whether a recovered file for it already exists.
 *
 * For each entry the title, original URL (web.archive.org snapshot prefix
 * stripped), ISO date and tags are logged, followed by the slug and either
 * a "skipped, exists" line or the title of the post that would be written
 * to `../recovered/<slug>.html`. Writing is currently disabled (dry run).
 *
 * @param {*} err - error(s) reported by jsdom, if any
 * @param {Object} window - jsdom window; `window.jQuery` must be a function
 * @throws {Error} when no window with jQuery was provided
 */
function onLoad(err, window) {
  var reason = err ? ': ' + (err.message || err) : ''
  if (!window || typeof window.jQuery !== 'function') {
    throw new Error('could not load page with jQuery' + reason)
  }
  // NOTE(review): presumably jsdom can report errors and still hand over a
  // usable window — confirm. Errors used to be ignored entirely, so keep
  // going, but no longer silently.
  if (err) console.error('warning: jsdom reported errors' + reason)

  var $ = window.jQuery
  // Archived links start with the archive host plus a numeric snapshot segment.
  var waybackPrefix = /^https?:\/\/web\.archive\.org\/web\/\d+\//

  $('div.hentry').each(function() {
    var link = $('.entry-title a', this)
    // attr() / html() yield undefined / null when nothing matched; fall back
    // to '' so one malformed entry does not abort the whole run.
    var post = {
      title: link.text(),
      url: (link.attr('href') || '').replace(waybackPrefix, ''),
      ISODate: $('abbr.published', this).attr('title'),
      body: ($('.entry-content', this).html() || '').trim(),
      tags: $('ul.meta li:first-child a', this).map(function() { return $(this).text() }).get()
    }
    var published = new Date(post.ISODate)

    console.log('title: ' + post.title)
    console.log('url: ' + post.url)
    console.log('iso date: ' + post.ISODate)
    console.log('tags: ' + post.tags)

    // Header lines, a '----' separator, then the HTML body.
    var contents = [
      'Title: ' + post.title,
      'Date: ' + strftime('%B %e, %Y', published),
      'Timestamp: ' + strftime('%s', published),
      'Author: sjs',
      'Tags: ' + post.tags.join(', '),
      '----',
      '',
      post.body,
      ''
    ].join('\n')

    // The title is reduced to [a-z0-9_-] with whitespace runs turned into '-'.
    var slug = strftime('%Y-%m-%d', published) + '_' + post.title
      .toLowerCase()
      .replace(/[^\sa-z0-9_-]/g, '')
      .replace(/\s+/g, '-')
    // Each name gets its own `var` so none can leak as an implicit global.
    var filename = '../recovered/' + slug + '.html'
    console.log('slug: ' + slug)

    if (fs.existsSync(filename)) {
      console.log('skipped, exists -> ' + post.title + ' (' + slug + '.html)')
    } else {
      // Dry run: enable the next line to actually write the recovered post.
      // fs.writeFileSync(filename, contents, 'utf8')
      console.log(post.title + ' (' + slug + '.html)')
    }
    console.log()
  })
}