blog/wayback/scrape-mephisto.js
2011-12-11 02:23:55 -08:00

60 lines
2.2 KiB
JavaScript
Executable file

#!/usr/bin/env node
var fs = require('fs')
, jsdom = require('jsdom')
, strftime = require('strftime').strftime
// Entry point: read the saved archive page named by the first command-line
// argument (or the default sample file when none is given) and hand its HTML
// to jsdom, asking it to load jQuery 1.6 from code.jquery.com. onLoad is
// called by jsdom with the resulting window and does the actual scraping.
fs.readFile(process.argv[2] || 'sjs 301 moved permanently.html', 'utf8', function(err, html) {
  // A missing or unreadable file leaves `html` undefined; stop here with the
  // real cause instead of passing undefined markup on to jsdom.
  if (err) throw err
  jsdom.env({ html: html
            , scripts: [ 'http://code.jquery.com/jquery-1.6.min.js' ]
            }, onLoad)
})
/**
 * Strip the Wayback Machine prefix (http://web.archive.org/web/<digits>/)
 * from an archived link, leaving the URL that follows it.
 *
 * @param {string|undefined} url - href value as read from the page
 * @returns {string} the URL without the archive prefix, or '' when `url` is
 *   not a string (for example when the matched element had no href)
 */
function stripWaybackPrefix(url) {
  if (typeof url !== 'string') return ''
  return url.replace(/^https?:\/\/web\.archive\.org\/web\/\d+\//, '')
}

/**
 * Turn a post title into the file-name part of the slug: lower-cased, with
 * every character other than whitespace, a-z, 0-9, '.', '_' and '-' removed,
 * and each run of whitespace replaced by a single '-'.
 *
 * @param {string} title - post title
 * @returns {string} sanitized, hyphenated title
 */
function slugifyTitle(title) {
  return title
    .toLowerCase()
    .replace(/[^\sa-z0-9._-]/g, '')
    .replace(/\s+/g, '-')
}

/**
 * jsdom callback: scrape one blog post out of the loaded page, log what was
 * found, and write the post (header lines, a '----' separator, then the HTML
 * body) to '../recovered/<slug>.html'.
 *
 * @param {*} err - error value reported by jsdom, if any
 * @param {Object} window - jsdom window; `window.jQuery` must be available
 * @throws {Error} when the window or jQuery is unavailable, or when the page
 *   has no parseable publication date
 */
function onLoad(err, window) {
  // Without jQuery nothing below can work; surface the underlying cause
  // rather than failing later with an unrelated TypeError.
  if (!window || !window.jQuery) {
    if (err instanceof Error) throw err
    throw new Error('jsdom could not load the page with jQuery' + (err ? ': ' + err : ''))
  }

  var $ = window.jQuery
    , $titleLink = $('.entry-title a')
    , title = $titleLink.text()
    , url = stripWaybackPrefix($titleLink.attr('href'))
    , isoDate = $('abbr.published').attr('title')
    , tags = $('ul.meta li:first-child a').map(function(i, el) { return $(el).text() }).get()

  console.log('title: ' + title)
  console.log('url: ' + url)
  console.log('iso date: ' + isoDate)
  console.log('tags: ' + tags)

  // The date feeds both the header lines and the file name, so refuse to
  // continue with a missing or unparseable value.
  var published = new Date(isoDate)
  if (isNaN(published.getTime())) {
    throw new Error('post has no usable publication date: ' + isoDate)
  }

  // Comments are gathered onto the post object but are not part of the file
  // written below. A comment whose author has no linked element gets url ''.
  var comments = []
  $('li.comment').each(function(i, el) {
    var $author = $('div.author > cite > span.author > *', el)
    comments.push({
      author: $author.text()
    , url: stripWaybackPrefix($author.attr('href'))
    , date: $('div.author > abbr', el).attr('title')
    , body: $('div.content', el).text().trim()
    })
  })

  var post = {
        title: title
      , url: url
      , ISODate: isoDate
      , body: $('.entry-content').html().trim()
      , tags: tags
      , comments: comments
      }
    , s = [ 'Title: ' + post.title
          , 'Date: ' + strftime('%B %e, %Y', published)
          , 'Timestamp: ' + strftime('%s', published)
          , 'Author: sjs'
          , 'Tags: ' + post.tags.join(', ')
          , '----'
          , ''
          , post.body
          , ''
          ].join('\n')
      // slugifyTitle drops '%', so the title cannot add strftime directives
      // to the format string it is appended to.
    , slug = strftime('%Y.%m.%d-' + slugifyTitle(post.title), published)

  console.log('slug: ' + slug)
  fs.writeFileSync('../recovered/' + slug + '.html', s, 'utf8')
  console.log(post.title + ' (' + slug + '.html)')
  console.log()
}