diff --git a/bake.rb b/bake.rb index 28eb6c3..a931811 100644 --- a/bake.rb +++ b/bake.rb @@ -11,6 +11,7 @@ $LOAD_PATH.unshift(LIB_PATH) unless $LOAD_PATH.include?(LIB_PATH) require "pressa/drafts" require "pressa/link_post" +require "pressa/open_graph" require "pressa/config/simple_toml" require "pressa/coverage" require "pressa/publish" @@ -81,6 +82,7 @@ def new_link end author = payload["author"] || Pressa::Config::SimpleToml.load_file("site.toml")["author"] + image = payload["image"] || fetch_link_image(payload["link"]) post = begin Pressa::LinkPost.build( @@ -88,6 +90,7 @@ def new_link link: payload["link"], body: payload["body"], tags: payload["tags"], + image:, author: ) rescue Pressa::LinkPost::Error => e @@ -114,6 +117,7 @@ def preview_link end author = payload["author"] || Pressa::Config::SimpleToml.load_file("site.toml")["author"] + image = payload["image"] || fetch_link_image(payload["link"]) post = begin Pressa::LinkPost.build( @@ -121,6 +125,7 @@ def preview_link link: payload["link"], body: payload["body"], tags: payload["tags"], + image:, author: ) rescue Pressa::LinkPost::Error => e @@ -339,6 +344,14 @@ end private +# Best-effort: a slow or broken link shouldn't block creating the post, it +# just means the Image front-matter field is left for the author to fill in. +def fetch_link_image(link) + return nil if link.to_s.strip.empty? 
+ + Pressa::OpenGraph.fetch(link)&.image +end + def run_test_suite(test_files) run_command("ruby", "-Ilib", "-Itest", "-e", "ARGV.each { |file| require File.expand_path(file) }", *test_files) end diff --git a/lib/pressa/link_post.rb b/lib/pressa/link_post.rb index 6ea76d2..5c2d34a 100644 --- a/lib/pressa/link_post.rb +++ b/lib/pressa/link_post.rb @@ -9,7 +9,7 @@ module Pressa Result = Data.define(:filename, :target_path, :content) - def self.build(title:, link:, body: nil, tags: nil, author: Drafts.current_author, now: Time.now) + def self.build(title:, link:, body: nil, tags: nil, image: nil, author: Drafts.current_author, now: Time.now) title = title.to_s.strip raise Error, "title cannot be empty" if title.empty? @@ -21,12 +21,12 @@ module Pressa filename = "#{slug}.md" target_path = "posts/#{now.strftime("%Y/%m")}/#{filename}" - content = render(title:, link:, body:, tags:, author:, now:) + content = render(title:, link:, body:, tags:, image:, author:, now:) Result.new(filename:, target_path:, content:) end - def self.render(title:, link:, body:, tags:, author:, now:) + def self.render(title:, link:, body:, tags:, image:, author:, now:) lines = [ "---", "Title: #{yaml_quote(title)}", @@ -37,6 +37,8 @@ module Pressa tag_list = normalize_tags(tags) lines << "Tags: #{tag_list.join(", ")}" unless tag_list.empty? lines << "Link: #{link}" + image = image.to_s.strip + lines << "Image: #{image}" unless image.empty? lines << "---" front_matter = lines.join("\n") diff --git a/lib/pressa/open_graph.rb b/lib/pressa/open_graph.rb new file mode 100644 index 0000000..64133ee --- /dev/null +++ b/lib/pressa/open_graph.rb @@ -0,0 +1,67 @@ +require "net/http" +require "uri" + +module Pressa + # Best-effort scraper for OpenGraph metadata on a linked page, used to fill + # in an Image for link posts. Never raises: network failures, timeouts, and + # missing tags all just resolve to a nil image so post creation never blocks + # on a flaky or slow third-party site. 
+ class OpenGraph + Result = Data.define(:image) + + USER_AGENT = "samhuri.net-link-preview/1.0".freeze + MAX_REDIRECTS = 5 + + def self.fetch(url, http_get: method(:http_get)) + html = http_get.call(url) + return nil if html.nil? + + extract(html, base_url: url) + rescue + nil + end + + def self.extract(html, base_url:) + image = meta_content(html, "og:image") || meta_content(html, "twitter:image") + return nil if image.nil? + + Result.new(image: resolve(image, base_url:)) + end + + def self.meta_content(html, property) + pattern = /<meta[^>]*(?:property|name)\s*=\s*["']#{Regexp.escape(property)}["'][^>]*>/i + tag = html[pattern] + return nil unless tag + + content = tag[/content\s*=\s*["']([^"']*)["']/i, 1] + content&.strip&.then { |value| value.empty? ? nil : value } + end + + def self.resolve(image, base_url:) + URI.join(base_url, image).to_s + rescue URI::InvalidURIError, URI::InvalidComponentError + image + end + + def self.http_get(url, redirects_left: MAX_REDIRECTS) + return nil if redirects_left < 0 + + uri = URI.parse(url) + return nil unless uri.is_a?(URI::HTTP) + + Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https", open_timeout: 5, read_timeout: 5) do |http| + response = http.get(uri.request_uri, "User-Agent" => USER_AGENT) + + case response + when Net::HTTPSuccess + response.body + when Net::HTTPRedirection + location = response["location"] + return nil unless location + + http_get(URI.join(url, location).to_s, redirects_left: redirects_left - 1) + end + end + end + end +end diff --git a/test/link_post_test.rb b/test/link_post_test.rb index 814cdbe..a0dac4a 100644 --- a/test/link_post_test.rb +++ b/test/link_post_test.rb @@ -71,4 +71,20 @@ class Pressa::LinkPostTest < Minitest::Test error = assert_raises(Pressa::LinkPost::Error) { build(link: " ") } assert_match(/link/i, error.message) end + + def test_image_is_included_in_front_matter_when_given + post = build(image: "https://example.net/preview.png") + meta = 
Pressa::Posts::PostMetadata.parse(post.content) + assert_equal("https://example.net/preview.png", meta.image) + end + + def test_image_is_omitted_when_blank + post = build(image: " ") + refute_includes(post.content, "Image:") + end + + def test_image_is_omitted_when_not_given + post = build + refute_includes(post.content, "Image:") + end end diff --git a/test/open_graph_test.rb b/test/open_graph_test.rb new file mode 100644 index 0000000..b15656b --- /dev/null +++ b/test/open_graph_test.rb @@ -0,0 +1,61 @@ +require "test_helper" +require "pressa/open_graph" + +class Pressa::OpenGraphTest < Minitest::Test + def test_extract_returns_og_image_resolved_against_base_url + html = <<~HTML + <head> + <meta property="og:image" content="/images/cover.png"> + </head> + HTML + + result = Pressa::OpenGraph.extract(html, base_url: "https://example.net/posts/cool-thing") + assert_equal("https://example.net/images/cover.png", result.image) + end + + def test_extract_preserves_absolute_image_urls + html = %(<meta property="og:image" content="https://cdn.example.net/cover.png">) + + result = Pressa::OpenGraph.extract(html, base_url: "https://example.net/posts/cool-thing") + assert_equal("https://cdn.example.net/cover.png", result.image) + end + + def test_extract_falls_back_to_twitter_image + html = %(<meta name="twitter:image" content="https://cdn.example.net/tw.png">) + + result = Pressa::OpenGraph.extract(html, base_url: "https://example.net/posts/cool-thing") + assert_equal("https://cdn.example.net/tw.png", result.image) + end + + def test_extract_returns_nil_when_no_image_meta_present + html = "No image here" + + refute(Pressa::OpenGraph.extract(html, base_url: "https://example.net")) + end + + def test_extract_handles_single_quoted_attributes + html = %(<meta property='og:image' content='https://cdn.example.net/single.png'>) + + result = Pressa::OpenGraph.extract(html, base_url: "https://example.net") + assert_equal("https://cdn.example.net/single.png", result.image) + end + + def test_fetch_uses_injected_http_get_and_extracts_image + html = %(<meta property="og:image" content="https://cdn.example.net/x.png">) + result = Pressa::OpenGraph.fetch("https://example.net/post", http_get: ->(_url) { html }) + + assert_equal("https://cdn.example.net/x.png", result.image) + end + + def 
test_fetch_returns_nil_when_http_get_returns_nil + result = Pressa::OpenGraph.fetch("https://example.net/post", http_get: ->(_url) {}) + assert_nil(result) + end + + def test_fetch_returns_nil_instead_of_raising_on_network_errors + failing_get = ->(_url) { raise Net::OpenTimeout, "timed out" } + result = Pressa::OpenGraph.fetch("https://example.net/post", http_get: failing_get) + + assert_nil(result) + end +end