Fetch og:image from linked URLs when creating link posts

Add Pressa::OpenGraph, a best-effort scraper that pulls og:image (falling
back to twitter:image) from a linked page's HTML. bake new_link and bake
preview_link now use it to prefill the Image front-matter field for link
posts, unless the payload already supplies one. Network failures and
missing tags just resolve to nil so creating a post never blocks on a
slow or broken link; the fetch happens once at draft-creation time, not
on every build.
This commit is contained in:
Sami Samhuri 2026-06-21 23:33:22 -07:00
parent 760c13b0b6
commit 97105c1501
5 changed files with 162 additions and 3 deletions

13
bake.rb
View file

@ -11,6 +11,7 @@ $LOAD_PATH.unshift(LIB_PATH) unless $LOAD_PATH.include?(LIB_PATH)
require "pressa/drafts"
require "pressa/link_post"
require "pressa/open_graph"
require "pressa/config/simple_toml"
require "pressa/coverage"
require "pressa/publish"
@ -81,6 +82,7 @@ def new_link
end
author = payload["author"] || Pressa::Config::SimpleToml.load_file("site.toml")["author"]
image = payload["image"] || fetch_link_image(payload["link"])
post =
begin
Pressa::LinkPost.build(
@ -88,6 +90,7 @@ def new_link
link: payload["link"],
body: payload["body"],
tags: payload["tags"],
image:,
author:
)
rescue Pressa::LinkPost::Error => e
@ -114,6 +117,7 @@ def preview_link
end
author = payload["author"] || Pressa::Config::SimpleToml.load_file("site.toml")["author"]
image = payload["image"] || fetch_link_image(payload["link"])
post =
begin
Pressa::LinkPost.build(
@ -121,6 +125,7 @@ def preview_link
link: payload["link"],
body: payload["body"],
tags: payload["tags"],
image:,
author:
)
rescue Pressa::LinkPost::Error => e
@ -339,6 +344,14 @@ end
private
# Best-effort: a slow or broken link shouldn't block creating the post, it
# just means the Image front-matter field is left for the author to fill in.
#
# link - the linked page's URL as supplied in the payload (may be nil or
#        padded with whitespace).
#
# Returns the image reported by Pressa::OpenGraph.fetch, or nil when the link
# is blank or no result comes back.
def fetch_link_image(link)
  url = link.to_s.strip
  return nil if url.empty?

  # Hand over the trimmed URL: surrounding whitespace would make the URL
  # unparseable and silently lose the image even though the link is usable.
  Pressa::OpenGraph.fetch(url)&.image
end
# Hands the given test files to run_command as arguments of a `ruby`
# invocation with lib/ and test/ on the load path. The inline -e script
# requires each file named in ARGV by its expanded path.
#
# test_files - list of test file paths.
#
# Returns whatever run_command returns.
def run_test_suite(test_files)
  require_each_file = "ARGV.each { |file| require File.expand_path(file) }"
  ruby_invocation = ["ruby", "-Ilib", "-Itest", "-e", require_each_file]

  run_command(*ruby_invocation, *test_files)
end

View file

@ -9,7 +9,7 @@ module Pressa
Result = Data.define(:filename, :target_path, :content)
def self.build(title:, link:, body: nil, tags: nil, author: Drafts.current_author, now: Time.now)
def self.build(title:, link:, body: nil, tags: nil, image: nil, author: Drafts.current_author, now: Time.now)
title = title.to_s.strip
raise Error, "title cannot be empty" if title.empty?
@ -21,12 +21,12 @@ module Pressa
filename = "#{slug}.md"
target_path = "posts/#{now.strftime("%Y/%m")}/#{filename}"
content = render(title:, link:, body:, tags:, author:, now:)
content = render(title:, link:, body:, tags:, image:, author:, now:)
Result.new(filename:, target_path:, content:)
end
def self.render(title:, link:, body:, tags:, author:, now:)
def self.render(title:, link:, body:, tags:, image:, author:, now:)
lines = [
"---",
"Title: #{yaml_quote(title)}",
@ -37,6 +37,8 @@ module Pressa
tag_list = normalize_tags(tags)
lines << "Tags: #{tag_list.join(", ")}" unless tag_list.empty?
lines << "Link: #{link}"
image = image.to_s.strip
lines << "Image: #{image}" unless image.empty?
lines << "---"
front_matter = lines.join("\n")

67
lib/pressa/open_graph.rb Normal file
View file

@ -0,0 +1,67 @@
require "cgi/escape"
require "net/http"
require "uri"
module Pressa
  # Best-effort scraper for OpenGraph metadata on a linked page, used to fill
  # in an Image for link posts. OpenGraph.fetch never raises: network
  # failures, timeouts, and missing tags all just resolve to nil so post
  # creation never blocks on a flaky or slow third-party site.
  class OpenGraph
    Result = Data.define(:image)

    USER_AGENT = "samhuri.net-link-preview/1.0".freeze
    MAX_REDIRECTS = 5

    # Matches a quoted `content` attribute inside a single <meta> tag. The
    # lookbehind keeps attributes such as `data-content` from matching, and
    # the back-reference makes the closing quote the same character as the
    # opening one so a value may contain the other kind of quote.
    CONTENT_ATTRIBUTE = /(?<![\w-])content\s*=\s*(?<quote>["'])(?<value>.*?)\k<quote>/im

    # Downloads +url+ and extracts its preview image.
    #
    # url      - address of the linked page.
    # http_get - callable taking a URL and returning the page HTML (or nil);
    #            injectable so tests can avoid the network.
    #
    # Returns a Result, or nil when the page could not be fetched, has no
    # image tag, or anything raised along the way.
    #
    # NOTE: relative image paths are resolved against +url+ as requested, not
    # against the final address after any redirects http_get followed.
    def self.fetch(url, http_get: method(:http_get))
      html = http_get.call(url)
      return nil if html.nil?

      extract(html, base_url: url)
    rescue
      nil
    end

    # Pulls og:image (falling back to twitter:image) out of +html+.
    #
    # html     - HTML source of the page.
    # base_url - URL used to absolutize a relative image path.
    #
    # Returns a Result, or nil when neither tag carries a usable value.
    def self.extract(html, base_url:)
      image = meta_content(html, "og:image") || meta_content(html, "twitter:image")
      return nil if image.nil?

      Result.new(image: resolve(image, base_url:))
    end

    # Finds the first <meta> tag whose property/name attribute equals
    # +property+ and returns its content attribute.
    #
    # The value is HTML-entity decoded (attribute values are escaped in the
    # markup, so a query string arrives as `a=1&amp;b=2`) and stripped.
    #
    # Returns the decoded String, or nil when the tag or its content is
    # missing or blank.
    def self.meta_content(html, property)
      tag_pattern = /<meta\s+[^>]*(?:property|name)\s*=\s*(["'])#{Regexp.escape(property)}\1[^>]*>/i
      tag = html[tag_pattern]
      return nil unless tag

      raw = tag[CONTENT_ATTRIBUTE, "value"]
      return nil unless raw

      value = CGI.unescapeHTML(raw).strip
      value.empty? ? nil : value
    end

    # Absolutizes +image+ against +base_url+.
    #
    # Returns the joined URL as a String, or +image+ unchanged when the URI
    # library rejects either part (invalid characters, a relative base, ...).
    def self.resolve(image, base_url:)
      URI.join(base_url, image).to_s
    rescue URI::Error
      image
    end

    # GETs +url+ over HTTP(S), following up to MAX_REDIRECTS redirects.
    #
    # url            - address to request; anything that is not http/https
    #                  yields nil.
    # redirects_left - remaining redirect budget.
    #
    # Returns the response body on a 2xx response, otherwise nil. May raise
    # on network or parse errors; OpenGraph.fetch is what swallows those.
    def self.http_get(url, redirects_left: MAX_REDIRECTS)
      return nil if redirects_left < 0

      uri = URI.parse(url)
      return nil unless uri.is_a?(URI::HTTP)

      # The block form closes the connection as soon as the block returns, so
      # no socket stays open while a redirect is being followed below.
      response =
        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https", open_timeout: 5, read_timeout: 5) do |http|
          http.get(uri.request_uri, "User-Agent" => USER_AGENT)
        end

      case response
      when Net::HTTPSuccess
        response.body
      when Net::HTTPRedirection
        location = response["location"]
        return nil unless location

        # Location may be relative, so resolve it against the current URL.
        http_get(URI.join(url, location).to_s, redirects_left: redirects_left - 1)
      end
    end
  end
end

View file

@ -71,4 +71,20 @@ class Pressa::LinkPostTest < Minitest::Test
error = assert_raises(Pressa::LinkPost::Error) { build(link: " ") }
assert_match(/link/i, error.message)
end
def test_image_is_included_in_front_matter_when_given
post = build(image: "https://example.net/preview.png")
meta = Pressa::Posts::PostMetadata.parse(post.content)
assert_equal("https://example.net/preview.png", meta.image)
end
def test_image_is_omitted_when_blank
post = build(image: " ")
refute_includes(post.content, "Image:")
end
def test_image_is_omitted_when_not_given
post = build
refute_includes(post.content, "Image:")
end
end

61
test/open_graph_test.rb Normal file
View file

@ -0,0 +1,61 @@
require "test_helper"
require "pressa/open_graph"
# Exercises Pressa::OpenGraph without touching the network: extract is fed
# literal HTML, and fetch gets a stand-in for its http_get dependency.
class Pressa::OpenGraphTest < Minitest::Test
  POST_URL = "https://example.net/posts/cool-thing"
  SITE_URL = "https://example.net"

  def test_extract_returns_og_image_resolved_against_base_url
    page = <<~HTML
      <html><head>
      <meta property="og:image" content="/images/cover.png">
      </head></html>
    HTML

    assert_equal("https://example.net/images/cover.png", extracted_image(page, POST_URL))
  end

  def test_extract_preserves_absolute_image_urls
    page = %(<meta property="og:image" content="https://cdn.example.net/cover.png">)

    assert_equal("https://cdn.example.net/cover.png", extracted_image(page, POST_URL))
  end

  def test_extract_falls_back_to_twitter_image
    page = %(<meta name="twitter:image" content="https://cdn.example.net/tw.png">)

    assert_equal("https://cdn.example.net/tw.png", extracted_image(page, POST_URL))
  end

  def test_extract_returns_nil_when_no_image_meta_present
    page = "<html><head><title>No image here</title></head></html>"

    refute(Pressa::OpenGraph.extract(page, base_url: SITE_URL))
  end

  def test_extract_handles_single_quoted_attributes
    page = %(<meta property='og:image' content='https://cdn.example.net/single.png'>)

    assert_equal("https://cdn.example.net/single.png", extracted_image(page, SITE_URL))
  end

  def test_fetch_uses_injected_http_get_and_extracts_image
    page = %(<meta property="og:image" content="https://cdn.example.net/x.png">)
    canned_get = ->(_url) { page }

    fetched = Pressa::OpenGraph.fetch("https://example.net/post", http_get: canned_get)

    assert_equal("https://cdn.example.net/x.png", fetched.image)
  end

  def test_fetch_returns_nil_when_http_get_returns_nil
    empty_get = ->(_url) {}

    assert_nil(Pressa::OpenGraph.fetch("https://example.net/post", http_get: empty_get))
  end

  def test_fetch_returns_nil_instead_of_raising_on_network_errors
    timing_out_get = ->(_url) { raise Net::OpenTimeout, "timed out" }

    assert_nil(Pressa::OpenGraph.fetch("https://example.net/post", http_get: timing_out_get))
  end

  private

  # Runs extract on +page+ and returns the image of the resulting Result.
  def extracted_image(page, base_url)
    Pressa::OpenGraph.extract(page, base_url:).image
  end
end