mirror of
https://github.com/samsonjs/samhuri.net.git
synced 2026-06-26 04:59:35 +00:00
Fetch og:image from linked URLs when creating link posts
Add Pressa::OpenGraph, a best-effort scraper that pulls og:image (falling back to twitter:image) from a linked page's HTML. bake new_link and bake preview_link now use it to prefill the Image front-matter field for link posts, unless the payload already supplies one. Network failures and missing tags just resolve to nil so creating a post never blocks on a slow or broken link; the fetch happens once at draft-creation time, not on every build.
This commit is contained in:
parent
760c13b0b6
commit
97105c1501
5 changed files with 162 additions and 3 deletions
13
bake.rb
13
bake.rb
|
|
@ -11,6 +11,7 @@ $LOAD_PATH.unshift(LIB_PATH) unless $LOAD_PATH.include?(LIB_PATH)
|
|||
|
||||
require "pressa/drafts"
|
||||
require "pressa/link_post"
|
||||
require "pressa/open_graph"
|
||||
require "pressa/config/simple_toml"
|
||||
require "pressa/coverage"
|
||||
require "pressa/publish"
|
||||
|
|
@ -81,6 +82,7 @@ def new_link
|
|||
end
|
||||
|
||||
author = payload["author"] || Pressa::Config::SimpleToml.load_file("site.toml")["author"]
|
||||
image = payload["image"] || fetch_link_image(payload["link"])
|
||||
post =
|
||||
begin
|
||||
Pressa::LinkPost.build(
|
||||
|
|
@ -88,6 +90,7 @@ def new_link
|
|||
link: payload["link"],
|
||||
body: payload["body"],
|
||||
tags: payload["tags"],
|
||||
image:,
|
||||
author:
|
||||
)
|
||||
rescue Pressa::LinkPost::Error => e
|
||||
|
|
@ -114,6 +117,7 @@ def preview_link
|
|||
end
|
||||
|
||||
author = payload["author"] || Pressa::Config::SimpleToml.load_file("site.toml")["author"]
|
||||
image = payload["image"] || fetch_link_image(payload["link"])
|
||||
post =
|
||||
begin
|
||||
Pressa::LinkPost.build(
|
||||
|
|
@ -121,6 +125,7 @@ def preview_link
|
|||
link: payload["link"],
|
||||
body: payload["body"],
|
||||
tags: payload["tags"],
|
||||
image:,
|
||||
author:
|
||||
)
|
||||
rescue Pressa::LinkPost::Error => e
|
||||
|
|
@ -339,6 +344,14 @@ end
|
|||
|
||||
private
|
||||
|
||||
# Best-effort lookup of a preview image for a link post: a slow or broken link
# shouldn't block creating the post, it just means the Image front-matter
# field is left for the author to fill in.
#
# link - the linked URL from the payload; may be nil or blank.
#
# Returns the image reported by Pressa::OpenGraph.fetch, or nil when the link
# is blank or no result comes back.
def fetch_link_image(link)
  # Normalize once so the value that passes the blank check is the same value
  # that gets fetched; stray whitespace or a trailing newline would otherwise
  # make the URL unparseable and silently lose the image.
  url = link.to_s.strip
  return nil if url.empty?

  Pressa::OpenGraph.fetch(url)&.image
end
|
||||
|
||||
# Runs the given test files in one child Ruby process, with lib/ and test/ on
# the load path. The inline script requires every file passed in ARGV.
def run_test_suite(test_files)
  require_each_file = "ARGV.each { |file| require File.expand_path(file) }"
  ruby_args = ["-Ilib", "-Itest", "-e", require_each_file]

  run_command("ruby", *ruby_args, *test_files)
end
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ module Pressa
|
|||
|
||||
Result = Data.define(:filename, :target_path, :content)
|
||||
|
||||
def self.build(title:, link:, body: nil, tags: nil, author: Drafts.current_author, now: Time.now)
|
||||
def self.build(title:, link:, body: nil, tags: nil, image: nil, author: Drafts.current_author, now: Time.now)
|
||||
title = title.to_s.strip
|
||||
raise Error, "title cannot be empty" if title.empty?
|
||||
|
||||
|
|
@ -21,12 +21,12 @@ module Pressa
|
|||
|
||||
filename = "#{slug}.md"
|
||||
target_path = "posts/#{now.strftime("%Y/%m")}/#{filename}"
|
||||
content = render(title:, link:, body:, tags:, author:, now:)
|
||||
content = render(title:, link:, body:, tags:, image:, author:, now:)
|
||||
|
||||
Result.new(filename:, target_path:, content:)
|
||||
end
|
||||
|
||||
def self.render(title:, link:, body:, tags:, author:, now:)
|
||||
def self.render(title:, link:, body:, tags:, image:, author:, now:)
|
||||
lines = [
|
||||
"---",
|
||||
"Title: #{yaml_quote(title)}",
|
||||
|
|
@ -37,6 +37,8 @@ module Pressa
|
|||
tag_list = normalize_tags(tags)
|
||||
lines << "Tags: #{tag_list.join(", ")}" unless tag_list.empty?
|
||||
lines << "Link: #{link}"
|
||||
image = image.to_s.strip
|
||||
lines << "Image: #{image}" unless image.empty?
|
||||
lines << "---"
|
||||
|
||||
front_matter = lines.join("\n")
|
||||
|
|
|
|||
67
lib/pressa/open_graph.rb
Normal file
67
lib/pressa/open_graph.rb
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
require "net/http"
|
||||
require "uri"
|
||||
|
||||
module Pressa
  # Best-effort scraper for OpenGraph metadata on a linked page, used to fill
  # in an Image for link posts. Never raises: network failures, timeouts, and
  # missing tags all just resolve to a nil image so post creation never blocks
  # on a flaky or slow third-party site.
  class OpenGraph
    Result = Data.define(:image)

    USER_AGENT = "samhuri.net-link-preview/1.0".freeze
    MAX_REDIRECTS = 5

    # Character references that commonly appear inside attribute values.
    # Image URLs with query strings are routinely written with "&amp;".
    HTML_ENTITIES = {
      "&amp;" => "&",
      "&quot;" => "\"",
      "&apos;" => "'",
      "&#39;" => "'",
      "&#x27;" => "'",
      "&lt;" => "<",
      "&gt;" => ">"
    }.freeze
    HTML_ENTITY_PATTERN = Regexp.union(HTML_ENTITIES.keys)

    # Fetches +url+ and extracts its preview image.
    #
    # url      - the page to scrape.
    # http_get - callable taking a URL and returning the HTML body or nil;
    #            injectable so tests never touch the network.
    #
    # Returns a Result, or nil when the page could not be fetched, has no
    # image tag, or anything raised along the way.
    def self.fetch(url, http_get: method(:http_get))
      html = http_get.call(url)
      return nil if html.nil?

      extract(html, base_url: url)
    rescue
      nil
    end

    # Pulls og:image (falling back to twitter:image) out of +html+.
    #
    # base_url - the page URL, used to resolve relative image paths.
    #
    # Returns a Result, or nil when neither tag has usable content.
    def self.extract(html, base_url:)
      image = meta_content(html, "og:image") || meta_content(html, "twitter:image")
      return nil if image.nil?

      Result.new(image: resolve(image, base_url:))
    end

    # Returns the decoded, stripped content attribute of the first <meta> tag
    # whose property or name equals +property+, or nil when the tag is
    # missing or its content is blank.
    def self.meta_content(html, property)
      pattern = /<meta\s+[^>]*(?:property|name)\s*=\s*["']#{Regexp.escape(property)}["'][^>]*>/i
      tag = html[pattern]
      return nil unless tag

      # The closing quote must be the same character as the opening one, so an
      # apostrophe inside a double-quoted value does not truncate it. The
      # lookbehind keeps attributes such as data-content from matching.
      raw = tag[/(?<![\w-])content\s*=\s*(["'])(.*?)\1/im, 2]
      return nil unless raw

      value = raw.gsub(HTML_ENTITY_PATTERN, HTML_ENTITIES).strip
      value.empty? ? nil : value
    end

    # Resolves +image+ against +base_url+. Falls back to the image string
    # unchanged when either side cannot be handled by URI.join; URI::Error
    # also covers the "both URI are relative" case.
    def self.resolve(image, base_url:)
      URI.join(base_url, image).to_s
    rescue URI::Error
      image
    end

    # GETs +url+ over HTTP(S), following up to +redirects_left+ redirects.
    # Uses 5-second open and read timeouts.
    #
    # Returns the response body on success; nil for non-HTTP URLs, URLs
    # without a host, exhausted redirects, a redirect without a Location
    # header, or any other response class. Network errors propagate to the
    # caller (fetch swallows them).
    def self.http_get(url, redirects_left: MAX_REDIRECTS)
      return nil if redirects_left < 0

      uri = URI.parse(url)
      return nil unless uri.is_a?(URI::HTTP)
      return nil if uri.host.to_s.empty?

      # Finish this request and let the connection close before following a
      # redirect, so a redirect chain never holds several sockets open at once.
      response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https", open_timeout: 5, read_timeout: 5) do |http|
        http.get(uri.request_uri, "User-Agent" => USER_AGENT)
      end

      case response
      when Net::HTTPSuccess
        response.body
      when Net::HTTPRedirection
        location = response["location"]
        return nil unless location

        http_get(URI.join(url, location).to_s, redirects_left: redirects_left - 1)
      end
    end
  end
end
|
||||
|
|
@ -71,4 +71,20 @@ class Pressa::LinkPostTest < Minitest::Test
|
|||
error = assert_raises(Pressa::LinkPost::Error) { build(link: " ") }
|
||||
assert_match(/link/i, error.message)
|
||||
end
|
||||
|
||||
# A supplied image must round-trip through the rendered front matter.
def test_image_is_included_in_front_matter_when_given
  image_url = "https://example.net/preview.png"
  parsed = Pressa::Posts::PostMetadata.parse(build(image: image_url).content)

  assert_equal(image_url, parsed.image)
end
|
||||
|
||||
# A whitespace-only image must not produce an Image field at all.
def test_image_is_omitted_when_blank
  rendered = build(image: " ").content

  refute_includes(rendered, "Image:")
end
|
||||
|
||||
# With no image argument the front matter carries no Image field.
def test_image_is_omitted_when_not_given
  rendered = build.content

  refute_includes(rendered, "Image:")
end
|
||||
end
|
||||
|
|
|
|||
61
test/open_graph_test.rb
Normal file
61
test/open_graph_test.rb
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
require "test_helper"
|
||||
require "pressa/open_graph"
|
||||
|
||||
# Unit tests for Pressa::OpenGraph. Nothing here touches the network: the
# extract tests feed HTML strings in directly, and the fetch tests inject a
# stub through the http_get: keyword.
class Pressa::OpenGraphTest < Minitest::Test
  POST_URL = "https://example.net/posts/cool-thing"
  SITE_URL = "https://example.net"
  FETCH_URL = "https://example.net/post"

  def test_extract_returns_og_image_resolved_against_base_url
    page = <<~HTML
      <html><head>
      <meta property="og:image" content="/images/cover.png">
      </head></html>
    HTML

    assert_equal("https://example.net/images/cover.png", extracted_image(page, POST_URL))
  end

  def test_extract_preserves_absolute_image_urls
    tag = %(<meta property="og:image" content="https://cdn.example.net/cover.png">)

    assert_equal("https://cdn.example.net/cover.png", extracted_image(tag, POST_URL))
  end

  def test_extract_falls_back_to_twitter_image
    tag = %(<meta name="twitter:image" content="https://cdn.example.net/tw.png">)

    assert_equal("https://cdn.example.net/tw.png", extracted_image(tag, POST_URL))
  end

  def test_extract_returns_nil_when_no_image_meta_present
    page = "<html><head><title>No image here</title></head></html>"

    refute(Pressa::OpenGraph.extract(page, base_url: SITE_URL))
  end

  def test_extract_handles_single_quoted_attributes
    tag = %(<meta property='og:image' content='https://cdn.example.net/single.png'>)

    assert_equal("https://cdn.example.net/single.png", extracted_image(tag, SITE_URL))
  end

  def test_fetch_uses_injected_http_get_and_extracts_image
    tag = %(<meta property="og:image" content="https://cdn.example.net/x.png">)
    stubbed_get = ->(_url) { tag }

    fetched = Pressa::OpenGraph.fetch(FETCH_URL, http_get: stubbed_get)

    assert_equal("https://cdn.example.net/x.png", fetched.image)
  end

  def test_fetch_returns_nil_when_http_get_returns_nil
    empty_get = ->(_url) {}

    assert_nil(Pressa::OpenGraph.fetch(FETCH_URL, http_get: empty_get))
  end

  def test_fetch_returns_nil_instead_of_raising_on_network_errors
    timing_out_get = ->(_url) { raise Net::OpenTimeout, "timed out" }

    assert_nil(Pressa::OpenGraph.fetch(FETCH_URL, http_get: timing_out_get))
  end

  private

  # Runs extract and returns the image of the result.
  def extracted_image(html, base_url)
    Pressa::OpenGraph.extract(html, base_url:).image
  end
end
|
||||
Loading…
Reference in a new issue