From fcd3aedd310aedbd61e6e1f1afa1b37d1ffebc2d Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Sat, 3 Feb 2024 14:30:01 -0800 Subject: [PATCH] Add SCRIPT_LOADERS and API.add_script_loader to allow alternate formats (e.g., YAML-LD) to define loaders for extracting script content. --- lib/json/ld/api.rb | 86 ++++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 29 deletions(-) diff --git a/lib/json/ld/api.rb b/lib/json/ld/api.rb index 965d3dc..410207c 100644 --- a/lib/json/ld/api.rb +++ b/lib/json/ld/api.rb @@ -758,6 +758,28 @@ class << self alias fromRDF fromRdf end + ## + # Hash of recognized script types and the loaders that decode them + # into a hash or array of hashes. + # + # @return Hash{type, Proc} + SCRIPT_LOADERS = { + 'application/ld+json' => ->(content, url:, **options) do + validate_input(content, url: url) if options[:validate] + mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) } + MultiJson.load(content, **mj_opts) + end + } + + ## + # Adds a loader for some specific content type + # + # @param [String] type + # @param [Proc] loader + def self.add_script_loader(type, loader) + SCRIPT_LOADERS[type] = loader + end + ## # Load one or more script tags from an HTML source. # Unescapes and uncomments input, returns the internal representation @@ -812,47 +834,53 @@ def self.load_html(input, url:, element = input.at_xpath("//script[@id='#{id}']") raise JSON::LD::JsonLdError::LoadingDocumentFailed, "No script tag found with id=#{id}" unless element - unless element.attributes['type'].to_s.start_with?('application/ld+json') + script_type = SCRIPT_LOADERS.keys.detect {|type| element.attributes['type'].to_s.start_with?(type)} + unless script_type raise JSON::LD::JsonLdError::LoadingDocumentFailed, "Script tag has type=#{element.attributes['type']}" end - content = element.inner_html - validate_input(content, url: url) if options[:validate] - mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) } - MultiJson.load(content, **mj_opts) + loader = SCRIPT_LOADERS[script_type] + loader.call(element.inner_html, url: url, **options) elsif extractAllScripts res = [] - elements = if profile - es = input.xpath("//script[starts-with(@type, 'application/ld+json;profile=#{profile}')]") - # If no profile script, just take a single script without profile - es = [input.at_xpath("//script[starts-with(@type, 'application/ld+json')]")].compact if es.empty? - es - else - input.xpath("//script[starts-with(@type, 'application/ld+json')]") - end - elements.each do |element| - content = element.inner_html - validate_input(content, url: url) if options[:validate] - mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) } - r = MultiJson.load(content, **mj_opts) - if r.is_a?(Hash) - res << r - elsif r.is_a?(Array) - res.concat(r) + + SCRIPT_LOADERS.each do |type, loader| + next unless res.empty? # Only load a single type + elements = if profile + es = input.xpath("//script[starts-with(@type, '#{type};profile=#{profile}')]") + # If no profile script, just take a single script without profile + es = [input.at_xpath("//script[starts-with(@type, '#{type}')]")].compact if es.empty? + es + else + input.xpath("//script[starts-with(@type, '#{type}')]") + end + elements.each do |element| + content = element.inner_html + r = loader.call(content, url: url, extractAllScripts: true, **options) + if r.is_a?(Hash) + res << r + elsif r.is_a?(Array) + res.concat(r) + end end end res else - # Find the first script with type application/ld+json. - element = input.at_xpath("//script[starts-with(@type, 'application/ld+json;profile=#{profile}')]") if profile - element ||= input.at_xpath("//script[starts-with(@type, 'application/ld+json')]") - raise JSON::LD::JsonLdError::LoadingDocumentFailed, "No script tag found" unless element + # Find the first script with a known type + script_type, element = nil, nil + SCRIPT_LOADERS.keys.each do |type| + next if script_type # already found the type + element = input.at_xpath("//script[starts-with(@type, '#{type};profile=#{profile}')]") if profile + element ||= input.at_xpath("//script[starts-with(@type, '#{type}')]") + script_type = type if element + end + unless script_type + raise JSON::LD::JsonLdError::LoadingDocumentFailed, "No script tag found" unless element + end content = element.inner_html - validate_input(content, url: url) if options[:validate] - mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) } - MultiJson.load(content, **mj_opts) + SCRIPT_LOADERS[script_type].call(content, url: url, **options) end rescue MultiJson::ParseError => e raise JSON::LD::JsonLdError::InvalidScriptElement, e.message