From 0e157d92fda0b2a84e3786c2561317f331a6bf58 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Fri, 30 Jun 2017 16:44:05 -0700 Subject: [PATCH 01/12] Update based on simplified spec proposed for Third Edition. This version continues to create document-relative property URIs. --- lib/rdf/microdata/reader.rb | 99 ++++++++++++---------------- lib/rdf/microdata/reader/nokogiri.rb | 6 ++ spec/suite_helper.rb | 4 ++ 3 files changed, 52 insertions(+), 57 deletions(-) diff --git a/lib/rdf/microdata/reader.rb b/lib/rdf/microdata/reader.rb index d33b253..e820387 100644 --- a/lib/rdf/microdata/reader.rb +++ b/lib/rdf/microdata/reader.rb @@ -20,10 +20,12 @@ class Reader < RDF::Reader # @private class CrawlFailure < StandardError; end - # @!attribute [r] implementation # @return [Module] Returns the HTML implementation module for this reader instance. attr_reader :implementation + # @return [Hash{Object => RDF::Resource}] maps RDF elements (items) to resources + attr_reader :memory + ## # Returns the base URI determined by this reader. # @@ -92,16 +94,16 @@ def self.find(type) # @param [#to_s] name # @param [Hash{}] ec Evaluation Context # @return [RDF::URI] - def predicateURI(name, ec) + def predicateURI(name, base_uri) u = RDF::URI(name) # 1) If _name_ is an _absolute URL_, return _name_ as a _URI reference_ return u if u.absolute? n = frag_escape(name) - if ec[:current_type].nil? - # 2) If current type from context is null, there can be no current vocabulary. + if uri.nil? + # 2) If current vocabulary from context is null, there can be no current vocabulary. # Return the URI reference that is the document base with its fragment set to the fragment-escaped value of name - u = RDF::URI(ec[:document_base].to_s) + u = RDF::URI(base_uri.to_s) u.fragment = frag_escape(name) u else @@ -178,12 +180,12 @@ def initialize(input = $stdin, options = {}, &block) log_error("Empty document") if root.nil? log_error(doc_errors.map(&:message).uniq.join("\n")) if !doc_errors.empty? 
- log_debug(@doc, "library = #{@library}") + log_debug('', "library = #{@library}") # Load registry begin registry_uri = options[:registry] || DEFAULT_REGISTRY - log_debug(@doc, "registry = #{registry_uri.inspect}") + log_debug('', "registry = #{registry_uri.inspect}") Registry.load_registry(registry_uri) rescue JSON::ParserError => e log_fatal("Failed to parse registry: #{e.message}", exception: RDF::ReaderError) if (root.nil? && validate?) @@ -270,6 +272,7 @@ def add_triple(node, subject, predicate, object) # Parsing a Microdata document (this is *not* the recursive method) def parse_whole_document(doc, base) base = doc_base(base) + @memory = {} options[:base_uri] = if (base) # Strip any fragment from base base = base.to_s.split('#').first @@ -280,15 +283,9 @@ def parse_whole_document(doc, base) log_info(nil) {"parse_whole_doc: base='#{base}'"} - ec = { - memory: {}, - current_type: nil, - current_vocabulary: nil, - document_base: base, - } # 1) For each element that is also a top-level item, Generate the triples for that item using the evaluation context. getItems.each do |el| - log_depth {generate_triples(el, ec)} + log_depth {generate_triples(el, Registry.new(nil))} end log_info(doc, "parse_whole_doc: traversal complete") @@ -298,12 +295,11 @@ def parse_whole_document(doc, base) # Generate triples for an item # # @param [RDF::Resource] item - # @param [Hash{Symbol => Object}] ec + # @param [Registry] vocab # @option ec [Hash{Nokogiri::XML::Element} => RDF::Resource] memory - # @option ec [RDF::Resource] :current_type + # @option ec [RDF::Resource] :current_vocabulary # @return [RDF::Resource] - def generate_triples(item, ec = {}) - memory = ec[:memory] + def generate_triples(item, vocab) # 1) If there is an entry for item in memory, then let subject be the subject of that entry. Otherwise, if item has a global identifier and that global identifier is an absolute URL, let subject be that global identifier. Otherwise, let subject be a new blank node. 
subject = if memory.include?(item.node) memory[item.node][:subject] @@ -312,12 +308,13 @@ def generate_triples(item, ec = {}) end || RDF::Node.new memory[item.node] ||= {} - log_debug(item) {"gentrips(2): subject=#{subject.inspect}, current_type: #{ec[:current_type]}"} + log_debug(item) {"gentrips(2): subject=#{subject.inspect}, vocab: #{vocab.inspect}"} # 2) Add a mapping from item to subject in memory, if there isn't one already. memory[item.node][:subject] ||= subject # 3) For each type returned from element.itemType of the element defining the item. + # 4) Set vocab to the first value returned from element.itemType of the element defining the item. type = nil item.attribute('itemtype').to_s.split(' ').map{|n| uri(n)}.select(&:absolute?).each do |t| # 3.1. If type is an absolute URL, generate the following triple: @@ -325,36 +322,26 @@ def generate_triples(item, ec = {}) add_triple(item, subject, RDF.type, t) end - # 4) Set type to the first value returned from element.itemType of the element defining the item. - - # 5) Otherwise, set type to current type from the Evaluation Context if not empty. - type ||= ec[:current_type] - log_debug(item) {"gentrips(5): type=#{type.inspect}"} - - # 6) If the registry contains a URI prefix that is a character for character match of type up to the length of the URI prefix, set vocab as that URI prefix. - vocab = Registry.find(type) - - # 7) Otherwise, if type is not empty, construct vocab by removing everything following the last SOLIDUS U+002F ("/") or NUMBER SIGN U+0023 ("#") from the path component of type. - vocab ||= begin - type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') - log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"} - Registry.new(type_vocab) + # 6) If the registry contains a URI prefix that is a character for character match of vocab up to the length of the URI prefix, set vocab as that URI prefix. + if type || vocab.nil? 
+ vocab = Registry.find(type) || begin + type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') unless type.nil? + log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"} + Registry.new(type_vocab) + end end - # 8) Update evaluation context setting current vocabulary to vocab. - ec[:current_vocabulary] = vocab + # Otherwise, use vocab from evaluation context + log_debug(item) {"gentrips(8): vocab: #{vocab.inspect}"} # 9. For each element _element_ that has one or more property names and is one of the properties of the item _item_, run the following substep: props = item_properties(item) # 9.1. For each name name in element's property names, run the following substeps: props.each do |element| element.attribute('itemprop').to_s.split(' ').compact.each do |name| - log_debug(item) {"gentrips(9.1): name=#{name.inspect}, type=#{type}"} - # 9.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab. - ec_new = ec.merge({current_type: type, current_vocabulary: vocab}) - + log_debug(item) {"gentrips(9.1): name=#{name.inspect}, vocab=#{vocab.inspect}"} # 9.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate. - predicate = vocab.predicateURI(name, ec_new) + predicate = vocab.predicateURI(name, base_uri) # 9.1.3) Let value be the property value of element. value = property_value(element) @@ -362,7 +349,7 @@ def generate_triples(item, ec = {}) # 9.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps. 
if value.is_a?(Hash) - value = generate_triples(element, ec_new) + value = generate_triples(element, vocab) log_debug(item) {"gentrips(9.1.4): value=#{value.inspect}"} end @@ -384,11 +371,9 @@ def generate_triples(item, ec = {}) props.each do |element| element.attribute('itemprop-reverse').to_s.split(' ').compact.each do |name| log_debug(item) {"gentrips(10.1): name=#{name.inspect}"} - # 10.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab. - ec_new = ec.merge({current_type: type, current_vocabulary: vocab}) # 10.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate. - predicate = vocab.predicateURI(name, ec_new) + predicate = vocab.predicateURI(name, base_uri) # 10.1.3) Let value be the property value of element. value = property_value(element) @@ -396,7 +381,7 @@ def generate_triples(item, ec = {}) # 10.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps. if value.is_a?(Hash) - value = generate_triples(element, ec_new) + value = generate_triples(element, vocab) log_debug(item) {"gentrips(10.1.4): value=#{value.inspect}"} elsif value.is_a?(RDF::Literal) # 10.1.5) Otherwise, if value is a literal, ignore the value and continue to the next name; it is an error for the value of @itemprop-reverse to be a literal @@ -432,13 +417,13 @@ def item_properties(item, reverse = false) # To crawl the properties of an element root with a list memory, the user agent must run the following steps. These steps either fail or return a list with a count of errors. The count of errors is used as part of the authoring conformance criteria below. 
# # @param [Nokogiri::XML::Element] root - # @param [Array] memory + # @param [Array] memo # @param [Boolean] reverse crawl reverse properties # @return [Array] # Resultant elements - def crawl_properties(root, memory, reverse) - # 1. If root is in memory, then the algorithm fails; abort these steps. - raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root) + def crawl_properties(root, memo, reverse) + # 1. If root is in memo, then the algorithm fails; abort these steps. + raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memo.include?(root) # 2. Collect all the elements in the item root; let results be the resulting list of elements, and errors be the resulting count of errors. results = elements_in_item(root) @@ -447,13 +432,13 @@ def crawl_properties(root, memory, reverse) # 3. Remove any elements from results that do not have an @itemprop (@itemprop-reverse) attribute specified. results = results.select {|e| e.has_attribute?(reverse ? 'itemprop-reverse' : 'itemprop')} - # 4. Let new memory be a new list consisting of the old list memory with the addition of root. - raise CrawlFailure, "itemref recursion" if memory.detect {|n| root.node.object_id == n.node.object_id} - new_memory = memory + [root] + # 4. Let new memo be a new list consisting of the old list memo with the addition of root. + raise CrawlFailure, "itemref recursion" if memo.detect {|n| root.node.object_id == n.node.object_id} + new_memo = memo + [root] - # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memory as the memory. + # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memo as the memo. 
results.select {|e| e.has_attribute?('itemscope')}.each do |element| - log_depth {crawl_properties(element, new_memory, reverse)} + log_depth {crawl_properties(element, new_memo, reverse)} end results @@ -469,7 +454,7 @@ def crawl_properties(root, memory, reverse) def elements_in_item(root) # Let results and pending be empty lists of elements. # Let errors be zero. - results, memory, errors = [], [], 0 + results, memo, errors = [], [], 0 # Add all the children elements of root to pending. pending = root.elements @@ -487,13 +472,13 @@ def elements_in_item(root) # Loop: Remove an element from pending and let current be that element. while current = pending.shift - if memory.include?(current) + if memo.include?(current) raise CrawlFailure, "elements_in_item: results already includes #{current.inspect}" elsif !current.has_attribute?('itemscope') # If current is not already in results and current does not have an itemscope attribute, then: add all the child elements of current to pending. pending += current.elements end - memory << current + memo << current # If current is not already in results, then: add current to results. 
results << current unless results.include?(current) diff --git a/lib/rdf/microdata/reader/nokogiri.rb b/lib/rdf/microdata/reader/nokogiri.rb index a77bb30..f148516 100644 --- a/lib/rdf/microdata/reader/nokogiri.rb +++ b/lib/rdf/microdata/reader/nokogiri.rb @@ -103,6 +103,12 @@ def elements NodeSetProxy.new(@node.elements, self) end + ## + # Rational debug output + def to_str + @node.path + end + ## # Proxy for everything else to @node def method_missing(method, *args) diff --git a/spec/suite_helper.rb b/spec/suite_helper.rb index 7ac224a..5d14456 100644 --- a/spec/suite_helper.rb +++ b/spec/suite_helper.rb @@ -145,6 +145,10 @@ def action BASE.join(property('action')) end + def input + RDF::Util::File.open_file(action).read + end + def registry reg = property('registry') || BASE + "test-registry.json" From a44ea71dfd53f6f6db9ea09efbf9829da7363155 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Wed, 19 Jul 2017 16:24:21 -0700 Subject: [PATCH 02/12] Extract the Registry class from the reader. 
--- lib/rdf/microdata.rb | 2 + lib/rdf/microdata/reader.rb | 108 +-------------------------------- lib/rdf/microdata/registry.rb | 109 ++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 107 deletions(-) create mode 100644 lib/rdf/microdata/registry.rb diff --git a/lib/rdf/microdata.rb b/lib/rdf/microdata.rb index e86650a..387dc2f 100644 --- a/lib/rdf/microdata.rb +++ b/lib/rdf/microdata.rb @@ -21,12 +21,14 @@ module RDF # @author [Gregg Kellogg](http://greggkellogg.net/) module Microdata USES_VOCAB = RDF::URI("http://www.w3.org/ns/rdfa#usesVocabulary") + DEFAULT_REGISTRY = File.expand_path("../../../etc/registry.json", __FILE__) require 'rdf/microdata/format' require 'rdf/microdata/vocab' autoload :Expansion, 'rdf/microdata/expansion' autoload :Profile, 'rdf/microdata/profile' autoload :Reader, 'rdf/microdata/reader' + autoload :Registry, 'rdf/microdata/registry' autoload :VERSION, 'rdf/microdata/version' end end diff --git a/lib/rdf/microdata/reader.rb b/lib/rdf/microdata/reader.rb index e820387..c78a0c1 100644 --- a/lib/rdf/microdata/reader.rb +++ b/lib/rdf/microdata/reader.rb @@ -15,7 +15,6 @@ class Reader < RDF::Reader include Expansion include RDF::Util::Logger URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video) - DEFAULT_REGISTRY = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "..", "etc", "registry.json")) # @private class CrawlFailure < StandardError; end @@ -38,111 +37,6 @@ def base_uri @options[:base_uri] end - # Interface to registry - class Registry - # @return [RDF::URI] Prefix of vocabulary - attr_reader :uri - - # @return [Hash] properties - attr_reader :properties - - ## - # Initialize the registry from a URI or file path - # - # @param [String] registry_uri - def self.load_registry(registry_uri) - return if @registry_uri == registry_uri - - json = RDF::Util::File.open_file(registry_uri) { |f| JSON.load(f) } - - @prefixes = {} - json.each do |prefix, elements| - next unless 
elements.is_a?(Hash) - properties = elements.fetch("properties", {}) - @prefixes[prefix] = Registry.new(prefix, properties) - end - @registry_uri = registry_uri - end - - ## - # Initialize registry for a particular prefix URI - # - # @param [RDF::URI] prefixURI - # @param [Hash] properties ({}) - def initialize(prefixURI, properties = {}) - @uri = prefixURI - @properties = properties - @property_base = prefixURI.to_s - # Append a '#' for fragment if necessary - @property_base += '#' unless %w(/ #).include?(@property_base[-1,1]) - end - - ## - # Find a registry entry given a type URI - # - # @param [RDF::URI] type - # @return [Registry] - def self.find(type) - @prefixes ||= {} - k = @prefixes.keys.detect {|key| type.to_s.index(key) == 0 } - @prefixes[k] if k - end - - ## - # Generate a predicateURI given a `name` - # - # @param [#to_s] name - # @param [Hash{}] ec Evaluation Context - # @return [RDF::URI] - def predicateURI(name, base_uri) - u = RDF::URI(name) - # 1) If _name_ is an _absolute URL_, return _name_ as a _URI reference_ - return u if u.absolute? - - n = frag_escape(name) - if uri.nil? - # 2) If current vocabulary from context is null, there can be no current vocabulary. - # Return the URI reference that is the document base with its fragment set to the fragment-escaped value of name - u = RDF::URI(base_uri.to_s) - u.fragment = frag_escape(name) - u - else - # 4) If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/). 
- RDF::URI(@property_base + n) - end - end - - ## - # Yield a equivalentProperty or subPropertyOf if appropriate - # - # @param [RDF::URI] predicateURI - # @yield equiv - # @yieldparam [RDF::URI] equiv - def expand(predicateURI) - tok = tokenize(predicateURI) - if @properties[tok].is_a?(Hash) - value = @properties[tok].fetch("subPropertyOf", nil) - value ||= @properties[tok].fetch("equivalentProperty", nil) - - Array(value).each {|equiv| yield RDF::URI(equiv)} - end - end - - ## - # Turn a predicateURI into a simple token - # @param [RDF::URI] predicateURI - # @return [String] - def tokenize(predicateURI) - predicateURI.to_s.sub(@property_base, '') - end - - ## - # Fragment escape a name - def frag_escape(name) - name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase} - end - end - ## # Initializes the Microdata reader instance. # @@ -184,7 +78,7 @@ def initialize(input = $stdin, options = {}, &block) # Load registry begin - registry_uri = options[:registry] || DEFAULT_REGISTRY + registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY log_debug('', "registry = #{registry_uri.inspect}") Registry.load_registry(registry_uri) rescue JSON::ParserError => e diff --git a/lib/rdf/microdata/registry.rb b/lib/rdf/microdata/registry.rb new file mode 100644 index 0000000..f7940f8 --- /dev/null +++ b/lib/rdf/microdata/registry.rb @@ -0,0 +1,109 @@ +require 'json' +module RDF::Microdata + + # Interface to registry + class Registry + # @return [RDF::URI] Prefix of vocabulary + attr_reader :uri + + # @return [Hash] properties + attr_reader :properties + + ## + # Initialize the registry from a URI or file path + # + # @param [String] registry_uri + def self.load_registry(registry_uri) + return if @registry_uri == registry_uri + + json = RDF::Util::File.open_file(registry_uri) { |f| ::JSON.load(f) } + + @prefixes = {} + json.each do |prefix, elements| + next unless elements.is_a?(Hash) + properties = elements.fetch("properties", {}) 
+ @prefixes[prefix] = Registry.new(prefix, properties) + end + @registry_uri = registry_uri + end + + ## + # Initialize registry for a particular prefix URI + # + # @param [RDF::URI] prefixURI + # @param [Hash] properties ({}) + def initialize(prefixURI, properties = {}) + @uri = prefixURI + @properties = properties + @property_base = prefixURI.to_s + # Append a '#' for fragment if necessary + @property_base += '#' unless %w(/ #).include?(@property_base[-1,1]) + end + + ## + # Find a registry entry given a type URI + # + # @param [RDF::URI] type + # @return [Registry] + def self.find(type) + @prefixes ||= {} + k = @prefixes.keys.detect {|key| type.to_s.index(key) == 0 } + @prefixes[k] if k + end + + ## + # Generate a predicateURI given a `name` + # + # @param [#to_s] name + # @param [Hash{}] ec Evaluation Context + # @return [RDF::URI] + def predicateURI(name, base_uri) + u = RDF::URI(name) + # 1) If _name_ is an _absolute URL_, return _name_ as a _URI reference_ + return u if u.absolute? + + n = frag_escape(name) + if uri.nil? + # 2) If current vocabulary from context is null, there can be no current vocabulary. + # Return the URI reference that is the document base with its fragment set to the fragment-escaped value of name + u = RDF::URI(base_uri.to_s) + u.fragment = frag_escape(name) + u + else + # 4) If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/). 
+ RDF::URI(@property_base + n) + end + end + + ## + # Yield a equivalentProperty or subPropertyOf if appropriate + # + # @param [RDF::URI] predicateURI + # @yield equiv + # @yieldparam [RDF::URI] equiv + def expand(predicateURI) + tok = tokenize(predicateURI) + if @properties[tok].is_a?(Hash) + value = @properties[tok].fetch("subPropertyOf", nil) + value ||= @properties[tok].fetch("equivalentProperty", nil) + + Array(value).each {|equiv| yield RDF::URI(equiv)} + end + end + + ## + # Turn a predicateURI into a simple token + # @param [RDF::URI] predicateURI + # @return [String] + def tokenize(predicateURI) + predicateURI.to_s.sub(@property_base, '') + end + + ## + # Fragment escape a name + def frag_escape(name) + name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase} + end + end + +end \ No newline at end of file From e791cd1e6be37d8d28c7a2e6905bc7bab376d00d Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Wed, 19 Jul 2017 16:24:44 -0700 Subject: [PATCH 03/12] Add experimental RdfaReader based on DOM transformation from Microdata to RDFa. --- README.md | 6 +- lib/rdf/microdata.rb | 1 + lib/rdf/microdata/rdfa_reader.rb | 125 +++++ lib/rdf/microdata/reader.rb | 15 + spec/rdfa_reader_spec.rb | 888 +++++++++++++++++++++++++++++++ spec/suite_helper.rb | 2 +- 6 files changed, 1035 insertions(+), 2 deletions(-) create mode 100644 lib/rdf/microdata/rdfa_reader.rb create mode 100644 spec/rdfa_reader_spec.rb diff --git a/README.md b/README.md index 2cec89a..d00e533 100755 --- a/README.md +++ b/README.md @@ -60,7 +60,11 @@ Full documentation available on [Rubydoc.info][Microdata doc] * {RDF::Microdata::Reader} * {RDF::Microdata::Reader::Nokogiri} -### Additional vocabularies + +### RDFa-based Reader +There is an experimental reader based on transforming Microdata to RDFa within the DOM. To invoke +this, add the `rdfa: true` option to the {RDF::Microdata::Reader.new}, or +use {RDF::Microdata::RdfaReader} directly. 
## Resources * [RDF.rb][RDF.rb] diff --git a/lib/rdf/microdata.rb b/lib/rdf/microdata.rb index 387dc2f..34e77fb 100644 --- a/lib/rdf/microdata.rb +++ b/lib/rdf/microdata.rb @@ -28,6 +28,7 @@ module Microdata autoload :Expansion, 'rdf/microdata/expansion' autoload :Profile, 'rdf/microdata/profile' autoload :Reader, 'rdf/microdata/reader' + autoload :RdfaReader, 'rdf/microdata/rdfa_reader' autoload :Registry, 'rdf/microdata/registry' autoload :VERSION, 'rdf/microdata/version' end diff --git a/lib/rdf/microdata/rdfa_reader.rb b/lib/rdf/microdata/rdfa_reader.rb new file mode 100644 index 0000000..14e1681 --- /dev/null +++ b/lib/rdf/microdata/rdfa_reader.rb @@ -0,0 +1,125 @@ +require 'rdf/rdfa' +require 'nokogumbo' + +module RDF::Microdata + ## + # Update DOM to turn Microdata into RDFa and parse using the RDFa Reader + class RdfaReader < RDF::RDFa::Reader + + def self.format(klass = nil) + if klass.nil? + RDF::Microdata::Format + else + super + end + end + + ## + # Initializes the RdfaReader instance. 
+ # + # @param [IO, File, String] input + # the input stream to read + # @param [Hash{Symbol => Object}] options + # any additional options (see `RDF::Reader#initialize`) + # @return [reader] + # @yield [reader] `self` + # @yieldparam [RDF::Reader] reader + # @yieldreturn [void] ignored + # @raise [RDF::ReaderError] if _validate_ + def initialize(input = $stdin, options = {}, &block) + + input = case input + when ::Nokogiri::XML::Document, ::Nokogiri::HTML::Document then input + else + # Try to detect charset from input + options[:encoding] ||= input.charset if input.respond_to?(:charset) + + # Otherwise, default is utf-8 + options[:encoding] ||= 'utf-8' + options[:encoding] = options[:encoding].to_s if options[:encoding] + input = input.read if input.respond_to?(:read) + ::Nokogiri::HTML5(input.force_encoding(options[:encoding])) + end + + + # Load registry + begin + registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY + log_debug('', "registry = #{registry_uri.inspect}") + Registry.load_registry(registry_uri) + rescue JSON::ParserError => e + log_fatal("Failed to parse registry: #{e.message}", exception: RDF::ReaderError) if (root.nil? && validate?) + end + + # For all members having @itemscope + input.css("[itemscope]").each do |item| + # Get @itemtypes to create @type and @vocab + item.attribute('itemscope').remove + if item['itemtype'] + # Only absolute URLs + types = item.attribute('itemtype'). + remove. + to_s. + split(/\s+/). + select {|t| RDF::URI(t).absolute?} + + item['typeof'] = types.join(' ') unless types.empty? + if vocab = types.first + vocab = Registry.find(vocab) || begin + type_vocab = vocab.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') unless vocab.nil? + Registry.new(type_vocab) if type_vocab + end + item['vocab'] = vocab.uri.to_s if vocab + end + end + + # Change each itemid attribute to an resource attribute with the same value + if item['itemid'] + id = item.attribute('itemid').remove + item[item['itemprop'] ? 
'resource' : 'about'] = id + else + # Otherwise, ensure that @typeof has at least an empty value + item['typeof'] ||= '' + end + end + + # Add @resource for all itemprop values of object based on a @data value + input.css("object[itemprop][data]").each do |item| + item['resource'] ||= item['data'] + end + + # Replace all @itemprop values with @property + input.css("[itemprop]").each {|item| item['property'] = item.attribute('itemprop').remove} + + # Wrap all @itemref properties + input.css("[itemref]").each do |item| + item_vocab = item['vocab'] || item.ancestors.detect {|a| a.attribute('vocab')} + item_vocab = item_vocab.to_s if item_vocab + + item.attribute('itemref').remove.to_s.split(/\s+/).each do |ref| + if referenced = input.css("##{ref}") + # Add @vocab to referenced using the closest ansestor having @vocab of item. + # If the element with id reference has no resource attribute, add a resource attribute whose value is a NUMBER SIGN U+0023 followed by reference to the element. + # If the element with id reference has no typeof attribute, add a typeof="rdfa:Pattern" attribute to the element. + referenced.wrap(%(
")) + end + + context :interface do + subject {%( +
+

My name is Elizabeth.

+
+ )} + + it "should yield reader" do + inner = double("inner") + expect(inner).to receive(:called).with(RDF::Microdata::RdfaReader) + RDF::Microdata::RdfaReader.new(subject, base_uri: 'http://example/') do |reader| + inner.called(reader.class) + end + end + + it "should return reader" do + expect(RDF::Microdata::RdfaReader.new(subject, base_uri: 'http://example/')).to be_a(RDF::Microdata::RdfaReader) + end + + it "should not raise errors" do + expect { + RDF::Microdata::RdfaReader.new(subject, validate: true, base_uri: 'http://example/') + }.not_to raise_error + end + + it "should yield statements" do + inner = double("inner") + expect(inner).to receive(:called).with(RDF::Statement).at_least(2) + RDF::Microdata::RdfaReader.new(subject, base_uri: 'http://example/').each_statement do |statement| + inner.called(statement.class) + end + end + + it "should yield triples" do + inner = double("inner") + expect(inner).to receive(:called).at_least(2) + RDF::Microdata::RdfaReader.new(subject, base_uri: 'http://example/').each_triple do |subject, predicate, object| + inner.called(subject.class, predicate.class, object.class) + end + end + + context "Microdata Reader with :rdfa option" do + it "returns a RdfaReader instance" do + r = RDF::Microdata::Reader.new(StringIO.new(""), rdfa: true) + expect(r).to be_a(RDF::Microdata::RdfaReader) + end + end + end + + context :parsing do + before :each do + @md_ctx = %q( +
+ %s +
+ ) + @nt_ctx = %q( + _:a . + %s + ) + end + + it "parses a simple graph" do + md = %q(

My name is Gregg Kellogg.

) + nt = %q(_:a "Gregg Kellogg" .) + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + + context "values" do + [ + [ + %q(

My name is Gregg Kellogg

), + %q(_:a "Gregg Kellogg" .) + ], + [ + %q( +

My name is Gregg

+

My name is Kellogg

+ ), + %q(_:a "Gregg", "Kellogg" .) + ], + [ + %q(

My name is Gregg Kellogg

), + %q( + _:a "Gregg Kellogg" . + _:a "Gregg Kellogg" . + ) + ], + [ + %q(

My name is Gregg Kellogg

), + %q(_:a "Gregg Kellogg" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(Bar), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a "2011-06-28Z"^^ .) + ], + [ + %q(), + %q(_:a "00:00:00Z"^^ .) + ], + [ + %q(), + %q(_:a "2011-06-28T00:00:00Z"^^ .) + ], + [ + %q(), + %q(_:a "P2011Y06M28DT00H00M00S"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a _:b .) + ], + [ + %q(), + %q(_:a "1"^^ .) + ], + [ + %q(), + %q(_:a "1.1"^^ .) + ], + [ + %q(), + %q(_:a "1.1e1"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "1"^^ .) + ], + [ + %q(), + %q(_:a "1.1"^^ .) + ], + [ + %q(), + %q(_:a "1.1e1"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + pending if [ + '', + '', + ].include?(md) + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "base_uri" do + before :each do + @nt_ctx = %q( + _:a . + %s + ) + end + + [ + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(Stéphane Corlosquet), + %q(_:a .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + expect(parse(@md_ctx % md, base_uri: 'http://example.com/')).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "itemid" do + before :each do + @md_ctx = %q( +
+ %s +
+ ) + @nt_ctx = %q( + . + %s + ) + end + + [ + [ + %q(

My name is Gregg Kellogg

), + %q( "Gregg Kellogg" .) + ], + [ + %q(), + %q( "foo" .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( "2011-06-28T00:00:00Z"^^ .) + ], + [ + %q(), + %q( .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "itemtype" do + { + "with no type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with empty type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with relative type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with single type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "with multipe types and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a , ; + "Amanda" ; + ] . + ) + ], + #"with no type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + #"with empty type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + #"with relative type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + "with single type and URI property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "with multipe types and URI property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a , ; + "Amanda" ; + ] . + ) + ], + "with inherited type and token property" => [ + %q( +
+

Name: Gregg

+
+

Name: Jeni

+
+
+ ), + %q( + @prefix md: . + @prefix schema: . + [ a schema:Person ; + schema:name "Gregg" ; + schema:knows [ schema:name "Jeni" ] + ] . + ) + ] + }.each do |name, (md, nt)| + it "#{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "itemref" do + { + "to single id" => + [ + %q( +
+
+

Name: Amanda

+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "to generate listed property values" => + [ + %q( +
+
+

My name is Gregg

+
+

My name is Kellogg

+
+ ), + %q( + [ a ; + "Gregg", "Kellogg" ; + ] . + ) + ], + #"to single id with different types" => + #[ + # %q( + #
+ #
+ #
+ #

Name: Amanda

+ #
+ # ), + # %q( + # [ a ; + # "Amanda" ; + # ] . + # [ a ; + # "Amanda" ; + # ] . + # ) + #], + "to multiple ids" => + [ + %q( +
+
+

Name: Amanda

+

Jazz Band

+
+ ), + %q( + [ a ; + "Amanda" ; + "Jazz Band" ; + ] . + ) + ], + "with chaining" => + [ + %q( +
+
+

Name: Amanda

+
+
+

Band: Jazz Band

+

Size: 12 players

+
+
+ ), + %q( + [ a ; + "Amanda" ; + [ + a ; + "Jazz Band"; + "12" + ] + ] . + ) + ], + "shared" => + [ + %q( +
+
+
+
+ Amanda +
+
+ ), + %q( + [ a ; _:a ] . + [ a ; _:a ] . + _:a "Amanda" . + ) + + ], + }.each do |name, (md, nt)| + it "parses #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + + it "catches infinite recursion", pending: true do + md = %( + + +
+
+
friend1
+
+
friend2
+
+
+
+
+ + ) + expect {parse(md, validate: true)}.to raise_error(RDF::ReaderError) + expect(@logger.to_s).to include("itemref recursion") + end + end + + context "propertyURI" do + context "no expansion" do + { + "http://foo/bar + baz => http://foo/baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#bar + baz => http://foo#baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#Type + bar + baz => http://foo#baz" => + [ + %q( +
+

Baz

+
+ ), + %q( + [ a ; + [ "Baz"]] . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "default propertyURI generation" do + { + "http://foo/bar + baz => http://foo/baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#bar + baz => http://foo#baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#Type + bar + baz => http://foo#baz" => + [ + %q( +
+

Baz

+
+ ), + %q( + [ a ; + [ "Baz"]] . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + end + + context "itemprop-reverse", skip: true do + { + "link" => [ + %q( +
+ William Shakespeare + +
+ ), + %q( + [ + a ; + "William Shakespeare" + ] . + ) + ], + "itemscope" => [ + %q( +
+ The ACME Shopping Mall on Structured Data Avenue + The ACME Shopping Mall is your one-stop paradise for all data-related shopping needs, from schemas to instance data +

Here is a list of shops inside:

+
+ Dan Brickley's Data Restaurant +
+
+ Ramanathan Guha's Meta Content Framework Bakery +
+
+ ), + %q( + _:a a ; + "The ACME Shopping Mall on Structured Data Avenue"; + "The ACME Shopping Mall is your one-stop paradise for all data-related shopping needs, from schemas to instance data" . + _:b a ; + "Dan Brickley's Data Restaurant"; + _:a . + _:c a ; + "Ramanathan Guha's Meta Content Framework Bakery"; + _:a . + ) + ], + "literal" => [ + %q( +
+ William Shakespeare + +
+ ), + %q( + _:a a ; + "William Shakespeare" . + ) + ], + "itemprop and itemprop-reverse" => [ + %q( +
+ Cryptography Users +
+
+ Alice +
+ 1977 +
+
+ ), + %q( + @prefix schema: . + @prefix md: . + + _:a a schema:Organization; + schema:name "Cryptography Users"; + schema:member _:b . + _:b a schema:OrganizationRole; + schema:startDate "1977"; + schema:member _:c; + schema:memberOf _:a . + _:c a schema:Person; + schema:name "Alice"; + schema:memberOf _:b . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "vocabulary expansion", pending: true do + it "always expands" do + md = %q( +
+ +
+ ) + ttl = %q( + [ a , ; + + ] . + ) + + expect(parse(md, vocab_expansion: true)).to be_equivalent_graph(ttl, logger: @logger) + end + end + + context "test-files", skip: true do + Dir.glob(File.join(File.expand_path(File.dirname(__FILE__)), "test-files", "*.html")).each do |md| + it "parses #{md}" do + test_file(md) + end + end + end + end + + def parse(input, options = {}) + @logger = RDF::Spec.logger + graph = options[:graph] || RDF::Graph.new + RDF::Microdata::Reader.new(input, { + logger: @logger, + rdfa: true, + validate: false, + base_uri: "http://example/", + registry: registry_path, + canonicalize: false}.merge(options)).each do |statement| + graph << statement + end + + # Remove any rdfa:usesVocabulary statements + graph.query(predicate: RDF::RDFA.usesVocabulary).each do |stmt| + graph.delete(stmt) + end + graph + end + + def test_file(filepath, options = {}) + graph = parse(File.open(filepath), options) + + ttl_string = File.read(filepath.sub('.html', '.ttl')) + expect(graph).to be_equivalent_graph(ttl_string, logger: @logger) + end +end diff --git a/spec/suite_helper.rb b/spec/suite_helper.rb index 5d14456..80850f7 100644 --- a/spec/suite_helper.rb +++ b/spec/suite_helper.rb @@ -25,7 +25,7 @@ def self.open_file(filename_or_url, options = {}, &block) path = filename_or_url[5..-1] Kernel.open(path.to_s, &block) when 'http://www.w3.org/ns/md' - Kernel.open(RDF::Microdata::Reader::DEFAULT_REGISTRY, &block) + Kernel.open(RDF::Microdata::DEFAULT_REGISTRY, &block) when /^#{REMOTE_PATH}/ begin #puts "attempt to open #{filename_or_url} locally" From f0e408d1311394eb611cb27bdfa0fdf6e021f6fc Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Thu, 20 Jul 2017 14:23:28 -0700 Subject: [PATCH 04/12] Add `RdfaReader#rdfa` to retrieve converted RDFa document after reader initialization. 
--- Gemfile | 1 + examples/itemref.html | 7 +++ examples/locomotive.html | 11 +++++ lib/rdf/microdata/format.rb | 82 ++++++++++++++++++++++++++++++++ lib/rdf/microdata/rdfa_reader.rb | 9 +++- lib/rdf/microdata/reader.rb | 13 +++++ script/parse | 61 +++++++++++++++++++++++- 7 files changed, 181 insertions(+), 3 deletions(-) create mode 100644 examples/itemref.html create mode 100644 examples/locomotive.html diff --git a/Gemfile b/Gemfile index 48f46d8..e63746f 100644 --- a/Gemfile +++ b/Gemfile @@ -8,6 +8,7 @@ gem "rdf-xsd", github: "ruby-rdf/rdf-xsd", branch: "develop" gem "nokogumbo", '~> 1.4' group :development do + gem 'linkeddata' gem 'ebnf', github: "gkellogg/ebnf", branch: "develop" gem 'rdf-aggregate-repo', github: "ruby-rdf/rdf-aggregate-repo", branch: "develop" gem 'rdf-isomorphic', github: "ruby-rdf/rdf-isomorphic", branch: "develop" diff --git a/examples/itemref.html b/examples/itemref.html new file mode 100644 index 0000000..b961767 --- /dev/null +++ b/examples/itemref.html @@ -0,0 +1,7 @@ +
+

1

+
+
+

test

+

2

+
diff --git a/examples/locomotive.html b/examples/locomotive.html new file mode 100644 index 0000000..e976ef9 --- /dev/null +++ b/examples/locomotive.html @@ -0,0 +1,11 @@ +
+
Name: +
Tank Locomotive (DB 80) +
Product code: +
33041 +
Scale: +
HO +
Digital: +
Delta +
diff --git a/lib/rdf/microdata/format.rb b/lib/rdf/microdata/format.rb index a11dca1..2377030 100644 --- a/lib/rdf/microdata/format.rb +++ b/lib/rdf/microdata/format.rb @@ -41,5 +41,87 @@ class Format < RDF::Format def self.detect(sample) !!sample.match(/<[^>]*(itemprop|itemtype|itemref|itemscope|itemid)[^>]*>/m) end + + ## + # Hash of CLI commands appropriate for this format + # @return [Hash{Symbol => Hash}] + def self.cli_commands + { + "to-rdfa": { + description: "Transform HTML+Microdata into HTML+RDFa", + parse: false, + help: "to-rdfa files ...", + lambda: ->(files, options) do + out = options[:output] || $stdout + xsl = Nokogiri::XSLT(%( + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ).gsub(/^ /, '')) + if files.empty? + # If files are empty, either use options[::evaluate] + input = options[:evaluate] ? StringIO.new(options[:evaluate]) : STDIN + input.set_encoding(options.fetch(:encoding, Encoding::UTF_8)) + RDF::Microdata::Reader.new(input, options(rdfa: true)) do |reader| + reader.rdfa.xpath("//text()").each do |txt| + txt.content = txt.content.to_s.strip + end + out.puts xsl.apply_to(reader.rdfa).to_s + end + else + files.each do |file| + RDF::Microdata::Reader.open(file, options.merge(rdfa: true)) do |reader| + reader.rdfa.xpath("//text()").each do |txt| + txt.content = txt.content.to_s.strip + end + out.puts xsl.apply_to(reader.rdfa).to_s + end + end + end + end + }, + } + end end end diff --git a/lib/rdf/microdata/rdfa_reader.rb b/lib/rdf/microdata/rdfa_reader.rb index 14e1681..9ca15d4 100644 --- a/lib/rdf/microdata/rdfa_reader.rb +++ b/lib/rdf/microdata/rdfa_reader.rb @@ -5,6 +5,9 @@ module RDF::Microdata ## # Update DOM to turn Microdata into RDFa and parse using the RDFa Reader class RdfaReader < RDF::RDFa::Reader + # The transformed DOM using RDFa + # @return [RDF::HTML::Document] + attr_reader :rdfa def self.format(klass = nil) if klass.nil? 
@@ -27,6 +30,8 @@ def self.format(klass = nil) # @yieldreturn [void] ignored # @raise [RDF::ReaderError] if _validate_ def initialize(input = $stdin, options = {}, &block) + @options = options + log_debug('', "using RDFa transformation reader") input = case input when ::Nokogiri::XML::Document, ::Nokogiri::HTML::Document then input @@ -41,7 +46,6 @@ def initialize(input = $stdin, options = {}, &block) ::Nokogiri::HTML5(input.force_encoding(options[:encoding])) end - # Load registry begin registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY @@ -112,6 +116,9 @@ def initialize(input = $stdin, options = {}, &block) end end + @rdfa = input + log_debug('', "Transformed document: #{input.to_html}") + options = options.merge( library: :nokogiri, reference_folding: true, diff --git a/lib/rdf/microdata/reader.rb b/lib/rdf/microdata/reader.rb index 49bef8c..2889dfb 100644 --- a/lib/rdf/microdata/reader.rb +++ b/lib/rdf/microdata/reader.rb @@ -37,6 +37,19 @@ def base_uri @options[:base_uri] end + ## + # Reader options + # @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Reader#options-class_method + def self.options + super + [ + RDF::CLI::Option.new( + symbol: :rdfa, + datatype: TrueClass, + on: ["--rdfa"], + description: "Transform and parse as RDFa.") {true}, + ] + end + ## # Redirect for RDFa Reader given `:rdfa` option # diff --git a/script/parse b/script/parse index 5eccebc..a4f4349 100755 --- a/script/parse +++ b/script/parse @@ -19,7 +19,8 @@ def run(input, options) start = Time.new num = 0 - if options[:output_format] == :ntriples || options[:quiet] + case options[:output_format] + when :ntriples, :quiet reader_class.new(input, options).each do |statement| num += 1 if options[:quiet] @@ -28,7 +29,60 @@ def run(input, options) options[:output].puts statement.to_ntriples end end - elsif options[:output_format] == :inspect + when :rdfa + xsl = Nokogiri::XSLT(%( + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
).gsub(/^ /, '')) + reader_class.new(input, options.merge(rdfa: true)) do |reader| + reader.rdfa.xpath("//text()").each do |txt| + txt.content = txt.content.to_s.strip + end + options[:output].puts xsl.apply_to(reader.rdfa).to_s + end + when :inspect reader_class.new(input, options).each do |statement| num += 1 options[:output].puts statement.inspect @@ -55,6 +109,7 @@ logger.formatter = lambda {|severity, datetime, progname, msg| "#{severity}: #{m options = { verbose: false, validate: false, + rdfa: false, logger: logger, output: STDOUT, output_format: :turtle, @@ -72,6 +127,7 @@ opts = GetoptLong.new( ["--output", "-o", GetoptLong::REQUIRED_ARGUMENT], ["--quiet", GetoptLong::NO_ARGUMENT], ["--registry", GetoptLong::REQUIRED_ARGUMENT], + ["--rdfa", GetoptLong::NO_ARGUMENT], ["--template", GetoptLong::REQUIRED_ARGUMENT], ["--uri", GetoptLong::REQUIRED_ARGUMENT], ["--validate", GetoptLong::NO_ARGUMENT], @@ -82,6 +138,7 @@ opts.each do |opt, arg| when '--debug' then logger.level = Logger::DEBUG when '--execute' then input = arg when '--format' then options[:output_format] = arg.to_sym + when '--rdfa' then options[:rdfa] = true when '--input-format' then options[:input_format] = arg.to_sym when '--quiet' options[:quiet] = options[:quiet].to_i + 1 From ce70b8a21a16f594edf1112e3cc0922e62f47ada Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Thu, 20 Jul 2017 16:41:14 -0700 Subject: [PATCH 05/12] Add JSON-LD-based reader, invoked by passing `jsonld: true` when instantiating the reader. 
--- README.md | 9 + lib/rdf/microdata.rb | 13 +- lib/rdf/microdata/format.rb | 24 +- lib/rdf/microdata/jsonld_reader.rb | 251 ++++++++ lib/rdf/microdata/reader.rb | 14 + rdf-microdata.gemspec | 1 + script/parse | 6 +- spec/jsonld_reader_spec.rb | 888 +++++++++++++++++++++++++++++ 8 files changed, 1197 insertions(+), 9 deletions(-) create mode 100644 lib/rdf/microdata/jsonld_reader.rb create mode 100644 spec/jsonld_reader_spec.rb diff --git a/README.md b/README.md index d00e533..7a696c2 100755 --- a/README.md +++ b/README.md @@ -66,6 +66,15 @@ There is an experimental reader based on transforming Microdata to RDFa within t this, add the `rdfa: true` option to the {RDF::Microdata::Reader.new}, or use {RDF::Microdata::RdfaReader} directly. +The reader exposes a `#rdfa` method, which can be used to retrieve the transformed HTML+RDFa + +### JSON-LD-based Reader +There is an experimental reader based on transforming Microdata to JSON-LD. To invoke +this, add the `jsonld: true` option to the {RDF::Microdata::Reader.new}, or +use {RDF::Microdata::JsonLdReader} directly.
+ +The reader exposes a `#jsonld` method, which can be used to retrieve the generated JSON-LD + ## Resources * [RDF.rb][RDF.rb] * [Documentation](http://rdf.rubyforge.org/microdata) diff --git a/lib/rdf/microdata.rb b/lib/rdf/microdata.rb index 34e77fb..101f09a 100644 --- a/lib/rdf/microdata.rb +++ b/lib/rdf/microdata.rb @@ -25,11 +25,12 @@ module Microdata require 'rdf/microdata/format' require 'rdf/microdata/vocab' - autoload :Expansion, 'rdf/microdata/expansion' - autoload :Profile, 'rdf/microdata/profile' - autoload :Reader, 'rdf/microdata/reader' - autoload :RdfaReader, 'rdf/microdata/rdfa_reader' - autoload :Registry, 'rdf/microdata/registry' - autoload :VERSION, 'rdf/microdata/version' + autoload :Expansion, 'rdf/microdata/expansion' + autoload :JsonLdReader, 'rdf/microdata/jsonld_reader' + autoload :Profile, 'rdf/microdata/profile' + autoload :RdfaReader, 'rdf/microdata/rdfa_reader' + autoload :Reader, 'rdf/microdata/reader' + autoload :Registry, 'rdf/microdata/registry' + autoload :VERSION, 'rdf/microdata/version' end end diff --git a/lib/rdf/microdata/format.rb b/lib/rdf/microdata/format.rb index 2377030..193bb52 100644 --- a/lib/rdf/microdata/format.rb +++ b/lib/rdf/microdata/format.rb @@ -50,7 +50,7 @@ def self.cli_commands "to-rdfa": { description: "Transform HTML+Microdata into HTML+RDFa", parse: false, - help: "to-rdfa files ...", + help: "to-rdfa files ...\nTransform HTML+Microdata into HTML+RDFa", lambda: ->(files, options) do out = options[:output] || $stdout xsl = Nokogiri::XSLT(%( @@ -121,6 +121,28 @@ def self.cli_commands end end }, + "to-jsonld": { + description: "Transform HTML+Microdata into JSON-LD", + parse: false, + help: "to-jsonld files ...\nTransform HTML+Microdata into JSON-LD", + lambda: ->(files, options) do + out = options[:output] || $stdout + if files.empty? + # If files are empty, use either options[:evaluate] or STDIN + input = options[:evaluate] ?
StringIO.new(options[:evaluate]) : STDIN + input.set_encoding(options.fetch(:encoding, Encoding::UTF_8)) + RDF::Microdata::Reader.new(input, options.merge(jsonld: true)) do |reader| + out.puts reader.jsonld.to_json(::JSON::LD::JSON_STATE) + end + else + files.each do |file| + RDF::Microdata::Reader.open(file, options.merge(jsonld: true)) do |reader| + out.puts reader.jsonld.to_json(::JSON::LD::JSON_STATE) + end + end + end + end + }, } end end diff --git a/lib/rdf/microdata/jsonld_reader.rb b/lib/rdf/microdata/jsonld_reader.rb new file mode 100644 index 0000000..5bf6d3d --- /dev/null +++ b/lib/rdf/microdata/jsonld_reader.rb @@ -0,0 +1,251 @@ +require 'json/ld' +require 'nokogumbo' + +module RDF::Microdata + ## + # Update DOM to turn Microdata into JSON-LD and parse using the JSON-LD Reader + class JsonLdReader < JSON::LD::Reader + # The resulting JSON-LD + # @return [Hash] + attr_reader :jsonld + + def self.format(klass = nil) + if klass.nil? + RDF::Microdata::Format + else + super + end + end + + ## + # Initializes the JsonLdReader instance.
+ # + # @param [IO, File, String] input + # the input stream to read + # @param [Hash{Symbol => Object}] options + # any additional options (see `RDF::Reader#initialize`) + # @return [reader] + # @yield [reader] `self` + # @yieldparam [RDF::Reader] reader + # @yieldreturn [void] ignored + # @raise [RDF::ReaderError] if _validate_ + def initialize(input = $stdin, options = {}, &block) + @options = options + log_debug('', "using JSON-LD transformation reader") + + input = case input + when ::Nokogiri::XML::Document, ::Nokogiri::HTML::Document then input + else + # Try to detect charset from input + options[:encoding] ||= input.charset if input.respond_to?(:charset) + + # Otherwise, default is utf-8 + options[:encoding] ||= 'utf-8' + options[:encoding] = options[:encoding].to_s if options[:encoding] + input = input.read if input.respond_to?(:read) + ::Nokogiri::HTML5(input.force_encoding(options[:encoding])) + end + + # Load registry + begin + registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY + log_debug('', "registry = #{registry_uri.inspect}") + Registry.load_registry(registry_uri) + rescue JSON::ParserError => e + log_fatal("Failed to parse registry: #{e.message}", exception: RDF::ReaderError) if validate? # no `root` is defined on this reader; the old `root.nil?` guard would raise NameError here
+ end + + @jsonld = {'@graph' => []} + + # Start with all top-level items + input.css("[itemscope]").each do |item| + next if item['itemprop'] # Only top-level items + jsonld['@graph'] << get_object(item) + end + + log_debug('', "Transformed document: #{jsonld.to_json(JSON::LD::JSON_STATE)}") + + # Rely on JSON-LD reader to process the generated document + super(jsonld.to_json, options, &block) + end + + private + # Return JSON-LD representation of an item + # @param [Nokogiri::XML::Element] item + # @param [Hash{Nokogiri::XML::Node => Hash}] memory items already converted, so repeated references reuse one node object + # @return [Hash] + def get_object(item, memory = {}) + if result = memory[item] + # Result is a reference to that item; assign a blank-node identifier if necessary + result['@id'] ||= alloc_bnode + return result + end + + result = {} + memory[item] = result + + # If the item has a global identifier, add an entry to result called "@id" whose value is the global identifier of item. + result['@id'] = item['itemid'].to_s if item['itemid'] + + # If the item has any item types, add an entry to result called "@type" whose value is an array listing the item types of item, in the order they were specified on the itemtype attribute. + if item['itemtype'] + # Only absolute URLs + types = item.attribute('itemtype'). + remove. + to_s. + split(/\s+/). + select {|t| RDF::URI(t).absolute?} + if vocab = types.first + vocab = Registry.find(vocab) || begin + type_vocab = vocab.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') unless vocab.nil? + Registry.new(type_vocab) if type_vocab + end + (result['@context'] = {})['@vocab'] = vocab.uri.to_s if vocab + result['@type'] = types unless types.empty?
+ end + end + + # For each element element that has one or more property names and is one of the properties of the item item, in the order those elements are given by the algorithm that returns the properties of an item, run the following substeps + item_properties(item).each do |element| + value = if element['itemscope'] + get_object(element, memory) + else + property_value(element) + end + element['itemprop'].to_s.split(/\s+/).each do |prop| + (result[prop] ||= []) << value # append every value; `||= [] << value` parsed as `||= ([] << value)` and kept only the first + end + end + + result + end + + ## + # Collect the property elements of an item: its descendants plus any elements named by itemref, not descending into nested items + # @param [Nokogiri::XML::Element] item + # @return [Array<Nokogiri::XML::Element>] + # List of property elements for an item + def item_properties(item) + results, memory, pending = [], [item], item.children.select(&:element?) + log_debug(item, "item_properties") + + # If root has an itemref attribute, split the value of that itemref attribute on spaces. For each resulting token ID, if there is an element in the document whose ID is ID, then add the first such element to pending. + item['itemref'].to_s.split(/\s+/).each do |ref| + if referenced = item.document.at_css("##{ref}") # search the whole document; itemref targets normally live outside the item's subtree + pending << referenced + end + end + + while !pending.empty? + current = pending.shift + # Error + break if memory.include?(current) + memory << current + + # If current does not have an itemscope attribute, then: add all the child elements of current to pending. + pending += current.children.select(&:element?) unless current['itemscope'] + + # If current has an itemprop attribute specified and has one or more property names, then add current to results. + results << current unless current['itemprop'].to_s.split(/\s+/).empty?
+ end + + results + end + + ## + # Determine the JSON-LD value of a property element: a node reference ("@id"), a value object ("@value"), or a plain string + def property_value(element) + base = element.base || base_uri + log_debug(element) {"property_value(#{element.name}): base #{base.inspect}"} + value = case + when element.has_attribute?('itemscope') + {} + when element.has_attribute?('content') + if element.language + {"@value" => element['content'].to_s.strip, "@language" => element.language} + else + element['content'].to_s.strip + end + when %w(data meter).include?(element.name) && element.attribute('value') + # XXX parse as number? + {"@value" => element['value'].to_s.strip} + when %w(audio embed iframe img source track video).include?(element.name) + {"@id" => uri(element.attribute('src'), base).to_s} + when %w(a area link).include?(element.name) + {"@id" => uri(element.attribute('href'), base).to_s} + when %w(object).include?(element.name) + {"@id" => uri(element.attribute('data'), base).to_s} + when %w(time).include?(element.name) + # use datatype? + (element.attribute('datetime') || element.text).to_s.strip + else + if element.language + {"@value" => element.inner_text.to_s.strip, "@language" => element.language} + else + element.inner_text.to_s.strip + end + end + log_debug(element) {" #{value.inspect}"} + value + end + + # Allocate a new blank node identifier + # @return [String] + def alloc_bnode + @bnode_base ||= "_:a" + res = @bnode_base + @bnode_base = res.succ + res + end + + # Fixme, what about xml:base relative to element? + def uri(value, base = nil) + value = if base + base = uri(base) unless base.is_a?(RDF::URI) + base.join(value.to_s) + else + RDF::URI(value.to_s) + end + value.validate! if validate? + value.canonicalize! if canonicalize? + value = RDF::URI.intern(value) if intern? + value + end + end +end + +# Monkey Patch Nokogiri +module Nokogiri::XML + class Element + + ## + # Get any xml:base in effect for this element + def base + if @base.nil? + @base = attributes['xml:base'] || + (parent && parent.element? && parent.base) || + false + end + + @base == false ?
nil : @base + end + + + ## + # Get any xml:lang or lang in effect for this element; memoized in @language, with false meaning "none found" + def language + if @language.nil? + @language = case + when self["xml:lang"] + self["xml:lang"].to_s + when self["lang"] + self["lang"].to_s + else + parent && parent.element? && parent.language + end || false + end + @language == false ? nil : @language + end + + end +end diff --git a/lib/rdf/microdata/reader.rb b/lib/rdf/microdata/reader.rb index 2889dfb..4c1ee6a 100644 --- a/lib/rdf/microdata/reader.rb +++ b/lib/rdf/microdata/reader.rb @@ -56,7 +56,21 @@ def self.options # @private def self.new(input = nil, options = {}, &block) klass = if options[:rdfa] + # Requires rdf-rdfa gem to be loaded + begin + require 'rdf/rdfa' + rescue LoadError + raise ReaderError, "Use of RDFa-based reader requires rdf-rdfa gem" + end RdfaReader + elsif options[:jsonld] + # Requires json-ld gem to be loaded + begin + require 'json/ld' + rescue LoadError + raise ReaderError, "Use of JSON-LD-based reader requires json-ld gem" + end + JsonLdReader else self end diff --git a/rdf-microdata.gemspec b/rdf-microdata.gemspec index 459fc28..623d957 100755 --- a/rdf-microdata.gemspec +++ b/rdf-microdata.gemspec @@ -34,6 +34,7 @@ Gem::Specification.new do |gem| gem.add_development_dependency 'rspec', '~> 3.5' gem.add_development_dependency 'rspec-its', '~> 1.2' + gem.add_development_dependency 'json-ld', '~> 2.1' gem.add_development_dependency 'rdf-spec', '~> 2.2' gem.add_development_dependency 'rdf-rdfa', '~> 2.2' gem.add_development_dependency 'rdf-turtle', '~> 2.2' diff --git a/script/parse b/script/parse index a4f4349..c7bc0de 100755 --- a/script/parse +++ b/script/parse @@ -82,6 +82,10 @@ def run(input, options) end options[:output].puts xsl.apply_to(reader.rdfa).to_s end + when :jsonld + reader_class.new(input, options.merge(jsonld: true)) do |reader| + options[:output].puts reader.jsonld.to_json(::JSON::LD::JSON_STATE) + end when :inspect reader_class.new(input, options).each do |statement| num += 1 @@ -127,7
+131,6 @@ opts = GetoptLong.new( ["--output", "-o", GetoptLong::REQUIRED_ARGUMENT], ["--quiet", GetoptLong::NO_ARGUMENT], ["--registry", GetoptLong::REQUIRED_ARGUMENT], - ["--rdfa", GetoptLong::NO_ARGUMENT], ["--template", GetoptLong::REQUIRED_ARGUMENT], ["--uri", GetoptLong::REQUIRED_ARGUMENT], ["--validate", GetoptLong::NO_ARGUMENT], @@ -138,7 +141,6 @@ opts.each do |opt, arg| when '--debug' then logger.level = Logger::DEBUG when '--execute' then input = arg when '--format' then options[:output_format] = arg.to_sym - when '--rdfa' then options[:rdfa] = true when '--input-format' then options[:input_format] = arg.to_sym when '--quiet' options[:quiet] = options[:quiet].to_i + 1 diff --git a/spec/jsonld_reader_spec.rb b/spec/jsonld_reader_spec.rb new file mode 100644 index 0000000..d341b71 --- /dev/null +++ b/spec/jsonld_reader_spec.rb @@ -0,0 +1,888 @@ +# coding: utf-8 +$:.unshift "." +require 'spec_helper' +require 'rdf/spec/reader' + +describe RDF::Microdata::JsonLdReader do + let!(:doap) {File.expand_path("../../etc/doap.html", __FILE__)} + let!(:doap_nt) {File.expand_path("../../etc/doap.nt", __FILE__)} + let!(:registry_path) {File.expand_path("../test-files/test-registry.json", __FILE__)} + before :each do + @reader = RDF::Microdata::JsonLdReader.new(StringIO.new("")) + end + + context :interface do + subject {%( +
+

My name is Elizabeth.

+
+ )} + + it "should yield reader" do + inner = double("inner") + expect(inner).to receive(:called).with(RDF::Microdata::JsonLdReader) + RDF::Microdata::JsonLdReader.new(subject, base_uri: 'http://example/') do |reader| + inner.called(reader.class) + end + end + + it "should return reader" do + expect(RDF::Microdata::JsonLdReader.new(subject, base_uri: 'http://example/')).to be_a(RDF::Microdata::JsonLdReader) + end + + it "should not raise errors" do + expect { + RDF::Microdata::JsonLdReader.new(subject, validate: true, base_uri: 'http://example/') + }.not_to raise_error + end + + it "should yield statements" do + inner = double("inner") + expect(inner).to receive(:called).with(RDF::Statement).at_least(2) + RDF::Microdata::JsonLdReader.new(subject, base_uri: 'http://example/').each_statement do |statement| + inner.called(statement.class) + end + end + + it "should yield triples" do + inner = double("inner") + expect(inner).to receive(:called).at_least(2) + RDF::Microdata::JsonLdReader.new(subject, base_uri: 'http://example/').each_triple do |subject, predicate, object| + inner.called(subject.class, predicate.class, object.class) + end + end + + context "Microdata Reader with :jsonld option" do + it "returns a JsonLdReader instance" do + r = RDF::Microdata::Reader.new(StringIO.new(""), jsonld: true) + expect(r).to be_a(RDF::Microdata::JsonLdReader) + end + end + end + + context :parsing do + before :each do + @md_ctx = %q( +
+ %s +
+ ) + @nt_ctx = %q( + _:a . + %s + ) + end + + it "parses a simple graph" do + md = %q(

My name is Gregg Kellogg.

) + nt = %q(_:a "Gregg Kellogg" .) + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + + context "values" do + [ + [ + %q(

My name is Gregg Kellogg

), + %q(_:a "Gregg Kellogg" .) + ], + [ + %q( +

My name is Gregg

+

My name is Kellogg

+ ), + %q(_:a "Gregg", "Kellogg" .) + ], + [ + %q(

My name is Gregg Kellogg

), + %q( + _:a "Gregg Kellogg" . + _:a "Gregg Kellogg" . + ) + ], + [ + %q(

My name is Gregg Kellogg

), + %q(_:a "Gregg Kellogg" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(Bar), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a "2011-06-28Z"^^ .) + ], + [ + %q(), + %q(_:a "00:00:00Z"^^ .) + ], + [ + %q(), + %q(_:a "2011-06-28T00:00:00Z"^^ .) + ], + [ + %q(), + %q(_:a "P2011Y06M28DT00H00M00S"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a _:b .) + ], + [ + %q(), + %q(_:a "1"^^ .) + ], + [ + %q(), + %q(_:a "1.1"^^ .) + ], + [ + %q(), + %q(_:a "1.1e1"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "1"^^ .) + ], + [ + %q(), + %q(_:a "1.1"^^ .) + ], + [ + %q(), + %q(_:a "1.1e1"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + pending if [ + '', + '', + ].include?(md) + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "base_uri" do + before :each do + @nt_ctx = %q( + _:a . + %s + ) + end + + [ + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(Stéphane Corlosquet), + %q(_:a .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + expect(parse(@md_ctx % md, base_uri: 'http://example.com/')).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "itemid" do + before :each do + @md_ctx = %q( +
+ %s +
+ ) + @nt_ctx = %q( + . + %s + ) + end + + [ + [ + %q(

My name is Gregg Kellogg

), + %q( "Gregg Kellogg" .) + ], + [ + %q(), + %q( "foo" .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( "2011-06-28T00:00:00Z"^^ .) + ], + [ + %q(), + %q( .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "itemtype" do + { + "with no type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with empty type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with relative type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with single type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "with multipe types and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a , ; + "Amanda" ; + ] . + ) + ], + #"with no type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + #"with empty type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + #"with relative type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + "with single type and URI property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "with multipe types and URI property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a , ; + "Amanda" ; + ] . + ) + ], + "with inherited type and token property" => [ + %q( +
+

Name: Gregg

+
+

Name: Jeni

+
+
+ ), + %q( + @prefix md: . + @prefix schema: . + [ a schema:Person ; + schema:name "Gregg" ; + schema:knows [ schema:name "Jeni" ] + ] . + ) + ] + }.each do |name, (md, nt)| + it "#{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "itemref" do + { + "to single id" => + [ + %q( +
+
+

Name: Amanda

+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "to generate listed property values" => + [ + %q( +
+
+

My name is Gregg

+
+

My name is Kellogg

+
+ ), + %q( + [ a ; + "Gregg", "Kellogg" ; + ] . + ) + ], + #"to single id with different types" => + #[ + # %q( + #
+ #
+ #
+ #

Name: Amanda

+ #
+ # ), + # %q( + # [ a ; + # "Amanda" ; + # ] . + # [ a ; + # "Amanda" ; + # ] . + # ) + #], + "to multiple ids" => + [ + %q( +
+
+

Name: Amanda

+

Jazz Band

+
+ ), + %q( + [ a ; + "Amanda" ; + "Jazz Band" ; + ] . + ) + ], + "with chaining" => + [ + %q( +
+
+

Name: Amanda

+
+
+

Band: Jazz Band

+

Size: 12 players

+
+
+ ), + %q( + [ a ; + "Amanda" ; + [ + a ; + "Jazz Band"; + "12" + ] + ] . + ) + ], + "shared" => + [ + %q( +
+
+
+
+ Amanda +
+
+ ), + %q( + [ a ; _:a ] . + [ a ; _:a ] . + _:a "Amanda" . + ) + + ], + }.each do |name, (md, nt)| + it "parses #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + + it "catches infinite recursion", pending: true do + md = %( + + +
+
+
friend1
+
+
friend2
+
+
+
+
+ + ) + expect {parse(md, validate: true)}.to raise_error(RDF::ReaderError) + expect(@logger.to_s).to include("itemref recursion") + end + end + + context "propertyURI" do + context "no expansion" do + { + "http://foo/bar + baz => http://foo/baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#bar + baz => http://foo#baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#Type + bar + baz => http://foo#baz" => + [ + %q( +
+

Baz

+
+ ), + %q( + [ a ; + [ "Baz"]] . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "default propertyURI generation" do + { + "http://foo/bar + baz => http://foo/baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#bar + baz => http://foo#baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#Type + bar + baz => http://foo#baz" => + [ + %q( +
+

Baz

+
+ ), + %q( + [ a ; + [ "Baz"]] . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + end + + context "itemprop-reverse", skip: true do + { + "link" => [ + %q( +
+ William Shakespeare + +
+ ), + %q( + [ + a ; + "William Shakespeare" + ] . + ) + ], + "itemscope" => [ + %q( +
+ The ACME Shopping Mall on Structured Data Avenue + The ACME Shopping Mall is your one-stop paradise for all data-related shopping needs, from schemas to instance data +

Here is a list of shops inside:

+
+ Dan Brickley's Data Restaurant +
+
+ Ramanathan Guha's Meta Content Framework Bakery +
+
+ ), + %q( + _:a a ; + "The ACME Shopping Mall on Structured Data Avenue"; + "The ACME Shopping Mall is your one-stop paradise for all data-related shopping needs, from schemas to instance data" . + _:b a ; + "Dan Brickley's Data Restaurant"; + _:a . + _:c a ; + "Ramanathan Guha's Meta Content Framework Bakery"; + _:a . + ) + ], + "literal" => [ + %q( +
+ William Shakespeare + +
+ ), + %q( + _:a a ; + "William Shakespeare" . + ) + ], + "itemprop and itemprop-reverse" => [ + %q( +
+ Cryptography Users +
+
+ Alice +
+ 1977 +
+
+ ), + %q( + @prefix schema: . + @prefix md: . + + _:a a schema:Organization; + schema:name "Cryptography Users"; + schema:member _:b . + _:b a schema:OrganizationRole; + schema:startDate "1977"; + schema:member _:c; + schema:memberOf _:a . + _:c a schema:Person; + schema:name "Alice"; + schema:memberOf _:b . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "vocabulary expansion", pending: true do + it "always expands" do + md = %q( +
+ +
+ ) + ttl = %q( + [ a , ; + + ] . + ) + + expect(parse(md, vocab_expansion: true)).to be_equivalent_graph(ttl, logger: @logger) + end + end + + context "test-files", skip: true do + Dir.glob(File.join(File.expand_path(File.dirname(__FILE__)), "test-files", "*.html")).each do |md| + it "parses #{md}" do + test_file(md) + end + end + end + end + + def parse(input, options = {}) + @logger = RDF::Spec.logger + graph = options[:graph] || RDF::Graph.new + RDF::Microdata::Reader.new(input, { + logger: @logger, + rdfa: true, + validate: false, + base_uri: "http://example/", + registry: registry_path, + canonicalize: false}.merge(options)).each do |statement| + graph << statement + end + + # Remove any rdfa:usesVocabulary statements + graph.query(predicate: RDF::RDFA.usesVocabulary).each do |stmt| + graph.delete(stmt) + end + graph + end + + def test_file(filepath, options = {}) + graph = parse(File.open(filepath), options) + + ttl_string = File.read(filepath.sub('.html', '.ttl')) + expect(graph).to be_equivalent_graph(ttl_string, logger: @logger) + end +end From e6265437316b081c7978bf1751c41b324653d205 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Wed, 16 Aug 2017 15:10:48 -0700 Subject: [PATCH 06/12] Update CLI commands. --- examples/blog_posting.html | 34 ++++++++++++++++++++++++++++++++++ lib/rdf/microdata/format.rb | 10 ++++++++++ 2 files changed, 44 insertions(+) create mode 100644 examples/blog_posting.html diff --git a/examples/blog_posting.html b/examples/blog_posting.html new file mode 100644 index 0000000..9676054 --- /dev/null +++ b/examples/blog_posting.html @@ -0,0 +1,34 @@ + +My Blog +
+
+

Progress report

+

+ +
+

All in all, he's doing well with his swim lessons. The biggest thing was he had trouble + putting his head in, but we got it down.

+
+

Comments

+
+ +
+

Posted by: + Greg +

+

+
+

Ha!

+
+
+ +
+

Posted by: + Charlotte +

+

+
+

When you say "we got it down"...

+
+
+
diff --git a/lib/rdf/microdata/format.rb b/lib/rdf/microdata/format.rb index 193bb52..7384868 100644 --- a/lib/rdf/microdata/format.rb +++ b/lib/rdf/microdata/format.rb @@ -51,6 +51,11 @@ def self.cli_commands description: "Transform HTML+Microdata into HTML+RDFa", parse: false, help: "to-rdfa files ...\nTransform HTML+Microdata into HTML+RDFa", + filter: { + format: :microdata, + output_format: :jsonld + }, + option_use: {output_format: :disabled}, lambda: ->(files, options) do out = options[:output] || $stdout xsl = Nokogiri::XSLT(%( @@ -125,6 +130,11 @@ def self.cli_commands description: "Transform HTML+Microdata into JSON-LD", parse: false, help: "to-jsonld files ...\nTransform HTML+Microdata into JSON-LD", + filter: { + format: :microdata, + output_format: :rdfa + }, + option_use: {output_format: :disabled}, lambda: ->(files, options) do out = options[:output] || $stdout if files.empty? From 4930bf181421f4513b7d423d5e4b27b101ee3de9 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Thu, 17 Aug 2017 09:05:20 -0700 Subject: [PATCH 07/12] Update rdf dependency. --- rdf-microdata.gemspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdf-microdata.gemspec b/rdf-microdata.gemspec index 623d957..a192559 100755 --- a/rdf-microdata.gemspec +++ b/rdf-microdata.gemspec @@ -24,7 +24,7 @@ Gem::Specification.new do |gem| gem.required_ruby_version = '>= 2.2.2' gem.requirements = [] - gem.add_runtime_dependency 'rdf', '~> 2.2' + gem.add_runtime_dependency 'rdf', '~> 2.2', '>= 2.2.8' gem.add_runtime_dependency 'rdf-xsd', '~> 2.1' gem.add_runtime_dependency 'htmlentities', '~> 4.3' gem.add_runtime_dependency 'nokogiri' , '~> 1.7' From 9128422ada958b4593901eb827a47ad543db0aba Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Mon, 21 Aug 2017 10:03:50 -0600 Subject: [PATCH 08/12] Update rubyforge references. 
--- README.md | 2 +- lib/rdf/microdata.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7a696c2..c8d5da1 100755 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ The reader exposes a `#json` method, which can be used to retrieve the generated ## Resources * [RDF.rb][RDF.rb] -* [Documentation](http://rdf.rubyforge.org/microdata) +* [Documentation](http://www.rubydoc.info/github/ruby-rdf/rdf-microdata/) * [History](file:History.md) * [Microdata][] * [Microdata RDF][] diff --git a/lib/rdf/microdata.rb b/lib/rdf/microdata.rb index 101f09a..98b386b 100644 --- a/lib/rdf/microdata.rb +++ b/lib/rdf/microdata.rb @@ -15,7 +15,7 @@ module RDF # end # end # - # @see http://rdf.rubyforge.org/ + # @see http://www.rubydoc.info/github/ruby-rdf/rdf/ # @see http://www.w3.org/TR/2011/WD-microdata-20110525/ # # @author [Gregg Kellogg](http://greggkellogg.net/) From 735ed0245b16d4625172ea9470903f0c3009c4fd Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Mon, 21 Aug 2017 16:14:41 -0600 Subject: [PATCH 09/12] Remove rubyforge reference. --- rdf-microdata.gemspec | 1 - 1 file changed, 1 deletion(-) diff --git a/rdf-microdata.gemspec b/rdf-microdata.gemspec index a192559..283f2c4 100755 --- a/rdf-microdata.gemspec +++ b/rdf-microdata.gemspec @@ -10,7 +10,6 @@ Gem::Specification.new do |gem| gem.license = 'Unlicense' gem.summary = "Microdata reader for Ruby." gem.description = 'Reads HTML Microdata as RDF.' - gem.rubyforge_project = 'rdf-microdata' gem.authors = %w(Gregg Kellogg) gem.email = 'public-rdf-ruby@w3.org' From cdcea807f01d066dff2f2b46eb1a072411f80964 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Tue, 10 Oct 2017 16:14:20 -0700 Subject: [PATCH 10/12] Don't restrict rdfa and jsonld reader output on output format. 
--- Gemfile | 18 +++++++++--------- examples/to_jsonld.html | 34 ++++++++++++++++++++++++++++++++++ lib/rdf/microdata/format.rb | 6 ++---- rdf-microdata.gemspec | 6 +++--- 4 files changed, 48 insertions(+), 16 deletions(-) create mode 100644 examples/to_jsonld.html diff --git a/Gemfile b/Gemfile index e63746f..10db0a6 100644 --- a/Gemfile +++ b/Gemfile @@ -2,19 +2,19 @@ source "http://rubygems.org" gemspec -gem "rdf", github: "ruby-rdf/rdf", branch: "develop" -gem "rdf-rdfa", github: "ruby-rdf/rdf-rdfa", branch: "develop" -gem "rdf-xsd", github: "ruby-rdf/rdf-xsd", branch: "develop" +gem "rdf", git: "https://github.com/ruby-rdf/rdf", branch: "develop" +gem "rdf-rdfa", git: "https://github.com/ruby-rdf/rdf-rdfa", branch: "develop" +gem "rdf-xsd", git: "https://github.com/ruby-rdf/rdf-xsd", branch: "develop" gem "nokogumbo", '~> 1.4' group :development do gem 'linkeddata' - gem 'ebnf', github: "gkellogg/ebnf", branch: "develop" - gem 'rdf-aggregate-repo', github: "ruby-rdf/rdf-aggregate-repo", branch: "develop" - gem 'rdf-isomorphic', github: "ruby-rdf/rdf-isomorphic", branch: "develop" - gem "rdf-spec", github: "ruby-rdf/rdf-spec", branch: "develop" - gem 'rdf-turtle', github: "ruby-rdf/rdf-turtle", branch: "develop" - gem 'sxp', github: "dryruby/sxp.rb", branch: "develop" + gem 'ebnf', git: "https://github.com/gkellogg/ebnf", branch: "develop" + gem 'rdf-aggregate-repo', git: "https://github.com/ruby-rdf/rdf-aggregate-repo", branch: "develop" + gem 'rdf-isomorphic', git: "https://github.com/ruby-rdf/rdf-isomorphic", branch: "develop" + gem "rdf-spec", git: "https://github.com/ruby-rdf/rdf-spec", branch: "develop" + gem 'rdf-turtle', git: "https://github.com/ruby-rdf/rdf-turtle", branch: "develop" + gem 'sxp', git: "https://github.com/dryruby/sxp.rb", branch: "develop" end group :debug do diff --git a/examples/to_jsonld.html b/examples/to_jsonld.html new file mode 100644 index 0000000..9676054 --- /dev/null +++ b/examples/to_jsonld.html @@ -0,0 +1,34 @@ + +My Blog +
+
+

Progress report

+

+ +
+

All in all, he's doing well with his swim lessons. The biggest thing was he had trouble + putting his head in, but we got it down.

+
+

Comments

+
+ +
+

Posted by: + Greg +

+

+
+

Ha!

+
+
+ +
+

Posted by: + Charlotte +

+

+
+

When you say "we got it down"...

+
+
+
diff --git a/lib/rdf/microdata/format.rb b/lib/rdf/microdata/format.rb index 7384868..9eef5aa 100644 --- a/lib/rdf/microdata/format.rb +++ b/lib/rdf/microdata/format.rb @@ -52,8 +52,7 @@ def self.cli_commands parse: false, help: "to-rdfa files ...\nTransform HTML+Microdata into HTML+RDFa", filter: { - format: :microdata, - output_format: :jsonld + format: :microdata }, option_use: {output_format: :disabled}, lambda: ->(files, options) do @@ -131,8 +130,7 @@ def self.cli_commands parse: false, help: "to-jsonld files ...\nTransform HTML+Microdata into JSON-LD", filter: { - format: :microdata, - output_format: :rdfa + format: :microdata }, option_use: {output_format: :disabled}, lambda: ->(files, options) do diff --git a/rdf-microdata.gemspec b/rdf-microdata.gemspec index 283f2c4..a9cf6f9 100755 --- a/rdf-microdata.gemspec +++ b/rdf-microdata.gemspec @@ -24,13 +24,13 @@ Gem::Specification.new do |gem| gem.required_ruby_version = '>= 2.2.2' gem.requirements = [] gem.add_runtime_dependency 'rdf', '~> 2.2', '>= 2.2.8' - gem.add_runtime_dependency 'rdf-xsd', '~> 2.1' + gem.add_runtime_dependency 'rdf-xsd', '~> 2.2' gem.add_runtime_dependency 'htmlentities', '~> 4.3' - gem.add_runtime_dependency 'nokogiri' , '~> 1.7' + gem.add_runtime_dependency 'nokogiri' , '~> 1.8' gem.add_development_dependency 'equivalent-xml' , '~> 0.6' gem.add_development_dependency 'yard' , '~> 0.9' - gem.add_development_dependency 'rspec', '~> 3.5' + gem.add_development_dependency 'rspec', '~> 3.6' gem.add_development_dependency 'rspec-its', '~> 1.2' gem.add_development_dependency 'json-ld', '~> 2.1' From 570547cdc1a6145d1a04a7517e94f88a845418e0 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Tue, 10 Oct 2017 16:25:48 -0700 Subject: [PATCH 11/12] Fix to-rdfa and to-jsonld access of `options`. 
--- lib/rdf/microdata/format.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rdf/microdata/format.rb b/lib/rdf/microdata/format.rb index 9eef5aa..722da35 100644 --- a/lib/rdf/microdata/format.rb +++ b/lib/rdf/microdata/format.rb @@ -107,7 +107,7 @@ def self.cli_commands # If files are empty, either use options[::evaluate] input = options[:evaluate] ? StringIO.new(options[:evaluate]) : STDIN input.set_encoding(options.fetch(:encoding, Encoding::UTF_8)) - RDF::Microdata::Reader.new(input, options(rdfa: true)) do |reader| + RDF::Microdata::Reader.new(input, options.merge(rdfa: true)) do |reader| reader.rdfa.xpath("//text()").each do |txt| txt.content = txt.content.to_s.strip end @@ -139,7 +139,7 @@ def self.cli_commands # If files are empty, either use options[::evaluate] input = options[:evaluate] ? StringIO.new(options[:evaluate]) : STDIN input.set_encoding(options.fetch(:encoding, Encoding::UTF_8)) - RDF::Microdata::Reader.new(input, options(jsonld: true)) do |reader| + RDF::Microdata::Reader.new(input, options.merge(jsonld: true)) do |reader| out.puts reader.jsonld.to_json(::JSON::LD::JSON_STATE) end else From 5181927a6486aa5f6f10c2d4126b7d3f3868a2c1 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Tue, 10 Oct 2017 16:28:12 -0700 Subject: [PATCH 12/12] Version 2.2.2. --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index c043eea..b1b25a5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.1 +2.2.2