diff --git a/Gemfile b/Gemfile index 48f46d8..10db0a6 100644 --- a/Gemfile +++ b/Gemfile @@ -2,18 +2,19 @@ source "http://rubygems.org" gemspec -gem "rdf", github: "ruby-rdf/rdf", branch: "develop" -gem "rdf-rdfa", github: "ruby-rdf/rdf-rdfa", branch: "develop" -gem "rdf-xsd", github: "ruby-rdf/rdf-xsd", branch: "develop" +gem "rdf", git: "https://github.com/ruby-rdf/rdf", branch: "develop" +gem "rdf-rdfa", git: "https://github.com/ruby-rdf/rdf-rdfa", branch: "develop" +gem "rdf-xsd", git: "https://github.com/ruby-rdf/rdf-xsd", branch: "develop" gem "nokogumbo", '~> 1.4' group :development do - gem 'ebnf', github: "gkellogg/ebnf", branch: "develop" - gem 'rdf-aggregate-repo', github: "ruby-rdf/rdf-aggregate-repo", branch: "develop" - gem 'rdf-isomorphic', github: "ruby-rdf/rdf-isomorphic", branch: "develop" - gem "rdf-spec", github: "ruby-rdf/rdf-spec", branch: "develop" - gem 'rdf-turtle', github: "ruby-rdf/rdf-turtle", branch: "develop" - gem 'sxp', github: "dryruby/sxp.rb", branch: "develop" + gem 'linkeddata' + gem 'ebnf', git: "https://github.com/gkellogg/ebnf", branch: "develop" + gem 'rdf-aggregate-repo', git: "https://github.com/ruby-rdf/rdf-aggregate-repo", branch: "develop" + gem 'rdf-isomorphic', git: "https://github.com/ruby-rdf/rdf-isomorphic", branch: "develop" + gem "rdf-spec", git: "https://github.com/ruby-rdf/rdf-spec", branch: "develop" + gem 'rdf-turtle', git: "https://github.com/ruby-rdf/rdf-turtle", branch: "develop" + gem 'sxp', git: "https://github.com/dryruby/sxp.rb", branch: "develop" end group :debug do diff --git a/README.md b/README.md index 2cec89a..c8d5da1 100755 --- a/README.md +++ b/README.md @@ -60,11 +60,24 @@ Full documentation available on [Rubydoc.info][Microdata doc] * {RDF::Microdata::Reader} * {RDF::Microdata::Reader::Nokogiri} -### Additional vocabularies + +### RDFa-based Reader +There is an experimental reader based on transforming Microdata to RDFa within the DOM. 
To invoke +this, add the `rdfa: true` option to {RDF::Microdata::Reader.new}, or +use {RDF::Microdata::RdfaReader} directly. + +The reader exposes a `#rdfa` method, which can be used to retrieve the transformed HTML+RDFa. + +### JSON-LD-based Reader +There is an experimental reader based on transforming Microdata to JSON-LD. To invoke +this, add the `jsonld: true` option to {RDF::Microdata::Reader.new}, or +use {RDF::Microdata::JsonLdReader} directly. + +The reader exposes a `#jsonld` method, which can be used to retrieve the generated JSON-LD. ## Resources * [RDF.rb][RDF.rb] -* [Documentation](http://rdf.rubyforge.org/microdata) +* [Documentation](http://www.rubydoc.info/github/ruby-rdf/rdf-microdata/) * [History](file:History.md) * [Microdata][] * [Microdata RDF][] diff --git a/VERSION b/VERSION index c043eea..b1b25a5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.1 +2.2.2 diff --git a/examples/blog_posting.html b/examples/blog_posting.html new file mode 100644 index 0000000..9676054 --- /dev/null +++ b/examples/blog_posting.html @@ -0,0 +1,34 @@ + +My Blog +
+
+

Progress report

+

+ +
+

All in all, he's doing well with his swim lessons. The biggest thing was he had trouble + putting his head in, but we got it down.

+
+

Comments

+
+ +
+

Posted by: + Greg +

+

+
+

Ha!

+
+
+ +
+

Posted by: + Charlotte +

+

+
+

When you say "we got it down"...

+
+
+
diff --git a/examples/itemref.html b/examples/itemref.html new file mode 100644 index 0000000..b961767 --- /dev/null +++ b/examples/itemref.html @@ -0,0 +1,7 @@ +
+

1

+
+
+

test

+

2

+
diff --git a/examples/locomotive.html b/examples/locomotive.html new file mode 100644 index 0000000..e976ef9 --- /dev/null +++ b/examples/locomotive.html @@ -0,0 +1,11 @@ +
+
Name: +
Tank Locomotive (DB 80) +
Product code: +
33041 +
Scale: +
HO +
Digital: +
Delta +
diff --git a/examples/to_jsonld.html b/examples/to_jsonld.html new file mode 100644 index 0000000..9676054 --- /dev/null +++ b/examples/to_jsonld.html @@ -0,0 +1,34 @@ + +My Blog +
+
+

Progress report

+

+ +
+

All in all, he's doing well with his swim lessons. The biggest thing was he had trouble + putting his head in, but we got it down.

+
+

Comments

+
+ +
+

Posted by: + Greg +

+

+
+

Ha!

+
+
+ +
+

Posted by: + Charlotte +

+

+
+

When you say "we got it down"...

+
+
+
diff --git a/lib/rdf/microdata.rb b/lib/rdf/microdata.rb index e86650a..98b386b 100644 --- a/lib/rdf/microdata.rb +++ b/lib/rdf/microdata.rb @@ -15,18 +15,22 @@ module RDF # end # end # - # @see http://rdf.rubyforge.org/ + # @see http://www.rubydoc.info/github/ruby-rdf/rdf/ # @see http://www.w3.org/TR/2011/WD-microdata-20110525/ # # @author [Gregg Kellogg](http://greggkellogg.net/) module Microdata USES_VOCAB = RDF::URI("http://www.w3.org/ns/rdfa#usesVocabulary") + DEFAULT_REGISTRY = File.expand_path("../../../etc/registry.json", __FILE__) require 'rdf/microdata/format' require 'rdf/microdata/vocab' - autoload :Expansion, 'rdf/microdata/expansion' - autoload :Profile, 'rdf/microdata/profile' - autoload :Reader, 'rdf/microdata/reader' - autoload :VERSION, 'rdf/microdata/version' + autoload :Expansion, 'rdf/microdata/expansion' + autoload :JsonLdReader, 'rdf/microdata/jsonld_reader' + autoload :Profile, 'rdf/microdata/profile' + autoload :RdfaReader, 'rdf/microdata/rdfa_reader' + autoload :Reader, 'rdf/microdata/reader' + autoload :Registry, 'rdf/microdata/registry' + autoload :VERSION, 'rdf/microdata/version' end end diff --git a/lib/rdf/microdata/format.rb b/lib/rdf/microdata/format.rb index a11dca1..722da35 100644 --- a/lib/rdf/microdata/format.rb +++ b/lib/rdf/microdata/format.rb @@ -41,5 +41,117 @@ class Format < RDF::Format def self.detect(sample) !!sample.match(/<[^>]*(itemprop|itemtype|itemref|itemscope|itemid)[^>]*>/m) end + + ## + # Hash of CLI commands appropriate for this format + # @return [Hash{Symbol => Hash}] + def self.cli_commands + { + "to-rdfa": { + description: "Transform HTML+Microdata into HTML+RDFa", + parse: false, + help: "to-rdfa files ...\nTransform HTML+Microdata into HTML+RDFa", + filter: { + format: :microdata + }, + option_use: {output_format: :disabled}, + lambda: ->(files, options) do + out = options[:output] || $stdout + xsl = Nokogiri::XSLT(%( + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
).gsub(/^ /, '')) + if files.empty? + # If files are empty, either use options[::evaluate] + input = options[:evaluate] ? StringIO.new(options[:evaluate]) : STDIN + input.set_encoding(options.fetch(:encoding, Encoding::UTF_8)) + RDF::Microdata::Reader.new(input, options.merge(rdfa: true)) do |reader| + reader.rdfa.xpath("//text()").each do |txt| + txt.content = txt.content.to_s.strip + end + out.puts xsl.apply_to(reader.rdfa).to_s + end + else + files.each do |file| + RDF::Microdata::Reader.open(file, options.merge(rdfa: true)) do |reader| + reader.rdfa.xpath("//text()").each do |txt| + txt.content = txt.content.to_s.strip + end + out.puts xsl.apply_to(reader.rdfa).to_s + end + end + end + end + }, + "to-jsonld": { + description: "Transform HTML+Microdata into JSON-LD", + parse: false, + help: "to-jsonld files ...\nTransform HTML+Microdata into JSON-LD", + filter: { + format: :microdata + }, + option_use: {output_format: :disabled}, + lambda: ->(files, options) do + out = options[:output] || $stdout + if files.empty? + # If files are empty, either use options[::evaluate] + input = options[:evaluate] ? 
StringIO.new(options[:evaluate]) : STDIN + input.set_encoding(options.fetch(:encoding, Encoding::UTF_8)) + RDF::Microdata::Reader.new(input, options.merge(jsonld: true)) do |reader| + out.puts reader.jsonld.to_json(::JSON::LD::JSON_STATE) + end + else + files.each do |file| + RDF::Microdata::Reader.open(file, options.merge(jsonld: true)) do |reader| + out.puts reader.jsonld.to_json(::JSON::LD::JSON_STATE) + end + end + end + end + }, + } + end end end diff --git a/lib/rdf/microdata/jsonld_reader.rb b/lib/rdf/microdata/jsonld_reader.rb new file mode 100644 index 0000000..5bf6d3d --- /dev/null +++ b/lib/rdf/microdata/jsonld_reader.rb @@ -0,0 +1,251 @@ +require 'json/ld' +require 'nokogumbo' + +module RDF::Microdata + ## + # Update DOM to turn Microdata into JSON-LD and parse using the JSON-LD Reader + class JsonLdReader < JSON::LD::Reader + # The resulting JSON-LD + # @return [Hash] + attr_reader :jsonld + + def self.format(klass = nil) + if klass.nil? + RDF::Microdata::Format + else + super + end + end + + ## + # Initializes the JsonLdReader instance. 
+ # + # @param [IO, File, String] input + # the input stream to read + # @param [Hash{Symbol => Object}] options + # any additional options (see `RDF::Reader#initialize`) + # @return [reader] + # @yield [reader] `self` + # @yieldparam [RDF::Reader] reader + # @yieldreturn [void] ignored + # @raise [RDF::ReaderError] if _validate_ + def initialize(input = $stdin, options = {}, &block) + @options = options + log_debug('', "using JSON-LD transformation reader") + + input = case input + when ::Nokogiri::XML::Document, ::Nokogiri::HTML::Document then input + else + # Try to detect charset from input + options[:encoding] ||= input.charset if input.respond_to?(:charset) + + # Otherwise, default is utf-8 + options[:encoding] ||= 'utf-8' + options[:encoding] = options[:encoding].to_s if options[:encoding] + input = input.read if input.respond_to?(:read) + ::Nokogiri::HTML5(input.force_encoding(options[:encoding])) + end + + # Load registry + begin + registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY + log_debug('', "registry = #{registry_uri.inspect}") + Registry.load_registry(registry_uri) + rescue JSON::ParserError => e + log_fatal("Failed to parse registry: #{e.message}", exception: RDF::ReaderError) if (root.nil? && validate?) 
+ end + + @jsonld = {'@graph' => []} + + # Start with all top-level items + input.css("[itemscope]").each do |item| + next if item['itemprop'] # Only top-level items + jsonld['@graph'] << get_object(item) + end + + log_debug('', "Transformed document: #{jsonld.to_json(JSON::LD::JSON_STATE)}") + + # Rely on RDFa reader + super(jsonld.to_json, options, &block) + end + + private + # Return JSON-LD representation of an item + # @param [Nokogiri::XML::Element] item + # @param [Hash{Nokogiri::XML::Node => Hash}] + # @return [Hash] + def get_object(item, memory = {}) + if result = memory[item] + # Result is a reference to that item; assign a blank-node identifier if necessary + result['@id'] ||= alloc_bnode + return result + end + + result = {} + memory[item] = result + + # If the item has a global identifier, add an entry to result called "@id" whose value is the global identifier of item. + result['@id'] = item['itemid'].to_s if item['itemid'] + + # If the item has any item types, add an entry to result called "@type" whose value is an array listing the item types of item, in the order they were specified on the itemtype attribute. + if item['itemtype'] + # Only absolute URLs + types = item.attribute('itemtype'). + remove. + to_s. + split(/\s+/). + select {|t| RDF::URI(t).absolute?} + if vocab = types.first + vocab = Registry.find(vocab) || begin + type_vocab = vocab.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') unless vocab.nil? + Registry.new(type_vocab) if type_vocab + end + (result['@context'] = {})['@vocab'] = vocab.uri.to_s if vocab + result['@type'] = types unless types.empty? 
+ end + end + + # For each element element that has one or more property names and is one of the properties of the item item, in the order those elements are given by the algorithm that returns the properties of an item, run the following substeps + item_properties(item).each do |element| + value = if element['itemscope'] + get_object(element, memory) + else + property_value(element) + end + element['itemprop'].to_s.split(/\s+/).each do |prop| + result[prop] ||= [] << value + end + end + + result + end + + ## + # + # @param [Nokogiri::XML::Element] item + # @return [Array] + # List of property elements for an item + def item_properties(item) + results, memory, pending = [], [item], item.children.select(&:element?) + log_debug(item, "item_properties") + + # If root has an itemref attribute, split the value of that itemref attribute on spaces. For each resulting token ID, if there is an element in the document whose ID is ID, then add the first such element to pending. + item['itemref'].to_s.split(/\s+/).each do |ref| + if referenced = referenced = item.at_css("##{ref}") + pending << referenced + end + end + + while !pending.empty? + current = pending.shift + # Error + break if memory.include?(current) + memory << current + + # If current does not have an itemscope attribute, then: add all the child elements of current to pending. + pending += current.children.select(&:element?) unless current['itemscope'] + + # If current has an itemprop attribute specified and has one or more property names, then add current to results. + results << current unless current['itemprop'].to_s.split(/\s+/).empty? 
+ end + + results + end + + ## + # + def property_value(element) + base = element.base || base_uri + log_debug(element) {"property_value(#{element.name}): base #{base.inspect}"} + value = case + when element.has_attribute?('itemscope') + {} + when element.has_attribute?('content') + if element.language + {"@value" => element['content'].to_s.strip, language: element.language} + else + element['content'].to_s.strip + end + when %w(data meter).include?(element.name) && element.attribute('value') + # XXX parse as number? + {"@value" => element['value'].to_s.strip} + when %w(audio embed iframe img source track video).include?(element.name) + {"@id" => uri(element.attribute('src'), base).to_s} + when %w(a area link).include?(element.name) + {"@id" => uri(element.attribute('href'), base).to_s} + when %w(object).include?(element.name) + {"@id" => uri(element.attribute('data'), base).to_s} + when %w(time).include?(element.name) + # use datatype? + (element.attribute('datetime') || element.text).to_s.strip + else + if element.language + {"@value" => element.inner_text.to_s.strip, language: element.language} + else + element.inner_text.to_s.strip + end + end + log_debug(element) {" #{value.inspect}"} + value + end + + # Allocate a new blank node identifier + # @return [String] + def alloc_bnode + @bnode_base ||= "_:a" + res = @bnode_base + @bnode_base = res.succ + res + end + + # Fixme, what about xml:base relative to element? + def uri(value, base = nil) + value = if base + base = uri(base) unless base.is_a?(RDF::URI) + base.join(value.to_s) + else + RDF::URI(value.to_s) + end + value.validate! if validate? + value.canonicalize! if canonicalize? + value = RDF::URI.intern(value) if intern? + value + end + end +end + +# Monkey Patch Nokogiri +module Nokogiri::XML + class Element + + ## + # Get any xml:base in effect for this element + def base + if @base.nil? + @base = attributes['xml:base'] || + (parent && parent.element? && parent.base) || + false + end + + @base == false ? 
nil : @base + end + + + ## + # Get any xml:lang or lang in effect for this element + def language + if @language.nil? + language = case + when self["xml:lang"] + self["xml:lang"].to_s + when self["lang"] + self["lang"].to_s + else + parent && parent.element? && parent.language + end + end + @language == false ? nil : @language + end + + end +end diff --git a/lib/rdf/microdata/rdfa_reader.rb b/lib/rdf/microdata/rdfa_reader.rb new file mode 100644 index 0000000..9ca15d4 --- /dev/null +++ b/lib/rdf/microdata/rdfa_reader.rb @@ -0,0 +1,132 @@ +require 'rdf/rdfa' +require 'nokogumbo' + +module RDF::Microdata + ## + # Update DOM to turn Microdata into RDFa and parse using the RDFa Reader + class RdfaReader < RDF::RDFa::Reader + # The transformed DOM using RDFa + # @return [RDF::HTML::Document] + attr_reader :rdfa + + def self.format(klass = nil) + if klass.nil? + RDF::Microdata::Format + else + super + end + end + + ## + # Initializes the RdfaReader instance. + # + # @param [IO, File, String] input + # the input stream to read + # @param [Hash{Symbol => Object}] options + # any additional options (see `RDF::Reader#initialize`) + # @return [reader] + # @yield [reader] `self` + # @yieldparam [RDF::Reader] reader + # @yieldreturn [void] ignored + # @raise [RDF::ReaderError] if _validate_ + def initialize(input = $stdin, options = {}, &block) + @options = options + log_debug('', "using RDFa transformation reader") + + input = case input + when ::Nokogiri::XML::Document, ::Nokogiri::HTML::Document then input + else + # Try to detect charset from input + options[:encoding] ||= input.charset if input.respond_to?(:charset) + + # Otherwise, default is utf-8 + options[:encoding] ||= 'utf-8' + options[:encoding] = options[:encoding].to_s if options[:encoding] + input = input.read if input.respond_to?(:read) + ::Nokogiri::HTML5(input.force_encoding(options[:encoding])) + end + + # Load registry + begin + registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY + 
log_debug('', "registry = #{registry_uri.inspect}") + Registry.load_registry(registry_uri) + rescue JSON::ParserError => e + log_fatal("Failed to parse registry: #{e.message}", exception: RDF::ReaderError) if (root.nil? && validate?) + end + + # For all members having @itemscope + input.css("[itemscope]").each do |item| + # Get @itemtypes to create @type and @vocab + item.attribute('itemscope').remove + if item['itemtype'] + # Only absolute URLs + types = item.attribute('itemtype'). + remove. + to_s. + split(/\s+/). + select {|t| RDF::URI(t).absolute?} + + item['typeof'] = types.join(' ') unless types.empty? + if vocab = types.first + vocab = Registry.find(vocab) || begin + type_vocab = vocab.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') unless vocab.nil? + Registry.new(type_vocab) if type_vocab + end + item['vocab'] = vocab.uri.to_s if vocab + end + end + + # Change each itemid attribute to an resource attribute with the same value + if item['itemid'] + id = item.attribute('itemid').remove + item[item['itemprop'] ? 'resource' : 'about'] = id + else + # Otherwise, ensure that @typeof has at least an empty value + item['typeof'] ||= '' + end + end + + # Add @resource for all itemprop values of object based on a @data value + input.css("object[itemprop][data]").each do |item| + item['resource'] ||= item['data'] + end + + # Replace all @itemprop values with @property + input.css("[itemprop]").each {|item| item['property'] = item.attribute('itemprop').remove} + + # Wrap all @itemref properties + input.css("[itemref]").each do |item| + item_vocab = item['vocab'] || item.ancestors.detect {|a| a.attribute('vocab')} + item_vocab = item_vocab.to_s if item_vocab + + item.attribute('itemref').remove.to_s.split(/\s+/).each do |ref| + if referenced = input.css("##{ref}") + # Add @vocab to referenced using the closest ansestor having @vocab of item. 
+ # If the element with id reference has no resource attribute, add a resource attribute whose value is a NUMBER SIGN U+0023 followed by reference to the element. + # If the element with id reference has no typeof attribute, add a typeof="rdfa:Pattern" attribute to the element. + referenced.wrap(%(
RDF::Resource}] maps RDF elements (items) to resources + attr_reader :memory + ## # Returns the base URI determined by this reader. # @@ -36,109 +37,46 @@ def base_uri @options[:base_uri] end - # Interface to registry - class Registry - # @return [RDF::URI] Prefix of vocabulary - attr_reader :uri - - # @return [Hash] properties - attr_reader :properties - - ## - # Initialize the registry from a URI or file path - # - # @param [String] registry_uri - def self.load_registry(registry_uri) - return if @registry_uri == registry_uri - - json = RDF::Util::File.open_file(registry_uri) { |f| JSON.load(f) } - - @prefixes = {} - json.each do |prefix, elements| - next unless elements.is_a?(Hash) - properties = elements.fetch("properties", {}) - @prefixes[prefix] = Registry.new(prefix, properties) - end - @registry_uri = registry_uri - end - - ## - # Initialize registry for a particular prefix URI - # - # @param [RDF::URI] prefixURI - # @param [Hash] properties ({}) - def initialize(prefixURI, properties = {}) - @uri = prefixURI - @properties = properties - @property_base = prefixURI.to_s - # Append a '#' for fragment if necessary - @property_base += '#' unless %w(/ #).include?(@property_base[-1,1]) - end + ## + # Reader options + # @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Reader#options-class_method + def self.options + super + [ + RDF::CLI::Option.new( + symbol: :rdfa, + datatype: TrueClass, + on: ["--rdfa"], + description: "Transform and parse as RDFa.") {true}, + ] + end - ## - # Find a registry entry given a type URI - # - # @param [RDF::URI] type - # @return [Registry] - def self.find(type) - @prefixes ||= {} - k = @prefixes.keys.detect {|key| type.to_s.index(key) == 0 } - @prefixes[k] if k - end - - ## - # Generate a predicateURI given a `name` - # - # @param [#to_s] name - # @param [Hash{}] ec Evaluation Context - # @return [RDF::URI] - def predicateURI(name, ec) - u = RDF::URI(name) - # 1) If _name_ is an _absolute URL_, return _name_ as a _URI reference_ - 
return u if u.absolute? - - n = frag_escape(name) - if ec[:current_type].nil? - # 2) If current type from context is null, there can be no current vocabulary. - # Return the URI reference that is the document base with its fragment set to the fragment-escaped value of name - u = RDF::URI(ec[:document_base].to_s) - u.fragment = frag_escape(name) - u - else - # 4) If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/). - RDF::URI(@property_base + n) + ## + # Redirect for RDFa Reader given `:rdfa` option + # + # @private + def self.new(input = nil, options = {}, &block) + klass = if options[:rdfa] + # Requires rdf-rdfa gem to be loaded + begin + require 'rdf/rdfa' + rescue LoadError + raise ReaderError, "Use of RDFa-based reader requires rdf-rdfa gem" end - end - - ## - # Yield a equivalentProperty or subPropertyOf if appropriate - # - # @param [RDF::URI] predicateURI - # @yield equiv - # @yieldparam [RDF::URI] equiv - def expand(predicateURI) - tok = tokenize(predicateURI) - if @properties[tok].is_a?(Hash) - value = @properties[tok].fetch("subPropertyOf", nil) - value ||= @properties[tok].fetch("equivalentProperty", nil) - - Array(value).each {|equiv| yield RDF::URI(equiv)} + RdfaReader + elsif options[:jsonld] + # Requires rdf-rdfa gem to be loaded + begin + require 'json/ld' + rescue LoadError + raise ReaderError, "Use of JSON-LD-based reader requires json-ld gem" end + JsonLdReader + else + self end - - ## - # Turn a predicateURI into a simple token - # @param [RDF::URI] predicateURI - # @return [String] - def tokenize(predicateURI) - predicateURI.to_s.sub(@property_base, '') - end - - ## - # Fragment escape a name - def frag_escape(name) - name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase} - end + 
reader = klass.allocate + reader.send(:initialize, input, options, &block) + reader end ## @@ -178,12 +116,12 @@ def initialize(input = $stdin, options = {}, &block) log_error("Empty document") if root.nil? log_error(doc_errors.map(&:message).uniq.join("\n")) if !doc_errors.empty? - log_debug(@doc, "library = #{@library}") + log_debug('', "library = #{@library}") # Load registry begin - registry_uri = options[:registry] || DEFAULT_REGISTRY - log_debug(@doc, "registry = #{registry_uri.inspect}") + registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY + log_debug('', "registry = #{registry_uri.inspect}") Registry.load_registry(registry_uri) rescue JSON::ParserError => e log_fatal("Failed to parse registry: #{e.message}", exception: RDF::ReaderError) if (root.nil? && validate?) @@ -270,6 +208,7 @@ def add_triple(node, subject, predicate, object) # Parsing a Microdata document (this is *not* the recursive method) def parse_whole_document(doc, base) base = doc_base(base) + @memory = {} options[:base_uri] = if (base) # Strip any fragment from base base = base.to_s.split('#').first @@ -280,15 +219,9 @@ def parse_whole_document(doc, base) log_info(nil) {"parse_whole_doc: base='#{base}'"} - ec = { - memory: {}, - current_type: nil, - current_vocabulary: nil, - document_base: base, - } # 1) For each element that is also a top-level item, Generate the triples for that item using the evaluation context. 
getItems.each do |el| - log_depth {generate_triples(el, ec)} + log_depth {generate_triples(el, Registry.new(nil))} end log_info(doc, "parse_whole_doc: traversal complete") @@ -298,12 +231,11 @@ def parse_whole_document(doc, base) # Generate triples for an item # # @param [RDF::Resource] item - # @param [Hash{Symbol => Object}] ec + # @param [Registry] vocab # @option ec [Hash{Nokogiri::XML::Element} => RDF::Resource] memory - # @option ec [RDF::Resource] :current_type + # @option ec [RDF::Resource] :current_vocabulary # @return [RDF::Resource] - def generate_triples(item, ec = {}) - memory = ec[:memory] + def generate_triples(item, vocab) # 1) If there is an entry for item in memory, then let subject be the subject of that entry. Otherwise, if item has a global identifier and that global identifier is an absolute URL, let subject be that global identifier. Otherwise, let subject be a new blank node. subject = if memory.include?(item.node) memory[item.node][:subject] @@ -312,12 +244,13 @@ def generate_triples(item, ec = {}) end || RDF::Node.new memory[item.node] ||= {} - log_debug(item) {"gentrips(2): subject=#{subject.inspect}, current_type: #{ec[:current_type]}"} + log_debug(item) {"gentrips(2): subject=#{subject.inspect}, vocab: #{vocab.inspect}"} # 2) Add a mapping from item to subject in memory, if there isn't one already. memory[item.node][:subject] ||= subject # 3) For each type returned from element.itemType of the element defining the item. + # 4) Set vocab to the first value returned from element.itemType of the element defining the item. type = nil item.attribute('itemtype').to_s.split(' ').map{|n| uri(n)}.select(&:absolute?).each do |t| # 3.1. If type is an absolute URL, generate the following triple: @@ -325,36 +258,26 @@ def generate_triples(item, ec = {}) add_triple(item, subject, RDF.type, t) end - # 4) Set type to the first value returned from element.itemType of the element defining the item. 
- - # 5) Otherwise, set type to current type from the Evaluation Context if not empty. - type ||= ec[:current_type] - log_debug(item) {"gentrips(5): type=#{type.inspect}"} - - # 6) If the registry contains a URI prefix that is a character for character match of type up to the length of the URI prefix, set vocab as that URI prefix. - vocab = Registry.find(type) - - # 7) Otherwise, if type is not empty, construct vocab by removing everything following the last SOLIDUS U+002F ("/") or NUMBER SIGN U+0023 ("#") from the path component of type. - vocab ||= begin - type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') - log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"} - Registry.new(type_vocab) + # 6) If the registry contains a URI prefix that is a character for character match of vocab up to the length of the URI prefix, set vocab as that URI prefix. + if type || vocab.nil? + vocab = Registry.find(type) || begin + type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') unless type.nil? + log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"} + Registry.new(type_vocab) + end end - # 8) Update evaluation context setting current vocabulary to vocab. - ec[:current_vocabulary] = vocab + # Otherwise, use vocab from evaluation context + log_debug(item) {"gentrips(8): vocab: #{vocab.inspect}"} # 9. For each element _element_ that has one or more property names and is one of the properties of the item _item_, run the following substep: props = item_properties(item) # 9.1. For each name name in element's property names, run the following substeps: props.each do |element| element.attribute('itemprop').to_s.split(' ').compact.each do |name| - log_debug(item) {"gentrips(9.1): name=#{name.inspect}, type=#{type}"} - # 9.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab. 
- ec_new = ec.merge({current_type: type, current_vocabulary: vocab}) - + log_debug(item) {"gentrips(9.1): name=#{name.inspect}, vocab=#{vocab.inspect}"} # 9.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate. - predicate = vocab.predicateURI(name, ec_new) + predicate = vocab.predicateURI(name, base_uri) # 9.1.3) Let value be the property value of element. value = property_value(element) @@ -362,7 +285,7 @@ def generate_triples(item, ec = {}) # 9.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps. if value.is_a?(Hash) - value = generate_triples(element, ec_new) + value = generate_triples(element, vocab) log_debug(item) {"gentrips(9.1.4): value=#{value.inspect}"} end @@ -384,11 +307,9 @@ def generate_triples(item, ec = {}) props.each do |element| element.attribute('itemprop-reverse').to_s.split(' ').compact.each do |name| log_debug(item) {"gentrips(10.1): name=#{name.inspect}"} - # 10.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab. - ec_new = ec.merge({current_type: type, current_vocabulary: vocab}) # 10.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate. - predicate = vocab.predicateURI(name, ec_new) + predicate = vocab.predicateURI(name, base_uri) # 10.1.3) Let value be the property value of element. value = property_value(element) @@ -396,7 +317,7 @@ def generate_triples(item, ec = {}) # 10.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps. 
if value.is_a?(Hash) - value = generate_triples(element, ec_new) + value = generate_triples(element, vocab) log_debug(item) {"gentrips(10.1.4): value=#{value.inspect}"} elsif value.is_a?(RDF::Literal) # 10.1.5) Otherwise, if value is a literal, ignore the value and continue to the next name; it is an error for the value of @itemprop-reverse to be a literal @@ -432,13 +353,13 @@ def item_properties(item, reverse = false) # To crawl the properties of an element root with a list memory, the user agent must run the following steps. These steps either fail or return a list with a count of errors. The count of errors is used as part of the authoring conformance criteria below. # # @param [Nokogiri::XML::Element] root - # @param [Array] memory + # @param [Array] memo # @param [Boolean] reverse crawl reverse properties # @return [Array] # Resultant elements - def crawl_properties(root, memory, reverse) - # 1. If root is in memory, then the algorithm fails; abort these steps. - raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root) + def crawl_properties(root, memo, reverse) + # 1. If root is in memo, then the algorithm fails; abort these steps. + raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memo.include?(root) # 2. Collect all the elements in the item root; let results be the resulting list of elements, and errors be the resulting count of errors. results = elements_in_item(root) @@ -447,13 +368,13 @@ def crawl_properties(root, memory, reverse) # 3. Remove any elements from results that do not have an @itemprop (@itemprop-reverse) attribute specified. results = results.select {|e| e.has_attribute?(reverse ? 'itemprop-reverse' : 'itemprop')} - # 4. Let new memory be a new list consisting of the old list memory with the addition of root. - raise CrawlFailure, "itemref recursion" if memory.detect {|n| root.node.object_id == n.node.object_id} - new_memory = memory + [root] + # 4. 
Let new memo be a new list consisting of the old list memo with the addition of root. + raise CrawlFailure, "itemref recursion" if memo.detect {|n| root.node.object_id == n.node.object_id} + new_memo = memo + [root] - # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memory as the memory. + # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memo as the memo. results.select {|e| e.has_attribute?('itemscope')}.each do |element| - log_depth {crawl_properties(element, new_memory, reverse)} + log_depth {crawl_properties(element, new_memo, reverse)} end results @@ -469,7 +390,7 @@ def crawl_properties(root, memory, reverse) def elements_in_item(root) # Let results and pending be empty lists of elements. # Let errors be zero. - results, memory, errors = [], [], 0 + results, memo, errors = [], [], 0 # Add all the children elements of root to pending. pending = root.elements @@ -487,13 +408,13 @@ def elements_in_item(root) # Loop: Remove an element from pending and let current be that element. while current = pending.shift - if memory.include?(current) + if memo.include?(current) raise CrawlFailure, "elements_in_item: results already includes #{current.inspect}" elsif !current.has_attribute?('itemscope') # If current is not already in results and current does not have an itemscope attribute, then: add all the child elements of current to pending. pending += current.elements end - memory << current + memo << current # If current is not already in results, then: add current to results. 
results << current unless results.include?(current) diff --git a/lib/rdf/microdata/reader/nokogiri.rb b/lib/rdf/microdata/reader/nokogiri.rb index a77bb30..f148516 100644 --- a/lib/rdf/microdata/reader/nokogiri.rb +++ b/lib/rdf/microdata/reader/nokogiri.rb @@ -103,6 +103,12 @@ def elements NodeSetProxy.new(@node.elements, self) end + ## + # Rational debug output + def to_str + @node.path + end + ## # Proxy for everything else to @node def method_missing(method, *args) diff --git a/lib/rdf/microdata/registry.rb b/lib/rdf/microdata/registry.rb new file mode 100644 index 0000000..f7940f8 --- /dev/null +++ b/lib/rdf/microdata/registry.rb @@ -0,0 +1,109 @@ +require 'json' +module RDF::Microdata + + # Interface to registry + class Registry + # @return [RDF::URI] Prefix of vocabulary + attr_reader :uri + + # @return [Hash] properties + attr_reader :properties + + ## + # Initialize the registry from a URI or file path + # + # @param [String] registry_uri + def self.load_registry(registry_uri) + return if @registry_uri == registry_uri + + json = RDF::Util::File.open_file(registry_uri) { |f| ::JSON.load(f) } + + @prefixes = {} + json.each do |prefix, elements| + next unless elements.is_a?(Hash) + properties = elements.fetch("properties", {}) + @prefixes[prefix] = Registry.new(prefix, properties) + end + @registry_uri = registry_uri + end + + ## + # Initialize registry for a particular prefix URI + # + # @param [RDF::URI] prefixURI + # @param [Hash] properties ({}) + def initialize(prefixURI, properties = {}) + @uri = prefixURI + @properties = properties + @property_base = prefixURI.to_s + # Append a '#' for fragment if necessary + @property_base += '#' unless %w(/ #).include?(@property_base[-1,1]) + end + + ## + # Find a registry entry given a type URI + # + # @param [RDF::URI] type + # @return [Registry] + def self.find(type) + @prefixes ||= {} + k = @prefixes.keys.detect {|key| type.to_s.index(key) == 0 } + @prefixes[k] if k + end + + ## + # Generate a predicateURI given a 
`name` + # + # @param [#to_s] name + # @param [Hash{}] ec Evaluation Context + # @return [RDF::URI] + def predicateURI(name, base_uri) + u = RDF::URI(name) + # 1) If _name_ is an _absolute URL_, return _name_ as a _URI reference_ + return u if u.absolute? + + n = frag_escape(name) + if uri.nil? + # 2) If current vocabulary from context is null, there can be no current vocabulary. + # Return the URI reference that is the document base with its fragment set to the fragment-escaped value of name + u = RDF::URI(base_uri.to_s) + u.fragment = frag_escape(name) + u + else + # 4) If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/). + RDF::URI(@property_base + n) + end + end + + ## + # Yield a equivalentProperty or subPropertyOf if appropriate + # + # @param [RDF::URI] predicateURI + # @yield equiv + # @yieldparam [RDF::URI] equiv + def expand(predicateURI) + tok = tokenize(predicateURI) + if @properties[tok].is_a?(Hash) + value = @properties[tok].fetch("subPropertyOf", nil) + value ||= @properties[tok].fetch("equivalentProperty", nil) + + Array(value).each {|equiv| yield RDF::URI(equiv)} + end + end + + ## + # Turn a predicateURI into a simple token + # @param [RDF::URI] predicateURI + # @return [String] + def tokenize(predicateURI) + predicateURI.to_s.sub(@property_base, '') + end + + ## + # Fragment escape a name + def frag_escape(name) + name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase} + end + end + +end \ No newline at end of file diff --git a/rdf-microdata.gemspec b/rdf-microdata.gemspec index 459fc28..a9cf6f9 100755 --- a/rdf-microdata.gemspec +++ b/rdf-microdata.gemspec @@ -10,7 +10,6 @@ Gem::Specification.new do |gem| gem.license = 'Unlicense' gem.summary = "Microdata reader for Ruby." 
gem.description = 'Reads HTML Microdata as RDF.' - gem.rubyforge_project = 'rdf-microdata' gem.authors = %w(Gregg Kellogg) gem.email = 'public-rdf-ruby@w3.org' @@ -24,16 +23,17 @@ Gem::Specification.new do |gem| gem.required_ruby_version = '>= 2.2.2' gem.requirements = [] - gem.add_runtime_dependency 'rdf', '~> 2.2' - gem.add_runtime_dependency 'rdf-xsd', '~> 2.1' + gem.add_runtime_dependency 'rdf', '~> 2.2', '>= 2.2.8' + gem.add_runtime_dependency 'rdf-xsd', '~> 2.2' gem.add_runtime_dependency 'htmlentities', '~> 4.3' - gem.add_runtime_dependency 'nokogiri' , '~> 1.7' + gem.add_runtime_dependency 'nokogiri' , '~> 1.8' gem.add_development_dependency 'equivalent-xml' , '~> 0.6' gem.add_development_dependency 'yard' , '~> 0.9' - gem.add_development_dependency 'rspec', '~> 3.5' + gem.add_development_dependency 'rspec', '~> 3.6' gem.add_development_dependency 'rspec-its', '~> 1.2' + gem.add_development_dependency 'json-ld', '~> 2.1' gem.add_development_dependency 'rdf-spec', '~> 2.2' gem.add_development_dependency 'rdf-rdfa', '~> 2.2' gem.add_development_dependency 'rdf-turtle', '~> 2.2' diff --git a/script/parse b/script/parse index 5eccebc..c7bc0de 100755 --- a/script/parse +++ b/script/parse @@ -19,7 +19,8 @@ def run(input, options) start = Time.new num = 0 - if options[:output_format] == :ntriples || options[:quiet] + case options[:output_format] + when :ntriples, :quiet reader_class.new(input, options).each do |statement| num += 1 if options[:quiet] @@ -28,7 +29,64 @@ def run(input, options) options[:output].puts statement.to_ntriples end end - elsif options[:output_format] == :inspect + when :rdfa + xsl = Nokogiri::XSLT(%( + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ).gsub(/^ /, '')) + reader_class.new(input, options.merge(rdfa: true)) do |reader| + reader.rdfa.xpath("//text()").each do |txt| + txt.content = txt.content.to_s.strip + end + options[:output].puts xsl.apply_to(reader.rdfa).to_s + end + when :jsonld + 
reader_class.new(input, options.merge(jsonld: true)) do |reader| + options[:output].puts reader.jsonld.to_json(::JSON::LD::JSON_STATE) + end + when :inspect reader_class.new(input, options).each do |statement| num += 1 options[:output].puts statement.inspect @@ -55,6 +113,7 @@ logger.formatter = lambda {|severity, datetime, progname, msg| "#{severity}: #{m options = { verbose: false, validate: false, + rdfa: false, logger: logger, output: STDOUT, output_format: :turtle, diff --git a/spec/jsonld_reader_spec.rb b/spec/jsonld_reader_spec.rb new file mode 100644 index 0000000..d341b71 --- /dev/null +++ b/spec/jsonld_reader_spec.rb @@ -0,0 +1,888 @@ +# coding: utf-8 +$:.unshift "." +require 'spec_helper' +require 'rdf/spec/reader' + +describe RDF::Microdata::JsonLdReader do + let!(:doap) {File.expand_path("../../etc/doap.html", __FILE__)} + let!(:doap_nt) {File.expand_path("../../etc/doap.nt", __FILE__)} + let!(:registry_path) {File.expand_path("../test-files/test-registry.json", __FILE__)} + before :each do + @reader = RDF::Microdata::JsonLdReader.new(StringIO.new("")) + end + + context :interface do + subject {%( +
+

My name is Elizabeth.

+
+ )} + + it "should yield reader" do + inner = double("inner") + expect(inner).to receive(:called).with(RDF::Microdata::JsonLdReader) + RDF::Microdata::JsonLdReader.new(subject, base_uri: 'http://example/') do |reader| + inner.called(reader.class) + end + end + + it "should return reader" do + expect(RDF::Microdata::JsonLdReader.new(subject, base_uri: 'http://example/')).to be_a(RDF::Microdata::JsonLdReader) + end + + it "should not raise errors" do + expect { + RDF::Microdata::JsonLdReader.new(subject, validate: true, base_uri: 'http://example/') + }.not_to raise_error + end + + it "should yield statements" do + inner = double("inner") + expect(inner).to receive(:called).with(RDF::Statement).at_least(2) + RDF::Microdata::JsonLdReader.new(subject, base_uri: 'http://example/').each_statement do |statement| + inner.called(statement.class) + end + end + + it "should yield triples" do + inner = double("inner") + expect(inner).to receive(:called).at_least(2) + RDF::Microdata::JsonLdReader.new(subject, base_uri: 'http://example/').each_triple do |subject, predicate, object| + inner.called(subject.class, predicate.class, object.class) + end + end + + context "Microdata Reader with :jsonld option" do + it "returns a JsonLdReader instance" do + r = RDF::Microdata::Reader.new(StringIO.new(""), jsonld: true) + expect(r).to be_a(RDF::Microdata::JsonLdReader) + end + end + end + + context :parsing do + before :each do + @md_ctx = %q( +
+ %s +
+ ) + @nt_ctx = %q( + _:a . + %s + ) + end + + it "parses a simple graph" do + md = %q(

My name is Gregg Kellogg.

) + nt = %q(_:a "Gregg Kellogg" .) + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + + context "values" do + [ + [ + %q(

My name is Gregg Kellogg

), + %q(_:a "Gregg Kellogg" .) + ], + [ + %q( +

My name is Gregg

+

My name is Kellogg

+ ), + %q(_:a "Gregg", "Kellogg" .) + ], + [ + %q(

My name is Gregg Kellogg

), + %q( + _:a "Gregg Kellogg" . + _:a "Gregg Kellogg" . + ) + ], + [ + %q(

My name is Gregg Kellogg

), + %q(_:a "Gregg Kellogg" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(Bar), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a "2011-06-28Z"^^ .) + ], + [ + %q(), + %q(_:a "00:00:00Z"^^ .) + ], + [ + %q(), + %q(_:a "2011-06-28T00:00:00Z"^^ .) + ], + [ + %q(), + %q(_:a "P2011Y06M28DT00H00M00S"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a _:b .) + ], + [ + %q(), + %q(_:a "1"^^ .) + ], + [ + %q(), + %q(_:a "1.1"^^ .) + ], + [ + %q(), + %q(_:a "1.1e1"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "1"^^ .) + ], + [ + %q(), + %q(_:a "1.1"^^ .) + ], + [ + %q(), + %q(_:a "1.1e1"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + pending if [ + '', + '', + ].include?(md) + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "base_uri" do + before :each do + @nt_ctx = %q( + _:a . + %s + ) + end + + [ + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(Stéphane Corlosquet), + %q(_:a .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + expect(parse(@md_ctx % md, base_uri: 'http://example.com/')).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "itemid" do + before :each do + @md_ctx = %q( +
+ %s +
+ ) + @nt_ctx = %q( + . + %s + ) + end + + [ + [ + %q(

My name is Gregg Kellogg

), + %q( "Gregg Kellogg" .) + ], + [ + %q(), + %q( "foo" .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( "2011-06-28T00:00:00Z"^^ .) + ], + [ + %q(), + %q( .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "itemtype" do + { + "with no type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with empty type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with relative type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with single type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "with multipe types and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a , ; + "Amanda" ; + ] . + ) + ], + #"with no type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + #"with empty type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + #"with relative type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + "with single type and URI property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "with multipe types and URI property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a , ; + "Amanda" ; + ] . + ) + ], + "with inherited type and token property" => [ + %q( +
+

Name: Gregg

+
+

Name: Jeni

+
+
+ ), + %q( + @prefix md: . + @prefix schema: . + [ a schema:Person ; + schema:name "Gregg" ; + schema:knows [ schema:name "Jeni" ] + ] . + ) + ] + }.each do |name, (md, nt)| + it "#{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "itemref" do + { + "to single id" => + [ + %q( +
+
+

Name: Amanda

+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "to generate listed property values" => + [ + %q( +
+
+

My name is Gregg

+
+

My name is Kellogg

+
+ ), + %q( + [ a ; + "Gregg", "Kellogg" ; + ] . + ) + ], + #"to single id with different types" => + #[ + # %q( + #
+ #
+ #
+ #

Name: Amanda

+ #
+ # ), + # %q( + # [ a ; + # "Amanda" ; + # ] . + # [ a ; + # "Amanda" ; + # ] . + # ) + #], + "to multiple ids" => + [ + %q( +
+
+

Name: Amanda

+

Jazz Band

+
+ ), + %q( + [ a ; + "Amanda" ; + "Jazz Band" ; + ] . + ) + ], + "with chaining" => + [ + %q( +
+
+

Name: Amanda

+
+
+

Band: Jazz Band

+

Size: 12 players

+
+
+ ), + %q( + [ a ; + "Amanda" ; + [ + a ; + "Jazz Band"; + "12" + ] + ] . + ) + ], + "shared" => + [ + %q( +
+
+
+
+ Amanda +
+
+ ), + %q( + [ a ; _:a ] . + [ a ; _:a ] . + _:a "Amanda" . + ) + + ], + }.each do |name, (md, nt)| + it "parses #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + + it "catches infinite recursion", pending: true do + md = %( + + +
+
+
friend1
+
+
friend2
+
+
+
+
+ + ) + expect {parse(md, validate: true)}.to raise_error(RDF::ReaderError) + expect(@logger.to_s).to include("itemref recursion") + end + end + + context "propertyURI" do + context "no expansion" do + { + "http://foo/bar + baz => http://foo/baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#bar + baz => http://foo#baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#Type + bar + baz => http://foo#baz" => + [ + %q( +
+

Baz

+
+ ), + %q( + [ a ; + [ "Baz"]] . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "default propertyURI generation" do + { + "http://foo/bar + baz => http://foo/baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#bar + baz => http://foo#baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#Type + bar + baz => http://foo#baz" => + [ + %q( +
+

Baz

+
+ ), + %q( + [ a ; + [ "Baz"]] . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + end + + context "itemprop-reverse", skip: true do + { + "link" => [ + %q( +
+ William Shakespeare + +
+ ), + %q( + [ + a ; + "William Shakespeare" + ] . + ) + ], + "itemscope" => [ + %q( +
+ The ACME Shopping Mall on Structured Data Avenue + The ACME Shopping Mall is your one-stop paradise for all data-related shopping needs, from schemas to instance data +

Here is a list of shops inside:

+
+ Dan Brickley's Data Restaurant +
+
+ Ramanathan Guha's Meta Content Framework Bakery +
+
+ ), + %q( + _:a a ; + "The ACME Shopping Mall on Structured Data Avenue"; + "The ACME Shopping Mall is your one-stop paradise for all data-related shopping needs, from schemas to instance data" . + _:b a ; + "Dan Brickley's Data Restaurant"; + _:a . + _:c a ; + "Ramanathan Guha's Meta Content Framework Bakery"; + _:a . + ) + ], + "literal" => [ + %q( +
+ William Shakespeare + +
+ ), + %q( + _:a a ; + "William Shakespeare" . + ) + ], + "itemprop and itemprop-reverse" => [ + %q( +
+ Cryptography Users +
+
+ Alice +
+ 1977 +
+
+ ), + %q( + @prefix schema: . + @prefix md: . + + _:a a schema:Organization; + schema:name "Cryptography Users"; + schema:member _:b . + _:b a schema:OrganizationRole; + schema:startDate "1977"; + schema:member _:c; + schema:memberOf _:a . + _:c a schema:Person; + schema:name "Alice"; + schema:memberOf _:b . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "vocabulary expansion", pending: true do + it "always expands" do + md = %q( +
+ +
+ ) + ttl = %q( + [ a , ; + + ] . + ) + + expect(parse(md, vocab_expansion: true)).to be_equivalent_graph(ttl, logger: @logger) + end + end + + context "test-files", skip: true do + Dir.glob(File.join(File.expand_path(File.dirname(__FILE__)), "test-files", "*.html")).each do |md| + it "parses #{md}" do + test_file(md) + end + end + end + end + + def parse(input, options = {}) + @logger = RDF::Spec.logger + graph = options[:graph] || RDF::Graph.new + RDF::Microdata::Reader.new(input, { + logger: @logger, + rdfa: true, + validate: false, + base_uri: "http://example/", + registry: registry_path, + canonicalize: false}.merge(options)).each do |statement| + graph << statement + end + + # Remove any rdfa:usesVocabulary statements + graph.query(predicate: RDF::RDFA.usesVocabulary).each do |stmt| + graph.delete(stmt) + end + graph + end + + def test_file(filepath, options = {}) + graph = parse(File.open(filepath), options) + + ttl_string = File.read(filepath.sub('.html', '.ttl')) + expect(graph).to be_equivalent_graph(ttl_string, logger: @logger) + end +end diff --git a/spec/rdfa_reader_spec.rb b/spec/rdfa_reader_spec.rb new file mode 100644 index 0000000..18a7139 --- /dev/null +++ b/spec/rdfa_reader_spec.rb @@ -0,0 +1,888 @@ +# coding: utf-8 +$:.unshift "." +require 'spec_helper' +require 'rdf/spec/reader' + +describe RDF::Microdata::RdfaReader do + let!(:doap) {File.expand_path("../../etc/doap.html", __FILE__)} + let!(:doap_nt) {File.expand_path("../../etc/doap.nt", __FILE__)} + let!(:registry_path) {File.expand_path("../test-files/test-registry.json", __FILE__)} + before :each do + @reader = RDF::Microdata::RdfaReader.new(StringIO.new("")) + end + + context :interface do + subject {%( +
+

My name is Elizabeth.

+
+ )} + + it "should yield reader" do + inner = double("inner") + expect(inner).to receive(:called).with(RDF::Microdata::RdfaReader) + RDF::Microdata::RdfaReader.new(subject, base_uri: 'http://example/') do |reader| + inner.called(reader.class) + end + end + + it "should return reader" do + expect(RDF::Microdata::RdfaReader.new(subject, base_uri: 'http://example/')).to be_a(RDF::Microdata::RdfaReader) + end + + it "should not raise errors" do + expect { + RDF::Microdata::RdfaReader.new(subject, validate: true, base_uri: 'http://example/') + }.not_to raise_error + end + + it "should yield statements" do + inner = double("inner") + expect(inner).to receive(:called).with(RDF::Statement).at_least(2) + RDF::Microdata::RdfaReader.new(subject, base_uri: 'http://example/').each_statement do |statement| + inner.called(statement.class) + end + end + + it "should yield triples" do + inner = double("inner") + expect(inner).to receive(:called).at_least(2) + RDF::Microdata::RdfaReader.new(subject, base_uri: 'http://example/').each_triple do |subject, predicate, object| + inner.called(subject.class, predicate.class, object.class) + end + end + + context "Microdata Reader with :rdfa option" do + it "returns a RdfaReader instance" do + r = RDF::Microdata::Reader.new(StringIO.new(""), rdfa: true) + expect(r).to be_a(RDF::Microdata::RdfaReader) + end + end + end + + context :parsing do + before :each do + @md_ctx = %q( +
+ %s +
+ ) + @nt_ctx = %q( + _:a . + %s + ) + end + + it "parses a simple graph" do + md = %q(

My name is Gregg Kellogg.

) + nt = %q(_:a "Gregg Kellogg" .) + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + + context "values" do + [ + [ + %q(

My name is Gregg Kellogg

), + %q(_:a "Gregg Kellogg" .) + ], + [ + %q( +

My name is Gregg

+

My name is Kellogg

+ ), + %q(_:a "Gregg", "Kellogg" .) + ], + [ + %q(

My name is Gregg Kellogg

), + %q( + _:a "Gregg Kellogg" . + _:a "Gregg Kellogg" . + ) + ], + [ + %q(

My name is Gregg Kellogg

), + %q(_:a "Gregg Kellogg" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(Bar), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a "2011-06-28Z"^^ .) + ], + [ + %q(), + %q(_:a "00:00:00Z"^^ .) + ], + [ + %q(), + %q(_:a "2011-06-28T00:00:00Z"^^ .) + ], + [ + %q(), + %q(_:a "P2011Y06M28DT00H00M00S"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a _:b .) + ], + [ + %q(), + %q(_:a "1"^^ .) + ], + [ + %q(), + %q(_:a "1.1"^^ .) + ], + [ + %q(), + %q(_:a "1.1e1"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "1"^^ .) + ], + [ + %q(), + %q(_:a "1.1"^^ .) + ], + [ + %q(), + %q(_:a "1.1e1"^^ .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + [ + %q(), + %q(_:a "foo" .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + pending if [ + '', + '', + ].include?(md) + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "base_uri" do + before :each do + @nt_ctx = %q( + _:a . + %s + ) + end + + [ + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(), + %q(_:a .) + ], + [ + %q(Stéphane Corlosquet), + %q(_:a .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + expect(parse(@md_ctx % md, base_uri: 'http://example.com/')).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "itemid" do + before :each do + @md_ctx = %q( +
+ %s +
+ ) + @nt_ctx = %q( + . + %s + ) + end + + [ + [ + %q(

My name is Gregg Kellogg

), + %q( "Gregg Kellogg" .) + ], + [ + %q(), + %q( "foo" .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( .) + ], + [ + %q(), + %q( "2011-06-28T00:00:00Z"^^ .) + ], + [ + %q(), + %q( .) + ], + ].each do |(md, nt)| + it "parses #{md}" do + expect(parse(@md_ctx % md)).to be_equivalent_graph(@nt_ctx % nt, logger: @logger) + end + end + end + + context "itemtype" do + { + "with no type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with empty type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with relative type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q() + ], + "with single type and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "with multipe types and token property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a , ; + "Amanda" ; + ] . + ) + ], + #"with no type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + #"with empty type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + #"with relative type and URI property" => [ + # %q( + #
+ #
+ #

Name: Amanda

+ #
+ #
+ # ), + # %q( + # [ "Amanda" ] . + # ) + #], + "with single type and URI property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "with multipe types and URI property" => [ + %q( +
+
+

Name: Amanda

+
+
+ ), + %q( + [ a , ; + "Amanda" ; + ] . + ) + ], + "with inherited type and token property" => [ + %q( +
+

Name: Gregg

+
+

Name: Jeni

+
+
+ ), + %q( + @prefix md: . + @prefix schema: . + [ a schema:Person ; + schema:name "Gregg" ; + schema:knows [ schema:name "Jeni" ] + ] . + ) + ] + }.each do |name, (md, nt)| + it "#{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "itemref" do + { + "to single id" => + [ + %q( +
+
+

Name: Amanda

+
+ ), + %q( + [ a ; + "Amanda" ; + ] . + ) + ], + "to generate listed property values" => + [ + %q( +
+
+

My name is Gregg

+
+

My name is Kellogg

+
+ ), + %q( + [ a ; + "Gregg", "Kellogg" ; + ] . + ) + ], + #"to single id with different types" => + #[ + # %q( + #
+ #
+ #
+ #

Name: Amanda

+ #
+ # ), + # %q( + # [ a ; + # "Amanda" ; + # ] . + # [ a ; + # "Amanda" ; + # ] . + # ) + #], + "to multiple ids" => + [ + %q( +
+
+

Name: Amanda

+

Jazz Band

+
+ ), + %q( + [ a ; + "Amanda" ; + "Jazz Band" ; + ] . + ) + ], + "with chaining" => + [ + %q( +
+
+

Name: Amanda

+
+
+

Band: Jazz Band

+

Size: 12 players

+
+
+ ), + %q( + [ a ; + "Amanda" ; + [ + a ; + "Jazz Band"; + "12" + ] + ] . + ) + ], + "shared" => + [ + %q( +
+
+
+
+ Amanda +
+
+ ), + %q( + [ a ; _:a ] . + [ a ; _:a ] . + _:a "Amanda" . + ) + + ], + }.each do |name, (md, nt)| + it "parses #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + + it "catches infinite recursion", pending: true do + md = %( + + +
+
+
friend1
+
+
friend2
+
+
+
+
+ + ) + expect {parse(md, validate: true)}.to raise_error(RDF::ReaderError) + expect(@logger.to_s).to include("itemref recursion") + end + end + + context "propertyURI" do + context "no expansion" do + { + "http://foo/bar + baz => http://foo/baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#bar + baz => http://foo#baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#Type + bar + baz => http://foo#baz" => + [ + %q( +
+

Baz

+
+ ), + %q( + [ a ; + [ "Baz"]] . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "default propertyURI generation" do + { + "http://foo/bar + baz => http://foo/baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#bar + baz => http://foo#baz" => + [ + %q( +
+

FooBar

+
+ ), + %q( + [ a ; "FooBar" ] . + ) + ], + "http://foo#Type + bar + baz => http://foo#baz" => + [ + %q( +
+

Baz

+
+ ), + %q( + [ a ; + [ "Baz"]] . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + end + + context "itemprop-reverse", skip: true do + { + "link" => [ + %q( +
+ William Shakespeare + +
+ ), + %q( + [ + a ; + "William Shakespeare" + ] . + ) + ], + "itemscope" => [ + %q( +
+ The ACME Shopping Mall on Structured Data Avenue + The ACME Shopping Mall is your one-stop paradise for all data-related shopping needs, from schemas to instance data +

Here is a list of shops inside:

+
+ Dan Brickley's Data Restaurant +
+
+ Ramanathan Guha's Meta Content Framework Bakery +
+
+ ), + %q( + _:a a ; + "The ACME Shopping Mall on Structured Data Avenue"; + "The ACME Shopping Mall is your one-stop paradise for all data-related shopping needs, from schemas to instance data" . + _:b a ; + "Dan Brickley's Data Restaurant"; + _:a . + _:c a ; + "Ramanathan Guha's Meta Content Framework Bakery"; + _:a . + ) + ], + "literal" => [ + %q( +
+ William Shakespeare + +
+ ), + %q( + _:a a ; + "William Shakespeare" . + ) + ], + "itemprop and itemprop-reverse" => [ + %q( +
+ Cryptography Users +
+
+ Alice +
+ 1977 +
+
+ ), + %q( + @prefix schema: . + @prefix md: . + + _:a a schema:Organization; + schema:name "Cryptography Users"; + schema:member _:b . + _:b a schema:OrganizationRole; + schema:startDate "1977"; + schema:member _:c; + schema:memberOf _:a . + _:c a schema:Person; + schema:name "Alice"; + schema:memberOf _:b . + ) + ], + }.each do |name, (md, nt)| + it "expands #{name}" do + expect(parse(md)).to be_equivalent_graph(nt, logger: @logger) + end + end + end + + context "vocabulary expansion", pending: true do + it "always expands" do + md = %q( +
+ +
+ ) + ttl = %q( + [ a , ; + + ] . + ) + + expect(parse(md, vocab_expansion: true)).to be_equivalent_graph(ttl, logger: @logger) + end + end + + context "test-files", skip: true do + Dir.glob(File.join(File.expand_path(File.dirname(__FILE__)), "test-files", "*.html")).each do |md| + it "parses #{md}" do + test_file(md) + end + end + end + end + + def parse(input, options = {}) + @logger = RDF::Spec.logger + graph = options[:graph] || RDF::Graph.new + RDF::Microdata::Reader.new(input, { + logger: @logger, + rdfa: true, + validate: false, + base_uri: "http://example/", + registry: registry_path, + canonicalize: false}.merge(options)).each do |statement| + graph << statement + end + + # Remove any rdfa:usesVocabulary statements + graph.query(predicate: RDF::RDFA.usesVocabulary).each do |stmt| + graph.delete(stmt) + end + graph + end + + def test_file(filepath, options = {}) + graph = parse(File.open(filepath), options) + + ttl_string = File.read(filepath.sub('.html', '.ttl')) + expect(graph).to be_equivalent_graph(ttl_string, logger: @logger) + end +end diff --git a/spec/suite_helper.rb b/spec/suite_helper.rb index 7ac224a..80850f7 100644 --- a/spec/suite_helper.rb +++ b/spec/suite_helper.rb @@ -25,7 +25,7 @@ def self.open_file(filename_or_url, options = {}, &block) path = filename_or_url[5..-1] Kernel.open(path.to_s, &block) when 'http://www.w3.org/ns/md' - Kernel.open(RDF::Microdata::Reader::DEFAULT_REGISTRY, &block) + Kernel.open(RDF::Microdata::DEFAULT_REGISTRY, &block) when /^#{REMOTE_PATH}/ begin #puts "attempt to open #{filename_or_url} locally" @@ -145,6 +145,10 @@ def action BASE.join(property('action')) end + def input + RDF::Util::File.open_file(action).read + end + def registry reg = property('registry') || BASE + "test-registry.json"