Finish 2.2.0

dryruby · Aug 25, 2021 · 4d5372b · 4d5372b
2 parents 2c3938c + f1c0819
commit 4d5372b
Show file tree

Hide file tree

Showing 17 changed files with 249 additions and 180 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -15,11 +15,11 @@ jobs:
     runs-on: ubuntu-latest
     env:
       CI: true
+      ALLOW_FAILURES: ${{ endsWith(matrix.ruby, 'head') }}
     strategy:
       fail-fast: false
       matrix:
         ruby:
-          - 2.4
           - 2.5
           - 2.6
           - 2.7
@@ -36,5 +36,9 @@ jobs:
       - name: Install dependencies
         run: bundle install --jobs 4 --retry 3
       - name: Run tests
-        run: bundle exec rspec spec
-
+        run: bundle exec rspec spec || $ALLOW_FAILURES
+      - name: Coveralls GitHub Action
+        uses: coverallsapp/github-action@v1.1.2
+        if: "matrix.ruby == '3.0'"
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.travis.yml b/.travis.yml
diff --git a/Gemfile b/Gemfile
@@ -6,7 +6,6 @@ gem 'rdf',            github: "ruby-rdf/rdf",       branch: "develop"
 
 group :development do
   gem 'rdf-spec',   github: "ruby-rdf/rdf-spec",    branch: "develop"
-  gem "nokogumbo",  platforms: :mri
   gem "byebug",     platforms: :mri
   gem 'psych',      platforms: [:mri, :rbx]
   gem "redcarpet",  platforms: :mri
@@ -15,6 +14,6 @@ group :development do
 end
 
 group :development, :test do
-  gem 'simplecov',  platforms: :mri
-  gem 'coveralls',  '~> 0.8', platforms: :mri
+  gem 'simplecov', '~> 0.21',  platforms: :mri
+  gem 'simplecov-lcov', '~> 0.8',  platforms: :mri
 end
diff --git a/README.md b/README.md
@@ -93,7 +93,7 @@ Inevitably while implementing a parser for some specific grammar, a developer wi
 
 The {EBNF::Writer} class can be used to write parsed grammars out, either as formatted text, or HTML. Because grammars are written from the Abstract Syntax Tree, represented as [S-Expressions][S-Expression], this provides a means of transforming between grammar formats (e.g., W3C [EBNF][] to [ABNF][]), although with some potential loss in semantic fidelity (case-insensitive string matching vs. case-sensitive matching).
 
-The formatted HTML results are designed to be appropriate for including in specifications. If the [Nokogumbo](https://rubygems.org/gems/nokogumbo) gem list available, the resulting HTML encoded grammar will also be validated.
+The formatted HTML results are designed to be appropriate for including in specifications.
 
 ### Parser Errors
 On a parsing failure, an exception is raised with information that may be useful in determining the source of the error.

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.1.3
+2.2.0
diff --git a/lib/ebnf.rb b/lib/ebnf.rb
@@ -9,6 +9,7 @@ module EBNF
   autoload :PEG,      "ebnf/peg"
   autoload :Rule,     "ebnf/rule"
   autoload :Terminals,"ebnf/terminals"
+  autoload :Unescape, "ebnf/unescape"
   autoload :Writer,   "ebnf/writer"
   autoload :VERSION,  "ebnf/version"
 

diff --git a/lib/ebnf/ll1/lexer.rb b/lib/ebnf/ll1/lexer.rb
@@ -32,60 +32,12 @@ module EBNF::LL1
   # @see https://en.wikipedia.org/wiki/Lexical_analysis
   class Lexer
     include Enumerable
-
-    ESCAPE_CHARS         = {
-      '\\t'   => "\t",  # \u0009 (tab)
-      '\\n'   => "\n",  # \u000A (line feed)
-      '\\r'   => "\r",  # \u000D (carriage return)
-      '\\b'   => "\b",  # \u0008 (backspace)
-      '\\f'   => "\f",  # \u000C (form feed)
-      '\\"'  => '"',    # \u0022 (quotation mark, double quote mark)
-      "\\'"  => '\'',   # \u0027 (apostrophe-quote, single quote mark)
-      '\\\\' => '\\'    # \u005C (backslash)
-    }.freeze
-    ESCAPE_CHAR4        = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze    # \uXXXX
-    ESCAPE_CHAR8        = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze    # \UXXXXXXXX
-    ECHAR               = /\\./u.freeze                        # More liberal unescaping
-    UCHAR               = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
+    include ::EBNF::Unescape
 
     ##
     # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
     attr_reader :whitespace
 
-    ##
-    # Returns a copy of the given `input` string with all `\uXXXX` and
-    # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
-    # unescaped UTF-8 character counterparts.
-    #
-    # @param  [String] string
-    # @return [String]
-    # @see    https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
-    def self.unescape_codepoints(string)
-      string = string.dup
-      string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
-
-      # Decode \uXXXX and \UXXXXXXXX code points:
-      string = string.gsub(UCHAR) do |c|
-        s = [(c[2..-1]).hex].pack('U*')
-        s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
-      end
-
-      string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) 
-      string
-    end
-
-    ##
-    # Returns a copy of the given `input` string with all string escape
-    # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
-    # character counterparts.
-    #
-    # @param  [String] input
-    # @return [String]
-    # @see    https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
-    def self.unescape_string(input)
-      input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
-    end
-
     ##
     # Tokenizes the given `input` string or stream.
     #
@@ -338,7 +290,7 @@ def ==(other)
       # @return [String]
       def unescape(string)
         if @options[:unescape]
-          Lexer.unescape_string(Lexer.unescape_codepoints(string))
+          EBNF::Unescape.unescape(string)
         else
           string
         end

diff --git a/lib/ebnf/native.rb b/lib/ebnf/native.rb
@@ -287,10 +287,10 @@ def terminal(s)
       case m = s[0,1]
       when '"', "'" # STRING1 or STRING2
         l, s = s[1..-1].split(m.rstrip, 2)
-        [LL1::Lexer.unescape_string(l), s]
+        [Unescape.unescape_string(l), s]
       when '[' # RANGE, O_RANGE
         l, s = s[1..-1].split(/(?<=[^\\])\]/, 2)
-        [[:range, LL1::Lexer.unescape_string(l)], s]
+        [[:range, Unescape.unescape_string(l)], s]
       when '#' # HEX
         s.match(/(#x\h+)(.*)$/)
         l, s = $1, $2

diff --git a/lib/ebnf/peg/parser.rb b/lib/ebnf/peg/parser.rb
@@ -55,6 +55,7 @@ def start_options; (@start_hoptions ||= {}); end
       def production_handlers; (@production_handlers ||= {}); end
       def terminal_handlers; (@terminal_handlers ||= {}); end
       def terminal_regexps; (@terminal_regexps ||= {}); end
+      def terminal_options; (@terminal_options ||= {}); end
 
       ##
       # Defines the pattern for a terminal node and a block to be invoked
@@ -72,9 +73,8 @@ def terminal_regexps; (@terminal_regexps ||= {}); end
       #   defaults to the expression defined in the associated rule.
       #   If unset, the terminal rule is used for matching.
       # @param [Hash] options
-      # @option options [Hash{String => String}] :map ({})
-      #   A mapping from terminals, in lower-case form, to
-      #   their canonical value
+      # @option options [Boolean] :unescape
+      #   Cause strings and codepoints to be unescaped.
       # @yield [value, prod]
       # @yieldparam [String] value
       #   The scanned terminal value.
@@ -86,6 +86,7 @@ def terminal_regexps; (@terminal_regexps ||= {}); end
       def terminal(term, regexp = nil, **options, &block)
         terminal_regexps[term] = regexp if regexp
         terminal_handlers[term] = block if block_given?
+        terminal_options[term] = options.freeze
       end
 
       ##
@@ -100,6 +101,8 @@ def terminal(term, regexp = nil, **options, &block)
       #   Options which are returned from {Parser#onStart}.
       # @option options [Boolean] :as_hash (false)
       #   If the production is a `seq`, causes the value to be represented as a single hash, rather than an array of individual hashes for each sub-production. Note that this is not always advisable due to the possibility of repeated productions within the sequence.
+      # @option options[:upper, :lower] :insensitive_strings
+      #   Perform case-insensitive match of strings not defined as terminals, and map to either upper or lower case.
       # @yield [data, block]
       # @yieldparam [Hash] data
       #   A Hash defined for the current production, during :start
@@ -182,6 +185,8 @@ def method_missing(method, *args, &block)
     # @option options[Integer] :high_water passed to lexer
     # @option options [Logger] :logger for errors/progress/debug.
     # @option options[Integer] :low_water passed to lexer
+    # @option options[Boolean] :seq_hash (false)
+    #   If `true`, sets the default for the value sent to a production handler that is for a `seq` to a hash composed of the flattened constituent hashes that are otherwise provided.
     # @option options [Symbol, Regexp] :whitespace 
     #   Symbol of whitespace rule (defaults to `@pass`), or a regular expression
     #   for eating whitespace between non-terminal rules (strongly encouraged).
@@ -195,6 +200,7 @@ def method_missing(method, *args, &block)
     # @raise [Exception] Raises exceptions for parsing errors
     #   or errors raised during processing callbacks. Internal
     #   errors are raised using {Error}.
+    # @todo FIXME implement seq_hash
     def parse(input = nil, start = nil, rules = nil, **options, &block)
       start ||= options[:start]
       rules ||= options[:rules] || []
@@ -467,10 +473,19 @@ def find_rule(sym)
     #
     # @param [Symbol] sym
     # @return [Regexp]
-    def find_terminal_regexp(sym)
+    def terminal_regexp(sym)
       self.class.terminal_regexps[sym]
     end
 
+    ##
+    # Find the options defined for a terminal
+    #
+    # @param [Symbol] sym
+    # @return [Hash]
+    def terminal_options(sym)
+      self.class.terminal_options[sym]
+    end
+
     ##
     # Record furthest failure.
     #

diff --git a/lib/ebnf/peg/rule.rb b/lib/ebnf/peg/rule.rb
@@ -1,6 +1,8 @@
 module EBNF::PEG
   # Behavior for parsing a PEG rule
   module Rule
+    include ::EBNF::Unescape
+
     ##
     # Initialized by parser when loading rules.
     # Used for finding rules and invoking elements of the parse process.
@@ -45,9 +47,18 @@ def parse(input)
         # If the terminal is defined with a regular expression,
         # use that to match the input,
         # otherwise,
-        if regexp = parser.find_terminal_regexp(sym)
-          matched = input.scan(regexp)
+        if regexp = parser.terminal_regexp(sym)
+          term_opts = parser.terminal_options(sym)
+          if matched = input.scan(regexp)
+            # Optionally map matched
+            matched = term_opts.fetch(:map, {}).fetch(matched.downcase, matched)
+
+            # Optionally unescape matched
+            matched = unescape(matched) if term_opts[:unescape]
+          end
+
           result = parser.onTerminal(sym, (matched ? matched : :unmatched))
+
           # Update furthest failure for strings and terminals
           parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
           parser.packrat[sym][pos] = {
@@ -61,6 +72,7 @@ def parse(input)
         eat_whitespace(input)
       end
       start_options = parser.onStart(sym)
+      string_regexp_opts = start_options[:insensitive_strings] ? Regexp::IGNORECASE : 0
 
       result = case expr.first
       when :alt
@@ -74,7 +86,12 @@ def parse(input)
             raise "No rule found for #{prod}" unless rule
             rule.parse(input)
           when String
-            input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
+            s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
+            case start_options[:insensitive_strings]
+            when :lower then s && s.downcase
+            when :upper then s && s.upcase
+            else s
+            end || :unmatched
           end
           if alt == :unmatched
             # Update furthest failure for strings and terminals
@@ -112,7 +129,7 @@ def parse(input)
           raise "No rule found for #{prod}" unless rule
           rule.parse(input)
         when String
-          input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
+          input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched
         end
         if res != :unmatched
           # Update furthest failure for terminals
@@ -123,15 +140,15 @@ def parse(input)
         end
       when :opt
         # Result is the matched value or nil
-        opt = rept(input, 0, 1, expr[1])
+        opt = rept(input, 0, 1, expr[1], string_regexp_opts, **start_options)
 
         # Update furthest failure for strings and terminals
         parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
         opt.first
       when :plus
         # Result is an array of all expressions while they match,
         # at least one must match
-        plus = rept(input, 1, '*', expr[1])
+        plus = rept(input, 1, '*', expr[1], string_regexp_opts)
 
         # Update furthest failure for strings and terminals
         parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -146,7 +163,7 @@ def parse(input)
       when :rept
         # Result is an array of all expressions while they match,
         # an empty array if none match
-        rept = rept(input, expr[1], expr[2], expr[3])
+        rept = rept(input, expr[1], expr[2], expr[3], string_regexp_opts)
 
         # # Update furthest failure for strings and terminals
         parser.update_furthest_failure(input.pos, input.lineno, expr[3]) if terminal?
@@ -161,7 +178,12 @@ def parse(input)
             raise "No rule found for #{prod}" unless rule
             rule.parse(input)
           when String
-            input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
+            s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
+            case start_options[:insensitive_strings]
+            when :lower then s && s.downcase
+            when :upper then s && s.upcase
+            else s
+            end || :unmatched
           end
           if res == :unmatched
             # Update furthest failure for strings and terminals
@@ -182,7 +204,7 @@ def parse(input)
       when :star
         # Result is an array of all expressions while they match,
         # an empty array if none match
-        star = rept(input, 0, '*', expr[1])
+        star = rept(input, 0, '*', expr[1], string_regexp_opts)
 
         # Update furthest failure for strings and terminals
         parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -214,8 +236,9 @@ def parse(input)
     # @param [Integer] max
     #   If it is an integer, it stops matching after max entries.
     # @param [Symbol, String] prod
+    # @param [Integer] string_regexp_opts
     # @return [:unmatched, Array]
-    def rept(input, min, max, prod)
+    def rept(input, min, max, prod, string_regexp_opts, **options)
       result = []
 
       case prod
@@ -227,9 +250,13 @@ def rept(input, min, max, prod)
           result << res
         end
       when String
-        while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max)
+        while (res = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))) && (max == '*' || result.length < max)
           eat_whitespace(input) unless terminal?
-          result << res
+          result << case options[:insensitive_strings]
+          when :lower then res.downcase
+          when :upper then res.upcase
+          else res
+          end
         end
       end