From b72416e98d1baa213c7143b862c35f3a1fa07f86 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Fri, 6 Aug 2021 12:46:38 -0700 Subject: [PATCH 1/8] Remove unused travis config. --- .travis.yml | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 35096bf..0000000 --- a/.travis.yml +++ /dev/null @@ -1,20 +0,0 @@ -language: ruby -script: "bundle exec rspec spec" -env: - - CI=true - global: - - NOKOGIRI_USE_SYSTEM_LIBRARIES=true -rvm: - - 2.4 - - 2.5 - - 2.6 - - 2.7 - - ruby-head - - jruby -cache: bundler -sudo: false -matrix: - allow_failures: - - rvm: jruby - - rvm: ruby-head -dist: trusty From 7178b6516ddbd26c74f0ececc585f4fd6425fba3 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Fri, 6 Aug 2021 12:46:55 -0700 Subject: [PATCH 2/8] Don't use Nokogumbo directly, as it's included in Nokogiri >= 1.12. --- Gemfile | 1 - README.md | 2 +- lib/ebnf/writer.rb | 3 +-- spec/spec_helper.rb | 8 +------- 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/Gemfile b/Gemfile index 38930f0..54adbe4 100644 --- a/Gemfile +++ b/Gemfile @@ -6,7 +6,6 @@ gem 'rdf', github: "ruby-rdf/rdf", branch: "develop" group :development do gem 'rdf-spec', github: "ruby-rdf/rdf-spec", branch: "develop" - gem "nokogumbo", platforms: :mri gem "byebug", platforms: :mri gem 'psych', platforms: [:mri, :rbx] gem "redcarpet", platforms: :mri diff --git a/README.md b/README.md index b64e254..8c79112 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ Inevitably while implementing a parser for some specific grammar, a developer wi The {EBNF::Writer} class can be used to write parsed grammars out, either as formatted text, or HTML. 
Because grammars are written from the Abstract Syntax Tree, represented as [S-Expressions][S-Expression], this provides a means of transforming between grammar formats (e.g., W3C [EBNF][] to [ABNF][]), although with some potential loss in semantic fidelity (case-insensitive string matching vs. case-sensitive matching). -The formatted HTML results are designed to be appropriate for including in specifications. If the [Nokogumbo](https://rubygems.org/gems/nokogumbo) gem list available, the resulting HTML encoded grammar will also be validated. +The formatted HTML results are designed to be appropriate for including in specifications. ### Parser Errors On a parsing failure, and exception is raised with information that may be useful in determining the source of the error. diff --git a/lib/ebnf/writer.rb b/lib/ebnf/writer.rb index 5cb7e97..df83056 100644 --- a/lib/ebnf/writer.rb +++ b/lib/ebnf/writer.rb @@ -181,12 +181,11 @@ def initialize(rules, out: $stdout, html: false, format: :ebnf, validate: false, if validate begin - require 'nokogumbo' # Validate the output HTML doc = Nokogiri::HTML5("" + html_result, max_errors: 10) raise EncodingError, "Errors found in generated HTML:\n " + doc.errors.map(&:to_s).join("\n ") unless doc.errors.empty? - rescue LoadError + rescue LoadError, NoMethodError # Skip end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index a609097..1ff2ac4 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -9,12 +9,6 @@ require 'rspec/matchers' require 'rspec/its' require 'matchers' -begin - have_nokogumbo = true - require 'nokogumbo' -rescue LoadError - have_nokogumbo = false -end begin require 'simplecov' @@ -47,7 +41,7 @@ RSpec::Matchers.define :be_valid_html do match do |actual| - return true unless have_nokogumbo + return true unless Nokogiri.const_defined?(:HTML5) root = Nokogiri::HTML5("" + actual, max_parse_errors: 1000) @errors = Array(root && root.errors.map(&:to_s)) @errors.empty? 
From a09ca6dcb2108b8cb19b38624ba620f06aae9565 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Wed, 11 Aug 2021 12:28:54 -0700 Subject: [PATCH 3/8] Update CI for coveralls. --- .github/workflows/ci.yml | 10 +++++++--- Gemfile | 4 ++-- spec/spec_helper.rb | 15 +++++++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0e1a059..3475da9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,11 +15,11 @@ jobs: runs-on: ubuntu-latest env: CI: true + ALLOW_FAILURES: ${{ endsWith(matrix.ruby, 'head') }} strategy: fail-fast: false matrix: ruby: - - 2.4 - 2.5 - 2.6 - 2.7 @@ -36,5 +36,9 @@ jobs: - name: Install dependencies run: bundle install --jobs 4 --retry 3 - name: Run tests - run: bundle exec rspec spec - + run: bundle exec rspec spec || $ALLOW_FAILURES + - name: Coveralls GitHub Action + uses: coverallsapp/github-action@v1.1.2 + if: "matrix.ruby == '3.0'" + with: + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/Gemfile b/Gemfile index 54adbe4..97c28cb 100644 --- a/Gemfile +++ b/Gemfile @@ -14,6 +14,6 @@ group :development do end group :development, :test do - gem 'simplecov', platforms: :mri - gem 'coveralls', '~> 0.8', platforms: :mri + gem 'simplecov', '~> 0.21', platforms: :mri + gem 'simplecov-lcov', '~> 0.8', platforms: :mri end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 1ff2ac4..c0d22a4 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -12,10 +12,17 @@ begin require 'simplecov' - require 'coveralls' + require 'simplecov-lcov' + + SimpleCov::Formatter::LcovFormatter.config do |config| + #Coveralls is coverage by default/lcov. 
Send info results + config.report_with_single_file = true + config.single_report_path = 'coverage/lcov.info' + end + SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new([ SimpleCov::Formatter::HTMLFormatter, - Coveralls::SimpleCov::Formatter + SimpleCov::Formatter::LcovFormatter ]) SimpleCov.start do add_filter "/spec/" @@ -24,6 +31,8 @@ STDERR.puts "Coverage Skipped: #{e.message}" end +require 'ebnf' + ::RSpec.configure do |c| c.filter_run focus: true c.run_all_when_everything_filtered = true @@ -52,6 +61,4 @@ end end -require 'ebnf' - PARSED_EBNF_GRAMMAR = EBNF.parse(File.open(File.expand_path("../../etc/ebnf.ebnf", __FILE__)), format: :native).freeze \ No newline at end of file From 6d2e4bbbe58de63579b89d899786e661019ecf81 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Tue, 24 Aug 2021 11:25:58 -0700 Subject: [PATCH 4/8] Move string unescaping from LL1::Lexer to new Unescape module. --- lib/ebnf.rb | 1 + lib/ebnf/ll1/lexer.rb | 52 ++--------------------------- lib/ebnf/native.rb | 4 +-- lib/ebnf/unescape.rb | 62 +++++++++++++++++++++++++++++++++++ spec/ll1/lexer_spec.rb | 69 --------------------------------------- spec/unescape_spec.rb | 74 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 141 insertions(+), 121 deletions(-) create mode 100644 lib/ebnf/unescape.rb create mode 100644 spec/unescape_spec.rb diff --git a/lib/ebnf.rb b/lib/ebnf.rb index dce92bd..eb00a0c 100755 --- a/lib/ebnf.rb +++ b/lib/ebnf.rb @@ -9,6 +9,7 @@ module EBNF autoload :PEG, "ebnf/peg" autoload :Rule, "ebnf/rule" autoload :Terminals,"ebnf/terminals" + autoload :Unescape, "ebnf/unescape" autoload :Writer, "ebnf/writer" autoload :VERSION, "ebnf/version" diff --git a/lib/ebnf/ll1/lexer.rb b/lib/ebnf/ll1/lexer.rb index 19be66c..65520d4 100644 --- a/lib/ebnf/ll1/lexer.rb +++ b/lib/ebnf/ll1/lexer.rb @@ -32,60 +32,12 @@ module EBNF::LL1 # @see https://en.wikipedia.org/wiki/Lexical_analysis class Lexer include Enumerable - - ESCAPE_CHARS = { - '\\t' => "\t", # \u0009 
(tab) - '\\n' => "\n", # \u000A (line feed) - '\\r' => "\r", # \u000D (carriage return) - '\\b' => "\b", # \u0008 (backspace) - '\\f' => "\f", # \u000C (form feed) - '\\"' => '"', # \u0022 (quotation mark, double quote mark) - "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark) - '\\\\' => '\\' # \u005C (backslash) - }.freeze - ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze # \uXXXX - ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze # \UXXXXXXXX - ECHAR = /\\./u.freeze # More liberal unescaping - UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze + include ::EBNF::Unescape ## # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals attr_reader :whitespace - ## - # Returns a copy of the given `input` string with all `\uXXXX` and - # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their - # unescaped UTF-8 character counterparts. - # - # @param [String] string - # @return [String] - # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape - def self.unescape_codepoints(string) - string = string.dup - string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding) - - # Decode \uXXXX and \UXXXXXXXX code points: - string = string.gsub(UCHAR) do |c| - s = [(c[2..-1]).hex].pack('U*') - s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s - end - - string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) - string - end - - ## - # Returns a copy of the given `input` string with all string escape - # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8 - # character counterparts. - # - # @param [String] input - # @return [String] - # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes - def self.unescape_string(input) - input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]} - end - ## # Tokenizes the given `input` string or stream. 
# @@ -338,7 +290,7 @@ def ==(other) # @return [String] def unescape(string) if @options[:unescape] - Lexer.unescape_string(Lexer.unescape_codepoints(string)) + EBNF::Unescape.unescape(string) else string end diff --git a/lib/ebnf/native.rb b/lib/ebnf/native.rb index 664d01c..c3fbb43 100644 --- a/lib/ebnf/native.rb +++ b/lib/ebnf/native.rb @@ -287,10 +287,10 @@ def terminal(s) case m = s[0,1] when '"', "'" # STRING1 or STRING2 l, s = s[1..-1].split(m.rstrip, 2) - [LL1::Lexer.unescape_string(l), s] + [Unescape.unescape_string(l), s] when '[' # RANGE, O_RANGE l, s = s[1..-1].split(/(?<=[^\\])\]/, 2) - [[:range, LL1::Lexer.unescape_string(l)], s] + [[:range, Unescape.unescape_string(l)], s] when '#' # HEX s.match(/(#x\h+)(.*)$/) l, s = $1, $2 diff --git a/lib/ebnf/unescape.rb b/lib/ebnf/unescape.rb new file mode 100644 index 0000000..e501608 --- /dev/null +++ b/lib/ebnf/unescape.rb @@ -0,0 +1,62 @@ +# encoding: utf-8 +# Unescape strings +module EBNF::Unescape + ESCAPE_CHARS = { + '\\t' => "\t", # \u0009 (tab) + '\\n' => "\n", # \u000A (line feed) + '\\r' => "\r", # \u000D (carriage return) + '\\b' => "\b", # \u0008 (backspace) + '\\f' => "\f", # \u000C (form feed) + '\\"' => '"', # \u0022 (quotation mark, double quote mark) + "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark) + '\\\\' => '\\' # \u005C (backslash) + }.freeze + ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze # \uXXXX + ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze # \UXXXXXXXX + ECHAR = /\\./u.freeze # More liberal unescaping + UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze + + ## + # Returns a copy of the given `input` string with all `\uXXXX` and + # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their + # unescaped UTF-8 character counterparts. 
+ # + # @param [String] string + # @return [String] + # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape + def unescape_codepoints(string) + string = string.dup + string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding) + + # Decode \uXXXX and \UXXXXXXXX code points: + string = string.gsub(UCHAR) do |c| + s = [(c[2..-1]).hex].pack('U*') + s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s + end + + string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding) + string + end + module_function :unescape_codepoints + + ## + # Returns a copy of the given `input` string with all string escape + # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8 + # character counterparts. + # + # @param [String] input + # @return [String] + # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes + def unescape_string(input) + input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]} + end + module_function :unescape_string + + # Perform string and codepoint unescaping if defined for this terminal + # @param [String] string + # @return [String] + def unescape(string) + unescape_string(unescape_codepoints(string)) + end + module_function :unescape +end \ No newline at end of file diff --git a/spec/ll1/lexer_spec.rb b/spec/ll1/lexer_spec.rb index 76354f9..845feb0 100644 --- a/spec/ll1/lexer_spec.rb +++ b/spec/ll1/lexer_spec.rb @@ -25,75 +25,6 @@ :STRING_LITERAL_QUOTE, :STRING_LITERAL_SINGLE_QUOTE, :STRING_LITERAL_LONG_SINGLE_QUOTE, :STRING_LITERAL_LONG_QUOTE ]} - - describe ".unescape_codepoints" do - # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape - - it "unescapes \\uXXXX codepoint escape sequences" do - inputs = { - %q(\\u0020) => %q( ), - %q() => %Q(), - %q(\\u03B1:a) => %Q(\xCE\xB1:a), - %q(a\\u003Ab) => %Q(a\x3Ab), - } - inputs.each do |input, output| - output.force_encoding(Encoding::UTF_8) - expect(EBNF::LL1::Lexer.unescape_codepoints(input)).to eq 
output - end - end - - it "unescapes \\UXXXXXXXX codepoint escape sequences" do - inputs = { - %q(\\U00000020) => %q( ), - %q(\\U00010000) => %Q(\xF0\x90\x80\x80), - %q(\\U000EFFFF) => %Q(\xF3\xAF\xBF\xBF), - } - inputs.each do |input, output| - output.force_encoding(Encoding::UTF_8) - expect(EBNF::LL1::Lexer.unescape_codepoints(input)).to eq output - end - end - - context "escaped strings" do - { - 'Dürst' => 'D\\u00FCrst', - "é" => '\\u00E9', - "€" => '\\u20AC', - "resumé" => 'resum\\u00E9', - }.each_pair do |unescaped, escaped| - it "unescapes #{unescaped.inspect}" do - expect(EBNF::LL1::Lexer.unescape_codepoints(escaped)).to eq unescaped - end - end - end - end - - describe ".unescape_string" do - # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes - - context "escape sequences" do - EBNF::LL1::Lexer::ESCAPE_CHARS.each do |escaped, unescaped| - it "unescapes #{unescaped.inspect}" do - expect(EBNF::LL1::Lexer.unescape_string(escaped)).to eq unescaped - end - end - end - - context "escaped strings" do - { - 'simple literal' => 'simple literal', - 'backslash:\\' => 'backslash:\\\\', - 'dquote:"' => 'dquote:\\"', - "newline:\n" => 'newline:\\n', - "return\r" => 'return\\r', - "tab:\t" => 'tab:\\t', - }.each_pair do |unescaped, escaped| - it "unescapes #{unescaped.inspect}" do - expect(EBNF::LL1::Lexer.unescape_string(escaped)).to eq unescaped - end - end - end - end describe ".tokenize" do context "numeric literals" do diff --git a/spec/unescape_spec.rb b/spec/unescape_spec.rb new file mode 100644 index 0000000..11416bb --- /dev/null +++ b/spec/unescape_spec.rb @@ -0,0 +1,74 @@ +# coding: utf-8 +$:.unshift ".." 
+require 'spec_helper' +require 'ebnf' + +describe EBNF::Unescape do + + describe ".unescape_codepoints" do + # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape + + it "unescapes \\uXXXX codepoint escape sequences" do + inputs = { + %q(\\u0020) => %q( ), + %q() => %Q(), + %q(\\u03B1:a) => %Q(\xCE\xB1:a), + %q(a\\u003Ab) => %Q(a\x3Ab), + } + inputs.each do |input, output| + expect(EBNF::Unescape.unescape_codepoints(input)).to eq output + end + end + + it "unescapes \\UXXXXXXXX codepoint escape sequences" do + inputs = { + %q(\\U00000020) => %q( ), + %q(\\U00010000) => %Q(\xF0\x90\x80\x80), + %q(\\U000EFFFF) => %Q(\xF3\xAF\xBF\xBF), + } + inputs.each do |input, output| + expect(EBNF::Unescape.unescape_codepoints(input)).to eq output + end + end + + context "escaped strings" do + { + 'Dürst' => 'D\\u00FCrst', + "é" => '\\u00E9', + "€" => '\\u20AC', + "resumé" => 'resum\\u00E9', + }.each_pair do |unescaped, escaped| + it "unescapes #{unescaped.inspect}" do + expect(EBNF::Unescape.unescape_codepoints(escaped)).to eq unescaped + end + end + end + end + + describe ".unescape_string" do + # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes + + context "escape sequences" do + EBNF::Unescape::ESCAPE_CHARS.each do |escaped, unescaped| + it "unescapes #{unescaped.inspect}" do + expect(EBNF::Unescape.unescape_string(escaped)).to eq unescaped + end + end + end + + context "escaped strings" do + { + 'simple literal' => 'simple literal', + 'backslash:\\' => 'backslash:\\\\', + 'dquote:"' => 'dquote:\\"', + "newline:\n" => 'newline:\\n', + "return\r" => 'return\\r', + "tab:\t" => 'tab:\\t', + }.each_pair do |unescaped, escaped| + it "unescapes #{unescaped.inspect}" do + expect(EBNF::Unescape.unescape_string(escaped)).to eq unescaped + end + end + end + end +end \ No newline at end of file From 26e69d8dd7e0c048bdcf79fbd0bf360ad83469d9 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Tue, 24 Aug 2021 12:59:31 -0700 Subject: [PATCH 5/8] Implement :map and 
:unescape options to PEG terminals. --- lib/ebnf/peg/parser.rb | 21 ++++++++++++++++++++- lib/ebnf/peg/rule.rb | 15 +++++++++++++-- spec/peg/rule_spec.rb | 2 +- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/lib/ebnf/peg/parser.rb b/lib/ebnf/peg/parser.rb index 95f635f..39bc140 100644 --- a/lib/ebnf/peg/parser.rb +++ b/lib/ebnf/peg/parser.rb @@ -55,6 +55,7 @@ def start_options; (@start_hoptions ||= {}); end def production_handlers; (@production_handlers ||= {}); end def terminal_handlers; (@terminal_handlers ||= {}); end def terminal_regexps; (@terminal_regexps ||= {}); end + def terminal_options; (@terminal_options ||= {}); end ## # Defines the pattern for a terminal node and a block to be invoked @@ -75,6 +76,8 @@ def terminal_regexps; (@terminal_regexps ||= {}); end # @option options [Hash{String => String}] :map ({}) # A mapping from terminals, in lower-case form, to # their canonical value + # @option options [Boolean] :unescape + # Cause strings and codepoints to be unescaped. # @yield [value, prod] # @yieldparam [String] value # The scanned terminal value. @@ -83,9 +86,11 @@ def terminal_regexps; (@terminal_regexps ||= {}); end # @yieldparam [Proc] block # Block passed to initialization for yielding to calling parser. # Should conform to the yield specs for #initialize + # @todo FIXME implement map and unescape def terminal(term, regexp = nil, **options, &block) terminal_regexps[term] = regexp if regexp terminal_handlers[term] = block if block_given? + terminal_options[term] = options end ## @@ -180,8 +185,12 @@ def method_missing(method, *args, &block) # Identify the symbol of the starting rule with `start`. # @param [Hash{Symbol => Object}] options # @option options[Integer] :high_water passed to lexer + # @option options[:upper, :lower] :insensitive_strings + # Perform case-insensitive match of strings not defined as terminals, and map to either upper or lower case. # @option options [Logger] :logger for errors/progress/debug. 
# @option options[Integer] :low_water passed to lexer + # @option options[Boolean] :seq_hash (false) + # If `true`, sets the default for the value sent to a production handler that is for a `seq` to a hash composed of the flattened constituent hashes that are otherwise provided. # @option options [Symbol, Regexp] :whitespace # Symbol of whitespace rule (defaults to `@pass`), or a regular expression # for eating whitespace between non-terminal rules (strongly encouraged). @@ -195,6 +204,7 @@ def method_missing(method, *args, &block) # @raise [Exception] Raises exceptions for parsing errors # or errors raised during processing callbacks. Internal # errors are raised using {Error}. + # @todo FIXME implement insensitive_strings and seq_hash def parse(input = nil, start = nil, rules = nil, **options, &block) start ||= options[:start] rules ||= options[:rules] || [] @@ -467,10 +477,19 @@ def find_rule(sym) # # @param [Symbol] sym # @return [Regexp] - def find_terminal_regexp(sym) + def terminal_regexp(sym) self.class.terminal_regexps[sym] end + ## + # Find the options defined for a terminal + # + # @param [Symbol] sym + # @return [Hash] + def terminal_options(sym) + self.class.terminal_options[sym] + end + ## # Record furthest failure. # diff --git a/lib/ebnf/peg/rule.rb b/lib/ebnf/peg/rule.rb index 115ba51..3626464 100644 --- a/lib/ebnf/peg/rule.rb +++ b/lib/ebnf/peg/rule.rb @@ -1,6 +1,8 @@ module EBNF::PEG # Behaviior for parsing a PEG rule module Rule + include ::EBNF::Unescape + ## # Initialized by parser when loading rules. # Used for finding rules and invoking elements of the parse process. 
@@ -45,9 +47,18 @@ def parse(input) # If the terminal is defined with a regular expression, # use that to match the input, # otherwise, - if regexp = parser.find_terminal_regexp(sym) - matched = input.scan(regexp) + if regexp = parser.terminal_regexp(sym) + term_opts = parser.terminal_options(sym) + if matched = input.scan(regexp) + # Optionally map matched + matched = term_opts.fetch(:map, {}).fetch(matched.downcase, matched) + + # Optionally unescape matched + matched = unescape(matched) if term_opts[:unescape] + end + result = parser.onTerminal(sym, (matched ? matched : :unmatched)) + # Update furthest failure for strings and terminals parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched parser.packrat[sym][pos] = { diff --git a/spec/peg/rule_spec.rb b/spec/peg/rule_spec.rb index 783f887..edd86d7 100644 --- a/spec/peg/rule_spec.rb +++ b/spec/peg/rule_spec.rb @@ -449,7 +449,7 @@ expect(parser).to receive(:onStart).with(Symbol).and_return({}) expect(parser).to receive(:onFinish).with(params[:expect]).and_return(params[:expect]) expect(parser).not_to receive(:onTerminal) - expect(parser).to receive(:find_terminal_regexp).with(:rule) + expect(parser).to receive(:terminal_regexp).with(:rule) expect(rule.parse(EBNF::LL1::Scanner.new(params[:input]))).to eql(params[:expect]) end From 343951cccc20ed6cf82df3a164359315c751078a Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Tue, 24 Aug 2021 13:49:16 -0700 Subject: [PATCH 6/8] Allow start_production for PEG to specify case insensitive_strings, mapping to either :upper or :lower. 
--- lib/ebnf/peg/parser.rb | 10 +++------- lib/ebnf/peg/rule.rb | 20 +++++++++++--------- spec/peg/parser_spec.rb | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/lib/ebnf/peg/parser.rb b/lib/ebnf/peg/parser.rb index 39bc140..65307a7 100644 --- a/lib/ebnf/peg/parser.rb +++ b/lib/ebnf/peg/parser.rb @@ -73,9 +73,6 @@ def terminal_options; (@terminal_options ||= {}); end # defaults to the expression defined in the associated rule. # If unset, the terminal rule is used for matching. # @param [Hash] options - # @option options [Hash{String => String}] :map ({}) - # A mapping from terminals, in lower-case form, to - # their canonical value # @option options [Boolean] :unescape # Cause strings and codepoints to be unescaped. # @yield [value, prod] @@ -86,7 +83,6 @@ def terminal_options; (@terminal_options ||= {}); end # @yieldparam [Proc] block # Block passed to initialization for yielding to calling parser. # Should conform to the yield specs for #initialize - # @todo FIXME implement map and unescape def terminal(term, regexp = nil, **options, &block) terminal_regexps[term] = regexp if regexp terminal_handlers[term] = block if block_given? @@ -105,6 +101,8 @@ def terminal(term, regexp = nil, **options, &block) # Options which are returned from {Parser#onStart}. # @option options [Boolean] :as_hash (false) # If the production is a `seq`, causes the value to be represented as a single hash, rather than an array of individual hashes for each sub-production. Note that this is not always advisable due to the possibility of repeated productions within the sequence. + # @option options[:upper, :lower] :insensitive_strings + # Perform case-insensitive match of strings not defined as terminals, and map to either upper or lower case. 
# @yield [data, block] # @yieldparam [Hash] data # A Hash defined for the current production, during :start @@ -185,8 +183,6 @@ def method_missing(method, *args, &block) # Identify the symbol of the starting rule with `start`. # @param [Hash{Symbol => Object}] options # @option options[Integer] :high_water passed to lexer - # @option options[:upper, :lower] :insensitive_strings - # Perform case-insensitive match of strings not defined as terminals, and map to either upper or lower case. # @option options [Logger] :logger for errors/progress/debug. # @option options[Integer] :low_water passed to lexer # @option options[Boolean] :seq_hash (false) @@ -204,7 +200,7 @@ def method_missing(method, *args, &block) # @raise [Exception] Raises exceptions for parsing errors # or errors raised during processing callbacks. Internal # errors are raised using {Error}. - # @todo FIXME implement insensitive_strings and seq_hash + # @todo FIXME implement seq_hash def parse(input = nil, start = nil, rules = nil, **options, &block) start ||= options[:start] rules ||= options[:rules] || [] diff --git a/lib/ebnf/peg/rule.rb b/lib/ebnf/peg/rule.rb index 3626464..dec0921 100644 --- a/lib/ebnf/peg/rule.rb +++ b/lib/ebnf/peg/rule.rb @@ -72,6 +72,7 @@ def parse(input) eat_whitespace(input) end start_options = parser.onStart(sym) + string_regexp_opts = start_options[:insensitive_strings] ? 
Regexp::IGNORECASE : 0 result = case expr.first when :alt @@ -85,7 +86,7 @@ def parse(input) raise "No rule found for #{prod}" unless rule rule.parse(input) when String - input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched + input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched end if alt == :unmatched # Update furthest failure for strings and terminals @@ -123,7 +124,7 @@ def parse(input) raise "No rule found for #{prod}" unless rule rule.parse(input) when String - input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched + input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched end if res != :unmatched # Update furthest failure for terminals @@ -134,7 +135,7 @@ def parse(input) end when :opt # Result is the matched value or nil - opt = rept(input, 0, 1, expr[1]) + opt = rept(input, 0, 1, expr[1], string_regexp_opts) # Update furthest failure for strings and terminals parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal? @@ -142,7 +143,7 @@ def parse(input) when :plus # Result is an array of all expressions while they match, # at least one must match - plus = rept(input, 1, '*', expr[1]) + plus = rept(input, 1, '*', expr[1], string_regexp_opts) # Update furthest failure for strings and terminals parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal? @@ -157,7 +158,7 @@ def parse(input) when :rept # Result is an array of all expressions while they match, # an empty array of none match - rept = rept(input, expr[1], expr[2], expr[3]) + rept = rept(input, expr[1], expr[2], expr[3], string_regexp_opts) # # Update furthest failure for strings and terminals parser.update_furthest_failure(input.pos, input.lineno, expr[3]) if terminal? 
@@ -172,7 +173,7 @@ def parse(input) raise "No rule found for #{prod}" unless rule rule.parse(input) when String - input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched + input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched end if res == :unmatched # Update furthest failure for strings and terminals @@ -193,7 +194,7 @@ def parse(input) when :star # Result is an array of all expressions while they match, # an empty array of none match - star = rept(input, 0, '*', expr[1]) + star = rept(input, 0, '*', expr[1], string_regexp_opts) # Update furthest failure for strings and terminals parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal? @@ -225,8 +226,9 @@ def parse(input) # @param [Integer] max # If it is an integer, it stops matching after max entries. # @param [Symbol, String] prod + # @param [Integer] string_regexp_opts # @return [:unmatched, Array] - def rept(input, min, max, prod) + def rept(input, min, max, prod, string_regexp_opts) result = [] case prod @@ -238,7 +240,7 @@ def rept(input, min, max, prod) result << res end when String - while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max) + while (res = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))) && (max == '*' || result.length < max) eat_whitespace(input) unless terminal? 
result << res end diff --git a/spec/peg/parser_spec.rb b/spec/peg/parser_spec.rb index 5fffbc5..74507fd 100644 --- a/spec/peg/parser_spec.rb +++ b/spec/peg/parser_spec.rb @@ -11,6 +11,8 @@ class PegParserTest before(:all) { PegParserTest.start_production(:term) {"foo"} PegParserTest.production(:term) {"foo"} + PegParserTest.start_production(:toLower) {|value| value} + PegParserTest.start_production(:toUpper) {|value| value} PegParserTest.terminal(:escape, /escape/) {"foo"} PegParserTest.terminal(:unescape, /unescape/, unescape: true) {"foo"} } @@ -94,6 +96,28 @@ class PegParserTest end end + context "case insensitive string matching" do + let(:start) {:expression} + let(:grammar) {%{( + (rule expression "1" (alt upper lower)) + (rule upper "2" (seq "uPpEr")) + (rule lower "3" (seq "LoWeR")) + )}} + let(:rules) {EBNF.parse(grammar, format: :sxp).make_peg.ast} + + { + "UPPER" => "UPPER", + "upper" => "UPPER", + "LOWER" => "lower", + "lower" => "lower", + }.each do |input, expected| + it "parses #{input.inspect} to #{expected.inspect}" do + output = PegParserTest.new.parse(input, start, rules, debug: 3, logger: logger) + expect(output).to produce(expected, logger) + end + end + end + context "with backtracking" do let(:start) {:expression} let(:grammar) {%{( From cf89c77f444ef0599ffb58539d02e6204375178b Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Wed, 25 Aug 2021 14:33:58 -0700 Subject: [PATCH 7/8] Actually do transformation of matched string when insensitive_strings is set. 
--- lib/ebnf/peg/parser.rb | 2 +- lib/ebnf/peg/rule.rb | 24 +++++++++++++++++++----- spec/peg/parser_spec.rb | 20 ++++++++++---------- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/lib/ebnf/peg/parser.rb b/lib/ebnf/peg/parser.rb index 65307a7..79a7270 100644 --- a/lib/ebnf/peg/parser.rb +++ b/lib/ebnf/peg/parser.rb @@ -86,7 +86,7 @@ def terminal_options; (@terminal_options ||= {}); end def terminal(term, regexp = nil, **options, &block) terminal_regexps[term] = regexp if regexp terminal_handlers[term] = block if block_given? - terminal_options[term] = options + terminal_options[term] = options.freeze end ## diff --git a/lib/ebnf/peg/rule.rb b/lib/ebnf/peg/rule.rb index dec0921..305543a 100644 --- a/lib/ebnf/peg/rule.rb +++ b/lib/ebnf/peg/rule.rb @@ -86,7 +86,12 @@ def parse(input) raise "No rule found for #{prod}" unless rule rule.parse(input) when String - input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched + s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) + case start_options[:insensitive_strings] + when :lower then s && s.downcase + when :upper then s && s.upcase + else s + end || :unmatched end if alt == :unmatched # Update furthest failure for strings and terminals @@ -135,7 +140,7 @@ def parse(input) end when :opt # Result is the matched value or nil - opt = rept(input, 0, 1, expr[1], string_regexp_opts) + opt = rept(input, 0, 1, expr[1], string_regexp_opts, **start_options) # Update furthest failure for strings and terminals parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal? 
@@ -173,7 +178,12 @@ def parse(input) raise "No rule found for #{prod}" unless rule rule.parse(input) when String - input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched + s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) + case start_options[:insensitive_strings] + when :lower then s && s.downcase + when :upper then s && s.upcase + else s + end || :unmatched end if res == :unmatched # Update furthest failure for strings and terminals @@ -228,7 +238,7 @@ def parse(input) # @param [Symbol, String] prod # @param [Integer] string_regexp_opts # @return [:unmatched, Array] - def rept(input, min, max, prod, string_regexp_opts) + def rept(input, min, max, prod, string_regexp_opts, **options) result = [] case prod @@ -242,7 +252,11 @@ def rept(input, min, max, prod, string_regexp_opts) when String while (res = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))) && (max == '*' || result.length < max) eat_whitespace(input) unless terminal? - result << res + result << case options[:insensitive_strings] + when :lower then res.downcase + when :upper then res.upcase + else res + end end end diff --git a/spec/peg/parser_spec.rb b/spec/peg/parser_spec.rb index 74507fd..9d31d3d 100644 --- a/spec/peg/parser_spec.rb +++ b/spec/peg/parser_spec.rb @@ -11,8 +11,8 @@ class PegParserTest before(:all) { PegParserTest.start_production(:term) {"foo"} PegParserTest.production(:term) {"foo"} - PegParserTest.start_production(:toLower) {|value| value} - PegParserTest.start_production(:toUpper) {|value| value} + PegParserTest.start_production(:toLower, insensitive_strings: :lower) {|value| value} + PegParserTest.start_production(:toUpper, insensitive_strings: :upper) {|value| value} PegParserTest.terminal(:escape, /escape/) {"foo"} PegParserTest.terminal(:unescape, /unescape/, unescape: true) {"foo"} } @@ -24,7 +24,7 @@ class PegParserTest describe "ClassMethods" do describe "production" do it "adds as a start_handler" do - 
expect(PegParserTest.start_handlers.keys).to eq [:term] + expect(PegParserTest.start_handlers.keys).to eq [:term, :toLower, :toUpper] expect(PegParserTest.start_handlers[:term]).to be_a(Proc) end it "adds as a production_handler" do @@ -99,17 +99,17 @@ class PegParserTest context "case insensitive string matching" do let(:start) {:expression} let(:grammar) {%{( - (rule expression "1" (alt upper lower)) - (rule upper "2" (seq "uPpEr")) - (rule lower "3" (seq "LoWeR")) + (rule expression "1" (alt toUpper toLower)) + (rule toUpper "2" (seq "uPpEr")) + (rule toLower "3" (seq "LoWeR")) )}} let(:rules) {EBNF.parse(grammar, format: :sxp).make_peg.ast} { - "UPPER" => "UPPER", - "upper" => "UPPER", - "LOWER" => "lower", - "lower" => "lower", + "UPPER" => [{uPpEr: "UPPER"}], + "upper" => [{uPpEr: "UPPER"}], + "LOWER" => [{LoWeR: "lower"}], + "lower" => [{LoWeR: "lower"}], }.each do |input, expected| it "parses #{input.inspect} to #{expected.inspect}" do output = PegParserTest.new.parse(input, start, rules, debug: 3, logger: logger) From f1c081921786b2b2ef0255ee145b6361bd49c848 Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Wed, 25 Aug 2021 14:38:28 -0700 Subject: [PATCH 8/8] Version 2.2.0. --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ac2cdeb..ccbccc3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1.3 +2.2.0