Skip to content

Commit

Permalink
Finish 2.2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
gkellogg committed Aug 25, 2021
2 parents 2c3938c + f1c0819 commit 4d5372b
Show file tree
Hide file tree
Showing 17 changed files with 249 additions and 180 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/ci.yml
Expand Up @@ -15,11 +15,11 @@ jobs:
runs-on: ubuntu-latest
env:
CI: true
ALLOW_FAILURES: ${{ endsWith(matrix.ruby, 'head') }}
strategy:
fail-fast: false
matrix:
ruby:
- 2.4
- 2.5
- 2.6
- 2.7
Expand All @@ -36,5 +36,9 @@ jobs:
- name: Install dependencies
run: bundle install --jobs 4 --retry 3
- name: Run tests
run: bundle exec rspec spec

run: bundle exec rspec spec || $ALLOW_FAILURES
- name: Coveralls GitHub Action
uses: coverallsapp/github-action@v1.1.2
if: "matrix.ruby == '3.0'"
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
20 changes: 0 additions & 20 deletions .travis.yml

This file was deleted.

5 changes: 2 additions & 3 deletions Gemfile
Expand Up @@ -6,7 +6,6 @@ gem 'rdf', github: "ruby-rdf/rdf", branch: "develop"

group :development do
gem 'rdf-spec', github: "ruby-rdf/rdf-spec", branch: "develop"
gem "nokogumbo", platforms: :mri
gem "byebug", platforms: :mri
gem 'psych', platforms: [:mri, :rbx]
gem "redcarpet", platforms: :mri
Expand All @@ -15,6 +14,6 @@ group :development do
end

group :development, :test do
gem 'simplecov', platforms: :mri
gem 'coveralls', '~> 0.8', platforms: :mri
gem 'simplecov', '~> 0.21', platforms: :mri
gem 'simplecov-lcov', '~> 0.8', platforms: :mri
end
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -93,7 +93,7 @@ Inevitably while implementing a parser for some specific grammar, a developer wi

The {EBNF::Writer} class can be used to write parsed grammars out, either as formatted text, or HTML. Because grammars are written from the Abstract Syntax Tree, represented as [S-Expressions][S-Expression], this provides a means of transforming between grammar formats (e.g., W3C [EBNF][] to [ABNF][]), although with some potential loss in semantic fidelity (case-insensitive string matching vs. case-sensitive matching).

The formatted HTML results are designed to be appropriate for including in specifications. If the [Nokogumbo](https://rubygems.org/gems/nokogumbo) gem is available, the resulting HTML encoded grammar will also be validated.
The formatted HTML results are designed to be appropriate for including in specifications.

### Parser Errors
On a parsing failure, an exception is raised with information that may be useful in determining the source of the error.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
2.1.3
2.2.0
1 change: 1 addition & 0 deletions lib/ebnf.rb
Expand Up @@ -9,6 +9,7 @@ module EBNF
autoload :PEG, "ebnf/peg"
autoload :Rule, "ebnf/rule"
autoload :Terminals,"ebnf/terminals"
autoload :Unescape, "ebnf/unescape"
autoload :Writer, "ebnf/writer"
autoload :VERSION, "ebnf/version"

Expand Down
52 changes: 2 additions & 50 deletions lib/ebnf/ll1/lexer.rb
Expand Up @@ -32,60 +32,12 @@ module EBNF::LL1
# @see https://en.wikipedia.org/wiki/Lexical_analysis
class Lexer
include Enumerable

ESCAPE_CHARS = {
'\\t' => "\t", # \u0009 (tab)
'\\n' => "\n", # \u000A (line feed)
'\\r' => "\r", # \u000D (carriage return)
'\\b' => "\b", # \u0008 (backspace)
'\\f' => "\f", # \u000C (form feed)
'\\"' => '"', # \u0022 (quotation mark, double quote mark)
"\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
'\\\\' => '\\' # \u005C (backslash)
}.freeze
ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze # \uXXXX
ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze # \UXXXXXXXX
ECHAR = /\\./u.freeze # More liberal unescaping
UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
include ::EBNF::Unescape

##
# @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
attr_reader :whitespace

##
# Returns a copy of the given `string` with all `\uXXXX` and
# `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
# unescaped UTF-8 character counterparts.
#
# The argument itself is not modified; all work is done on a duplicate.
#
# @param [String] string
# @return [String] a new string, tagged as UTF-8
# @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
def self.unescape_codepoints(string)
string = string.dup
# Treat the copy as raw bytes while substituting, so the binary (/n) UCHAR
# pattern applies and the replacement bytes can be spliced in.
string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)

# Decode \uXXXX and \UXXXXXXXX code points:
string = string.gsub(UCHAR) do |c|
# c[2..-1] drops the leading `\u`/`\U`, leaving only the hex digits;
# pack('U*') encodes that code point as UTF-8.
s = [(c[2..-1]).hex].pack('U*')
# Re-tag the replacement as binary to match the string it is inserted into.
s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
end

# Relabel the resulting bytes as UTF-8 before returning.
string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding)
string
end

##
# Returns a copy of the given `input` string with all string escape
# sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
# character counterparts.
#
# Escapes listed in `ESCAPE_CHARS` are replaced by the mapped character;
# any other backslash-escaped character is replaced by that character
# alone (the backslash is dropped).
#
# @param [String] input
# @return [String]
# @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
def self.unescape_string(input)
input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
end

##
# Tokenizes the given `input` string or stream.
#
Expand Down Expand Up @@ -338,7 +290,7 @@ def ==(other)
# @return [String]
def unescape(string)
if @options[:unescape]
Lexer.unescape_string(Lexer.unescape_codepoints(string))
EBNF::Unescape.unescape(string)
else
string
end
Expand Down
4 changes: 2 additions & 2 deletions lib/ebnf/native.rb
Expand Up @@ -287,10 +287,10 @@ def terminal(s)
case m = s[0,1]
when '"', "'" # STRING1 or STRING2
l, s = s[1..-1].split(m.rstrip, 2)
[LL1::Lexer.unescape_string(l), s]
[Unescape.unescape_string(l), s]
when '[' # RANGE, O_RANGE
l, s = s[1..-1].split(/(?<=[^\\])\]/, 2)
[[:range, LL1::Lexer.unescape_string(l)], s]
[[:range, Unescape.unescape_string(l)], s]
when '#' # HEX
s.match(/(#x\h+)(.*)$/)
l, s = $1, $2
Expand Down
23 changes: 19 additions & 4 deletions lib/ebnf/peg/parser.rb
Expand Up @@ -55,6 +55,7 @@ def start_options; (@start_hoptions ||= {}); end
# Lazily-created hash holding production handlers.
def production_handlers; (@production_handlers ||= {}); end
# Lazily-created hash of terminal handler blocks, keyed by terminal.
def terminal_handlers; (@terminal_handlers ||= {}); end
# Lazily-created hash of terminal regular expressions, keyed by terminal.
def terminal_regexps; (@terminal_regexps ||= {}); end
# Lazily-created hash of per-terminal option hashes, keyed by terminal.
def terminal_options; (@terminal_options ||= {}); end

##
# Defines the pattern for a terminal node and a block to be invoked
Expand All @@ -72,9 +73,8 @@ def terminal_regexps; (@terminal_regexps ||= {}); end
# defaults to the expression defined in the associated rule.
# If unset, the terminal rule is used for matching.
# @param [Hash] options
# @option options [Hash{String => String}] :map ({})
# A mapping from terminals, in lower-case form, to
# their canonical value
# @option options [Boolean] :unescape
# Cause strings and codepoints to be unescaped.
# @yield [value, prod]
# @yieldparam [String] value
# The scanned terminal value.
Expand All @@ -86,6 +86,7 @@ def terminal_regexps; (@terminal_regexps ||= {}); end
def terminal(term, regexp = nil, **options, &block)
# Record the pattern and handler only when supplied, so an omitted
# argument does not overwrite an earlier registration for this terminal.
terminal_regexps[term] = regexp if regexp
terminal_handlers[term] = block if block_given?
# Options are always recorded (frozen), replacing any previously stored
# for this terminal.
terminal_options[term] = options.freeze
end

##
Expand All @@ -100,6 +101,8 @@ def terminal(term, regexp = nil, **options, &block)
# Options which are returned from {Parser#onStart}.
# @option options [Boolean] :as_hash (false)
# If the production is a `seq`, causes the value to be represented as a single hash, rather than an array of individual hashes for each sub-production. Note that this is not always advisable due to the possibility of repeated productions within the sequence.
# @option options [:upper, :lower] :insensitive_strings
# Perform case-insensitive match of strings not defined as terminals, and map to either upper or lower case.
# @yield [data, block]
# @yieldparam [Hash] data
# A Hash defined for the current production, during :start
Expand Down Expand Up @@ -182,6 +185,8 @@ def method_missing(method, *args, &block)
# @option options [Integer] :high_water passed to lexer
# @option options [Logger] :logger for errors/progress/debug.
# @option options [Integer] :low_water passed to lexer
# @option options [Boolean] :seq_hash (false)
# If `true`, sets the default for the value sent to a production handler that is for a `seq` to a hash composed of the flattened constituent hashes that are otherwise provided.
# @option options [Symbol, Regexp] :whitespace
# Symbol of whitespace rule (defaults to `@pass`), or a regular expression
# for eating whitespace between non-terminal rules (strongly encouraged).
Expand All @@ -195,6 +200,7 @@ def method_missing(method, *args, &block)
# @raise [Exception] Raises exceptions for parsing errors
# or errors raised during processing callbacks. Internal
# errors are raised using {Error}.
# @todo FIXME implement seq_hash
def parse(input = nil, start = nil, rules = nil, **options, &block)
start ||= options[:start]
rules ||= options[:rules] || []
Expand Down Expand Up @@ -467,10 +473,19 @@ def find_rule(sym)
#
# @param [Symbol] sym
# @return [Regexp]
def find_terminal_regexp(sym)
def terminal_regexp(sym)
self.class.terminal_regexps[sym]
end

##
# Find the options defined for a terminal
#
# @param [Symbol] sym
# @return [Hash, nil] the options registered for `sym`, or `nil` if none were registered
def terminal_options(sym)
self.class.terminal_options[sym]
end

##
# Record furthest failure.
#
Expand Down
51 changes: 39 additions & 12 deletions lib/ebnf/peg/rule.rb
@@ -1,6 +1,8 @@
module EBNF::PEG
# Behavior for parsing a PEG rule
module Rule
include ::EBNF::Unescape

##
# Initialized by parser when loading rules.
# Used for finding rules and invoking elements of the parse process.
Expand Down Expand Up @@ -45,9 +47,18 @@ def parse(input)
# If the terminal is defined with a regular expression,
# use that to match the input,
# otherwise,
if regexp = parser.find_terminal_regexp(sym)
matched = input.scan(regexp)
if regexp = parser.terminal_regexp(sym)
term_opts = parser.terminal_options(sym)
if matched = input.scan(regexp)
# Optionally map matched
matched = term_opts.fetch(:map, {}).fetch(matched.downcase, matched)

# Optionally unescape matched
matched = unescape(matched) if term_opts[:unescape]
end

result = parser.onTerminal(sym, (matched ? matched : :unmatched))

# Update furthest failure for strings and terminals
parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
parser.packrat[sym][pos] = {
Expand All @@ -61,6 +72,7 @@ def parse(input)
eat_whitespace(input)
end
start_options = parser.onStart(sym)
string_regexp_opts = start_options[:insensitive_strings] ? Regexp::IGNORECASE : 0

result = case expr.first
when :alt
Expand All @@ -74,7 +86,12 @@ def parse(input)
raise "No rule found for #{prod}" unless rule
rule.parse(input)
when String
input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
case start_options[:insensitive_strings]
when :lower then s && s.downcase
when :upper then s && s.upcase
else s
end || :unmatched
end
if alt == :unmatched
# Update furthest failure for strings and terminals
Expand Down Expand Up @@ -112,7 +129,7 @@ def parse(input)
raise "No rule found for #{prod}" unless rule
rule.parse(input)
when String
input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched
end
if res != :unmatched
# Update furthest failure for terminals
Expand All @@ -123,15 +140,15 @@ def parse(input)
end
when :opt
# Result is the matched value or nil
opt = rept(input, 0, 1, expr[1])
opt = rept(input, 0, 1, expr[1], string_regexp_opts, **start_options)

# Update furthest failure for strings and terminals
parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
opt.first
when :plus
# Result is an array of all expressions while they match,
# at least one must match
plus = rept(input, 1, '*', expr[1])
plus = rept(input, 1, '*', expr[1], string_regexp_opts)

# Update furthest failure for strings and terminals
parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
Expand All @@ -146,7 +163,7 @@ def parse(input)
when :rept
# Result is an array of all expressions while they match,
# an empty array if none match
rept = rept(input, expr[1], expr[2], expr[3])
rept = rept(input, expr[1], expr[2], expr[3], string_regexp_opts)

# # Update furthest failure for strings and terminals
parser.update_furthest_failure(input.pos, input.lineno, expr[3]) if terminal?
Expand All @@ -161,7 +178,12 @@ def parse(input)
raise "No rule found for #{prod}" unless rule
rule.parse(input)
when String
input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
case start_options[:insensitive_strings]
when :lower then s && s.downcase
when :upper then s && s.upcase
else s
end || :unmatched
end
if res == :unmatched
# Update furthest failure for strings and terminals
Expand All @@ -182,7 +204,7 @@ def parse(input)
when :star
# Result is an array of all expressions while they match,
# an empty array if none match
star = rept(input, 0, '*', expr[1])
star = rept(input, 0, '*', expr[1], string_regexp_opts)

# Update furthest failure for strings and terminals
parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
Expand Down Expand Up @@ -214,8 +236,9 @@ def parse(input)
# @param [Integer] max
# If it is an integer, it stops matching after max entries.
# @param [Symbol, String] prod
# @param [Integer] string_regexp_opts
# @return [:unmatched, Array]
def rept(input, min, max, prod)
def rept(input, min, max, prod, string_regexp_opts, **options)
result = []

case prod
Expand All @@ -227,9 +250,13 @@ def rept(input, min, max, prod)
result << res
end
when String
while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max)
while (res = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))) && (max == '*' || result.length < max)
eat_whitespace(input) unless terminal?
result << res
result << case options[:insensitive_strings]
when :lower then res.downcase
when :upper then res.upcase
else res
end
end
end

Expand Down

0 comments on commit 4d5372b

Please sign in to comment.