Skip to content

Commit

Permalink
Finish 2.2.1
Browse files Browse the repository at this point in the history
  • Loading branch information
gkellogg committed Aug 25, 2021
2 parents 4d5372b + a98eb19 commit cb90783
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 1 deletion.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.2.0
2.2.1
24 changes: 24 additions & 0 deletions lib/ebnf/ll1/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,30 @@ class Lexer
# @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
attr_reader :whitespace

##
# Returns a copy of the given `string` with all `\uXXXX` and
# `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
# unescaped UTF-8 character counterparts.
#
# @param [String] string
# @return [String]
# @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
def self.unescape_codepoints(string)
  # Class-level convenience entry point: the work is done by the
  # top-level EBNF::Unescape module (the leading `::` anchors the constant
  # lookup at the root namespace). The argument is handed over untouched and
  # whatever the delegate returns is returned to the caller.
  unescaper = ::EBNF::Unescape
  unescaper.unescape_codepoints(string)
end

##
# Returns a copy of the given `input` string with all string escape
# sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
# character counterparts.
#
# @param [String] input
# @return [String]
# @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
def self.unescape_string(input)
  # Class-level convenience entry point: the work is done by the
  # top-level EBNF::Unescape module (the leading `::` anchors the constant
  # lookup at the root namespace). The argument is handed over untouched and
  # whatever the delegate returns is returned to the caller.
  unescaper = ::EBNF::Unescape
  unescaper.unescape_string(input)
end

##
# Tokenizes the given `input` string or stream.
#
Expand Down
69 changes: 69 additions & 0 deletions spec/ll1/lexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,75 @@
:STRING_LITERAL_QUOTE, :STRING_LITERAL_SINGLE_QUOTE,
:STRING_LITERAL_LONG_SINGLE_QUOTE, :STRING_LITERAL_LONG_QUOTE
]}

describe ".unescape_codepoints" do
  # Exercises Lexer.unescape_codepoints against codepoint escapes.
  # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape

  it "unescapes \\uXXXX codepoint escape sequences" do
    # Escaped input => expected text, spelled out as raw UTF-8 bytes.
    cases = {
      %q(\\u0020) => %q( ),
      %q(<ab\\u00E9xy>) => %Q(<ab\xC3\xA9xy>),
      %q(\\u03B1:a) => %Q(\xCE\xB1:a),
      %q(a\\u003Ab) => %Q(a\x3Ab),
    }
    cases.each_pair do |escaped, expected|
      # Label the expected bytes as UTF-8 before comparing.
      expected.force_encoding(Encoding::UTF_8)
      expect(EBNF::LL1::Lexer.unescape_codepoints(escaped)).to eq(expected)
    end
  end

  it "unescapes \\UXXXXXXXX codepoint escape sequences" do
    # Eight-digit form, including codepoints that need four UTF-8 bytes.
    cases = {
      %q(\\U00000020) => %q( ),
      %q(\\U00010000) => %Q(\xF0\x90\x80\x80),
      %q(\\U000EFFFF) => %Q(\xF3\xAF\xBF\xBF),
    }
    cases.each_pair do |escaped, expected|
      # Label the expected bytes as UTF-8 before comparing.
      expected.force_encoding(Encoding::UTF_8)
      expect(EBNF::LL1::Lexer.unescape_codepoints(escaped)).to eq(expected)
    end
  end

  context "escaped strings" do
    # Expected (unescaped) text => escaped form; one example is generated per pair.
    examples = {
      'Dürst' => 'D\\u00FCrst',
      "é" => '\\u00E9',
      "€" => '\\u20AC',
      "resumé" => 'resum\\u00E9',
    }
    examples.each do |unescaped, escaped|
      it "unescapes #{unescaped.inspect}" do
        expect(EBNF::LL1::Lexer.unescape_codepoints(escaped)).to eq(unescaped)
      end
    end
  end
end

describe ".unescape_string" do
  # Exercises Lexer.unescape_string against string escape sequences.
  # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes

  context "escape sequences" do
    # One example per entry of ESCAPE_CHARS (escaped form, then expected result).
    EBNF::LL1::Lexer::ESCAPE_CHARS.each do |escaped, unescaped|
      it "unescapes #{unescaped.inspect}" do
        expect(EBNF::LL1::Lexer.unescape_string(escaped)).to eq(unescaped)
      end
    end
  end

  context "escaped strings" do
    # Expected (unescaped) text => escaped form; one example is generated per pair.
    examples = {
      'simple literal' => 'simple literal',
      'backslash:\\' => 'backslash:\\\\',
      'dquote:"' => 'dquote:\\"',
      "newline:\n" => 'newline:\\n',
      "return\r" => 'return\\r',
      "tab:\t" => 'tab:\\t',
    }
    examples.each do |unescaped, escaped|
      it "unescapes #{unescaped.inspect}" do
        expect(EBNF::LL1::Lexer.unescape_string(escaped)).to eq(unescaped)
      end
    end
  end
end

describe ".tokenize" do
context "numeric literals" do
Expand Down

0 comments on commit cb90783

Please sign in to comment.