Finish 2.2.1

dryruby · Aug 25, 2021 · cb90783 · cb90783
2 parents 4d5372b + a98eb19
commit cb90783
Show file tree

Hide file tree

Showing 3 changed files with 94 additions and 1 deletion.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.2.0
+2.2.1
diff --git a/lib/ebnf/ll1/lexer.rb b/lib/ebnf/ll1/lexer.rb
@@ -38,6 +38,30 @@ class Lexer
     # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
     attr_reader :whitespace
 
+    ##
+    # Returns a copy of the given `string` with all `\uXXXX` and
+    # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
+    # unescaped UTF-8 character counterparts.
+    #
+    # @param  [String] string
+    # @return [String]
+    # @see    https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
+    def self.unescape_codepoints(string)
+      ::EBNF::Unescape.unescape_codepoints(string)
+    end
+
+    ##
+    # Returns a copy of the given `input` string with all string escape
+    # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
+    # character counterparts.
+    #
+    # @param  [String] input
+    # @return [String]
+    # @see    https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
+    def self.unescape_string(input)
+      ::EBNF::Unescape.unescape_string(input)
+    end
+
     ##
     # Tokenizes the given `input` string or stream.
     #

diff --git a/spec/ll1/lexer_spec.rb b/spec/ll1/lexer_spec.rb
@@ -25,6 +25,75 @@
     :STRING_LITERAL_QUOTE, :STRING_LITERAL_SINGLE_QUOTE,
     :STRING_LITERAL_LONG_SINGLE_QUOTE, :STRING_LITERAL_LONG_QUOTE
   ]}
+
+  describe ".unescape_codepoints" do
+    # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
+
+    it "unescapes \\uXXXX codepoint escape sequences" do
+      inputs = {
+        %q(\\u0020)       => %q( ),
+        %q(<ab\\u00E9xy>) => %Q(<ab\xC3\xA9xy>),
+        %q(\\u03B1:a)     => %Q(\xCE\xB1:a),
+        %q(a\\u003Ab)     => %Q(a\x3Ab),
+      }
+      inputs.each do |input, output|
+        output.force_encoding(Encoding::UTF_8)
+        expect(EBNF::LL1::Lexer.unescape_codepoints(input)).to eq output
+      end
+    end
+
+    it "unescapes \\UXXXXXXXX codepoint escape sequences" do
+      inputs = {
+        %q(\\U00000020)   => %q( ),
+        %q(\\U00010000)   => %Q(\xF0\x90\x80\x80),
+        %q(\\U000EFFFF)   => %Q(\xF3\xAF\xBF\xBF),
+      }
+      inputs.each do |input, output|
+        output.force_encoding(Encoding::UTF_8)
+        expect(EBNF::LL1::Lexer.unescape_codepoints(input)).to eq output
+      end
+    end
+
+    context "escaped strings" do
+      {
+        'Dürst' => 'D\\u00FCrst',
+        "é" => '\\u00E9',
+        "€" => '\\u20AC',
+        "resumé" => 'resum\\u00E9',
+      }.each_pair do |unescaped, escaped|
+        it "unescapes #{unescaped.inspect}" do
+          expect(EBNF::LL1::Lexer.unescape_codepoints(escaped)).to eq unescaped
+        end
+      end
+    end
+  end
+
+  describe ".unescape_string" do
+    # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
+
+    context "escape sequences" do
+      EBNF::LL1::Lexer::ESCAPE_CHARS.each do |escaped, unescaped|
+        it "unescapes #{unescaped.inspect}" do
+          expect(EBNF::LL1::Lexer.unescape_string(escaped)).to eq unescaped
+        end
+      end
+    end
+
+    context "escaped strings" do
+      {
+        'simple literal' => 'simple literal',
+        'backslash:\\' => 'backslash:\\\\',
+        'dquote:"' => 'dquote:\\"',
+        "newline:\n" => 'newline:\\n',
+        "return\r" => 'return\\r',
+        "tab:\t" => 'tab:\\t',
+      }.each_pair do |unescaped, escaped|
+        it "unescapes #{unescaped.inspect}" do
+          expect(EBNF::LL1::Lexer.unescape_string(escaped)).to eq unescaped
+        end
+      end
+    end
+  end
 
   describe ".tokenize" do
     context "numeric literals" do