Caret not being treated as literal in rust-onig #188

sargas · 2024-04-02T03:32:10Z

I believe there is a discrepancy between Oniguruma in C and this Rust wrapper.

In Rust, the following code:

        // Compile the pattern "a^b" with grep syntax and no extra options.
        let re = Regex::with_options(
            "a^b",
            RegexOptions::REGEX_OPTION_NONE,
            Syntax::grep(),
        ).unwrap();
        // Search the subject "a^b" with the compiled regex and print the result.
        dbg!(re.find("a^b"));

gives the following output (with print-debug):

PATTERN: /a^b/
OPTIONS:
MAX PARSE DEPTH: 2
TREE (after tune)
<list:0x7b9794001990>
   <string:0x7b9794001940>a
   <anchor:0x7b97940019e0> begin line
   <string:0x7b9794001a80>b

....
[....:788:9] re.find("a^b") = None

This is not how grep treats ^ in the middle of a regex; I expected the parsed tree to look something like:

TREE (after tune)
<string:0x5a86ca39ea40>a^b

I'm reporting this in rust-onig because I can't reproduce it in the C library. Based on Oniguruma's simple example, I put together this code in C:

    /*
     * Reproduction against the C library: compile the pattern "a^b" with
     * ONIG_SYNTAX_GREP and search the subject string "a^b" with it.
     */
    int r;
    unsigned char *start, *range, *end;
    regex_t* reg;
    OnigErrorInfo einfo;
    OnigRegion *region;
    OnigEncoding use_encs[1];

    /* The pattern and the subject are the same three bytes: a, ^, b. */
    static UChar* pattern = (UChar* )"a^b";
    static UChar* str     = (UChar* )"a^b";

    /* Initialize the library with UTF-8 as the only encoding in use. */
    use_encs[0] = ONIG_ENCODING_UTF8;
    onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0]));

    /* Compile the pattern as UTF-8 using the grep syntax definition. */
    r = onig_new(&reg, pattern, pattern + strlen((char* )pattern),
                 ONIG_OPTION_DEFAULT, ONIG_ENCODING_UTF8, ONIG_SYNTAX_GREP, &einfo);
    if (r != ONIG_NORMAL) {
        /* Compilation failed: format the error code (with einfo) and report it. */
        char s[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str((UChar* )s, r, &einfo);
        fprintf(stderr, "ERROR: %s\n", s);
        return -1;
    }

    /* Region handed to onig_search; its beg/end offsets are printed below. */
    region = onig_region_new();

    /* Search the whole subject: start at its first byte, range up to its end. */
    end   = str + strlen((char* )str);
    start = str;
    range = end;
    r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE);
    if (r >= 0) {
        int i;

        /* Non-negative r is reported as the match position, then each region. */
        fprintf(stderr, "match at %d\n", r);
        for (i = 0; i < region->num_regs; i++) {
            fprintf(stderr, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]);
        }
    }
    else if (r == ONIG_MISMATCH) {
        /* The pattern did not match anywhere in the searched range. */
        fprintf(stderr, "search fail\n");
    }
    else { /* error */
        /* Any other negative r: report it, clean up, and exit with failure. */
        char s[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str((UChar* )s, r);
        fprintf(stderr, "ERROR: %s\n", s);
        onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
        onig_free(reg);
        onig_end();
        return -1;
    }

    /* Match or mismatch: free the region and the regex, then call onig_end(). */
    onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
    onig_free(reg);
    onig_end();
    return 0;

(Note the "a^b" in the regex.) This correctly outputs:

match at 0
0: (0-3)

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Caret not being treated as literal in rust-onig #188

Caret not being treated as literal in rust-onig #188

sargas commented Apr 2, 2024

Caret not being treated as literal in rust-onig #188

Caret not being treated as literal in rust-onig #188

Comments

sargas commented Apr 2, 2024