Skip to content

Commit

Permalink
Merge pull request #2240 from quanteda/dev-pattern2fixed
Browse files Browse the repository at this point in the history
Dev pattern2fixed
  • Loading branch information
kbenoit committed Apr 7, 2023
2 parents bd9ed93 + 8d0a097 commit 9a221ea
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 99 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,7 @@ importFrom(stringi,stri_detect_charclass)
importFrom(stringi,stri_detect_fixed)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_enc_toutf8)
importFrom(stringi,stri_endswith_fixed)
importFrom(stringi,stri_extract_all_regex)
importFrom(stringi,stri_extract_first_regex)
importFrom(stringi,stri_extract_last_regex)
Expand Down
3 changes: 2 additions & 1 deletion R/object2fixed.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ object2id <- function(x, types, valuetype = c("glob", "fixed", "regex"),
temp <- stri_split_charclass(x$collocation, "\\p{Z}")
names(temp) <- x$collocation
if (case_insensitive) {
result <- pattern2id(temp, types, valuetype = "fixed", TRUE)
result <- pattern2id(temp, types, valuetype = "fixed",
case_insensitive = TRUE)
} else {
temp <- lapply(temp, function(x) fastmatch::fmatch(x, types))
result <- temp[unlist(lapply(temp, function(x) all(!is.na(x))), use.names = FALSE)]
Expand Down
155 changes: 79 additions & 76 deletions R/pattern2fixed.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#' @inheritParams pattern
#' @param types token types against which patterns are matched
#' @param keep_nomatch keep patterns that did not match
#' @param use_index construct index of types for quick search
#' @inheritParams valuetype
#' @return a list of integer vectors containing indices of matched types
#' @keywords development
Expand All @@ -21,9 +22,10 @@
#' pats_glob <- list(c("a*", "b*"), c("c"), c("d"))
#' pattern2id(pats_glob, types, "glob", case_insensitive = TRUE)
#'
#' @export
#' @importFrom stringi stri_endswith_fixed
pattern2id <- function(pattern, types, valuetype = c("glob", "fixed", "regex"),
case_insensitive = TRUE, keep_nomatch = FALSE) {
case_insensitive = TRUE, keep_nomatch = FALSE,
use_index = TRUE) {

types <- check_character(types, min_len = 0, max_len = Inf, strict = TRUE)
valuetype <- match.arg(valuetype)
Expand All @@ -36,20 +38,18 @@ pattern2id <- function(pattern, types, valuetype = c("glob", "fixed", "regex"),
pattern <- lapply(pattern, stri_trans_nfc)

# glob is treated as fixed if neither * or ? is found
if (valuetype == "glob" && !is_glob(pattern))
if (valuetype == "glob" && !any(is_glob(pattern)))
valuetype <- "fixed"

len <- stri_length(unlist(pattern, use.names = FALSE))
if (length(len)) {
max_len <- max(len)
# construct glob or fixed index for quick search
if (use_index) {
index <- index_types(pattern, types, valuetype, case_insensitive)
types_search <- attr(index, "types_search")
} else {
max_len <- 0
index <- NULL
types_search <- stri_trans_nfc(types)
}

# construct glob or fixed index for quick search
index <- index_types(types, valuetype, case_insensitive, max_len)
types_search <- attr(index, "types_search")


# lowercases for case-insensitive search
if (valuetype != "regex" && case_insensitive)
pattern <- lapply(pattern, stri_trans_tolower)
Expand All @@ -60,15 +60,15 @@ pattern2id <- function(pattern, types, valuetype = c("glob", "fixed", "regex"),
if (valuetype == "regex") {
temp[[i]] <- search_regex_multi(pattern[[i]], types_search, case_insensitive)
} else if (valuetype == "glob") {
temp[[i]] <- search_glob_multi(pattern[[i]], types_search, index)
temp[[i]] <- search_glob_multi(pattern[[i]], types_search, case_insensitive, index)
} else {
temp[[i]] <- search_fixed_multi(pattern[[i]], types_search, index)
}
} else {
if (valuetype == "regex") {
temp[[i]] <- as.list(search_regex(pattern[[i]], types_search, case_insensitive))
} else if (valuetype == "glob") {
temp[[i]] <- as.list(search_glob(pattern[[i]], types_search, index))
temp[[i]] <- as.list(search_glob(pattern[[i]], types_search, case_insensitive, index))
} else {
temp[[i]] <- as.list(search_fixed(pattern[[i]], types_search, index))
}
Expand All @@ -93,9 +93,11 @@ pattern2id <- function(pattern, types, valuetype = c("glob", "fixed", "regex"),
#' types <- c("A", "AA", "B", "BB", "BBB", "C", "CC")
#' pattern2fixed(pattern, types, "regex", case_insensitive = TRUE)
pattern2fixed <- function(pattern, types, valuetype = c("glob", "fixed", "regex"),
case_insensitive = TRUE, keep_nomatch = FALSE) {
case_insensitive = TRUE, keep_nomatch = FALSE,
use_index = TRUE) {

temp <- pattern2id(pattern, types, valuetype, case_insensitive, keep_nomatch)
temp <- pattern2id(pattern, types, valuetype, case_insensitive, keep_nomatch,
use_index = use_index)
result <- lapply(temp, function(x) types[x])
return(result)
}
Expand All @@ -111,22 +113,25 @@ pattern2fixed <- function(pattern, types, valuetype = c("glob", "fixed", "regex"
#' @inheritParams valuetype
#' @param index index object created by `index_types`
#' @keywords internal
search_glob <- function(pattern, types_search, index) {
search_glob <- function(pattern, types_search, case_insensitive, index = NULL) {
if (length(pattern) == 0) {
return(integer())
} else if (pattern == "") {
return(0L)
} else if (pattern == "*") {
return(seq_along(types_search)) # return all types when glob is *
} else if (is.null(index)) {
return(which(stri_detect_regex(types_search, utils::glob2rx(pattern),
case_insensitive = case_insensitive)))
} else {
pos <- search_index(pattern, index)
if (length((pos))) {
if (length(pos)) {
#cat("Index search", pattern, "\n")
return(pos)
} else if (!is_indexed(pattern)) {
#cat("Regex search", pattern, "\n")
return(which(stri_detect_regex(types_search, utils::glob2rx(pattern),
case_insensitive = FALSE)))
case_insensitive = case_insensitive)))
} else {
#cat("Not found\n")
return(integer())
Expand All @@ -137,8 +142,8 @@ search_glob <- function(pattern, types_search, index) {
#' @rdname search_glob
#' @param patterns a list of "glob", "fixed" or "regex" patterns
#' @keywords internal
search_glob_multi <- function(patterns, types_search, index) {
expand(lapply(patterns, search_glob, types_search, index))
search_glob_multi <- function(patterns, types_search, case_insensitive, index) {
expand(lapply(patterns, search_glob, types_search, case_insensitive, index))
}

#' @rdname search_glob
Expand All @@ -162,11 +167,13 @@ search_regex_multi <- function(patterns, types_search, case_insensitive) {

#' @rdname search_glob
#' @keywords internal
search_fixed <- function(pattern, types_search, index) {
search_fixed <- function(pattern, types_search, index = NULL) {
if (length(pattern) == 0) {
return(integer())
} else if (pattern == "") {
return(0L)
} else if (is.null(index)) {
return(which(types_search %in% pattern))
} else {
return(search_index(pattern, index))
}
Expand All @@ -186,91 +193,91 @@ search_fixed_multi <- function(patterns, types_search, index) {
#' "car?", "c*", "ca*", "car*" and "cars*" when `valuetype="glob"`.
#' @rdname pattern2id
#' @inheritParams valuetype
#' @param max_len maximum length of types to be indexed
#' @return `index_types` returns a list of integer vectors containing type
#' IDs with index keys as an attribute
#' @keywords internal
#' @export
#' @examples
#' index <- index_types(c("xxx", "yyyy", "ZZZ"), "glob", FALSE, 3)
#' quanteda:::search_glob("yy*", attr(index, "type_search"), index)
index_types <- function(types, valuetype, case_insensitive, max_len = NULL) {

if (is.null(types)) stop("types cannot be NULL")
if (is.null(valuetype)) stop("valuetype cannot be NULL")
if (is.null(case_insensitive)) stop("case_insensitive cannot be NULL")
#' index <- index_types("yy*", c("xxx", "yyyy", "ZZZ"), "glob", FALSE)
#' quanteda:::search_glob("yy*", attr(index, "types_search"), index)
index_types <- function(pattern, types, valuetype = c("glob", "fixed", "regex"),
case_insensitive = TRUE) {

pattern <- unlist_character(pattern, use.names = FALSE)
types <- check_character(types, min_len = 0, max_len = Inf, strict = TRUE)
valuetype <- match.arg(valuetype)

# normalize unicode
types <- stri_trans_nfc(types)
types <- search <- stri_trans_nfc(types)

if (!valuetype %in% c("glob", "fixed", "regex"))
stop('valuetype should be "glob", "fixed" or "regex"')
if (valuetype == "regex" || length(types) == 0) {
index <- list()
attr(index, "types_search") <- types
attr(index, "types_search") <- search
attr(index, "types") <- types
attr(index, "valuetype") <- valuetype
attr(index, "case_insensitive") <- case_insensitive
attr(index, "key") <- character()
return(index)
}

# lowercases for case-insensitive search
if (case_insensitive) {
types_search <- stri_trans_tolower(types)
} else {
types_search <- types
search <- stri_trans_tolower(search)
pattern <- stri_trans_tolower(pattern)
}

val_index <- seq_along(search) # index values
key_index <- search # index keys

# index for fixed patterns
pos_tmp <- seq_along(types_search)
key_tmp <- list(types_search)

pos <- val_index
key <- key_index
l <- key %in% pattern
lis_pos <- list(pos[l])
lis_key <- list(key[l])

# index for glob patterns
if (valuetype == "glob") {
len <- stri_length(types_search)
id <- seq_along(types_search)
# index all the types if max_len is unknown
if (is.null(max_len)) max_len <- max(len)
for (i in seq(1, max_len)) {
k <- id[len >= i]
# index for patterns with * at the end
pos_tmp <- c(pos_tmp, list(k))
key_tmp <- c(key_tmp,
list(stri_c(stri_sub(types_search[k], 1, i), "*")))
# # index for patterns with * at the top or end
#pos_tmp <- c(pos_tmp, list(rep(k, 2)))
#key_tmp <- c(key_tmp, list(stri_c(stri_sub(types_search[k], 1, i), "*")))
#key_tmp <- c(key_tmp, list(stri_c("*", stri_sub(types_search[k], i * -1, -1))))

len_type <- stri_length(key_index)
len_pat <- stri_length(stri_trim_right(pattern, "[^*?]"))
for (n in sort(unique(len_pat))) {
pos <- val_index[len_type >= n]
key <- stri_c(stri_sub(key_index[pos], 1, n), "*")
l <- key %in% pattern
lis_pos <- c(lis_pos, list(pos[l]))
lis_key <- c(lis_key, list(key[l]))
}

l <- id[len >= 2]

# index for patterns with ? at the end
pos_tmp <- c(pos_tmp, list(l))
key_tmp <- c(key_tmp, list(stri_c(stri_sub(types_search[l], 1, -2), "?")))
# # index for patterns with ? at the top or end
# pos_tmp <- c(pos_tmp, list(rep(l, 2)))
# key_tmp <- c(key_tmp, list(stri_c(stri_sub(types_search[l], 1, -2), "?")))
# key_tmp <- c(key_tmp, list(stri_c("?", stri_sub(types_search[l], 2, -1))))
pos <- val_index[len_type >= 2]
key <- stri_c(stri_sub(key_index[pos], 1, -2), "?")
l <- key %in% pattern
lis_pos <- c(lis_pos, list(pos[l]))
lis_key <- c(lis_key, list(key[l]))

}

# faster to join vectors in the end
key <- unlist(key_tmp, use.names = FALSE)
pos <- unlist(pos_tmp, use.names = FALSE)
pos <- unlist(lis_pos, use.names = FALSE)
key <- unlist(lis_key, use.names = FALSE)
# set factor for quick split
index <- split(pos, factor(key, ordered = FALSE, levels = unique(key)))
key <- names(index)

attr(index, "names") <- NULL # names attribute slows down
attr(index, "types_search") <- types_search
attr(index, "types_search") <- search
attr(index, "types") <- types
attr(index, "valuetype") <- valuetype
attr(index, "case_insensitive") <- case_insensitive
attr(index, "key") <- key

return(index)
}


#' Internal function for `select_types` to search the index using
#' fastmatch.
#' @param regex a glob expression to search
Expand Down Expand Up @@ -316,25 +323,21 @@ expand <- function(elem){
#' @param pattern a glob pattern to be tested
#' @keywords internal
is_indexed <- function(pattern) {
pattern <- stri_sub(pattern, 1, -2)
pat <- stri_sub(pattern, 1, -2)
if (pattern == "") {
return(FALSE)
} else {
# check index for patterns with ? or * at the end
return(!any(stri_detect_fixed(pattern, c("*", "?"))))

# # check index for patterns with ? or * at the top or end
#return(!any(stri_detect_fixed(stri_sub(pattern, 1, -2), c("*", "?"))) ||
# !any(stri_detect_fixed(stri_sub(pattern, 2, -1), c("*", "?"))))
# check if patterns have ? or * other than the end
return(!any(stri_detect_fixed(pat, c("*", "?"))))
}
}

#' Check if patterns contains glob wildcard
#' @param pattern a glob pattern to be tested
#' @keywords internal
is_glob <- function(pattern) {
pattern <- unlist(pattern, use.names = FALSE)
return(any(stri_detect_fixed(pattern, "*")) || any(stri_detect_fixed(pattern, "?")))
pat <- unlist_character(pattern, use.names = FALSE)
return(any(stri_detect_fixed(pat, "*")) || any(stri_detect_fixed(pat, "?")))
}

#' Unlist a list of integer vectors safely
Expand Down
19 changes: 13 additions & 6 deletions man/pattern2id.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 9a221ea

Please sign in to comment.