Merge branch 'master' of https://github.com/quanteda/readtext

quanteda · May 7, 2019 · 5fd8050 · 5fd8050
2 parents 8fc5929 + 5abeab0
commit 5fd8050
Show file tree

Hide file tree

Showing 11 changed files with 505 additions and 12 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,6 +1,3 @@
-r_packages:
-    - covr
-    - pdftools
 language: r
 warnings_are_errors: true
 sudo: true

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: readtext
-Version: 0.73
+Version: 0.74
 Type: Package
 Title: Import and Handling for Plain and Formatted Text Files
 Authors@R: c( person("Kenneth", "Benoit", email = "kbenoit@lse.ac.uk", role =
@@ -11,7 +11,7 @@ Authors@R: c( person("Kenneth", "Benoit", email = "kbenoit@lse.ac.uk", role =
     person("Stefan", "Müller", email = "mullers@tcd.ie", role = "ctb"))
 Description: Functions for importing and handling text files and formatted text
     files with additional meta-data, including '.csv', '.tab', '.json', '.xml',
-    '.html', '.pdf', '.doc', '.docx', '.xls', '.xlsx', and others.
+    '.html', '.pdf', '.doc', '.docx', '.rtf', '.xls', '.xlsx', and others.
 License: GPL-3
 Depends:
     R (>= 3.1)
@@ -26,6 +26,7 @@ Imports:
     readxl,
     streamR,
     stringi,
+    striprtf,
     tibble,
     xml2,
     utils

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+readtext v0.74
+==============
+
+* Added support for RTF format (.rtf).
+
+
 readtext v0.7.2
 ==============
 

diff --git a/R/get-functions.R b/R/get-functions.R
@@ -21,7 +21,7 @@ get_csv <- function(path, text_field, encoding, source, ...) {
         close(con)
         result <- data.table::fread(input = txt, data.table = FALSE, stringsAsFactors = FALSE, ...)
     } else {
-        result <- data.table::fread(input = path, data.table = FALSE, stringsAsFactors = FALSE, ...)
+        result <- data.table::fread(input = path, data.table = FALSE, stringsAsFactors = FALSE, encoding = encoding, ...)
     }
     sort_fields(result, path, text_field)
 }
@@ -220,6 +220,13 @@ get_doc <- function(path, source, ...) {
     data.frame(text = txt, stringsAsFactors = FALSE)
 }
 
+# Read one RTF file into a single-row data.frame with a `text` column.
+# `source` and `...` are accepted but not used in this function body.
+get_rtf <- function(path, source, ...) {
+    rtf_file <- as.character(normalizePath(path))
+    # read_rtf() output is collapsed into one newline-separated string
+    rtf_text <- paste0(striprtf::read_rtf(rtf_file), collapse = "\n")
+    data.frame(text = trimws(rtf_text), stringsAsFactors = FALSE)
+}
 
 get_excel <- function(path, text_field, source, ...) {
 

diff --git a/R/readtext.R b/R/readtext.R
@@ -37,7 +37,8 @@
 #'   \item{\code{pdf}}{pdf formatted files, converted through \pkg{pdftools}.}  
 #'   \item{\code{odt}}{Open Document Text formatted files.}
 #'   \item{\code{doc, docx}}{Microsoft Word formatted files.}
-#'   
+#'   \item{\code{rtf}}{Rich Text Format (RTF) files.}
+#'      
 #'   \strong{Reading multiple files and file types:} 
 #'   
 #'   In addition, \code{file} can also not be a path 
@@ -269,6 +270,7 @@ get_source <- function(path, text_field, replace_specialchar = FALSE, verbosity
                odt = get_odt(path, ...),
                docx = get_docx(path, ...),
                doc = get_doc(path, ...),
+               rtf = get_rtf(path, ...),
                xls = get_excel(path, text_field, ...),
                xlsx = get_excel(path, text_field, ...),
                ods = get_ods(path, text_field, ...)

diff --git a/R/utils.R b/R/utils.R
@@ -1,6 +1,6 @@
 # Returns supported file extensions
 extensions <- function() {
-    c("csv", "txt", "json", "zip", "gz", "tar", "xml", "tab",
+    c("csv", "txt", "json", "zip", "gz", "tar", "xml", "tab", "rtf",
       "tsv", "html", "pdf", "odt", "docx", "doc", "xls", "xlsx", "ods")
 }
 

diff --git a/README.Rmd b/README.Rmd
@@ -102,6 +102,3 @@ summary(corpus_csv, 5)
 **readtext** returns a data.frame that is formatted as per the corpus structure of the [Text Interchange Format](https://github.com/ropensci/tif), it can easily be used by other packages that can accept a corpus in data.frame format.  
 
 If you only want a named `character` object, **readtext** also defines an `as.character()` method that inputs its data.frame and returns just the named character vector of texts, conforming to the TIF definition of the character version of a corpus.
-
-
-
diff --git a/man/readtext.Rd b/man/readtext.Rd
diff --git a/tests/data/rtf/test1.rtf b/tests/data/rtf/test1.rtf
diff --git a/tests/data/rtf/test2.rtf b/tests/data/rtf/test2.rtf
diff --git a/tests/testthat/test-readtext.R b/tests/testthat/test-readtext.R
@@ -897,3 +897,11 @@ test_that("readtext works with one-column csv files (#138)", {
                    stringsAsFactors = FALSE)
     )
 })
+
+# Checks the texts read from the .rtf fixtures under tests/data/rtf.
+test_that("tests for RTF files", {
+    expect_identical(
+        unname(texts(readtext("../data/rtf/*.rtf"))),
+        c("The quick brown fox jumps over the lazy dog",
+          "This is an example of “rich text” format.")
+    )
+})