Merge pull request #22 from kcroker/feature/invalid-utf8

Feature/invalid utf8
kcroker · Nov 29, 2023 · 5ee6667
2 parents c05661f + 9e2cb93
commit 5ee6667
Show file tree

Hide file tree

Showing 7 changed files with 39 additions and 6 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -1,13 +1,13 @@
 name: Run tests
 
-on: [push, pull_request]
+on: [push]
 
 jobs:
   test:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.8']
+        python-version: ['3.9']
         os: [ubuntu-22.04, macos-12]
 
     runs-on: ${{ matrix.os }}
@@ -17,7 +17,9 @@ jobs:
 
     - name: Install prerequisites on Ubuntu
       if: matrix.os == 'ubuntu-22.04'
-      run: sudo apt install --yes djvulibre-bin libtiff5
+      run: |
+        sudo apt update
+        sudo apt install --yes djvulibre-bin
 
     - name: Install prerequisites on macOS
       if: matrix.os == 'macos-12'

diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ If you are packaging this for some other package manager, consider using PEP-517
 
 A convenience script that can be copied or linked to any directory in `$PATH` can be found at [`./bin/dpsprep`](./bin/dpsprep).
 
-Previous versions of the tool itself used to depend on third-party binaries, but this is no longer the case. The test fixtures are checked in, however regenerating them (see [`./fixtures/makefile`](./fixtures/makefile)) requires `pdflatex` (texlive, among others), `gs` (Ghostscript), `pdftotext` (Poppler) and `djvudigital` (GSDjVU). Similarly, the man file is checked in, but building it from markdown depends on `ronn`.
+Previous versions of the tool itself used to depend on third-party binaries, but this is no longer the case. The test fixtures are checked in, however regenerating them (see [`./fixtures/makefile`](./fixtures/makefile)) requires `pdflatex` (texlive, among others), `gs` (Ghostscript), `pdftotext` (Poppler), `djvudigital` (GSDjVU) and `djvused` (DjVuLibre). Similarly, the man file is checked in, but building it from markdown depends on `ronn`.
 
 ## Note regarding compression
 

diff --git a/dpsprep/test_text.py b/dpsprep/test_text.py
@@ -1,6 +1,7 @@
 import string
 
 import djvu.decode
+import pytest
 
 from .text import TextExtractVisitor
 
@@ -39,3 +40,21 @@ def test_extract_djvu_page_text_lines():
         source_pdf_text = file.read()
 
     assert remove_whitespace(djvu_text) == remove_whitespace(source_pdf_text)
+
+
+def test_invalid_utf8():
+    document = djvu.decode.Context().new_document(
+        djvu.decode.FileURI('fixtures/lipsum_words_invalid.djvu')
+    )
+    document.decoding_job.wait()
+
+    djvu_page = document.pages[0]
+    djvu_page.get_info()
+    first_word_sexpr = djvu_page.text.sexpr[5][5]
+
+    # djvulibre cannot decode the first word
+    with pytest.raises(UnicodeDecodeError):
+        first_word_sexpr.value
+
+    first_word = TextExtractVisitor().visit(first_word_sexpr)
+    assert first_word == ''
diff --git a/dpsprep/text.py b/dpsprep/text.py
@@ -26,7 +26,13 @@
 
 class TextExtractVisitor(SExpressionVisitor):
     def visit_string(self, node: djvu.sexpr.StringExpression):
-        return ''.join(c for c in node.value if unicodedata.category(c) not in UNDRAWABLE_UNICODE_CATEGORIES)
+        try:
+            string = node.value  # This getter is not static - it does UTF-8 conversion and fails for some DjVu files
+        except ValueError as err:
+            logger.warning(f'Could not decode {repr(node)}: {err}')
+            return ''
+        else:
+            return ''.join(c for c in string if unicodedata.category(c) not in UNDRAWABLE_UNICODE_CATEGORIES)
 
     def visit_plain_list(self, node: djvu.sexpr.ListExpression):
         return ''

diff --git a/fixtures/lipsum_words_invalid.djvu b/fixtures/lipsum_words_invalid.djvu
diff --git a/fixtures/makefile b/fixtures/makefile
@@ -19,3 +19,9 @@ all: lipsum_01.txt lipsum_01.png lipsum_lines.djvu lipsum_words.djvu
 %_lines.djvu: %.pdf
 	djvudigital --lines $*.pdf
 	mv $*.djvu $*_lines.djvu
+
+%_invalid.djvu: %.djvu
+	cp $*.djvu $*_invalid.djvu
+	djvused $*_invalid.djvu -e 'output-all' | \
+		sed "s/Lorem/\\270/g" | \
+		djvused $*_invalid.djvu -f /dev/stdin -s
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dpsprep"
-version = "2.2.1"
+version = "2.2.2"
 description = "A DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines and text layers"
 authors = ["Kevin Arthur Schiff Croker", "Ianis Vasilev"]
 license = "GPL-3.0-or-later"