Skip to content

Commit

Permalink
Ignore invalid utf-8 sequences instead of crashing
Browse files Browse the repository at this point in the history
  • Loading branch information
v-- committed Nov 29, 2023
1 parent c05661f commit a4fa54e
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 3 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ If you are packaging this for some other package manager, consider using PEP-517

A convenience script that can be copied or linked to any directory in `$PATH` can be found at [`./bin/dpsprep`](./bin/dpsprep).

Previous versions of the tool itself used to depend on third-party binaries, but this is no longer the case. The test fixtures are checked in, however regenerating them (see [`./fixtures/makefile`](./fixtures/makefile)) requires `pdflatex` (texlive, among others), `gs` (Ghostscript), `pdftotext` (Poppler) and `djvudigital` (GSDjVU). Similarly, the man file is checked in, but building it from markdown depends on `ronn`.
Previous versions of the tool itself used to depend on third-party binaries, but this is no longer the case. The test fixtures are checked in; however, regenerating them (see [`./fixtures/makefile`](./fixtures/makefile)) requires `pdflatex` (texlive, among others), `gs` (Ghostscript), `pdftotext` (Poppler), `djvudigital` (GSDjVU) and `djvused` (DjVuLibre). Similarly, the man page is checked in, but building it from markdown depends on `ronn`.

## Note regarding compression

Expand Down
19 changes: 19 additions & 0 deletions dpsprep/test_text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import string

import djvu.decode
import pytest

from .text import TextExtractVisitor

Expand Down Expand Up @@ -39,3 +40,21 @@ def test_extract_djvu_page_text_lines():
source_pdf_text = file.read()

assert remove_whitespace(djvu_text) == remove_whitespace(source_pdf_text)


def test_invalid_utf8():
    """An undecodable word is extracted as an empty string instead of raising."""
    context = djvu.decode.Context()
    doc = context.new_document(
        djvu.decode.FileURI('fixtures/lipsum_words_invalid.djvu')
    )
    doc.decoding_job.wait()

    page = doc.pages[0]
    page.get_info()
    broken_word = page.text.sexpr[5][5]

    # Sanity check: reading the value directly through djvulibre must fail,
    # otherwise the fixture does not exercise the error path at all.
    with pytest.raises(UnicodeDecodeError):
        broken_word.value

    extracted = TextExtractVisitor().visit(broken_word)
    assert extracted == ''
8 changes: 7 additions & 1 deletion dpsprep/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,13 @@

class TextExtractVisitor(SExpressionVisitor):
    """Visitor that maps DjVu S-expression nodes to plain strings."""

    def visit_string(self, node: djvu.sexpr.StringExpression) -> str:
        """Return the node's text without undrawable characters.

        Characters whose Unicode category is listed in
        ``UNDRAWABLE_UNICODE_CATEGORIES`` are dropped. If the node's value
        cannot be decoded, a warning is logged and an empty string is
        returned so that one broken string does not abort the extraction.
        """
        try:
            # The ``value`` getter is not static - it does UTF-8 conversion
            # and fails for some DjVu files. Read it exactly once, inside
            # the guarded region, and reuse the decoded result below.
            decoded = node.value
        except ValueError as err:
            # UnicodeDecodeError is a subclass of ValueError.
            logger.warning(f'Could not decode {repr(node)}: {err}')
            return ''

        return ''.join(
            char
            for char in decoded
            if unicodedata.category(char) not in UNDRAWABLE_UNICODE_CATEGORIES
        )

    def visit_plain_list(self, node: djvu.sexpr.ListExpression) -> str:
        """Return an empty string for plain list nodes."""
        return ''
Expand Down
Binary file added fixtures/lipsum_words_invalid.djvu
Binary file not shown.
6 changes: 6 additions & 0 deletions fixtures/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,9 @@ all: lipsum_01.txt lipsum_01.png lipsum_lines.djvu lipsum_words.djvu
%_lines.djvu: %.pdf
djvudigital --lines $*.pdf
mv $*.djvu $*_lines.djvu

# Build <name>_invalid.djvu from <name>.djvu as a fixture with a broken text
# layer:
#   1. copy the source document;
#   2. dump the copy as a djvused script ('output-all');
#   3. rewrite every "Lorem" in that script with sed, substituting the octal
#      escape 270 (presumably byte 0xB8, which is not valid UTF-8 on its own
#      - confirm against the djvused string syntax);
#   4. feed the edited script back through djvused (-f reads the script from
#      stdin) and save the result into the copy (-s).
%_invalid.djvu: %.djvu
cp $*.djvu $*_invalid.djvu
djvused $*_invalid.djvu -e 'output-all' | \
sed "s/Lorem/\\270/g" | \
djvused $*_invalid.djvu -f /dev/stdin -s
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dpsprep"
version = "2.2.1"
version = "2.2.2"
description = "A DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines and text layers"
authors = ["Kevin Arthur Schiff Croker", "Ianis Vasilev"]
license = "GPL-3.0-or-later"
Expand Down

0 comments on commit a4fa54e

Please sign in to comment.