Merge pull request #24 from kcroker/bugfix/outline

Ignore unrecognized page titles in the outline
kcroker · Dec 9, 2023 · 3d4c8c3
2 parents 5ee6667 + ab6665c
commit 3d4c8c3
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 2 deletions.
diff --git a/dpsprep/outline.py b/dpsprep/outline.py
@@ -1,3 +1,4 @@
+from loguru import logger
 from pdfrw import PdfName, PdfDict, IndirectPdfDict
 import djvu.sexpr
 
@@ -10,7 +11,12 @@ class OutlineTransformVisitor(SExpressionVisitor):
     def visit_plain_list(self, node: djvu.sexpr.StringExpression, parent: IndirectPdfDict):
         title, page, *rest = node
         # I have experimentally determined that we need to translate page indices. -- Ianis, 2023-05-03
-        page_number = int(page.value[1:]) - 1
+        try:
+            page_number = int(page.value[1:]) - 1
+        except ValueError:
+            # As far as I understand, python-djvulibre doesn't support Djvu's page titles. -- Ianis, 2023-12-09
+            logger.warning(f'Could not determine page number from the page title {page.value}.')
+            return
 
         bookmark = IndirectPdfDict(
             Parent = parent,

diff --git a/dpsprep/test_outline.py b/dpsprep/test_outline.py
@@ -0,0 +1,67 @@
+from djvu import sexpr
+from pdfrw import IndirectPdfDict
+
+from .outline import OutlineTransformVisitor
+
+
+def test_basic_outline():
+    src = sexpr.ListExpression([
+        sexpr.SymbolExpression(sexpr.Symbol('bookmarks')),
+        sexpr.ListExpression([
+            sexpr.StringExpression(b'Chapter 2'),
+            sexpr.StringExpression(b'#100'),
+        ])
+    ])
+
+    visitor = OutlineTransformVisitor()
+    bookmarks = visitor.visit(src)
+    assert bookmarks.Count == 1
+    assert bookmarks.First.Title == 'Chapter 2'
+    assert bookmarks.First.A.D[0] == 99  # The page number
+
+
+def test_nested_outline():
+    src = sexpr.ListExpression([
+        sexpr.SymbolExpression(sexpr.Symbol('bookmarks')),
+        sexpr.ListExpression([
+            sexpr.StringExpression(b'Chapter 2'),
+            sexpr.StringExpression(b'#100'),
+            sexpr.ListExpression([
+                sexpr.StringExpression(b'Chapter 2.1'),
+                sexpr.StringExpression(b'#200'),
+            ])
+        ])
+    ])
+
+    visitor = OutlineTransformVisitor()
+    bookmarks = visitor.visit(src)
+    assert bookmarks.Count == 1
+    assert bookmarks.First.Count == 1
+    assert bookmarks.First.A.D[0] == 99  # The page number of chapter 2
+    assert bookmarks.First.First.A.D[0] == 199  # The page number of chapter 2.1
+
+
+# Sometimes the page numbers are instead page titles, which our libdjvu bindings do not support
+# We ignore them since there is not much we can do in this case
+# See https://github.com/kcroker/dpsprep/issues/23
+def test_outline_with_page_titles():
+    src = sexpr.ListExpression([
+        sexpr.SymbolExpression(sexpr.Symbol('bookmarks')),
+        sexpr.ListExpression([
+            sexpr.StringExpression(b'Preface'),
+            sexpr.StringExpression(b'#f007.djvu'),
+        ]),
+        sexpr.ListExpression([
+            sexpr.StringExpression(b'Contents'),
+            sexpr.StringExpression(b'#f011.djvu'),
+        ]),
+        sexpr.ListExpression([
+            sexpr.StringExpression(b'0 Prologue'),
+            sexpr.StringExpression(b'#p001.djvu')
+        ])
+    ])
+
+    visitor = OutlineTransformVisitor()
+    bookmarks = visitor.visit(src)
+    empty_pdf_dict = IndirectPdfDict()
+    assert bookmarks == empty_pdf_dict
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dpsprep"
-version = "2.2.2"
+version = "2.2.3"
 description = "A DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines and text layers"
 authors = ["Kevin Arthur Schiff Croker", "Ianis Vasilev"]
 license = "GPL-3.0-or-later"