Skip to content

Commit 62d20c8

Browse files
authored
✨ Add TIFF and HEIC support (#88)
1 parent 56cd97c commit 62d20c8

File tree

5 files changed

+64
-19
lines changed

5 files changed

+64
-19
lines changed

mindee/inputs.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,24 @@
11
import base64
22
import io
3+
import mimetypes
34
import os
4-
from mimetypes import guess_type
55
from typing import BinaryIO, Optional, Tuple
66

77
import pikepdf
88

99
from mindee.logger import logger
1010

11+
mimetypes.add_type("image/heic", ".heic")
12+
mimetypes.add_type("image/heic", ".heif")
13+
1114
ALLOWED_MIME_TYPES = [
15+
"application/pdf",
16+
"image/heic",
1217
"image/png",
1318
"image/jpg",
1419
"image/jpeg",
20+
"image/tiff",
1521
"image/webp",
16-
"application/pdf",
1722
]
1823

1924
INPUT_TYPE_FILE = "file"
@@ -54,7 +59,7 @@ def __init__(
5459
logger.debug("Loaded new document '%s' from %s", self.filename, self.input_type)
5560

5661
def _check_mimetype(self) -> None:
57-
file_mimetype = guess_type(self.filename)[0]
62+
file_mimetype = mimetypes.guess_type(self.filename)[0]
5863
if file_mimetype:
5964
self.file_mimetype = file_mimetype
6065
else:

tests/data/receipt/receipt.heic

41.6 KB
Binary file not shown.

tests/data/receipt/receipt.tif

548 KB
Binary file not shown.

tests/data/receipt/receipt.tiff

548 KB
Binary file not shown.

tests/test_inputs.py

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,29 @@
33
import pytest
44

55
from mindee.inputs import Base64Document, BytesDocument, FileDocument, PathDocument
6+
from tests import INVOICE_DATA_DIR, RECEIPT_DATA_DIR
7+
8+
#
9+
# PDF
10+
#
611

712

813
def test_pdf_reconstruct_fail():
914
with pytest.raises(AssertionError):
1015
PathDocument(
11-
"./tests/data/invoice/invoice_10p.pdf",
16+
f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
1217
cut_pdf=True,
1318
n_pdf_pages=4,
1419
)
1520

1621

1722
def test_pdf_reconstruct_ok():
18-
input_file = PathDocument("./tests/data/invoice/invoice_10p.pdf")
23+
input_file = PathDocument(f"{INVOICE_DATA_DIR}/invoice_10p.pdf")
1924
assert isinstance(input_file.file_object, io.BytesIO)
2025

2126

22-
def test_read_contents():
23-
input_doc = PathDocument("./tests/data/invoice/invoice.pdf")
27+
def test_pdf_read_contents():
28+
input_doc = PathDocument(f"{INVOICE_DATA_DIR}/invoice.pdf")
2429
contents = input_doc.read_contents(close_file=False)
2530
assert contents[0] == "invoice.pdf"
2631
assert isinstance(contents[1], bytes)
@@ -31,27 +36,28 @@ def test_read_contents():
3136

3237

3338
def test_pdf_reconstruct_no_cut():
34-
input_file = PathDocument("./tests/data/invoice/invoice_10p.pdf", cut_pdf=False)
39+
input_file = PathDocument(f"{INVOICE_DATA_DIR}/invoice_10p.pdf", cut_pdf=False)
3540
assert input_file.count_pdf_pages() == 10
3641
assert isinstance(input_file.file_object, io.BufferedReader)
3742

3843

3944
def test_pdf_reconstruct_check_n_pages():
4045
input_obj_3 = PathDocument(
41-
"./tests/data/invoice/invoice_10p.pdf",
46+
f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
4247
cut_pdf=True,
4348
n_pdf_pages=3,
4449
)
4550
input_obj_2 = PathDocument(
46-
"./tests/data/invoice/invoice_10p.pdf",
51+
f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
4752
cut_pdf=True,
4853
n_pdf_pages=2,
4954
)
5055
input_obj_1 = PathDocument(
51-
"./tests/data/invoice/invoice_10p.pdf",
56+
f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
5257
cut_pdf=True,
5358
n_pdf_pages=1,
5459
)
60+
assert input_obj_1.file_mimetype == "application/pdf"
5561

5662
# re-initialize file pointer
5763
input_obj_3.file_object.seek(0)
@@ -63,40 +69,44 @@ def test_pdf_reconstruct_check_n_pages():
6369
assert input_obj_1.count_pdf_pages() == 1
6470

6571

66-
def test_input_from_path():
72+
def test_pdf_input_from_path():
6773
input_obj_1 = PathDocument(
68-
"./tests/data/invoice/invoice_10p.pdf",
74+
f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
6975
cut_pdf=True,
7076
n_pdf_pages=1,
7177
)
78+
assert input_obj_1.file_mimetype == "application/pdf"
7279
assert input_obj_1.count_pdf_pages() == 1
7380

7481

75-
def test_input_from_file():
76-
with open("./tests/data/invoice/invoice_10p.pdf", "rb") as fp:
82+
def test_pdf_input_from_file():
83+
with open(f"{INVOICE_DATA_DIR}/invoice_10p.pdf", "rb") as fp:
7784
input_obj_1 = FileDocument(fp, cut_pdf=True, n_pdf_pages=1)
85+
assert input_obj_1.file_mimetype == "application/pdf"
7886
assert input_obj_1.count_pdf_pages() == 1
7987

8088

81-
def test_input_from_base64():
82-
with open("./tests/data/invoice/invoice_10p.txt", "rt") as fp:
89+
def test_pdf_input_from_base64():
90+
with open(f"{INVOICE_DATA_DIR}/invoice_10p.txt", "rt") as fp:
8391
input_obj_1 = Base64Document(
8492
fp.read(),
8593
filename="invoice_10p.pdf",
8694
cut_pdf=True,
8795
n_pdf_pages=1,
8896
)
97+
assert input_obj_1.file_mimetype == "application/pdf"
8998
assert input_obj_1.count_pdf_pages() == 1
9099

91100

92-
def test_input_from_bytes():
93-
with open("./tests/data/invoice/invoice_10p.pdf", "rb") as fp:
101+
def test_pdf_input_from_bytes():
102+
with open(f"{INVOICE_DATA_DIR}/invoice_10p.pdf", "rb") as fp:
94103
input_obj_1 = BytesDocument(
95104
fp.read(),
96105
filename="invoice_10p.pdf",
97106
cut_pdf=True,
98107
n_pdf_pages=1,
99108
)
109+
assert input_obj_1.file_mimetype == "application/pdf"
100110
assert input_obj_1.count_pdf_pages() == 1
101111

102112

@@ -109,3 +119,33 @@ def test_pdf_blank_check():
109119

110120
input_not_blank = PathDocument("./tests/data/pdfs/not_blank_image_only.pdf")
111121
assert input_not_blank.count_pdf_pages() == 1
122+
123+
124+
#
125+
# Images
126+
#
127+
128+
129+
def test_tif_input_from_path():
130+
input_obj_1 = PathDocument(
131+
f"{RECEIPT_DATA_DIR}/receipt.tif",
132+
cut_pdf=True,
133+
n_pdf_pages=1,
134+
)
135+
assert input_obj_1.file_mimetype == "image/tiff"
136+
137+
input_obj_2 = PathDocument(
138+
f"{RECEIPT_DATA_DIR}/receipt.tiff",
139+
cut_pdf=True,
140+
n_pdf_pages=1,
141+
)
142+
assert input_obj_2.file_mimetype == "image/tiff"
143+
144+
145+
def test_heic_input_from_path():
146+
input_obj_1 = PathDocument(
147+
f"{RECEIPT_DATA_DIR}/receipt.heic",
148+
cut_pdf=True,
149+
n_pdf_pages=1,
150+
)
151+
assert input_obj_1.file_mimetype == "image/heic"

0 commit comments

Comments
 (0)