Skip to content

Commit 77cad27

Browse files
authored
✨ add an URL input source (#125)
1 parent cf9b794 commit 77cad27

File tree

9 files changed

+142
-60
lines changed

9 files changed

+142
-60
lines changed

docs/guide/python-getting-started.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,15 @@ async def upload(upload: UploadFile):
153153
)
154154
```
155155

156+
### URL
157+
Allows sending a URL directly to the API instead of uploading a local file.
158+
159+
**Note**: No local operations can be performed on the input (such as removing pages from a PDF).
160+
161+
```python
162+
input_doc = mindee_client.doc_from_url(url="https://www.example.com/invoice.pdf")
163+
```
164+
156165
## Sending a File
157166
To send a file to the API, we need to specify how to process the document.
158167
This will determine which API endpoint is used and how the API return will be handled internally by the library.

mindee/cli.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,19 @@ class CommandConfig(Generic[TypeDoc]):
7676
def _get_input_doc(client, args) -> DocumentClient:
7777
if args.input_type == "file":
7878
with open(args.path, "rb", buffering=30) as file_handle:
79-
return client.doc_from_file(file_handle)
79+
return client.doc_from_file(input_file=file_handle)
8080
elif args.input_type == "base64":
8181
with open(args.path, "rt", encoding="ascii") as base64_handle:
82-
return client.doc_from_b64string(base64_handle.read(), "test.jpg")
82+
return client.doc_from_b64string(
83+
input_string=base64_handle.read(), filename="test.jpg"
84+
)
8385
elif args.input_type == "bytes":
8486
with open(args.path, "rb") as bytes_handle:
85-
return client.doc_from_bytes(bytes_handle.read(), bytes_handle.name)
87+
return client.doc_from_bytes(
88+
input_bytes=bytes_handle.read(), filename=bytes_handle.name
89+
)
90+
elif args.input_type == "url":
91+
return client.doc_from_url(url=args.path)
8692
return client.doc_from_path(args.path)
8793

8894

@@ -181,13 +187,14 @@ def _parse_args() -> Namespace:
181187
"-i",
182188
"--input-type",
183189
dest="input_type",
184-
choices=["path", "file", "base64", "bytes"],
190+
choices=["path", "file", "base64", "bytes", "url"],
185191
default="path",
186192
help="Specify how to handle the input.\n"
187193
"- path: open a path (default).\n"
188194
"- file: open as a file handle.\n"
189-
"- base64: load the from a base64 encoded text file.\n"
190-
"- bytes: load the contents as raw bytes.",
195+
"- base64: open a base64 encoded text file.\n"
196+
"- bytes: open the contents as raw bytes.\n"
197+
"- url: open an URL.",
191198
)
192199
subp.add_argument(
193200
"-o",

mindee/client.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import json
2-
from typing import BinaryIO, Dict, List, NamedTuple, Optional, Type
2+
from typing import BinaryIO, Dict, List, NamedTuple, Optional, Type, Union
33

44
from mindee import documents
55
from mindee.documents.base import Document, TypeDocument
@@ -10,8 +10,9 @@
1010
Base64Input,
1111
BytesInput,
1212
FileInput,
13-
InputSource,
13+
LocalInputSource,
1414
PathInput,
15+
UrlInputSource,
1516
)
1617
from mindee.logger import logger
1718
from mindee.response import PredictResponse
@@ -23,13 +24,13 @@ def get_bound_classname(type_var) -> str:
2324

2425

2526
class DocumentClient:
26-
input_doc: InputSource
27+
input_doc: Union[LocalInputSource, UrlInputSource]
2728
doc_configs: DocumentConfigDict
2829
raise_on_error: bool = True
2930

3031
def __init__(
3132
self,
32-
input_doc: InputSource,
33+
input_doc: Union[LocalInputSource, UrlInputSource],
3334
doc_configs: DocumentConfigDict,
3435
raise_on_error: bool,
3536
):
@@ -108,12 +109,13 @@ def parse(
108109

109110
doc_config = self.doc_configs[config_key]
110111
doc_config.check_api_keys()
111-
if page_options and self.input_doc.is_pdf():
112-
self.input_doc.process_pdf(
113-
page_options.operation,
114-
page_options.on_min_pages,
115-
page_options.page_indexes,
116-
)
112+
if not isinstance(self.input_doc, UrlInputSource):
113+
if page_options and self.input_doc.is_pdf():
114+
self.input_doc.process_pdf(
115+
page_options.operation,
116+
page_options.on_min_pages,
117+
page_options.page_indexes,
118+
)
117119
return self._make_request(
118120
document_class, doc_config, include_words, close_file, cropper
119121
)
@@ -152,7 +154,8 @@ def _make_request(
152154

153155
def close(self) -> None:
154156
"""Close the file object."""
155-
self.input_doc.file_object.close()
157+
if not isinstance(self.input_doc, UrlInputSource):
158+
self.input_doc.file_object.close()
156159

157160

158161
class ConfigSpec(NamedTuple):
@@ -397,3 +400,21 @@ def doc_from_bytes(
397400
doc_configs=self._doc_configs,
398401
raise_on_error=self.raise_on_error,
399402
)
403+
404+
def doc_from_url(
405+
self,
406+
url: str,
407+
) -> DocumentClient:
408+
"""
409+
Load a document from an URL.
410+
411+
:param url: URL of the document to send
412+
"""
413+
input_doc = UrlInputSource(
414+
url,
415+
)
416+
return DocumentClient(
417+
input_doc=input_doc,
418+
doc_configs=self._doc_configs,
419+
raise_on_error=self.raise_on_error,
420+
)

mindee/documents/base.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import datetime
22
import re
3-
from typing import Any, Dict, List, Optional, TypeVar
3+
from typing import Any, Dict, List, Optional, TypeVar, Union
44

55
from mindee.endpoints import Endpoint
66
from mindee.fields.orientation import OrientationField
77
from mindee.fields.position import PositionField
8-
from mindee.input.sources import InputSource
8+
from mindee.input.sources import LocalInputSource, UrlInputSource
99

1010
TypeApiPrediction = Dict[str, Any]
1111

@@ -46,15 +46,18 @@ class Document:
4646

4747
def __init__(
4848
self,
49-
input_source: InputSource,
49+
input_source: Union[LocalInputSource, UrlInputSource],
5050
document_type: Optional[str],
5151
api_prediction: TypeApiPrediction,
5252
page_n: Optional[int] = None,
5353
):
5454
if input_source:
55-
self.filepath = input_source.filepath
56-
self.filename = input_source.filename
57-
self.file_extension = input_source.file_mimetype
55+
if isinstance(input_source, UrlInputSource):
56+
self.filename = input_source.url
57+
else:
58+
self.filepath = input_source.filepath
59+
self.filename = input_source.filename
60+
self.file_extension = input_source.file_mimetype
5861
self.checklist = {}
5962
self.type = document_type
6063

@@ -67,7 +70,7 @@ def __init__(
6770
@staticmethod
6871
def request(
6972
endpoints: List[Endpoint],
70-
input_source: InputSource,
73+
input_source: Union[LocalInputSource, UrlInputSource],
7174
include_words: bool = False,
7275
close_file: bool = True,
7376
cropper: bool = False,

mindee/documents/financial/financial_v1.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional, TypeVar
1+
from typing import List, Optional, TypeVar, Union
22

33
from mindee.documents.base import Document, TypeApiPrediction, clean_out_string
44
from mindee.documents.invoice.invoice_v3 import InvoiceV3
@@ -11,7 +11,7 @@
1111
from mindee.fields.payment_details import PaymentDetails
1212
from mindee.fields.tax import TaxField
1313
from mindee.fields.text import TextField
14-
from mindee.input.sources import InputSource
14+
from mindee.input.sources import LocalInputSource, UrlInputSource
1515

1616

1717
class FinancialV1(Document):
@@ -152,7 +152,7 @@ def __str__(self) -> str:
152152
@staticmethod
153153
def request(
154154
endpoints: List[Endpoint],
155-
input_source: InputSource,
155+
input_source: Union[LocalInputSource, UrlInputSource],
156156
include_words: bool = False,
157157
close_file: bool = True,
158158
cropper: bool = False,
@@ -166,6 +166,9 @@ def request(
166166
:param close_file: Whether to `close()` the file after parsing it.
167167
:param cropper: Including Mindee cropper results.
168168
"""
169+
if isinstance(input_source, UrlInputSource):
170+
raise AssertionError("URL input is not supported for this API endpoint.")
171+
169172
if "pdf" in input_source.file_mimetype:
170173
# invoices is index 0, receipts 1 (this should be cleaned up)
171174
index = 0

mindee/endpoints.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import requests
55

6-
from mindee.input.sources import InputSource
6+
from mindee.input.sources import LocalInputSource, UrlInputSource
77
from mindee.logger import logger
88
from mindee.versions import __version__, get_platform, python_version
99

@@ -97,7 +97,7 @@ def set_api_key_from_env(self) -> None:
9797

9898
def predict_req_post(
9999
self,
100-
input_source: InputSource,
100+
input_source: Union[LocalInputSource, UrlInputSource],
101101
include_words: bool = False,
102102
close_file: bool = True,
103103
cropper: bool = False,
@@ -111,7 +111,6 @@ def predict_req_post(
111111
:param cropper: Including Mindee cropping results.
112112
:return: requests response
113113
"""
114-
files = {"document": input_source.read_contents(close_file)}
115114
data = {}
116115
if include_words:
117116
data["include_mvision"] = "true"
@@ -120,20 +119,31 @@ def predict_req_post(
120119
if cropper:
121120
params["cropper"] = "true"
122121

123-
response = requests.post(
124-
f"{self._url_root}/predict",
125-
files=files,
126-
headers=self.base_headers,
127-
data=data,
128-
params=params,
129-
timeout=self._request_timeout,
130-
)
122+
if isinstance(input_source, UrlInputSource):
123+
data["document"] = input_source.url
124+
response = requests.post(
125+
f"{self._url_root}/predict",
126+
headers=self.base_headers,
127+
data=data,
128+
params=params,
129+
timeout=self._request_timeout,
130+
)
131+
else:
132+
files = {"document": input_source.read_contents(close_file)}
133+
response = requests.post(
134+
f"{self._url_root}/predict",
135+
files=files,
136+
headers=self.base_headers,
137+
data=data,
138+
params=params,
139+
timeout=self._request_timeout,
140+
)
131141
return response
132142

133143

134144
class CustomEndpoint(Endpoint):
135145
def training_req_post(
136-
self, input_source: InputSource, close_file: bool = True
146+
self, input_source: LocalInputSource, close_file: bool = True
137147
) -> requests.Response:
138148
"""
139149
Make a request to POST a document for training.
@@ -155,7 +165,7 @@ def training_req_post(
155165
return response
156166

157167
def training_async_req_post(
158-
self, input_source: InputSource, close_file: bool = True
168+
self, input_source: LocalInputSource, close_file: bool = True
159169
) -> requests.Response:
160170
"""
161171
Make a request to POST a document for training without processing.

0 commit comments

Comments
 (0)