From 53905494bd1b05e9c7233f491556c73c19d68069 Mon Sep 17 00:00:00 2001 From: benleetownsend Date: Tue, 17 Feb 2026 16:11:56 +0000 Subject: [PATCH 1/6] ADD: ocr options inputs for all ocr engines --- indico/queries/datasets.py | 39 ++++++++++++++-- indico/types/dataset.py | 94 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 3 deletions(-) diff --git a/indico/queries/datasets.py b/indico/queries/datasets.py index 9bd9d565..763bc0aa 100644 --- a/indico/queries/datasets.py +++ b/indico/queries/datasets.py @@ -27,6 +27,9 @@ EmailOptions, OmnipageOcrOptionsInput, ReadApiOcrOptionsInput, + ReadApiTablesV1OcrOptionsInput, + ReadApiTablesV2OcrOptionsInput, + ReadApiV2OcrOptionsInput, ) from indico.typing import AnyDict, Payload @@ -206,9 +209,12 @@ class CreateDataset(RequestChain["Dataset"]): from_local_images (bool, optional): Flag whether files are local images or not. Defaults to False. image_filename_col (str, optional): Image filename column. Defaults to 'filename'. batch_size (int, optional): Size of file batch to upload at a time. Defaults to 20. - ocr_engine (OcrEngine, optional): Specify an OCR engine [OMNIPAGE, READAPI, READAPI_V2, READAPI_TABLES_V1]. Defaults to None. + ocr_engine (OcrEngine, optional): Specify an OCR engine [OMNIPAGE, READAPI, READAPI_V2, READAPI_TABLES_V1, READAPI_TABLES_V2]. Defaults to None. omnipage_ocr_options (OmnipageOcrOptionsInput, optional): If using Omnipage, specify Omnipage OCR options. Defaults to None. read_api_ocr_options: (ReadApiOcrOptionsInput, optional): If using ReadAPI, specify ReadAPI OCR options. Defaults to None. + read_api_v2_ocr_options: (ReadApiV2OcrOptionsInput, optional): If using ReadAPI v2, specify ReadAPI v2 OCR options. Defaults to None. + read_api_tables_v1_ocr_options: (ReadApiTablesV1OcrOptionsInput, optional): If using ReadAPI tables v1, specify ReadAPI tables v1 OCR options. Defaults to None. + read_api_tables_v2_ocr_options: (ReadApiTablesV2OcrOptionsInput, optional): If using ReadAPI tables v2, specify ReadAPI tables v2 OCR options. Defaults to None. request_interval (int or float, optional): The maximum time in between retry calls when waiting. Defaults to 5 seconds. Returns: @@ -228,6 +234,9 @@ def __init__( ocr_engine: "Optional[OcrEngine]" = None, omnipage_ocr_options: "Optional[OmnipageOcrOptionsInput]" = None, read_api_ocr_options: "Optional[ReadApiOcrOptionsInput]" = None, + read_api_v2_ocr_options: "Optional[ReadApiV2OcrOptionsInput]" = None, + read_api_tables_v1_ocr_options: "Optional[ReadApiTablesV1OcrOptionsInput]" = None, + read_api_tables_v2_ocr_options: "Optional[ReadApiTablesV2OcrOptionsInput]" = None, request_interval: "Union[int, float]" = 5, email_options: "Optional[EmailOptions]" = None, ): @@ -241,11 +250,26 @@ def __init__( self.ocr_engine = ocr_engine self.omnipage_ocr_options = omnipage_ocr_options self.read_api_ocr_options = read_api_ocr_options + self.read_api_v2_ocr_options = read_api_v2_ocr_options + self.read_api_tables_v1_ocr_options = read_api_tables_v1_ocr_options + self.read_api_tables_v2_ocr_options = read_api_tables_v2_ocr_options self.request_interval = request_interval self.email_options = email_options - if omnipage_ocr_options is not None and read_api_ocr_options is not None: + if ( + sum( + opt is not None + for opt in [ + omnipage_ocr_options, + read_api_ocr_options, + read_api_v2_ocr_options, + read_api_tables_v1_ocr_options, + read_api_tables_v2_ocr_options, + ] + ) + > 1 + ): raise IndicoInputError( - "Must supply either omnipage or readapi options but not both." + "Must supply only one of omnipage, readapi, readapi v2, readapi tables v1, or readapi tables v2 options." ) super().__init__() @@ -300,6 +324,9 @@ def requests( name=self.name, dataset_type=self.dataset_type, readapi_ocr_options=self.read_api_ocr_options, + readapi_v2_ocr_options=self.read_api_v2_ocr_options, + readapi_tables_v1_ocr_options=self.read_api_tables_v1_ocr_options, + readapi_tables_v2_ocr_options=self.read_api_tables_v2_ocr_options, omnipage_ocr_options=self.omnipage_ocr_options, ocr_engine=self.ocr_engine, email_options=self.email_options, @@ -401,6 +428,9 @@ def __init__( ocr_engine: "Optional[OcrEngine]" = None, omnipage_ocr_options: "Optional[OmnipageOcrOptionsInput]" = None, readapi_ocr_options: "Optional[ReadApiOcrOptionsInput]" = None, + readapi_v2_ocr_options: "Optional[ReadApiV2OcrOptionsInput]" = None, + readapi_tables_v1_ocr_options: "Optional[ReadApiTablesV1OcrOptionsInput]" = None, + readapi_tables_v2_ocr_options: "Optional[ReadApiTablesV2OcrOptionsInput]" = None, email_options: "Optional[EmailOptions]" = None, ): if not dataset_type: @@ -412,6 +442,9 @@ def __init__( "ocrEngine": ocr_engine.name, "omnipageOptions": omnipage_ocr_options, "readapiOptions": readapi_ocr_options, + "readapiV2Options": readapi_v2_ocr_options, + "readapiTablesV1Options": readapi_tables_v1_ocr_options, + "readapiTablesV2Options": readapi_tables_v2_ocr_options, }, "emailOptions": email_options, } diff --git a/indico/types/dataset.py b/indico/types/dataset.py index da155fec..97d3d415 100644 --- a/indico/types/dataset.py +++ b/indico/types/dataset.py @@ -75,6 +75,11 @@ class TableReadOrder(Enum): COLUMN = 1 +class ExcelTablesType(Enum): + RENDERED = 0 + NATIVE = 1 + + class OcrEngine(Enum): """ Enum representing available OCR engines. @@ -84,6 +89,7 @@ class OcrEngine(Enum): READAPI = 1 READAPI_V2 = 2 READAPI_TABLES_V1 = 3 + READAPI_TABLES_V2 = 4 class OmnipageOcrOptionsInput(BaseType): @@ -106,12 +112,14 @@ class OmnipageOcrOptionsInput(BaseType): auto_rotate: bool single_column: bool upscale_images: bool + spreadsheet_converter_version: int languages: List[str] cells: bool force_render: bool native_layout: bool native_pdf: bool table_read_order: TableReadOrder + split_version: int class ReadApiOcrOptionsInput(BaseType): @@ -122,13 +130,96 @@ class ReadApiOcrOptionsInput(BaseType): auto_rotate(bool): Auto rotate single_column(bool): Read table as a single column. upscale_images(bool): Scale up low resolution images. + spreadsheet_converter_version(int): Spreadsheet converter version. languages(List[str]): List of languages to use. + excel_tables(bool): Enable excel tables processing. + excel_tables_type(ExcelTablesType): Excel tables processing type (NATIVE or RENDERED). """ auto_rotate: bool single_column: bool upscale_images: bool + spreadsheet_converter_version: int languages: List[str] + excel_tables: bool + excel_tables_type: ExcelTablesType + + +class ReadApiV2OcrOptionsInput(BaseType): + """ + Read API v2 OCR options. + + Args: + auto_rotate(bool): Auto rotate + single_column(bool): Read table as a single column. + upscale_images(bool): Scale up low resolution images. + spreadsheet_converter_version(int): Spreadsheet converter version. + languages(List[str]): List of languages to use. + excel_tables(bool): Enable excel tables processing. + excel_tables_type(ExcelTablesType): Excel tables processing type (NATIVE or RENDERED). + """ + + auto_rotate: bool + single_column: bool + upscale_images: bool + spreadsheet_converter_version: int + languages: List[str] + excel_tables: bool + excel_tables_type: ExcelTablesType + + +class ReadApiTablesV1OcrOptionsInput(BaseType): + """ + Read API tables v1 OCR options. + + Args: + auto_rotate(bool): Auto rotate + single_column(bool): Read table as a single column. + upscale_images(bool): Scale up low resolution images. + spreadsheet_converter_version(int): Spreadsheet converter version. + languages(List[str]): List of languages to use. + excel_tables(bool): Enable excel tables processing. + excel_tables_type(ExcelTablesType): Excel tables processing type (NATIVE or RENDERED). + table_read_order(TableReadOrder): Read table by row or column. + """ + + auto_rotate: bool + single_column: bool + upscale_images: bool + spreadsheet_converter_version: int + languages: List[str] + excel_tables: bool + excel_tables_type: ExcelTablesType + table_read_order: TableReadOrder + + +class ReadApiTablesV2OcrOptionsInput(BaseType): + """ + Read API tables v2 OCR options. + + Args: + auto_rotate(bool): Auto rotate + upscale_images(bool): Scale up low resolution images. + spreadsheet_converter_version(int): Spreadsheet converter version. + languages(List[str]): List of languages to use. + excel_tables(bool): Enable excel tables processing. + excel_tables_type(ExcelTablesType): Excel tables processing type (NATIVE or RENDERED). + table_read_order(TableReadOrder): Read table by row or column. + include_markdown(bool): Include formatted text in the output. + include_barcodes(bool): Recognize and extract barcodes. + include_key_value_pairs(bool): Recognize and extract key-value pairs. + """ + + auto_rotate: bool + upscale_images: bool + spreadsheet_converter_version: int + languages: List[str] + excel_tables: bool + excel_tables_type: ExcelTablesType + table_read_order: TableReadOrder + include_markdown: bool + include_barcodes: bool + include_key_value_pairs: bool class OcrInputLanguage(BaseType): @@ -165,3 +256,6 @@ class OcrOptionsInput: ocr_engine: OcrEngine omnipage_options: OmnipageOcrOptionsInput readapi_options: ReadApiOcrOptionsInput + readapi_v2_options: ReadApiV2OcrOptionsInput + readapi_tables_v1_options: ReadApiTablesV1OcrOptionsInput + readapi_tables_v2_options: ReadApiTablesV2OcrOptionsInput From eed25cea786ab4357e0d199e103e58e26b412ec2 Mon Sep 17 00:00:00 2001 From: benleetownsend Date: Thu, 19 Feb 2026 16:31:30 +0000 Subject: [PATCH 2/6] fix: limit is not optional --- indico/queries/submission.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/indico/queries/submission.py b/indico/queries/submission.py index f89d9d3a..7e0197e2 100644 --- a/indico/queries/submission.py +++ b/indico/queries/submission.py @@ -295,7 +295,9 @@ def __init__(self, submission_ids: "List[int]", timeout: "Union[int, float]" = 6 self.timeout = timeout self.status_check = partial(ne, "PROCESSING") self.status_getter = partial( - ListSubmissions, submission_ids=self.submission_ids, limit=None + ListSubmissions, + submission_ids=self.submission_ids, + limit=len(self.submission_ids), ) def requests(self) -> "Iterator[ListSubmissions]": From 011c1d184d4153738304012ba48ebf0fad603f33 Mon Sep 17 00:00:00 2001 From: benleetownsend Date: Thu, 19 Feb 2026 17:29:41 +0000 Subject: [PATCH 3/6] fix: fixed filters typing --- indico/queries/gallery.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/indico/queries/gallery.py b/indico/queries/gallery.py index a9484ad1..31b87f8e 100644 --- a/indico/queries/gallery.py +++ b/indico/queries/gallery.py @@ -4,7 +4,7 @@ from indico.types.component_blueprint import BlueprintPage, BlueprintTags if TYPE_CHECKING: # pragma: no cover - from typing import Any, Optional + from typing import Any, Dict, Optional, Union from indico.typing import Payload @@ -53,7 +53,7 @@ class ListGallery(PagedRequestV2[BlueprintPage]): def __init__( self, - filters: "Optional[str]" = None, + filters: "Optional[Union[Dict[str, Any], str]]" = None, limit: int = 100, order_by: str = "name", desc: bool = False, From 00605eb874966a53c58cf8cc009f8d937a96f8bf Mon Sep 17 00:00:00 2001 From: benleetownsend Date: Thu, 19 Feb 2026 18:07:36 +0000 Subject: [PATCH 4/6] fix: update model training options --- indico/queries/model_groups/model_groups.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/indico/queries/model_groups/model_groups.py b/indico/queries/model_groups/model_groups.py index 348b9bf2..b3a18edc 100644 --- a/indico/queries/model_groups/model_groups.py +++ b/indico/queries/model_groups/model_groups.py @@ -311,9 +311,9 @@ def __init__( model_training_options_json: "Optional[str]" = None if model_training_options: if isinstance(model_training_options, dict): - model_training_options = json.dumps(model_training_options) + model_training_options_json = json.dumps(model_training_options) else: - model_training_options = model_training_options + model_training_options_json = model_training_options predict_options_json: "Optional[str]" = None if predict_options: From 9ab443c6c47df269a7ff500d3a7b7ad2e3e1ebe4 Mon Sep 17 00:00:00 2001 From: benleetownsend Date: Fri, 20 Feb 2026 17:37:28 +0000 Subject: [PATCH 5/6] fix: factor out common OCR options --- indico/types/dataset.py | 77 ++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/indico/types/dataset.py b/indico/types/dataset.py index 97d3d415..fa2e5a43 100644 --- a/indico/types/dataset.py +++ b/indico/types/dataset.py @@ -92,15 +92,33 @@ class OcrEngine(Enum): READAPI_TABLES_V2 = 4 -class OmnipageOcrOptionsInput(BaseType): +class BaseOCROptions(BaseType): + """ + Base OCR options shared across engines. + + Args: + auto_rotate(bool): Auto rotate. + upscale_images(bool): Scale up low resolution images. + spreadsheet_converter_version(int): Spreadsheet converter version. + languages(List[str]): List of languages to use. + """ + + auto_rotate: bool + upscale_images: bool + spreadsheet_converter_version: int + languages: List[str] + + +class OmnipageOcrOptionsInput(BaseOCROptions): """ Omnipage specific OCR options for dataset creation. Args: - auto_rotate(bool): auto rotate. - single_colum(bool): Read table as a single column. - upscale_images(bool): Scale up low-resolution images. - languages(List[OmnipageLanguageCode]): List of languages to use in ocr. + auto_rotate(bool): Auto rotate. + upscale_images(bool): Scale up low resolution images. + spreadsheet_converter_version(int): Spreadsheet converter version. + languages(List[str]): List of languages to use. + single_column(bool): Read table as a single column. cells(bool): Return table information for post-processing rules force_render(bool): Force rednering. native_layout(bool): Native layout. @@ -109,11 +127,7 @@ class OmnipageOcrOptionsInput(BaseType): """ - auto_rotate: bool single_column: bool - upscale_images: bool - spreadsheet_converter_version: int - languages: List[str] cells: bool force_render: bool native_layout: bool @@ -122,83 +136,64 @@ class OmnipageOcrOptionsInput(BaseType): split_version: int -class ReadApiOcrOptionsInput(BaseType): +class ReadApiOcrOptionsInput(BaseOCROptions): """ Read API OCR options. Args: - auto_rotate(bool): Auto rotate - single_column(bool): Read table as a single column. + auto_rotate(bool): Auto rotate. upscale_images(bool): Scale up low resolution images. spreadsheet_converter_version(int): Spreadsheet converter version. languages(List[str]): List of languages to use. excel_tables(bool): Enable excel tables processing. excel_tables_type(ExcelTablesType): Excel tables processing type (NATIVE or RENDERED). + single_column(bool): Read table as a single column. """ - auto_rotate: bool single_column: bool - upscale_images: bool - spreadsheet_converter_version: int - languages: List[str] excel_tables: bool excel_tables_type: ExcelTablesType -class ReadApiV2OcrOptionsInput(BaseType): +class ReadApiV2OcrOptionsInput(ReadApiOcrOptionsInput): """ Read API v2 OCR options. Args: - auto_rotate(bool): Auto rotate - single_column(bool): Read table as a single column. + auto_rotate(bool): Auto rotate. upscale_images(bool): Scale up low resolution images. spreadsheet_converter_version(int): Spreadsheet converter version. languages(List[str]): List of languages to use. excel_tables(bool): Enable excel tables processing. excel_tables_type(ExcelTablesType): Excel tables processing type (NATIVE or RENDERED). + single_column(bool): Read table as a single column. """ - auto_rotate: bool - single_column: bool - upscale_images: bool - spreadsheet_converter_version: int - languages: List[str] - excel_tables: bool - excel_tables_type: ExcelTablesType - -class ReadApiTablesV1OcrOptionsInput(BaseType): +class ReadApiTablesV1OcrOptionsInput(ReadApiOcrOptionsInput): """ Read API tables v1 OCR options. Args: - auto_rotate(bool): Auto rotate - single_column(bool): Read table as a single column. + auto_rotate(bool): Auto rotate. upscale_images(bool): Scale up low resolution images. spreadsheet_converter_version(int): Spreadsheet converter version. languages(List[str]): List of languages to use. excel_tables(bool): Enable excel tables processing. excel_tables_type(ExcelTablesType): Excel tables processing type (NATIVE or RENDERED). + single_column(bool): Read table as a single column. table_read_order(TableReadOrder): Read table by row or column. """ - auto_rotate: bool - single_column: bool - upscale_images: bool - spreadsheet_converter_version: int - languages: List[str] - excel_tables: bool - excel_tables_type: ExcelTablesType table_read_order: TableReadOrder -class ReadApiTablesV2OcrOptionsInput(BaseType): +class ReadApiTablesV2OcrOptionsInput(BaseOCROptions): """ Read API tables v2 OCR options. Args: - auto_rotate(bool): Auto rotate + auto_rotate(bool): Auto rotate. upscale_images(bool): Scale up low resolution images. spreadsheet_converter_version(int): Spreadsheet converter version. languages(List[str]): List of languages to use. @@ -210,10 +205,6 @@ class ReadApiTablesV2OcrOptionsInput(BaseType): include_key_value_pairs(bool): Recognize and extract key-value pairs. """ - auto_rotate: bool - upscale_images: bool - spreadsheet_converter_version: int - languages: List[str] excel_tables: bool excel_tables_type: ExcelTablesType table_read_order: TableReadOrder From 452a612e963b6944491630558de031c78df18baa Mon Sep 17 00:00:00 2001 From: benleetownsend Date: Fri, 20 Feb 2026 18:10:34 +0000 Subject: [PATCH 6/6] fix: misused typing on WaitSubmissions --- indico/queries/submission.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/indico/queries/submission.py b/indico/queries/submission.py index 7e0197e2..29898e5e 100644 --- a/indico/queries/submission.py +++ b/indico/queries/submission.py @@ -287,17 +287,22 @@ class WaitForSubmissions(RequestChain["List[Submission]"]): } """ - def __init__(self, submission_ids: "List[int]", timeout: "Union[int, float]" = 60): + def __init__( + self, submission_ids: "Union[int, List[int]]", timeout: "Union[int, float]" = 60 + ): if not submission_ids: raise IndicoInputError("Please provide submission ids") self.submission_ids = submission_ids self.timeout = timeout self.status_check = partial(ne, "PROCESSING") + num_submissions = ( + 1 if isinstance(self.submission_ids, int) else len(self.submission_ids) + ) self.status_getter = partial( ListSubmissions, submission_ids=self.submission_ids, - limit=len(self.submission_ids), + limit=num_submissions, ) def requests(self) -> "Iterator[ListSubmissions]":