From 44b324978dc8f610ca1421ebf9084d4fceb570f1 Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Wed, 4 Feb 2026 15:36:31 +0000
Subject: [PATCH 1/2] fix: Tokenizer has no len / no attribute token_to_id at loading

---
 torchTextClassifiers/tokenizers/base.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py
index dee5546..8e498e7 100644
--- a/torchTextClassifiers/tokenizers/base.py
+++ b/torchTextClassifiers/tokenizers/base.py
@@ -178,17 +178,14 @@ def load(cls, load_path: str):
     @classmethod
     def load_from_s3(cls, s3_path: str, filesystem):
         if filesystem.exists(s3_path) is False:
-            raise FileNotFoundError(
-                f"Tokenizer not found at {s3_path}. Please train it first (see src/train_tokenizers)."
-            )
+            raise FileNotFoundError(f"Tokenizer not found at {s3_path}.")
 
         with filesystem.open(s3_path, "rb") as f:
             json_str = f.read().decode("utf-8")
             tokenizer_obj = Tokenizer.from_str(json_str)
 
-        tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_obj)
-        instance = cls(vocab_size=len(tokenizer), trained=True)
-        instance.tokenizer = tokenizer
+        instance = cls(vocab_size=tokenizer_obj.get_vocab_size(), trained=True)
+        instance.tokenizer = tokenizer_obj
 
         instance._post_training()
         return instance

From 9bccd0cd40405182cf2ba4bd936489a3a108905d Mon Sep 17 00:00:00 2001
From: meilame-tayebjee
Date: Wed, 4 Feb 2026 15:37:05 +0000
Subject: [PATCH 2/2] chore: force all tokenizers to have a load_from_s3 method

---
 torchTextClassifiers/tokenizers/base.py  |  5 +++++
 torchTextClassifiers/tokenizers/ngram.py | 18 +++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py
index 8e498e7..3a3b360 100644
--- a/torchTextClassifiers/tokenizers/base.py
+++ b/torchTextClassifiers/tokenizers/base.py
@@ -103,6 +103,11 @@ def __repr__(self):
     def __call__(self, text: Union[str, List[str]], **kwargs) -> list:
         return self.tokenize(text, **kwargs)
 
+    @classmethod
+    @abstractmethod
+    def load_from_s3(cls, s3_path: str, filesystem):
+        pass
+
 
 class HuggingFaceTokenizer(BaseTokenizer):
     def __init__(
diff --git a/torchTextClassifiers/tokenizers/ngram.py b/torchTextClassifiers/tokenizers/ngram.py
index ed0d8cb..fae323e 100644
--- a/torchTextClassifiers/tokenizers/ngram.py
+++ b/torchTextClassifiers/tokenizers/ngram.py
@@ -432,11 +432,24 @@ def save_pretrained(self, save_directory: str):
         print(f"✓ Tokenizer saved to {save_directory}")
 
     @classmethod
-    def from_pretrained(cls, directory: str):
+    def load_from_s3(cls, s3_path: str, filesystem):
         """Load tokenizer from saved configuration."""
-        with open(f"{directory}/tokenizer.json", "r") as f:
+
+        config = json.load(filesystem.open(s3_path, "r"))
+        tokenizer = cls.build_from_config(config)
+        return tokenizer
+
+    @classmethod
+    def load(cls, path: str):
+        """Load tokenizer from saved configuration."""
+
+        with open(path, "r") as f:
             config = json.load(f)
+        tokenizer = cls.build_from_config(config)
+        return tokenizer
 
+    @classmethod
+    def build_from_config(cls, config):
         tokenizer = cls(
             min_count=config["min_count"],
             min_n=config["min_n"],
@@ -468,5 +481,4 @@
         )
         print("✓ Subword cache built")
 
-        print(f"✓ Tokenizer loaded from {directory}")
         return tokenizer
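
Usage note (not part of the patch series): a minimal sketch of how the new
load_from_s3 classmethods could be called after both patches are applied. It
assumes an s3fs/fsspec filesystem object, which provides the .exists() and
.open() methods the methods rely on. The bucket/key paths and the
NGramTokenizer class name are illustrative assumptions, not taken from the
diffs; only HuggingFaceTokenizer and the load_from_s3 signature appear above.

    # Sketch only: paths and the NGramTokenizer name are assumptions.
    import s3fs

    from torchTextClassifiers.tokenizers.base import HuggingFaceTokenizer
    from torchTextClassifiers.tokenizers.ngram import NGramTokenizer

    # Any fsspec-compatible filesystem exposing .exists()/.open() works.
    fs = s3fs.S3FileSystem()

    # PATCH 1/2: rebuilds a raw tokenizers.Tokenizer via Tokenizer.from_str
    # and sizes the vocab with get_vocab_size(), instead of wrapping it in
    # PreTrainedTokenizerFast and calling len() (the path that failed).
    hf_tokenizer = HuggingFaceTokenizer.load_from_s3(
        "my-bucket/tokenizers/tokenizer.json", fs
    )

    # PATCH 2/2: reads a JSON config from S3 and rebuilds the tokenizer
    # through the shared build_from_config helper.
    ngram_tokenizer = NGramTokenizer.load_from_s3(
        "my-bucket/tokenizers/ngram_tokenizer.json", fs
    )

Since load_from_s3 is now an abstractmethod on BaseTokenizer, any concrete
tokenizer subclass must implement it, which is what the ngram.py half of
PATCH 2/2 does by refactoring from_pretrained into load / load_from_s3 over
a common build_from_config.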