From e0d8f966245710dd722c6a987c4ce29ee79e6827 Mon Sep 17 00:00:00 2001 From: arcticfly Date: Wed, 17 Dec 2025 17:00:33 -0800 Subject: [PATCH 1/4] Read chat template from checkpoint dir? --- src/art/local/backend.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/art/local/backend.py b/src/art/local/backend.py index 5a0bc619..c9e2e6ee 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -61,6 +61,7 @@ from ..utils import format_message, get_model_step from .checkpoints import ( delete_checkpoints, + get_last_checkpoint_dir, ) from .service import ModelService @@ -186,6 +187,21 @@ def _get_packed_tensors( self._tokenizers[model.base_model] = AutoTokenizer.from_pretrained( model.base_model ) + print("DEBUG: _get_packed_tensors", model.base_model) + # Check for custom chat template in checkpoint + model_dir = get_model_dir(model=model, art_path=self._path) + print("DEBUG: model_dir", model_dir) + checkpoint_dir = get_last_checkpoint_dir(model_dir) + print("DEBUG: checkpoint_dir", checkpoint_dir) + if checkpoint_dir: + print("DEBUG: checkpoint_dir exists") + chat_template_path = os.path.join(checkpoint_dir, "chat_template.jinja") + print("DEBUG: chat_template_path", chat_template_path) + if os.path.exists(chat_template_path): + print("DEBUG: chat_template_path exists") + with open(chat_template_path) as f: + print("DEBUG: chat_template", f.read()) + self._tokenizers[model.base_model].chat_template = f.read() if model.base_model not in self._image_processors: try: self._image_processors[model.base_model] = ( From 7f95a35e7ae43a5916331c05b221eeb97dd7881b Mon Sep 17 00:00:00 2001 From: arcticfly Date: Wed, 17 Dec 2025 17:13:24 -0800 Subject: [PATCH 2/4] Add more tracing --- src/art/local/backend.py | 8 ++++++++ src/art/local/checkpoints.py | 6 ++++++ src/art/utils/get_model_step.py | 21 +++++++++++++-------- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/art/local/backend.py b/src/art/local/backend.py index c9e2e6ee..1676ea2c 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -191,6 +191,14 @@ def _get_packed_tensors( # Check for custom chat template in checkpoint model_dir = get_model_dir(model=model, art_path=self._path) print("DEBUG: model_dir", model_dir) + print("DEBUG: model_dir exists?", os.path.exists(model_dir)) + if os.path.exists(model_dir): + print("DEBUG: model_dir contents", os.listdir(model_dir)) + checkpoints_subdir = os.path.join(model_dir, "checkpoints") + if os.path.exists(checkpoints_subdir): + print("DEBUG: checkpoints subdir contents", os.listdir(checkpoints_subdir)) + else: + print("DEBUG: no checkpoints subdir") checkpoint_dir = get_last_checkpoint_dir(model_dir) print("DEBUG: checkpoint_dir", checkpoint_dir) if checkpoint_dir: diff --git a/src/art/local/checkpoints.py b/src/art/local/checkpoints.py index cbb8ff42..f509d9d4 100644 --- a/src/art/local/checkpoints.py +++ b/src/art/local/checkpoints.py @@ -21,14 +21,20 @@ def delete_checkpoints(output_dir: str, excluding: list[int]) -> None: def get_last_checkpoint_dir(output_dir: str) -> str | None: + print("DEBUG get_last_checkpoint_dir: output_dir =", output_dir) step = get_step_from_dir(output_dir) + print("DEBUG get_last_checkpoint_dir: step =", step) if step == 0: + print("DEBUG get_last_checkpoint_dir: step is 0, returning None") return None checkpoint_dir = os.path.join(output_dir, "checkpoints", f"{step:04d}") + print("DEBUG get_last_checkpoint_dir: checkpoint_dir =", checkpoint_dir) if os.path.exists(checkpoint_dir): + print("DEBUG get_last_checkpoint_dir: returning", checkpoint_dir) return checkpoint_dir + print("DEBUG get_last_checkpoint_dir: checkpoint_dir does not exist, returning None") return None diff --git a/src/art/utils/get_model_step.py b/src/art/utils/get_model_step.py index 87dc2d36..a45a93c3 100644 --- a/src/art/utils/get_model_step.py +++ b/src/art/utils/get_model_step.py @@ -8,19 +8,24 @@ def get_step_from_dir(output_dir: str) -> int: + print("DEBUG get_step_from_dir: output_dir =", output_dir) os.makedirs(output_dir, exist_ok=True) checkpoint_dir = os.path.join(output_dir, "checkpoints") + print("DEBUG get_step_from_dir: checkpoint_dir =", checkpoint_dir) if not os.path.exists(checkpoint_dir): + print("DEBUG get_step_from_dir: checkpoint_dir does not exist, returning 0") return 0 - return max( - ( - int(subdir) - for subdir in os.listdir(checkpoint_dir) - if os.path.isdir(os.path.join(checkpoint_dir, subdir)) and subdir.isdigit() - ), - default=0, - ) + subdirs = os.listdir(checkpoint_dir) + print("DEBUG get_step_from_dir: subdirs =", subdirs) + numeric_subdirs = [ + subdir for subdir in subdirs + if os.path.isdir(os.path.join(checkpoint_dir, subdir)) and subdir.isdigit() + ] + print("DEBUG get_step_from_dir: numeric_subdirs =", numeric_subdirs) + result = max((int(d) for d in numeric_subdirs), default=0) + print("DEBUG get_step_from_dir: returning", result) + return result def get_model_step(model: "TrainableModel", art_path: str) -> int: From d033921959e1dbdb2c310531bd7c1ffcf414f804 Mon Sep 17 00:00:00 2001 From: arcticfly Date: Wed, 17 Dec 2025 17:19:55 -0800 Subject: [PATCH 3/4] Do not return no checkpoint if step is 0 --- src/art/local/checkpoints.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/art/local/checkpoints.py b/src/art/local/checkpoints.py index f509d9d4..81c8f04b 100644 --- a/src/art/local/checkpoints.py +++ b/src/art/local/checkpoints.py @@ -24,9 +24,6 @@ def get_last_checkpoint_dir(output_dir: str) -> str | None: print("DEBUG get_last_checkpoint_dir: output_dir =", output_dir) step = get_step_from_dir(output_dir) print("DEBUG get_last_checkpoint_dir: step =", step) - if step == 0: - print("DEBUG get_last_checkpoint_dir: step is 0, returning None") - return None checkpoint_dir = os.path.join(output_dir, "checkpoints", f"{step:04d}") print("DEBUG get_last_checkpoint_dir: checkpoint_dir =", checkpoint_dir) From 901aebd16b0c4cca8825864804e4896cc9fcb7d3 Mon Sep 17 00:00:00 2001 From: arcticfly Date: Wed, 17 Dec 2025 17:35:15 -0800 Subject: [PATCH 4/4] Remove logs --- src/art/local/backend.py | 24 ------------------------ src/art/local/checkpoints.py | 5 ----- src/art/utils/get_model_step.py | 21 ++++++++------------- 3 files changed, 8 insertions(+), 42 deletions(-) diff --git a/src/art/local/backend.py b/src/art/local/backend.py index 1676ea2c..5a0bc619 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -61,7 +61,6 @@ from ..utils import format_message, get_model_step from .checkpoints import ( delete_checkpoints, - get_last_checkpoint_dir, ) from .service import ModelService @@ -187,29 +186,6 @@ def _get_packed_tensors( self._tokenizers[model.base_model] = AutoTokenizer.from_pretrained( model.base_model ) - print("DEBUG: _get_packed_tensors", model.base_model) - # Check for custom chat template in checkpoint - model_dir = get_model_dir(model=model, art_path=self._path) - print("DEBUG: model_dir", model_dir) - print("DEBUG: model_dir exists?", os.path.exists(model_dir)) - if os.path.exists(model_dir): - print("DEBUG: model_dir contents", os.listdir(model_dir)) - checkpoints_subdir = os.path.join(model_dir, "checkpoints") - if os.path.exists(checkpoints_subdir): - print("DEBUG: checkpoints subdir contents", os.listdir(checkpoints_subdir)) - else: - print("DEBUG: no checkpoints subdir") - checkpoint_dir = get_last_checkpoint_dir(model_dir) - print("DEBUG: checkpoint_dir", checkpoint_dir) - if checkpoint_dir: - print("DEBUG: checkpoint_dir exists") - chat_template_path = os.path.join(checkpoint_dir, "chat_template.jinja") - print("DEBUG: chat_template_path", chat_template_path) - if os.path.exists(chat_template_path): - print("DEBUG: chat_template_path exists") - with open(chat_template_path) as f: - print("DEBUG: chat_template", f.read()) - self._tokenizers[model.base_model].chat_template = f.read() if model.base_model not in self._image_processors: try: self._image_processors[model.base_model] = ( diff --git a/src/art/local/checkpoints.py b/src/art/local/checkpoints.py index 81c8f04b..1fbb2b8e 100644 --- a/src/art/local/checkpoints.py +++ b/src/art/local/checkpoints.py @@ -21,17 +21,12 @@ def delete_checkpoints(output_dir: str, excluding: list[int]) -> None: def get_last_checkpoint_dir(output_dir: str) -> str | None: - print("DEBUG get_last_checkpoint_dir: output_dir =", output_dir) step = get_step_from_dir(output_dir) - print("DEBUG get_last_checkpoint_dir: step =", step) checkpoint_dir = os.path.join(output_dir, "checkpoints", f"{step:04d}") - print("DEBUG get_last_checkpoint_dir: checkpoint_dir =", checkpoint_dir) if os.path.exists(checkpoint_dir): - print("DEBUG get_last_checkpoint_dir: returning", checkpoint_dir) return checkpoint_dir - print("DEBUG get_last_checkpoint_dir: checkpoint_dir does not exist, returning None") return None diff --git a/src/art/utils/get_model_step.py b/src/art/utils/get_model_step.py index a45a93c3..87dc2d36 100644 --- a/src/art/utils/get_model_step.py +++ b/src/art/utils/get_model_step.py @@ -8,24 +8,19 @@ def get_step_from_dir(output_dir: str) -> int: - print("DEBUG get_step_from_dir: output_dir =", output_dir) os.makedirs(output_dir, exist_ok=True) checkpoint_dir = os.path.join(output_dir, "checkpoints") - print("DEBUG get_step_from_dir: checkpoint_dir =", checkpoint_dir) if not os.path.exists(checkpoint_dir): - print("DEBUG get_step_from_dir: checkpoint_dir does not exist, returning 0") return 0 - subdirs = os.listdir(checkpoint_dir) - print("DEBUG get_step_from_dir: subdirs =", subdirs) - numeric_subdirs = [ - subdir for subdir in subdirs - if os.path.isdir(os.path.join(checkpoint_dir, subdir)) and subdir.isdigit() - ] - print("DEBUG get_step_from_dir: numeric_subdirs =", numeric_subdirs) - result = max((int(d) for d in numeric_subdirs), default=0) - print("DEBUG get_step_from_dir: returning", result) - return result + return max( + ( + int(subdir) + for subdir in os.listdir(checkpoint_dir) + if os.path.isdir(os.path.join(checkpoint_dir, subdir)) and subdir.isdigit() + ), + default=0, + ) def get_model_step(model: "TrainableModel", art_path: str) -> int: