Adding support to load checkpoints from epoch (#606)

tchawada · vjanfaza · commit d541bd5f5843 · 2025-11-06T23:31:32.000-08:00
Signed-off-by: Tanisha <tchawada@qti.qualcomm.com>
Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
@@ -123,11 +123,19 @@ def train(
             break
 
         if train_config.use_peft and train_config.from_peft_checkpoint:
-            intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
-            intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
+            try:
+                intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
+                intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
+            except (IndexError, ValueError):
+                intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1]) - 1
+                intermediate_step = 0
+
             if epoch < intermediate_epoch:
                 logger.log_rank_zero(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
                 continue
+            if intermediate_step == 0 and epoch == intermediate_epoch:
+                logger.log_rank_zero(f"Skipping epoch {epoch + 1}, since fine tuning has already completed for it.")
+                continue
 
         logger.log_rank_zero(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
         if max_steps_reached:
@@ -154,6 +162,7 @@ def train(
             # resume training from a particular checkpoint, assuming the dataset is not shuffled
             if train_config.use_peft and train_config.from_peft_checkpoint:
                 # to bring the count of train_step in sync with where it left off
+
                 if epoch == intermediate_epoch and step == 0:
                     logger.log_rank_zero(
                         f"Skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for it."