Skip to content

Commit d541bd5

Browse files
tchawada authored and vjanfaza committed
Adding support to load checkpoints from epoch (#606)
Signed-off-by: Tanisha <tchawada@qti.qualcomm.com>
Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent ad0c6bd commit d541bd5

File tree

1 file changed

+11
-2
lines changed

1 file changed

+11
-2
lines changed

QEfficient/finetune/utils/train_utils.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -123,11 +123,19 @@ def train(
123123
break
124124

125125
if train_config.use_peft and train_config.from_peft_checkpoint:
126-
intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
127-
intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
126+
try:
127+
intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
128+
intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
129+
except (IndexError, ValueError):
130+
intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1]) - 1
131+
intermediate_step = 0
132+
128133
if epoch < intermediate_epoch:
129134
logger.log_rank_zero(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
130135
continue
136+
if intermediate_step == 0 and epoch == intermediate_epoch:
137+
logger.log_rank_zero(f"Skipping epoch {epoch + 1}, since fine tuning has already completed for it.")
138+
continue
131139

132140
logger.log_rank_zero(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
133141
if max_steps_reached:
@@ -154,6 +162,7 @@ def train(
154162
# resume training from a particular checkpoint, assuming the dataset is not shuffled
155163
if train_config.use_peft and train_config.from_peft_checkpoint:
156164
# to bring the count of train_step in sync with where it left off
165+
157166
if epoch == intermediate_epoch and step == 0:
158167
logger.log_rank_zero(
159168
f"Skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for it."

0 commit comments

Comments
 (0)