From bb6039ad391ddd39068ad21ecf30b8ab075243ef Mon Sep 17 00:00:00 2001
From: codeharris
Date: Mon, 14 Jul 2025 17:02:51 +0200
Subject: [PATCH 01/10] [FEATURE:jobs/gnu-parallel] Add basic scripts to launch GNU parallel

---
 docs/jobs/gnu-parallel.md | 106 ++++++++++++++++++++++++++++++++++++++
 mkdocs.yml                |   1 +
 2 files changed, 107 insertions(+)
 create mode 100644 docs/jobs/gnu-parallel.md

diff --git a/docs/jobs/gnu-parallel.md b/docs/jobs/gnu-parallel.md
new file mode 100644
index 000000000..15b5f98a3
--- /dev/null
+++ b/docs/jobs/gnu-parallel.md
@@ -0,0 +1,106 @@
+## Running jobs with GNU parallel
+
+The Slurm scheduler performs 2 jobs,
+
+- allocate resources for a job (allocation),
+- lunches the job steps.
+
+The job steps are the actual processes launched within a job which consume the job resources. Resources can be entities like nodes, CPU cores, GPUs, and memory allocated for the job. The job steps can execute in serial or parallel given that enough resources are available.
+
+The Slurm scheduler is designed to allocate resources in an allocation loop that runs periodically, usually every 30-180sec depending on the Slurm configuration. The resource allocation loop is quite time consuming as the scheduler is configured to perform operations such as back-filling. If a lot of small jobs are in the queue, they tend to trigger expensive operations such as back-filling, and can delay the scheduling loop past its usual period. The end result is a scheduler with sluggish response.
+
+To avoid multiple small jobs, we can schedule multiple jobs in a single allocation.
+
+```bash
+#!/bin/bash --login
+#SBATCH --job-name=single_process
+#SBATCH --partition=batch
+#SBATCH --qos=normal
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=16
+#SBATCH --time=02:00:00
+#SBATCH --output=%x-%j.out
+#SBATCH --error=%x-%j.err
+#SBATCH --exclusive
+#SBATCH --mem=0
+
+declare stress_test_duration=160
+
+parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 stress --cpu 16 --timeout "${stress_test_duration}" ::: {0..255}
+```
+
+The scheduler is much more efficient in launching job steps within a job, as the resources have been allocated and there is no need to interact with the resource allocation loop. Job steps are launched in blocking calls within a job whenever a `srun` command is executed in the job.
+
+However, there are limits even on the number of job steps per job, as the scheduler needs to keep some information for each job step, and multiple small job steps encumber the scheduler database. To reduce the number of job steps, we can group smaller jobs into groups of jobs launched with parallel within a job step.
+
+There are two options when launching multiple scripts in a single step: we can launch them in an external script or within functions.
When using an external script, call the external script from your main script + +```bash +#!/bin/bash --login +#SBATCH --job-name=multi_process +#SBATCH --partition=batch +#SBATCH --qos=normal +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=16 +#SBATCH --time=02:00:00 +#SBATCH --output=%x-%j.out +#SBATCH --error=%x-%j.err +#SBATCH --exclusive +#SBATCH --mem=0 + +declare stress_test_duration=5 +declare operations_per_step=256 + +parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 run_job_step "${operations_per_step}" "${stress_test_duration}" ::: {0..255} +``` + +and ensure that the external script is accessible, for instance placed in the same directory: + +```bash +#!/bin/bash --login +# Contents of `run_job_step` + +declare total_operations="${1}" +declare test_duration="${2}" +declare final_operation=$((${total_operations}-1)) + +parallel --max-procs 4 --max-args 0 stress --cpu 4 --timeout "${test_duration}" ::: $(seq 0 "${final_operation}") +``` + +When running the job in a function, make sure that the function is exported to the environment of `srun`: + +```bash +#!/bin/bash --login +#SBATCH --job-name=function_multi_process +#SBATCH --partition=batch +#SBATCH --qos=normal +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=16 +#SBATCH --time=02:00:00 +#SBATCH --output=%x-%j.out +#SBATCH --error=%x-%j.err +#SBATCH --exclusive +#SBATCH --mem=0 + +declare stress_test_duration=5 +declare operations_per_step=256 + +run_step() { + local total_operations="${1}" + local test_duration="${2}" + local final_operation=$((${total_operations}-1)) + + parallel --max-procs 4 --max-args 0 stress --cpu 4 --timeout "${test_duration}" ::: $(seq 0 "${final_operation}") +} + +export -f run_step + +parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 bash -c "\"run_step ${operations_per_step} ${stress_test_duration}\"" ::: {0..255} +``` + +_Resources_ + +- [luncher_script_examples.zip](https://github.com/user-attachments/files/21215923/luncher_script_examples.zip) diff --git a/mkdocs.yml b/mkdocs.yml index f2c736140..b4fba8497 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -95,6 +95,7 @@ nav: - Long Jobs: 'jobs/long.md' - Best-effort Jobs: 'jobs/best-effort.md' - Launcher Scripts Examples: 'slurm/launchers.md' + - GNU parallel: 'jobs/gnu-parallel.md' # - Affinity: 'jobs/affinity.md' # - (Multi-)GPU Jobs: 'jobs/gpu.md' # - Memory Management: 'jobs/memory.md' From 6f9f61eaf48d708340d755b0f34b473cde88d4e1 Mon Sep 17 00:00:00 2001 From: codeharris Date: Thu, 17 Jul 2025 16:50:06 +0200 Subject: [PATCH 02/10] Expand the GNU parallel use cases --- docs/jobs/gnu-parallel.md | 109 +++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 2 deletions(-) diff --git a/docs/jobs/gnu-parallel.md b/docs/jobs/gnu-parallel.md index 15b5f98a3..106e0b158 100644 --- a/docs/jobs/gnu-parallel.md +++ b/docs/jobs/gnu-parallel.md @@ -2,8 +2,10 @@ The Slurm scheduler performs 2 jobs, -- allocate resources for a job (allocation), -- lunches the job steps. +- allocates resources for a job (allocation), +- launches the job steps. + + The job steps are the actual processes launched within a job which consume the job resources. Resources can be entities like nodes, CPU cores, GPUs, and memory allocated for the job. The job steps can execute in serial or parallel given that enough resources are available. 
@@ -11,6 +13,7 @@ The Slurm scheduler is designed to allocate resources in an allocation loop that To avoid multiple small jobs, we can schedule multiple jobs in a single allocation. + ```bash #!/bin/bash --login #SBATCH --job-name=single_process @@ -101,6 +104,108 @@ export -f run_step parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 bash -c "\"run_step ${operations_per_step} ${stress_test_duration}\"" ::: {0..255} ``` + +## Launching concurrent programs in one allocation + +Many real‑world pipelines need to run several different executables inside a single Slurm allocation. The simplest way to orchestrate them is to provide a space‑ or tab‑delimited command table: + +```bash +#!/bin/bash --login +#SBATCH --job-name=many_programs +# ... Slurm directives ... + +cat > cmdlist.txt <<'EOF' +fastqc sample1.fastq.gz +samtools sort sample1.bam -o sample1.sorted.bam +python train_model.py --epochs 10 +EOF + +parallel --colsep ' +' --max-procs "${SLURM_NTASKS}" \ + srun --nodes=1 --ntasks=1 {1} {2..} \ + :::: cmdlist.txt +``` + +* `{1}` is the program; `{2..}` expands to the remaining columns (its arguments). +* `--colsep ' +'` treats runs of spaces or tabs as column separators. + + + +## Collecting Logs and Monitoring Progress + +```bash +parallel --joblog run.log \ + --results results/{#}/ \ + --bar --eta \ + srun ... ::: ${TASKS} +``` + +* **`run.log`** — TSV with start/finish, runtime, exit status. +* **`results/{#}`** — one directory per task; stdout/stderr captured automatically. +* **`--bar`** — live progress bar; **`--eta`** — estimated completion time. + +Tail the bar in real time: + +```bash +tail -f --pid=${PARALLEL_PID} parallel_bar.log +``` + + +## Error Handling and Automatic Retries + +Enable bounded retries for flaky tasks: + +```bash +parallel --retries 3 --halt now,fail=1 \ + srun ... ::: ${TASKS} +``` + +* `--retries 3` — attempt each job up to 3 times. +* `--halt now,fail=1` — abort the whole allocation if any task keeps failing. + +For *checkpointable* binaries, pair Parallel’s resume file with `--resume`: + +```bash +parallel --joblog run.log --resume-failed ... +``` + + +## Performance Tuning Tips + +| Symptom | Lever | Example | +| --------------------------- | -------------------- | --------------------------------- | +| I/O saturation on shared FS | `--compress` | pipe‑compress large stdout chunks | +| Many tiny files | `--results /scratch` | stage results to local SSD first | +| CPU under‑utilisation | `--block 10M` | batch stdin in 10 MB chunks | +| SSH startup cost | `--sshloginfile` | reuse control master via `-M` | + +Benchmark one tweak at a time; use `sar`/`iostat` on compute nodes to confirm bottlenecks. + + + +## Comparing GNU Parallel and Slurm Job Arrays + +GNU Parallel and Slurm job arrays both launch many similar tasks, but they solve *different* bottlenecks. + +| Use Case | Prefer GNU Parallel | Prefer Slurm Array | +| -------------------------------------------------- | ------------------------------------------------- | ------------------------------------------- | +| **Interactive/rapid turn‑around** (e.g. 
dev nodes) | ✔ Parallel runs immediately inside one allocation | ✖ Array needs scheduler cycle for each task | +| **Thousands of ultra‑short jobs** | ✔ Drastically reduces queue chatter | ✖ Creates scheduling overhead | +| **Need individual Slurm accounting per task** | ✖ All tasks share one Slurm step | ✔ Each array index has its own record | +| **Mix heterogeneous commands** | ✔ Parallel can vary executable per line | ✖ Arrays assume one script | +| **Checkpoint/re‑queue tasks** | ✖ Must script custom resume | ✔ Native `--array` + `--requeue` | + +A quick rule‑of‑thumb: + +> *If the run time of the task is **less than the scheduler cycle** (≈30‑180 s), package the tasks with GNU Parallel.* + +--- + + + + + + _Resources_ - [luncher_script_examples.zip](https://github.com/user-attachments/files/21215923/luncher_script_examples.zip) +- \ No newline at end of file From 29b6e28e2eafb12e70aff3df7e50471f6861ace0 Mon Sep 17 00:00:00 2001 From: codeharris Date: Fri, 18 Jul 2025 15:42:28 +0200 Subject: [PATCH 03/10] Format the text in the GNU parallel page --- docs/jobs/gnu-parallel.md | 79 ++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/docs/jobs/gnu-parallel.md b/docs/jobs/gnu-parallel.md index 106e0b158..aa91244fa 100644 --- a/docs/jobs/gnu-parallel.md +++ b/docs/jobs/gnu-parallel.md @@ -105,24 +105,32 @@ parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 ba ``` -## Launching concurrent programs in one allocation +## Launch concurrent Programs in One Allocation + +Often, real workflows need to run different commands or executables within one job. GNU Parallel can take a command list from a file and execute each line. For example, create a tab-separated file `cmdlist.txt` listing programs and their arguments for each task: + +```txt +# prog args +python3 data_processing.py sample1.dat sample1.proc +python3 model_training.py sample1.csv sample1.model +``` + +Each line defines a program and its arguments. We can then write a Slurm batch script to execute each line in parallel: -Many real‑world pipelines need to run several different executables inside a single Slurm allocation. The simplest way to orchestrate them is to provide a space‑ or tab‑delimited command table: ```bash #!/bin/bash --login -#SBATCH --job-name=many_programs -# ... Slurm directives ... - -cat > cmdlist.txt <<'EOF' -fastqc sample1.fastq.gz -samtools sort sample1.bam -o sample1.sorted.bam -python train_model.py --epochs 10 -EOF - -parallel --colsep ' +' --max-procs "${SLURM_NTASKS}" \ - srun --nodes=1 --ntasks=1 {1} {2..} \ - :::: cmdlist.txt +#SBATCH --job-name=conc_programs +#SBATCH --partition=batch +#SBATCH --qos=normal +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=16 +#SBATCH --time=02:00:00 +#SBATCH --output=%x-%j.out +#SBATCH --error=%x-%j.err + +parallel --colsep '\t' --jobs "$SLURM_NTASKS" --results parallel_logs/ srun -N1 -n1 {1} {2} :::: cmdlist.txt ``` * `{1}` is the program; `{2..}` expands to the remaining columns (its arguments). @@ -130,7 +138,7 @@ parallel --colsep ' +' --max-procs "${SLURM_NTASKS}" \ -## Collecting Logs and Monitoring Progress +## Collect Logs and Monitor Progress ```bash parallel --joblog run.log \ @@ -139,9 +147,9 @@ parallel --joblog run.log \ srun ... ::: ${TASKS} ``` -* **`run.log`** — TSV with start/finish, runtime, exit status. -* **`results/{#}`** — one directory per task; stdout/stderr captured automatically. 
-* **`--bar`** — live progress bar; **`--eta`** — estimated completion time. +* `run.log` — TSV with start/finish, runtime, exit status. +* `results/{#}` — one directory per task; stdout/stderr captured automatically. +* `--bar` — live progress bar; **`--eta`** — estimated completion time. Tail the bar in real time: @@ -169,34 +177,20 @@ parallel --joblog run.log --resume-failed ... ``` -## Performance Tuning Tips - -| Symptom | Lever | Example | -| --------------------------- | -------------------- | --------------------------------- | -| I/O saturation on shared FS | `--compress` | pipe‑compress large stdout chunks | -| Many tiny files | `--results /scratch` | stage results to local SSD first | -| CPU under‑utilisation | `--block 10M` | batch stdin in 10 MB chunks | -| SSH startup cost | `--sshloginfile` | reuse control master via `-M` | - -Benchmark one tweak at a time; use `sar`/`iostat` on compute nodes to confirm bottlenecks. +## GNU Parallel vs Slurm Job Arrays +| **Use Case** | **Use GNU Parallel** | **Use Slurm Job Arrays** | +|------------------------------------------|-----------------------------------------------|--------------------------------------------------| +| Interactive or quick testing | Runs tasks immediately | May wait for each task to be scheduled | +| Thousands of very short tasks | Reduces load on the scheduler | Can overload the scheduler | +| Need individual job tracking | All tasks share the same job record | Each task has its own job record | +| Different commands per task | Can run different commands in each task | Usually runs the same script for all tasks | +| Restart failed tasks easily | Needs manual scripting to resume tasks | Has built-in support for retrying failed tasks | +--- -## Comparing GNU Parallel and Slurm Job Arrays - -GNU Parallel and Slurm job arrays both launch many similar tasks, but they solve *different* bottlenecks. - -| Use Case | Prefer GNU Parallel | Prefer Slurm Array | -| -------------------------------------------------- | ------------------------------------------------- | ------------------------------------------- | -| **Interactive/rapid turn‑around** (e.g. dev nodes) | ✔ Parallel runs immediately inside one allocation | ✖ Array needs scheduler cycle for each task | -| **Thousands of ultra‑short jobs** | ✔ Drastically reduces queue chatter | ✖ Creates scheduling overhead | -| **Need individual Slurm accounting per task** | ✖ All tasks share one Slurm step | ✔ Each array index has its own record | -| **Mix heterogeneous commands** | ✔ Parallel can vary executable per line | ✖ Arrays assume one script | -| **Checkpoint/re‑queue tasks** | ✖ Must script custom resume | ✔ Native `--array` + `--requeue` | - -A quick rule‑of‑thumb: +> **Tip**: If your tasks are shorter than the scheduler wait time (around **30 to 180 seconds**), it's better to use **GNU Parallel**. Otherwise, use **Slurm Job Arrays**. -> *If the run time of the task is **less than the scheduler cycle** (≈30‑180 s), package the tasks with GNU Parallel.* --- @@ -208,4 +202,3 @@ A quick rule‑of‑thumb: _Resources_ - [luncher_script_examples.zip](https://github.com/user-attachments/files/21215923/luncher_script_examples.zip) -- \ No newline at end of file From df17e6eaf55a1586b31b156af4a5c613acc83e2a Mon Sep 17 00:00:00 2001 From: Georgios Kafanas Date: Mon, 21 Jul 2025 11:06:12 +0200 Subject: [PATCH 04/10] Update gnu-parallel.md Remove redundant blank lines. 
--- docs/jobs/gnu-parallel.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/jobs/gnu-parallel.md b/docs/jobs/gnu-parallel.md index aa91244fa..82364d6a4 100644 --- a/docs/jobs/gnu-parallel.md +++ b/docs/jobs/gnu-parallel.md @@ -5,15 +5,12 @@ The Slurm scheduler performs 2 jobs, - allocates resources for a job (allocation), - launches the job steps. - - The job steps are the actual processes launched within a job which consume the job resources. Resources can be entities like nodes, CPU cores, GPUs, and memory allocated for the job. The job steps can execute in serial or parallel given that enough resources are available. The Slurm scheduler is designed to allocate resources in an allocation loop that runs periodically, usually every 30-180sec depending on the Slurm configuration. The resource allocation loop is quite time consuming as the scheduler is configured to perform operations such as back-filling. If a lot of small jobs are in the queue, they tend to trigger expensive operations such as back-filling, and can delay the scheduling loop past its usual period. The end result is a scheduler with sluggish response. To avoid multiple small jobs, we can schedule multiple jobs in a single allocation. - ```bash #!/bin/bash --login #SBATCH --job-name=single_process From 830a7e7b0d507554b2a9f462e6a055d1a019951a Mon Sep 17 00:00:00 2001 From: codeharris Date: Mon, 21 Jul 2025 18:11:02 +0200 Subject: [PATCH 05/10] Add best practices and info section --- docs/jobs/gnu-parallel.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/jobs/gnu-parallel.md b/docs/jobs/gnu-parallel.md index aa91244fa..1df864eeb 100644 --- a/docs/jobs/gnu-parallel.md +++ b/docs/jobs/gnu-parallel.md @@ -105,6 +105,37 @@ parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 ba ``` +To run jobs successfully, the resources you request from Slurm (#SBATCH directives) must match what your commands (parallel, srun, and your program) actually use. Let's break down the previous examples to see how the numbers connect. + + +??? info "How the Resources Are Calculated and Used" + + 1. **Total Tasks (The "Slots" for Work)** + * We request `#SBATCH --nodes=4` and `#SBATCH --ntasks-per-node=8`. + * Slurm calculates the total number of tasks it will create for our job: `4 nodes × 8 tasks/node = 32 total tasks`. + * This total value is automatically stored in the `$SLURM_NTASKS` environment variable. + + 2. **CPUs for Each Task** + * We request `#SBATCH --cpus-per-task=16`. + * This tells Slurm: "For each of the 32 tasks, reserve **16 dedicated CPU cores**." This is the resource pool for a single piece of work. + + 3. **GNU Parallel's Role** + * We use `parallel --max-procs "${SLURM_NTASKS}" ...` + * This instructs GNU Parallel to run up to `$SLURM_NTASKS` (which is 32) commands concurrently. It will launch 32 `srun` commands at once, filling every available task "slot" that Slurm prepared for us. + + 4. **The `srun` Command (The Job Step)** + * The command being run by `parallel` is `srun --nodes=1 --ntasks=1 ...` + * Each of these `srun` commands consumes exactly **one** of the 32 available task slots. + + 5. **The `stress` Program (The Actual Work)** + * Finally, the program being run is `stress --cpu 16`. + * This is the crucial link: we instruct our program to use **16 CPUs**, which perfectly matches the `#SBATCH --cpus-per-task=16` directive. The `srun` command ensures this `stress` test runs within the 16 cores that Slurm reserved for it. 
+ + +> **The Golden Rule:** The `--cpu` (or `--threads`, etc.) value in your final program should match the value you requested in `#SBATCH --cpus-per-task`. This ensures your job uses exactly what it asked for, leading to maximum efficiency and stability. + + + ## Launch concurrent Programs in One Allocation Often, real workflows need to run different commands or executables within one job. GNU Parallel can take a command list from a file and execute each line. For example, create a tab-separated file `cmdlist.txt` listing programs and their arguments for each task: From 5d5d2a8897b36d5d9a9bb0ff45481d3f197e5f58 Mon Sep 17 00:00:00 2001 From: codeharris Date: Tue, 22 Jul 2025 16:38:03 +0200 Subject: [PATCH 06/10] Replace info section and comparison table --- docs/jobs/gnu-parallel.md | 106 ++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 63 deletions(-) diff --git a/docs/jobs/gnu-parallel.md b/docs/jobs/gnu-parallel.md index e32f842c7..370e53aab 100644 --- a/docs/jobs/gnu-parallel.md +++ b/docs/jobs/gnu-parallel.md @@ -101,50 +101,21 @@ export -f run_step parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 bash -c "\"run_step ${operations_per_step} ${stress_test_duration}\"" ::: {0..255} ``` - -To run jobs successfully, the resources you request from Slurm (#SBATCH directives) must match what your commands (parallel, srun, and your program) actually use. Let's break down the previous examples to see how the numbers connect. - - -??? info "How the Resources Are Calculated and Used" - - 1. **Total Tasks (The "Slots" for Work)** - * We request `#SBATCH --nodes=4` and `#SBATCH --ntasks-per-node=8`. - * Slurm calculates the total number of tasks it will create for our job: `4 nodes × 8 tasks/node = 32 total tasks`. - * This total value is automatically stored in the `$SLURM_NTASKS` environment variable. - - 2. **CPUs for Each Task** - * We request `#SBATCH --cpus-per-task=16`. - * This tells Slurm: "For each of the 32 tasks, reserve **16 dedicated CPU cores**." This is the resource pool for a single piece of work. - - 3. **GNU Parallel's Role** - * We use `parallel --max-procs "${SLURM_NTASKS}" ...` - * This instructs GNU Parallel to run up to `$SLURM_NTASKS` (which is 32) commands concurrently. It will launch 32 `srun` commands at once, filling every available task "slot" that Slurm prepared for us. - - 4. **The `srun` Command (The Job Step)** - * The command being run by `parallel` is `srun --nodes=1 --ntasks=1 ...` - * Each of these `srun` commands consumes exactly **one** of the 32 available task slots. - - 5. **The `stress` Program (The Actual Work)** - * Finally, the program being run is `stress --cpu 16`. - * This is the crucial link: we instruct our program to use **16 CPUs**, which perfectly matches the `#SBATCH --cpus-per-task=16` directive. The `srun` command ensures this `stress` test runs within the 16 cores that Slurm reserved for it. - - -> **The Golden Rule:** The `--cpu` (or `--threads`, etc.) value in your final program should match the value you requested in `#SBATCH --cpus-per-task`. This ensures your job uses exactly what it asked for, leading to maximum efficiency and stability. - +To run jobs successfully, the resources you request from Slurm (#SBATCH directives) must match what your commands (parallel, srun, and your program) actually use. 
See [Resource Allocation Guidelines](https://hpc-docs.uni.lu/slurm/launchers/#resource-allocation-guidelines) ## Launch concurrent Programs in One Allocation -Often, real workflows need to run different commands or executables within one job. GNU Parallel can take a command list from a file and execute each line. For example, create a tab-separated file `cmdlist.txt` listing programs and their arguments for each task: +Often, real workflows need to run different commands or executables within one job. GNU Parallel can take a command list from a file and execute each line. For example, create `cmdlist.txt` (tab-separated, or use multiple spaces) listing programs and their arguments for each task: ```txt -# prog args -python3 data_processing.py sample1.dat sample1.proc -python3 model_training.py sample1.csv sample1.model +python3 data_processing.py sample1.dat sample1.proc +python3 model_training.py sample1.csv sample1.model ``` -Each line defines a program and its arguments. We can then write a Slurm batch script to execute each line in parallel: +*(Use tabs or multiple spaces between columns)* +Each line defines a program and its arguments. We can then write a Slurm batch script to execute each line in parallel: ```bash #!/bin/bash --login @@ -161,71 +132,80 @@ Each line defines a program and its arguments. We can then write a Slurm batch s parallel --colsep '\t' --jobs "$SLURM_NTASKS" --results parallel_logs/ srun -N1 -n1 {1} {2} :::: cmdlist.txt ``` -* `{1}` is the program; `{2..}` expands to the remaining columns (its arguments). -* `--colsep ' +'` treats runs of spaces or tabs as column separators. +- `{1}` is the program; `{2..}` expands to the remaining columns (its arguments). +- `--colsep ' +'` treats runs of spaces or tabs as column separators. +if you want to pass each line as a full command use: +```bash +parallel --jobs "$SLURM_NTASKS" --results parallel_logs/ srun -N1 -n1 {} :::: cmdlist.txt +``` +- `{}` is replaced by the entire line (the full command and its arguments). ## Collect Logs and Monitor Progress ```bash -parallel --joblog run.log \ - --results results/{#}/ \ - --bar --eta \ - srun ... ::: ${TASKS} +parallel --joblog run.log --results results/{#}/ --bar --eta srun ... ::: ${TASKS} ``` -* `run.log` — TSV with start/finish, runtime, exit status. -* `results/{#}` — one directory per task; stdout/stderr captured automatically. -* `--bar` — live progress bar; **`--eta`** — estimated completion time. +- `run.log` — records start/finish time, runtime duration, exit status. +- `results/{#}` — create a separate directory per task; stdout/stderr captured automatically. +- `--bar` — live progress bar +- `--eta` — estimated completion time + + -Tail the bar in real time: +To check the actual state of your job and all it's steps you can use `sacct` ```bash -tail -f --pid=${PARALLEL_PID} parallel_bar.log +sacct -j $SLURM_JOBID --format=JobID,JobName,State,ExitCode,Elapsed ``` - ## Error Handling and Automatic Retries Enable bounded retries for flaky tasks: ```bash -parallel --retries 3 --halt now,fail=1 \ - srun ... ::: ${TASKS} +parallel --retries 3 --halt now,fail=1 srun ... ::: ${TASKS} ``` -* `--retries 3` — attempt each job up to 3 times. -* `--halt now,fail=1` — abort the whole allocation if any task keeps failing. +- `--retries 3` — retries each task up to 3 times if it fails. +- `--halt now,fail=1` — If any task fails after all retries, GNU Parallel will immediately stop all running and pending tasks, aborting the whole job allocation. 
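+As an illustration, a complete invocation combining these flags might look like the following sketch; the task script `./my_task.sh` and the input range `{1..100}` are placeholders to adapt to your own workload:
+
+```bash
+# Run 100 independent tasks, retrying each failed task up to 3 times.
+# If a task still fails after its retries, stop all remaining tasks.
+parallel --retries 3 --halt now,fail=1 --joblog run.log \
+    srun --nodes=1 --ntasks=1 ./my_task.sh {} ::: {1..100}
+```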
-For *checkpointable* binaries, pair Parallel’s resume file with `--resume`: +For _checkpointable_ binaries, you can resume failed tasks using the joblog: ```bash parallel --joblog run.log --resume-failed ... ``` +`--resume-failed` — Only reruns the tasks that failed (according to the log file). +## When to Use GNU Parallel -## GNU Parallel vs Slurm Job Arrays +Use GNU Parallel when: -| **Use Case** | **Use GNU Parallel** | **Use Slurm Job Arrays** | -|------------------------------------------|-----------------------------------------------|--------------------------------------------------| -| Interactive or quick testing | Runs tasks immediately | May wait for each task to be scheduled | -| Thousands of very short tasks | Reduces load on the scheduler | Can overload the scheduler | -| Need individual job tracking | All tasks share the same job record | Each task has its own job record | -| Different commands per task | Can run different commands in each task | Usually runs the same script for all tasks | -| Restart failed tasks easily | Needs manual scripting to resume tasks | Has built-in support for retrying failed tasks | +- You have many short or heterogeneous tasks (different commands or arguments per task). ---- +- You want to minimize scheduler overhead by running many tasks within a single job allocation. -> **Tip**: If your tasks are shorter than the scheduler wait time (around **30 to 180 seconds**), it's better to use **GNU Parallel**. Otherwise, use **Slurm Job Arrays**. +- You need to quickly retry or resume failed tasks using GNU Parallel’s joblog. +- You want interactive or rapid prototyping without waiting for the scheduler. ---- +- You want to efficiently utilize allocated resources by launching multiple commands concurrently. + +When not to use GNU Parallel: + +- When you need each task to be tracked individually by the scheduler for accounting or dependencies. +- When your tasks are long-running and require advanced scheduler features like job dependencies or per-task resource allocation. +- When your workflow is already well-suited to Slurm’s built-in job array features. +--- +> **Tip**: If your tasks are shorter than the scheduler wait time (around **30 to 180 seconds**), it's better to use **GNU Parallel**. Otherwise, use **Slurm Job Arrays**. +--- _Resources_ From 013081dc80752572a7696c8e5e2c9096b66a1559 Mon Sep 17 00:00:00 2001 From: codeharris Date: Wed, 23 Jul 2025 16:48:46 +0200 Subject: [PATCH 07/10] Expands GNU parallel introduction and basic example --- docs/jobs/gnu-parallel.md | 45 +++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/docs/jobs/gnu-parallel.md b/docs/jobs/gnu-parallel.md index 370e53aab..8add06208 100644 --- a/docs/jobs/gnu-parallel.md +++ b/docs/jobs/gnu-parallel.md @@ -1,3 +1,16 @@ +[GNU Parallel](https://www.gnu.org/software/parallel/) is a tool for executing tasks in parallel, typically on a single machine. When coupled with the Slurm command `srun`, `parallel` becomes a powerful way of distributing a set of tasks amongst a number of workers. This is particularly useful when the number of tasks is significantly larger than the number of available workers (i.e. `$SLURM_NTASKS`), and each tasks is independent of the others. + +??? 
info "Example usage" + + ```bash + $ parallel -j 5 'srun --exclusive -N1 -n1 echo Task {} on $(hostname)' ::: {1..5} + Task 1 on access1.aion-cluster.uni.lux + Task 4 on access1.aion-cluster.uni.lux + Task 5 on access1.aion-cluster.uni.lux + Task 3 on access1.aion-cluster.uni.lux + Task 2 on access1.aion-cluster.uni.lux + ``` + ## Running jobs with GNU parallel The Slurm scheduler performs 2 jobs, @@ -106,7 +119,7 @@ To run jobs successfully, the resources you request from Slurm (#SBATCH directiv ## Launch concurrent Programs in One Allocation -Often, real workflows need to run different commands or executables within one job. GNU Parallel can take a command list from a file and execute each line. For example, create `cmdlist.txt` (tab-separated, or use multiple spaces) listing programs and their arguments for each task: +Often, real workflows need to run different commands or executables within one job. GNU parallel can take a command list from a file and execute each line. For example, create `cmdlist.txt` (tab-separated, or use multiple spaces) listing programs and their arguments for each task: ```txt python3 data_processing.py sample1.dat sample1.proc @@ -155,10 +168,27 @@ parallel --joblog run.log --results results/{#}/ --bar --eta srun ... ::: ${TASK -To check the actual state of your job and all it's steps you can use `sacct` +To check the actual state of your job and all it's steps you can use `sacct` command. ```bash sacct -j $SLURM_JOBID --format=JobID,JobName,State,ExitCode,Elapsed +$ sacct -j 8717582 --format=JobID,JobName,State,ExitCode,Elapsed +JobID JobName State ExitCode Elapsed +------------ ---------- ---------- -------- ---------- +8717582 single_pr+ RUNNING 0:0 00:00:52 +8717582.bat+ batch RUNNING 0:0 00:00:52 +8717582.ext+ extern RUNNING 0:0 00:00:52 +8717582.0 stress RUNNING 0:0 00:00:51 +8717582.1 stress RUNNING 0:0 00:00:51 +8717582.2 stress RUNNING 0:0 00:00:51 +8717582.3 stress RUNNING 0:0 00:00:51 +8717582.4 stress RUNNING 0:0 00:00:51 +8717582.5 stress RUNNING 0:0 00:00:51 +8717582.6 stress RUNNING 0:0 00:00:51 +8717582.7 stress RUNNING 0:0 00:00:51 +8717582.8 stress RUNNING 0:0 00:00:51 +8717582.9 stress RUNNING 0:0 00:00:51 +8717582.10 stress RUNNING 0:0 00:00:51 ``` ## Error Handling and Automatic Retries @@ -166,7 +196,8 @@ sacct -j $SLURM_JOBID --format=JobID,JobName,State,ExitCode,Elapsed Enable bounded retries for flaky tasks: ```bash -parallel --retries 3 --halt now,fail=1 srun ... ::: ${TASKS} +# /!\ ADAPT to set the number of automatic retries +parallel --retries --halt now,fail=1 srun ... ::: ${TASKS} ``` - `--retries 3` — retries each task up to 3 times if it fails. @@ -179,9 +210,9 @@ parallel --joblog run.log --resume-failed ... ``` `--resume-failed` — Only reruns the tasks that failed (according to the log file). -## When to Use GNU Parallel +## When to Use GNU parallel -Use GNU Parallel when: +Use GNU parallel when: - You have many short or heterogeneous tasks (different commands or arguments per task). @@ -193,7 +224,7 @@ Use GNU Parallel when: - You want to efficiently utilize allocated resources by launching multiple commands concurrently. -When not to use GNU Parallel: +When not to use GNU parallel: - When you need each task to be tracked individually by the scheduler for accounting or dependencies. @@ -203,7 +234,7 @@ When not to use GNU Parallel: --- -> **Tip**: If your tasks are shorter than the scheduler wait time (around **30 to 180 seconds**), it's better to use **GNU Parallel**. Otherwise, use **Slurm Job Arrays**. 
+> **Tip**: If your tasks are shorter than the scheduler wait time (around **30 to 180 seconds**), it's better to use **GNU parallel**. Otherwise, use **Slurm Job Arrays**. --- From d3401ac7312a426c2b58c81d25365b36ffb5b546 Mon Sep 17 00:00:00 2001 From: codeharris Date: Thu, 24 Jul 2025 14:50:56 +0200 Subject: [PATCH 08/10] Add a tip on how srun works with parallel --- docs/jobs/gnu-parallel.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/jobs/gnu-parallel.md b/docs/jobs/gnu-parallel.md index 8add06208..c3b8999bc 100644 --- a/docs/jobs/gnu-parallel.md +++ b/docs/jobs/gnu-parallel.md @@ -10,6 +10,7 @@ Task 3 on access1.aion-cluster.uni.lux Task 2 on access1.aion-cluster.uni.lux ``` + The default argument separator `:::` separate your command from your inputs. Inputs can either be a list separated by spaces, or a range using brace expansions. Place `{}` replacement string where you want your inputs to go inside your command. ## Running jobs with GNU parallel @@ -114,6 +115,12 @@ export -f run_step parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 bash -c "\"run_step ${operations_per_step} ${stress_test_duration}\"" ::: {0..255} ``` +!!! tip "Notice how `srun` works" + + For `parallel` jobs `srun` command plays an important role of starting the parallel program and setting up the environment. Each srun invocation becomes a separate SLURM job step within your overall allocation meaning it will start as many instances of the program as requested with the `--ntasks` option on the CPUs that were allocated for the job. + + + To run jobs successfully, the resources you request from Slurm (#SBATCH directives) must match what your commands (parallel, srun, and your program) actually use. See [Resource Allocation Guidelines](https://hpc-docs.uni.lu/slurm/launchers/#resource-allocation-guidelines) @@ -150,7 +157,7 @@ parallel --colsep '\t' --jobs "$SLURM_NTASKS" --results parallel_logs/ srun -N1 if you want to pass each line as a full command use: ```bash -parallel --jobs "$SLURM_NTASKS" --results parallel_logs/ srun -N1 -n1 {} :::: cmdlist.txt +parallel --jobs "$SLURM_NTASKS" --results parallel_logs/ srun -N1 -n1 {} ::: cmdlist.txt ``` - `{}` is replaced by the entire line (the full command and its arguments). @@ -170,7 +177,7 @@ parallel --joblog run.log --results results/{#}/ --bar --eta srun ... ::: ${TASK To check the actual state of your job and all it's steps you can use `sacct` command. -```bash +``` sacct -j $SLURM_JOBID --format=JobID,JobName,State,ExitCode,Elapsed $ sacct -j 8717582 --format=JobID,JobName,State,ExitCode,Elapsed JobID JobName State ExitCode Elapsed @@ -188,7 +195,8 @@ JobID JobName State ExitCode Elapsed 8717582.7 stress RUNNING 0:0 00:00:51 8717582.8 stress RUNNING 0:0 00:00:51 8717582.9 stress RUNNING 0:0 00:00:51 -8717582.10 stress RUNNING 0:0 00:00:51 +8717582.10 stress RUNNING 0:0 00:00:51 +[...] ``` ## Error Handling and Automatic Retries From fc24a2868500d2788de0dfd5419244181870ffd3 Mon Sep 17 00:00:00 2001 From: codeharris Date: Thu, 28 Aug 2025 16:10:19 +0200 Subject: [PATCH 09/10] Adds resources, change to sample scripts and log collection. 
--- docs/jobs/gnu-parallel.md | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/docs/jobs/gnu-parallel.md b/docs/jobs/gnu-parallel.md index c3b8999bc..8fc0de2cf 100644 --- a/docs/jobs/gnu-parallel.md +++ b/docs/jobs/gnu-parallel.md @@ -41,7 +41,7 @@ To avoid multiple small jobs, we can schedule multiple jobs in a single allocati declare stress_test_duration=160 -parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 stress --cpu 16 --timeout "${stress_test_duration}" ::: {0..255} +parallel --max-procs "${SLURM_NTASKS}" --max-args 0 srun --nodes=1 --ntasks=1 stress-ng --cpu 16 --timeout "${stress_test_duration}" ::: {0..255} ``` The scheduler is much more efficient in lunching job steps within a job, as the resources have been allocated and there is no need to interact with the resource allocation loop. Job steps are lunched in blocking calls within a job whenever a `srun` command is executes in the job. @@ -80,7 +80,7 @@ declare total_operations="${1}" declare test_duration="${2}" declare final_operation=$((${total_operations}-1)) -parallel --max-procs 4 --max-args 0 stress --cpu 4 --timeout "${test_duration}" ::: $(seq 0 "${final_operation}") +parallel --max-procs 4 --max-args 0 stress-ng --cpu 4 --timeout "${test_duration}" ::: $(seq 0 "${final_operation}") ``` When running the job in a function, make sure that the function is exported to the environment of `srun`: @@ -107,7 +107,7 @@ run_step() { local test_duration="${2}" local final_operation=$((${total_operations}-1)) - parallel --max-procs 4 --max-args 0 stress --cpu 4 --timeout "${test_duration}" ::: $(seq 0 "${final_operation}") + parallel --max-procs 4 --max-args 0 stress-ng --cpu 4 --timeout "${test_duration}" ::: $(seq 0 "${final_operation}") } export -f run_step @@ -153,7 +153,9 @@ parallel --colsep '\t' --jobs "$SLURM_NTASKS" --results parallel_logs/ srun -N1 ``` - `{1}` is the program; `{2..}` expands to the remaining columns (its arguments). -- `--colsep ' +'` treats runs of spaces or tabs as column separators. +- `--colsep ' +'` (Column separator) The input will be treated as a table with regexp separating the columns. The n'th column can be accessed using {n} or {n.}. E.g. {3} is the 3rd column. + + if you want to pass each line as a full command use: ```bash @@ -164,15 +166,17 @@ parallel --jobs "$SLURM_NTASKS" --results parallel_logs/ srun -N1 -n1 {} ::: cmd ## Collect Logs and Monitor Progress +GNU Parallel can automatically log task execution and capture per-task output. A single line is enough to both run tasks and collect reports: + ```bash parallel --joblog run.log --results results/{#}/ --bar --eta srun ... ::: ${TASKS} ``` -- `run.log` — records start/finish time, runtime duration, exit status. -- `results/{#}` — create a separate directory per task; stdout/stderr captured automatically. -- `--bar` — live progress bar -- `--eta` — estimated completion time +- `--joblog run.log` — records start time, runtime, and exit status for each task. +- `--results results/{#}` — stores stdout/stderr of each task in its own numbered directory. +- `--bar --eta` - displays a live progress bar and estimated time to completion. +This one-line pattern provides an audit trail (run.log), separate logs per task (results/{#}/), and real-time progress reporting with no extra scripting. To check the actual state of your job and all it's steps you can use `sacct` command. 
@@ -246,6 +250,11 @@ When not to use GNU parallel: --- + + + _Resources_ - [luncher_script_examples.zip](https://github.com/user-attachments/files/21215923/luncher_script_examples.zip) +- [Official Documentation](https://www.gnu.org/software/parallel/parallel.html) +- [NERSC Documentation](https://docs.nersc.gov/jobs/workflow/gnuparallel/) From ccd10592c00f78faae42cbff48bd9bcaf6a7ffc7 Mon Sep 17 00:00:00 2001 From: codeharris Date: Mon, 1 Sep 2025 14:05:04 +0200 Subject: [PATCH 10/10] [FEATURE] Added a page about job steps --- docs/jobs/steps.md | 151 ++++++++++++++++++++++++++++++++++++++++++++- mkdocs.yml | 2 +- 2 files changed, 151 insertions(+), 2 deletions(-) diff --git a/docs/jobs/steps.md b/docs/jobs/steps.md index bca2b28ad..c03ec8ac8 100644 --- a/docs/jobs/steps.md +++ b/docs/jobs/steps.md @@ -1 +1,150 @@ -# Job steps +# Slurm job steps + +A job step is a unit of work launched within a job’s resource allocation. You obtain an allocation with `sbatch` or `salloc`, then create one or more steps with `srun` to execute commands using some or all of the allocated resources. + + +## Job allocation vs. Job step + +- Job allocation: resources reserved for the job (nodes/CPUs/memory/GPUs). + +- Job step: actual processes launched within a job which consume the job resources. The job steps can execute in serial or parallel given that enough resources are available. + +- Multiple steps can run sequentially or concurrently inside the same allocation. + +## How steps are created + +- `sbatch`: submits a non-interactive batch job. The batch script runs in a special batch step on the first node of the allocation. Additional parallel work must be launched via srun. + +- `salloc`: creates an interactive allocation. The user’s shell/command runs in a special interactive step on the first node. Further parallel work is launched via srun inside that allocation. + +- `srun`: inside an allocation, launches a regular job step. You can launch multiple steps, sequentially or in parallel. + +## Step types + +- batch step: created for jobs submitted with sbatch; runs the batch script. + +- interactive step: created for jobs started with salloc, runs the interactive shell/command provided to salloc. + +- extern step: optional step used to account for processes not started by Slurm within the allocation (e.g., daemons, ssh). Presence depends on site configuration (proctrack/cgroup, task plugin, containment settings). + +- regular (srun) steps: created by each srun invocation inside the allocation. + +Identifiers appear as JobID.StepID (e.g., 123456.0, 123456.batch, 123456.interactive, 123456.extern). Regular step IDs are numbered starting at 0 and increment per srun. + +## Why steps matter + +- Resource sharing: Each step consumes a portion of the job’s allocation; concurrent steps can oversubscribe CPUs/GPUs if not sized carefully. + +- Accounting/monitoring: Each step has its own status and resource usage. + +- Placement/binding: Steps define how tasks are distributed and bound on nodes. + +## Always use srun for parallel work + +Commands not launched with srun run only in the batch or interactive step (typically on the first node). Use srun to utilize the full allocation across nodes and tasks. 
## Common srun options for steps
+
+- `--ntasks` (-n), `--nodes` (-N), `--ntasks-per-node`, `--cpus-per-task` (-c)
+- `--gpus`, `--gpus-per-node`, `--gpus-per-task`
+- `--exclusive` (prevent CPU sharing across concurrent steps)
+- `--oversubscribe` (allow sharing)
+- `--hint`, `--cpu-bind`, `--distribution` (placement/binding)
+- `--job-name`, `--output/--error`, `--label` (naming and I/O)
+
+Each step must fit within the job’s allocated resources.
+
+## Monitoring and control
+
+- List job and steps: `squeue -j <jobid> -s`
+- Show job details: `scontrol show job <jobid>`
+- Show step details: `scontrol show step <jobid>.<stepid>`
+- Live step stats: `sstat -j <jobid>.<stepid> --format JobID,MaxRSS,AveCPU`
+- Historical accounting: `sacct -j <jobid> --format JobID,JobName,State,Elapsed,CPUTime,MaxRSS,ReqTRES,AllocTRES`
+- Cancel a step: `scancel <jobid>.<stepid>`
+- Cancel a job: `scancel <jobid>`
+
+```
+$ sacct -j 9457023 --format JobID,JobName,State,Elapsed,CPUTime,MaxRSS,ReqTRES,AllocTRES
+9457023       single_pr+  COMPLETED   00:01:06   09:23:12            billing=7+ billing=7+
+9457023.bat+  batch       COMPLETED   00:01:06   02:20:48   181096K  cpu=128,m+
+9457023.ext+  extern      COMPLETED   00:01:06   09:23:12   0        billing=7+
+9457023.0     stress-ng   COMPLETED   00:01:01   00:16:16   46320K   cpu=16,me+
+9457023.1     stress-ng   COMPLETED   00:01:01   00:16:16   38124K   cpu=16,me+
+9457023.2     stress-ng   COMPLETED   00:01:01   00:16:16   46628K   cpu=16,me+
+9457023.3     stress-ng   COMPLETED   00:01:01   00:16:16   38480K   cpu=16,me+
+9457023.4     stress-ng   COMPLETED   00:01:01   00:16:16   38092K   cpu=16,me+
+9457023.5     stress-ng   COMPLETED   00:01:01   00:16:16   38244K   cpu=16,me+
+9457023.6     stress-ng   COMPLETED   00:01:01   00:16:16   38524K   cpu=16,me+
+9457023.7     stress-ng   COMPLETED   00:01:01   00:16:16   47332K   cpu=16,me+
+9457023.8     stress-ng   COMPLETED   00:01:01   00:16:16   38280K   cpu=16,me+
+9457023.9     stress-ng   COMPLETED   00:01:01   00:16:16   44344K   cpu=16,me+
+9457023.10    stress-ng   COMPLETED   00:01:01   00:16:16   38364K   cpu=16,me+
+[...]
+```
+
+## Step environment variables
+
+Common variables available inside a step:
+
+- SLURM_JOB_ID, SLURM_JOB_NODELIST
+- SLURM_STEP_ID (numeric ID or batch/interactive/extern)
+- SLURM_STEP_NODELIST, SLURM_STEP_NUM_NODES
+- SLURM_NTASKS, SLURM_NTASKS_PER_NODE, SLURM_CPUS_PER_TASK
+
+## Input/output of steps
+
+- By default, step stdout/stderr go to the job’s output/error.
+- Control per-step I/O, e.g.:
+    - srun --output=step_%j.%2t.out --error=step_%j.%2t.err
+    - srun --label (prefix lines by task rank)
+
+Placeholders include %j (jobid), %t (task id), %2t (zero-padded).
+
+## Resource placement and binding
+
+- Distribution: `--nodes`, `--ntasks-per-node`, `--distribution=block|cyclic|...`
+- CPU binding: `--cpu-bind=cores|threads|rank`, `--hint=nomultithread|compute_bound`
+- GPUs: request at job level (e.g., #SBATCH --gpus=4) and size each step with --gpus*, ensuring fit within allocation.
+
+## Best practices
+
+- Launch parallel work with srun; don’t rely on implicit shell execution.
+- Size steps explicitly (tasks, CPUs per task, GPUs).
+- Use --exclusive for concurrent steps that must not share CPUs; use --oversubscribe only when intentional.
+- Name steps for clarity (srun --job-name=...).
+- Use sacct/sstat to collect per-step accounting.
+- For many small tasks, consider srun --multi-prog or job arrays (see the sketch after this list).
+- If you use GNU Parallel inside allocations, wrap commands with srun so tasks run as tracked steps; see the [GNU Parallel](jobs/gnu-parallel/) page for more details.
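+For the `--multi-prog` option mentioned in the best practices, a sketch might look as follows; the configuration file name `multi.conf` and the programs are placeholders:
+
+```bash
+# multi.conf: one line per task rank: <ranks> <program> [arguments]
+# %t expands to the task rank, %o to the offset within a rank range.
+0    ./preprocess input.dat
+1-6  ./worker --part %o
+7    ./collect results/
+```
+
+```bash
+# Launch a single 8-task job step; task i runs the config line matching rank i.
+srun --ntasks=8 --multi-prog multi.conf
+```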
+ +## References + +- https://slurm.schedmd.com/job_launch.html +- https://slurm.schedmd.com/srun.html diff --git a/mkdocs.yml b/mkdocs.yml index b4fba8497..5eae9af68 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,7 +82,7 @@ nav: #- Job dependencies: 'jobs/dependencies.md' - Affinity and pinning: 'jobs/affinity_and_pinning.md' - Inspecting node architecture: 'jobs/hwloc.md' - #- Job steps: 'jobs/steps.md' + - Job steps: 'jobs/steps.md' - Account Hierarchy: 'slurm/accounts.md' - Job State and Reason Code: 'jobs/reason-codes.md' - Fairsharing: 'slurm/fairsharing.md'