From 220270e36053cba596c850c41a4686192f28894c Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Tue, 21 Oct 2025 21:30:36 +0530 Subject: [PATCH 01/36] Create real_time_encoder_transformer.py Created a real-time encoder only transformer model with Time2Vec as positional encoding along with generalised classifier layer for modelling realtime data like EEG. --- .../real_time_encoder_transformer.py | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 neural_network/real_time_encoder_transformer.py diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py new file mode 100644 index 000000000000..02a7591c6fed --- /dev/null +++ b/neural_network/real_time_encoder_transformer.py @@ -0,0 +1,185 @@ +#imports +import torch +import torch.nn as nn +import math +#Time2Vec layer for positional encoding of real-time data like EEG +class Time2Vec(nn.Module): + #Encodes time steps into a continuous embedding space so to help the transformer learn temporal dependencies. + def __init__(self, d_model): + super().__init__() + self.w0 = nn.Parameter(torch.randn(1, 1)) + self.b0 = nn.Parameter(torch.randn(1, 1)) + self.w = nn.Parameter(torch.randn(1, d_model - 1)) + self.b = nn.Parameter(torch.randn(1, d_model - 1)) + + def forward(self, t): + linear = self.w0 * t + self.b0 + periodic = torch.sin(self.w * t + self.b) + return torch.cat([linear, periodic], dim=-1) + +#positionwise feedforward network +class PositionwiseFeedForward(nn.Module): + def __init__(self, d_model, hidden, drop_prob=0.1): + super().__init__() + self.fc1 = nn.Linear(d_model, hidden) + self.fc2 = nn.Linear(hidden, d_model) + self.relu = nn.ReLU() + self.dropout = nn.Dropout(drop_prob) + + def forward(self, x): + x = self.fc1(x) + x = self.relu(x) + x = self.dropout(x) + return self.fc2(x) +#scaled dot product attention +class ScaleDotProductAttention(nn.Module): + def __init__(self): + super().__init__() + self.softmax = nn.Softmax(dim=-1) + + def forward(self, q, k, v, mask=None): + _, _, _, d_k = k.size() + scores = (q @ k.transpose(2, 3)) / math.sqrt(d_k) + + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e9) + + attn = self.softmax(scores) + context = attn @ v + return context, attn +#multi head attention +class MultiHeadAttention(nn.Module): + def __init__(self, d_model, n_head): + super().__init__() + self.n_head = n_head + self.attn = ScaleDotProductAttention() + self.w_q = nn.Linear(d_model, d_model) + self.w_k = nn.Linear(d_model, d_model) + self.w_v = nn.Linear(d_model, d_model) + self.w_out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + q, k, v = self.w_q(q), self.w_k(k), self.w_v(v) + q, k, v = self.split_heads(q), self.split_heads(k), self.split_heads(v) + + context, _ = self.attn(q, k, v, mask) + out = self.w_out(self.concat_heads(context)) + return out + + def split_heads(self, x): + batch, seq_len, d_model = x.size() + d_k = d_model // self.n_head + return x.view(batch, seq_len, self.n_head, d_k).transpose(1, 2) + + def concat_heads(self, x): + batch, n_head, seq_len, d_k = x.size() + return x.transpose(1, 2).contiguous().view(batch, seq_len, n_head * d_k) + +#Layer normalization +class LayerNorm(nn.Module): + def __init__(self, d_model, eps=1e-12): + super().__init__() + self.gamma = nn.Parameter(torch.ones(d_model)) + self.beta = nn.Parameter(torch.zeros(d_model)) + self.eps = eps + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + var = x.var(-1, unbiased=False, keepdim=True) + return self.gamma 
* (x - mean) / torch.sqrt(var + self.eps) + self.beta + +#transformer encoder layer +class TransformerEncoderLayer(nn.Module): + def __init__(self, d_model, n_head, hidden_dim, drop_prob=0.1): + super().__init__() + self.self_attn = MultiHeadAttention(d_model, n_head) + self.ffn = PositionwiseFeedForward(d_model, hidden_dim, drop_prob) + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.dropout = nn.Dropout(drop_prob) + + def forward(self, x, mask=None): + attn_out = self.self_attn(x, x, x, mask) + x = self.norm1(x + self.dropout(attn_out)) + ffn_out = self.ffn(x) + x = self.norm2(x + self.dropout(ffn_out)) + + return x + +#encoder stack +class TransformerEncoder(nn.Module): + def __init__(self, d_model, n_head, hidden_dim, num_layers, drop_prob=0.1): + super().__init__() + self.layers = nn.ModuleList([ + TransformerEncoderLayer(d_model, n_head, hidden_dim, drop_prob) + for _ in range(num_layers) + ]) + + def forward(self, x, mask=None): + for layer in self.layers: + x = layer(x, mask) + return x + + +#attention pooling layer +class AttentionPooling(nn.Module): + def __init__(self, d_model): + super().__init__() + self.attn_score = nn.Linear(d_model, 1) + + def forward(self, x, mask=None): + attn_weights = torch.softmax(self.attn_score(x).squeeze(-1), dim=-1) + + if mask is not None: + attn_weights = attn_weights.masked_fill(mask == 0, 0) + attn_weights = attn_weights / (attn_weights.sum(dim=1, keepdim=True) + 1e-8) + + pooled = torch.bmm(attn_weights.unsqueeze(1), x).squeeze(1) + return pooled, attn_weights + +# transformer model + +class EEGTransformer(nn.Module): + + def __init__(self, feature_dim, d_model=128, n_head=8, hidden_dim=512, + num_layers=4, drop_prob=0.1, output_dim=1, task_type='regression'): + super().__init__() + self.task_type = task_type + self.input_proj = nn.Linear(feature_dim, d_model) + + # Time encoding for temporal understanding + self.time2vec = Time2Vec(d_model) + + # Transformer encoder for sequence modeling + self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, drop_prob) + + # Attention pooling to summarize time dimension + self.pooling = AttentionPooling(d_model) + + # Final output layer + self.output_layer = nn.Linear(d_model, output_dim) + + def forward(self, x, mask=None): + + b, t, _ = x.size() + + # Create time indices and embed them + t_idx = torch.arange(t, device=x.device).view(1, t, 1).expand(b, t, 1).float() + time_emb = self.time2vec(t_idx) + + # Add time embedding to feature projection + x = self.input_proj(x) + time_emb + + # Pass through the Transformer encoder + x = self.encoder(x, mask) + + # Aggregate features across time with attention + pooled, attn_weights = self.pooling(x, mask) + + # Final output (regression or classification) + out = self.output_layer(pooled) + + if self.task_type == 'classification': + out = torch.softmax(out, dim=-1) + + return out, attn_weights From 23c5117971eec03def6a05ff077ebbcb0c839077 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:06:07 +0000 Subject: [PATCH 02/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../real_time_encoder_transformer.py | 71 ++++++++++++------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 02a7591c6fed..9e459fe8a1d8 100644 --- a/neural_network/real_time_encoder_transformer.py +++ 
b/neural_network/real_time_encoder_transformer.py @@ -1,10 +1,12 @@ -#imports +# imports import torch import torch.nn as nn import math -#Time2Vec layer for positional encoding of real-time data like EEG + + +# Time2Vec layer for positional encoding of real-time data like EEG class Time2Vec(nn.Module): - #Encodes time steps into a continuous embedding space so to help the transformer learn temporal dependencies. + # Encodes time steps into a continuous embedding space so to help the transformer learn temporal dependencies. def __init__(self, d_model): super().__init__() self.w0 = nn.Parameter(torch.randn(1, 1)) @@ -13,11 +15,12 @@ def __init__(self, d_model): self.b = nn.Parameter(torch.randn(1, d_model - 1)) def forward(self, t): - linear = self.w0 * t + self.b0 - periodic = torch.sin(self.w * t + self.b) - return torch.cat([linear, periodic], dim=-1) - -#positionwise feedforward network + linear = self.w0 * t + self.b0 + periodic = torch.sin(self.w * t + self.b) + return torch.cat([linear, periodic], dim=-1) + + +# positionwise feedforward network class PositionwiseFeedForward(nn.Module): def __init__(self, d_model, hidden, drop_prob=0.1): super().__init__() @@ -31,7 +34,9 @@ def forward(self, x): x = self.relu(x) x = self.dropout(x) return self.fc2(x) -#scaled dot product attention + + +# scaled dot product attention class ScaleDotProductAttention(nn.Module): def __init__(self): super().__init__() @@ -47,7 +52,9 @@ def forward(self, q, k, v, mask=None): attn = self.softmax(scores) context = attn @ v return context, attn -#multi head attention + + +# multi head attention class MultiHeadAttention(nn.Module): def __init__(self, d_model, n_head): super().__init__() @@ -75,7 +82,8 @@ def concat_heads(self, x): batch, n_head, seq_len, d_k = x.size() return x.transpose(1, 2).contiguous().view(batch, seq_len, n_head * d_k) -#Layer normalization + +# Layer normalization class LayerNorm(nn.Module): def __init__(self, d_model, eps=1e-12): super().__init__() @@ -88,7 +96,8 @@ def forward(self, x): var = x.var(-1, unbiased=False, keepdim=True) return self.gamma * (x - mean) / torch.sqrt(var + self.eps) + self.beta -#transformer encoder layer + +# transformer encoder layer class TransformerEncoderLayer(nn.Module): def __init__(self, d_model, n_head, hidden_dim, drop_prob=0.1): super().__init__() @@ -106,14 +115,17 @@ def forward(self, x, mask=None): return x -#encoder stack + +# encoder stack class TransformerEncoder(nn.Module): def __init__(self, d_model, n_head, hidden_dim, num_layers, drop_prob=0.1): super().__init__() - self.layers = nn.ModuleList([ - TransformerEncoderLayer(d_model, n_head, hidden_dim, drop_prob) - for _ in range(num_layers) - ]) + self.layers = nn.ModuleList( + [ + TransformerEncoderLayer(d_model, n_head, hidden_dim, drop_prob) + for _ in range(num_layers) + ] + ) def forward(self, x, mask=None): for layer in self.layers: @@ -121,7 +133,7 @@ def forward(self, x, mask=None): return x -#attention pooling layer +# attention pooling layer class AttentionPooling(nn.Module): def __init__(self, d_model): super().__init__() @@ -137,12 +149,22 @@ def forward(self, x, mask=None): pooled = torch.bmm(attn_weights.unsqueeze(1), x).squeeze(1) return pooled, attn_weights + # transformer model -class EEGTransformer(nn.Module): - def __init__(self, feature_dim, d_model=128, n_head=8, hidden_dim=512, - num_layers=4, drop_prob=0.1, output_dim=1, task_type='regression'): +class EEGTransformer(nn.Module): + def __init__( + self, + feature_dim, + d_model=128, + n_head=8, + hidden_dim=512, + 
num_layers=4, + drop_prob=0.1, + output_dim=1, + task_type="regression", + ): super().__init__() self.task_type = task_type self.input_proj = nn.Linear(feature_dim, d_model) @@ -151,7 +173,9 @@ def __init__(self, feature_dim, d_model=128, n_head=8, hidden_dim=512, self.time2vec = Time2Vec(d_model) # Transformer encoder for sequence modeling - self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, drop_prob) + self.encoder = TransformerEncoder( + d_model, n_head, hidden_dim, num_layers, drop_prob + ) # Attention pooling to summarize time dimension self.pooling = AttentionPooling(d_model) @@ -160,7 +184,6 @@ def __init__(self, feature_dim, d_model=128, n_head=8, hidden_dim=512, self.output_layer = nn.Linear(d_model, output_dim) def forward(self, x, mask=None): - b, t, _ = x.size() # Create time indices and embed them @@ -179,7 +202,7 @@ def forward(self, x, mask=None): # Final output (regression or classification) out = self.output_layer(pooled) - if self.task_type == 'classification': + if self.task_type == "classification": out = torch.softmax(out, dim=-1) return out, attn_weights From c96d440c6d53fa470d949ab5211b65ce9bf2b032 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Tue, 21 Oct 2025 21:44:12 +0530 Subject: [PATCH 03/36] Update real_time_encoder_transformer.py --- neural_network/real_time_encoder_transformer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 9e459fe8a1d8..69cf48e23bb5 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,12 +1,12 @@ # imports -import torch -import torch.nn as nn import math +import torch +from torch import nn # Time2Vec layer for positional encoding of real-time data like EEG class Time2Vec(nn.Module): - # Encodes time steps into a continuous embedding space so to help the transformer learn temporal dependencies. 
+ # Encodes time steps into a continuous embedding space def __init__(self, d_model): super().__init__() self.w0 = nn.Parameter(torch.randn(1, 1)) @@ -174,8 +174,12 @@ def __init__( # Transformer encoder for sequence modeling self.encoder = TransformerEncoder( - d_model, n_head, hidden_dim, num_layers, drop_prob - ) + d_model, + n_head, + hidden_dim, + num_layers, + drop_prob +) # Attention pooling to summarize time dimension self.pooling = AttentionPooling(d_model) From 4a62b5778a0a5e6783b847a109df1cc3d595736e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:14:33 +0000 Subject: [PATCH 04/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_network/real_time_encoder_transformer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 69cf48e23bb5..a004e45ca771 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -6,7 +6,7 @@ # Time2Vec layer for positional encoding of real-time data like EEG class Time2Vec(nn.Module): - # Encodes time steps into a continuous embedding space + # Encodes time steps into a continuous embedding space def __init__(self, d_model): super().__init__() self.w0 = nn.Parameter(torch.randn(1, 1)) @@ -174,12 +174,8 @@ def __init__( # Transformer encoder for sequence modeling self.encoder = TransformerEncoder( - d_model, - n_head, - hidden_dim, - num_layers, - drop_prob -) + d_model, n_head, hidden_dim, num_layers, drop_prob + ) # Attention pooling to summarize time dimension self.pooling = AttentionPooling(d_model) From 1eca445397c90582ca8074c4950d8eb1c0192fae Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Tue, 21 Oct 2025 21:49:50 +0530 Subject: [PATCH 05/36] Update real_time_encoder_transformer.py --- .../real_time_encoder_transformer.py | 223 ++++++++++++------ 1 file changed, 149 insertions(+), 74 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index a004e45ca771..63d81295c58f 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -4,45 +4,73 @@ from torch import nn -# Time2Vec layer for positional encoding of real-time data like EEG class Time2Vec(nn.Module): - # Encodes time steps into a continuous embedding space - def __init__(self, d_model): + """ + Time2Vec layer for positional encoding of real-time data like EEG. + + >>> import torch + >>> layer = Time2Vec(4) + >>> t = torch.ones(1, 3, 1) + >>> output = layer.forward(t) + >>> output.shape + torch.Size([1, 3, 4]) + """ + def __init__(self, d_model: int) -> None: super().__init__() self.w0 = nn.Parameter(torch.randn(1, 1)) self.b0 = nn.Parameter(torch.randn(1, 1)) self.w = nn.Parameter(torch.randn(1, d_model - 1)) self.b = nn.Parameter(torch.randn(1, d_model - 1)) - def forward(self, t): - linear = self.w0 * t + self.b0 - periodic = torch.sin(self.w * t + self.b) + def forward(self, time_steps: Tensor) -> Tensor: + linear = self.w0 * time_steps + self.b0 + periodic = torch.sin(self.w * time_steps + self.b) return torch.cat([linear, periodic], dim=-1) -# positionwise feedforward network class PositionwiseFeedForward(nn.Module): - def __init__(self, d_model, hidden, drop_prob=0.1): + """ + Positionwise feedforward network. 
+ + >>> import torch + >>> layer = PositionwiseFeedForward(8, 16) + >>> x = torch.rand(4, 10, 8) + >>> out = layer.forward(x) + >>> out.shape + torch.Size([4, 10, 8]) + """ + def __init__(self, d_model: int, hidden: int, drop_prob: float = 0.1) -> None: super().__init__() self.fc1 = nn.Linear(d_model, hidden) self.fc2 = nn.Linear(hidden, d_model) self.relu = nn.ReLU() self.dropout = nn.Dropout(drop_prob) - def forward(self, x): - x = self.fc1(x) + def forward(self, input_tensor: Tensor) -> Tensor: + x = self.fc1(input_tensor) x = self.relu(x) x = self.dropout(x) return self.fc2(x) -# scaled dot product attention class ScaleDotProductAttention(nn.Module): - def __init__(self): + """ + Scaled dot product attention. + + >>> import torch + >>> attn = ScaleDotProductAttention() + >>> q = torch.rand(2, 8, 10, 16) + >>> k = torch.rand(2, 8, 10, 16) + >>> v = torch.rand(2, 8, 10, 16) + >>> ctx, attn_w = attn.forward(q, k, v) + >>> ctx.shape + torch.Size([2, 8, 10, 16]) + """ + def __init__(self) -> None: super().__init__() self.softmax = nn.Softmax(dim=-1) - def forward(self, q, k, v, mask=None): + def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: _, _, _, d_k = k.size() scores = (q @ k.transpose(2, 3)) / math.sqrt(d_k) @@ -54,9 +82,18 @@ def forward(self, q, k, v, mask=None): return context, attn -# multi head attention class MultiHeadAttention(nn.Module): - def __init__(self, d_model, n_head): + """ + Multi-head attention. + + >>> import torch + >>> attn = MultiHeadAttention(16, 4) + >>> q = torch.rand(2, 10, 16) + >>> out = attn.forward(q, q, q) + >>> out.shape + torch.Size([2, 10, 16]) + """ + def __init__(self, d_model: int, n_head: int) -> None: super().__init__() self.n_head = n_head self.attn = ScaleDotProductAttention() @@ -65,7 +102,7 @@ def __init__(self, d_model, n_head): self.w_v = nn.Linear(d_model, d_model) self.w_out = nn.Linear(d_model, d_model) - def forward(self, q, k, v, mask=None): + def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor = None) -> Tensor: q, k, v = self.w_q(q), self.w_k(k), self.w_v(v) q, k, v = self.split_heads(q), self.split_heads(k), self.split_heads(v) @@ -73,33 +110,57 @@ def forward(self, q, k, v, mask=None): out = self.w_out(self.concat_heads(context)) return out - def split_heads(self, x): + def split_heads(self, x: Tensor) -> Tensor: batch, seq_len, d_model = x.size() d_k = d_model // self.n_head return x.view(batch, seq_len, self.n_head, d_k).transpose(1, 2) - def concat_heads(self, x): + def concat_heads(self, x: Tensor) -> Tensor: batch, n_head, seq_len, d_k = x.size() return x.transpose(1, 2).contiguous().view(batch, seq_len, n_head * d_k) -# Layer normalization class LayerNorm(nn.Module): - def __init__(self, d_model, eps=1e-12): + """ + Layer normalization. 
+ + >>> import torch + >>> ln = LayerNorm(8) + >>> x = torch.rand(4, 10, 8) + >>> out = ln.forward(x) + >>> out.shape + torch.Size([4, 10, 8]) + """ + def __init__(self, d_model: int, eps: float = 1e-12) -> None: super().__init__() self.gamma = nn.Parameter(torch.ones(d_model)) self.beta = nn.Parameter(torch.zeros(d_model)) self.eps = eps - def forward(self, x): - mean = x.mean(-1, keepdim=True) - var = x.var(-1, unbiased=False, keepdim=True) - return self.gamma * (x - mean) / torch.sqrt(var + self.eps) + self.beta + def forward(self, input_tensor: Tensor) -> Tensor: + mean = input_tensor.mean(-1, keepdim=True) + var = input_tensor.var(-1, unbiased=False, keepdim=True) + return self.gamma * (input_tensor - mean) / torch.sqrt(var + self.eps) + self.beta -# transformer encoder layer class TransformerEncoderLayer(nn.Module): - def __init__(self, d_model, n_head, hidden_dim, drop_prob=0.1): + """ + Transformer encoder layer. + + >>> import torch + >>> layer = TransformerEncoderLayer(8, 2, 16) + >>> x = torch.rand(4, 10, 8) + >>> out = layer.forward(x) + >>> out.shape + torch.Size([4, 10, 8]) + """ + def __init__( + self, + d_model: int, + n_head: int, + hidden_dim: int, + drop_prob: float = 0.1, + ) -> None: super().__init__() self.self_attn = MultiHeadAttention(d_model, n_head) self.ffn = PositionwiseFeedForward(d_model, hidden_dim, drop_prob) @@ -107,18 +168,33 @@ def __init__(self, d_model, n_head, hidden_dim, drop_prob=0.1): self.norm2 = LayerNorm(d_model) self.dropout = nn.Dropout(drop_prob) - def forward(self, x, mask=None): - attn_out = self.self_attn(x, x, x, mask) - x = self.norm1(x + self.dropout(attn_out)) + def forward(self, input_tensor: Tensor, mask: Tensor = None) -> Tensor: + attn_out = self.self_attn(input_tensor, input_tensor, input_tensor, mask) + x = self.norm1(input_tensor + self.dropout(attn_out)) ffn_out = self.ffn(x) x = self.norm2(x + self.dropout(ffn_out)) - return x -# encoder stack class TransformerEncoder(nn.Module): - def __init__(self, d_model, n_head, hidden_dim, num_layers, drop_prob=0.1): + """ + Encoder stack. + + >>> import torch + >>> enc = TransformerEncoder(8, 2, 16, 2) + >>> x = torch.rand(4, 10, 8) + >>> out = enc.forward(x) + >>> out.shape + torch.Size([4, 10, 8]) + """ + def __init__( + self, + d_model: int, + n_head: int, + hidden_dim: int, + num_layers: int, + drop_prob: float = 0.1, + ) -> None: super().__init__() self.layers = nn.ModuleList( [ @@ -127,82 +203,81 @@ def __init__(self, d_model, n_head, hidden_dim, num_layers, drop_prob=0.1): ] ) - def forward(self, x, mask=None): + def forward(self, input_tensor: Tensor, mask: Tensor = None) -> Tensor: + x = input_tensor for layer in self.layers: x = layer(x, mask) return x -# attention pooling layer class AttentionPooling(nn.Module): - def __init__(self, d_model): + """ + Attention pooling layer. 
+ + >>> import torch + >>> pooling = AttentionPooling(8) + >>> x = torch.rand(4, 10, 8) + >>> pooled, weights = pooling.forward(x) + >>> pooled.shape + torch.Size([4, 8]) + >>> weights.shape + torch.Size([4, 10]) + """ + def __init__(self, d_model: int) -> None: super().__init__() self.attn_score = nn.Linear(d_model, 1) - def forward(self, x, mask=None): - attn_weights = torch.softmax(self.attn_score(x).squeeze(-1), dim=-1) + def forward(self, input_tensor: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: + attn_weights = torch.softmax(self.attn_score(input_tensor).squeeze(-1), dim=-1) if mask is not None: attn_weights = attn_weights.masked_fill(mask == 0, 0) attn_weights = attn_weights / (attn_weights.sum(dim=1, keepdim=True) + 1e-8) - pooled = torch.bmm(attn_weights.unsqueeze(1), x).squeeze(1) + pooled = torch.bmm(attn_weights.unsqueeze(1), input_tensor).squeeze(1) return pooled, attn_weights -# transformer model - - class EEGTransformer(nn.Module): + """ + EEG Transformer model. + + >>> import torch + >>> model = EEGTransformer(feature_dim=8) + >>> x = torch.rand(2, 10, 8) + >>> out, attn_w = model.forward(x) + >>> out.shape + torch.Size([2, 1]) + """ def __init__( self, - feature_dim, - d_model=128, - n_head=8, - hidden_dim=512, - num_layers=4, - drop_prob=0.1, - output_dim=1, - task_type="regression", - ): + feature_dim: int, + d_model: int = 128, + n_head: int = 8, + hidden_dim: int = 512, + num_layers: int = 4, + drop_prob: float = 0.1, + output_dim: int = 1, + task_type: str = "regression", + ) -> None: super().__init__() self.task_type = task_type self.input_proj = nn.Linear(feature_dim, d_model) - - # Time encoding for temporal understanding self.time2vec = Time2Vec(d_model) - - # Transformer encoder for sequence modeling self.encoder = TransformerEncoder( d_model, n_head, hidden_dim, num_layers, drop_prob ) - - # Attention pooling to summarize time dimension self.pooling = AttentionPooling(d_model) - - # Final output layer self.output_layer = nn.Linear(d_model, output_dim) - def forward(self, x, mask=None): - b, t, _ = x.size() - - # Create time indices and embed them - t_idx = torch.arange(t, device=x.device).view(1, t, 1).expand(b, t, 1).float() + def forward(self, input_tensor: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: + b, t, _ = input_tensor.size() + t_idx = torch.arange(t, device=input_tensor.device).view(1, t, 1).expand(b, t, 1).float() time_emb = self.time2vec(t_idx) - - # Add time embedding to feature projection - x = self.input_proj(x) + time_emb - - # Pass through the Transformer encoder + x = self.input_proj(input_tensor) + time_emb x = self.encoder(x, mask) - - # Aggregate features across time with attention pooled, attn_weights = self.pooling(x, mask) - - # Final output (regression or classification) out = self.output_layer(pooled) - if self.task_type == "classification": out = torch.softmax(out, dim=-1) - return out, attn_weights From 47ba945c8c86f50e51f113714cb6a8d302871ac4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:20:18 +0000 Subject: [PATCH 06/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../real_time_encoder_transformer.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 63d81295c58f..ec906102cf3a 100644 --- 
a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -15,6 +15,7 @@ class Time2Vec(nn.Module): >>> output.shape torch.Size([1, 3, 4]) """ + def __init__(self, d_model: int) -> None: super().__init__() self.w0 = nn.Parameter(torch.randn(1, 1)) @@ -39,6 +40,7 @@ class PositionwiseFeedForward(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ + def __init__(self, d_model: int, hidden: int, drop_prob: float = 0.1) -> None: super().__init__() self.fc1 = nn.Linear(d_model, hidden) @@ -66,11 +68,14 @@ class ScaleDotProductAttention(nn.Module): >>> ctx.shape torch.Size([2, 8, 10, 16]) """ + def __init__(self) -> None: super().__init__() self.softmax = nn.Softmax(dim=-1) - def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: + def forward( + self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor = None + ) -> tuple[Tensor, Tensor]: _, _, _, d_k = k.size() scores = (q @ k.transpose(2, 3)) / math.sqrt(d_k) @@ -93,6 +98,7 @@ class MultiHeadAttention(nn.Module): >>> out.shape torch.Size([2, 10, 16]) """ + def __init__(self, d_model: int, n_head: int) -> None: super().__init__() self.n_head = n_head @@ -131,6 +137,7 @@ class LayerNorm(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ + def __init__(self, d_model: int, eps: float = 1e-12) -> None: super().__init__() self.gamma = nn.Parameter(torch.ones(d_model)) @@ -140,7 +147,9 @@ def __init__(self, d_model: int, eps: float = 1e-12) -> None: def forward(self, input_tensor: Tensor) -> Tensor: mean = input_tensor.mean(-1, keepdim=True) var = input_tensor.var(-1, unbiased=False, keepdim=True) - return self.gamma * (input_tensor - mean) / torch.sqrt(var + self.eps) + self.beta + return ( + self.gamma * (input_tensor - mean) / torch.sqrt(var + self.eps) + self.beta + ) class TransformerEncoderLayer(nn.Module): @@ -154,6 +163,7 @@ class TransformerEncoderLayer(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ + def __init__( self, d_model: int, @@ -187,6 +197,7 @@ class TransformerEncoder(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ + def __init__( self, d_model: int, @@ -223,11 +234,14 @@ class AttentionPooling(nn.Module): >>> weights.shape torch.Size([4, 10]) """ + def __init__(self, d_model: int) -> None: super().__init__() self.attn_score = nn.Linear(d_model, 1) - def forward(self, input_tensor: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: + def forward( + self, input_tensor: Tensor, mask: Tensor = None + ) -> tuple[Tensor, Tensor]: attn_weights = torch.softmax(self.attn_score(input_tensor).squeeze(-1), dim=-1) if mask is not None: @@ -249,6 +263,7 @@ class EEGTransformer(nn.Module): >>> out.shape torch.Size([2, 1]) """ + def __init__( self, feature_dim: int, @@ -270,9 +285,16 @@ def __init__( self.pooling = AttentionPooling(d_model) self.output_layer = nn.Linear(d_model, output_dim) - def forward(self, input_tensor: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: + def forward( + self, input_tensor: Tensor, mask: Tensor = None + ) -> tuple[Tensor, Tensor]: b, t, _ = input_tensor.size() - t_idx = torch.arange(t, device=input_tensor.device).view(1, t, 1).expand(b, t, 1).float() + t_idx = ( + torch.arange(t, device=input_tensor.device) + .view(1, t, 1) + .expand(b, t, 1) + .float() + ) time_emb = self.time2vec(t_idx) x = self.input_proj(input_tensor) + time_emb x = self.encoder(x, mask) From 0974fee722b5e9a7826954a3f5c44573cb935190 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Tue, 21 Oct 2025 21:55:07 +0530 Subject: [PATCH 
07/36] Update real_time_encoder_transformer.py --- neural_network/real_time_encoder_transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index ec906102cf3a..421d23a3e520 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,7 +1,8 @@ # imports import math + import torch -from torch import nn +from torch import nn, Tensor class Time2Vec(nn.Module): From d3a8f47ced98c36cde82caa58af28d6b98df3694 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Tue, 21 Oct 2025 21:59:08 +0530 Subject: [PATCH 08/36] Update real_time_encoder_transformer.py --- .../real_time_encoder_transformer.py | 85 +++++++++---------- 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 421d23a3e520..4a5ddc14b33d 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,4 +1,3 @@ -# imports import math import torch @@ -16,7 +15,6 @@ class Time2Vec(nn.Module): >>> output.shape torch.Size([1, 3, 4]) """ - def __init__(self, d_model: int) -> None: super().__init__() self.w0 = nn.Parameter(torch.randn(1, 1)) @@ -41,7 +39,6 @@ class PositionwiseFeedForward(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ - def __init__(self, d_model: int, hidden: int, drop_prob: float = 0.1) -> None: super().__init__() self.fc1 = nn.Linear(d_model, hidden) @@ -62,29 +59,32 @@ class ScaleDotProductAttention(nn.Module): >>> import torch >>> attn = ScaleDotProductAttention() - >>> q = torch.rand(2, 8, 10, 16) - >>> k = torch.rand(2, 8, 10, 16) - >>> v = torch.rand(2, 8, 10, 16) - >>> ctx, attn_w = attn.forward(q, k, v) + >>> query_tensor = torch.rand(2, 8, 10, 16) + >>> key_tensor = torch.rand(2, 8, 10, 16) + >>> value_tensor = torch.rand(2, 8, 10, 16) + >>> ctx, attn_w = attn.forward(query_tensor, key_tensor, value_tensor) >>> ctx.shape torch.Size([2, 8, 10, 16]) """ - def __init__(self) -> None: super().__init__() self.softmax = nn.Softmax(dim=-1) def forward( - self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor = None + self, + query_tensor: Tensor, + key_tensor: Tensor, + value_tensor: Tensor, + mask: Tensor = None, ) -> tuple[Tensor, Tensor]: - _, _, _, d_k = k.size() - scores = (q @ k.transpose(2, 3)) / math.sqrt(d_k) + _, _, _, d_k = key_tensor.size() + scores = (query_tensor @ key_tensor.transpose(2, 3)) / math.sqrt(d_k) if mask is not None: scores = scores.masked_fill(mask == 0, -1e9) attn = self.softmax(scores) - context = attn @ v + context = attn @ value_tensor return context, attn @@ -94,12 +94,11 @@ class MultiHeadAttention(nn.Module): >>> import torch >>> attn = MultiHeadAttention(16, 4) - >>> q = torch.rand(2, 10, 16) - >>> out = attn.forward(q, q, q) + >>> query_tensor = torch.rand(2, 10, 16) + >>> out = attn.forward(query_tensor, query_tensor, query_tensor) >>> out.shape torch.Size([2, 10, 16]) """ - def __init__(self, d_model: int, n_head: int) -> None: super().__init__() self.n_head = n_head @@ -109,22 +108,34 @@ def __init__(self, d_model: int, n_head: int) -> None: self.w_v = nn.Linear(d_model, d_model) self.w_out = nn.Linear(d_model, d_model) - def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor = None) -> Tensor: - q, k, v = self.w_q(q), self.w_k(k), self.w_v(v) - q, k, v = self.split_heads(q), self.split_heads(k), self.split_heads(v) + def 
forward( + self, + query_tensor: Tensor, + key_tensor: Tensor, + value_tensor: Tensor, + mask: Tensor = None, + ) -> Tensor: + query_tensor, key_tensor, value_tensor = ( + self.w_q(query_tensor), + self.w_k(key_tensor), + self.w_v(value_tensor), + ) + query_tensor = self.split_heads(query_tensor) + key_tensor = self.split_heads(key_tensor) + value_tensor = self.split_heads(value_tensor) - context, _ = self.attn(q, k, v, mask) + context, _ = self.attn(query_tensor, key_tensor, value_tensor, mask) out = self.w_out(self.concat_heads(context)) return out - def split_heads(self, x: Tensor) -> Tensor: - batch, seq_len, d_model = x.size() + def split_heads(self, input_tensor: Tensor) -> Tensor: + batch, seq_len, d_model = input_tensor.size() d_k = d_model // self.n_head - return x.view(batch, seq_len, self.n_head, d_k).transpose(1, 2) + return input_tensor.view(batch, seq_len, self.n_head, d_k).transpose(1, 2) - def concat_heads(self, x: Tensor) -> Tensor: - batch, n_head, seq_len, d_k = x.size() - return x.transpose(1, 2).contiguous().view(batch, seq_len, n_head * d_k) + def concat_heads(self, input_tensor: Tensor) -> Tensor: + batch, n_head, seq_len, d_k = input_tensor.size() + return input_tensor.transpose(1, 2).contiguous().view(batch, seq_len, n_head * d_k) class LayerNorm(nn.Module): @@ -138,7 +149,6 @@ class LayerNorm(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ - def __init__(self, d_model: int, eps: float = 1e-12) -> None: super().__init__() self.gamma = nn.Parameter(torch.ones(d_model)) @@ -148,9 +158,7 @@ def __init__(self, d_model: int, eps: float = 1e-12) -> None: def forward(self, input_tensor: Tensor) -> Tensor: mean = input_tensor.mean(-1, keepdim=True) var = input_tensor.var(-1, unbiased=False, keepdim=True) - return ( - self.gamma * (input_tensor - mean) / torch.sqrt(var + self.eps) + self.beta - ) + return self.gamma * (input_tensor - mean) / torch.sqrt(var + self.eps) + self.beta class TransformerEncoderLayer(nn.Module): @@ -164,7 +172,6 @@ class TransformerEncoderLayer(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ - def __init__( self, d_model: int, @@ -198,7 +205,6 @@ class TransformerEncoder(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ - def __init__( self, d_model: int, @@ -235,14 +241,11 @@ class AttentionPooling(nn.Module): >>> weights.shape torch.Size([4, 10]) """ - def __init__(self, d_model: int) -> None: super().__init__() self.attn_score = nn.Linear(d_model, 1) - def forward( - self, input_tensor: Tensor, mask: Tensor = None - ) -> tuple[Tensor, Tensor]: + def forward(self, input_tensor: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: attn_weights = torch.softmax(self.attn_score(input_tensor).squeeze(-1), dim=-1) if mask is not None: @@ -264,7 +267,6 @@ class EEGTransformer(nn.Module): >>> out.shape torch.Size([2, 1]) """ - def __init__( self, feature_dim: int, @@ -286,16 +288,9 @@ def __init__( self.pooling = AttentionPooling(d_model) self.output_layer = nn.Linear(d_model, output_dim) - def forward( - self, input_tensor: Tensor, mask: Tensor = None - ) -> tuple[Tensor, Tensor]: + def forward(self, input_tensor: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: b, t, _ = input_tensor.size() - t_idx = ( - torch.arange(t, device=input_tensor.device) - .view(1, t, 1) - .expand(b, t, 1) - .float() - ) + t_idx = torch.arange(t, device=input_tensor.device).view(1, t, 1).expand(b, t, 1).float() time_emb = self.time2vec(t_idx) x = self.input_proj(input_tensor) + time_emb x = self.encoder(x, mask) From 
d30966c0d1a4d88b811ddb71dd098d756f994f06 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:29:35 +0000 Subject: [PATCH 09/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../real_time_encoder_transformer.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 4a5ddc14b33d..f4aa957a78b8 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -15,6 +15,7 @@ class Time2Vec(nn.Module): >>> output.shape torch.Size([1, 3, 4]) """ + def __init__(self, d_model: int) -> None: super().__init__() self.w0 = nn.Parameter(torch.randn(1, 1)) @@ -39,6 +40,7 @@ class PositionwiseFeedForward(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ + def __init__(self, d_model: int, hidden: int, drop_prob: float = 0.1) -> None: super().__init__() self.fc1 = nn.Linear(d_model, hidden) @@ -66,6 +68,7 @@ class ScaleDotProductAttention(nn.Module): >>> ctx.shape torch.Size([2, 8, 10, 16]) """ + def __init__(self) -> None: super().__init__() self.softmax = nn.Softmax(dim=-1) @@ -99,6 +102,7 @@ class MultiHeadAttention(nn.Module): >>> out.shape torch.Size([2, 10, 16]) """ + def __init__(self, d_model: int, n_head: int) -> None: super().__init__() self.n_head = n_head @@ -135,7 +139,9 @@ def split_heads(self, input_tensor: Tensor) -> Tensor: def concat_heads(self, input_tensor: Tensor) -> Tensor: batch, n_head, seq_len, d_k = input_tensor.size() - return input_tensor.transpose(1, 2).contiguous().view(batch, seq_len, n_head * d_k) + return ( + input_tensor.transpose(1, 2).contiguous().view(batch, seq_len, n_head * d_k) + ) class LayerNorm(nn.Module): @@ -149,6 +155,7 @@ class LayerNorm(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ + def __init__(self, d_model: int, eps: float = 1e-12) -> None: super().__init__() self.gamma = nn.Parameter(torch.ones(d_model)) @@ -158,7 +165,9 @@ def __init__(self, d_model: int, eps: float = 1e-12) -> None: def forward(self, input_tensor: Tensor) -> Tensor: mean = input_tensor.mean(-1, keepdim=True) var = input_tensor.var(-1, unbiased=False, keepdim=True) - return self.gamma * (input_tensor - mean) / torch.sqrt(var + self.eps) + self.beta + return ( + self.gamma * (input_tensor - mean) / torch.sqrt(var + self.eps) + self.beta + ) class TransformerEncoderLayer(nn.Module): @@ -172,6 +181,7 @@ class TransformerEncoderLayer(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ + def __init__( self, d_model: int, @@ -205,6 +215,7 @@ class TransformerEncoder(nn.Module): >>> out.shape torch.Size([4, 10, 8]) """ + def __init__( self, d_model: int, @@ -241,11 +252,14 @@ class AttentionPooling(nn.Module): >>> weights.shape torch.Size([4, 10]) """ + def __init__(self, d_model: int) -> None: super().__init__() self.attn_score = nn.Linear(d_model, 1) - def forward(self, input_tensor: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: + def forward( + self, input_tensor: Tensor, mask: Tensor = None + ) -> tuple[Tensor, Tensor]: attn_weights = torch.softmax(self.attn_score(input_tensor).squeeze(-1), dim=-1) if mask is not None: @@ -267,6 +281,7 @@ class EEGTransformer(nn.Module): >>> out.shape torch.Size([2, 1]) """ + def __init__( self, feature_dim: int, @@ -288,9 +303,16 @@ def __init__( self.pooling = AttentionPooling(d_model) self.output_layer = 
nn.Linear(d_model, output_dim) - def forward(self, input_tensor: Tensor, mask: Tensor = None) -> tuple[Tensor, Tensor]: + def forward( + self, input_tensor: Tensor, mask: Tensor = None + ) -> tuple[Tensor, Tensor]: b, t, _ = input_tensor.size() - t_idx = torch.arange(t, device=input_tensor.device).view(1, t, 1).expand(b, t, 1).float() + t_idx = ( + torch.arange(t, device=input_tensor.device) + .view(1, t, 1) + .expand(b, t, 1) + .float() + ) time_emb = self.time2vec(t_idx) x = self.input_proj(input_tensor) + time_emb x = self.encoder(x, mask) From 24c52d462369a554b8ad4ff733d09c0ceb4e9239 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Tue, 21 Oct 2025 22:03:34 +0530 Subject: [PATCH 10/36] Update real_time_encoder_transformer.py --- neural_network/real_time_encoder_transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index f4aa957a78b8..40fb09d7c0b7 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,7 +1,8 @@ import math import torch -from torch import nn, Tensor +from torch import nn +from torch import Tensor class Time2Vec(nn.Module): From 2a0a8f6e1c5773efa143c005686a78b4ed0e6858 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Tue, 21 Oct 2025 22:06:06 +0530 Subject: [PATCH 11/36] Update real_time_encoder_transformer.py --- neural_network/real_time_encoder_transformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 40fb09d7c0b7..47aba778e2f3 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,8 +1,7 @@ import math import torch -from torch import nn -from torch import Tensor +from torch import Tensor, nn class Time2Vec(nn.Module): From 2dccc2dbb058c5560f75a13fe4f80356f87c8e74 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Tue, 21 Oct 2025 22:19:46 +0530 Subject: [PATCH 12/36] Update real_time_encoder_transformer.py From 101e30504862518581a13c3bbabe957523631bc6 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Wed, 22 Oct 2025 09:42:57 +0530 Subject: [PATCH 13/36] Update real_time_encoder_transformer.py --- .../real_time_encoder_transformer.py | 529 +++++++++--------- 1 file changed, 259 insertions(+), 270 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 47aba778e2f3..369906c440fb 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,287 +1,236 @@ + +from __future__ import annotations import math +from typing import Optional, Tuple -import torch -from torch import Tensor, nn +import numpy as np +import pandas as pd +def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: + x_max = np.max(x, axis=axis, keepdims=True) + e = np.exp(x - x_max) + return e / (np.sum(e, axis=axis, keepdims=True) + 1e-12) -class Time2Vec(nn.Module): - """ - Time2Vec layer for positional encoding of real-time data like EEG. 
- - >>> import torch - >>> layer = Time2Vec(4) - >>> t = torch.ones(1, 3, 1) - >>> output = layer.forward(t) - >>> output.shape - torch.Size([1, 3, 4]) - """ - def __init__(self, d_model: int) -> None: - super().__init__() - self.w0 = nn.Parameter(torch.randn(1, 1)) - self.b0 = nn.Parameter(torch.randn(1, 1)) - self.w = nn.Parameter(torch.randn(1, d_model - 1)) - self.b = nn.Parameter(torch.randn(1, d_model - 1)) +def _stable_div(x: np.ndarray, denom: np.ndarray) -> np.ndarray: + return x / (denom + 1e-12) - def forward(self, time_steps: Tensor) -> Tensor: - linear = self.w0 * time_steps + self.b0 - periodic = torch.sin(self.w * time_steps + self.b) - return torch.cat([linear, periodic], dim=-1) +# Time2Vec -class PositionwiseFeedForward(nn.Module): +class Time2Vec: """ - Positionwise feedforward network. - - >>> import torch - >>> layer = PositionwiseFeedForward(8, 16) - >>> x = torch.rand(4, 10, 8) - >>> out = layer.forward(x) - >>> out.shape - torch.Size([4, 10, 8]) + Time2Vec positional encoding (simple) for real-valued time steps. + Produces shape (..., d_model) """ - def __init__(self, d_model: int, hidden: int, drop_prob: float = 0.1) -> None: - super().__init__() - self.fc1 = nn.Linear(d_model, hidden) - self.fc2 = nn.Linear(hidden, d_model) - self.relu = nn.ReLU() - self.dropout = nn.Dropout(drop_prob) - - def forward(self, input_tensor: Tensor) -> Tensor: - x = self.fc1(input_tensor) - x = self.relu(x) - x = self.dropout(x) - return self.fc2(x) + def __init__(self, d_model: int, seed: Optional[int] = None): + if seed is not None: + np.random.seed(seed) + # linear term params (scalar per batch/time) + self.w0 = np.random.randn(1, 1) # multiply time scalar + self.b0 = np.random.randn(1, 1) + # periodic terms params (d_model - 1) + if d_model < 2: + raise ValueError("d_model must be >= 2 for Time2Vec") + self.w = np.random.randn(1, d_model - 1) + self.b = np.random.randn(1, d_model - 1) + + def forward(self, time_steps: np.ndarray) -> np.ndarray: + """ + time_steps: shape (batch, seq_len, 1) or (batch, seq_len) (will be reshaped) + returns: (batch, seq_len, d_model) + """ + ts = time_steps + if ts.ndim == 2: + ts = ts[..., None] + linear = (self.w0 * ts) + self.b0 # (b, t, 1) + periodic = np.sin((ts * self.w) + self.b) # broadcasting -> (b,t,d_model-1) + return np.concatenate([linear, periodic], axis=-1) + + +# PositionwiseFeedForward + +class PositionwiseFeedForward: + def __init__(self, d_model: int, hidden: int, drop_prob: float = 0.0, seed: Optional[int] = None): + if seed is not None: + np.random.seed(seed) + # simple linear layers (no dropout during forward-only inference, but kept shape) + self.w1 = np.random.randn(d_model, hidden) * math.sqrt(2.0 / (d_model + hidden)) + self.b1 = np.zeros((hidden,)) + self.w2 = np.random.randn(hidden, d_model) * math.sqrt(2.0 / (hidden + d_model)) + self.b2 = np.zeros((d_model,)) + + def forward(self, x: np.ndarray) -> np.ndarray: + # x: (b, t, d_model) + b, t, d = x.shape + h = np.tensordot(x, self.w1, axes=([2], [0])) + self.b1 # (b,t,hidden) + h = np.maximum(h, 0.0) # ReLU + out = np.tensordot(h, self.w2, axes=([2], [0])) + self.b2 # (b,t,d_model) + return out -class ScaleDotProductAttention(nn.Module): - """ - Scaled dot product attention. 
- - >>> import torch - >>> attn = ScaleDotProductAttention() - >>> query_tensor = torch.rand(2, 8, 10, 16) - >>> key_tensor = torch.rand(2, 8, 10, 16) - >>> value_tensor = torch.rand(2, 8, 10, 16) - >>> ctx, attn_w = attn.forward(query_tensor, key_tensor, value_tensor) - >>> ctx.shape - torch.Size([2, 8, 10, 16]) - """ - def __init__(self) -> None: - super().__init__() - self.softmax = nn.Softmax(dim=-1) +# Scaled Dot-Product Attention - def forward( - self, - query_tensor: Tensor, - key_tensor: Tensor, - value_tensor: Tensor, - mask: Tensor = None, - ) -> tuple[Tensor, Tensor]: - _, _, _, d_k = key_tensor.size() - scores = (query_tensor @ key_tensor.transpose(2, 3)) / math.sqrt(d_k) +class ScaledDotProductAttention: + def forward(self, q: np.ndarray, k: np.ndarray, v: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]: + """ + q,k,v: shapes (b, n_head, seq_len, d_k) + mask: optional boolean or 0/1 mask of shape (b, seq_len) or (b, 1, 1, seq_len) + returns: context (b, n_head, seq_len, d_k), attn_weights (b, n_head, seq_len, seq_len) + """ + b, n_head, seq_len, d_k = q.shape + # scores: (b, n_head, seq_len, seq_len) + scores = np.matmul(q, k.transpose(0, 1, 3, 2)) / math.sqrt(d_k) if mask is not None: - scores = scores.masked_fill(mask == 0, -1e9) - - attn = self.softmax(scores) - context = attn @ value_tensor + # normalize mask to shape (b, 1, 1, seq_len) broadcasting over heads and queries + if mask.ndim == 2: + mask2 = mask[:, None, None, :] # (b,1,1,seq_len) + elif mask.ndim == 3: + # if provided as (b, n_head, seq_len) or (b, 1, seq_len) + mask2 = mask[:, None, :, :] if mask.shape[1] != seq_len else mask[:, None, None, :] + else: + mask2 = mask + # mask2==0 => masked + scores = np.where(mask2 == 0, -1e9, scores) + + attn = _softmax(scores, axis=-1) # (b, n_head, seq_len, seq_len) + context = np.matmul(attn, v) # (b, n_head, seq_len, d_k) return context, attn -class MultiHeadAttention(nn.Module): - """ - Multi-head attention. 
- - >>> import torch - >>> attn = MultiHeadAttention(16, 4) - >>> query_tensor = torch.rand(2, 10, 16) - >>> out = attn.forward(query_tensor, query_tensor, query_tensor) - >>> out.shape - torch.Size([2, 10, 16]) - """ +# MultiHeadAttention - def __init__(self, d_model: int, n_head: int) -> None: - super().__init__() +class MultiHeadAttention: + def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None): + if d_model % n_head != 0: + raise ValueError("d_model must be divisible by n_head") + if seed is not None: + np.random.seed(seed) + self.d_model = d_model self.n_head = n_head - self.attn = ScaleDotProductAttention() - self.w_q = nn.Linear(d_model, d_model) - self.w_k = nn.Linear(d_model, d_model) - self.w_v = nn.Linear(d_model, d_model) - self.w_out = nn.Linear(d_model, d_model) - - def forward( - self, - query_tensor: Tensor, - key_tensor: Tensor, - value_tensor: Tensor, - mask: Tensor = None, - ) -> Tensor: - query_tensor, key_tensor, value_tensor = ( - self.w_q(query_tensor), - self.w_k(key_tensor), - self.w_v(value_tensor), - ) - query_tensor = self.split_heads(query_tensor) - key_tensor = self.split_heads(key_tensor) - value_tensor = self.split_heads(value_tensor) - - context, _ = self.attn(query_tensor, key_tensor, value_tensor, mask) - out = self.w_out(self.concat_heads(context)) - return out - - def split_heads(self, input_tensor: Tensor) -> Tensor: - batch, seq_len, d_model = input_tensor.size() - d_k = d_model // self.n_head - return input_tensor.view(batch, seq_len, self.n_head, d_k).transpose(1, 2) - - def concat_heads(self, input_tensor: Tensor) -> Tensor: - batch, n_head, seq_len, d_k = input_tensor.size() - return ( - input_tensor.transpose(1, 2).contiguous().view(batch, seq_len, n_head * d_k) - ) - - -class LayerNorm(nn.Module): - """ - Layer normalization. 
- - >>> import torch - >>> ln = LayerNorm(8) - >>> x = torch.rand(4, 10, 8) - >>> out = ln.forward(x) - >>> out.shape - torch.Size([4, 10, 8]) - """ - - def __init__(self, d_model: int, eps: float = 1e-12) -> None: - super().__init__() - self.gamma = nn.Parameter(torch.ones(d_model)) - self.beta = nn.Parameter(torch.zeros(d_model)) + self.d_k = d_model // n_head + + # weight matrices for q,k,v and output + self.w_q = np.random.randn(d_model, d_model) * math.sqrt(2.0 / (d_model + d_model)) + self.b_q = np.zeros((d_model,)) + self.w_k = np.random.randn(d_model, d_model) * math.sqrt(2.0 / (d_model + d_model)) + self.b_k = np.zeros((d_model,)) + self.w_v = np.random.randn(d_model, d_model) * math.sqrt(2.0 / (d_model + d_model)) + self.b_v = np.zeros((d_model,)) + self.w_out = np.random.randn(d_model, d_model) * math.sqrt(2.0 / (d_model + d_model)) + self.b_out = np.zeros((d_model,)) + + self.attn = ScaledDotProductAttention() + + def _linear(self, x: np.ndarray, W: np.ndarray, b: np.ndarray) -> np.ndarray: + # x: (b, seq_len, d_model) -> (b, seq_len, d_model) + return np.tensordot(x, W, axes=([2], [0])) + b + + def _split_heads(self, x: np.ndarray) -> np.ndarray: + # x: (b, seq_len, d_model) -> (b, n_head, seq_len, d_k) + b, seq_len, _ = x.shape + return x.reshape(b, seq_len, self.n_head, self.d_k).transpose(0, 2, 1, 3) + + def _concat_heads(self, x: np.ndarray) -> np.ndarray: + # x: (b, n_head, seq_len, d_k) -> (b, seq_len, d_model) + b, n_head, seq_len, d_k = x.shape + return x.transpose(0, 2, 1, 3).reshape(b, seq_len, n_head * d_k) + + def forward(self, query: np.ndarray, key: np.ndarray, value: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]: + """ + query/key/value: (b, seq_len, d_model) + returns: out (b, seq_len, d_model), attn_weights (b, n_head, seq_len, seq_len) + """ + q = self._linear(query, self.w_q, self.b_q) + k = self._linear(key, self.w_k, self.b_k) + v = self._linear(value, self.w_v, self.b_v) + qh = self._split_heads(q) + kh = self._split_heads(k) + vh = self._split_heads(v) + + context, attn = self.attn.forward(qh, kh, vh, mask) + concat = self._concat_heads(context) # (b, seq_len, d_model) + out = np.tensordot(concat, self.w_out, axes=([2], [0])) + self.b_out + return out, attn + + + +# LayerNorm + +class LayerNorm: + def __init__(self, d_model: int, eps: float = 1e-12): + self.gamma = np.ones((d_model,)) + self.beta = np.zeros((d_model,)) self.eps = eps - def forward(self, input_tensor: Tensor) -> Tensor: - mean = input_tensor.mean(-1, keepdim=True) - var = input_tensor.var(-1, unbiased=False, keepdim=True) - return ( - self.gamma * (input_tensor - mean) / torch.sqrt(var + self.eps) + self.beta - ) + def forward(self, x: np.ndarray) -> np.ndarray: + # x: (b, seq_len, d_model) + mean = np.mean(x, axis=-1, keepdims=True) + var = np.mean((x - mean) ** 2, axis=-1, keepdims=True) + x_norm = (x - mean) / np.sqrt(var + self.eps) + return self.gamma * x_norm + self.beta +# TransformerEncoderLayer -class TransformerEncoderLayer(nn.Module): - """ - Transformer encoder layer. 
- - >>> import torch - >>> layer = TransformerEncoderLayer(8, 2, 16) - >>> x = torch.rand(4, 10, 8) - >>> out = layer.forward(x) - >>> out.shape - torch.Size([4, 10, 8]) - """ - - def __init__( - self, - d_model: int, - n_head: int, - hidden_dim: int, - drop_prob: float = 0.1, - ) -> None: - super().__init__() - self.self_attn = MultiHeadAttention(d_model, n_head) - self.ffn = PositionwiseFeedForward(d_model, hidden_dim, drop_prob) +class TransformerEncoderLayer: + def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None): + self.self_attn = MultiHeadAttention(d_model, n_head, seed=seed) + self.ffn = PositionwiseFeedForward(d_model, hidden_dim, seed=seed) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) - self.dropout = nn.Dropout(drop_prob) - def forward(self, input_tensor: Tensor, mask: Tensor = None) -> Tensor: - attn_out = self.self_attn(input_tensor, input_tensor, input_tensor, mask) - x = self.norm1(input_tensor + self.dropout(attn_out)) - ffn_out = self.ffn(x) - x = self.norm2(x + self.dropout(ffn_out)) - return x + def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray: + # Self-attention + attn_out, _ = self.self_attn.forward(x, x, x, mask) # (b, seq_len, d_model) + x2 = self.norm1.forward(x + attn_out) + ffn_out = self.ffn.forward(x2) + x3 = self.norm2.forward(x2 + ffn_out) + return x3 -class TransformerEncoder(nn.Module): - """ - Encoder stack. - - >>> import torch - >>> enc = TransformerEncoder(8, 2, 16, 2) - >>> x = torch.rand(4, 10, 8) - >>> out = enc.forward(x) - >>> out.shape - torch.Size([4, 10, 8]) - """ +# TransformerEncoder (stack) - def __init__( - self, - d_model: int, - n_head: int, - hidden_dim: int, - num_layers: int, - drop_prob: float = 0.1, - ) -> None: - super().__init__() - self.layers = nn.ModuleList( - [ - TransformerEncoderLayer(d_model, n_head, hidden_dim, drop_prob) - for _ in range(num_layers) - ] - ) - - def forward(self, input_tensor: Tensor, mask: Tensor = None) -> Tensor: - x = input_tensor - for layer in self.layers: - x = layer(x, mask) - return x +class TransformerEncoder: + def __init__(self, d_model: int, n_head: int, hidden_dim: int, num_layers: int, seed: Optional[int] = None): + self.layers = [TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) for _ in range(num_layers)] + def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray: + out = x + for layer in self.layers: + out = layer.forward(out, mask) + return out -class AttentionPooling(nn.Module): - """ - Attention pooling layer. 
- - >>> import torch - >>> pooling = AttentionPooling(8) - >>> x = torch.rand(4, 10, 8) - >>> pooled, weights = pooling.forward(x) - >>> pooled.shape - torch.Size([4, 8]) - >>> weights.shape - torch.Size([4, 10]) - """ +# AttentionPooling - def __init__(self, d_model: int) -> None: - super().__init__() - self.attn_score = nn.Linear(d_model, 1) +class AttentionPooling: + def __init__(self, d_model: int, seed: Optional[int] = None): + if seed is not None: + np.random.seed(seed) + self.w = np.random.randn(d_model) * math.sqrt(2.0 / d_model) + self.b = 0.0 - def forward( - self, input_tensor: Tensor, mask: Tensor = None - ) -> tuple[Tensor, Tensor]: - attn_weights = torch.softmax(self.attn_score(input_tensor).squeeze(-1), dim=-1) + def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]: + """ + x: (b, seq_len, d_model) + mask: (b, seq_len) where 1 = valid, 0 = pad + returns: pooled (b, d_model), attn_weights (b, seq_len) + """ + # raw scores: (b, seq_len) + scores = np.tensordot(x, self.w, axes=([2], [0])) + self.b if mask is not None: - attn_weights = attn_weights.masked_fill(mask == 0, 0) - attn_weights = attn_weights / (attn_weights.sum(dim=1, keepdim=True) + 1e-8) + scores = np.where(mask == 0, -1e9, scores) - pooled = torch.bmm(attn_weights.unsqueeze(1), input_tensor).squeeze(1) - return pooled, attn_weights + weights = _softmax(scores, axis=-1) # (b, seq_len) + pooled = np.matmul(weights[:, None, :], x).squeeze(1) # (b, d_model) + return pooled, weights +# EEGTransformer (forward-only) -class EEGTransformer(nn.Module): - """ - EEG Transformer model. - - >>> import torch - >>> model = EEGTransformer(feature_dim=8) - >>> x = torch.rand(2, 10, 8) - >>> out, attn_w = model.forward(x) - >>> out.shape - torch.Size([2, 1]) - """ - +class EEGTransformer: def __init__( self, feature_dim: int, @@ -289,35 +238,75 @@ def __init__( n_head: int = 8, hidden_dim: int = 512, num_layers: int = 4, - drop_prob: float = 0.1, output_dim: int = 1, task_type: str = "regression", - ) -> None: - super().__init__() + seed: Optional[int] = None, + ): + if seed is not None: + np.random.seed(seed) + self.feature_dim = feature_dim + self.d_model = d_model self.task_type = task_type - self.input_proj = nn.Linear(feature_dim, d_model) - self.time2vec = Time2Vec(d_model) - self.encoder = TransformerEncoder( - d_model, n_head, hidden_dim, num_layers, drop_prob - ) - self.pooling = AttentionPooling(d_model) - self.output_layer = nn.Linear(d_model, output_dim) - - def forward( - self, input_tensor: Tensor, mask: Tensor = None - ) -> tuple[Tensor, Tensor]: - b, t, _ = input_tensor.size() - t_idx = ( - torch.arange(t, device=input_tensor.device) - .view(1, t, 1) - .expand(b, t, 1) - .float() - ) - time_emb = self.time2vec(t_idx) - x = self.input_proj(input_tensor) + time_emb - x = self.encoder(x, mask) - pooled, attn_weights = self.pooling(x, mask) - out = self.output_layer(pooled) + # input projection + self.w_in = np.random.randn(feature_dim, d_model) * math.sqrt(2.0 / (feature_dim + d_model)) + self.b_in = np.zeros((d_model,)) + # time embedding + self.time2vec = Time2Vec(d_model, seed=seed) + self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed=seed) + self.pooling = AttentionPooling(d_model, seed=seed) + # output + self.w_out = np.random.randn(d_model, output_dim) * math.sqrt(2.0 / (d_model + output_dim)) + self.b_out = np.zeros((output_dim,)) + + def _input_proj(self, x: np.ndarray) -> np.ndarray: + # x: (b, seq_len, feature_dim) -> (b, seq_len, 
d_model) + return np.tensordot(x, self.w_in, axes=([2], [0])) + self.b_in + + def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]: + """ + x: (b, seq_len, feature_dim) + mask: optional (b, seq_len) 1=valid,0=pad + returns: out (b, output_dim), attn_weights_from_pooling (b, seq_len) + """ + b, t, f = x.shape + # time indices + t_idx = np.arange(t, dtype=float)[None, :, None] # (1,t,1) + t_idx = np.tile(t_idx, (b, 1, 1)) # (b,t,1) + time_emb = self.time2vec.forward(t_idx) # (b,t,d_model) + x_proj = self._input_proj(x) + time_emb # broadcast add -> (b,t,d_model) + enc = self.encoder.forward(x_proj, mask) + pooled, attn_weights = self.pooling.forward(enc, mask) + out = np.tensordot(pooled, self.w_out, axes=([1], [0])) + self.b_out # (b,output_dim) if self.task_type == "classification": - out = torch.softmax(out, dim=-1) + out = _softmax(out, axis=-1) return out, attn_weights + + +# Example usage + +if __name__ == "__main__": + # Example 1: Synthetic EEG-like array + batch = 2 + seq_len = 10 + feature_dim = 8 # e.g., 8 channels + rng = np.random.RandomState(42) + X = rng.randn(batch, seq_len, feature_dim).astype(float) + + model = EEGTransformer(feature_dim=feature_dim, d_model=32, n_head=4, hidden_dim=64, num_layers=2, output_dim=1, seed=0) + out, attn_weights = model.forward(X) + print("Output shape:", out.shape) + print("Output:", out) + print("Pooling attn shape:", attn_weights.shape) + print("Pooling attn (per sample):", attn_weights) + + # Example 2: Loading EEG from a pandas DataFrame (CSV-like) + # Suppose CSV has columns: time, ch1, ch2, ..., chN + # We'll simulate a DataFrame first: + channels = [f"ch{i}" for i in range(feature_dim)] + # create a long single-trial dataframe with seq_len rows + df = pd.DataFrame(rng.randn(seq_len, feature_dim), columns=channels) + # convert to numpy trial (1, seq_len, feature_dim) + trial_np = df[channels].values.reshape(1, seq_len, feature_dim) + out2, attn2 = model.forward(trial_np) + print("Single-trial output:", out2) + print("Single-trial pooling attn:", attn2) From 53eff3c8205554df40350450ea0674b55ae3033d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 04:13:15 +0000 Subject: [PATCH 14/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../real_time_encoder_transformer.py | 114 ++++++++++++++---- 1 file changed, 93 insertions(+), 21 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 369906c440fb..83b15990a958 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,4 +1,3 @@ - from __future__ import annotations import math from typing import Optional, Tuple @@ -6,6 +5,7 @@ import numpy as np import pandas as pd + def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: x_max = np.max(x, axis=axis, keepdims=True) e = np.exp(x - x_max) @@ -18,6 +18,7 @@ def _stable_div(x: np.ndarray, denom: np.ndarray) -> np.ndarray: # Time2Vec + class Time2Vec: """ Time2Vec positional encoding (simple) for real-valued time steps. 
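# A small usage sketch for Time2Vec as defined above (assuming the class is in
# scope): channel 0 carries the linear term w0*t + b0, the remaining d_model - 1
# channels carry sin(w*t + b). Sizes and seed are arbitrary demo values.
import numpy as np

t2v_demo = Time2Vec(d_model=4, seed=0)
demo_t = np.arange(5, dtype=float)[None, :]          # (batch=1, seq_len=5)
demo_emb = t2v_demo.forward(demo_t)                  # 2-D input is expanded to (1, 5, 1)
assert demo_emb.shape == (1, 5, 4)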
@@ -51,8 +52,15 @@ def forward(self, time_steps: np.ndarray) -> np.ndarray: # PositionwiseFeedForward + class PositionwiseFeedForward: - def __init__(self, d_model: int, hidden: int, drop_prob: float = 0.0, seed: Optional[int] = None): + def __init__( + self, + d_model: int, + hidden: int, + drop_prob: float = 0.0, + seed: Optional[int] = None, + ): if seed is not None: np.random.seed(seed) # simple linear layers (no dropout during forward-only inference, but kept shape) @@ -70,11 +78,17 @@ def forward(self, x: np.ndarray) -> np.ndarray: return out - # Scaled Dot-Product Attention + class ScaledDotProductAttention: - def forward(self, q: np.ndarray, k: np.ndarray, v: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]: + def forward( + self, + q: np.ndarray, + k: np.ndarray, + v: np.ndarray, + mask: Optional[np.ndarray] = None, + ) -> Tuple[np.ndarray, np.ndarray]: """ q,k,v: shapes (b, n_head, seq_len, d_k) mask: optional boolean or 0/1 mask of shape (b, seq_len) or (b, 1, 1, seq_len) @@ -90,7 +104,11 @@ def forward(self, q: np.ndarray, k: np.ndarray, v: np.ndarray, mask: Optional[np mask2 = mask[:, None, None, :] # (b,1,1,seq_len) elif mask.ndim == 3: # if provided as (b, n_head, seq_len) or (b, 1, seq_len) - mask2 = mask[:, None, :, :] if mask.shape[1] != seq_len else mask[:, None, None, :] + mask2 = ( + mask[:, None, :, :] + if mask.shape[1] != seq_len + else mask[:, None, None, :] + ) else: mask2 = mask # mask2==0 => masked @@ -103,6 +121,7 @@ def forward(self, q: np.ndarray, k: np.ndarray, v: np.ndarray, mask: Optional[np # MultiHeadAttention + class MultiHeadAttention: def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None): if d_model % n_head != 0: @@ -114,13 +133,21 @@ def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None): self.d_k = d_model // n_head # weight matrices for q,k,v and output - self.w_q = np.random.randn(d_model, d_model) * math.sqrt(2.0 / (d_model + d_model)) + self.w_q = np.random.randn(d_model, d_model) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_q = np.zeros((d_model,)) - self.w_k = np.random.randn(d_model, d_model) * math.sqrt(2.0 / (d_model + d_model)) + self.w_k = np.random.randn(d_model, d_model) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_k = np.zeros((d_model,)) - self.w_v = np.random.randn(d_model, d_model) * math.sqrt(2.0 / (d_model + d_model)) + self.w_v = np.random.randn(d_model, d_model) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_v = np.zeros((d_model,)) - self.w_out = np.random.randn(d_model, d_model) * math.sqrt(2.0 / (d_model + d_model)) + self.w_out = np.random.randn(d_model, d_model) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_out = np.zeros((d_model,)) self.attn = ScaledDotProductAttention() @@ -139,7 +166,13 @@ def _concat_heads(self, x: np.ndarray) -> np.ndarray: b, n_head, seq_len, d_k = x.shape return x.transpose(0, 2, 1, 3).reshape(b, seq_len, n_head * d_k) - def forward(self, query: np.ndarray, key: np.ndarray, value: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]: + def forward( + self, + query: np.ndarray, + key: np.ndarray, + value: np.ndarray, + mask: Optional[np.ndarray] = None, + ) -> Tuple[np.ndarray, np.ndarray]: """ query/key/value: (b, seq_len, d_model) returns: out (b, seq_len, d_model), attn_weights (b, n_head, seq_len, seq_len) @@ -157,9 +190,9 @@ def forward(self, query: np.ndarray, key: np.ndarray, value: np.ndarray, mask: O return out, attn - # LayerNorm + class LayerNorm: def __init__(self, 
d_model: int, eps: float = 1e-12): self.gamma = np.ones((d_model,)) @@ -173,10 +206,14 @@ def forward(self, x: np.ndarray) -> np.ndarray: x_norm = (x - mean) / np.sqrt(var + self.eps) return self.gamma * x_norm + self.beta + # TransformerEncoderLayer + class TransformerEncoderLayer: - def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None): + def __init__( + self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None + ): self.self_attn = MultiHeadAttention(d_model, n_head, seed=seed) self.ffn = PositionwiseFeedForward(d_model, hidden_dim, seed=seed) self.norm1 = LayerNorm(d_model) @@ -193,9 +230,20 @@ def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarra # TransformerEncoder (stack) + class TransformerEncoder: - def __init__(self, d_model: int, n_head: int, hidden_dim: int, num_layers: int, seed: Optional[int] = None): - self.layers = [TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) for _ in range(num_layers)] + def __init__( + self, + d_model: int, + n_head: int, + hidden_dim: int, + num_layers: int, + seed: Optional[int] = None, + ): + self.layers = [ + TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) + for _ in range(num_layers) + ] def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray: out = x @@ -203,8 +251,10 @@ def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarra out = layer.forward(out, mask) return out + # AttentionPooling + class AttentionPooling: def __init__(self, d_model: int, seed: Optional[int] = None): if seed is not None: @@ -212,7 +262,9 @@ def __init__(self, d_model: int, seed: Optional[int] = None): self.w = np.random.randn(d_model) * math.sqrt(2.0 / d_model) self.b = 0.0 - def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]: + def forward( + self, x: np.ndarray, mask: Optional[np.ndarray] = None + ) -> Tuple[np.ndarray, np.ndarray]: """ x: (b, seq_len, d_model) mask: (b, seq_len) where 1 = valid, 0 = pad @@ -228,8 +280,10 @@ def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np. 
pooled = np.matmul(weights[:, None, :], x).squeeze(1) # (b, d_model) return pooled, weights + # EEGTransformer (forward-only) + class EEGTransformer: def __init__( self, @@ -248,21 +302,29 @@ def __init__( self.d_model = d_model self.task_type = task_type # input projection - self.w_in = np.random.randn(feature_dim, d_model) * math.sqrt(2.0 / (feature_dim + d_model)) + self.w_in = np.random.randn(feature_dim, d_model) * math.sqrt( + 2.0 / (feature_dim + d_model) + ) self.b_in = np.zeros((d_model,)) # time embedding self.time2vec = Time2Vec(d_model, seed=seed) - self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed=seed) + self.encoder = TransformerEncoder( + d_model, n_head, hidden_dim, num_layers, seed=seed + ) self.pooling = AttentionPooling(d_model, seed=seed) # output - self.w_out = np.random.randn(d_model, output_dim) * math.sqrt(2.0 / (d_model + output_dim)) + self.w_out = np.random.randn(d_model, output_dim) * math.sqrt( + 2.0 / (d_model + output_dim) + ) self.b_out = np.zeros((output_dim,)) def _input_proj(self, x: np.ndarray) -> np.ndarray: # x: (b, seq_len, feature_dim) -> (b, seq_len, d_model) return np.tensordot(x, self.w_in, axes=([2], [0])) + self.b_in - def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]: + def forward( + self, x: np.ndarray, mask: Optional[np.ndarray] = None + ) -> Tuple[np.ndarray, np.ndarray]: """ x: (b, seq_len, feature_dim) mask: optional (b, seq_len) 1=valid,0=pad @@ -276,7 +338,9 @@ def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np. x_proj = self._input_proj(x) + time_emb # broadcast add -> (b,t,d_model) enc = self.encoder.forward(x_proj, mask) pooled, attn_weights = self.pooling.forward(enc, mask) - out = np.tensordot(pooled, self.w_out, axes=([1], [0])) + self.b_out # (b,output_dim) + out = ( + np.tensordot(pooled, self.w_out, axes=([1], [0])) + self.b_out + ) # (b,output_dim) if self.task_type == "classification": out = _softmax(out, axis=-1) return out, attn_weights @@ -292,7 +356,15 @@ def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np. 
rng = np.random.RandomState(42) X = rng.randn(batch, seq_len, feature_dim).astype(float) - model = EEGTransformer(feature_dim=feature_dim, d_model=32, n_head=4, hidden_dim=64, num_layers=2, output_dim=1, seed=0) + model = EEGTransformer( + feature_dim=feature_dim, + d_model=32, + n_head=4, + hidden_dim=64, + num_layers=2, + output_dim=1, + seed=0, + ) out, attn_weights = model.forward(X) print("Output shape:", out.shape) print("Output:", out) From 5f20061fabbb38695363227d47ffb32d2dade565 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Wed, 22 Oct 2025 09:46:29 +0530 Subject: [PATCH 15/36] Update real_time_encoder_transformer.py --- .../real_time_encoder_transformer.py | 329 ++++++------------ 1 file changed, 110 insertions(+), 219 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 83b15990a958..274259e1e6af 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,11 +1,14 @@ from __future__ import annotations + import math -from typing import Optional, Tuple import numpy as np import pandas as pd +# -------------------------------------------------- +# Utility functions +# -------------------------------------------------- def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: x_max = np.max(x, axis=axis, keepdims=True) e = np.exp(x - x_max) @@ -16,153 +19,101 @@ def _stable_div(x: np.ndarray, denom: np.ndarray) -> np.ndarray: return x / (denom + 1e-12) +# -------------------------------------------------- # Time2Vec - - +# -------------------------------------------------- class Time2Vec: - """ - Time2Vec positional encoding (simple) for real-valued time steps. - Produces shape (..., d_model) - """ - - def __init__(self, d_model: int, seed: Optional[int] = None): - if seed is not None: - np.random.seed(seed) - # linear term params (scalar per batch/time) - self.w0 = np.random.randn(1, 1) # multiply time scalar - self.b0 = np.random.randn(1, 1) - # periodic terms params (d_model - 1) + """Time2Vec positional encoding for real-valued time steps.""" + + def __init__(self, d_model: int, seed: int | None = None): if d_model < 2: raise ValueError("d_model must be >= 2 for Time2Vec") - self.w = np.random.randn(1, d_model - 1) - self.b = np.random.randn(1, d_model - 1) + self.rng = np.random.default_rng(seed) + self.w0 = self.rng.standard_normal((1, 1)) + self.b0 = self.rng.standard_normal((1, 1)) + self.w = self.rng.standard_normal((1, d_model - 1)) + self.b = self.rng.standard_normal((1, d_model - 1)) def forward(self, time_steps: np.ndarray) -> np.ndarray: - """ - time_steps: shape (batch, seq_len, 1) or (batch, seq_len) (will be reshaped) - returns: (batch, seq_len, d_model) - """ - ts = time_steps - if ts.ndim == 2: - ts = ts[..., None] - linear = (self.w0 * ts) + self.b0 # (b, t, 1) - periodic = np.sin((ts * self.w) + self.b) # broadcasting -> (b,t,d_model-1) + """time_steps: (batch, seq_len, 1) or (batch, seq_len).""" + ts = time_steps if time_steps.ndim == 3 else time_steps[..., None] + linear = (self.w0 * ts) + self.b0 + periodic = np.sin((ts * self.w) + self.b) return np.concatenate([linear, periodic], axis=-1) -# PositionwiseFeedForward - - +# -------------------------------------------------- +# Positionwise FeedForward +# -------------------------------------------------- class PositionwiseFeedForward: - def __init__( - self, - d_model: int, - hidden: int, - drop_prob: float = 0.0, - seed: Optional[int] = None, - ): - if seed is not None: 
- np.random.seed(seed) - # simple linear layers (no dropout during forward-only inference, but kept shape) - self.w1 = np.random.randn(d_model, hidden) * math.sqrt(2.0 / (d_model + hidden)) - self.b1 = np.zeros((hidden,)) - self.w2 = np.random.randn(hidden, d_model) * math.sqrt(2.0 / (hidden + d_model)) - self.b2 = np.zeros((d_model,)) + def __init__(self, d_model: int, hidden: int, drop_prob: float = 0.0, seed: int | None = None): + self.rng = np.random.default_rng(seed) + self.w1 = self.rng.standard_normal((d_model, hidden)) * math.sqrt(2.0 / (d_model + hidden)) + self.b1 = np.zeros(hidden) + self.w2 = self.rng.standard_normal((hidden, d_model)) * math.sqrt(2.0 / (hidden + d_model)) + self.b2 = np.zeros(d_model) def forward(self, x: np.ndarray) -> np.ndarray: - # x: (b, t, d_model) - b, t, d = x.shape - h = np.tensordot(x, self.w1, axes=([2], [0])) + self.b1 # (b,t,hidden) - h = np.maximum(h, 0.0) # ReLU - out = np.tensordot(h, self.w2, axes=([2], [0])) + self.b2 # (b,t,d_model) - return out + h = np.tensordot(x, self.w1, axes=([2], [0])) + self.b1 + h = np.maximum(h, 0.0) + return np.tensordot(h, self.w2, axes=([2], [0])) + self.b2 +# -------------------------------------------------- # Scaled Dot-Product Attention - - +# -------------------------------------------------- class ScaledDotProductAttention: def forward( self, q: np.ndarray, k: np.ndarray, v: np.ndarray, - mask: Optional[np.ndarray] = None, - ) -> Tuple[np.ndarray, np.ndarray]: - """ - q,k,v: shapes (b, n_head, seq_len, d_k) - mask: optional boolean or 0/1 mask of shape (b, seq_len) or (b, 1, 1, seq_len) - returns: context (b, n_head, seq_len, d_k), attn_weights (b, n_head, seq_len, seq_len) - """ + mask: np.ndarray | None = None, + ) -> tuple[np.ndarray, np.ndarray]: b, n_head, seq_len, d_k = q.shape - # scores: (b, n_head, seq_len, seq_len) scores = np.matmul(q, k.transpose(0, 1, 3, 2)) / math.sqrt(d_k) if mask is not None: - # normalize mask to shape (b, 1, 1, seq_len) broadcasting over heads and queries - if mask.ndim == 2: - mask2 = mask[:, None, None, :] # (b,1,1,seq_len) - elif mask.ndim == 3: - # if provided as (b, n_head, seq_len) or (b, 1, seq_len) - mask2 = ( - mask[:, None, :, :] - if mask.shape[1] != seq_len - else mask[:, None, None, :] - ) - else: - mask2 = mask - # mask2==0 => masked + mask2 = mask[:, None, None, :] if mask.ndim == 2 else mask scores = np.where(mask2 == 0, -1e9, scores) - attn = _softmax(scores, axis=-1) # (b, n_head, seq_len, seq_len) - context = np.matmul(attn, v) # (b, n_head, seq_len, d_k) + attn = _softmax(scores, axis=-1) + context = np.matmul(attn, v) return context, attn -# MultiHeadAttention - - +# -------------------------------------------------- +# Multi-Head Attention +# -------------------------------------------------- class MultiHeadAttention: - def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None): + def __init__(self, d_model: int, n_head: int, seed: int | None = None): if d_model % n_head != 0: raise ValueError("d_model must be divisible by n_head") - if seed is not None: - np.random.seed(seed) + self.d_model = d_model self.n_head = n_head self.d_k = d_model // n_head + self.rng = np.random.default_rng(seed) - # weight matrices for q,k,v and output - self.w_q = np.random.randn(d_model, d_model) * math.sqrt( - 2.0 / (d_model + d_model) - ) - self.b_q = np.zeros((d_model,)) - self.w_k = np.random.randn(d_model, d_model) * math.sqrt( - 2.0 / (d_model + d_model) - ) - self.b_k = np.zeros((d_model,)) - self.w_v = np.random.randn(d_model, d_model) * 
math.sqrt( - 2.0 / (d_model + d_model) - ) - self.b_v = np.zeros((d_model,)) - self.w_out = np.random.randn(d_model, d_model) * math.sqrt( - 2.0 / (d_model + d_model) - ) - self.b_out = np.zeros((d_model,)) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (2 * d_model)) + self.b_q = np.zeros(d_model) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (2 * d_model)) + self.b_k = np.zeros(d_model) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (2 * d_model)) + self.b_v = np.zeros(d_model) + self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (2 * d_model)) + self.b_out = np.zeros(d_model) self.attn = ScaledDotProductAttention() - def _linear(self, x: np.ndarray, W: np.ndarray, b: np.ndarray) -> np.ndarray: - # x: (b, seq_len, d_model) -> (b, seq_len, d_model) - return np.tensordot(x, W, axes=([2], [0])) + b + def _linear(self, x: np.ndarray, w: np.ndarray, b: np.ndarray) -> np.ndarray: + return np.tensordot(x, w, axes=([2], [0])) + b def _split_heads(self, x: np.ndarray) -> np.ndarray: - # x: (b, seq_len, d_model) -> (b, n_head, seq_len, d_k) b, seq_len, _ = x.shape return x.reshape(b, seq_len, self.n_head, self.d_k).transpose(0, 2, 1, 3) def _concat_heads(self, x: np.ndarray) -> np.ndarray: - # x: (b, n_head, seq_len, d_k) -> (b, seq_len, d_model) b, n_head, seq_len, d_k = x.shape return x.transpose(0, 2, 1, 3).reshape(b, seq_len, n_head * d_k) @@ -171,119 +122,86 @@ def forward( query: np.ndarray, key: np.ndarray, value: np.ndarray, - mask: Optional[np.ndarray] = None, - ) -> Tuple[np.ndarray, np.ndarray]: - """ - query/key/value: (b, seq_len, d_model) - returns: out (b, seq_len, d_model), attn_weights (b, n_head, seq_len, seq_len) - """ + mask: np.ndarray | None = None, + ) -> tuple[np.ndarray, np.ndarray]: q = self._linear(query, self.w_q, self.b_q) k = self._linear(key, self.w_k, self.b_k) v = self._linear(value, self.w_v, self.b_v) - qh = self._split_heads(q) - kh = self._split_heads(k) - vh = self._split_heads(v) - + qh, kh, vh = self._split_heads(q), self._split_heads(k), self._split_heads(v) context, attn = self.attn.forward(qh, kh, vh, mask) - concat = self._concat_heads(context) # (b, seq_len, d_model) + concat = self._concat_heads(context) out = np.tensordot(concat, self.w_out, axes=([2], [0])) + self.b_out return out, attn -# LayerNorm - - +# -------------------------------------------------- +# Layer Normalization +# -------------------------------------------------- class LayerNorm: def __init__(self, d_model: int, eps: float = 1e-12): - self.gamma = np.ones((d_model,)) - self.beta = np.zeros((d_model,)) + self.gamma = np.ones(d_model) + self.beta = np.zeros(d_model) self.eps = eps def forward(self, x: np.ndarray) -> np.ndarray: - # x: (b, seq_len, d_model) mean = np.mean(x, axis=-1, keepdims=True) var = np.mean((x - mean) ** 2, axis=-1, keepdims=True) x_norm = (x - mean) / np.sqrt(var + self.eps) return self.gamma * x_norm + self.beta -# TransformerEncoderLayer - - +# -------------------------------------------------- +# Transformer Encoder Layer +# -------------------------------------------------- class TransformerEncoderLayer: - def __init__( - self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None - ): + def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: int | None = None): self.self_attn = MultiHeadAttention(d_model, n_head, seed=seed) self.ffn = PositionwiseFeedForward(d_model, hidden_dim, seed=seed) self.norm1 = 
LayerNorm(d_model) self.norm2 = LayerNorm(d_model) - def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray: - # Self-attention - attn_out, _ = self.self_attn.forward(x, x, x, mask) # (b, seq_len, d_model) + def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: + attn_out, _ = self.self_attn.forward(x, x, x, mask) x2 = self.norm1.forward(x + attn_out) ffn_out = self.ffn.forward(x2) - x3 = self.norm2.forward(x2 + ffn_out) - return x3 - - -# TransformerEncoder (stack) + return self.norm2.forward(x2 + ffn_out) +# -------------------------------------------------- +# Transformer Encoder Stack +# -------------------------------------------------- class TransformerEncoder: - def __init__( - self, - d_model: int, - n_head: int, - hidden_dim: int, - num_layers: int, - seed: Optional[int] = None, - ): - self.layers = [ - TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) - for _ in range(num_layers) - ] + def __init__(self, d_model: int, n_head: int, hidden_dim: int, num_layers: int, seed: int | None = None): + self.layers = [TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) for _ in range(num_layers)] - def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray: + def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: out = x for layer in self.layers: out = layer.forward(out, mask) return out -# AttentionPooling - - +# -------------------------------------------------- +# Attention Pooling +# -------------------------------------------------- class AttentionPooling: - def __init__(self, d_model: int, seed: Optional[int] = None): - if seed is not None: - np.random.seed(seed) - self.w = np.random.randn(d_model) * math.sqrt(2.0 / d_model) + def __init__(self, d_model: int, seed: int | None = None): + self.rng = np.random.default_rng(seed) + self.w = self.rng.standard_normal(d_model) * math.sqrt(2.0 / d_model) self.b = 0.0 - def forward( - self, x: np.ndarray, mask: Optional[np.ndarray] = None - ) -> Tuple[np.ndarray, np.ndarray]: - """ - x: (b, seq_len, d_model) - mask: (b, seq_len) where 1 = valid, 0 = pad - returns: pooled (b, d_model), attn_weights (b, seq_len) - """ - # raw scores: (b, seq_len) + def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]: scores = np.tensordot(x, self.w, axes=([2], [0])) + self.b - if mask is not None: scores = np.where(mask == 0, -1e9, scores) - - weights = _softmax(scores, axis=-1) # (b, seq_len) - pooled = np.matmul(weights[:, None, :], x).squeeze(1) # (b, d_model) + weights = _softmax(scores, axis=-1) + pooled = np.matmul(weights[:, None, :], x).squeeze(1) return pooled, weights -# EEGTransformer (forward-only) - - +# -------------------------------------------------- +# EEG Transformer (forward-only) +# -------------------------------------------------- class EEGTransformer: def __init__( self, @@ -294,67 +212,45 @@ def __init__( num_layers: int = 4, output_dim: int = 1, task_type: str = "regression", - seed: Optional[int] = None, + seed: int | None = None, ): - if seed is not None: - np.random.seed(seed) + self.rng = np.random.default_rng(seed) self.feature_dim = feature_dim self.d_model = d_model self.task_type = task_type - # input projection - self.w_in = np.random.randn(feature_dim, d_model) * math.sqrt( - 2.0 / (feature_dim + d_model) - ) - self.b_in = np.zeros((d_model,)) - # time embedding + + self.w_in = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt(2.0 / (feature_dim + 
d_model)) + self.b_in = np.zeros(d_model) self.time2vec = Time2Vec(d_model, seed=seed) - self.encoder = TransformerEncoder( - d_model, n_head, hidden_dim, num_layers, seed=seed - ) + self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed=seed) self.pooling = AttentionPooling(d_model, seed=seed) - # output - self.w_out = np.random.randn(d_model, output_dim) * math.sqrt( - 2.0 / (d_model + output_dim) - ) - self.b_out = np.zeros((output_dim,)) + self.w_out = self.rng.standard_normal((d_model, output_dim)) * math.sqrt(2.0 / (d_model + output_dim)) + self.b_out = np.zeros(output_dim) def _input_proj(self, x: np.ndarray) -> np.ndarray: - # x: (b, seq_len, feature_dim) -> (b, seq_len, d_model) return np.tensordot(x, self.w_in, axes=([2], [0])) + self.b_in - def forward( - self, x: np.ndarray, mask: Optional[np.ndarray] = None - ) -> Tuple[np.ndarray, np.ndarray]: - """ - x: (b, seq_len, feature_dim) - mask: optional (b, seq_len) 1=valid,0=pad - returns: out (b, output_dim), attn_weights_from_pooling (b, seq_len) - """ - b, t, f = x.shape - # time indices - t_idx = np.arange(t, dtype=float)[None, :, None] # (1,t,1) - t_idx = np.tile(t_idx, (b, 1, 1)) # (b,t,1) - time_emb = self.time2vec.forward(t_idx) # (b,t,d_model) - x_proj = self._input_proj(x) + time_emb # broadcast add -> (b,t,d_model) + def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]: + b, t, _ = x.shape + t_idx = np.arange(t, dtype=float)[None, :, None] + t_idx = np.tile(t_idx, (b, 1, 1)) + time_emb = self.time2vec.forward(t_idx) + x_proj = self._input_proj(x) + time_emb enc = self.encoder.forward(x_proj, mask) pooled, attn_weights = self.pooling.forward(enc, mask) - out = ( - np.tensordot(pooled, self.w_out, axes=([1], [0])) + self.b_out - ) # (b,output_dim) + out = np.tensordot(pooled, self.w_out, axes=([1], [0])) + self.b_out if self.task_type == "classification": out = _softmax(out, axis=-1) return out, attn_weights +# -------------------------------------------------- # Example usage - +# -------------------------------------------------- if __name__ == "__main__": - # Example 1: Synthetic EEG-like array - batch = 2 - seq_len = 10 - feature_dim = 8 # e.g., 8 channels - rng = np.random.RandomState(42) - X = rng.randn(batch, seq_len, feature_dim).astype(float) + batch, seq_len, feature_dim = 2, 10, 8 + rng = np.random.default_rng(42) + X = rng.standard_normal((batch, seq_len, feature_dim)) model = EEGTransformer( feature_dim=feature_dim, @@ -369,16 +265,11 @@ def forward( print("Output shape:", out.shape) print("Output:", out) print("Pooling attn shape:", attn_weights.shape) - print("Pooling attn (per sample):", attn_weights) - # Example 2: Loading EEG from a pandas DataFrame (CSV-like) - # Suppose CSV has columns: time, ch1, ch2, ..., chN - # We'll simulate a DataFrame first: + # Example with pandas DataFrame channels = [f"ch{i}" for i in range(feature_dim)] - # create a long single-trial dataframe with seq_len rows - df = pd.DataFrame(rng.randn(seq_len, feature_dim), columns=channels) - # convert to numpy trial (1, seq_len, feature_dim) - trial_np = df[channels].values.reshape(1, seq_len, feature_dim) + df = pd.DataFrame(rng.standard_normal((seq_len, feature_dim)), columns=channels) + trial_np = df[channels].to_numpy().reshape(1, seq_len, feature_dim) out2, attn2 = model.forward(trial_np) print("Single-trial output:", out2) print("Single-trial pooling attn:", attn2) From 986cd982bc74b317eb4c947b88a84ddbc4d62892 Mon Sep 17 00:00:00 2001 From: 
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 04:17:16 +0000 Subject: [PATCH 16/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../real_time_encoder_transformer.py | 66 ++++++++++++++----- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 274259e1e6af..4a10e234a16f 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -46,11 +46,17 @@ def forward(self, time_steps: np.ndarray) -> np.ndarray: # Positionwise FeedForward # -------------------------------------------------- class PositionwiseFeedForward: - def __init__(self, d_model: int, hidden: int, drop_prob: float = 0.0, seed: int | None = None): + def __init__( + self, d_model: int, hidden: int, drop_prob: float = 0.0, seed: int | None = None + ): self.rng = np.random.default_rng(seed) - self.w1 = self.rng.standard_normal((d_model, hidden)) * math.sqrt(2.0 / (d_model + hidden)) + self.w1 = self.rng.standard_normal((d_model, hidden)) * math.sqrt( + 2.0 / (d_model + hidden) + ) self.b1 = np.zeros(hidden) - self.w2 = self.rng.standard_normal((hidden, d_model)) * math.sqrt(2.0 / (hidden + d_model)) + self.w2 = self.rng.standard_normal((hidden, d_model)) * math.sqrt( + 2.0 / (hidden + d_model) + ) self.b2 = np.zeros(d_model) def forward(self, x: np.ndarray) -> np.ndarray: @@ -95,13 +101,21 @@ def __init__(self, d_model: int, n_head: int, seed: int | None = None): self.d_k = d_model // n_head self.rng = np.random.default_rng(seed) - self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (2 * d_model)) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (2 * d_model) + ) self.b_q = np.zeros(d_model) - self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (2 * d_model)) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (2 * d_model) + ) self.b_k = np.zeros(d_model) - self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (2 * d_model)) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (2 * d_model) + ) self.b_v = np.zeros(d_model) - self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (2 * d_model)) + self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (2 * d_model) + ) self.b_out = np.zeros(d_model) self.attn = ScaledDotProductAttention() @@ -154,7 +168,9 @@ def forward(self, x: np.ndarray) -> np.ndarray: # Transformer Encoder Layer # -------------------------------------------------- class TransformerEncoderLayer: - def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: int | None = None): + def __init__( + self, d_model: int, n_head: int, hidden_dim: int, seed: int | None = None + ): self.self_attn = MultiHeadAttention(d_model, n_head, seed=seed) self.ffn = PositionwiseFeedForward(d_model, hidden_dim, seed=seed) self.norm1 = LayerNorm(d_model) @@ -171,8 +187,18 @@ def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: # Transformer Encoder Stack # -------------------------------------------------- class TransformerEncoder: - def __init__(self, d_model: int, n_head: int, hidden_dim: int, num_layers: int, seed: int | None = None): - self.layers = [TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) for _ in 
range(num_layers)] + def __init__( + self, + d_model: int, + n_head: int, + hidden_dim: int, + num_layers: int, + seed: int | None = None, + ): + self.layers = [ + TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) + for _ in range(num_layers) + ] def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: out = x @@ -190,7 +216,9 @@ def __init__(self, d_model: int, seed: int | None = None): self.w = self.rng.standard_normal(d_model) * math.sqrt(2.0 / d_model) self.b = 0.0 - def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]: + def forward( + self, x: np.ndarray, mask: np.ndarray | None = None + ) -> tuple[np.ndarray, np.ndarray]: scores = np.tensordot(x, self.w, axes=([2], [0])) + self.b if mask is not None: scores = np.where(mask == 0, -1e9, scores) @@ -219,18 +247,26 @@ def __init__( self.d_model = d_model self.task_type = task_type - self.w_in = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt(2.0 / (feature_dim + d_model)) + self.w_in = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt( + 2.0 / (feature_dim + d_model) + ) self.b_in = np.zeros(d_model) self.time2vec = Time2Vec(d_model, seed=seed) - self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed=seed) + self.encoder = TransformerEncoder( + d_model, n_head, hidden_dim, num_layers, seed=seed + ) self.pooling = AttentionPooling(d_model, seed=seed) - self.w_out = self.rng.standard_normal((d_model, output_dim)) * math.sqrt(2.0 / (d_model + output_dim)) + self.w_out = self.rng.standard_normal((d_model, output_dim)) * math.sqrt( + 2.0 / (d_model + output_dim) + ) self.b_out = np.zeros(output_dim) def _input_proj(self, x: np.ndarray) -> np.ndarray: return np.tensordot(x, self.w_in, axes=([2], [0])) + self.b_in - def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]: + def forward( + self, x: np.ndarray, mask: np.ndarray | None = None + ) -> tuple[np.ndarray, np.ndarray]: b, t, _ = x.shape t_idx = np.arange(t, dtype=float)[None, :, None] t_idx = np.tile(t_idx, (b, 1, 1)) From 0fc2b8e8e2a0331600debce7edfb1ae0d26a069a Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Wed, 22 Oct 2025 10:00:01 +0530 Subject: [PATCH 17/36] Update real_time_encoder_transformer.py --- .../real_time_encoder_transformer.py | 523 ++++++++++-------- 1 file changed, 301 insertions(+), 222 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 4a10e234a16f..681971db6551 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,235 +1,320 @@ from __future__ import annotations - import math +from typing import Optional import numpy as np import pandas as pd -# -------------------------------------------------- -# Utility functions -# -------------------------------------------------- -def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: - x_max = np.max(x, axis=axis, keepdims=True) - e = np.exp(x - x_max) - return e / (np.sum(e, axis=axis, keepdims=True) + 1e-12) +def _softmax(array: np.ndarray, axis: int = -1) -> np.ndarray: + max_val = np.max(array, axis=axis, keepdims=True) + exp = np.exp(array - max_val) + return exp / (np.sum(exp, axis=axis, keepdims=True) + 1e-12) -def _stable_div(x: np.ndarray, denom: np.ndarray) -> np.ndarray: - return x / (denom + 1e-12) +def _stable_div(numerator: np.ndarray, denominator: np.ndarray) -> np.ndarray: + return numerator 
/ (denominator + 1e-12) -# -------------------------------------------------- -# Time2Vec -# -------------------------------------------------- -class Time2Vec: - """Time2Vec positional encoding for real-valued time steps.""" +# ------------------------------- +# 🔹 Time2Vec +# ------------------------------- - def __init__(self, d_model: int, seed: int | None = None): +class Time2Vec: + def __init__(self, d_model: int, seed: Optional[int] = None) -> None: if d_model < 2: raise ValueError("d_model must be >= 2 for Time2Vec") + self.rng = np.random.default_rng(seed) - self.w0 = self.rng.standard_normal((1, 1)) - self.b0 = self.rng.standard_normal((1, 1)) - self.w = self.rng.standard_normal((1, d_model - 1)) - self.b = self.rng.standard_normal((1, d_model - 1)) - - def forward(self, time_steps: np.ndarray) -> np.ndarray: - """time_steps: (batch, seq_len, 1) or (batch, seq_len).""" - ts = time_steps if time_steps.ndim == 3 else time_steps[..., None] - linear = (self.w0 * ts) + self.b0 - periodic = np.sin((ts * self.w) + self.b) - return np.concatenate([linear, periodic], axis=-1) - - -# -------------------------------------------------- -# Positionwise FeedForward -# -------------------------------------------------- + self.w0: np.ndarray = self.rng.standard_normal((1, 1)) + self.b0: np.ndarray = self.rng.standard_normal((1, 1)) + self.w: np.ndarray = self.rng.standard_normal((1, d_model - 1)) + self.b: np.ndarray = self.rng.standard_normal((1, d_model - 1)) + + def forward(self, time_indices: np.ndarray) -> np.ndarray: + """ + Parameters + ---------- + time_indices : np.ndarray + Shape (batch, seq_len) or (batch, seq_len, 1) + + Returns + ------- + np.ndarray + Shape (batch, seq_len, d_model) + + Example + ------- + >>> t2v = Time2Vec(4, seed=0) + >>> ts = np.arange(3).reshape(1, 3, 1) + >>> out = t2v.forward(ts) + >>> out.shape + (1, 3, 4) + """ + if time_indices.ndim == 2: + time_indices = time_indices[..., None] + + linear_term = (self.w0 * time_indices) + self.b0 + periodic_term = np.sin((time_indices * self.w) + self.b) + return np.concatenate([linear_term, periodic_term], axis=-1) + + +# ------------------------------- +# 🔹 Positionwise FeedForward +# ------------------------------- + class PositionwiseFeedForward: - def __init__( - self, d_model: int, hidden: int, drop_prob: float = 0.0, seed: int | None = None - ): + def __init__(self, d_model: int, hidden_dim: int, drop_prob: float = 0.0, seed: Optional[int] = None) -> None: self.rng = np.random.default_rng(seed) - self.w1 = self.rng.standard_normal((d_model, hidden)) * math.sqrt( - 2.0 / (d_model + hidden) - ) - self.b1 = np.zeros(hidden) - self.w2 = self.rng.standard_normal((hidden, d_model)) * math.sqrt( - 2.0 / (hidden + d_model) - ) - self.b2 = np.zeros(d_model) - - def forward(self, x: np.ndarray) -> np.ndarray: - h = np.tensordot(x, self.w1, axes=([2], [0])) + self.b1 - h = np.maximum(h, 0.0) - return np.tensordot(h, self.w2, axes=([2], [0])) + self.b2 - - -# -------------------------------------------------- -# Scaled Dot-Product Attention -# -------------------------------------------------- + self.w1: np.ndarray = self.rng.standard_normal((d_model, hidden_dim)) * math.sqrt(2.0 / (d_model + hidden_dim)) + self.b1: np.ndarray = np.zeros((hidden_dim,)) + self.w2: np.ndarray = self.rng.standard_normal((hidden_dim, d_model)) * math.sqrt(2.0 / (hidden_dim + d_model)) + self.b2: np.ndarray = np.zeros((d_model,)) + + def forward(self, input_tensor: np.ndarray) -> np.ndarray: + hidden = np.tensordot(input_tensor, self.w1, axes=([2], 
[0])) + self.b1 + hidden = np.maximum(hidden, 0.0) # ReLU + output_tensor = np.tensordot(hidden, self.w2, axes=([2], [0])) + self.b2 + return output_tensor + + +# ------------------------------- +# 🔹 Scaled Dot-Product Attention +# ------------------------------- + class ScaledDotProductAttention: def forward( self, - q: np.ndarray, - k: np.ndarray, - v: np.ndarray, - mask: np.ndarray | None = None, + query: np.ndarray, + key: np.ndarray, + value: np.ndarray, + mask: Optional[np.ndarray] = None, ) -> tuple[np.ndarray, np.ndarray]: - b, n_head, seq_len, d_k = q.shape - scores = np.matmul(q, k.transpose(0, 1, 3, 2)) / math.sqrt(d_k) + batch_size, n_head, seq_len, d_k = query.shape + scores = np.matmul(query, key.transpose(0, 1, 3, 2)) / math.sqrt(d_k) if mask is not None: - mask2 = mask[:, None, None, :] if mask.ndim == 2 else mask - scores = np.where(mask2 == 0, -1e9, scores) + if mask.ndim == 2: + mask_reshaped = mask[:, None, None, :] + elif mask.ndim == 3: + mask_reshaped = mask[:, None, :, :] if mask.shape[1] != seq_len else mask[:, None, None, :] + else: + mask_reshaped = mask + scores = np.where(mask_reshaped == 0, -1e9, scores) + + attn_weights = _softmax(scores, axis=-1) + context = np.matmul(attn_weights, value) + return context, attn_weights - attn = _softmax(scores, axis=-1) - context = np.matmul(attn, v) - return context, attn +# ------------------------------- +# 🔹 Multi-Head Attention +# ------------------------------- -# -------------------------------------------------- -# Multi-Head Attention -# -------------------------------------------------- class MultiHeadAttention: - def __init__(self, d_model: int, n_head: int, seed: int | None = None): + def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None) -> None: if d_model % n_head != 0: raise ValueError("d_model must be divisible by n_head") + self.rng = np.random.default_rng(seed) self.d_model = d_model self.n_head = n_head self.d_k = d_model // n_head - self.rng = np.random.default_rng(seed) - self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (2 * d_model) - ) - self.b_q = np.zeros(d_model) - self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (2 * d_model) - ) - self.b_k = np.zeros(d_model) - self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (2 * d_model) - ) - self.b_v = np.zeros(d_model) - self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (2 * d_model) - ) - self.b_out = np.zeros(d_model) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.b_q = np.zeros((d_model,)) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.b_k = np.zeros((d_model,)) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.b_v = np.zeros((d_model,)) + self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.b_out = np.zeros((d_model,)) self.attn = ScaledDotProductAttention() - def _linear(self, x: np.ndarray, w: np.ndarray, b: np.ndarray) -> np.ndarray: - return np.tensordot(x, w, axes=([2], [0])) + b + def _linear(self, input_tensor: np.ndarray, weight: np.ndarray, bias: np.ndarray) -> np.ndarray: + return np.tensordot(input_tensor, weight, axes=([2], [0])) + bias - def _split_heads(self, x: np.ndarray) -> np.ndarray: - b, seq_len, _ = x.shape - return x.reshape(b, seq_len, self.n_head, self.d_k).transpose(0, 2, 1, 3) + def 
_split_heads(self, input_tensor: np.ndarray) -> np.ndarray: + batch_size, seq_len, _ = input_tensor.shape + return input_tensor.reshape(batch_size, seq_len, self.n_head, self.d_k).transpose(0, 2, 1, 3) - def _concat_heads(self, x: np.ndarray) -> np.ndarray: - b, n_head, seq_len, d_k = x.shape - return x.transpose(0, 2, 1, 3).reshape(b, seq_len, n_head * d_k) + def _concat_heads(self, input_tensor: np.ndarray) -> np.ndarray: + batch_size, n_head, seq_len, d_k = input_tensor.shape + return input_tensor.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, n_head * d_k) def forward( self, - query: np.ndarray, - key: np.ndarray, - value: np.ndarray, - mask: np.ndarray | None = None, + query_tensor: np.ndarray, + key_tensor: np.ndarray, + value_tensor: np.ndarray, + mask: Optional[np.ndarray] = None, ) -> tuple[np.ndarray, np.ndarray]: - q = self._linear(query, self.w_q, self.b_q) - k = self._linear(key, self.w_k, self.b_k) - v = self._linear(value, self.w_v, self.b_v) - qh, kh, vh = self._split_heads(q), self._split_heads(k), self._split_heads(v) - context, attn = self.attn.forward(qh, kh, vh, mask) + """ + Forward pass of multi-head attention. + + Returns + ------- + out: np.ndarray + Shape (batch, seq_len, d_model) + attn_weights: np.ndarray + Shape (batch, n_head, seq_len, seq_len) + """ + qh = self._split_heads(self._linear(query_tensor, self.w_q, self.b_q)) + kh = self._split_heads(self._linear(key_tensor, self.w_k, self.b_k)) + vh = self._split_heads(self._linear(value_tensor, self.w_v, self.b_v)) + + context, attn_weights = self.attn.forward(qh, kh, vh, mask) concat = self._concat_heads(context) - out = np.tensordot(concat, self.w_out, axes=([2], [0])) + self.b_out - return out, attn + out_tensor = np.tensordot(concat, self.w_out, axes=([2], [0])) + self.b_out + return out_tensor, attn_weights + +# ------------------------------- +# 🔹 LayerNorm +# ------------------------------- -# -------------------------------------------------- -# Layer Normalization -# -------------------------------------------------- class LayerNorm: - def __init__(self, d_model: int, eps: float = 1e-12): - self.gamma = np.ones(d_model) - self.beta = np.zeros(d_model) + def __init__(self, d_model: int, eps: float = 1e-12) -> None: + self.gamma: np.ndarray = np.ones((d_model,)) + self.beta: np.ndarray = np.zeros((d_model,)) self.eps = eps - def forward(self, x: np.ndarray) -> np.ndarray: - mean = np.mean(x, axis=-1, keepdims=True) - var = np.mean((x - mean) ** 2, axis=-1, keepdims=True) - x_norm = (x - mean) / np.sqrt(var + self.eps) - return self.gamma * x_norm + self.beta - + def forward(self, input_tensor: np.ndarray) -> np.ndarray: + mean = np.mean(input_tensor, axis=-1, keepdims=True) + var = np.mean((input_tensor - mean) ** 2, axis=-1, keepdims=True) + normalized_tensor = (input_tensor - mean) / np.sqrt(var + self.eps) + return self.gamma * normalized_tensor + self.beta +# ------------------------------- +# 🔹 Transformer Encoder Layer +# ------------------------------- -# -------------------------------------------------- -# Transformer Encoder Layer -# -------------------------------------------------- class TransformerEncoderLayer: - def __init__( - self, d_model: int, n_head: int, hidden_dim: int, seed: int | None = None - ): + def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None) -> None: self.self_attn = MultiHeadAttention(d_model, n_head, seed=seed) self.ffn = PositionwiseFeedForward(d_model, hidden_dim, seed=seed) self.norm1 = LayerNorm(d_model) self.norm2 = 
LayerNorm(d_model) - def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: - attn_out, _ = self.self_attn.forward(x, x, x, mask) - x2 = self.norm1.forward(x + attn_out) - ffn_out = self.ffn.forward(x2) - return self.norm2.forward(x2 + ffn_out) - + def forward(self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray: + """ + Forward pass for one encoder layer. + + Parameters + ---------- + encoded_input : np.ndarray + Shape (batch, seq_len, d_model) + mask : np.ndarray | None + Optional mask (batch, seq_len) + + Returns + ------- + np.ndarray + Shape (batch, seq_len, d_model) + + Example + ------- + >>> layer = TransformerEncoderLayer(d_model=4, n_head=2, hidden_dim=8, seed=0) + >>> x = np.random.randn(1, 3, 4) + >>> out = layer.forward(x) + >>> out.shape + (1, 3, 4) + """ + attn_output, _ = self.self_attn.forward(encoded_input, encoded_input, encoded_input, mask) + out1 = self.norm1.forward(encoded_input + attn_output) + ffn_output = self.ffn.forward(out1) + out2 = self.norm2.forward(out1 + ffn_output) + return out2 + + +# ------------------------------- +# 🔹 Transformer Encoder Stack +# ------------------------------- -# -------------------------------------------------- -# Transformer Encoder Stack -# -------------------------------------------------- class TransformerEncoder: - def __init__( - self, - d_model: int, - n_head: int, - hidden_dim: int, - num_layers: int, - seed: int | None = None, - ): - self.layers = [ - TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) - for _ in range(num_layers) - ] - - def forward(self, x: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: - out = x + def __init__(self, d_model: int, n_head: int, hidden_dim: int, num_layers: int, seed: Optional[int] = None) -> None: + self.layers = [TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) for _ in range(num_layers)] + + def forward(self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray: + """ + Forward pass for encoder stack. + + Parameters + ---------- + encoded_input : np.ndarray + Shape (batch, seq_len, d_model) + mask : np.ndarray | None + Optional mask + + Returns + ------- + np.ndarray + Shape (batch, seq_len, d_model) + + Example + ------- + >>> encoder = TransformerEncoder(d_model=4, n_head=2, hidden_dim=8, num_layers=2, seed=0) + >>> x = np.random.randn(1, 3, 4) + >>> out = encoder.forward(x) + >>> out.shape + (1, 3, 4) + """ + out = encoded_input for layer in self.layers: out = layer.forward(out, mask) return out -# -------------------------------------------------- -# Attention Pooling -# -------------------------------------------------- +# ------------------------------- +# 🔹 Attention Pooling +# ------------------------------- + class AttentionPooling: - def __init__(self, d_model: int, seed: int | None = None): + def __init__(self, d_model: int, seed: Optional[int] = None) -> None: self.rng = np.random.default_rng(seed) - self.w = self.rng.standard_normal(d_model) * math.sqrt(2.0 / d_model) - self.b = 0.0 - - def forward( - self, x: np.ndarray, mask: np.ndarray | None = None - ) -> tuple[np.ndarray, np.ndarray]: - scores = np.tensordot(x, self.w, axes=([2], [0])) + self.b + self.w: np.ndarray = self.rng.standard_normal(d_model) * math.sqrt(2.0 / d_model) + self.b: float = 0.0 + + def forward(self, encoded_features: np.ndarray, mask: Optional[np.ndarray] = None) -> tuple[np.ndarray, np.ndarray]: + """ + Attention-based pooling. 
+ + Parameters + ---------- + encoded_features : np.ndarray + Shape (batch, seq_len, d_model) + mask : np.ndarray | None + Optional mask (batch, seq_len), 1=valid, 0=pad + + Returns + ------- + pooled_output : np.ndarray + Shape (batch, d_model) + attention_weights : np.ndarray + Shape (batch, seq_len) + + Example + ------- + >>> pooling = AttentionPooling(d_model=4, seed=0) + >>> x = np.random.randn(1, 3, 4) + >>> pooled, weights = pooling.forward(x) + >>> pooled.shape + (1, 4) + >>> weights.shape + (1, 3) + """ + scores = np.tensordot(encoded_features, self.w, axes=([2], [0])) + self.b if mask is not None: scores = np.where(mask == 0, -1e9, scores) weights = _softmax(scores, axis=-1) - pooled = np.matmul(weights[:, None, :], x).squeeze(1) - return pooled, weights + pooled_output = np.matmul(weights[:, None, :], encoded_features).squeeze(1) + return pooled_output, weights -# -------------------------------------------------- -# EEG Transformer (forward-only) -# -------------------------------------------------- +# ------------------------------- +# 🔹 EEG Transformer +# ------------------------------- + class EEGTransformer: def __init__( self, @@ -240,72 +325,66 @@ def __init__( num_layers: int = 4, output_dim: int = 1, task_type: str = "regression", - seed: int | None = None, - ): + seed: Optional[int] = None, + ) -> None: self.rng = np.random.default_rng(seed) self.feature_dim = feature_dim self.d_model = d_model self.task_type = task_type - self.w_in = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt( - 2.0 / (feature_dim + d_model) - ) - self.b_in = np.zeros(d_model) + self.w_in: np.ndarray = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt(2.0 / (feature_dim + d_model)) + self.b_in: np.ndarray = np.zeros((d_model,)) + self.time2vec = Time2Vec(d_model, seed=seed) - self.encoder = TransformerEncoder( - d_model, n_head, hidden_dim, num_layers, seed=seed - ) + self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed=seed) self.pooling = AttentionPooling(d_model, seed=seed) - self.w_out = self.rng.standard_normal((d_model, output_dim)) * math.sqrt( - 2.0 / (d_model + output_dim) - ) - self.b_out = np.zeros(output_dim) - def _input_proj(self, x: np.ndarray) -> np.ndarray: - return np.tensordot(x, self.w_in, axes=([2], [0])) + self.b_in - - def forward( - self, x: np.ndarray, mask: np.ndarray | None = None - ) -> tuple[np.ndarray, np.ndarray]: - b, t, _ = x.shape - t_idx = np.arange(t, dtype=float)[None, :, None] - t_idx = np.tile(t_idx, (b, 1, 1)) - time_emb = self.time2vec.forward(t_idx) - x_proj = self._input_proj(x) + time_emb - enc = self.encoder.forward(x_proj, mask) - pooled, attn_weights = self.pooling.forward(enc, mask) - out = np.tensordot(pooled, self.w_out, axes=([1], [0])) + self.b_out + self.w_out: np.ndarray = self.rng.standard_normal((d_model, output_dim)) * math.sqrt(2.0 / (d_model + output_dim)) + self.b_out: np.ndarray = np.zeros((output_dim,)) + + def _input_projection(self, input_tensor: np.ndarray) -> np.ndarray: + return np.tensordot(input_tensor, self.w_in, axes=([2], [0])) + self.b_in + + def forward(self, input_tensor: np.ndarray, mask: Optional[np.ndarray] = None) -> tuple[np.ndarray, np.ndarray]: + """ + Forward pass for EEG Transformer. 
+ + Parameters + ---------- + input_tensor : np.ndarray + Shape (batch, seq_len, feature_dim) + mask : np.ndarray | None + Optional mask (batch, seq_len), 1=valid, 0=pad + + Returns + ------- + output_tensor : np.ndarray + Shape (batch, output_dim) + attention_weights : np.ndarray + Shape (batch, seq_len) + + Example + ------- + >>> model = EEGTransformer(feature_dim=8, d_model=32, n_head=4, hidden_dim=64, num_layers=2, output_dim=1, seed=0) + >>> x = np.random.randn(2, 10, 8) + >>> out, attn = model.forward(x) + >>> out.shape + (2, 1) + >>> attn.shape + (2, 10) + """ + batch_size, seq_len, _ = input_tensor.shape + time_indices = np.arange(seq_len, dtype=float)[None, :, None] + time_indices = np.tile(time_indices, (batch_size, 1, 1)) + + time_embedding = self.time2vec.forward(time_indices) + projected_input = self._input_projection(input_tensor) + time_embedding + + encoded_features = self.encoder.forward(projected_input, mask) + pooled_output, attention_weights = self.pooling.forward(encoded_features, mask) + + output_tensor = np.tensordot(pooled_output, self.w_out, axes=([1], [0])) + self.b_out if self.task_type == "classification": - out = _softmax(out, axis=-1) - return out, attn_weights - - -# -------------------------------------------------- -# Example usage -# -------------------------------------------------- -if __name__ == "__main__": - batch, seq_len, feature_dim = 2, 10, 8 - rng = np.random.default_rng(42) - X = rng.standard_normal((batch, seq_len, feature_dim)) - - model = EEGTransformer( - feature_dim=feature_dim, - d_model=32, - n_head=4, - hidden_dim=64, - num_layers=2, - output_dim=1, - seed=0, - ) - out, attn_weights = model.forward(X) - print("Output shape:", out.shape) - print("Output:", out) - print("Pooling attn shape:", attn_weights.shape) - - # Example with pandas DataFrame - channels = [f"ch{i}" for i in range(feature_dim)] - df = pd.DataFrame(rng.standard_normal((seq_len, feature_dim)), columns=channels) - trial_np = df[channels].to_numpy().reshape(1, seq_len, feature_dim) - out2, attn2 = model.forward(trial_np) - print("Single-trial output:", out2) - print("Single-trial pooling attn:", attn2) + output_tensor = _softmax(output_tensor, axis=-1) + + return output_tensor, attention_weights From f10a2ea63a6a92e03cb636f46a51611ca4be8c3b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 04:30:37 +0000 Subject: [PATCH 18/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../real_time_encoder_transformer.py | 119 ++++++++++++++---- 1 file changed, 95 insertions(+), 24 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 681971db6551..cf75f88f2e90 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -20,6 +20,7 @@ def _stable_div(numerator: np.ndarray, denominator: np.ndarray) -> np.ndarray: # 🔹 Time2Vec # ------------------------------- + class Time2Vec: def __init__(self, d_model: int, seed: Optional[int] = None) -> None: if d_model < 2: @@ -63,12 +64,23 @@ def forward(self, time_indices: np.ndarray) -> np.ndarray: # 🔹 Positionwise FeedForward # ------------------------------- + class PositionwiseFeedForward: - def __init__(self, d_model: int, hidden_dim: int, drop_prob: float = 0.0, seed: Optional[int] = None) -> None: + def __init__( + self, + d_model: int, + hidden_dim: int, + drop_prob: 
float = 0.0, + seed: Optional[int] = None, + ) -> None: self.rng = np.random.default_rng(seed) - self.w1: np.ndarray = self.rng.standard_normal((d_model, hidden_dim)) * math.sqrt(2.0 / (d_model + hidden_dim)) + self.w1: np.ndarray = self.rng.standard_normal( + (d_model, hidden_dim) + ) * math.sqrt(2.0 / (d_model + hidden_dim)) self.b1: np.ndarray = np.zeros((hidden_dim,)) - self.w2: np.ndarray = self.rng.standard_normal((hidden_dim, d_model)) * math.sqrt(2.0 / (hidden_dim + d_model)) + self.w2: np.ndarray = self.rng.standard_normal( + (hidden_dim, d_model) + ) * math.sqrt(2.0 / (hidden_dim + d_model)) self.b2: np.ndarray = np.zeros((d_model,)) def forward(self, input_tensor: np.ndarray) -> np.ndarray: @@ -82,6 +94,7 @@ def forward(self, input_tensor: np.ndarray) -> np.ndarray: # 🔹 Scaled Dot-Product Attention # ------------------------------- + class ScaledDotProductAttention: def forward( self, @@ -97,7 +110,11 @@ def forward( if mask.ndim == 2: mask_reshaped = mask[:, None, None, :] elif mask.ndim == 3: - mask_reshaped = mask[:, None, :, :] if mask.shape[1] != seq_len else mask[:, None, None, :] + mask_reshaped = ( + mask[:, None, :, :] + if mask.shape[1] != seq_len + else mask[:, None, None, :] + ) else: mask_reshaped = mask scores = np.where(mask_reshaped == 0, -1e9, scores) @@ -111,6 +128,7 @@ def forward( # 🔹 Multi-Head Attention # ------------------------------- + class MultiHeadAttention: def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None) -> None: if d_model % n_head != 0: @@ -121,27 +139,41 @@ def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None) -> Non self.n_head = n_head self.d_k = d_model // n_head - self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_q = np.zeros((d_model,)) - self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_k = np.zeros((d_model,)) - self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_v = np.zeros((d_model,)) - self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_out = np.zeros((d_model,)) self.attn = ScaledDotProductAttention() - def _linear(self, input_tensor: np.ndarray, weight: np.ndarray, bias: np.ndarray) -> np.ndarray: + def _linear( + self, input_tensor: np.ndarray, weight: np.ndarray, bias: np.ndarray + ) -> np.ndarray: return np.tensordot(input_tensor, weight, axes=([2], [0])) + bias def _split_heads(self, input_tensor: np.ndarray) -> np.ndarray: batch_size, seq_len, _ = input_tensor.shape - return input_tensor.reshape(batch_size, seq_len, self.n_head, self.d_k).transpose(0, 2, 1, 3) + return input_tensor.reshape( + batch_size, seq_len, self.n_head, self.d_k + ).transpose(0, 2, 1, 3) def _concat_heads(self, input_tensor: np.ndarray) -> np.ndarray: batch_size, n_head, seq_len, d_k = input_tensor.shape - return input_tensor.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, n_head * d_k) + return input_tensor.transpose(0, 2, 1, 3).reshape( + batch_size, seq_len, n_head * d_k + ) def forward( self, @@ 
-174,6 +206,7 @@ def forward( # 🔹 LayerNorm # ------------------------------- + class LayerNorm: def __init__(self, d_model: int, eps: float = 1e-12) -> None: self.gamma: np.ndarray = np.ones((d_model,)) @@ -185,18 +218,25 @@ def forward(self, input_tensor: np.ndarray) -> np.ndarray: var = np.mean((input_tensor - mean) ** 2, axis=-1, keepdims=True) normalized_tensor = (input_tensor - mean) / np.sqrt(var + self.eps) return self.gamma * normalized_tensor + self.beta + + # ------------------------------- # 🔹 Transformer Encoder Layer # ------------------------------- + class TransformerEncoderLayer: - def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None) -> None: + def __init__( + self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None + ) -> None: self.self_attn = MultiHeadAttention(d_model, n_head, seed=seed) self.ffn = PositionwiseFeedForward(d_model, hidden_dim, seed=seed) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) - def forward(self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray: + def forward( + self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None + ) -> np.ndarray: """ Forward pass for one encoder layer. @@ -220,7 +260,9 @@ def forward(self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None) >>> out.shape (1, 3, 4) """ - attn_output, _ = self.self_attn.forward(encoded_input, encoded_input, encoded_input, mask) + attn_output, _ = self.self_attn.forward( + encoded_input, encoded_input, encoded_input, mask + ) out1 = self.norm1.forward(encoded_input + attn_output) ffn_output = self.ffn.forward(out1) out2 = self.norm2.forward(out1 + ffn_output) @@ -231,11 +273,24 @@ def forward(self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None) # 🔹 Transformer Encoder Stack # ------------------------------- + class TransformerEncoder: - def __init__(self, d_model: int, n_head: int, hidden_dim: int, num_layers: int, seed: Optional[int] = None) -> None: - self.layers = [TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) for _ in range(num_layers)] + def __init__( + self, + d_model: int, + n_head: int, + hidden_dim: int, + num_layers: int, + seed: Optional[int] = None, + ) -> None: + self.layers = [ + TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) + for _ in range(num_layers) + ] - def forward(self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray: + def forward( + self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None + ) -> np.ndarray: """ Forward pass for encoder stack. @@ -269,13 +324,18 @@ def forward(self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None) # 🔹 Attention Pooling # ------------------------------- + class AttentionPooling: def __init__(self, d_model: int, seed: Optional[int] = None) -> None: self.rng = np.random.default_rng(seed) - self.w: np.ndarray = self.rng.standard_normal(d_model) * math.sqrt(2.0 / d_model) + self.w: np.ndarray = self.rng.standard_normal(d_model) * math.sqrt( + 2.0 / d_model + ) self.b: float = 0.0 - def forward(self, encoded_features: np.ndarray, mask: Optional[np.ndarray] = None) -> tuple[np.ndarray, np.ndarray]: + def forward( + self, encoded_features: np.ndarray, mask: Optional[np.ndarray] = None + ) -> tuple[np.ndarray, np.ndarray]: """ Attention-based pooling. 
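+        Each time step is scored with a learned vector, padded positions are
+        masked out, and the softmax-normalized scores weight the sum over time.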
@@ -315,6 +375,7 @@ def forward(self, encoded_features: np.ndarray, mask: Optional[np.ndarray] = Non # 🔹 EEG Transformer # ------------------------------- + class EEGTransformer: def __init__( self, @@ -332,20 +393,28 @@ def __init__( self.d_model = d_model self.task_type = task_type - self.w_in: np.ndarray = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt(2.0 / (feature_dim + d_model)) + self.w_in: np.ndarray = self.rng.standard_normal( + (feature_dim, d_model) + ) * math.sqrt(2.0 / (feature_dim + d_model)) self.b_in: np.ndarray = np.zeros((d_model,)) self.time2vec = Time2Vec(d_model, seed=seed) - self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed=seed) + self.encoder = TransformerEncoder( + d_model, n_head, hidden_dim, num_layers, seed=seed + ) self.pooling = AttentionPooling(d_model, seed=seed) - self.w_out: np.ndarray = self.rng.standard_normal((d_model, output_dim)) * math.sqrt(2.0 / (d_model + output_dim)) + self.w_out: np.ndarray = self.rng.standard_normal( + (d_model, output_dim) + ) * math.sqrt(2.0 / (d_model + output_dim)) self.b_out: np.ndarray = np.zeros((output_dim,)) def _input_projection(self, input_tensor: np.ndarray) -> np.ndarray: return np.tensordot(input_tensor, self.w_in, axes=([2], [0])) + self.b_in - def forward(self, input_tensor: np.ndarray, mask: Optional[np.ndarray] = None) -> tuple[np.ndarray, np.ndarray]: + def forward( + self, input_tensor: np.ndarray, mask: Optional[np.ndarray] = None + ) -> tuple[np.ndarray, np.ndarray]: """ Forward pass for EEG Transformer. @@ -383,7 +452,9 @@ def forward(self, input_tensor: np.ndarray, mask: Optional[np.ndarray] = None) - encoded_features = self.encoder.forward(projected_input, mask) pooled_output, attention_weights = self.pooling.forward(encoded_features, mask) - output_tensor = np.tensordot(pooled_output, self.w_out, axes=([1], [0])) + self.b_out + output_tensor = ( + np.tensordot(pooled_output, self.w_out, axes=([1], [0])) + self.b_out + ) if self.task_type == "classification": output_tensor = _softmax(output_tensor, axis=-1) From 86e48488f33c4c9822760586823ee79f0ab87138 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Wed, 22 Oct 2025 10:03:13 +0530 Subject: [PATCH 19/36] Update real_time_encoder_transformer.py --- .../real_time_encoder_transformer.py | 429 +++++++----------- 1 file changed, 168 insertions(+), 261 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index cf75f88f2e90..6e42dd3801e1 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,123 +1,120 @@ +# ------------------------------- +# 🔹 Imports +# ------------------------------- from __future__ import annotations import math from typing import Optional import numpy as np -import pandas as pd -def _softmax(array: np.ndarray, axis: int = -1) -> np.ndarray: - max_val = np.max(array, axis=axis, keepdims=True) - exp = np.exp(array - max_val) - return exp / (np.sum(exp, axis=axis, keepdims=True) + 1e-12) +# ------------------------------- +# 🔹 Helper functions +# ------------------------------- +def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: + x_max = np.max(x, axis=axis, keepdims=True) + e = np.exp(x - x_max) + return e / (np.sum(e, axis=axis, keepdims=True) + 1e-12) -def _stable_div(numerator: np.ndarray, denominator: np.ndarray) -> np.ndarray: - return numerator / (denominator + 1e-12) +def _stable_div(x: np.ndarray, denom: np.ndarray) -> np.ndarray: + return x / 
(denom + 1e-12) # ------------------------------- # 🔹 Time2Vec # ------------------------------- - - class Time2Vec: + """Time2Vec positional encoding for real-valued time steps.""" + def __init__(self, d_model: int, seed: Optional[int] = None) -> None: + if seed is not None: + self.rng = np.random.default_rng(seed) + else: + self.rng = np.random.default_rng() + if d_model < 2: raise ValueError("d_model must be >= 2 for Time2Vec") - self.rng = np.random.default_rng(seed) - self.w0: np.ndarray = self.rng.standard_normal((1, 1)) - self.b0: np.ndarray = self.rng.standard_normal((1, 1)) - self.w: np.ndarray = self.rng.standard_normal((1, d_model - 1)) - self.b: np.ndarray = self.rng.standard_normal((1, d_model - 1)) + self.w0 = self.rng.standard_normal((1, 1)) + self.b0 = self.rng.standard_normal((1, 1)) + self.w = self.rng.standard_normal((1, d_model - 1)) + self.b = self.rng.standard_normal((1, d_model - 1)) - def forward(self, time_indices: np.ndarray) -> np.ndarray: + def forward(self, time_steps: np.ndarray) -> np.ndarray: """ Parameters ---------- - time_indices : np.ndarray - Shape (batch, seq_len) or (batch, seq_len, 1) + time_steps : np.ndarray + Shape (batch, seq_len, 1) or (batch, seq_len) Returns ------- np.ndarray Shape (batch, seq_len, d_model) - - Example - ------- - >>> t2v = Time2Vec(4, seed=0) - >>> ts = np.arange(3).reshape(1, 3, 1) - >>> out = t2v.forward(ts) - >>> out.shape - (1, 3, 4) """ - if time_indices.ndim == 2: - time_indices = time_indices[..., None] - - linear_term = (self.w0 * time_indices) + self.b0 - periodic_term = np.sin((time_indices * self.w) + self.b) - return np.concatenate([linear_term, periodic_term], axis=-1) + ts = time_steps if time_steps.ndim == 3 else time_steps[..., None] + linear = (self.w0 * ts) + self.b0 + periodic = np.sin((ts * self.w) + self.b) + return np.concatenate([linear, periodic], axis=-1) # ------------------------------- -# 🔹 Positionwise FeedForward +# 🔹 PositionwiseFeedForward # ------------------------------- - - class PositionwiseFeedForward: def __init__( - self, - d_model: int, - hidden_dim: int, - drop_prob: float = 0.0, - seed: Optional[int] = None, + self, d_model: int, hidden: int, drop_prob: float = 0.0, + seed: Optional[int] = None ) -> None: self.rng = np.random.default_rng(seed) - self.w1: np.ndarray = self.rng.standard_normal( - (d_model, hidden_dim) - ) * math.sqrt(2.0 / (d_model + hidden_dim)) - self.b1: np.ndarray = np.zeros((hidden_dim,)) - self.w2: np.ndarray = self.rng.standard_normal( - (hidden_dim, d_model) - ) * math.sqrt(2.0 / (hidden_dim + d_model)) - self.b2: np.ndarray = np.zeros((d_model,)) + self.w1 = self.rng.standard_normal((d_model, hidden)) * math.sqrt(2.0 / (d_model + hidden)) + self.b1 = np.zeros((hidden,)) + self.w2 = self.rng.standard_normal((hidden, d_model)) * math.sqrt(2.0 / (hidden + d_model)) + self.b2 = np.zeros((d_model,)) def forward(self, input_tensor: np.ndarray) -> np.ndarray: - hidden = np.tensordot(input_tensor, self.w1, axes=([2], [0])) + self.b1 - hidden = np.maximum(hidden, 0.0) # ReLU - output_tensor = np.tensordot(hidden, self.w2, axes=([2], [0])) + self.b2 - return output_tensor + """ + Parameters + ---------- + input_tensor : np.ndarray + Shape (batch, seq_len, d_model) + + Returns + ------- + np.ndarray + Shape (batch, seq_len, d_model) + """ + h = np.tensordot(input_tensor, self.w1, axes=([2], [0])) + self.b1 + h = np.maximum(h, 0.0) + out = np.tensordot(h, self.w2, axes=([2], [0])) + self.b2 + return out # ------------------------------- -# 🔹 Scaled Dot-Product Attention 
+# 🔹 ScaledDotProductAttention # ------------------------------- - - class ScaledDotProductAttention: def forward( - self, - query: np.ndarray, - key: np.ndarray, - value: np.ndarray, - mask: Optional[np.ndarray] = None, + self, query: np.ndarray, key: np.ndarray, value: np.ndarray, + mask: np.ndarray | None = None ) -> tuple[np.ndarray, np.ndarray]: + """ + Compute scaled dot-product attention. + + Returns + ------- + context : np.ndarray + Shape (batch, n_head, seq_len, d_k) + attn_weights : np.ndarray + Shape (batch, n_head, seq_len, seq_len) + """ batch_size, n_head, seq_len, d_k = query.shape scores = np.matmul(query, key.transpose(0, 1, 3, 2)) / math.sqrt(d_k) if mask is not None: - if mask.ndim == 2: - mask_reshaped = mask[:, None, None, :] - elif mask.ndim == 3: - mask_reshaped = ( - mask[:, None, :, :] - if mask.shape[1] != seq_len - else mask[:, None, None, :] - ) - else: - mask_reshaped = mask - scores = np.where(mask_reshaped == 0, -1e9, scores) + mask2 = mask[:, None, None, :] if mask.ndim == 2 else mask + scores = np.where(mask2 == 0, -1e9, scores) attn_weights = _softmax(scores, axis=-1) context = np.matmul(attn_weights, value) @@ -125,257 +122,189 @@ def forward( # ------------------------------- -# 🔹 Multi-Head Attention +# 🔹 MultiHeadAttention # ------------------------------- - - class MultiHeadAttention: def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None) -> None: if d_model % n_head != 0: raise ValueError("d_model must be divisible by n_head") - self.rng = np.random.default_rng(seed) + self.rng = np.random.default_rng(seed) self.d_model = d_model self.n_head = n_head self.d_k = d_model // n_head - self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (d_model + d_model) - ) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) self.b_q = np.zeros((d_model,)) - self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (d_model + d_model) - ) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) self.b_k = np.zeros((d_model,)) - self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (d_model + d_model) - ) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) self.b_v = np.zeros((d_model,)) - self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (d_model + d_model) - ) + self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) self.b_out = np.zeros((d_model,)) self.attn = ScaledDotProductAttention() - def _linear( - self, input_tensor: np.ndarray, weight: np.ndarray, bias: np.ndarray - ) -> np.ndarray: - return np.tensordot(input_tensor, weight, axes=([2], [0])) + bias + def _linear(self, x: np.ndarray, weight: np.ndarray, bias: np.ndarray) -> np.ndarray: + return np.tensordot(x, weight, axes=([2], [0])) + bias - def _split_heads(self, input_tensor: np.ndarray) -> np.ndarray: - batch_size, seq_len, _ = input_tensor.shape - return input_tensor.reshape( - batch_size, seq_len, self.n_head, self.d_k - ).transpose(0, 2, 1, 3) + def _split_heads(self, x: np.ndarray) -> np.ndarray: + batch_size, seq_len, _ = x.shape + return x.reshape(batch_size, seq_len, self.n_head, self.d_k).transpose(0, 2, 1, 3) - def _concat_heads(self, input_tensor: np.ndarray) -> np.ndarray: - batch_size, n_head, seq_len, d_k = input_tensor.shape - return input_tensor.transpose(0, 2, 1, 3).reshape( - batch_size, seq_len, n_head * 
d_k - ) + def _concat_heads(self, x: np.ndarray) -> np.ndarray: + batch_size, n_head, seq_len, d_k = x.shape + return x.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, n_head * d_k) def forward( - self, - query_tensor: np.ndarray, - key_tensor: np.ndarray, - value_tensor: np.ndarray, - mask: Optional[np.ndarray] = None, + self, query: np.ndarray, key: np.ndarray, value: np.ndarray, + mask: np.ndarray | None = None ) -> tuple[np.ndarray, np.ndarray]: """ - Forward pass of multi-head attention. + Parameters + ---------- + query/key/value : np.ndarray + Shape (batch, seq_len, d_model) + mask : np.ndarray | None + Optional mask Returns ------- - out: np.ndarray + out : np.ndarray Shape (batch, seq_len, d_model) - attn_weights: np.ndarray + attn_weights : np.ndarray Shape (batch, n_head, seq_len, seq_len) """ - qh = self._split_heads(self._linear(query_tensor, self.w_q, self.b_q)) - kh = self._split_heads(self._linear(key_tensor, self.w_k, self.b_k)) - vh = self._split_heads(self._linear(value_tensor, self.w_v, self.b_v)) + q = self._linear(query, self.w_q, self.b_q) + k = self._linear(key, self.w_k, self.b_k) + v = self._linear(value, self.w_v, self.b_v) + qh, kh, vh = self._split_heads(q), self._split_heads(k), self._split_heads(v) context, attn_weights = self.attn.forward(qh, kh, vh, mask) concat = self._concat_heads(context) - out_tensor = np.tensordot(concat, self.w_out, axes=([2], [0])) + self.b_out - return out_tensor, attn_weights - - + out = np.tensordot(concat, self.w_out, axes=([2], [0])) + self.b_out + return out, attn_weights # ------------------------------- # 🔹 LayerNorm # ------------------------------- - - class LayerNorm: def __init__(self, d_model: int, eps: float = 1e-12) -> None: - self.gamma: np.ndarray = np.ones((d_model,)) - self.beta: np.ndarray = np.zeros((d_model,)) + self.gamma = np.ones((d_model,)) + self.beta = np.zeros((d_model,)) self.eps = eps def forward(self, input_tensor: np.ndarray) -> np.ndarray: + """ + Parameters + ---------- + input_tensor : np.ndarray + Shape (batch, seq_len, d_model) + + Returns + ------- + np.ndarray + Layer-normalized tensor of same shape + """ mean = np.mean(input_tensor, axis=-1, keepdims=True) var = np.mean((input_tensor - mean) ** 2, axis=-1, keepdims=True) - normalized_tensor = (input_tensor - mean) / np.sqrt(var + self.eps) - return self.gamma * normalized_tensor + self.beta + x_norm = (input_tensor - mean) / np.sqrt(var + self.eps) + return self.gamma * x_norm + self.beta # ------------------------------- -# 🔹 Transformer Encoder Layer +# 🔹 TransformerEncoderLayer # ------------------------------- - - class TransformerEncoderLayer: - def __init__( - self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None - ) -> None: - self.self_attn = MultiHeadAttention(d_model, n_head, seed=seed) + def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None) -> None: + self.self_attn = MultiHeadAttention(d_model, n_head, seed) self.ffn = PositionwiseFeedForward(d_model, hidden_dim, seed=seed) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) - def forward( - self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None - ) -> np.ndarray: + def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: """ - Forward pass for one encoder layer. 
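+        Apply self-attention and the position-wise feed-forward block, each
+        followed by a residual connection and LayerNorm.
+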
- Parameters ---------- - encoded_input : np.ndarray + input_tensor : np.ndarray Shape (batch, seq_len, d_model) mask : np.ndarray | None - Optional mask (batch, seq_len) + Optional attention mask Returns ------- np.ndarray Shape (batch, seq_len, d_model) - - Example - ------- - >>> layer = TransformerEncoderLayer(d_model=4, n_head=2, hidden_dim=8, seed=0) - >>> x = np.random.randn(1, 3, 4) - >>> out = layer.forward(x) - >>> out.shape - (1, 3, 4) """ - attn_output, _ = self.self_attn.forward( - encoded_input, encoded_input, encoded_input, mask - ) - out1 = self.norm1.forward(encoded_input + attn_output) - ffn_output = self.ffn.forward(out1) - out2 = self.norm2.forward(out1 + ffn_output) - return out2 + attn_out, _ = self.self_attn.forward(input_tensor, input_tensor, input_tensor, mask) + x_norm1 = self.norm1.forward(input_tensor + attn_out) + ffn_out = self.ffn.forward(x_norm1) + x_norm2 = self.norm2.forward(x_norm1 + ffn_out) + return x_norm2 # ------------------------------- -# 🔹 Transformer Encoder Stack +# 🔹 TransformerEncoder (stack) # ------------------------------- - - class TransformerEncoder: - def __init__( - self, - d_model: int, - n_head: int, - hidden_dim: int, - num_layers: int, - seed: Optional[int] = None, - ) -> None: - self.layers = [ - TransformerEncoderLayer(d_model, n_head, hidden_dim, seed=seed) - for _ in range(num_layers) - ] + def __init__(self, d_model: int, n_head: int, hidden_dim: int, num_layers: int, seed: Optional[int] = None) -> None: + self.layers = [TransformerEncoderLayer(d_model, n_head, hidden_dim, seed) for _ in range(num_layers)] - def forward( - self, encoded_input: np.ndarray, mask: Optional[np.ndarray] = None - ) -> np.ndarray: + def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: """ - Forward pass for encoder stack. - Parameters ---------- - encoded_input : np.ndarray + input_tensor : np.ndarray Shape (batch, seq_len, d_model) mask : np.ndarray | None - Optional mask + Optional attention mask Returns ------- np.ndarray Shape (batch, seq_len, d_model) - - Example - ------- - >>> encoder = TransformerEncoder(d_model=4, n_head=2, hidden_dim=8, num_layers=2, seed=0) - >>> x = np.random.randn(1, 3, 4) - >>> out = encoder.forward(x) - >>> out.shape - (1, 3, 4) """ - out = encoded_input + output = input_tensor for layer in self.layers: - out = layer.forward(out, mask) - return out + output = layer.forward(output, mask) + return output # ------------------------------- -# 🔹 Attention Pooling +# 🔹 AttentionPooling # ------------------------------- - - class AttentionPooling: def __init__(self, d_model: int, seed: Optional[int] = None) -> None: self.rng = np.random.default_rng(seed) - self.w: np.ndarray = self.rng.standard_normal(d_model) * math.sqrt( - 2.0 / d_model - ) - self.b: float = 0.0 + self.w = self.rng.standard_normal((d_model,)) * math.sqrt(2.0 / d_model) + self.b = 0.0 - def forward( - self, encoded_features: np.ndarray, mask: Optional[np.ndarray] = None - ) -> tuple[np.ndarray, np.ndarray]: + def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]: """ - Attention-based pooling. 
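+        Compute a softmax-normalized score for every time step and return the
+        score-weighted sum together with the attention weights.
+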
- Parameters ---------- - encoded_features : np.ndarray + input_tensor : np.ndarray Shape (batch, seq_len, d_model) mask : np.ndarray | None - Optional mask (batch, seq_len), 1=valid, 0=pad + Shape (batch, seq_len) where 1=valid, 0=pad Returns ------- - pooled_output : np.ndarray + pooled : np.ndarray Shape (batch, d_model) - attention_weights : np.ndarray + attn_weights : np.ndarray Shape (batch, seq_len) - - Example - ------- - >>> pooling = AttentionPooling(d_model=4, seed=0) - >>> x = np.random.randn(1, 3, 4) - >>> pooled, weights = pooling.forward(x) - >>> pooled.shape - (1, 4) - >>> weights.shape - (1, 3) """ - scores = np.tensordot(encoded_features, self.w, axes=([2], [0])) + self.b + scores = np.tensordot(input_tensor, self.w, axes=([2], [0])) + self.b if mask is not None: scores = np.where(mask == 0, -1e9, scores) - weights = _softmax(scores, axis=-1) - pooled_output = np.matmul(weights[:, None, :], encoded_features).squeeze(1) - return pooled_output, weights + attn_weights = _softmax(scores, axis=-1) + pooled = np.matmul(attn_weights[:, None, :], input_tensor).squeeze(1) + return pooled, attn_weights # ------------------------------- -# 🔹 EEG Transformer +# 🔹 EEGTransformer # ------------------------------- - - class EEGTransformer: def __init__( self, @@ -386,76 +315,54 @@ def __init__( num_layers: int = 4, output_dim: int = 1, task_type: str = "regression", - seed: Optional[int] = None, + seed: Optional[int] = None ) -> None: self.rng = np.random.default_rng(seed) self.feature_dim = feature_dim self.d_model = d_model self.task_type = task_type - self.w_in: np.ndarray = self.rng.standard_normal( - (feature_dim, d_model) - ) * math.sqrt(2.0 / (feature_dim + d_model)) - self.b_in: np.ndarray = np.zeros((d_model,)) + self.w_in = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt(2.0 / (feature_dim + d_model)) + self.b_in = np.zeros((d_model,)) - self.time2vec = Time2Vec(d_model, seed=seed) - self.encoder = TransformerEncoder( - d_model, n_head, hidden_dim, num_layers, seed=seed - ) - self.pooling = AttentionPooling(d_model, seed=seed) + self.time2vec = Time2Vec(d_model, seed) + self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed) + self.pooling = AttentionPooling(d_model, seed) - self.w_out: np.ndarray = self.rng.standard_normal( - (d_model, output_dim) - ) * math.sqrt(2.0 / (d_model + output_dim)) - self.b_out: np.ndarray = np.zeros((output_dim,)) + self.w_out = self.rng.standard_normal((d_model, output_dim)) * math.sqrt(2.0 / (d_model + output_dim)) + self.b_out = np.zeros((output_dim,)) - def _input_projection(self, input_tensor: np.ndarray) -> np.ndarray: - return np.tensordot(input_tensor, self.w_in, axes=([2], [0])) + self.b_in + def _input_proj(self, features: np.ndarray) -> np.ndarray: + return np.tensordot(features, self.w_in, axes=([2], [0])) + self.b_in - def forward( - self, input_tensor: np.ndarray, mask: Optional[np.ndarray] = None - ) -> tuple[np.ndarray, np.ndarray]: + def forward(self, features: np.ndarray, mask: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]: """ - Forward pass for EEG Transformer. 
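+        Project the input features, add Time2Vec embeddings, run the encoder
+        stack, pool with attention and apply the output head (softmax when
+        task_type == "classification").
+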
- Parameters ---------- - input_tensor : np.ndarray + features : np.ndarray Shape (batch, seq_len, feature_dim) mask : np.ndarray | None - Optional mask (batch, seq_len), 1=valid, 0=pad + Optional mask Returns ------- - output_tensor : np.ndarray + output : np.ndarray Shape (batch, output_dim) - attention_weights : np.ndarray + attn_weights : np.ndarray Shape (batch, seq_len) - - Example - ------- - >>> model = EEGTransformer(feature_dim=8, d_model=32, n_head=4, hidden_dim=64, num_layers=2, output_dim=1, seed=0) - >>> x = np.random.randn(2, 10, 8) - >>> out, attn = model.forward(x) - >>> out.shape - (2, 1) - >>> attn.shape - (2, 10) """ - batch_size, seq_len, _ = input_tensor.shape + batch_size, seq_len, _ = features.shape time_indices = np.arange(seq_len, dtype=float)[None, :, None] time_indices = np.tile(time_indices, (batch_size, 1, 1)) - time_embedding = self.time2vec.forward(time_indices) - projected_input = self._input_projection(input_tensor) + time_embedding + time_emb = self.time2vec.forward(time_indices) + x_proj = self._input_proj(features) + time_emb - encoded_features = self.encoder.forward(projected_input, mask) - pooled_output, attention_weights = self.pooling.forward(encoded_features, mask) + enc_out = self.encoder.forward(x_proj, mask) + pooled, attn_weights = self.pooling.forward(enc_out, mask) - output_tensor = ( - np.tensordot(pooled_output, self.w_out, axes=([1], [0])) + self.b_out - ) + output = np.tensordot(pooled, self.w_out, axes=([1], [0])) + self.b_out if self.task_type == "classification": - output_tensor = _softmax(output_tensor, axis=-1) + output = _softmax(output, axis=-1) - return output_tensor, attention_weights + return output, attn_weights From f9aca1e7a94e40ea6674f7d8b25fe20322743cc4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 04:33:34 +0000 Subject: [PATCH 20/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../real_time_encoder_transformer.py | 103 +++++++++++++----- 1 file changed, 78 insertions(+), 25 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 6e42dd3801e1..f3b722675902 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -64,13 +64,20 @@ def forward(self, time_steps: np.ndarray) -> np.ndarray: # ------------------------------- class PositionwiseFeedForward: def __init__( - self, d_model: int, hidden: int, drop_prob: float = 0.0, - seed: Optional[int] = None + self, + d_model: int, + hidden: int, + drop_prob: float = 0.0, + seed: Optional[int] = None, ) -> None: self.rng = np.random.default_rng(seed) - self.w1 = self.rng.standard_normal((d_model, hidden)) * math.sqrt(2.0 / (d_model + hidden)) + self.w1 = self.rng.standard_normal((d_model, hidden)) * math.sqrt( + 2.0 / (d_model + hidden) + ) self.b1 = np.zeros((hidden,)) - self.w2 = self.rng.standard_normal((hidden, d_model)) * math.sqrt(2.0 / (hidden + d_model)) + self.w2 = self.rng.standard_normal((hidden, d_model)) * math.sqrt( + 2.0 / (hidden + d_model) + ) self.b2 = np.zeros((d_model,)) def forward(self, input_tensor: np.ndarray) -> np.ndarray: @@ -96,8 +103,11 @@ def forward(self, input_tensor: np.ndarray) -> np.ndarray: # ------------------------------- class ScaledDotProductAttention: def forward( - self, query: np.ndarray, key: np.ndarray, value: np.ndarray, - mask: np.ndarray | None = None + self, + 
query: np.ndarray, + key: np.ndarray, + value: np.ndarray, + mask: np.ndarray | None = None, ) -> tuple[np.ndarray, np.ndarray]: """ Compute scaled dot-product attention. @@ -134,31 +144,46 @@ def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None) -> Non self.n_head = n_head self.d_k = d_model // n_head - self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_q = np.zeros((d_model,)) - self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_k = np.zeros((d_model,)) - self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_v = np.zeros((d_model,)) - self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / (d_model + d_model)) + self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / (d_model + d_model) + ) self.b_out = np.zeros((d_model,)) self.attn = ScaledDotProductAttention() - def _linear(self, x: np.ndarray, weight: np.ndarray, bias: np.ndarray) -> np.ndarray: + def _linear( + self, x: np.ndarray, weight: np.ndarray, bias: np.ndarray + ) -> np.ndarray: return np.tensordot(x, weight, axes=([2], [0])) + bias def _split_heads(self, x: np.ndarray) -> np.ndarray: batch_size, seq_len, _ = x.shape - return x.reshape(batch_size, seq_len, self.n_head, self.d_k).transpose(0, 2, 1, 3) + return x.reshape(batch_size, seq_len, self.n_head, self.d_k).transpose( + 0, 2, 1, 3 + ) def _concat_heads(self, x: np.ndarray) -> np.ndarray: batch_size, n_head, seq_len, d_k = x.shape return x.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, n_head * d_k) def forward( - self, query: np.ndarray, key: np.ndarray, value: np.ndarray, - mask: np.ndarray | None = None + self, + query: np.ndarray, + key: np.ndarray, + value: np.ndarray, + mask: np.ndarray | None = None, ) -> tuple[np.ndarray, np.ndarray]: """ Parameters @@ -184,6 +209,8 @@ def forward( concat = self._concat_heads(context) out = np.tensordot(concat, self.w_out, axes=([2], [0])) + self.b_out return out, attn_weights + + # ------------------------------- # 🔹 LayerNorm # ------------------------------- @@ -215,13 +242,17 @@ def forward(self, input_tensor: np.ndarray) -> np.ndarray: # 🔹 TransformerEncoderLayer # ------------------------------- class TransformerEncoderLayer: - def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None) -> None: + def __init__( + self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None + ) -> None: self.self_attn = MultiHeadAttention(d_model, n_head, seed) self.ffn = PositionwiseFeedForward(d_model, hidden_dim, seed=seed) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) - def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: + def forward( + self, input_tensor: np.ndarray, mask: np.ndarray | None = None + ) -> np.ndarray: """ Parameters ---------- @@ -235,7 +266,9 @@ def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> n np.ndarray Shape (batch, seq_len, d_model) """ - attn_out, _ = self.self_attn.forward(input_tensor, input_tensor, input_tensor, mask) + attn_out, _ = self.self_attn.forward( + 
input_tensor, input_tensor, input_tensor, mask + ) x_norm1 = self.norm1.forward(input_tensor + attn_out) ffn_out = self.ffn.forward(x_norm1) x_norm2 = self.norm2.forward(x_norm1 + ffn_out) @@ -246,10 +279,22 @@ def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> n # 🔹 TransformerEncoder (stack) # ------------------------------- class TransformerEncoder: - def __init__(self, d_model: int, n_head: int, hidden_dim: int, num_layers: int, seed: Optional[int] = None) -> None: - self.layers = [TransformerEncoderLayer(d_model, n_head, hidden_dim, seed) for _ in range(num_layers)] + def __init__( + self, + d_model: int, + n_head: int, + hidden_dim: int, + num_layers: int, + seed: Optional[int] = None, + ) -> None: + self.layers = [ + TransformerEncoderLayer(d_model, n_head, hidden_dim, seed) + for _ in range(num_layers) + ] - def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: + def forward( + self, input_tensor: np.ndarray, mask: np.ndarray | None = None + ) -> np.ndarray: """ Parameters ---------- @@ -278,7 +323,9 @@ def __init__(self, d_model: int, seed: Optional[int] = None) -> None: self.w = self.rng.standard_normal((d_model,)) * math.sqrt(2.0 / d_model) self.b = 0.0 - def forward(self, input_tensor: np.ndarray, mask: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]: + def forward( + self, input_tensor: np.ndarray, mask: np.ndarray | None = None + ) -> tuple[np.ndarray, np.ndarray]: """ Parameters ---------- @@ -315,27 +362,33 @@ def __init__( num_layers: int = 4, output_dim: int = 1, task_type: str = "regression", - seed: Optional[int] = None + seed: Optional[int] = None, ) -> None: self.rng = np.random.default_rng(seed) self.feature_dim = feature_dim self.d_model = d_model self.task_type = task_type - self.w_in = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt(2.0 / (feature_dim + d_model)) + self.w_in = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt( + 2.0 / (feature_dim + d_model) + ) self.b_in = np.zeros((d_model,)) self.time2vec = Time2Vec(d_model, seed) self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed) self.pooling = AttentionPooling(d_model, seed) - self.w_out = self.rng.standard_normal((d_model, output_dim)) * math.sqrt(2.0 / (d_model + output_dim)) + self.w_out = self.rng.standard_normal((d_model, output_dim)) * math.sqrt( + 2.0 / (d_model + output_dim) + ) self.b_out = np.zeros((output_dim,)) def _input_proj(self, features: np.ndarray) -> np.ndarray: return np.tensordot(features, self.w_in, axes=([2], [0])) + self.b_in - def forward(self, features: np.ndarray, mask: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]: + def forward( + self, features: np.ndarray, mask: np.ndarray | None = None + ) -> tuple[np.ndarray, np.ndarray]: """ Parameters ---------- From e6e209233ae8bb008b0d4731347ea0a013b9ab84 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Wed, 22 Oct 2025 10:08:11 +0530 Subject: [PATCH 21/36] Update real_time_encoder_transformer.py --- .../real_time_encoder_transformer.py | 433 ++++++------------ 1 file changed, 140 insertions(+), 293 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index f3b722675902..97032c357c92 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -1,41 +1,20 @@ -# ------------------------------- # 🔹 Imports # ------------------------------- from __future__ import annotations 
import math -from typing import Optional +from typing import Optional, Tuple import numpy as np # ------------------------------- -# 🔹 Helper functions -# ------------------------------- -def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: - x_max = np.max(x, axis=axis, keepdims=True) - e = np.exp(x - x_max) - return e / (np.sum(e, axis=axis, keepdims=True) + 1e-12) - - -def _stable_div(x: np.ndarray, denom: np.ndarray) -> np.ndarray: - return x / (denom + 1e-12) - - -# ------------------------------- -# 🔹 Time2Vec +# 🔹 Time2Vec Layer # ------------------------------- class Time2Vec: """Time2Vec positional encoding for real-valued time steps.""" - def __init__(self, d_model: int, seed: Optional[int] = None) -> None: - if seed is not None: - self.rng = np.random.default_rng(seed) - else: - self.rng = np.random.default_rng() - - if d_model < 2: - raise ValueError("d_model must be >= 2 for Time2Vec") - + def __init__(self, d_model: int, seed: int | None = None) -> None: + self.rng = np.random.default_rng(seed) self.w0 = self.rng.standard_normal((1, 1)) self.b0 = self.rng.standard_normal((1, 1)) self.w = self.rng.standard_normal((1, d_model - 1)) @@ -46,137 +25,91 @@ def forward(self, time_steps: np.ndarray) -> np.ndarray: Parameters ---------- time_steps : np.ndarray - Shape (batch, seq_len, 1) or (batch, seq_len) + Shape (batch, seq_len, 1) Returns ------- np.ndarray Shape (batch, seq_len, d_model) + + Doctest + ------- + >>> t2v = Time2Vec(4, seed=0) + >>> t = np.ones((1, 3, 1)) + >>> out = t2v.forward(t) + >>> out.shape + (1, 3, 4) """ - ts = time_steps if time_steps.ndim == 3 else time_steps[..., None] - linear = (self.w0 * ts) + self.b0 - periodic = np.sin((ts * self.w) + self.b) + linear = self.w0 * time_steps + self.b0 + periodic = np.sin(self.w * time_steps + self.b) return np.concatenate([linear, periodic], axis=-1) # ------------------------------- -# 🔹 PositionwiseFeedForward +# 🔹 LayerNorm # ------------------------------- -class PositionwiseFeedForward: - def __init__( - self, - d_model: int, - hidden: int, - drop_prob: float = 0.0, - seed: Optional[int] = None, - ) -> None: - self.rng = np.random.default_rng(seed) - self.w1 = self.rng.standard_normal((d_model, hidden)) * math.sqrt( - 2.0 / (d_model + hidden) - ) - self.b1 = np.zeros((hidden,)) - self.w2 = self.rng.standard_normal((hidden, d_model)) * math.sqrt( - 2.0 / (hidden + d_model) - ) - self.b2 = np.zeros((d_model,)) +class LayerNorm: + def __init__(self, d_model: int, eps: float = 1e-12) -> None: + self.gamma = np.ones((d_model,)) + self.beta = np.zeros((d_model,)) + self.eps = eps def forward(self, input_tensor: np.ndarray) -> np.ndarray: """ - Parameters - ---------- - input_tensor : np.ndarray - Shape (batch, seq_len, d_model) - - Returns - ------- - np.ndarray - Shape (batch, seq_len, d_model) + >>> ln = LayerNorm(4) + >>> x = np.ones((1, 3, 4)) + >>> out = ln.forward(x) + >>> out.shape + (1, 3, 4) """ - h = np.tensordot(input_tensor, self.w1, axes=([2], [0])) + self.b1 - h = np.maximum(h, 0.0) - out = np.tensordot(h, self.w2, axes=([2], [0])) + self.b2 - return out + mean = np.mean(input_tensor, axis=-1, keepdims=True) + var = np.mean((input_tensor - mean) ** 2, axis=-1, keepdims=True) + x_norm = (input_tensor - mean) / np.sqrt(var + self.eps) + return self.gamma * x_norm + self.beta # ------------------------------- -# 🔹 ScaledDotProductAttention +# 🔹 PositionwiseFeedForward # ------------------------------- -class ScaledDotProductAttention: - def forward( - self, - query: np.ndarray, - key: np.ndarray, - 
value: np.ndarray, - mask: np.ndarray | None = None, - ) -> tuple[np.ndarray, np.ndarray]: +class PositionwiseFeedForward: + def __init__(self, d_model: int, hidden: int, seed: int | None = None) -> None: + self.rng = np.random.default_rng(seed) + self.linear1_w = self.rng.standard_normal((d_model, hidden)) * \ + math.sqrt(2.0 / (d_model + hidden)) + self.linear1_b = np.zeros((hidden,)) + self.linear2_w = self.rng.standard_normal((hidden, d_model)) * \ + math.sqrt(2.0 / (hidden + d_model)) + self.linear2_b = np.zeros((d_model,)) + + def forward(self, x_tensor: np.ndarray) -> np.ndarray: """ - Compute scaled dot-product attention. - - Returns - ------- - context : np.ndarray - Shape (batch, n_head, seq_len, d_k) - attn_weights : np.ndarray - Shape (batch, n_head, seq_len, seq_len) + >>> ff = PositionwiseFeedForward(4, 8, seed=0) + >>> x = np.ones((1, 3, 4)) + >>> out = ff.forward(x) + >>> out.shape + (1, 3, 4) """ - batch_size, n_head, seq_len, d_k = query.shape - scores = np.matmul(query, key.transpose(0, 1, 3, 2)) / math.sqrt(d_k) - - if mask is not None: - mask2 = mask[:, None, None, :] if mask.ndim == 2 else mask - scores = np.where(mask2 == 0, -1e9, scores) - - attn_weights = _softmax(scores, axis=-1) - context = np.matmul(attn_weights, value) - return context, attn_weights + hidden = np.tensordot(x_tensor, self.linear1_w, axes=([2], [0])) + self.linear1_b + hidden = np.maximum(0, hidden) # ReLU + out = np.tensordot(hidden, self.linear2_w, axes=([2], [0])) + self.linear2_b + return out # ------------------------------- # 🔹 MultiHeadAttention # ------------------------------- class MultiHeadAttention: - def __init__(self, d_model: int, n_head: int, seed: Optional[int] = None) -> None: + def __init__(self, d_model: int, n_head: int, seed: int | None = None) -> None: if d_model % n_head != 0: raise ValueError("d_model must be divisible by n_head") - - self.rng = np.random.default_rng(seed) self.d_model = d_model self.n_head = n_head self.d_k = d_model // n_head - - self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (d_model + d_model) - ) - self.b_q = np.zeros((d_model,)) - self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (d_model + d_model) - ) - self.b_k = np.zeros((d_model,)) - self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (d_model + d_model) - ) - self.b_v = np.zeros((d_model,)) - self.w_out = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / (d_model + d_model) - ) - self.b_out = np.zeros((d_model,)) - - self.attn = ScaledDotProductAttention() - - def _linear( - self, x: np.ndarray, weight: np.ndarray, bias: np.ndarray - ) -> np.ndarray: - return np.tensordot(x, weight, axes=([2], [0])) + bias - - def _split_heads(self, x: np.ndarray) -> np.ndarray: - batch_size, seq_len, _ = x.shape - return x.reshape(batch_size, seq_len, self.n_head, self.d_k).transpose( - 0, 2, 1, 3 - ) - - def _concat_heads(self, x: np.ndarray) -> np.ndarray: - batch_size, n_head, seq_len, d_k = x.shape - return x.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, n_head * d_k) + self.rng = np.random.default_rng(seed) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) + self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) def forward( self, @@ -184,99 +117,60 @@ def forward( key: 
np.ndarray, value: np.ndarray, mask: np.ndarray | None = None, - ) -> tuple[np.ndarray, np.ndarray]: + ) -> Tuple[np.ndarray, np.ndarray]: """ - Parameters - ---------- - query/key/value : np.ndarray - Shape (batch, seq_len, d_model) - mask : np.ndarray | None - Optional mask - - Returns - ------- - out : np.ndarray - Shape (batch, seq_len, d_model) - attn_weights : np.ndarray - Shape (batch, n_head, seq_len, seq_len) + >>> attn = MultiHeadAttention(4, 2, seed=0) + >>> x = np.ones((1, 3, 4)) + >>> out, w = attn.forward(x, x, x) + >>> out.shape + (1, 3, 4) + >>> w.shape + (1, 2, 3, 3) """ - q = self._linear(query, self.w_q, self.b_q) - k = self._linear(key, self.w_k, self.b_k) - v = self._linear(value, self.w_v, self.b_v) - - qh, kh, vh = self._split_heads(q), self._split_heads(k), self._split_heads(v) - context, attn_weights = self.attn.forward(qh, kh, vh, mask) - concat = self._concat_heads(context) - out = np.tensordot(concat, self.w_out, axes=([2], [0])) + self.b_out + batch_size, _seq_len, _ = query.shape + Q = np.tensordot(query, self.w_q, axes=([2], [0])) + K = np.tensordot(key, self.w_k, axes=([2], [0])) + V = np.tensordot(value, self.w_v, axes=([2], [0])) + Q = Q.reshape(batch_size, -1, self.n_head, self.d_k).transpose(0, 2, 1, 3) + K = K.reshape(batch_size, -1, self.n_head, self.d_k).transpose(0, 2, 1, 3) + V = V.reshape(batch_size, -1, self.n_head, self.d_k).transpose(0, 2, 1, 3) + scores = np.matmul(Q, K.transpose(0, 1, 3, 2)) / math.sqrt(self.d_k) + if mask is not None: + scores = np.where(mask[:, None, None, :] == 0, -1e9, scores) + attn_weights = _softmax(scores, axis=-1) + out = np.matmul(attn_weights, V) + out = out.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.d_model) + out = np.tensordot(out, self.w_o, axes=([2], [0])) return out, attn_weights -# ------------------------------- -# 🔹 LayerNorm -# ------------------------------- -class LayerNorm: - def __init__(self, d_model: int, eps: float = 1e-12) -> None: - self.gamma = np.ones((d_model,)) - self.beta = np.zeros((d_model,)) - self.eps = eps - - def forward(self, input_tensor: np.ndarray) -> np.ndarray: - """ - Parameters - ---------- - input_tensor : np.ndarray - Shape (batch, seq_len, d_model) - - Returns - ------- - np.ndarray - Layer-normalized tensor of same shape - """ - mean = np.mean(input_tensor, axis=-1, keepdims=True) - var = np.mean((input_tensor - mean) ** 2, axis=-1, keepdims=True) - x_norm = (input_tensor - mean) / np.sqrt(var + self.eps) - return self.gamma * x_norm + self.beta - - # ------------------------------- # 🔹 TransformerEncoderLayer # ------------------------------- class TransformerEncoderLayer: - def __init__( - self, d_model: int, n_head: int, hidden_dim: int, seed: Optional[int] = None - ) -> None: + def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: int | None = None) -> None: self.self_attn = MultiHeadAttention(d_model, n_head, seed) - self.ffn = PositionwiseFeedForward(d_model, hidden_dim, seed=seed) self.norm1 = LayerNorm(d_model) + self.ff = PositionwiseFeedForward(d_model, hidden_dim, seed) self.norm2 = LayerNorm(d_model) - def forward( - self, input_tensor: np.ndarray, mask: np.ndarray | None = None - ) -> np.ndarray: + def forward(self, x_tensor: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: """ - Parameters - ---------- - input_tensor : np.ndarray - Shape (batch, seq_len, d_model) - mask : np.ndarray | None - Optional attention mask - - Returns - ------- - np.ndarray - Shape (batch, seq_len, d_model) + >>> layer = TransformerEncoderLayer(4, 2, 8, 
seed=0) + >>> x = np.ones((1, 3, 4)) + >>> out = layer.forward(x) + >>> out.shape + (1, 3, 4) """ - attn_out, _ = self.self_attn.forward( - input_tensor, input_tensor, input_tensor, mask - ) - x_norm1 = self.norm1.forward(input_tensor + attn_out) - ffn_out = self.ffn.forward(x_norm1) - x_norm2 = self.norm2.forward(x_norm1 + ffn_out) - return x_norm2 + attn_out, _ = self.self_attn.forward(x_tensor, x_tensor, x_tensor, mask) + x_tensor = self.norm1.forward(x_tensor + attn_out) + ff_out = self.ff.forward(x_tensor) + x_tensor = self.norm2.forward(x_tensor + ff_out) + return x_tensor # ------------------------------- -# 🔹 TransformerEncoder (stack) +# 🔹 TransformerEncoder # ------------------------------- class TransformerEncoder: def __init__( @@ -285,68 +179,44 @@ def __init__( n_head: int, hidden_dim: int, num_layers: int, - seed: Optional[int] = None, + seed: int | None = None ) -> None: self.layers = [ TransformerEncoderLayer(d_model, n_head, hidden_dim, seed) for _ in range(num_layers) ] - def forward( - self, input_tensor: np.ndarray, mask: np.ndarray | None = None - ) -> np.ndarray: + def forward(self, x_tensor: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: """ - Parameters - ---------- - input_tensor : np.ndarray - Shape (batch, seq_len, d_model) - mask : np.ndarray | None - Optional attention mask - - Returns - ------- - np.ndarray - Shape (batch, seq_len, d_model) + >>> encoder = TransformerEncoder(4, 2, 8, 2, seed=0) + >>> x = np.ones((1, 3, 4)) + >>> out = encoder.forward(x) + >>> out.shape + (1, 3, 4) """ - output = input_tensor for layer in self.layers: - output = layer.forward(output, mask) - return output + x_tensor = layer.forward(x_tensor, mask) + return x_tensor # ------------------------------- # 🔹 AttentionPooling # ------------------------------- class AttentionPooling: - def __init__(self, d_model: int, seed: Optional[int] = None) -> None: + def __init__(self, d_model: int, seed: int | None = None) -> None: self.rng = np.random.default_rng(seed) self.w = self.rng.standard_normal((d_model,)) * math.sqrt(2.0 / d_model) - self.b = 0.0 - def forward( - self, input_tensor: np.ndarray, mask: np.ndarray | None = None - ) -> tuple[np.ndarray, np.ndarray]: + def forward(self, x_tensor: np.ndarray) -> np.ndarray: """ - Parameters - ---------- - input_tensor : np.ndarray - Shape (batch, seq_len, d_model) - mask : np.ndarray | None - Shape (batch, seq_len) where 1=valid, 0=pad - - Returns - ------- - pooled : np.ndarray - Shape (batch, d_model) - attn_weights : np.ndarray - Shape (batch, seq_len) + >>> pool = AttentionPooling(4, seed=0) + >>> x = np.ones((1, 3, 4)) + >>> out = pool.forward(x) + >>> out.shape + (1, 4) """ - scores = np.tensordot(input_tensor, self.w, axes=([2], [0])) + self.b - if mask is not None: - scores = np.where(mask == 0, -1e9, scores) - attn_weights = _softmax(scores, axis=-1) - pooled = np.matmul(attn_weights[:, None, :], input_tensor).squeeze(1) - return pooled, attn_weights + attn_weights = _softmax(np.tensordot(x_tensor, self.w, axes=([2], [0])), axis=1) + return np.sum(x_tensor * attn_weights[..., None], axis=1) # ------------------------------- @@ -355,67 +225,44 @@ def forward( class EEGTransformer: def __init__( self, - feature_dim: int, - d_model: int = 128, - n_head: int = 8, - hidden_dim: int = 512, - num_layers: int = 4, + d_model: int, + n_head: int, + hidden_dim: int, + num_layers: int, output_dim: int = 1, task_type: str = "regression", - seed: Optional[int] = None, + seed: int | None = None ) -> None: - self.rng = 
np.random.default_rng(seed) - self.feature_dim = feature_dim - self.d_model = d_model - self.task_type = task_type - - self.w_in = self.rng.standard_normal((feature_dim, d_model)) * math.sqrt( - 2.0 / (feature_dim + d_model) - ) - self.b_in = np.zeros((d_model,)) - self.time2vec = Time2Vec(d_model, seed) self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed) self.pooling = AttentionPooling(d_model, seed) - - self.w_out = self.rng.standard_normal((d_model, output_dim)) * math.sqrt( - 2.0 / (d_model + output_dim) - ) + self.output_dim = output_dim + self.task_type = task_type + self.rng = np.random.default_rng(seed) + self.w_out = self.rng.standard_normal( + (d_model, output_dim) + ) * math.sqrt(2.0 / (d_model + output_dim)) self.b_out = np.zeros((output_dim,)) - def _input_proj(self, features: np.ndarray) -> np.ndarray: - return np.tensordot(features, self.w_in, axes=([2], [0])) + self.b_in - - def forward( - self, features: np.ndarray, mask: np.ndarray | None = None - ) -> tuple[np.ndarray, np.ndarray]: + def forward(self, eeg_data: np.ndarray) -> np.ndarray: """ - Parameters - ---------- - features : np.ndarray - Shape (batch, seq_len, feature_dim) - mask : np.ndarray | None - Optional mask - - Returns - ------- - output : np.ndarray - Shape (batch, output_dim) - attn_weights : np.ndarray - Shape (batch, seq_len) + >>> model = EEGTransformer(4, 2, 8, 2, seed=0) + >>> x = np.ones((1, 3, 4)) + >>> out = model.forward(x) + >>> out.shape + (1, 1) """ - batch_size, seq_len, _ = features.shape - time_indices = np.arange(seq_len, dtype=float)[None, :, None] - time_indices = np.tile(time_indices, (batch_size, 1, 1)) - - time_emb = self.time2vec.forward(time_indices) - x_proj = self._input_proj(features) + time_emb - - enc_out = self.encoder.forward(x_proj, mask) - pooled, attn_weights = self.pooling.forward(enc_out, mask) + x = self.time2vec.forward(eeg_data) + x = self.encoder.forward(x) + x = self.pooling.forward(x) + out = np.tensordot(x, self.w_out, axes=([1], [0])) + self.b_out + return out - output = np.tensordot(pooled, self.w_out, axes=([1], [0])) + self.b_out - if self.task_type == "classification": - output = _softmax(output, axis=-1) - return output, attn_weights +# ------------------------------- +# 🔹 Helper softmax +# ------------------------------- +def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: + x_max = np.max(x, axis=axis, keepdims=True) + e = np.exp(x - x_max) + return e / (np.sum(e, axis=axis, keepdims=True) + 1e-12) From 74714aa3b5486aa29ae6c73631ee796057a8e247 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 04:38:30 +0000 Subject: [PATCH 22/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../real_time_encoder_transformer.py | 52 +++++++++++++------ 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index 97032c357c92..67939796faa8 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -74,11 +74,13 @@ def forward(self, input_tensor: np.ndarray) -> np.ndarray: class PositionwiseFeedForward: def __init__(self, d_model: int, hidden: int, seed: int | None = None) -> None: self.rng = np.random.default_rng(seed) - self.linear1_w = self.rng.standard_normal((d_model, hidden)) * \ - math.sqrt(2.0 / (d_model + hidden)) + self.linear1_w = 
self.rng.standard_normal((d_model, hidden)) * math.sqrt( + 2.0 / (d_model + hidden) + ) self.linear1_b = np.zeros((hidden,)) - self.linear2_w = self.rng.standard_normal((hidden, d_model)) * \ - math.sqrt(2.0 / (hidden + d_model)) + self.linear2_w = self.rng.standard_normal((hidden, d_model)) * math.sqrt( + 2.0 / (hidden + d_model) + ) self.linear2_b = np.zeros((d_model,)) def forward(self, x_tensor: np.ndarray) -> np.ndarray: @@ -89,7 +91,9 @@ def forward(self, x_tensor: np.ndarray) -> np.ndarray: >>> out.shape (1, 3, 4) """ - hidden = np.tensordot(x_tensor, self.linear1_w, axes=([2], [0])) + self.linear1_b + hidden = ( + np.tensordot(x_tensor, self.linear1_w, axes=([2], [0])) + self.linear1_b + ) hidden = np.maximum(0, hidden) # ReLU out = np.tensordot(hidden, self.linear2_w, axes=([2], [0])) + self.linear2_b return out @@ -106,10 +110,18 @@ def __init__(self, d_model: int, n_head: int, seed: int | None = None) -> None: self.n_head = n_head self.d_k = d_model // n_head self.rng = np.random.default_rng(seed) - self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) - self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) - self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) - self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / d_model + ) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / d_model + ) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / d_model + ) + self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / d_model + ) def forward( self, @@ -148,13 +160,17 @@ def forward( # 🔹 TransformerEncoderLayer # ------------------------------- class TransformerEncoderLayer: - def __init__(self, d_model: int, n_head: int, hidden_dim: int, seed: int | None = None) -> None: + def __init__( + self, d_model: int, n_head: int, hidden_dim: int, seed: int | None = None + ) -> None: self.self_attn = MultiHeadAttention(d_model, n_head, seed) self.norm1 = LayerNorm(d_model) self.ff = PositionwiseFeedForward(d_model, hidden_dim, seed) self.norm2 = LayerNorm(d_model) - def forward(self, x_tensor: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: + def forward( + self, x_tensor: np.ndarray, mask: np.ndarray | None = None + ) -> np.ndarray: """ >>> layer = TransformerEncoderLayer(4, 2, 8, seed=0) >>> x = np.ones((1, 3, 4)) @@ -179,14 +195,16 @@ def __init__( n_head: int, hidden_dim: int, num_layers: int, - seed: int | None = None + seed: int | None = None, ) -> None: self.layers = [ TransformerEncoderLayer(d_model, n_head, hidden_dim, seed) for _ in range(num_layers) ] - def forward(self, x_tensor: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray: + def forward( + self, x_tensor: np.ndarray, mask: np.ndarray | None = None + ) -> np.ndarray: """ >>> encoder = TransformerEncoder(4, 2, 8, 2, seed=0) >>> x = np.ones((1, 3, 4)) @@ -231,7 +249,7 @@ def __init__( num_layers: int, output_dim: int = 1, task_type: str = "regression", - seed: int | None = None + seed: int | None = None, ) -> None: self.time2vec = Time2Vec(d_model, seed) self.encoder = TransformerEncoder(d_model, n_head, hidden_dim, num_layers, seed) @@ -239,9 +257,9 @@ def __init__( self.output_dim = output_dim self.task_type = task_type self.rng = np.random.default_rng(seed) - self.w_out = self.rng.standard_normal( - (d_model, output_dim) - ) * math.sqrt(2.0 / 
From 18c156e02e36e4bbeb354668195fd820e68987e9 Mon Sep 17 00:00:00 2001
From: UTSAV OJHA
Date: Wed, 22 Oct 2025 10:11:30 +0530
Subject: [PATCH 23/36] Update real_time_encoder_transformer.py

---
 .../real_time_encoder_transformer.py          | 39 ++++++++-----------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index 67939796faa8..c9f91cc742aa 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -2,8 +2,6 @@
 # -------------------------------
 from __future__ import annotations
 import math
-from typing import Optional, Tuple
-
 import numpy as np
 
 
@@ -110,18 +108,10 @@ def __init__(self, d_model: int, n_head: int, seed: int | None = None) -> None:
         self.n_head = n_head
         self.d_k = d_model // n_head
         self.rng = np.random.default_rng(seed)
-        self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(
-            2.0 / d_model
-        )
-        self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(
-            2.0 / d_model
-        )
-        self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(
-            2.0 / d_model
-        )
-        self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt(
-            2.0 / d_model
-        )
+        self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model)
+        self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model)
+        self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model)
+        self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model)
 
     def forward(
         self,
@@ -129,7 +119,7 @@ def forward(
         key: np.ndarray,
         value: np.ndarray,
         mask: np.ndarray | None = None,
-    ) -> Tuple[np.ndarray, np.ndarray]:
+    ) -> tuple[np.ndarray, np.ndarray]:
         """
         >>> attn = MultiHeadAttention(4, 2, seed=0)
         >>> x = np.ones((1, 3, 4))
@@ -140,17 +130,20 @@ def forward(
         (1, 2, 3, 3)
         """
         batch_size, _seq_len, _ = query.shape
-        Q = np.tensordot(query, self.w_q, axes=([2], [0]))
-        K = np.tensordot(key, self.w_k, axes=([2], [0]))
-        V = np.tensordot(value, self.w_v, axes=([2], [0]))
-        Q = Q.reshape(batch_size, -1, self.n_head, self.d_k).transpose(0, 2, 1, 3)
-        K = K.reshape(batch_size, -1, self.n_head, self.d_k).transpose(0, 2, 1, 3)
-        V = V.reshape(batch_size, -1, self.n_head, self.d_k).transpose(0, 2, 1, 3)
-        scores = np.matmul(Q, K.transpose(0, 1, 3, 2)) / math.sqrt(self.d_k)
+        q = np.tensordot(query, self.w_q, axes=([2], [0]))
+        k = np.tensordot(key, self.w_k, axes=([2], [0]))
+        v = np.tensordot(value, self.w_v, axes=([2], [0]))
+
+        q = q.reshape(batch_size, -1, self.n_head, self.d_k).transpose(0, 2, 1, 3)
+        k = k.reshape(batch_size, -1, self.n_head, self.d_k).transpose(0, 2, 1, 3)
+        v = v.reshape(batch_size, -1, self.n_head, self.d_k).transpose(0, 2, 1, 3)
+
+        scores = np.matmul(q, k.transpose(0, 1, 3, 2)) / math.sqrt(self.d_k)
         if mask is not None:
             scores = np.where(mask[:, None, None, :] == 0, -1e9, scores)
+
         attn_weights = _softmax(scores, axis=-1)
-        out = np.matmul(attn_weights, V)
+        out = np.matmul(attn_weights, v)
         out = out.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.d_model)
         out = np.tensordot(out, self.w_o, axes=([2], [0]))
         return out, attn_weights
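The refactor above is easier to follow with the head bookkeeping written out once: d_model is split into n_head slices of size d_k, attention is computed per head, and the heads are merged back. Below is a self-contained sketch of that reshape/transpose flow on random data, mirroring the shapes in the doctest but skipping the learned w_q/w_k/w_v/w_o projections, so it illustrates the layout rather than reproducing the module code; _softmax here is a local stand-in for the module's helper:

    import math

    import numpy as np


    def _softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
        shifted = x - x.max(axis=axis, keepdims=True)  # subtract the max for numerical stability
        exp = np.exp(shifted)
        return exp / exp.sum(axis=axis, keepdims=True)


    batch, seq_len, d_model, n_head = 1, 3, 4, 2
    d_k = d_model // n_head
    rng = np.random.default_rng(0)
    x = rng.standard_normal((batch, seq_len, d_model))

    # Split d_model into n_head slices of size d_k: (batch, n_head, seq_len, d_k)
    q = k = v = x.reshape(batch, seq_len, n_head, d_k).transpose(0, 2, 1, 3)

    scores = np.matmul(q, k.transpose(0, 1, 3, 2)) / math.sqrt(d_k)
    attn = _softmax(scores, axis=-1)       # (batch, n_head, seq_len, seq_len)
    context = np.matmul(attn, v)           # (batch, n_head, seq_len, d_k)
    merged = context.transpose(0, 2, 1, 3).reshape(batch, seq_len, d_model)
    print(attn.shape, merged.shape)        # (1, 2, 3, 3) (1, 3, 4)
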
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 04:41:52 +0000 Subject: [PATCH 24/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_network/real_time_encoder_transformer.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index c9f91cc742aa..dc6606e47e04 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -108,10 +108,18 @@ def __init__(self, d_model: int, n_head: int, seed: int | None = None) -> None: self.n_head = n_head self.d_k = d_model // n_head self.rng = np.random.default_rng(seed) - self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) - self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) - self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) - self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / d_model + ) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / d_model + ) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / d_model + ) + self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt( + 2.0 / d_model + ) def forward( self, From 33cf40ab6060a50e1d24995aa1cf16ee8647a7f9 Mon Sep 17 00:00:00 2001 From: UTSAV OJHA Date: Wed, 22 Oct 2025 10:13:37 +0530 Subject: [PATCH 25/36] Update real_time_encoder_transformer.py --- neural_network/real_time_encoder_transformer.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py index dc6606e47e04..c9f91cc742aa 100644 --- a/neural_network/real_time_encoder_transformer.py +++ b/neural_network/real_time_encoder_transformer.py @@ -108,18 +108,10 @@ def __init__(self, d_model: int, n_head: int, seed: int | None = None) -> None: self.n_head = n_head self.d_k = d_model // n_head self.rng = np.random.default_rng(seed) - self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / d_model - ) - self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / d_model - ) - self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / d_model - ) - self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt( - 2.0 / d_model - ) + self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) + self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) + self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) + self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model) def forward( self, From 96285392c9e3a13d0a56f33869fd12ab57214edb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 04:43:57 +0000 Subject: [PATCH 26/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_network/real_time_encoder_transformer.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/neural_network/real_time_encoder_transformer.py 
From 96285392c9e3a13d0a56f33869fd12ab57214edb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 22 Oct 2025 04:43:57 +0000
Subject: [PATCH 26/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_network/real_time_encoder_transformer.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index c9f91cc742aa..dc6606e47e04 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -108,10 +108,18 @@ def __init__(self, d_model: int, n_head: int, seed: int | None = None) -> None:
         self.n_head = n_head
         self.d_k = d_model // n_head
         self.rng = np.random.default_rng(seed)
-        self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model)
-        self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model)
-        self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model)
-        self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt(2.0 / d_model)
+        self.w_q = self.rng.standard_normal((d_model, d_model)) * math.sqrt(
+            2.0 / d_model
+        )
+        self.w_k = self.rng.standard_normal((d_model, d_model)) * math.sqrt(
+            2.0 / d_model
+        )
+        self.w_v = self.rng.standard_normal((d_model, d_model)) * math.sqrt(
+            2.0 / d_model
+        )
+        self.w_o = self.rng.standard_normal((d_model, d_model)) * math.sqrt(
+            2.0 / d_model
+        )
 
     def forward(
         self,

From e33baeb852069b0627ab730bfe70bde168b0ab60 Mon Sep 17 00:00:00 2001
From: UTSAV OJHA
Date: Wed, 22 Oct 2025 10:16:02 +0530
Subject: [PATCH 27/36] Update real_time_encoder_transformer.py

---
 neural_network/real_time_encoder_transformer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index dc6606e47e04..a514ae21535f 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -1,6 +1,7 @@
 # 🔹 Imports
 # -------------------------------
 from __future__ import annotations
+
 import math
 import numpy as np
 

From 2665159fa518289f7227709f7dd45fbf25bd6524 Mon Sep 17 00:00:00 2001
From: UTSAV OJHA
Date: Wed, 22 Oct 2025 10:20:48 +0530
Subject: [PATCH 28/36] Update real_time_encoder_transformer.py

---
 neural_network/real_time_encoder_transformer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index a514ae21535f..70f70b83a89f 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -1,5 +1,4 @@
-# 🔹 Imports
-# -------------------------------
+
 from __future__ import annotations
 
 import math

From a21bd2bf39f88328d0a050bc1e1b0d45fe124b09 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 22 Oct 2025 04:51:08 +0000
Subject: [PATCH 29/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_network/real_time_encoder_transformer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index 70f70b83a89f..c3cccf197b42 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -1,4 +1,3 @@
-
 from __future__ import annotations
 
 import math
From 491e15d44eabc8d9af96430b98c8dd3b3ce7a2cb Mon Sep 17 00:00:00 2001
From: UTSAV OJHA
Date: Wed, 22 Oct 2025 10:22:27 +0530
Subject: [PATCH 30/36] Update real_time_encoder_transformer.py

---
 neural_network/real_time_encoder_transformer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index c3cccf197b42..780bddc7f8be 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -3,7 +3,6 @@
 import math
 import numpy as np
 
-
 # -------------------------------
 # 🔹 Time2Vec Layer
 # -------------------------------

From c57d1841c5a4519bcabbc606b2ca784a59105b84 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 22 Oct 2025 04:52:48 +0000
Subject: [PATCH 31/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_network/real_time_encoder_transformer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index 780bddc7f8be..c3cccf197b42 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -3,6 +3,7 @@
 import math
 import numpy as np
 
+
 # -------------------------------
 # 🔹 Time2Vec Layer
 # -------------------------------

From 80aff7a5f303fb92e55151f469959e8d36e0e9f3 Mon Sep 17 00:00:00 2001
From: UTSAV OJHA
Date: Wed, 22 Oct 2025 10:24:41 +0530
Subject: [PATCH 32/36] Update real_time_encoder_transformer.py

---
 neural_network/real_time_encoder_transformer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index c3cccf197b42..792157ca856f 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import math
+
 import numpy as np
 
 

From 007dcf13c9843685919c6dde75b09d1034627b72 Mon Sep 17 00:00:00 2001
From: UTSAV OJHA
Date: Wed, 22 Oct 2025 10:50:39 +0530
Subject: [PATCH 33/36] Update real_time_encoder_transformer.py

---
 neural_network/real_time_encoder_transformer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index 792157ca856f..72e4d1eff738 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -38,8 +38,10 @@ def forward(self, time_steps: np.ndarray) -> np.ndarray:
         >>> out.shape
         (1, 3, 4)
         """
+
+
         linear = self.w0 * time_steps + self.b0
-        periodic = np.sin(self.w * time_steps + self.b)
+        periodic = np.sin(time_steps * self.w[:, None, :] + self.b[:, None, :])
         return np.concatenate([linear, periodic], axis=-1)
 
 
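The change above is a broadcasting fix for Time2Vec: the time axis arrives as (batch, seq_len, 1) and the periodic weights carry one frequency per output channel, so the sine argument has to broadcast out to (batch, seq_len, d_model - 1). A small standalone check of that broadcast using the same [:, None, :] indexing; the (1, 1) and (1, d_model - 1) weight shapes are an assumption for the sketch, taken from the Torch version's parameters rather than read out of this diff, and all names below are local to the sketch:

    import numpy as np

    batch, seq_len, d_model = 1, 3, 4
    time_steps = np.arange(seq_len, dtype=float).reshape(batch, seq_len, 1)

    rng = np.random.default_rng(0)
    w0 = rng.standard_normal((1, 1))            # scalar trend weight
    b0 = rng.standard_normal((1, 1))
    w = rng.standard_normal((1, d_model - 1))   # one frequency per periodic channel
    b = rng.standard_normal((1, d_model - 1))

    linear = w0 * time_steps + b0                                  # (1, 3, 1)
    periodic = np.sin(time_steps * w[:, None, :] + b[:, None, :])  # w[:, None, :] has shape (1, 1, 3)
    out = np.concatenate([linear, periodic], axis=-1)
    print(linear.shape, periodic.shape, out.shape)  # (1, 3, 1) (1, 3, 3) (1, 3, 4)
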
From 21c18c23566f03fd61eda2d0216f9e5849399801 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 22 Oct 2025 05:21:22 +0000
Subject: [PATCH 34/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_network/real_time_encoder_transformer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index 72e4d1eff738..de90e8b81d49 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -38,8 +38,7 @@ def forward(self, time_steps: np.ndarray) -> np.ndarray:
         (1, 3, 4)
         """
-
-
+
         linear = self.w0 * time_steps + self.b0
         periodic = np.sin(time_steps * self.w[:, None, :] + self.b[:, None, :])
         return np.concatenate([linear, periodic], axis=-1)
 

From 8b55a8f9487989fe6961d9e1edc3505c9f574a87 Mon Sep 17 00:00:00 2001
From: UTSAV OJHA
Date: Wed, 22 Oct 2025 11:18:19 +0530
Subject: [PATCH 35/36] Update real_time_encoder_transformer.py

---
 .../real_time_encoder_transformer.py          | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index de90e8b81d49..67739ecacc48 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -39,11 +39,13 @@ def forward(self, time_steps: np.ndarray) -> np.ndarray:
         (1, 3, 4)
         """
 
+
         linear = self.w0 * time_steps + self.b0
-        periodic = np.sin(time_steps * self.w[:, None, :] + self.b[:, None, :])
+        periodic = np.sin(time_steps * self.w[None, None, :] + self.b[None, None, :])
         return np.concatenate([linear, periodic], axis=-1)
 
 
+
 # -------------------------------
 # 🔹 LayerNorm
 # -------------------------------
@@ -267,15 +269,26 @@ def forward(self, eeg_data: np.ndarray) -> np.ndarray:
         """
 
         >>> model = EEGTransformer(4, 2, 8, 2, seed=0)
-        >>> x = np.ones((1, 3, 4))
+        >>> x = np.ones((1, 3, 1))
         >>> out = model.forward(x)
         >>> out.shape
         (1, 1)
         """
 
+        # Ensure input shape is (batch, seq_len, 1)
+        if eeg_data.shape[-1] != 1:
+            eeg_data = eeg_data[..., :1]
+
+        # Time2Vec positional encoding 
         x = self.time2vec.forward(eeg_data)
+
+        # Transformer encoder 
         x = self.encoder.forward(x)
+
+        # Attention pooling 
        x = self.pooling.forward(x)
-        out = np.tensordot(x, self.w_out, axes=([1], [0])) + self.b_out
+
+        # Final linear layer 
+        out = np.dot(x, self.w_out) + self.b_out  # shape (batch, output_dim)
         return out
 

From 195b58ba06d31f6dbcf8ea54ee4c98ee14a85761 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 22 Oct 2025 05:48:40 +0000
Subject: [PATCH 36/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_network/real_time_encoder_transformer.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/neural_network/real_time_encoder_transformer.py b/neural_network/real_time_encoder_transformer.py
index 67739ecacc48..f39db92609a8 100644
--- a/neural_network/real_time_encoder_transformer.py
+++ b/neural_network/real_time_encoder_transformer.py
@@ -39,13 +39,11 @@ def forward(self, time_steps: np.ndarray) -> np.ndarray:
         (1, 3, 4)
         """
 
-
         linear = self.w0 * time_steps + self.b0
         periodic = np.sin(time_steps * self.w[None, None, :] + self.b[None, None, :])
         return np.concatenate([linear, periodic], axis=-1)
 
 
-
 # -------------------------------
 # 🔹 LayerNorm
 # -------------------------------
@@ -278,16 +276,16 @@ def forward(self, eeg_data: np.ndarray) -> np.ndarray:
         if eeg_data.shape[-1] != 1:
             eeg_data = eeg_data[..., :1]
 
-        # Time2Vec positional encoding 
+        # Time2Vec positional encoding
         x = self.time2vec.forward(eeg_data)
 
-        # Transformer encoder 
+        # Transformer encoder
         x = self.encoder.forward(x)
 
-        # Attention pooling 
+        # Attention pooling
         x = self.pooling.forward(x)
 
-        # Final linear layer 
+        # Final linear layer
         out = np.dot(x, self.w_out) + self.b_out  # shape (batch, output_dim)
         return out
 
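With the last two patches applied, the module's EEGTransformer runs Time2Vec, the encoder stack, attention pooling, and a final linear head over a (batch, seq_len, 1) signal. A short usage sketch against that final API, passing the constructor arguments positionally as in the module's own doctest and assuming the repository root is on PYTHONPATH so the file is importable:

    import numpy as np

    from neural_network.real_time_encoder_transformer import EEGTransformer

    rng = np.random.default_rng(0)
    eeg = rng.standard_normal((2, 128, 1))  # (batch, seq_len, 1): two single-channel EEG windows

    # (d_model, n_head, hidden_dim, num_layers) positionally, as in the doctest
    model = EEGTransformer(8, 2, 16, 2, seed=0)
    out = model.forward(eeg)
    print(out.shape)  # expected (2, 1): one regression output per window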