# MPT patches — top-of-file header (May 2026)

class PatchedRotaryEmbedding(nn.Module):
    """Rotary embedding with cache reset on seqlen change.

    NOTE(review): the `forward` method of this class was displaced later in
    the file by a collapsed paste; `_update_cache` (called by `forward`) is
    not visible in this chunk at all — re-attach/restore both.
    """

    def __init__(self, dim: int, max_seq_len: int = 2048, base: int = 10000):
        """Store rotary parameters; cos/sin caches start empty.

        Args:
            dim: size used for the rotary tables (presumably the per-head
                dimension — confirm against callers).
            max_seq_len: initial cache-capacity hint.
            base: frequency base for the inverse-frequency schedule.
        """
        # Original was mangled: `def (self, ...)` lost the __init__ name and
        # `super(). init ()` was broken — both restored here.
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.base = base
        # Lazily built cos/sin tables; reset when the requested seq_len
        # (or device/dtype) changes.
        self._cached_cos = None
        self._cached_sin = None
        self._cached_seq_len = None

# NOTE(review): the two non-blank lines below are fragments of
# patch_attention_mask's body, displaced to top level by a collapsed paste.
# The first is a comment line that swallowed three statements of the mask
# shape-dispatch (the `elif attention_mask.dim() == 3` branch, the
# `else: raise`, and stray trailing text "patch mpt"); its f-string is also
# missing braces around attention_mask.shape. Restore these inside the
# function, as code, with the f-string fixed.
# Case: (batch, 1, key_len) elif attention_mask.dim() == 3 and attention_mask.size(1) == 1: mask = attention_mask[:, :, None, :] else: raise ValueError(f"Unexpected mask shape: attention_mask.shape") patch mpt

# NOTE(review): `attention_mask` is undefined at module scope, so this
# assignment raises NameError at import time — it belongs inside
# patch_attention_mask as well.
batch = attention_mask.size(0)

def forward(self, x: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return the cached (cos, sin) rotary tables truncated to seq_len.

    NOTE(review): this is a method of PatchedRotaryEmbedding that the paste
    displaced to module level — re-attach it to the class. It relies on
    self._update_cache, which is not visible in this chunk.
    """
    self._update_cache(seq_len, x.device, x.dtype)
    return self._cached_cos[:seq_len], self._cached_sin[:seq_len]


# ----------------------------------------------------------------------
# 2. Patch Attention Mask Expansion (for cross-attention)
# ----------------------------------------------------------------------
def patch_attention_mask(
    attention_mask: torch.Tensor,
    query_length: int,
    key_length: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    """
    Expand mask from (batch, 1, key_len) or (batch, seq_len) to
    (batch, 1, query_len, key_len) for MPT attention.

    Returns None when attention_mask is None (no masking requested).
    """
    if attention_mask is None:
        return None
    # TODO(review): the remainder of this function's body was displaced by
    # the paste — the `batch = attention_mask.size(0)` line and the
    # dim()==3 shape-dispatch appear earlier in the file as orphan
    # fragments; restore them here.


# NOTE(review): the original collapsed line ended with a truncated duplicate
# "class PatchedRotaryEmbedding(nn" — dropped here; the class is defined
# earlier in the file.