simplified flax attention and matched torch attention

2022-06-29 14:56:28 -04:00
parent 61cc99c13c
commit d99828a239
4 changed files with 72 additions and 87 deletions
@@ -23,22 +23,22 @@ class DecoderSelfAttentionTorch(AttentionTorch):
    def forward(
        self, 
        decoder_state: FloatTensor,
-        keys_values: FloatTensor,
+        attention_state: FloatTensor,
        attention_mask: BoolTensor,
        token_mask: BoolTensor
    ) -> Tuple[FloatTensor, FloatTensor]:
-        batch_count = decoder_state.shape[0]
        keys = self.k_proj.forward(decoder_state)
        values = self.v_proj.forward(decoder_state)
        queries = self.q_proj.forward(decoder_state)
-        keys_values = torch.where(
+        attention_state = torch.where(
            token_mask[None, :, None], 
            torch.cat([keys, values]), 
-            keys_values
+            attention_state
        )
-        keys, values = keys_values[:batch_count], keys_values[batch_count:]
+        batch_count = decoder_state.shape[0]
+        keys, values = attention_state[:batch_count], attention_state[batch_count:]
        decoder_state = super().forward(keys, values, queries, attention_mask)
-        return decoder_state, keys_values
+        return decoder_state, attention_state


 class DecoderLayerTorch(nn.Module):
@@ -67,7 +67,7 @@ class DecoderLayerTorch(nn.Module):
        self,
        decoder_state: FloatTensor,
        encoder_state: FloatTensor,
-        keys_values_state: FloatTensor,
+        attention_state: FloatTensor,
        attention_mask: BoolTensor,
        token_index: LongTensor
    ) -> Tuple[FloatTensor, FloatTensor]:
@@ -77,9 +77,9 @@ class DecoderLayerTorch(nn.Module):
        self_attn_mask = self.token_indices < token_index + 1
        token_mask = self.token_indices == token_index
        self_attn_mask = torch.stack([self_attn_mask] * decoder_state.shape[0])
-        decoder_state, keys_values_state = self.self_attn.forward(
+        decoder_state, attention_state = self.self_attn.forward(
            decoder_state,
-            keys_values_state,
+            attention_state,
            self_attn_mask,
            token_mask
        )
@@ -102,7 +102,7 @@ class DecoderLayerTorch(nn.Module):
        decoder_state = self.glu.forward(decoder_state)
        decoder_state = residual + decoder_state

-        return decoder_state, keys_values_state
+        return decoder_state, attention_state


 class DalleBartDecoderTorch(nn.Module):
@@ -139,8 +139,9 @@ class DalleBartDecoderTorch(nn.Module):
        self.layernorm_embedding = nn.LayerNorm(embed_count)
        self.final_ln = nn.LayerNorm(embed_count)
        self.lm_head = nn.Linear(embed_count, image_vocab_size + 1, bias=False)
-        self.keys_values_state_shape = (
-            layer_count * 2 * batch_count,
+        self.attention_state_shape = (
+            layer_count,
+            2 * batch_count,
            image_token_count,
            embed_count
        )
@@ -157,7 +158,7 @@ class DalleBartDecoderTorch(nn.Module):
        self,
        text_tokens: LongTensor,
        encoder_state: FloatTensor,
-        keys_values_state: FloatTensor,
+        attention_state: FloatTensor,
        prev_token_and_index: LongTensor
    ) -> Tuple[LongTensor, FloatTensor]:
        attention_mask = text_tokens.not_equal(1)
@@ -168,17 +169,16 @@ class DalleBartDecoderTorch(nn.Module):
        decoder_state += self.embed_positions.forward(token_index)
        decoder_state = self.layernorm_embedding.forward(decoder_state)
        decoder_state = decoder_state[:, None]
-        keys_values = []
-        for i, layer in enumerate(self.layers):
-            j1, j2 = i * 2 * batch_count, (i + 1) * 2 * batch_count
-            decoder_state, keys_values_layer = layer.forward(
+        attention_states_new = []
+        for i in range(self.layer_count):
+            decoder_state, attention_state_layer = self.layers[i].forward(
                decoder_state,
                encoder_state,
-                keys_values_state[j1:j2],
+                attention_state[i],
                attention_mask,
                token_index[:1]
            )
-            keys_values.append(keys_values_layer)
+            attention_states_new.append(attention_state_layer)
        decoder_state = self.final_ln(decoder_state)
        logits = self.lm_head(decoder_state)
        a = self.condition_factor
@@ -190,7 +190,7 @@ class DalleBartDecoderTorch(nn.Module):
            self.zero_prob,
            torch.exp(logits - top_logits[0])
        )
-        return probs, torch.cat(keys_values)
+        return probs, torch.stack(attention_states_new)


    def forward(
@@ -199,17 +199,17 @@ class DalleBartDecoderTorch(nn.Module):
        encoder_state: FloatTensor
    ) -> LongTensor:
        image_tokens: List[LongTensor] = []
-        keys_values_state = torch.zeros(self.keys_values_state_shape)
+        attention_state = torch.zeros(self.attention_state_shape)
        if torch.cuda.is_available(): 
-            keys_values_state = keys_values_state.cuda()
+            attention_state = attention_state.cuda()
        image_token = self.start_token

        for i in range(self.sample_token_count):
            token_index = self.token_indices[i:i+1]
-            probs, keys_values_state = self.decode_step(
+            probs, attention_state = self.decode_step(
                text_tokens = text_tokens,
                encoder_state = encoder_state,
-                keys_values_state = keys_values_state,
+                attention_state = attention_state,
                prev_token_and_index = torch.cat([image_token, token_index])
            )