I am developing a Bi-LSTM model and want to add an attention layer to it, but I can't figure out how to add it.
My current code for the model is:
model = Sequential()
model.add(Embedding(max_words, 1152, input_length=max_len, weights=[embeddings]))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()
And the model summary is
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 1152, 1152) 278396928
_________________________________________________________________
batch_normalization_1 (Batch (None, 1152, 1152) 4608
_________________________________________________________________
activation_1 (Activation) (None, 1152, 1152) 0
_________________________________________________________________
dropout_1 (Dropout) (None, 1152, 1152) 0
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64) 303360
_________________________________________________________________
batch_normalization_2 (Batch (None, 64) 256
_________________________________________________________________
activation_2 (Activation) (None, 64) 0
_________________________________________________________________
dropout_2 (Dropout) (None, 64) 0
_________________________________________________________________
dense_1 (Dense) (None, 1) 65
=================================================================
Total params: 278,705,217
Trainable params: 278,702,785
Non-trainable params: 2,432
A basic LSTM can get confused between words and sometimes predicts the wrong one. Whenever that happens, the encoder step needs to search for the most relevant information in the input sequence; this idea is called 'attention'.
Bidirectional long short-term memory (Bi-LSTM) lets a network use sequence information in both directions, backwards (future to past) and forwards (past to future). Because the input flows in two directions, a Bi-LSTM differs from a regular, unidirectional LSTM.
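Concretely, attention scores each timestep of the encoder output, normalises the scores with a softmax, and takes a weighted sum of the hidden states, so the most relevant timesteps contribute the most. A minimal NumPy sketch of that idea (all names and shapes here are illustrative only, not taken from your model):
import numpy as np

def softmax(x, axis=0):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

# toy hidden states from a (bi-)LSTM: 4 timesteps, 6 features each
h = np.random.randn(4, 6)

# parameters the attention layer would learn (random here, just for illustration)
W = np.random.randn(6, 1)      # projects each timestep's features to one score
b = np.random.randn(4, 1)      # one bias per timestep

e = np.tanh(h @ W + b)         # unnormalised scores, shape (4, 1)
a = softmax(e, axis=0)         # attention weights over the time axis, sum to 1
context = (h * a).sum(axis=0)  # weighted sum of hidden states, shape (6,)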
A possible solution is a custom layer that computes attention over the positional/temporal dimension:
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

class Attention(Layer):

    def __init__(self, return_sequences=True):
        self.return_sequences = return_sequences
        super(Attention, self).__init__()

    def build(self, input_shape):
        # W projects each timestep's features to a scalar score, b is a per-timestep bias
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="random_normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)   # scores, shape (batch, time, 1)
        a = K.softmax(e, axis=1)                # attention weights over the time axis
        output = x * a                          # weighted hidden states

        if self.return_sequences:
            return output

        return K.sum(output, axis=1)
It's built to receive 3D tensors and output either 3D tensors (return_sequences=True) or 2D tensors (return_sequences=False). Below is a dummy example.
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# dummy data creation
max_len = 100
max_words = 333
emb_dim = 126
n_sample = 5

X = np.random.randint(0, max_words, (n_sample, max_len))
Y = np.random.randint(0, 2, n_sample)
with return_sequences=True
model = Sequential()
model.add(Embedding(max_words, emb_dim, input_length=max_len))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Attention(return_sequences=True)) # receive 3D and output 3D
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile('adam', 'binary_crossentropy')
model.fit(X,Y, epochs=3)
with return_sequences=False
model = Sequential()
model.add(Embedding(max_words, emb_dim, input_length=max_len))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Attention(return_sequences=False)) # receive 3D and output 2D
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile('adam', 'binary_crossentropy')
model.fit(X,Y, epochs=3)
You can integrate it into your network easily.
Here is the running notebook.
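For example, to add it to the network from your question, the only structural changes are to let the Bidirectional LSTM return the full sequence and to place the attention layer right after it. A sketch, assuming max_words, max_len, embeddings and the layer imports are defined as in your question and the Attention layer above is in scope:
model = Sequential()
model.add(Embedding(max_words, 1152, input_length=max_len, weights=[embeddings]))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32, return_sequences=True)))  # keep all timesteps
model.add(Attention(return_sequences=False))               # receive 3D, output 2D
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()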
In case someone is using only TensorFlow and not standalone Keras, this is one way to do it (this version also returns the attention weights so you can inspect them):
import tensorflow as tf

class Attention(tf.keras.layers.Layer):

    def __init__(self, return_sequences=True, name=None, **kwargs):
        super(Attention, self).__init__(name=name, **kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="glorot_uniform", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="glorot_uniform", trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        e = tf.keras.activations.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        a = tf.keras.activations.softmax(e, axis=1)   # attention weights over the time axis
        output = x * a

        if self.return_sequences:
            return a, output                          # weights plus weighted sequence

        return a, tf.keras.backend.sum(output, axis=1)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'return_sequences': self.return_sequences
        })
        return config
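Because this version of call returns two tensors (the attention weights and the attended output), it does not drop straight into a Sequential model; the functional API is the natural fit. A minimal usage sketch, reusing the dummy max_words, max_len and emb_dim values from above:
inputs = tf.keras.Input(shape=(max_len,))
x = tf.keras.layers.Embedding(max_words, emb_dim)(inputs)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True))(x)
att_weights, context = Attention(return_sequences=False)(x)  # weights available for inspection
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(context)

model = tf.keras.Model(inputs, outputs)
model.compile('adam', 'binary_crossentropy')
model.summary()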