So, I've been working on a project for a while, we have very little data, I know it would become much better if we were able to put together a much much larger dataset. That aside, my issue at the moment is when I have a sentence input, my outputs look like this right now:
contactid contactid contactid contactid
A single word is focused on and repeated over and over again. What can I do to overcome this hurdle?
Things I've tried:
For certain my perplexity is steadily decreasing.
Here is my dataset preperation code:
class ModelInputs(object):
"""Factory to construct various input hooks and functions depending on mode """
def __init__(
self, vocab_files, batch_size,
share_vocab=True, src_eos_id=1, tgt_eos_id=2
):
self.batch_size = batch_size
self.vocab_files = vocab_files
self.share_vocab = share_vocab
self.src_eos_id = src_eos_id
self.tgt_eos_id = tgt_eos_id
def get_inputs(self, file_path, num_infer=None, mode=tf.estimator.ModeKeys.TRAIN):
self.mode = mode
if self.mode == tf.estimator.ModeKeys.TRAIN:
return self._training_input_hook(file_path)
if self.mode == tf.estimator.ModeKeys.EVAL:
return self._validation_input_hook(file_path)
if self.mode == tf.estimator.ModeKeys.PREDICT:
if num_infer is None:
raise ValueError('If performing inference must supply number of predictions to be made.')
return self._infer_input_hook(file_path, num_infer)
def _prepare_data(self, dataset, out=False):
prep_set = dataset.map(lambda string: tf.string_split([string]).values)
prep_set = prep_set.map(lambda words: (words, tf.size(words)))
if out == True:
return prep_set.map(lambda words, size: (self.vocab_tables[1].lookup(words), size))
return prep_set.map(lambda words, size: (self.vocab_tables[0].lookup(words), size))
def _batch_data(self, dataset, src_eos_id, tgt_eos_id):
batched_set = dataset.padded_batch(
self.batch_size,
padded_shapes=((tf.TensorShape([None]), tf.TensorShape([])), (tf.TensorShape([None]), tf.TensorShape([]))),
padding_values=((src_eos_id, 0), (tgt_eos_id, 0))
)
return batched_set
def _batch_infer_data(self, dataset, src_eos_id):
batched_set = dataset.padded_batch(
self.batch_size,
padded_shapes=(tf.TensorShape([None]), tf.TensorShape([])),
padding_values=(src_eos_id, 0)
)
return batched_set
def _create_vocab_tables(self, vocab_files, share_vocab=False):
if vocab_files[1] is None and share_vocab == False:
raise ValueError('If share_vocab is set to false must provide target vocab. (src_vocab_file, \
target_vocab_file)')
src_vocab_table = lookup_ops.index_table_from_file(
vocab_files[0],
default_value=UNK_ID
)
if share_vocab:
tgt_vocab_table = src_vocab_table
else:
tgt_vocab_table = lookup_ops.index_table_from_file(
vocab_files[1],
default_value=UNK_ID
)
return src_vocab_table, tgt_vocab_table
def _prepare_iterator_hook(self, hook, scope_name, iterator, file_path, name_placeholder):
if self.mode == tf.estimator.ModeKeys.TRAIN or self.mode == tf.estimator.ModeKeys.EVAL:
feed_dict = {
name_placeholder[0]: file_path[0],
name_placeholder[1]: file_path[1]
}
else:
feed_dict = {name_placeholder: file_path}
with tf.name_scope(scope_name):
hook.iterator_initializer_func = \
lambda sess: sess.run(
iterator.initializer,
feed_dict=feed_dict,
)
def _set_up_train_or_eval(self, scope_name, file_path):
hook = IteratorInitializerHook()
def input_fn():
with tf.name_scope(scope_name):
with tf.name_scope('sentence_markers'):
src_eos_id = tf.constant(self.src_eos_id, dtype=tf.int64)
tgt_eos_id = tf.constant(self.tgt_eos_id, dtype=tf.int64)
self.vocab_tables = self._create_vocab_tables(self.vocab_files, self.share_vocab)
in_file = tf.placeholder(tf.string, shape=())
in_dataset = self._prepare_data(tf.contrib.data.TextLineDataset(in_file).repeat(None))
out_file = tf.placeholder(tf.string, shape=())
out_dataset = self._prepare_data(tf.contrib.data.TextLineDataset(out_file).repeat(None))
dataset = tf.contrib.data.Dataset.zip((in_dataset, out_dataset))
dataset = self._batch_data(dataset, src_eos_id, tgt_eos_id)
iterator = dataset.make_initializable_iterator()
next_example, next_label = iterator.get_next()
self._prepare_iterator_hook(hook, scope_name, iterator, file_path, (in_file, out_file))
return next_example, next_label
return (input_fn, hook)
def _training_input_hook(self, file_path):
input_fn, hook = self._set_up_train_or_eval('train_inputs', file_path)
return (input_fn, hook)
def _validation_input_hook(self, file_path):
input_fn, hook = self._set_up_train_or_eval('eval_inputs', file_path)
return (input_fn, hook)
def _infer_input_hook(self, file_path, num_infer):
hook = IteratorInitializerHook()
def input_fn():
with tf.name_scope('infer_inputs'):
with tf.name_scope('sentence_markers'):
src_eos_id = tf.constant(self.src_eos_id, dtype=tf.int64)
self.vocab_tables = self._create_vocab_tables(self.vocab_files, self.share_vocab)
infer_file = tf.placeholder(tf.string, shape=())
dataset = tf.contrib.data.TextLineDataset(infer_file)
dataset = self._prepare_data(dataset)
dataset = self._batch_infer_data(dataset, src_eos_id)
iterator = dataset.make_initializable_iterator()
next_example, seq_len = iterator.get_next()
self._prepare_iterator_hook(hook, 'infer_inputs', iterator, file_path, infer_file)
return ((next_example, seq_len), None)
return (input_fn, hook)
And here is my model:
class Seq2Seq():
def __init__(
self, batch_size, inputs,
outputs, inp_vocab_size, tgt_vocab_size,
embed_dim, mode, time_major=False,
enc_embedding=None, dec_embedding=None, average_across_batch=True,
average_across_timesteps=True, vocab_path=None, embedding_path='./data_files/wiki.simple.vec'
):
embed_np = self._get_embedding(embedding_path)
if not enc_embedding:
self.enc_embedding = tf.contrib.layers.embed_sequence(
inputs,
inp_vocab_size,
embed_dim,
trainable=True,
scope='embed',
initializer=tf.constant_initializer(value=embed_np, dtype=tf.float32)
)
else:
self.enc_embedding = enc_embedding
if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
if not dec_embedding:
embed_outputs = tf.contrib.layers.embed_sequence(
outputs,
tgt_vocab_size,
embed_dim,
trainable=True,
scope='embed',
reuse=True
)
with tf.variable_scope('embed', reuse=True):
dec_embedding = tf.get_variable('embeddings')
self.embed_outputs = embed_outputs
self.dec_embedding = dec_embedding
else:
self.dec_embedding = dec_embedding
else:
with tf.variable_scope('embed', reuse=True):
self.dec_embedding = tf.get_variable('embeddings')
if mode == tf.estimator.ModeKeys.PREDICT and vocab_path is None:
raise ValueError('If mode is predict, must supply vocab_path')
self.vocab_path = vocab_path
self.inp_vocab_size = inp_vocab_size
self.tgt_vocab_size = tgt_vocab_size
self.average_across_batch = average_across_batch
self.average_across_timesteps = average_across_timesteps
self.time_major = time_major
self.batch_size = batch_size
self.mode = mode
def _get_embedding(self, embedding_path):
model = KeyedVectors.load_word2vec_format(embedding_path)
vocab = model.vocab
vocab_len = len(vocab)
return np.array([model.word_vec(k) for k in vocab.keys()])
def _get_lstm(self, num_units):
return tf.nn.rnn_cell.BasicLSTMCell(num_units)
def encode(self, num_units, num_layers, seq_len, cell_fw=None, cell_bw=None):
if cell_fw and cell_bw:
fw_cell = cell_fw
bw_cell = cell_bw
else:
fw_cell = self._get_lstm(num_units)
bw_cell = self._get_lstm(num_units)
encoder_outputs, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn(
fw_cell,
bw_cell,
self.enc_embedding,
sequence_length=seq_len,
time_major=self.time_major,
dtype=tf.float32
)
c_state = tf.concat([bi_encoder_state[0].c, bi_encoder_state[1].c], axis=1)
h_state = tf.concat([bi_encoder_state[0].h, bi_encoder_state[1].h], axis=1)
encoder_state = tf.contrib.rnn.LSTMStateTuple(c=c_state, h=h_state)
return tf.concat(encoder_outputs, -1), encoder_state
def _train_decoder(self, decoder_cell, out_seq_len, encoder_state, helper):
if not helper:
helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
self.embed_outputs,
out_seq_len,
self.dec_embedding,
0.3,
)
# helper = tf.contrib.seq2seq.TrainingHelper(
# self.dec_embedding,
# out_seq_len,
# )
projection_layer = layers_core.Dense(self.tgt_vocab_size, use_bias=False)
decoder = tf.contrib.seq2seq.BasicDecoder(
decoder_cell,
helper,
encoder_state,
output_layer=projection_layer
)
return decoder
def _predict_decoder(self, cell, encoder_state, beam_width, length_penalty_weight):
tiled_encoder_state = tf.contrib.seq2seq.tile_batch(
encoder_state, multiplier=beam_width
)
with tf.name_scope('sentence_markers'):
sos_id = tf.constant(1, dtype=tf.int32)
eos_id = tf.constant(2, dtype=tf.int32)
start_tokens = tf.fill([self.batch_size], sos_id)
end_token = eos_id
projection_layer = layers_core.Dense(self.tgt_vocab_size, use_bias=False)
emb = tf.squeeze(self.dec_embedding)
decoder = tf.contrib.seq2seq.BeamSearchDecoder(
cell=cell,
embedding=self.dec_embedding,
start_tokens=start_tokens,
end_token=end_token,
initial_state=tiled_encoder_state,
beam_width=beam_width,
output_layer=projection_layer,
length_penalty_weight=length_penalty_weight
)
return decoder
def decode(
self, num_units, out_seq_len,
encoder_state, cell=None, helper=None,
beam_width=None, length_penalty_weight=None
):
with tf.name_scope('Decode'):
if cell:
decoder_cell = cell
else:
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(2*num_units)
if self.mode != estimator.ModeKeys.PREDICT:
decoder = self._train_decoder(decoder_cell, out_seq_len, encoder_state, helper)
else:
decoder = self._predict_decoder(decoder_cell, encoder_state, beam_width, length_penalty_weight)
outputs = tf.contrib.seq2seq.dynamic_decode(
decoder,
maximum_iterations=20,
swap_memory=True,
)
outputs = outputs[0]
if self.mode != estimator.ModeKeys.PREDICT:
return outputs.rnn_output, outputs.sample_id
else:
return outputs.beam_search_decoder_output, outputs.predicted_ids
def prepare_predict(self, sample_id):
rev_table = lookup_ops.index_to_string_table_from_file(
self.vocab_path, default_value=UNK)
predictions = rev_table.lookup(tf.to_int64(sample_id))
return tf.estimator.EstimatorSpec(
predictions=predictions,
mode=tf.estimator.ModeKeys.PREDICT
)
def prepare_train_eval(
self, t_out,
out_seq_len, labels, lr,
train_op=None, loss=None
):
if not loss:
weights = tf.sequence_mask(
out_seq_len,
dtype=t_out.dtype
)
loss = tf.contrib.seq2seq.sequence_loss(
t_out,
labels,
weights,
average_across_batch=self.average_across_batch,
)
if not train_op:
train_op = tf.contrib.layers.optimize_loss(
loss,
tf.train.get_global_step(),
optimizer='SGD',
learning_rate=lr,
summaries=['loss', 'learning_rate']
)
return tf.estimator.EstimatorSpec(
mode=self.mode,
loss=loss,
train_op=train_op,
)
Drawback: The output sequence relies heavily on the context defined by the hidden state in the final output of the encoder, making it challenging for the model to deal with long sentences. In the case of long sequences, there is a high probability that the initial context has been lost by the end of the sequence.
sequence-to-sequence prediction with example Python code. The Encoder-Decoder LSTM is a recurrent neural network designed to address sequence-to-sequence problems, sometimes called seq2seq.
Both encoder and the decoder are LSTM models (or sometimes GRU models) Encoder reads the input sequence and summarizes the information in something called the internal state vectors or context vector (in case of LSTM these are called the hidden state and cell state vectors).
Seq2seq is a family of machine learning approaches used for natural language processing. Applications include language translation, image captioning, conversational models and text summarization.
This type of repetition is called a "text degeneration".
There is a great paper from 2019 which analyse this phenomenon: The Curious Case of Neural Text Degeneration by Ari Holtzman et al. from the Allen Institute for Artificial Intelligence.
The repetition may come from the type of text search (text sampling) on the decoder site. Many people implement this just by the most probable next world proposed by the model (argmax on the softmax on the last layer) or by so called beam search. In fact the beam search is the industry standard for today.
This is the example of Beam search from the article:
Continuation (BeamSearch, b=10):
"The unicorns were able to communicate with each other, they said unicorns. a statement that the unicorns. Professor of the Department of Los Angeles, the most important place the world to be recognition of the world to be a of the world to be a of the world to be a of the world to be a of the world to be a of the world to be a of the world to be a of the world to be a of the…
As you can see there is a great amount of repetition.
According to the paper this curious case may be explained by the fact that each repeated sequence of words have higher probability than the sequence without the next repetition:
The article propose some workarounds with words sampling by the decoder. It definitely requires more study, but this is the best explanation we have today.
The other is that your model need still more training. In many cases I faced a similar behaviour when I had big training set and model still couldn't generalise well over whole diversity of the data. To test this hypothesis - try to train on smaller dataset and see if it generalise (produce meaningful results).
But even if your model generalise well enough, that doesn't mean you won't ever face the repetition pattern. Unless you change the sampling patter of the decoder, it is a common scenario.
If you train on a small data then try to decrease the number of parameters, f. e. number of neurons in each layer.
For me, when the network outputs one word all the time, significant decrease of learning rate helps.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With