IndexLookup layer to map strings from a vocabulary to integer indices (#1864)

workingloong · web-flow · commit d757e7079c73 · 2020-03-25T22:21:01.000+08:00
* Lookup layer to map strings from a vocabulary to integer indices

* IndexLookup layer to map strings from a vocabulary to integer indices

* Fix the docstring

* Fix docstring

* Add note for TF version

* Address review comments

* Add a unit test that creates a model with IndexLookup layers
diff --git a/elasticdl_preprocessing/layers/index_lookup.py b/elasticdl_preprocessing/layers/index_lookup.py
@@ -0,0 +1,115 @@
+from __future__ import absolute_import, division, print_function
+
+import collections
+
+import tensorflow as tf
+from tensorflow.python.ops import lookup_ops
+
+
+class IndexLookup(tf.keras.layers.Layer):
+    """Maps strings to integer indices by looking up a vocabulary.
+
+    This layer transforms categorical inputs into zero-based integers by
+    looking them up in a vocabulary list. TensorFlow 2.2 has developed
+    `tf.keras.layers.preprocessing.IndexLookup` but not released it yet,
+    so this layer is a simple temporary version. The code in TensorFlow 2.2
+    is `tensorflow.python.keras.layers.preprocessing.index_lookup.IndexLookup`.
+
+    Note that the TensorFlow version with the layer must be greater than 2.0.0.
+
+    Example:
+    ```python
+    layer = IndexLookup(vocabulary=['A', 'B', 'C'])
+    inp = np.array([['A'], ['B'], ['C'], ['D'], ['E']])
+    layer(inp)
+    ```
+    Then output will be `[[0], [1], [2], [3], [3]]`
+
+    Attributes:
+    num_oov_tokens: The number of out-of-vocabulary tokens to use; defaults to
+        1. If this value is more than 1,
+        `hash(inputs) % num_oov_tokens + len(vocabulary)` converts OOV inputs
+        to integer values.
+    vocabulary: A list of vocabulary terms, or a path to a text file
+        containing a vocabulary to load into this layer. The file should
+        contain one token per line. Terms must be unique.
+
+    Input: A string `tf.Tensor`,`tf.SparseTensor` or
+        `tf.RaggedTensor`.
+
+    Output: An int64 tensor of the same kind (dense, sparse or ragged) as
+        the input.
+
+    """
+
+    def __init__(self, vocabulary=None, num_oov_tokens=1, **kwargs):
+        """Stores the vocabulary and the OOV bucket count.
+
+        Args:
+            vocabulary: A list of terms, or a path to a text file with one
+                term per line. A path is read eagerly here.
+            num_oov_tokens: Number of out-of-vocabulary buckets.
+            **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
+
+        Raises:
+            ValueError: If a list/tuple vocabulary, or the vocabulary read
+                from a file, contains a repeated term.
+        """
+        # Forward the base-layer options instead of silently dropping them.
+        super(IndexLookup, self).__init__(**kwargs)
+        self.num_oov_tokens = num_oov_tokens
+
+        if isinstance(vocabulary, str):
+            vocabulary = self._get_vocabulary_from_file(vocabulary)
+
+        # Validate uniqueness for in-memory vocabularies as well as for
+        # those loaded from a file. Other vocabulary types (e.g. tensors)
+        # are passed through untouched.
+        if isinstance(vocabulary, (list, tuple)):
+            repeated_items = [
+                item
+                for item, count in collections.Counter(vocabulary).items()
+                if count > 1
+            ]
+            if repeated_items:
+                raise ValueError(
+                    "The passed vocabulary has at least one repeated "
+                    "term. Please uniquify your dataset before passing "
+                    "it to IndexLookup(). The repeated terms are %s"
+                    % repeated_items
+                )
+        self.vocabulary = vocabulary
+
+    def build(self, input_shape):
+        """Creates the lookup table from the stored vocabulary."""
+        self.table = lookup_ops.index_table_from_tensor(
+            vocabulary_list=self.vocabulary,
+            num_oov_buckets=self.num_oov_tokens,
+        )
+
+    def call(self, inputs):
+        """Looks up `inputs` in the table and returns the indices.
+
+        Sparse inputs keep their indices and dense shape; only the values
+        are looked up. Ragged inputs are looked up on their flat values.
+        """
+        if isinstance(inputs, tf.RaggedTensor):
+            return tf.ragged.map_flat_values(self.table.lookup, inputs)
+
+        if isinstance(inputs, tf.SparseTensor):
+            output = tf.SparseTensor(
+                indices=inputs.indices,
+                values=self.table.lookup(inputs.values),
+                dense_shape=inputs.dense_shape,
+            )
+        else:
+            output = self.table.lookup(inputs)
+        return tf.cast(output, tf.int64)
+
+    def _get_vocabulary_from_file(self, vocabulary_path):
+        """Reads one whitespace-stripped token per line from a file.
+
+        Args:
+            vocabulary_path: Path understood by `tf.io.gfile.GFile`.
+
+        Returns:
+            A list with one entry per line, in file order.
+        """
+        vocab = []
+        with tf.io.gfile.GFile(vocabulary_path, "r") as reader:
+            while True:
+                # An empty read marks the end of the file.
+                text = reader.readline()
+                if not text:
+                    break
+
+                # Normalize raw bytes to text before stripping whitespace.
+                if isinstance(text, bytes):
+                    text = text.decode("utf-8", "ignore")
+                vocab.append(text.strip())
+        return vocab
+
+    def vocab_size(self):
+        """Returns the size reported by the lookup table.
+
+        The table is created in `build`, so the layer must have been built
+        (e.g. by calling it once) before this method is used.
+        """
+        return self.table.size().numpy()
+
+    def compute_output_shape(self, input_shape):
+        """The lookup is element-wise, so the shape is unchanged."""
+        return input_shape
+
+    def get_config(self):
+        """Returns the base layer config plus this layer's own options."""
+        config = super(IndexLookup, self).get_config()
+        config.update(
+            {
+                "num_oov_tokens": self.num_oov_tokens,
+                # NOTE(review): the vocabulary is exported as None, so a
+                # layer re-created from this config has no vocabulary.
+                # Confirm whether this is intended.
+                "vocabulary": None,
+            }
+        )
+        return config
diff --git a/elasticdl_preprocessing/tests/index_lookup_test.py b/elasticdl_preprocessing/tests/index_lookup_test.py
@@ -0,0 +1,61 @@
+import os
+import tempfile
+import unittest
+
+import numpy as np
+import tensorflow as tf
+
+from elasticdl_preprocessing.layers.index_lookup import IndexLookup
+from elasticdl_preprocessing.tests.test_utils import (
+    ragged_tensor_equal,
+    sparse_tensor_equal,
+)
+
+
+class IndexLookupTest(unittest.TestCase):
+    """Checks IndexLookup on dense, ragged and sparse string inputs."""
+
+    def test_lookup_with_list(self):
+        """The vocabulary is supplied as an in-memory list."""
+        self._check_lookup(IndexLookup(vocabulary=["A", "B", "C"]))
+
+    def test_lookup_with_file(self):
+        """The vocabulary is supplied as a path to a one-token-per-line file."""
+        with tempfile.TemporaryDirectory() as workdir:
+            vocab_path = os.path.join(workdir, "vocab_test.txt")
+            with open(vocab_path, "w") as vocab_fp:
+                vocab_fp.writelines(["A\n", "B\n", "C\n"])
+            self._check_lookup(IndexLookup(vocabulary=vocab_path))
+
+    def test_model_with_lookup(self):
+        """The layer can be wired into a functional Keras model."""
+        string_in = tf.keras.Input(shape=(1,), dtype=tf.string)
+        indices = IndexLookup(vocabulary=["A", "B", "C"])(string_in)
+        model = tf.keras.Model(inputs=string_in, outputs=indices)
+
+        batch = tf.constant([["A"], ["C"], ["B"], ["D"], ["E"]])
+        result = model.call(batch)
+
+        expected = np.array([[0], [2], [1], [3], [3]], dtype=int)
+        self.assertTrue(np.array_equal(expected, result.numpy()))
+
+    def _check_lookup(self, lookup_layer):
+        """Runs the same A/B/C lookup through all three tensor kinds."""
+        # Dense: "D" and "E" are not in the vocabulary and map to index 3.
+        dense_result = lookup_layer(
+            tf.constant([["A"], ["B"], ["C"], ["D"], ["E"]])
+        )
+        dense_expected = np.array([[0], [1], [2], [3], [3]])
+        self.assertTrue(np.array_equal(dense_result.numpy(), dense_expected))
+
+        # Ragged: rows of different lengths.
+        ragged_in = tf.ragged.constant([["A", "B", "C"], ["D", "E"]])
+        ragged_expected = tf.ragged.constant(
+            [[0, 1, 2], [3, 3]], dtype=tf.int64
+        )
+        ragged_result = lookup_layer(ragged_in)
+        self.assertTrue(ragged_tensor_equal(ragged_result, ragged_expected))
+
+        # Sparse: the ragged fixtures converted to their sparse form.
+        sparse_result = lookup_layer(ragged_in.to_sparse())
+        sparse_expected = ragged_expected.to_sparse()
+        self.assertTrue(sparse_tensor_equal(sparse_result, sparse_expected))