Skip to content

Commit 59f4bee

Browse files
authored
Fix docstring for Tokenizers (#1739)
1 parent da509e1 commit 59f4bee

File tree

2 files changed

+18
-5
lines changed

2 files changed

+18
-5
lines changed

docs/source/transforms.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ CLIPTokenizer
3030

3131
.. automethod:: forward
3232

33+
BERTTokenizer
34+
----------------------
35+
36+
.. autoclass:: BERTTokenizer
37+
38+
.. automethod:: forward
39+
40+
3341
VocabTransform
3442
--------------
3543

torchtext/transforms.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,6 @@ def forward(self, input: Any) -> Any:
272272

273273

274274
class GPT2BPETokenizer(Module):
275-
__jit_unused_properties__ = ["is_jitable"]
276275
"""
277276
Transform for GPT-2 BPE Tokenizer.
278277
@@ -286,6 +285,8 @@ class GPT2BPETokenizer(Module):
286285
:param return_tokens: Indicate whether to return split tokens. If False, it will return encoded token IDs as strings (default: False)
287286
:type return_tokens: bool
288287
"""
288+
289+
__jit_unused_properties__ = ["is_jitable"]
289290
_seperator: torch.jit.Final[str]
290291

291292
def __init__(self, encoder_json_path: str, vocab_bpe_path: str, return_tokens: bool = False):
@@ -382,7 +383,6 @@ def __prepare_scriptable__(self):
382383

383384

384385
class CLIPTokenizer(Module):
385-
__jit_unused_properties__ = ["is_jitable"]
386386
"""
387387
Transform for CLIP Tokenizer. Based on Byte-Level BPE.
388388
@@ -414,6 +414,7 @@ class CLIPTokenizer(Module):
414414
:type return_input: bool
415415
"""
416416

417+
__jit_unused_properties__ = ["is_jitable"]
417418
_seperator: torch.jit.Final[str]
418419

419420
def __init__(
@@ -534,23 +535,25 @@ def __prepare_scriptable__(self):
534535

535536

536537
class BERTTokenizer(Module):
537-
__jit_unused_properties__ = ["is_jitable"]
538538
"""
539539
Transform for BERT Tokenizer.
540540
541541
Based on WordPiece algorithm introduced in paper:
542542
https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf
543543
544-
The backend kernel implementation is the modified form of https://github.com/LieluoboAi/radish.
545-
See https://github.com/pytorch/text/pull/1707 summary for more details.
544+
The backend kernel implementation is taken and modified from https://github.com/LieluoboAi/radish.
545+
546+
See PR https://github.com/pytorch/text/pull/1707 summary for more details.
546547
547548
The code snippet below shows how to use the BERT tokenizer with a pre-trained vocab file.
549+
548550
Example
549551
>>> from torchtext.transforms import BERTTokenizer
550552
>>> VOCAB_FILE = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"
551553
>>> tokenizer = BERTTokenizer(vocab_path=VOCAB_FILE, do_lower_case=True, return_tokens=True)
552554
>>> tokenizer("Hello World, How are you!") # single sentence input
553555
>>> tokenizer(["Hello World","How are you!"]) # batch input
556+
554557
:param vocab_path: Path to pre-trained vocabulary file. The path can be either local or URL.
555558
:type vocab_path: str
556559
:param do_lower_case: Indicate whether to convert the input text to lower case. (default: True)
@@ -561,6 +564,8 @@ class BERTTokenizer(Module):
561564
:type return_tokens: bool
562565
"""
563566

567+
__jit_unused_properties__ = ["is_jitable"]
568+
564569
def __init__(
565570
self, vocab_path: str, do_lower_case: bool = True, strip_accents: Optional[bool] = None, return_tokens=False
566571
) -> None:

0 commit comments

Comments
 (0)