@@ -272,7 +272,6 @@ def forward(self, input: Any) -> Any:
272
272
273
273
274
274
class GPT2BPETokenizer (Module ):
275
- __jit_unused_properties__ = ["is_jitable" ]
276
275
"""
277
276
Transform for GPT-2 BPE Tokenizer.
278
277
@@ -286,6 +285,8 @@ class GPT2BPETokenizer(Module):
286
285
:param return_tokens: Indicate whether to return split tokens. If False, it will return encoded token IDs as strings (default: False)
287
286
:type return_tokens: bool
288
287
"""
288
+
289
+ __jit_unused_properties__ = ["is_jitable" ]
289
290
_seperator : torch .jit .Final [str ]
290
291
291
292
def __init__ (self , encoder_json_path : str , vocab_bpe_path : str , return_tokens : bool = False ):
@@ -382,7 +383,6 @@ def __prepare_scriptable__(self):
382
383
383
384
384
385
class CLIPTokenizer (Module ):
385
- __jit_unused_properties__ = ["is_jitable" ]
386
386
"""
387
387
Transform for CLIP Tokenizer. Based on Byte-Level BPE.
388
388
@@ -414,6 +414,7 @@ class CLIPTokenizer(Module):
414
414
:type return_tokens: bool
415
415
"""
416
416
417
+ __jit_unused_properties__ = ["is_jitable" ]
417
418
_seperator : torch .jit .Final [str ]
418
419
419
420
def __init__ (
@@ -534,23 +535,25 @@ def __prepare_scriptable__(self):
534
535
535
536
536
537
class BERTTokenizer (Module ):
537
- __jit_unused_properties__ = ["is_jitable" ]
538
538
"""
539
539
Transform for BERT Tokenizer.
540
540
541
541
Based on WordPiece algorithm introduced in paper:
542
542
https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf
543
543
544
- The backend kernel implementation is the modified form of https://github.com/LieluoboAi/radish.
545
- See https://github.com/pytorch/text/pull/1707 summary for more details.
544
+ The backend kernel implementation is taken and modified from https://github.com/LieluoboAi/radish.
545
+
546
+ See PR https://github.com/pytorch/text/pull/1707 summary for more details.
546
547
547
548
The below code snippet shows how to use the BERT tokenizer using the pre-trained vocab files.
549
+
548
550
Example
549
551
>>> from torchtext.transforms import BERTTokenizer
550
552
>>> VOCAB_FILE = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"
551
553
>>> tokenizer = BERTTokenizer(vocab_path=VOCAB_FILE, do_lower_case=True, return_tokens=True)
552
554
>>> tokenizer("Hello World, How are you!") # single sentence input
553
555
>>> tokenizer(["Hello World","How are you!"]) # batch input
556
+
554
557
:param vocab_path: Path to pre-trained vocabulary file. The path can be either local or URL.
555
558
:type vocab_path: str
556
559
:param do_lower_case: Indicate whether to do lower case. (default: True)
@@ -561,6 +564,8 @@ class BERTTokenizer(Module):
561
564
:type return_tokens: bool
562
565
"""
563
566
567
+ __jit_unused_properties__ = ["is_jitable" ]
568
+
564
569
def __init__ (
565
570
self , vocab_path : str , do_lower_case : bool = True , strip_accents : Optional [bool ] = None , return_tokens = False
566
571
) -> None :
0 commit comments