Skip to content

Commit bf530db

Browse files
lucylq and facebook-github-bot
authored and committed
Tokenizer test (#21)
Summary: Pull Request resolved: #21

Test Plan:

## OSS Build

```
cmake . -DCMAKE_INSTALL_PREFIX=cmake-out -DTOKENIZERS_BUILD_TEST=ON -Bcmake-out
cmake --build cmake-out -j9 --target install
```

Test

```
(executorch) [[email protected] /data/users/lfq/tokenizers/cmake-out (lfq.tokenizer-test)]$ ctest
Test project /data/users/lfq/tokenizers/cmake-out
    Start 1: test_base64
1/5 Test #1: test_base64 ...................... Passed 0.00 sec
    Start 2: test_llama2c_tokenizer
2/5 Test #2: test_llama2c_tokenizer ........... Passed 0.00 sec
    Start 3: test_pre_tokenizer
3/5 Test #3: test_pre_tokenizer ............... Passed 0.73 sec
    Start 4: test_sentencepiece
4/5 Test #4: test_sentencepiece ............... Passed 0.04 sec
    Start 5: test_tiktoken
5/5 Test #5: test_tiktoken .................... Passed 3.32 sec

100% tests passed, 0 tests failed out of 5

Total Test time (real) = 4.10 sec
```

## Internal

```
buck2 test fbsource//xplat/pytorch/tokenizers/test:
buck2 test fbcode//pytorch/tokenizers/test:
```

Differential Revision: D69860352

Pulled By: lucylq
1 parent 0763945 commit bf530db

File tree

6 files changed: +100 -27 lines changed

6 files changed: +100 -27 lines changed

targets.bzl

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ def define_common_targets():
1515
]),
1616
visibility = [
1717
"@EXECUTORCH_CLIENTS",
18+
"//pytorch/tokenizers/...",
1819
],
1920
header_namespace = "",
2021
)
@@ -29,12 +30,14 @@ def define_common_targets():
2930
],
3031
visibility = [
3132
"@EXECUTORCH_CLIENTS",
33+
"//pytorch/tokenizers/...",
3234
],
3335
compiler_flags = [
3436
"-D_USE_INTERNAL_STRING_VIEW",
3537
],
3638
external_deps = [
3739
"sentencepiece",
40+
"abseil-cpp",
3841
],
3942
)
4043

@@ -49,6 +52,7 @@ def define_common_targets():
4952
],
5053
visibility = [
5154
"@EXECUTORCH_CLIENTS",
55+
"//pytorch/tokenizers/...",
5256
],
5357
compiler_flags = [
5458
"-D_USE_INTERNAL_STRING_VIEW",
@@ -84,6 +88,7 @@ def define_common_targets():
8488
],
8589
visibility = [
8690
"@EXECUTORCH_CLIENTS",
91+
"//pytorch/tokenizers/...",
8792
],
8893
compiler_flags = [
8994
"-D_USE_INTERNAL_STRING_VIEW",
@@ -104,5 +109,6 @@ def define_common_targets():
104109
],
105110
visibility = [
106111
"@EXECUTORCH_CLIENTS",
112+
"//pytorch/tokenizers/...",
107113
],
108114
)

test/TARGETS

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Any targets that should be shared between fbcode and xplat must be defined in
2+
# targets.bzl. This file can contain fbcode-only targets.
3+
4+
load(":targets.bzl", "define_common_targets")
5+
6+
oncall("executorch")
7+
8+
define_common_targets()

test/targets.bzl

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
load(
2+
"@fbsource//tools/build_defs:default_platform_defs.bzl",
3+
"ANDROID",
4+
"CXX",
5+
)
6+
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
7+
8+
def define_common_targets():
9+
"""Defines targets that should be shared between fbcode and xplat.
10+
11+
The directory containing this targets.bzl file should also contain both
12+
TARGETS and BUCK files that call this function.
13+
"""
14+
runtime.cxx_test(
15+
name = "test_base64",
16+
srcs = [
17+
"test_base64.cpp",
18+
],
19+
deps = [
20+
"//pytorch/tokenizers:headers",
21+
],
22+
)
23+
24+
runtime.cxx_test(
25+
name = "test_llama2c_tokenizer",
26+
srcs = [
27+
"test_llama2c_tokenizer.cpp",
28+
],
29+
deps = [
30+
"//pytorch/tokenizers:llama2c_tokenizer",
31+
],
32+
env = {
33+
"RESOURCES_PATH": "$(location :resources)/resources",
34+
},
35+
platforms = [CXX, ANDROID], # Cannot bundle resources on Apple platform.
36+
)
37+
38+
runtime.cxx_test(
39+
name = "test_pre_tokenizer",
40+
srcs = [
41+
"test_pre_tokenizer.cpp",
42+
],
43+
deps = [
44+
"//pytorch/tokenizers:headers",
45+
"//pytorch/tokenizers:hf_tokenizer",
46+
],
47+
)
48+
49+
runtime.cxx_test(
50+
name = "test_sentencepiece",
51+
srcs = [
52+
"test_sentencepiece.cpp",
53+
],
54+
deps = ["//pytorch/tokenizers:sentencepiece"],
55+
external_deps = [
56+
"sentencepiece",
57+
"abseil-cpp",
58+
],
59+
env = {
60+
"RESOURCES_PATH": "$(location :resources)/resources",
61+
},
62+
)
63+
64+
runtime.cxx_test(
65+
name = "test_tiktoken",
66+
srcs = [
67+
"test_tiktoken.cpp",
68+
],
69+
deps = [
70+
"//pytorch/tokenizers:tiktoken",
71+
],
72+
env = {
73+
"RESOURCES_PATH": "$(location :resources)/resources",
74+
},
75+
platforms = [CXX, ANDROID], # Cannot bundle resources on Apple platform.
76+
external_deps = [
77+
"re2",
78+
],
79+
)
80+
81+
runtime.filegroup(
82+
name = "resources",
83+
srcs = native.glob([
84+
"resources/**",
85+
]),
86+
)

test/test_llama2c_tokenizer.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,6 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#ifdef TOKENIZERS_FB_BUCK
10-
#include <TestResourceUtils/TestResourceUtils.h>
11-
#endif
129
#include <gtest/gtest.h>
1310
#include <pytorch/tokenizers/llama2c_tokenizer.h>
1411

@@ -17,16 +14,9 @@ using namespace ::testing;
1714
namespace tokenizers {
1815

1916
namespace {
20-
// Test case based on llama2.c tokenizer
2117
static inline std::string _get_resource_path(const std::string& name) {
22-
#ifdef TOKENIZERS_FB_BUCK
23-
return facebook::xplat::testing::getPathForTestResource(
24-
"test/resources/" + name);
25-
#else
2618
return std::getenv("RESOURCES_PATH") + std::string("/") + name;
27-
#endif
2819
}
29-
3020
} // namespace
3121

3222
class Llama2cTokenizerTest : public Test {

test/test_sentencepiece.cpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,24 +7,15 @@
77
*/
88
// @lint-ignore-every LICENSELINT
99

10-
#ifdef TOKENIZERS_FB_BUCK
11-
#include <TestResourceUtils/TestResourceUtils.h>
12-
#endif
1310
#include <gtest/gtest.h>
1411
#include <pytorch/tokenizers/sentencepiece.h>
1512

1613
namespace tokenizers {
1714

1815
namespace {
1916
static inline std::string _get_resource_path(const std::string& name) {
20-
#ifdef TOKENIZERS_FB_BUCK
21-
return facebook::xplat::testing::getPathForTestResource(
22-
"test/resources/" + name);
23-
#else
2417
return std::getenv("RESOURCES_PATH") + std::string("/") + name;
25-
#endif
2618
}
27-
2819
} // namespace
2920

3021
TEST(SPTokenizerTest, TestEncodeWithoutLoad) {

test/test_tiktoken.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@
77
*/
88
// @lint-ignore-every LICENSELINT
99

10-
#ifdef TOKENIZERS_FB_BUCK
11-
#include <TestResourceUtils/TestResourceUtils.h>
12-
#endif
1310
#include <gtest/gtest.h>
1411
#include <pytorch/tokenizers/tiktoken.h>
1512

@@ -45,12 +42,7 @@ static inline std::unique_ptr<std::vector<std::string>> _get_special_tokens() {
4542
}
4643

4744
static inline std::string _get_resource_path(const std::string& name) {
48-
#ifdef TOKENIZERS_FB_BUCK
49-
return facebook::xplat::testing::getPathForTestResource(
50-
"test/resources/" + name);
51-
#else
5245
return std::getenv("RESOURCES_PATH") + std::string("/") + name;
53-
#endif
5446
}
5547

5648
} // namespace

0 commit comments

Comments
 (0)