Skip to content

Commit ca4b79d

Browse files
authored
Add override to destructor (#29)
* Add override to destructor Summary: As titled. Test Plan: CI Reviewers: Subscribers: Tasks: Tags:
* Add linter job Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
* Fix workflow Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
* Fix workflow Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
* Fix workflow Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
* Copy pytorch/vision setup-env.sh Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
* Chmod Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
* Change lint.yml Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
* Test something Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
* Try again Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
* Do not add lint.yml just yet Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent 8328fa0 commit ca4b79d

31 files changed

+581
-586
lines changed

.flake8

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
[flake8]
2+
select = B,C,E,F,P,W,B9,TOR0,TOR1,TOR2
3+
max-line-length = 80
4+
ignore =
5+
# Black conflicts and overlaps.
6+
B950,
7+
E111,
8+
E115,
9+
E117,
10+
E121,
11+
E122,
12+
E123,
13+
E124,
14+
E125,
15+
E126,
16+
E127,
17+
E128,
18+
E129,
19+
E131,
20+
E201,
21+
E202,
22+
E203,
23+
E221,
24+
E222,
25+
E225,
26+
E226,
27+
E227,
28+
E231,
29+
E241,
30+
E251,
31+
E252,
32+
E261,
33+
E262,
34+
E265,
35+
E271,
36+
E272,
37+
E301,
38+
E302,
39+
E303,
40+
E305,
41+
E306,
42+
E501,
43+
E502,
44+
E701,
45+
E702,
46+
E703,
47+
E704,
48+
W291,
49+
W292,
50+
W293,
51+
W391,
52+
W504,
53+
54+
# Too opinionated.
55+
E265,
56+
E266,
57+
E402,
58+
E722,
59+
B001,
60+
P207,
61+
B003,
62+
P208,
63+
C403,
64+
W503,
65+
66+
# Bugbear has opinions: https://github.com/PyCQA/flake8-bugbear#opinionated-warnings
67+
B904,
68+
B905,
69+
B906,
70+
B907,
71+
exclude =
72+
./.git,
73+
./backends/xnnpack/third-party,
74+
./build,
75+
./configurations,
76+
./docs,
77+
./third_party,
78+
*.pyi
79+
80+
max-complexity = 12

.lintrunner.toml

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ init_command = [
2929
[[linter]]
3030
code = 'UFMT'
3131
include_patterns = [
32-
'**/*.py',
33-
'**/*.pyi',
32+
'*.py',
33+
'*.pyi',
3434
]
3535
exclude_patterns = [
3636
'third-party/**',
@@ -135,3 +135,33 @@ command = [
135135
'@{{PATHSFILE}}',
136136
]
137137
is_formatter = true
138+
139+
[[linter]]
140+
code = 'MYPY'
141+
include_patterns = [
142+
'*.py',
143+
]
144+
exclude_patterns = [
145+
'third-party/**',
146+
]
147+
command = [
148+
'python',
149+
'-m',
150+
'lintrunner_adapters',
151+
'run',
152+
'mypy_linter',
153+
'--config=.mypy.ini',
154+
'--show-disable',
155+
'--',
156+
'--explicit-package-bases',
157+
'@{{PATHSFILE}}'
158+
]
159+
init_command = [
160+
'python',
161+
'-m',
162+
'lintrunner_adapters',
163+
'run',
164+
'pip_init',
165+
'--dry-run={{DRYRUN}}',
166+
'--requirement=requirements-lintrunner.txt',
167+
]

.mypy.ini

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
[mypy]
2+
allow_redefinition = True
3+
warn_unused_configs = True
4+
warn_redundant_casts = True
5+
show_error_codes = True
6+
show_column_numbers = True
7+
disallow_untyped_decorators = True
8+
follow_imports = normal
9+
local_partial_types = True
10+
enable_error_code = possibly-undefined
11+
warn_unused_ignores = False
12+
13+
mypy_path = pytorch_tokenizers
14+
15+
[mypy-buck_util]
16+
ignore_missing_imports = True
17+
18+
[mypy-docutils.*]
19+
ignore_missing_imports = True
20+
21+
[mypy-pandas]
22+
ignore_missing_imports = True
23+
24+
[mypy-ruamel]
25+
ignore_missing_imports = True
26+
27+
[mypy-tomllib]
28+
ignore_missing_imports = True
29+
30+
[mypy-yaml]
31+
ignore_missing_imports = True
32+
33+
[mypy-zstd]
34+
ignore_missing_imports = True

examples/tokenize_tool/main.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
using namespace tokenizers;
2727

28-
std::string help(char* argv[]) {
28+
std::string help(char *argv[]) {
2929
std::stringstream ss;
3030
ss << "Usage: " << argv[0] << " <type> <model> <input to tokenize...>"
3131
<< std::endl
@@ -37,7 +37,7 @@ std::string help(char* argv[]) {
3737
return ss.str();
3838
}
3939

40-
int main(int argc, char* argv[]) {
40+
int main(int argc, char *argv[]) {
4141
// Check for the right number of CLI args
4242
if (argc < 4) {
4343
std::cerr << help(argv) << std::endl;
@@ -95,7 +95,7 @@ int main(int argc, char* argv[]) {
9595
// Decode
9696
std::cout << "Decoding..." << std::endl;
9797
uint64_t prev = tok_ptr->bos_tok();
98-
for (const auto& current : encoded) {
98+
for (const auto &current : encoded) {
9999
const auto decoded_result = tok_ptr->decode(prev, current);
100100
std::cout << decoded_result.get();
101101
prev = current;

include/pytorch/tokenizers/base64.h

Lines changed: 14 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ namespace base64 {
3636
using tokenizers::Error;
3737
using tokenizers::Result;
3838

39-
Result<std::string> decode(const std::string_view& input);
39+
Result<std::string> decode(const std::string_view &input);
4040

4141
namespace detail {
4242

@@ -68,12 +68,9 @@ inline Error validate(uint32_t v) {
6868
return Error::Ok;
6969
}
7070

71-
inline Error decode(const std::string_view& input, std::string& output) {
72-
TK_CHECK_OR_RETURN_ERROR(
73-
input.size() == 4,
74-
Base64DecodeFailure,
75-
"input length must be 4, got %zu",
76-
input.size());
71+
inline Error decode(const std::string_view &input, std::string &output) {
72+
TK_CHECK_OR_RETURN_ERROR(input.size() == 4, Base64DecodeFailure,
73+
"input length must be 4, got %zu", input.size());
7774

7875
uint32_t val = 0;
7976

@@ -103,14 +100,10 @@ inline Error decode(const std::string_view& input, std::string& output) {
103100
return Error::Ok;
104101
}
105102

106-
inline Error decode_1_padding(
107-
const std::string_view& input,
108-
std::string& output) {
109-
TK_CHECK_OR_RETURN_ERROR(
110-
input.size() == 3,
111-
Base64DecodeFailure,
112-
"input length must be 3, got %zu",
113-
input.size());
103+
inline Error decode_1_padding(const std::string_view &input,
104+
std::string &output) {
105+
TK_CHECK_OR_RETURN_ERROR(input.size() == 3, Base64DecodeFailure,
106+
"input length must be 3, got %zu", input.size());
114107

115108
uint32_t val = 0;
116109

@@ -134,14 +127,10 @@ inline Error decode_1_padding(
134127
return Error::Ok;
135128
}
136129

137-
inline Error decode_2_padding(
138-
const std::string_view& input,
139-
std::string& output) {
140-
TK_CHECK_OR_RETURN_ERROR(
141-
input.size() == 2,
142-
Base64DecodeFailure,
143-
"input length must be 2, got %zu",
144-
input.size());
130+
inline Error decode_2_padding(const std::string_view &input,
131+
std::string &output) {
132+
TK_CHECK_OR_RETURN_ERROR(input.size() == 2, Base64DecodeFailure,
133+
"input length must be 2, got %zu", input.size());
145134

146135
uint32_t val = 0;
147136

@@ -161,13 +150,12 @@ inline Error decode_2_padding(
161150

162151
} // namespace detail
163152

164-
inline tokenizers::Result<std::string> decode(const std::string_view& input) {
153+
inline tokenizers::Result<std::string> decode(const std::string_view &input) {
165154
TK_CHECK_OR_RETURN_ERROR(!input.empty(), Base64DecodeFailure, "empty input");
166155

167156
// Faster than `input.size() % 4`.
168157
TK_CHECK_OR_RETURN_ERROR(
169-
(input.size() & 3) == 0 && input.size() >= 4,
170-
Base64DecodeFailure,
158+
(input.size() & 3) == 0 && input.size() >= 4, Base64DecodeFailure,
171159
"input length must be larger than 4 and is multiple of 4, got %zu",
172160
input.size());
173161

include/pytorch/tokenizers/bpe_tokenizer_base.h

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -32,29 +32,27 @@ using Decoder = std::unordered_map<uint64_t, std::string>;
3232
using Re2UPtr = std::unique_ptr<re2::RE2>;
3333

3434
class BPETokenizerBase : public Tokenizer {
35-
public:
36-
Result<std::vector<uint64_t>>
37-
encode(const std::string& input, int8_t bos, int8_t eos) const override;
35+
public:
36+
Result<std::vector<uint64_t>> encode(const std::string &input, int8_t bos,
37+
int8_t eos) const override;
3838

39-
Result<std::string> decode(uint64_t prev_token, uint64_t token)
40-
const override;
39+
Result<std::string> decode(uint64_t prev_token,
40+
uint64_t token) const override;
4141

42-
protected:
42+
protected:
4343
explicit BPETokenizerBase() {}
44-
virtual ~BPETokenizerBase() {}
44+
virtual ~BPETokenizerBase() override {}
4545

4646
std::pair<std::optional<std::string>, re2::StringPiece>
47-
split_with_allowed_special_token_(
48-
re2::StringPiece& input,
49-
const Encoder& allowed_special) const;
47+
split_with_allowed_special_token_(re2::StringPiece &input,
48+
const Encoder &allowed_special) const;
5049

51-
Result<std::pair<std::vector<uint64_t>, uint64_t>> encode_with_special_token_(
52-
const std::string& text,
53-
const Encoder& allowed_special) const;
50+
Result<std::pair<std::vector<uint64_t>, uint64_t>>
51+
encode_with_special_token_(const std::string &text,
52+
const Encoder &allowed_special) const;
5453

55-
Result<std::vector<uint64_t>> byte_pair_encode_(
56-
const std::string& piece,
57-
const Encoder& encoder) const;
54+
Result<std::vector<uint64_t>> byte_pair_encode_(const std::string &piece,
55+
const Encoder &encoder) const;
5856

5957
// Protected members that derived BPE tokenizers can access and set
6058
Re2UPtr special_token_regex_;
@@ -63,13 +61,11 @@ class BPETokenizerBase : public Tokenizer {
6361
Decoder decoder_;
6462
Decoder special_token_decoder_;
6563

66-
private:
67-
virtual Error _encode(
68-
re2::StringPiece& input,
69-
std::vector<uint64_t>& ret,
70-
uint64_t& last_piece_token_len) const = 0;
64+
private:
65+
virtual Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
66+
uint64_t &last_piece_token_len) const = 0;
7167

72-
virtual void _decode(re2::StringPiece input, std::string& ret) const = 0;
68+
virtual void _decode(re2::StringPiece input, std::string &ret) const = 0;
7369
};
7470

7571
} // namespace detail

0 commit comments

Comments
 (0)