-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtokenizer.py
More file actions
50 lines (41 loc) · 1.67 KB
/
tokenizer.py
File metadata and controls
50 lines (41 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""
Tokenize text using HuggingFace tokenizers and dump token IDs.
Usage:
python python/tokenizer.py "The capital of France is" --model meta-llama/Llama-3.2-1B
python python/tokenizer.py --decode --ids 791 6864 315 9822 374 --model meta-llama/Llama-3.2-1B
The C++ engine reads/writes plain text files of token IDs.
"""
import argparse
import json
from transformers import AutoTokenizer
def main():
    """CLI entry point: tokenize text to token IDs, or decode IDs back to text.

    Encode mode (default): prints space-delimited token IDs to stdout, or
    writes them as a JSON array when --output is given.
    Decode mode (--decode with --ids): prints the decoded text.

    Exits via parser.error() (status 2) when the mode's required input is
    missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("text", nargs="?", help="Text to tokenize")
    parser.add_argument("--model", default="meta-llama/Llama-3.2-1B")
    parser.add_argument("--decode", action="store_true", help="Decode mode: IDs → text")
    parser.add_argument("--ids", nargs="+", type=int, help="Token IDs to decode")
    parser.add_argument("--output", "-o", help="Output file (default: stdout)")
    args = parser.parse_args()
    # Downloads/loads the tokenizer for the requested model.
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    if args.decode:
        if not args.ids:
            parser.error("--decode requires --ids")
        print(tokenizer.decode(args.ids))
    else:
        if not args.text:
            parser.error("Provide text to tokenize")
        ids = tokenizer.encode(args.text)
        if args.output:
            with open(args.output, "w") as f:
                json.dump(ids, f)
            print(f"Wrote {len(ids)} tokens to {args.output}")
        else:
            # str.join is linear; the previous manual += loop was quadratic
            # and shadowed the builtin `id`.
            print(" ".join(str(token_id) for token_id in ids))
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()