Modern-Text-Tokenizer/Modern-Text-Tokenizer.cpp at main · Mecanik/Modern-Text-Tokenizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
#include "Modern-Text-Tokenizer.hpp"
#include <chrono>

using namespace std;
using namespace MecanikDev;

// Writes a three-line banner to std::cout: a blank line followed by a rule of
// fifty '=' characters, the title indented by two spaces, and a closing rule.
// Each line is terminated with std::endl.
void print_separator(const std::string& title) {
	const std::string rule(50, '=');

	std::cout << "\n" << rule << std::endl;
	std::cout << "  " << title << std::endl;
	std::cout << rule << std::endl;
}

// Runs a default-constructed TextTokenizer over a handful of sample sentences.
// For each sentence it prints the input, the resulting tokens (single-quoted and
// comma-separated) and the token count.
void test_basic_tokenization() {
	print_separator("BASIC TOKENIZATION TEST");

	TextTokenizer tokenizer;

	const std::vector<std::string> samples = {
		"Hello, world!",
		"This is a test sentence.",
		"Natural language processing with C++",
		"The quick brown fox jumps over the lazy dog."
	};

	for (const std::string& sample : samples) {
		const auto pieces = tokenizer.tokenize(sample);

		std::cout << "Text: \"" << sample << "\"" << std::endl;
		std::cout << "Tokens: ";
		for (size_t idx = 0; idx < pieces.size(); ++idx) {
			// The separator precedes every token except the first one.
			if (idx > 0) std::cout << ", ";
			std::cout << "'" << pieces[idx] << "'";
		}
		std::cout << " (" << pieces.size() << " tokens)" << std::endl << std::endl;
	}
}

// Attempts to load "vocab.txt" into a fresh TextTokenizer. On failure it prints
// a hint on where to obtain the file; on success it prints the vocabulary size
// and the IDs reported for the [PAD], [UNK], [CLS] and [SEP] tokens.
void test_distilbert_vocab_loading() {
	print_separator("DISTILBERT VOCABULARY LOADING");

	TextTokenizer tokenizer;

	std::cout << "Loading DistilBERT vocabulary from 'vocab.txt'..." << std::endl;

	const bool loaded = tokenizer.load_vocab("vocab.txt");
	if (!loaded) {
		// Nothing else to report without a vocabulary; tell the user how to get one.
		std::cout << "Failed to load vocabulary!" << std::endl;
		std::cout << "Make sure 'vocab.txt' exists in the current directory." << std::endl;
		std::cout << "You can download it from:" << std::endl;
		std::cout << "https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad/raw/main/vocab.txt" << std::endl;
		return;
	}

	std::cout << "Successfully loaded vocabulary." << std::endl;
	std::cout << "Vocabulary size: " << tokenizer.vocab_size() << " tokens" << std::endl;

	// Report the IDs of the special tokens.
	std::cout << "\nSpecial Token IDs:" << std::endl;
	std::cout << "  [PAD]: " << tokenizer.get_pad_id() << std::endl;
	std::cout << "  [UNK]: " << tokenizer.get_unk_id() << std::endl;
	std::cout << "  [CLS]: " << tokenizer.get_cls_id() << std::endl;
	std::cout << "  [SEP]: " << tokenizer.get_sep_id() << std::endl;
}

// Exercises tokenize(), encode() and decode() on a few sentences using the
// vocabulary in "vocab.txt" (returns early with a message if it cannot be
// loaded). For each sentence the tokens, token IDs and decoded text are printed,
// followed by a simple round-trip verdict.
void test_encoding_decoding() {
	print_separator("ENCODING & DECODING TEST");

	TextTokenizer tokenizer;

	const bool vocab_ready = tokenizer.load_vocab("vocab.txt");
	if (!vocab_ready) {
		std::cout << "Cannot test encoding/decoding without vocabulary!" << std::endl;
		return;
	}

	// Tokenizer configuration: lowercase input (typical for DistilBERT), split on
	// punctuation and keep the punctuation marks.
	tokenizer
		.set_lowercase(true)
		.set_split_on_punctuation(true)
		.set_keep_punctuation(true);

	const std::vector<std::string> samples = {
		"Hello world!",
		"This is a test.",
		"Machine learning is awesome.",
		"Natural language processing with transformers."
	};

	for (const std::string& sample : samples) {
		std::cout << "\nOriginal: \"" << sample << "\"" << std::endl;

		// Step 1: text -> string tokens.
		const auto pieces = tokenizer.tokenize(sample);
		std::cout << "Tokens: ";
		for (size_t idx = 0; idx < pieces.size(); ++idx) {
			if (idx > 0) std::cout << ", ";
			std::cout << "'" << pieces[idx] << "'";
		}
		std::cout << std::endl;

		// Step 2: text -> token IDs.
		auto ids = tokenizer.encode(sample);
		std::cout << "Token IDs: ";
		for (size_t idx = 0; idx < ids.size(); ++idx) {
			if (idx > 0) std::cout << ", ";
			std::cout << ids[idx];
		}
		std::cout << std::endl;

		// Step 3: token IDs -> text.
		auto decoded = tokenizer.decode(ids);
		std::cout << "Decoded: \"" << decoded << "\"" << std::endl;

		// Round-trip verdict: the part of the input before its first '.', ',',
		// '!' or '?' (the whole input if none is present) must occur in the
		// decoded text.
		// NOTE(review): the comparison uses the original-cased input although the
		// tokenizer is configured with set_lowercase(true) - confirm decode()
		// yields text for which this case-sensitive search can succeed.
		const std::string::size_type first_punct = sample.find_first_of(".,!?");
		const std::string leading_text = sample.substr(0, first_punct);
		const bool round_trip_ok = decoded.find(leading_text) != std::string::npos;

		if (round_trip_ok) {
			std::cout << "Round-trip successful!" << std::endl;
		}
		else {
			std::cout << "Round-trip differences detected" << std::endl;
		}
	}
}

// Demonstrates encode_sequence() on a few sentences using the vocabulary in
// "vocab.txt" (returns early with a message if it cannot be loaded). For each
// sentence it prints the returned ID sequence, its length, and a per-ID
// breakdown in which the special-token IDs are shown by name.
void test_sequence_encoding() {
	print_separator("SEQUENCE ENCODING FOR ML");

	TextTokenizer tokenizer;

	const bool vocab_ready = tokenizer.load_vocab("vocab.txt");
	if (!vocab_ready) {
		std::cout << "Cannot test sequence encoding without vocabulary!" << std::endl;
		return;
	}

	tokenizer
		.set_lowercase(true)
		.set_split_on_punctuation(true)
		.set_keep_punctuation(true);

	const std::vector<std::string> sentences = {
		"What is machine learning?",
		"Transformers are powerful neural networks.",
		"BERT revolutionized natural language processing."
	};

	std::cout << "Encoding sequences for ML models (max_length=20):" << std::endl;

	for (const std::string& sentence : sentences) {
		std::cout << "\nSentence: \"" << sentence << "\"" << std::endl;

		// Arguments: the text, 20 (the max_length announced above) and true
		// (BERT-style special tokens requested).
		auto sequence_ids = tokenizer.encode_sequence(sentence, 20, true);

		std::cout << "Sequence IDs: [";
		for (size_t pos = 0; pos < sequence_ids.size(); ++pos) {
			if (pos > 0) std::cout << ", ";
			std::cout << sequence_ids[pos];
		}
		std::cout << "]" << std::endl;
		std::cout << "Length: " << sequence_ids.size() << " tokens" << std::endl;

		// Translate every ID back into something readable. Special tokens are
		// checked first, in the order CLS, SEP, PAD, UNK; anything else is looked
		// up through get_token_by_id().
		std::cout << "Token breakdown: ";
		for (size_t pos = 0; pos < sequence_ids.size(); ++pos) {
			if (pos > 0) std::cout << " ";

			int id = sequence_ids[pos];
			if (id == tokenizer.get_cls_id()) {
				std::cout << "[CLS]";
			}
			else if (id == tokenizer.get_sep_id()) {
				std::cout << "[SEP]";
			}
			else if (id == tokenizer.get_pad_id()) {
				std::cout << "[PAD]";
			}
			else if (id == tokenizer.get_unk_id()) {
				std::cout << "[UNK]";
			}
			else {
				std::cout << tokenizer.get_token_by_id(id);
			}
		}
		std::cout << std::endl;
	}
}

// Times tokenize(), encode() and decode() on a large synthetic input and prints
// the per-stage durations, the total time and the resulting throughput.
//
// The input is one sentence repeated 1000 times (each copy followed by a space).
// encode() is given the raw text, not the tokens produced by the preceding
// tokenize() call; decode() is given the IDs returned by encode().
// Returns early with a message if "vocab.txt" cannot be loaded.
void test_performance() {
	print_separator("PERFORMANCE TEST");

	TextTokenizer tokenizer;

	if (!tokenizer.load_vocab("vocab.txt")) {
		std::cout << "Cannot test performance without vocabulary!" << std::endl;
		return;
	}

	tokenizer
		.set_lowercase(true)
		.set_split_on_punctuation(true);

	// Create a large test text
	const std::string base_text = "Natural language processing is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language.";
	const size_t repetitions = 1000;

	// Reserve the final size up front and append in place, so building the input
	// needs a single allocation and no per-iteration temporary string.
	std::string large_text;
	large_text.reserve((base_text.size() + 1) * repetitions);
	for (size_t i = 0; i < repetitions; ++i) {
		large_text += base_text;
		large_text += ' ';
	}

	std::cout << "Performance test with " << large_text.size() << " characters" << std::endl;

	// steady_clock is monotonic, which makes it the right clock for measuring
	// intervals; high_resolution_clock may be an alias of system_clock and can
	// then jump when the wall clock is adjusted.
	using Clock = std::chrono::steady_clock;

	// Test tokenization performance
	const auto start_time = Clock::now();
	auto tokens = tokenizer.tokenize(large_text);
	const auto tokenize_time = Clock::now();

	// Test encoding performance
	auto token_ids = tokenizer.encode(large_text);
	const auto encode_time = Clock::now();

	// Test decoding performance
	auto decoded = tokenizer.decode(token_ids);
	const auto decode_time = Clock::now();

	// Calculate durations (each truncated to whole microseconds)
	const auto tokenize_duration = std::chrono::duration_cast<std::chrono::microseconds>(tokenize_time - start_time);
	const auto encode_duration = std::chrono::duration_cast<std::chrono::microseconds>(encode_time - tokenize_time);
	const auto decode_duration = std::chrono::duration_cast<std::chrono::microseconds>(decode_time - encode_time);

	std::cout << "\nResults:" << std::endl;
	std::cout << "  Tokenization: " << tokenize_duration.count() << " μs (" << tokens.size() << " tokens)" << std::endl;
	std::cout << "  Encoding:     " << encode_duration.count() << " μs" << std::endl;
	std::cout << "  Decoding:     " << decode_duration.count() << " μs" << std::endl;

	// Calculate throughput: input size in MiB divided by the summed stage time
	// in seconds.
	const double total_time_ms = (tokenize_duration.count() + encode_duration.count() + decode_duration.count()) / 1000.0;
	const double throughput_mb_s = (large_text.size() / 1024.0 / 1024.0) / (total_time_ms / 1000.0);

	std::cout << "  Total time:   " << std::fixed << std::setprecision(2) << total_time_ms << " ms" << std::endl;
	std::cout << "  Throughput:   " << std::fixed << std::setprecision(2) << throughput_mb_s << " MB/s" << std::endl;
}

// Feeds a set of unusual inputs (empty, whitespace-only, punctuation-only,
// non-ASCII text, e-mail, URL, contractions, ...) through tokenize() and prints
// the resulting tokens. If the tokenizer reports a vocabulary via has_vocab(),
// the encode() IDs are printed as well. Unlike the other demos this one keeps
// going when "vocab.txt" cannot be loaded.
void test_edge_cases() {
	print_separator("EDGE CASES TEST");

	TextTokenizer tokenizer;

	// The tokenizer is only configured when the vocabulary loads successfully.
	const bool vocab_loaded = tokenizer.load_vocab("vocab.txt");
	if (vocab_loaded) {
		tokenizer
			.set_lowercase(true)
			.set_split_on_punctuation(true)
			.set_keep_punctuation(true);
	}

	const std::vector<std::string> inputs = {
		"",									// Empty string
		"   ",								// Only whitespace
		"Hello",							// Single word
		"!!!",								// Only punctuation
		"Hello123World",					// Mixed alphanumeric
		"café naïve résumé",				// Accented characters
		"你好世界",							// Chinese characters
		"🚀🌟💡",							// Emojis
		"C++ vs Python vs Rust",			// Programming languages
		"user@example.com",					// Email
		"https://www.example.com",			// URL
		"It's a beautiful day, isn't it?"	// Contractions
	};

	for (const std::string& input : inputs) {
		const auto pieces = tokenizer.tokenize(input);
		std::cout << "Input: \"" << input << "\"" << std::endl;
		std::cout << "Tokens (" << pieces.size() << "): ";

		for (size_t idx = 0; idx < pieces.size(); ++idx) {
			if (idx > 0) std::cout << ", ";
			std::cout << "'" << pieces[idx] << "'";
		}
		std::cout << std::endl;

		// IDs are only shown when a vocabulary is available.
		if (tokenizer.has_vocab()) {
			const auto ids = tokenizer.encode(input);
			std::cout << "IDs: ";
			for (size_t idx = 0; idx < ids.size(); ++idx) {
				if (idx > 0) std::cout << ", ";
				std::cout << ids[idx];
			}
			std::cout << std::endl;
		}
		std::cout << std::endl;
	}
}

// Entry point: prints a header, runs every demo in a fixed order, prints a
// footer and returns 0.
int main()
{
	std::cout << "Vocabulary Tokenizer Demo" << std::endl;
	std::cout << "=======================================" << std::endl;

	// Demos in the order they are executed.
	using DemoFn = void (*)();
	const DemoFn demos[] = {
		test_basic_tokenization,
		test_distilbert_vocab_loading,
		test_encoding_decoding,
		test_sequence_encoding,
		test_performance,
		test_edge_cases
	};

	for (const DemoFn run_demo : demos) {
		run_demo();
	}

	std::cout << "Demo completed!" << std::endl;
	std::cout << "=======================================" << std::endl;

	return 0;
}