In our previous tutorials, we explored the fundamentals of tokenization and built a complete Byte Pair Encoding (BPE) tokenizer from scratch.
If you haven’t read them yet, you can start with Part 1: Understanding Tokenization Basics and then move on to Part 2: Building a BPE Tokenizer from Scratch to understand how raw text is converted into token sequences.
In this part, we take the next step by introducing regex-based tokenization. By applying carefully designed regular expressions, we can significantly improve how tokens are segmented—capturing meaningful words, numbers, and symbols instead of fragmented character sequences. This approach helps produce cleaner, more semantically useful tokens that are better suited for modern language models.
The Problem with Basic BPE
First, let's look again at a sample of what our original BPE tokenizer produces:

We can see that the tokenizer learns gibberish fragments like [, ] and [t h]. For real use cases we need a tokenizer that extracts meaningful, self-contained tokens. If it does not, then when the tokens are fed into a language model, the transformer's attention mechanism spends capacity reconstructing how words are formed instead of capturing meaning. To mitigate this and speed up training, we first split the text stream into chunks wherever words, subwords, numbers, or other meaningful pieces occur, and only then let our tokenizer merge within each chunk. For the splitting we can use Python's regex library (a third-party, drop-in extension of the built-in re module that supports \p{...} Unicode classes), together with the split pattern used by GPT-2. The pattern is as follows:
'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+

Understanding the GPT-2 Pattern
Now let's break down what each part of the pattern looks for:

- '(?:[sdmt]|ll|ve|re) matches English contractions that start with an apostrophe, so it captures 's, 'd, 're, 've, and so on.
- ?\p{L}+ (an optional leading space followed by one or more letters) matches whole words together with their leading space, e.g. ' hello' or 'hi'.
- ?\p{N}+ does the same for runs of digits, ?[^\s\p{L}\p{N}]+ captures runs of other symbols (punctuation, emoji, and so on) that sit isolated in the sentence stream, and \s+(?!\S) together with \s+ handle the remaining whitespace.

Every chunk the regex produces, whether a word, a number, or an isolated symbol, is therefore a self-contained, indivisible unit, and these chunks are what our tokenizer will process. Let's see the pattern in action, and then implement it.
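Here is a quick check of how the pattern chunks a toy sentence (the sentence itself is just an illustrative example):

import regex as re  # third-party `regex` module; the built-in `re` does not support \p{...}

GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

print(re.findall(GPT2_SPLIT_PATTERN, "Hello world! I've got 123 apples."))
# ['Hello', ' world', '!', ' I', "'ve", ' got', ' 123', ' apples', '.']

Each chunk is a word with its leading space, a contraction suffix, a number, or a punctuation run, and the BPE merges we learn next will happen only inside these chunks, never across them.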
Implementing RegexTokenizer
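The class below reuses the get_stats, merge, and render_token helpers from Part 2. In case you don't have them handy, here is a minimal sketch of what they are assumed to look like (matching the behaviour the rest of the code relies on):

import unicodedata

def get_stats(ids, counts=None):
    # count consecutive pairs; if `counts` is given, accumulate into it in place
    counts = {} if counts is None else counts
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(ids, pair, idx):
    # replace every occurrence of `pair` in `ids` with the new token id `idx`
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids

def render_token(t):
    # decode bytes for display, escaping control characters so the .vocab file stays readable
    s = t.decode("utf-8", errors="replace")
    return "".join(ch if unicodedata.category(ch)[0] != "C" else f"\\u{ord(ch):04x}" for ch in s)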
import regex as re
GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
class RegexTokenizer:
    def __init__(self):
        self.merges = {}
        self.pattern = GPT2_SPLIT_PATTERN
        self.compiled_pattern = re.compile(self.pattern)
        self.special_tokens = {}  # e.g. <|endoftext|>
        self.vocab = self._build_vocab()

    def _build_vocab(self):
        # vocab = 256 base bytes + learned merges + any special tokens
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        for special, idx in self.special_tokens.items():
            vocab[idx] = special.encode("utf-8")
        return vocab

    def train(self, text, vocab_size, verbose=False):
        assert vocab_size >= 256
        num_merges = vocab_size - 256
        text_chunks = re.findall(self.compiled_pattern, text)  # split the text into regex chunks
        ids = [list(ch.encode("utf-8")) for ch in text_chunks]  # encode each chunk to bytes individually
        merges = {}
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for i in range(num_merges):
            # count pair frequencies across all chunks (pairs never cross chunk boundaries)
            stats = {}
            for chunk_ids in ids:
                get_stats(chunk_ids, stats)
            pair = max(stats, key=stats.get)
            idx = 256 + i
            ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]
            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
            if verbose:
                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
        self.merges = merges  # used in encode
        self.vocab = vocab    # used in decode

    def register_special_tokens(self, special_tokens):
        self.special_tokens = special_tokens
        self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}

    def decode(self, ids):
        part_bytes = []
        for idx in ids:
            if idx in self.vocab:
                part_bytes.append(self.vocab[idx])
            elif idx in self.inverse_special_tokens:
                part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
            else:
                raise ValueError(f"invalid token id: {idx}")
        text_bytes = b"".join(part_bytes)
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    def _encode_chunk(self, text_bytes):
        ids = list(text_bytes)
        while len(ids) >= 2:
            stats = get_stats(ids)
            # pick the pair with the lowest merge index, i.e. the one learned earliest
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # nothing left to merge
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids

    def encode_ordinary(self, text):
        """Encoding that ignores any special tokens."""
        text_chunks = re.findall(self.compiled_pattern, text)
        ids = []
        for chunk in text_chunks:
            chunk_bytes = chunk.encode("utf-8")  # raw bytes
            chunk_ids = self._encode_chunk(chunk_bytes)
            ids.extend(chunk_ids)
        return ids

    def encode(self, text, allowed_special="none_raise"):
        special = None
        if allowed_special == "all":
            special = self.special_tokens
        elif allowed_special == "none":
            special = {}
        elif allowed_special == "none_raise":
            special = {}
            assert all(token not in text for token in self.special_tokens)
        elif isinstance(allowed_special, set):
            special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
        else:
            raise ValueError(f"allowed_special={allowed_special} not understood")
        if not special:
            # shortcut: if no special tokens, just use the ordinary encoding
            return self.encode_ordinary(text)
        special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
        special_chunks = re.split(special_pattern, text)
        # now all the special tokens are separated from the rest of the text;
        # all chunks of text are encoded separately, then the results are joined
        ids = []
        for part in special_chunks:
            if part in special:
                ids.append(special[part])
            else:
                ids.extend(self.encode_ordinary(part))
        return ids

    def save(self, file_prefix):
        '''
        Two files are saved: file_prefix.model and file_prefix.vocab.
        The model file is the important one (it is what load() reads);
        the vocab file is only for human inspection.
        '''
        model_file = file_prefix + '.model'
        with open(model_file, 'w') as f:
            f.write('bpe v1\n')
            f.write(f"{self.pattern}\n")
            # write the special tokens
            f.write(f"{len(self.special_tokens)}\n")
            for special, idx in self.special_tokens.items():
                f.write(f"{special} {idx}\n")
            # write the merges (only the pairs; the ids are implied by order)
            for idx1, idx2 in self.merges:
                f.write(f"{idx1} {idx2}\n")
        # write the vocab file (inspection only)
        vocab_file = file_prefix + ".vocab"
        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
        with open(vocab_file, "w", encoding="utf-8") as f:
            for idx, token in self.vocab.items():
                s = render_token(token)
                if idx in inverted_merges:
                    idx0, idx1 = inverted_merges[idx]
                    s0 = render_token(self.vocab[idx0])
                    s1 = render_token(self.vocab[idx1])
                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
                else:
                    f.write(f"[{s}] {idx}\n")

    def load(self, model_file):
        assert model_file.endswith(".model")
        # read the model file
        merges = {}
        special_tokens = {}
        idx = 256
        with open(model_file, 'r', encoding="utf-8") as f:
            # read the version
            version = f.readline().strip()
            assert version == "bpe v1"
            self.pattern = f.readline().strip()
            # read the special tokens
            num_special = int(f.readline().strip())
            for _ in range(num_special):
                special, special_idx = f.readline().strip().split()
                special_tokens[special] = int(special_idx)
            # read the merges
            for line in f:
                idx1, idx2 = map(int, line.split())
                merges[(idx1, idx2)] = idx
                idx += 1
        self.merges = merges
        self.special_tokens = special_tokens
        self.vocab = self._build_vocab()

Training the RegexTokenizer
text = open("data.txt", "r", encoding="utf-8").read()
tokenizer = RegexTokenizer()
tokenizer.train(text, vocab_size=512, verbose=True)  # vocab_size of 512 is just an example target
tokenizer.save("model2")
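Once training finishes, a quick round trip is a good sanity check. Below is a short sketch; the sample sentence and the special-token id of 512 are just illustrative choices (any id at or above the vocab size works):

# round-trip check: encode then decode should reproduce the input exactly
sample = "Hello world! I've got 123 apples."
ids = tokenizer.encode_ordinary(sample)
print(ids)
print(tokenizer.decode(ids) == sample)  # True

# special tokens live outside the merge range (256..vocab_size-1)
tokenizer.register_special_tokens({"<|endoftext|>": 512})
ids = tokenizer.encode("Hello world!<|endoftext|>", allowed_special="all")
print(ids[-1])                # 512, the id of <|endoftext|>
print(tokenizer.decode(ids))  # "Hello world!<|endoftext|>"

If the round trip reproduces the input byte for byte, the merge table and the regex chunking are working together correctly.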
Now let's look at the vocabulary our new model2 tokenizer has learned; the model2.vocab file should look similar to this:

We can see that the tokens now seem much more meaningful.
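We can also confirm that serialization round-trips by loading model2.model back into a fresh tokenizer and printing a few of the learned tokens. A quick sketch (the actual merged tokens depend entirely on your training data, and this assumes the 512-token training run above):

loaded = RegexTokenizer()
loaded.load("model2.model")

# print the first few learned merges (ids 256 and up); the bytes shown depend on data.txt
for idx in range(256, 266):
    print(idx, loaded.vocab[idx])

# the reloaded tokenizer should encode text exactly like the one we trained
assert loaded.encode_ordinary("Hello world") == tokenizer.encode_ordinary("Hello world")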
Conclusion
And there we have it: we have implemented a tokenizer that extracts meaningful chunks from a text stream. How was your experience? Feel free to refer to the notebook or scripts if you get stuck anywhere. With this, we have learned and implemented a basic building block of modern language models.
Explore More Resources
Main Website: biterdevs.com
Tokenizer Tutorial Series:
- Part 1: Tokenizer: Understanding and building one
- Part 2: Tokenizer Class: Building a Complete BPE Implementation
- Part 3: Applying RegexBase to achieve usable tokens (You are here)
Related Blog Posts:
- Agentic AI: Stanford ACE Framework Guide
- AI Memory: The Genius Concept 2025
- Time Zone Planner Guide: Zoneing
- AI Cognitive Debt: Memory Loss
Blog Home: blog.biterdevs.com


