GPT-4 Tokenizer
🌍
References & Disclaimer
This content is adapted from the course *A Deep Understanding of AI Language Model Mechanisms*. It has been curated and organized for educational purposes in this portfolio. No copyright infringement is intended.
import numpy as np
import matplotlib.pyplot as plt
# matplotlib defaults
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')# need to install the tiktoken library to get OpenAI's tokenizer
# note: it's tik-token, not tiktok-en :P
!pip install tiktoken
import tiktokenExecution Output
Requirement already satisfied: tiktoken in /Users/drippy/.pyenv/versions/3.12.6/lib/python3.12/site-packages (0.9.0)
Requirement already satisfied: regex>=2022.1.18 in /Users/drippy/.pyenv/versions/3.12.6/lib/python3.12/site-packages (from tiktoken) (2024.11.6)
Requirement already satisfied: requests>=2.26.0 in /Users/drippy/.pyenv/versions/3.12.6/lib/python3.12/site-packages (from tiktoken) (2.32.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /Users/drippy/.pyenv/versions/3.12.6/lib/python3.12/site-packages (from requests>=2.26.0->tiktoken) (3.4.1)
Requirement already satisfied: idna<4,>=2.5 in /Users/drippy/.pyenv/versions/3.12.6/lib/python3.12/site-packages (from requests>=2.26.0->tiktoken) (3.10)# GPT-4's tokenizer
# instantiate GPT-4's BPE tokenizer ('cl100k_base' is the encoding used by GPT-4)
tokenizer = tiktoken.get_encoding('cl100k_base')
dir(tokenizer)Execution Output
['__class__',
'__delattr__',
'__dict__',
'__dir__',
'__doc__',# get help
tokenizer??Execution Output
Type: Encoding
String form: <Encoding 'cl100k_base'>
File: ~/.pyenv/versions/3.12.6/lib/python3.12/site-packages/tiktoken/core.py
Source:
class Encoding:# vocab size
tokenizer.n_vocabExecution Output
100277tokenizer.decode([tokenizer.eot_token])Execution Output
'<|endoftext|>'# but not all tokens are valid, e.g.,
print(tokenizer.n_vocab)
tokenizer.decode([100261])Execution Output
100277Execution Output
KeyError: 'Invalid token for decoding: 100277'
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[9], line 3
1 # but not all tokens are valid, e.g.,# list of all tokens:
# https://github.com/vnglst/gpt4-tokens/blob/main/decode-tokens.ipynbExplore some tokens
for i in range(1000,1050):
print(f'{i} = {tokenizer.decode([i])}')Execution Output
1000 = indow
1001 = lement
1002 = pect
1003 = ash
1004 = [iTokenization!
# an example sentence to tokenize
text = "My name is Mike and I like toothpaste-flavored chocolate."
# encode() returns a list of integer token IDs
tokens = tokenizer.encode(text)
print(tokens)Execution Output
[5159, 836, 374, 11519, 323, 358, 1093, 26588, 57968, 12556, 76486, 18414, 13]text.split()Execution Output
['My',
'name',
'is',
'Mike',
'and',for word in text.split():
print(f'"{word}" comprises token(s) {tokenizer.encode(word)}')Execution Output
"My" comprises token(s) [5159]
"name" comprises token(s) [609]
"is" comprises token(s) [285]
"Mike" comprises token(s) [35541]
"and" comprises token(s) [438]for t in tokens:
print(f'Token {t:>6} is "{tokenizer.decode([t])}"')Execution Output
Token 5159 is "My"
Token 836 is " name"
Token 374 is " is"
Token 11519 is " Mike"
Token 323 is " and"# with special (non-ASCII) characters
tokenizer.encode('â')Execution Output
[9011]How long are the tokens?
# How long (in characters) is each token in the vocabulary?

# initialize lengths vector (one entry per token ID)
token_lengths = np.zeros(tokenizer.n_vocab)

# get the number of characters in each token
for idx in range(tokenizer.n_vocab):
    try:
        token_lengths[idx] = len(tokenizer.decode([idx]))
    except KeyError:
        # some IDs (reserved/special slots) are not decodable; mark them as missing
        token_lengths[idx] = np.nan

# count unique lengths, excluding the NaN placeholders for invalid IDs
# (NaNs would otherwise pollute np.unique and make max(uniqueLengths) NaN)
validLengths = token_lengths[~np.isnan(token_lengths)]
uniqueLengths,tokenCount = np.unique(validLengths,return_counts=True)

# visualize
_,axs = plt.subplots(1,2,figsize=(12,4))

# left panel: token length by token index
axs[0].plot(token_lengths,'k.',markersize=3,alpha=.4)
axs[0].set(xlim=[0,tokenizer.n_vocab],xlabel='Token index',ylabel='Token length (characters)',
           title='GPT4 token lengths')

# right panel: distribution of lengths (log-scaled counts, as the axis label states)
axs[1].bar(uniqueLengths,tokenCount,color='k',edgecolor='gray')
axs[1].set_yscale('log')
axs[1].set(xlim=[0,max(uniqueLengths)],xlabel='Token length (chars)',ylabel='Token count (log scale)',
           title='Distribution of token lengths')
plt.tight_layout()
plt.show()Many word-tokens start with spaces
# single-token words with vs. without spaces
# (in BPE vocabularies the leading space is part of the token, so ' Michael'
#  and 'Michael' map to different IDs)
print( tokenizer.encode(' Michael') )
print( tokenizer.encode('Michael') )Execution Output
[8096]
[26597]# multi-token words without a space
print( tokenizer.encode(' Peach') )
print( tokenizer.encode('Peach') )Execution Output
[64695]
[47, 9739]peach = tokenizer.encode('Peach')
[tokenizer.decode([p]) for p in peach]Execution Output
['P', 'each']The Time Machine book encoded
import requests
import re

# download the raw text of "The Time Machine" from Project Gutenberg
text = requests.get('https://www.gutenberg.org/files/35/35-0.txt').text

# split on punctuation and whitespace (captured so delimiters are kept),
# then strip each fragment and discard the empty ones
words = [w.strip() for w in re.split(r'([,.:;—?_!"“()\']|--|\s)',text) if w.strip()]
print(f'There are {len(words)} words.')
words[10000:10050]Execution Output
There are 37786 words.Execution Output
['I',
'was',
'not',
'loath',
'to',# tokens of a random word in the text
# pick one word at random from the text to inspect its token ID(s)
someRandomWord = np.random.choice(words)
print(f'"{someRandomWord}" has token {tokenizer.encode(someRandomWord)}')Execution Output
"has" has token [4752]for t in words[:20]:
print(f'"{t}" has {len(tokenizer.encode(t))} tokens')Execution Output
"***" has 1 tokens
"START" has 1 tokens
"OF" has 1 tokens
"THE" has 1 tokens
"PROJECT" has 1 tokensfor spelling in ['book','Book','bOok']:
print(f'"{spelling}" has tokens {tokenizer.encode(spelling)}')Execution Output
"book" has tokens [2239]
"Book" has tokens [7280]
"bOok" has tokens [65, 46, 564]But do we need to separate the text into words?
# what happens if we just tokenize the raw (unprocessed) text?
# note: no manual word-splitting needed — the tokenizer handles raw text directly
tmTokens = tokenizer.encode(text)
print(f'The text has {len(tmTokens):,} tokens and {len(words):,} words.')Execution Output
The text has 43,053 tokens and 37,786 words.# check out some tokens
for t in tmTokens[9990:10020]:
print(f'Token {t:>6}: "{tokenizer.decode([t])}"')Execution Output
Token 264: " a"
Token 3094: " step"
Token 4741: " forward"
Token 11: ","
Token 20365: " hes"print(tokenizer.decode(tmTokens[9990:10020]))Execution Output
a step forward, hesitated, and then touched my
hand. Then I felt other soft little tentacles upon my back and
shoulders.