BERT Tokenizer
🌍
References & Disclaimer
This content is adapted from *A Deep Understanding of AI Language Model Mechanisms*. It has been curated and organized for educational purposes on this portfolio; no copyright infringement is intended.
# !pip install transformers

# Import the BERT tokenizer
from transformers import BertTokenizer

# Load the pretrained WordPiece tokenizer for the lowercase BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
/Users/drippy/.pyenv/versions/3.12.6/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm

Inspect the tokenizer properties
# Inspect the tokenizer object's attributes and methods
dir(tokenizer)
['SPECIAL_TOKENS_ATTRIBUTES',
'__annotations__',
'__call__',
'__class__',
'__delattr__',
 ...]

Check out some tokens
# Pull all vocabulary tokens into a list and peek at 100 of them
all_tokens = list(tokenizer.get_vocab().keys())
all_tokens[20000:20100]
['chunk',
'rigorous',
'blaine',
'198',
# (end of previous cell's output: 'peabody', ...)

# Vocabulary size, then the id of one token via the raw vocab dict
print(tokenizer.vocab_size)
tokenizer.get_vocab()['science']
30522

2671

Tokenizing a word
# Two equivalent ways to map a single in-vocabulary token to its id
word = 'science'
res1 = tokenizer.convert_tokens_to_ids(word)  # tokenizer API lookup
res2 = tokenizer.get_vocab()[word]            # raw dict lookup
print(res1)
print(res2)
2671
2671

Encoding a text
# Single-token lookups do NOT work on multi-word text: the whole string
# is not itself a vocabulary entry, so the dict lookup raises KeyError.
text = 'science is great'
res1 = tokenizer.convert_tokens_to_ids(text)
res2 = tokenizer.get_vocab()[text]  # raises KeyError: 'science is great'
print(res1)
print(res2)
KeyError: 'science is great'
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[8], line 4
      1 text = 'science is great'

# better method:
# Better method: encode() tokenizes the whole text and adds special tokens
res3 = tokenizer.encode(text)
for i in res3:
    print(f'Token {i} is "{tokenizer.decode(i)}"')

# [CLS] = classification
# [SEP] = sentence separation
print('')
print(tokenizer.decode(res3, skip_special_tokens=True))
print(tokenizer.decode(res3, skip_special_tokens=False))
Token 101 is "[CLS]"
Token 2671 is "science"
Token 2003 is "is"
Token 2307 is "great"
Token 102 is "[SEP]"

# BERT adds [CLS]...[SEP] with each encode
tokenizer.decode(tokenizer.encode(tokenizer.decode(tokenizer.encode( text ))))Execution Output
'[CLS] [CLS] science is great [SEP] [SEP]'

Calling the class directly
tokenizer(text)Execution Output
{'input_ids': [101, 2671, 2003, 2307, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

More on tokenizing
# Compare tokenize() + convert_tokens_to_ids() against encode() on a sentence
sentence = 'AI is both exciting and terrifying.'
print('Original sentence:')
print(f' {sentence}\n')

# segment the text into tokens (no special tokens added at this step)
tokenized = tokenizer.tokenize(sentence)
print('Tokenized (segmented) sentence:')
print(f' {tokenized}')

# encode the tokenized sentence (ids only, still no [CLS]/[SEP])
ids_from_tokens = tokenizer.convert_tokens_to_ids(tokenized)
print(f' {ids_from_tokens}\n')

# and finally, encode from the original sentence (adds [CLS]/[SEP])
encoded_text = tokenizer.encode(sentence)
print('Encoded from the original text:')
print(f' {encoded_text}\n\n')

# now for decoding
print('Decoded from token-wise encoding:')
print(f' {tokenizer.decode(ids_from_tokens)}\n')
print('Decoded from text encoding:')
print(f' {tokenizer.decode(encoded_text)}')
Original sentence:
AI is both exciting and terrifying.
Tokenized (segmented) sentence:
['ai', 'is', 'both', 'exciting', 'and', 'terrifying', '.']