Copied!







import torch
import torch.nn as nn

# Parameters of the embedding layer
vocab_size = 100  # Number of unique tokens
embedding_dim = 2  # You can choose the dimension of the embeddings
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Documents are now represented as sequences of tokens.
# Each token is an integer representing the index of the token in the vocabulary.
tokens = torch.tensor([[0, 1, 2, 3],
                        [4, 5, 4, 3],
                        [5, 4, 3, 2]])

# Get the embeddings for the tokens
embeddings = embedding_layer(tokens)

print(embeddings)
print(embeddings.shape)

import torch
import torch.nn as nn

# Parameters of the embedding layer
vocab_size = 100  # Number of unique tokens
embedding_dim = 2  # You can choose the dimension of the embeddings
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Documents are now represented as sequences of tokens.
# Each token is an integer representing the index of the token in the vocabulary.
tokens = torch.tensor([[0, 1, 2, 3],
                        [4, 5, 4, 3],
                        [5, 4, 3, 2]])

# Get the embeddings for the tokens
embeddings = embedding_layer(tokens)

print(embeddings)
print(embeddings.shape)

tensor([[[-1.2297,  1.2948],
         [ 1.0586,  0.2113],
         [-0.1370,  1.5003],
         [-1.4372, -1.1496]],

        [[ 2.0969,  1.7514],
         [-1.7039,  0.0144],
         [ 2.0969,  1.7514],
         [-1.4372, -1.1496]],

        [[-1.7039,  0.0144],
         [ 2.0969,  1.7514],
         [-1.4372, -1.1496],
         [-0.1370,  1.5003]]], grad_fn=<EmbeddingBackward0>)
torch.Size([3, 4, 2])





Copied!







import torch
import torch.nn as nn

# Parameters of the embedding layer
vocab_size = 100  # Number of unique tokens
embedding_dim = 2  # You can choose the dimension of the embeddings
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Documents are now represented as sequences of tokens.
# Each token is an integer representing the index of the token in the vocabulary.
tokens = torch.tensor([[0, 1, 2, 3],
                        [4, 5, 4, 3],
                        [5, 4, 3, 2]])

# Get the embeddings for the tokens
embeddings = embedding_layer(tokens)

print(embeddings)
print(embeddings.shape)

import torch
import torch.nn as nn

# Parameters of the embedding layer
vocab_size = 100  # Number of unique tokens
embedding_dim = 2  # You can choose the dimension of the embeddings
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Documents are now represented as sequences of tokens.
# Each token is an integer representing the index of the token in the vocabulary.
tokens = torch.tensor([[0, 1, 2, 3],
                        [4, 5, 4, 3],
                        [5, 4, 3, 2]])

# Get the embeddings for the tokens
embeddings = embedding_layer(tokens)

print(embeddings)
print(embeddings.shape)

tensor([[[-1.2297,  1.2948],
         [ 1.0586,  0.2113],
         [-0.1370,  1.5003],
         [-1.4372, -1.1496]],

        [[ 2.0969,  1.7514],
         [-1.7039,  0.0144],
         [ 2.0969,  1.7514],
         [-1.4372, -1.1496]],

        [[-1.7039,  0.0144],
         [ 2.0969,  1.7514],
         [-1.4372, -1.1496],
         [-0.1370,  1.5003]]], grad_fn=<EmbeddingBackward0>)
torch.Size([3, 4, 2])





Copied!







import sentencepiece as spm
from io import StringIO

# Your input data as a string
input_data = """Was ever feather so lightly blown to and fro as this multitude? The
name of Henry the Fifth hales them to an hundred mischiefs and makes
them leave me desolate. I see them lay their heads together to surprise
me. My sword make way for me, for here is no staying.—In despite of the
devils and hell, have through the very middest of you! And heavens and
honour be witness that no want of resolution in me, but only my
followers’ base and ignominious treasons, makes me betake me to my
heels.
"""

# Use StringIO to create a file-like object
input_fp = StringIO(input_data)

# Train the SentencePiece model using the file pointer
spm.SentencePieceTrainer.train(
    sentence_iterator=input_fp, 
    model_prefix='my_tokenizer', 
    vocab_size=100
)

import sentencepiece as spm
from io import StringIO

# Your input data as a string
input_data = """Was ever feather so lightly blown to and fro as this multitude? The
name of Henry the Fifth hales them to an hundred mischiefs and makes
them leave me desolate. I see them lay their heads together to surprise
me. My sword make way for me, for here is no staying.—In despite of the
devils and hell, have through the very middest of you! And heavens and
honour be witness that no want of resolution in me, but only my
followers’ base and ignominious treasons, makes me betake me to my
heels.
"""

# Use StringIO to create a file-like object
input_fp = StringIO(input_data)

# Train the SentencePiece model using the file pointer
spm.SentencePieceTrainer.train(
    sentence_iterator=input_fp, 
    model_prefix='my_tokenizer', 
    vocab_size=100
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: my_tokenizer
  model_type: UNIGRAM
  vocab_size: 100
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_clipping_threshold: 0
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv: 
}
denormalizer_spec {}
trainer_interface.cc(411) LOG(INFO) Loaded all 8 sentences
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <s>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: </s>
trainer_interface.cc(432) LOG(INFO) Normalizing sentences...
trainer_interface.cc(541) LOG(INFO) all chars count=490
trainer_interface.cc(562) LOG(INFO) Alphabet size=36
trainer_interface.cc(563) LOG(INFO) Final character coverage=1
trainer_interface.cc(594) LOG(INFO) Done! preprocessed 8 sentences.
unigram_model_trainer.cc(265) LOG(INFO) Making suffix array...
unigram_model_trainer.cc(269) LOG(INFO) Extracting frequent sub strings... node_num=210
unigram_model_trainer.cc(312) LOG(INFO) Initialized 134 seed sentencepieces
trainer_interface.cc(600) LOG(INFO) Tokenizing input sentences with whitespace: 8
trainer_interface.cc(611) LOG(INFO) Done! 71
unigram_model_trainer.cc(602) LOG(INFO) Using 71 sentences for EM training
unigram_model_trainer.cc(618) LOG(INFO) EM sub_iter=0 size=109 obj=13.1974 num_tokens=231 num_tokens/piece=2.11927
unigram_model_trainer.cc(618) LOG(INFO) EM sub_iter=1 size=99 obj=12.7698 num_tokens=232 num_tokens/piece=2.34343
trainer_interface.cc(689) LOG(INFO) Saving model: my_tokenizer.model
trainer_interface.cc(701) LOG(INFO) Saving vocabs: my_tokenizer.vocab





Copied!







import sentencepiece as spm
from io import StringIO

# Your input data as a string
input_data = """Was ever feather so lightly blown to and fro as this multitude? The
name of Henry the Fifth hales them to an hundred mischiefs and makes
them leave me desolate. I see them lay their heads together to surprise
me. My sword make way for me, for here is no staying.—In despite of the
devils and hell, have through the very middest of you! And heavens and
honour be witness that no want of resolution in me, but only my
followers’ base and ignominious treasons, makes me betake me to my
heels.
"""

# Use StringIO to create a file-like object
input_fp = StringIO(input_data)

# Train the SentencePiece model using the file pointer
spm.SentencePieceTrainer.train(
    sentence_iterator=input_fp, 
    model_prefix='my_tokenizer', 
    vocab_size=100
)

import sentencepiece as spm
from io import StringIO

# Your input data as a string
input_data = """Was ever feather so lightly blown to and fro as this multitude? The
name of Henry the Fifth hales them to an hundred mischiefs and makes
them leave me desolate. I see them lay their heads together to surprise
me. My sword make way for me, for here is no staying.—In despite of the
devils and hell, have through the very middest of you! And heavens and
honour be witness that no want of resolution in me, but only my
followers’ base and ignominious treasons, makes me betake me to my
heels.
"""

# Use StringIO to create a file-like object
input_fp = StringIO(input_data)

# Train the SentencePiece model using the file pointer
spm.SentencePieceTrainer.train(
    sentence_iterator=input_fp, 
    model_prefix='my_tokenizer', 
    vocab_size=100
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: my_tokenizer
  model_type: UNIGRAM
  vocab_size: 100
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_clipping_threshold: 0
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv: 
}
denormalizer_spec {}
trainer_interface.cc(411) LOG(INFO) Loaded all 8 sentences
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <s>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: </s>
trainer_interface.cc(432) LOG(INFO) Normalizing sentences...
trainer_interface.cc(541) LOG(INFO) all chars count=490
trainer_interface.cc(562) LOG(INFO) Alphabet size=36
trainer_interface.cc(563) LOG(INFO) Final character coverage=1
trainer_interface.cc(594) LOG(INFO) Done! preprocessed 8 sentences.
unigram_model_trainer.cc(265) LOG(INFO) Making suffix array...
unigram_model_trainer.cc(269) LOG(INFO) Extracting frequent sub strings... node_num=210
unigram_model_trainer.cc(312) LOG(INFO) Initialized 134 seed sentencepieces
trainer_interface.cc(600) LOG(INFO) Tokenizing input sentences with whitespace: 8
trainer_interface.cc(611) LOG(INFO) Done! 71
unigram_model_trainer.cc(602) LOG(INFO) Using 71 sentences for EM training
unigram_model_trainer.cc(618) LOG(INFO) EM sub_iter=0 size=109 obj=13.1974 num_tokens=231 num_tokens/piece=2.11927
unigram_model_trainer.cc(618) LOG(INFO) EM sub_iter=1 size=99 obj=12.7698 num_tokens=232 num_tokens/piece=2.34343
trainer_interface.cc(689) LOG(INFO) Saving model: my_tokenizer.model
trainer_interface.cc(701) LOG(INFO) Saving vocabs: my_tokenizer.vocab





Copied!







# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')

# Test the tokenizer
test_sentence = "This is a test sentence."
encoded_pieces = sp.encode_as_pieces(test_sentence)
encoded_ids = sp.encode_as_ids(test_sentence)
recovered_sentence = sp.decode_ids(encoded_ids)

print("Encoded pieces:", encoded_pieces)
print("Encoded ids:", encoded_ids)
print("Recovered sentence:", recovered_sentence)

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')

# Test the tokenizer
test_sentence = "This is a test sentence."
encoded_pieces = sp.encode_as_pieces(test_sentence)
encoded_ids = sp.encode_as_ids(test_sentence)
recovered_sentence = sp.decode_ids(encoded_ids)

print("Encoded pieces:", encoded_pieces)
print("Encoded ids:", encoded_ids)
print("Recovered sentence:", recovered_sentence)

Encoded pieces: ['▁', 'T', 'hi', 's', '▁', 'is', '▁', 'a', '▁', 't', 'es', 't', '▁s', 'e', 'n', 't', 'e', 'n', 'c', 'e', '.']
Encoded ids: [3, 82, 90, 7, 3, 28, 3, 59, 3, 9, 54, 9, 30, 4, 14, 9, 4, 14, 83, 4, 20]
Recovered sentence: This is a test sentence.





Copied!







# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')

# Test the tokenizer
test_sentence = "This is a test sentence."
encoded_pieces = sp.encode_as_pieces(test_sentence)
encoded_ids = sp.encode_as_ids(test_sentence)
recovered_sentence = sp.decode_ids(encoded_ids)

print("Encoded pieces:", encoded_pieces)
print("Encoded ids:", encoded_ids)
print("Recovered sentence:", recovered_sentence)

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')

# Test the tokenizer
test_sentence = "This is a test sentence."
encoded_pieces = sp.encode_as_pieces(test_sentence)
encoded_ids = sp.encode_as_ids(test_sentence)
recovered_sentence = sp.decode_ids(encoded_ids)

print("Encoded pieces:", encoded_pieces)
print("Encoded ids:", encoded_ids)
print("Recovered sentence:", recovered_sentence)

Encoded pieces: ['▁', 'T', 'hi', 's', '▁', 'is', '▁', 'a', '▁', 't', 'es', 't', '▁s', 'e', 'n', 't', 'e', 'n', 'c', 'e', '.']
Encoded ids: [3, 82, 90, 7, 3, 28, 3, 59, 3, 9, 54, 9, 30, 4, 14, 9, 4, 14, 83, 4, 20]
Recovered sentence: This is a test sentence.





Copied!







# Use StringIO to create a file-like object
input_fp = StringIO(input_data)

# Train the SentencePiece model using the file pointer
spm.SentencePieceTrainer.train(
    sentence_iterator=input_fp, 
    model_prefix='my_tokenizer', 
    vocab_size=100,
    user_defined_symbols=['<PAD>']
)

# Use StringIO to create a file-like object
input_fp = StringIO(input_data)

# Train the SentencePiece model using the file pointer
spm.SentencePieceTrainer.train(
    sentence_iterator=input_fp, 
    model_prefix='my_tokenizer', 
    vocab_size=100,
    user_defined_symbols=['']
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: my_tokenizer
  model_type: UNIGRAM
  vocab_size: 100
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: <PAD>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_clipping_threshold: 0
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv: 
}
denormalizer_spec {}
trainer_interface.cc(411) LOG(INFO) Loaded all 8 sentences
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <s>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: </s>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <PAD>
trainer_interface.cc(432) LOG(INFO) Normalizing sentences...
trainer_interface.cc(541) LOG(INFO) all chars count=490
trainer_interface.cc(562) LOG(INFO) Alphabet size=36
trainer_interface.cc(563) LOG(INFO) Final character coverage=1
trainer_interface.cc(594) LOG(INFO) Done! preprocessed 8 sentences.
unigram_model_trainer.cc(265) LOG(INFO) Making suffix array...
unigram_model_trainer.cc(269) LOG(INFO) Extracting frequent sub strings... node_num=210
unigram_model_trainer.cc(312) LOG(INFO) Initialized 134 seed sentencepieces
trainer_interface.cc(600) LOG(INFO) Tokenizing input sentences with whitespace: 8
trainer_interface.cc(611) LOG(INFO) Done! 71
unigram_model_trainer.cc(602) LOG(INFO) Using 71 sentences for EM training
unigram_model_trainer.cc(618) LOG(INFO) EM sub_iter=0 size=109 obj=13.1974 num_tokens=231 num_tokens/piece=2.11927
unigram_model_trainer.cc(618) LOG(INFO) EM sub_iter=1 size=99 obj=12.7698 num_tokens=232 num_tokens/piece=2.34343
trainer_interface.cc(689) LOG(INFO) Saving model: my_tokenizer.model
trainer_interface.cc(701) LOG(INFO) Saving vocabs: my_tokenizer.vocab





Copied!







# Use StringIO to create a file-like object
input_fp = StringIO(input_data)

# Train the SentencePiece model using the file pointer
spm.SentencePieceTrainer.train(
    sentence_iterator=input_fp, 
    model_prefix='my_tokenizer', 
    vocab_size=100,
    user_defined_symbols=['<PAD>']
)

# Use StringIO to create a file-like object
input_fp = StringIO(input_data)

# Train the SentencePiece model using the file pointer
spm.SentencePieceTrainer.train(
    sentence_iterator=input_fp, 
    model_prefix='my_tokenizer', 
    vocab_size=100,
    user_defined_symbols=['']
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: my_tokenizer
  model_type: UNIGRAM
  vocab_size: 100
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: <PAD>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_clipping_threshold: 0
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv: 
}
denormalizer_spec {}
trainer_interface.cc(411) LOG(INFO) Loaded all 8 sentences
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <s>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: </s>
trainer_interface.cc(427) LOG(INFO) Adding meta_piece: <PAD>
trainer_interface.cc(432) LOG(INFO) Normalizing sentences...
trainer_interface.cc(541) LOG(INFO) all chars count=490
trainer_interface.cc(562) LOG(INFO) Alphabet size=36
trainer_interface.cc(563) LOG(INFO) Final character coverage=1
trainer_interface.cc(594) LOG(INFO) Done! preprocessed 8 sentences.
unigram_model_trainer.cc(265) LOG(INFO) Making suffix array...
unigram_model_trainer.cc(269) LOG(INFO) Extracting frequent sub strings... node_num=210
unigram_model_trainer.cc(312) LOG(INFO) Initialized 134 seed sentencepieces
trainer_interface.cc(600) LOG(INFO) Tokenizing input sentences with whitespace: 8
trainer_interface.cc(611) LOG(INFO) Done! 71
unigram_model_trainer.cc(602) LOG(INFO) Using 71 sentences for EM training
unigram_model_trainer.cc(618) LOG(INFO) EM sub_iter=0 size=109 obj=13.1974 num_tokens=231 num_tokens/piece=2.11927
unigram_model_trainer.cc(618) LOG(INFO) EM sub_iter=1 size=99 obj=12.7698 num_tokens=232 num_tokens/piece=2.34343
trainer_interface.cc(689) LOG(INFO) Saving model: my_tokenizer.model
trainer_interface.cc(701) LOG(INFO) Saving vocabs: my_tokenizer.vocab





Copied!







sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')
print(sp.piece_to_id('<PAD>'))

sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')
print(sp.piece_to_id(''))

3





Copied!







sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')
print(sp.piece_to_id('<PAD>'))

sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')
print(sp.piece_to_id(''))

3





Copied!







# Create an embedding layer with padding index set to 3
def pad_to_len(sequences, pad_idx, max_len):
    padded = []
    for s in sequences:
        if len(s) >= max_len:
            padded.append(s[:max_len])
        else:
            padded.append(s + [pad_idx] * (max_len - len(s)))
    return padded

vocab_size = len(sp)
embedding_dim = 2
padding_idx = sp.piece_to_id('<PAD>')
embedding_layer_with_padding = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
    padding_idx=padding_idx,   
)

# Get the embeddings for the tokens using the new embedding layer
input_data = ["This is my test sentence", "This is another test sentence", "this is a really long sequence and I will probably have to crop it!"]
tokens = sp.tokenize(input_data)
tokens = pad_to_len(tokens, padding_idx, 25)
print(tokens)
embeddings_with_padding = embedding_layer_with_padding(torch.tensor(tokens))
#print(embeddings_with_padding)
print(embeddings_with_padding.shape)

# Create an embedding layer with padding index set to 3
def pad_to_len(sequences, pad_idx, max_len):
    padded = []
    for s in sequences:
        if len(s) >= max_len:
            padded.append(s[:max_len])
        else:
            padded.append(s + [pad_idx] * (max_len - len(s)))
    return padded

vocab_size = len(sp)
embedding_dim = 2
padding_idx = sp.piece_to_id('')
embedding_layer_with_padding = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
    padding_idx=padding_idx,   
)

# Get the embeddings for the tokens using the new embedding layer
input_data = ["This is my test sentence", "This is another test sentence", "this is a really long sequence and I will probably have to crop it!"]
tokens = sp.tokenize(input_data)
tokens = pad_to_len(tokens, padding_idx, 25)
print(tokens)
embeddings_with_padding = embedding_layer_with_padding(torch.tensor(tokens))
#print(embeddings_with_padding)
print(embeddings_with_padding.shape)

[[4, 83, 91, 8, 4, 29, 26, 6, 4, 10, 55, 10, 31, 5, 15, 10, 5, 15, 84, 5, 3, 3, 3, 3, 3], [4, 83, 91, 8, 4, 29, 78, 56, 10, 58, 4, 10, 55, 10, 31, 5, 15, 10, 5, 15, 84, 5, 3, 3, 3], [34, 29, 4, 29, 4, 60, 4, 35, 60, 96, 6, 40, 28, 42, 31, 5, 0, 27, 5, 15, 84, 5, 16, 4, 45]]
torch.Size([3, 25, 2])





Copied!







# Create an embedding layer with padding index set to 3
def pad_to_len(sequences, pad_idx, max_len):
    padded = []
    for s in sequences:
        if len(s) >= max_len:
            padded.append(s[:max_len])
        else:
            padded.append(s + [pad_idx] * (max_len - len(s)))
    return padded

vocab_size = len(sp)
embedding_dim = 2
padding_idx = sp.piece_to_id('<PAD>')
embedding_layer_with_padding = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
    padding_idx=padding_idx,   
)

# Get the embeddings for the tokens using the new embedding layer
input_data = ["This is my test sentence", "This is another test sentence", "this is a really long sequence and I will probably have to crop it!"]
tokens = sp.tokenize(input_data)
tokens = pad_to_len(tokens, padding_idx, 25)
print(tokens)
embeddings_with_padding = embedding_layer_with_padding(torch.tensor(tokens))
#print(embeddings_with_padding)
print(embeddings_with_padding.shape)

# Create an embedding layer with padding index set to 3
def pad_to_len(sequences, pad_idx, max_len):
    padded = []
    for s in sequences:
        if len(s) >= max_len:
            padded.append(s[:max_len])
        else:
            padded.append(s + [pad_idx] * (max_len - len(s)))
    return padded

vocab_size = len(sp)
embedding_dim = 2
padding_idx = sp.piece_to_id('')
embedding_layer_with_padding = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
    padding_idx=padding_idx,   
)

# Get the embeddings for the tokens using the new embedding layer
input_data = ["This is my test sentence", "This is another test sentence", "this is a really long sequence and I will probably have to crop it!"]
tokens = sp.tokenize(input_data)
tokens = pad_to_len(tokens, padding_idx, 25)
print(tokens)
embeddings_with_padding = embedding_layer_with_padding(torch.tensor(tokens))
#print(embeddings_with_padding)
print(embeddings_with_padding.shape)

[[4, 83, 91, 8, 4, 29, 26, 6, 4, 10, 55, 10, 31, 5, 15, 10, 5, 15, 84, 5, 3, 3, 3, 3, 3], [4, 83, 91, 8, 4, 29, 78, 56, 10, 58, 4, 10, 55, 10, 31, 5, 15, 10, 5, 15, 84, 5, 3, 3, 3], [34, 29, 4, 29, 4, 60, 4, 35, 60, 96, 6, 40, 28, 42, 31, 5, 0, 27, 5, 15, 84, 5, 16, 4, 45]]
torch.Size([3, 25, 2])





Copied!







import torch.nn.functional as F

class SimpleClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx
        )
        self.clf = nn.Linear(embedding_dim, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)
        x = self.clf(x)
        return x

model = SimpleClassifier(vocab_size, embedding_dim)
print(model)

import torch.nn.functional as F

class SimpleClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx
        )
        self.clf = nn.Linear(embedding_dim, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)
        x = self.clf(x)
        return x

model = SimpleClassifier(vocab_size, embedding_dim)
print(model)

SimpleClassifier(
  (embedding): Embedding(100, 2, padding_idx=3)
  (clf): Linear(in_features=2, out_features=1, bias=True)
)





Copied!







import torch.nn.functional as F

class SimpleClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx
        )
        self.clf = nn.Linear(embedding_dim, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)
        x = self.clf(x)
        return x

model = SimpleClassifier(vocab_size, embedding_dim)
print(model)

import torch.nn.functional as F

class SimpleClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx
        )
        self.clf = nn.Linear(embedding_dim, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)
        x = self.clf(x)
        return x

model = SimpleClassifier(vocab_size, embedding_dim)
print(model)

SimpleClassifier(
  (embedding): Embedding(100, 2, padding_idx=3)
  (clf): Linear(in_features=2, out_features=1, bias=True)
)





Copied!







import pandas as pd 
from sklearn.model_selection import train_test_split

import pandas as pd

train_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet").sample(100, random_state=42)
test_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet").sample(100, random_state=42)
X_train = list(train_df['text'])
y_train = torch.tensor(list(train_df['label']))
X_test = list(test_df['text'])
y_test = torch.tensor(list(test_df['label']))

import pandas as pd 
from sklearn.model_selection import train_test_split

import pandas as pd

train_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet").sample(100, random_state=42)
test_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet").sample(100, random_state=42)
X_train = list(train_df['text'])
y_train = torch.tensor(list(train_df['label']))
X_test = list(test_df['text'])
y_test = torch.tensor(list(test_df['label']))





Copied!







import pandas as pd 
from sklearn.model_selection import train_test_split

import pandas as pd

train_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet").sample(100, random_state=42)
test_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet").sample(100, random_state=42)
X_train = list(train_df['text'])
y_train = torch.tensor(list(train_df['label']))
X_test = list(test_df['text'])
y_test = torch.tensor(list(test_df['label']))

import pandas as pd 
from sklearn.model_selection import train_test_split

import pandas as pd

train_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet").sample(100, random_state=42)
test_df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet").sample(100, random_state=42)
X_train = list(train_df['text'])
y_train = torch.tensor(list(train_df['label']))
X_test = list(test_df['text'])
y_test = torch.tensor(list(test_df['label']))





Copied!







from tqdm import tqdm 

# We will also define an optimizer:
optimizer = torch.optim.SGD(model.parameters(), lr=1e0) # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
losses = []
for epoch in tqdm(range(200)):
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    loss = torch.mean( (output_probs-y_train)**2 )
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

from tqdm import tqdm 

# We will also define an optimizer:
optimizer = torch.optim.SGD(model.parameters(), lr=1e0) # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
losses = []
for epoch in tqdm(range(200)):
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    loss = torch.mean( (output_probs-y_train)**2 )
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

Entering loop

100%|██████████| 200/200 [00:05<00:00, 37.34it/s]





Copied!







from tqdm import tqdm 

# We will also define an optimizer:
optimizer = torch.optim.SGD(model.parameters(), lr=1e0) # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
losses = []
for epoch in tqdm(range(200)):
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    loss = torch.mean( (output_probs-y_train)**2 )
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

from tqdm import tqdm 

# We will also define an optimizer:
optimizer = torch.optim.SGD(model.parameters(), lr=1e0) # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
losses = []
for epoch in tqdm(range(200)):
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    loss = torch.mean( (output_probs-y_train)**2 )
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

Entering loop

100%|██████████| 200/200 [00:05<00:00, 37.34it/s]





Copied!







import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.show()





Copied!







import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.show()





Copied!







from sklearn.metrics import accuracy_score, f1_score, classification_report

# Get the predictions for the test set
with torch.no_grad():
    model.eval()
    tokens = sp.encode_as_ids(X_test)
    tokens = pad_to_len(tokens, padding_idx, 25)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    predictions = (output_probs > 0.5).int().numpy()

# Calculate the accuracy and F1 score
accuracy = accuracy_score(
    y_test,
    predictions,
)
f1 = f1_score(
    y_test,
    predictions,
    zero_division=0,
    average='macro',
)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print(classification_report(
    y_test,
    predictions,
    zero_division=0,
))

from sklearn.metrics import accuracy_score, f1_score, classification_report

# Get the predictions for the test set
with torch.no_grad():
    model.eval()
    tokens = sp.encode_as_ids(X_test)
    tokens = pad_to_len(tokens, padding_idx, 25)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    predictions = (output_probs > 0.5).int().numpy()

# Calculate the accuracy and F1 score
accuracy = accuracy_score(
    y_test,
    predictions,
)
f1 = f1_score(
    y_test,
    predictions,
    zero_division=0,
    average='macro',
)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print(classification_report(
    y_test,
    predictions,
    zero_division=0,
))

Accuracy: 0.54
F1 Score: 0.3695175438596491
              precision    recall  f1-score   support

           0       1.00      0.02      0.04        47
           1       0.54      1.00      0.70        53

    accuracy                           0.54       100
   macro avg       0.77      0.51      0.37       100
weighted avg       0.75      0.54      0.39       100





Copied!







from sklearn.metrics import accuracy_score, f1_score, classification_report

# Get the predictions for the test set
with torch.no_grad():
    model.eval()
    tokens = sp.encode_as_ids(X_test)
    tokens = pad_to_len(tokens, padding_idx, 25)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    predictions = (output_probs > 0.5).int().numpy()

# Calculate the accuracy and F1 score
accuracy = accuracy_score(
    y_test,
    predictions,
)
f1 = f1_score(
    y_test,
    predictions,
    zero_division=0,
    average='macro',
)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print(classification_report(
    y_test,
    predictions,
    zero_division=0,
))

from sklearn.metrics import accuracy_score, f1_score, classification_report

# Get the predictions for the test set
with torch.no_grad():
    model.eval()
    tokens = sp.encode_as_ids(X_test)
    tokens = pad_to_len(tokens, padding_idx, 25)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    predictions = (output_probs > 0.5).int().numpy()

# Calculate the accuracy and F1 score
accuracy = accuracy_score(
    y_test,
    predictions,
)
f1 = f1_score(
    y_test,
    predictions,
    zero_division=0,
    average='macro',
)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print(classification_report(
    y_test,
    predictions,
    zero_division=0,
))

Accuracy: 0.54
F1 Score: 0.3695175438596491
              precision    recall  f1-score   support

           0       1.00      0.02      0.04        47
           1       0.54      1.00      0.70        53

    accuracy                           0.54       100
   macro avg       0.77      0.51      0.37       100
weighted avg       0.75      0.54      0.39       100





Copied!







from tqdm import tqdm

# We will also define an optimizer:
optimizer = torch.optim.Adam(
    model.parameters(), lr=1e0)  # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
losses = []
model.train()
for epoch in tqdm(range(200)):
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    loss = torch.mean(
        torch.binary_cross_entropy_with_logits(
            output.flatten().float(),
            y_train.flatten().float(),
        ), )
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

from tqdm import tqdm

# We will also define an optimizer:
optimizer = torch.optim.Adam(
    model.parameters(), lr=1e0)  # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
losses = []
model.train()
for epoch in tqdm(range(200)):
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    loss = torch.mean(
        torch.binary_cross_entropy_with_logits(
            output.flatten().float(),
            y_train.flatten().float(),
        ), )
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

Entering loop

100%|██████████| 200/200 [00:02<00:00, 67.54it/s]





Copied!







from tqdm import tqdm

# We will also define an optimizer:
optimizer = torch.optim.Adam(
    model.parameters(), lr=1e0)  # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
losses = []
model.train()
for epoch in tqdm(range(200)):
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    loss = torch.mean(
        torch.binary_cross_entropy_with_logits(
            output.flatten().float(),
            y_train.flatten().float(),
        ), )
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

from tqdm import tqdm

# We will also define an optimizer:
optimizer = torch.optim.Adam(
    model.parameters(), lr=1e0)  # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
losses = []
model.train()
for epoch in tqdm(range(200)):
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    loss = torch.mean(
        torch.binary_cross_entropy_with_logits(
            output.flatten().float(),
            y_train.flatten().float(),
        ), )
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

Entering loop

100%|██████████| 200/200 [00:02<00:00, 67.54it/s]





Copied!







import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.show()





Copied!







import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss over Epochs')
plt.show()





Copied!







from sklearn.metrics import accuracy_score, f1_score, classification_report

# Get the predictions for the test set
with torch.no_grad():
    model.eval()
    tokens = sp.encode_as_ids(X_test)
    tokens = pad_to_len(tokens, padding_idx, 25)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    predictions = (output_probs > 0.5).int().numpy()

# Calculate the accuracy and F1 score
accuracy = accuracy_score(
    y_test,
    predictions,
)
f1 = f1_score(
    y_test,
    predictions,
    zero_division=0,
    average='macro',
)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print(classification_report(
    y_test,
    predictions,
    zero_division=0,
))

from sklearn.metrics import accuracy_score, f1_score, classification_report

# Get the predictions for the test set
with torch.no_grad():
    model.eval()
    tokens = sp.encode_as_ids(X_test)
    tokens = pad_to_len(tokens, padding_idx, 25)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    predictions = (output_probs > 0.5).int().numpy()

# Calculate the accuracy and F1 score
accuracy = accuracy_score(
    y_test,
    predictions,
)
f1 = f1_score(
    y_test,
    predictions,
    zero_division=0,
    average='macro',
)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print(classification_report(
    y_test,
    predictions,
    zero_division=0,
))

Accuracy: 0.55
F1 Score: 0.54226426609704
              precision    recall  f1-score   support

           0       0.53      0.45      0.48        47
           1       0.57      0.64      0.60        53

    accuracy                           0.55       100
   macro avg       0.55      0.54      0.54       100
weighted avg       0.55      0.55      0.55       100





Copied!







from sklearn.metrics import accuracy_score, f1_score, classification_report

# Get the predictions for the test set
with torch.no_grad():
    model.eval()
    tokens = sp.encode_as_ids(X_test)
    tokens = pad_to_len(tokens, padding_idx, 25)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    predictions = (output_probs > 0.5).int().numpy()

# Calculate the accuracy and F1 score
accuracy = accuracy_score(
    y_test,
    predictions,
)
f1 = f1_score(
    y_test,
    predictions,
    zero_division=0,
    average='macro',
)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print(classification_report(
    y_test,
    predictions,
    zero_division=0,
))

from sklearn.metrics import accuracy_score, f1_score, classification_report

# Get the predictions for the test set
with torch.no_grad():
    model.eval()
    tokens = sp.encode_as_ids(X_test)
    tokens = pad_to_len(tokens, padding_idx, 25)
    tokens = torch.tensor(tokens)
    output = model(tokens)
    output_probs = torch.sigmoid(output)
    predictions = (output_probs > 0.5).int().numpy()

# Calculate the accuracy and F1 score
accuracy = accuracy_score(
    y_test,
    predictions,
)
f1 = f1_score(
    y_test,
    predictions,
    zero_division=0,
    average='macro',
)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print(classification_report(
    y_test,
    predictions,
    zero_division=0,
))

Accuracy: 0.55
F1 Score: 0.54226426609704
              precision    recall  f1-score   support

           0       0.53      0.45      0.48        47
           1       0.57      0.64      0.60        53

    accuracy                           0.55       100
   macro avg       0.55      0.54      0.54       100
weighted avg       0.55      0.55      0.55       100





Copied!







import torch.nn.functional as F

class SimpleClassifierExtended(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx
        )
        self.clf = nn.Linear(embedding_dim, 1)
        
    def forward(self, x, return_embeddings=False):
        x = self.embedding(x)
        document_embeddings = torch.mean(x, dim=1)
        x = self.clf(document_embeddings)
        if return_embeddings:
            return x, document_embeddings
        return x

import torch.nn.functional as F

class SimpleClassifierExtended(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx
        )
        self.clf = nn.Linear(embedding_dim, 1)
        
    def forward(self, x, return_embeddings=False):
        x = self.embedding(x)
        document_embeddings = torch.mean(x, dim=1)
        x = self.clf(document_embeddings)
        if return_embeddings:
            return x, document_embeddings
        return x





Copied!







import torch.nn.functional as F

class SimpleClassifierExtended(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx
        )
        self.clf = nn.Linear(embedding_dim, 1)
        
    def forward(self, x, return_embeddings=False):
        x = self.embedding(x)
        document_embeddings = torch.mean(x, dim=1)
        x = self.clf(document_embeddings)
        if return_embeddings:
            return x, document_embeddings
        return x

import torch.nn.functional as F

class SimpleClassifierExtended(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx
        )
        self.clf = nn.Linear(embedding_dim, 1)
        
    def forward(self, x, return_embeddings=False):
        x = self.embedding(x)
        document_embeddings = torch.mean(x, dim=1)
        x = self.clf(document_embeddings)
        if return_embeddings:
            return x, document_embeddings
        return x





Copied!







from tqdm import tqdm

model = SimpleClassifierExtended(vocab_size, embedding_dim)

# We will also define an optimizer:
optimizer = torch.optim.Adam(
    model.parameters(), lr=1e0)  # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
train_losses = []
train_embeddings = []
eval_losses = []
eval_embeddings = []

for epoch in tqdm(range(200)):
    # Training step
    model.train()
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output, document_embeddings = model(tokens, return_embeddings=True)
    loss = torch.mean(
        torch.binary_cross_entropy_with_logits(
            output.flatten().float(),
            y_train.flatten().float(),
        ), )
    loss.backward()
    optimizer.step()
    train_losses.append(loss.item())
    train_embeddings.append(document_embeddings.cpu().detach().numpy())
    
    # Evaluation step
    model.eval()
    with torch.no_grad():
        tokens = sp.encode_as_ids(X_test)
        tokens = pad_to_len(tokens, padding_idx, 100)
        tokens = torch.tensor(tokens)
        output, document_embeddings = model(tokens, return_embeddings=True)
        loss = torch.mean(
            torch.binary_cross_entropy_with_logits(
                output.flatten().float(),
                y_test.flatten().float(),
            ), )
        eval_losses.append(loss.item())
        eval_embeddings.append(document_embeddings.cpu().detach().numpy())

from tqdm import tqdm

model = SimpleClassifierExtended(vocab_size, embedding_dim)

# We will also define an optimizer:
optimizer = torch.optim.Adam(
    model.parameters(), lr=1e0)  # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
train_losses = []
train_embeddings = []
eval_losses = []
eval_embeddings = []

for epoch in tqdm(range(200)):
    # Training step
    model.train()
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output, document_embeddings = model(tokens, return_embeddings=True)
    loss = torch.mean(
        torch.binary_cross_entropy_with_logits(
            output.flatten().float(),
            y_train.flatten().float(),
        ), )
    loss.backward()
    optimizer.step()
    train_losses.append(loss.item())
    train_embeddings.append(document_embeddings.cpu().detach().numpy())
    
    # Evaluation step
    model.eval()
    with torch.no_grad():
        tokens = sp.encode_as_ids(X_test)
        tokens = pad_to_len(tokens, padding_idx, 100)
        tokens = torch.tensor(tokens)
        output, document_embeddings = model(tokens, return_embeddings=True)
        loss = torch.mean(
            torch.binary_cross_entropy_with_logits(
                output.flatten().float(),
                y_test.flatten().float(),
            ), )
        eval_losses.append(loss.item())
        eval_embeddings.append(document_embeddings.cpu().detach().numpy())

Entering loop

100%|██████████| 200/200 [00:05<00:00, 36.24it/s]





Copied!







from tqdm import tqdm

model = SimpleClassifierExtended(vocab_size, embedding_dim)

# We will also define an optimizer:
optimizer = torch.optim.Adam(
    model.parameters(), lr=1e0)  # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
train_losses = []
train_embeddings = []
eval_losses = []
eval_embeddings = []

for epoch in tqdm(range(200)):
    # Training step
    model.train()
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output, document_embeddings = model(tokens, return_embeddings=True)
    loss = torch.mean(
        torch.binary_cross_entropy_with_logits(
            output.flatten().float(),
            y_train.flatten().float(),
        ), )
    loss.backward()
    optimizer.step()
    train_losses.append(loss.item())
    train_embeddings.append(document_embeddings.cpu().detach().numpy())
    
    # Evaluation step
    model.eval()
    with torch.no_grad():
        tokens = sp.encode_as_ids(X_test)
        tokens = pad_to_len(tokens, padding_idx, 100)
        tokens = torch.tensor(tokens)
        output, document_embeddings = model(tokens, return_embeddings=True)
        loss = torch.mean(
            torch.binary_cross_entropy_with_logits(
                output.flatten().float(),
                y_test.flatten().float(),
            ), )
        eval_losses.append(loss.item())
        eval_embeddings.append(document_embeddings.cpu().detach().numpy())

from tqdm import tqdm

model = SimpleClassifierExtended(vocab_size, embedding_dim)

# We will also define an optimizer:
optimizer = torch.optim.Adam(
    model.parameters(), lr=1e0)  # lr is the learning rate - this is our alpha

print("Entering loop")
# And now, this is the training loop:
train_losses = []
train_embeddings = []
eval_losses = []
eval_embeddings = []

for epoch in tqdm(range(200)):
    # Training step
    model.train()
    optimizer.zero_grad()
    tokens = sp.encode_as_ids(X_train)
    tokens = pad_to_len(tokens, padding_idx, 100)
    tokens = torch.tensor(tokens)
    output, document_embeddings = model(tokens, return_embeddings=True)
    loss = torch.mean(
        torch.binary_cross_entropy_with_logits(
            output.flatten().float(),
            y_train.flatten().float(),
        ), )
    loss.backward()
    optimizer.step()
    train_losses.append(loss.item())
    train_embeddings.append(document_embeddings.cpu().detach().numpy())
    
    # Evaluation step
    model.eval()
    with torch.no_grad():
        tokens = sp.encode_as_ids(X_test)
        tokens = pad_to_len(tokens, padding_idx, 100)
        tokens = torch.tensor(tokens)
        output, document_embeddings = model(tokens, return_embeddings=True)
        loss = torch.mean(
            torch.binary_cross_entropy_with_logits(
                output.flatten().float(),
                y_test.flatten().float(),
            ), )
        eval_losses.append(loss.item())
        eval_embeddings.append(document_embeddings.cpu().detach().numpy())

Entering loop

100%|██████████| 200/200 [00:05<00:00, 36.24it/s]





Copied!







import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(train_losses, label='Train Loss')
plt.plot(eval_losses, label='Eval Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Evaluation Loss over Epochs')
plt.legend()
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(train_losses, label='Train Loss')
plt.plot(eval_losses, label='Eval Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Evaluation Loss over Epochs')
plt.legend()
plt.show()





Copied!







import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(train_losses, label='Train Loss')
plt.plot(eval_losses, label='Eval Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Evaluation Loss over Epochs')
plt.legend()
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(5,3))
plt.plot(train_losses, label='Train Loss')
plt.plot(eval_losses, label='Eval Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Evaluation Loss over Epochs')
plt.legend()
plt.show()





Copied!







import matplotlib.pyplot as plt

emb = train_embeddings[-1]
plt.figure(figsize=(4, 3))
scatter = plt.scatter(emb[:, 0], emb[:, 1], c=y_train.numpy(), cmap='coolwarm', label=['Class 0', 'Class 1'], alpha=0.8)
plt.xlabel('Embedding Dimension 1')
plt.ylabel('Embedding Dimension 2')
plt.title('Train Embeddings (last epoch)')
plt.legend(*scatter.legend_elements(), title="Classes")
plt.show()

emb = eval_embeddings[-1]
plt.figure(figsize=(4, 3))
scatter = plt.scatter(emb[:, 0], emb[:, 1], c=y_test.numpy(), cmap='coolwarm', label=['Class 0', 'Class 1'], alpha=0.8)
plt.xlabel('Embedding Dimension 1')
plt.ylabel('Embedding Dimension 2')
plt.title('Train Embeddings (last epoch)')
plt.legend(*scatter.legend_elements(), title="Classes")
plt.show()

import matplotlib.pyplot as plt

emb = train_embeddings[-1]
plt.figure(figsize=(4, 3))
scatter = plt.scatter(emb[:, 0], emb[:, 1], c=y_train.numpy(), cmap='coolwarm', label=['Class 0', 'Class 1'], alpha=0.8)
plt.xlabel('Embedding Dimension 1')
plt.ylabel('Embedding Dimension 2')
plt.title('Train Embeddings (last epoch)')
plt.legend(*scatter.legend_elements(), title="Classes")
plt.show()

emb = eval_embeddings[-1]
plt.figure(figsize=(4, 3))
scatter = plt.scatter(emb[:, 0], emb[:, 1], c=y_test.numpy(), cmap='coolwarm', label=['Class 0', 'Class 1'], alpha=0.8)
plt.xlabel('Embedding Dimension 1')
plt.ylabel('Embedding Dimension 2')
plt.title('Train Embeddings (last epoch)')
plt.legend(*scatter.legend_elements(), title="Classes")
plt.show()





Copied!







import matplotlib.pyplot as plt

emb = train_embeddings[-1]
plt.figure(figsize=(4, 3))
scatter = plt.scatter(emb[:, 0], emb[:, 1], c=y_train.numpy(), cmap='coolwarm', label=['Class 0', 'Class 1'], alpha=0.8)
plt.xlabel('Embedding Dimension 1')
plt.ylabel('Embedding Dimension 2')
plt.title('Train Embeddings (last epoch)')
plt.legend(*scatter.legend_elements(), title="Classes")
plt.show()

emb = eval_embeddings[-1]
plt.figure(figsize=(4, 3))
scatter = plt.scatter(emb[:, 0], emb[:, 1], c=y_test.numpy(), cmap='coolwarm', label=['Class 0', 'Class 1'], alpha=0.8)
plt.xlabel('Embedding Dimension 1')
plt.ylabel('Embedding Dimension 2')
plt.title('Train Embeddings (last epoch)')
plt.legend(*scatter.legend_elements(), title="Classes")
plt.show()

import matplotlib.pyplot as plt

emb = train_embeddings[-1]
plt.figure(figsize=(4, 3))
scatter = plt.scatter(emb[:, 0], emb[:, 1], c=y_train.numpy(), cmap='coolwarm', label=['Class 0', 'Class 1'], alpha=0.8)
plt.xlabel('Embedding Dimension 1')
plt.ylabel('Embedding Dimension 2')
plt.title('Train Embeddings (last epoch)')
plt.legend(*scatter.legend_elements(), title="Classes")
plt.show()

emb = eval_embeddings[-1]
plt.figure(figsize=(4, 3))
scatter = plt.scatter(emb[:, 0], emb[:, 1], c=y_test.numpy(), cmap='coolwarm', label=['Class 0', 'Class 1'], alpha=0.8)
plt.xlabel('Embedding Dimension 1')
plt.ylabel('Embedding Dimension 2')
plt.title('Train Embeddings (last epoch)')
plt.legend(*scatter.legend_elements(), title="Classes")
plt.show()

Keys	Action
`?`	Open this help
`n`	Next page
`p`	Previous page
`s`	Search

End-to-end Neural Networks for NLP¶

Word Embeddings¶

Some underlying math of the bag-of-words model¶

Problems with the sparse representation¶

Dense representations¶

Coding with embeddings¶

Strategies for tokenization¶

Example code for training a Sentence Piece tokenizer:¶

Example code to test the trained tokenizer:¶

Zero-padding and truncation: keeping sentences the same length¶

How far have we gone?¶

Making a pipeline with PyTorch¶

Evaluating the model¶

Some steps on optimization¶

Cross-entropy loss¶

Visualizing model¶

Practice¶

Part 1 of 2: Recall concepts¶

Part 2 of 2: Improve performance of our classifier¶

Extension: sequence models¶