The Self-Attention Mechanism¶
Recalling previous lessons¶
When we go from sequences of word embeddings to a document-wise vector representation that can be classified, we have to somehow summarize a sequence of vectors into a single vector. So far, what we have been doing is:
- Get one embedding per word ($\mathbb{R}^{t \times n}$, where $t$ is the sequence length and $n$ is the embedding dimension),
- Calculate the timewise mean of the words ($\mathbb{R}^{1 \times n}$)
- Proceed to classification with our Residual MLP modules
The pipeline looks something like this:
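In tensor terms, a minimal sketch of this pipeline (the variable names and sizes are illustrative, not the ones used later in this lesson):
import torch

t, n = 3, 2  # sequence length, embedding dimension
word_embeddings = torch.randn(t, n)  # one embedding per word: (t x n)
doc_vector = word_embeddings.mean(dim=0, keepdim=True)  # timewise mean: (1 x n)
print(doc_vector.shape)  # torch.Size([1, 2])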
Exercise
Now, reflect on this functioning. Which of the problems of the traditional bag-of-words approach is solved by using this idea?
Check an answer here
The embeddings in this approach mitigate the need for larger datasets and tend to enable classifiers that require less data for training. However, summarizing with the mean totally disregards the order of the words - essentially, we are doing a glorified bag-of-words model in an embedding space, which, albeit cool, addresses none of the word-order issues of Bag-of-Words.
Analysis: words with no order¶
The model MeanClassifier below implements a classifier following exactly the flowchart above. Execute this code. Why is the loss not decreasing very much? Hint: look at the dataset.
Expand for answer
The dataset has four data points. However, because the mean is used as the aggregator, the embeddings for items 1 and 2 are strictly the same, and the embeddings for items 3 and 4 are also the same. Since each pair has opposite labels, the gradients "pull" the classifier in opposite directions, and the tendency is that both embeddings (for 1/2 and for 3/4) end up with probability 0.5 of belonging to each class.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
torch.manual_seed(20)
class ResidualBlock(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.nonlinearity = nn.ReLU()
        # Initialize weights to identity and biases to zero
        nn.init.eye_(self.fc1.weight)
        nn.init.zeros_(self.fc1.bias)
        nn.init.eye_(self.fc2.weight)
        nn.init.zeros_(self.fc2.bias)

    def forward(self, x):
        residual = x
        x = self.nonlinearity(self.fc1(x))
        x = self.fc2(x)
        x += residual  # Add the residual connection
        return x
class MeanClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        self.r1 = ResidualBlock(embedding_dim)
        self.clf = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        x = self.r1(x)
        x = torch.mean(x, dim=1)
        x = self.clf(x)
        return x
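To make the answer above concrete, here is a quick check (a sketch, using an untrained instance of the class above; demo_model and the example sequences are mine, not part of the original notebook) showing that the mean aggregation produces exactly the same output for a sequence and its reverse:
demo_model = MeanClassifier(vocab_size=5, embedding_dim=2)
seq = torch.tensor([[0, 1, 2]])
rev = torch.tensor([[2, 1, 0]])
# The embedding and the residual block act on each position independently,
# so the mean over positions erases the word order completely.
print(torch.allclose(demo_model(seq), demo_model(rev)))  # True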
# Dataset
X = torch.tensor([
[0, 1, 2],
[2, 1, 0],
[1, 3, 4],
[4, 3, 1],
])
y = torch.tensor([
[1.],
[0.],
[1.],
[0.],
])
def train_one_epoch(model, optimizer, x, y):
    model.train()
    optimizer.zero_grad()
    output = model(x)
    loss = F.binary_cross_entropy_with_logits(output, y)
    loss.backward()
    optimizer.step()
    return loss.item()
model = MeanClassifier(vocab_size=5, embedding_dim=2)
optimizer = optim.SGD(model.parameters(), lr=1e-2) # lr is the learning rate - this is our alpha
print("Entering loop")
# And now, this is the training loop:
losses = []
for epoch in tqdm(range(1000)):
    loss = train_one_epoch(model, optimizer, X, y)
    losses.append(loss)
Entering loop
100%|██████████| 1000/1000 [00:01<00:00, 825.46it/s]
import matplotlib.pyplot as plt
# Plot losses over epochs
plt.figure(figsize=(5, 3))
plt.plot(range(1, len(losses) + 1), losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss: simple classifier with residual block\nfor data where reverse order matters')
plt.grid(True)
plt.show()
Positional encoding¶
One possible way to use the order of the words is to encode the token positions (that is, literally their indices in the input sequence) as embeddings, and then add those embeddings to the word embeddings.
This leads to a solution like this:
In other words, we use an auxiliary sequence of tokens that represents the position of each word in the sequence. Each of these "position tokens" then receives an embedding (this could be a trained embedding, although the transformer paper uses a pre-defined sine-cosine embedding - both seem to work fine). These position encodings are added to the word embeddings, and the result of this operation is itself propagated as the input of the network, as in:
Exercise
Which of the limitations of Bag-of-Words is solved by positional encoding? Which limitations still exist?
An answer is here
Bag-of-words completely disregards the order of words, and so does a simple classifier with embeddings and a mean. A positional encoder, however, can tell that reversed sequences may mean different things. What we still cannot capture are changes in a word's meaning due to its position relative to other words.
Example: positional encoder¶
Let's see this phenomenon in action.
class ClassifierWithPositionEncoding(nn.Module):
    def __init__(self, vocab_size, seq_len, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        self.position_embedding = nn.Embedding(
            num_embeddings=seq_len,  # one position embedding per index in the sequence
            embedding_dim=embedding_dim
        )
        self.r1 = ResidualBlock(embedding_dim)
        self.clf = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        pos_enc = self.position_embedding(torch.arange(x.size(1)))
        x_embeddings = self.embedding(x)
        x = x_embeddings + pos_enc
        x = self.r1(x)
        x = torch.mean(x, dim=1)
        x = self.clf(x)
        return x
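The classifier above learns its position embeddings from data. For reference, the fixed sine-cosine encoding from the transformer paper could be sketched as below (this helper is illustrative and is not used by the classifiers in this lesson):
def sinusoidal_positional_encoding(seq_len, d_model):
    # PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
    pos = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)
    i = torch.arange(0, d_model, 2, dtype=torch.float32)
    angles = pos / (10000 ** (i / d_model))
    pe = torch.zeros(seq_len, d_model)
    pe[:, 0::2] = torch.sin(angles)  # even dimensions
    pe[:, 1::2] = torch.cos(angles)  # odd dimensions
    return pe

print(sinusoidal_positional_encoding(seq_len=3, d_model=2).shape)  # torch.Size([3, 2])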
model1 = ClassifierWithPositionEncoding(vocab_size=5, seq_len=3, embedding_dim=2)
optimizer1 = optim.Adam(model1.parameters(), lr=1e-2) # lr is the learning rate - this is our alpha
model2 = MeanClassifier(vocab_size=5, embedding_dim=2)
optimizer2 = optim.Adam(model2.parameters(), lr=1e-2) # lr is the learning rate - this is our alpha
print("Entering loop")
# And now, this is the training loop:
losses1 = []
losses2 = []
for epoch in tqdm(range(1000)):
    loss1 = train_one_epoch(model1, optimizer1, X, y)
    losses1.append(loss1)
    loss2 = train_one_epoch(model2, optimizer2, X, y)
    losses2.append(loss2)
Entering loop
100%|██████████| 1000/1000 [00:03<00:00, 282.66it/s]
plt.figure(figsize=(5, 3))
plt.plot(range(1, len(losses2) + 1), losses2, 'b', label='Simple')
plt.plot(range(1, len(losses1) + 1), losses1, 'r', label='Position')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss: classifier with position encoding\nfor data where reverse order matters')
plt.grid(True)
plt.legend()
plt.show()
Exercise
Why do we need the residual block in this classifier?
We need the ability to apply different transformations to different regions of our latent space so that the combination of word embeddings and position embeddings can be meaningful.
Self-Attention¶
If you were not living under a rock in the last few years, you probably heard about something called a "transformer". This is a neural network topology made famous by an article called Attention Is All You Need (which revolutionized both how we model neural networks for NLP and how we title our papers on the topic). The paper presents many ingenious mechanisms, some of which we will discuss now.
The base idea of attention¶
In the Attention paper, Vaswani and colleagues observed that summarizing a sequence is (at least in a metaphorical sense) similar to calculating a weighted average of the inputs, where the weights represent the relevance of each input. They also observed that recurrent networks (even LSTMs) tend to give more relevance to words that are close by, even though they can potentially draw information from long distances - and this is not desirable. Hence, they came up with a clever solution.
Their solution starts by calculating the pairwise similarity between the items in the input (that is, between the word embeddings). For this, they use the inner product between each pair of words, or simply:
$$ S = X X^T $$
where $X \in \mathbb{R}^{T \times D}$ contains $T$ word embeddings with dimension $D$.
The result $S \in \mathbb{R}^{T \times T}$ indicates the similarity between each pair of words in the sequence.
After that, $S$ is divided by $\sqrt{D}$ as a form of normalization: this keeps the similarity values from growing simply because the embedding dimension is larger.
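To get an intuition for why this helps, consider a quick (illustrative) experiment: for random embeddings with roughly unit-variance entries, the dot product of two vectors has a standard deviation of about $\sqrt{D}$, so dividing by $\sqrt{D}$ keeps the values entering the softmax on a comparable scale regardless of the embedding dimension:
a = torch.randn(10000, 64)
b = torch.randn(10000, 64)
dots = (a * b).sum(dim=1)  # raw dot products between 10000 random pairs, D = 64
print(dots.std())                # roughly sqrt(64) = 8
print((dots / 64 ** 0.5).std())  # roughly 1 after scaling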
Finally, they normalize each row of $S$ using a softmax function, which transforms each row of $S$ into a probability distribution (that is, it sums to $1$).
At the end of this process, they have a weight matrix (they don't actually call it a weight matrix, but I am doing so):
$$ W = \text{softmax} (XX^T/\sqrt{D}) $$
At this point, each element $w_{t_1,t_2}$ of $W$ contains the weight given to token $t_2$ at time $t_1$. To apply these weights, they multiply $W$ by $X$, so that:
$$ Y = WX = \text{softmax} (XX^T/\sqrt{D})X. $$
Thus, $Y \in \mathbb{R}^{T \times D}$ contains a transformation of the sequence (this is not stated, but I guess the name "transformer" comes from this idea) where each output is a weighted mean of the inputs. The weights can be seen as the "relevance" of each input, that is, the points to which the system should pay attention - hence, attention is all you need.
Now, our structure looks like this:
Example¶
Suppose we have a phrase such as "Trees, flowers, university, classroom". It is reasonable to suppose that the embeddings of "Trees" and "flowers" are somewhat similar, whereas "university" has an embedding that is less similar to them. Hence, we could have embeddings $X$ like:
$$ X = \begin{bmatrix} 0.7 & 0.6 \\ 0.6 & 0.7 \\ -1.0 & 0 \\ -0.9 & -0.1 \end{bmatrix} $$
Let's follow each operation we have proposed here:
X = torch.tensor([
    [0.7, 0.6],
    [0.6, 0.7],
    [-1.0, 0.0],
    [-0.9, -0.1],
])
S = X @ X.T
print(S)
tensor([[ 0.8500, 0.8400, -0.7000, -0.6900],
[ 0.8400, 0.8500, -0.6000, -0.6100],
[-0.7000, -0.6000, 1.0000, 0.9000],
[-0.6900, -0.6100, 0.9000, 0.8200]])
W = torch.softmax(S/torch.sqrt(torch.tensor([3])), dim=1)
print(W)
tensor([[0.3554, 0.3533, 0.1452, 0.1461],
[0.3479, 0.3499, 0.1515, 0.1506],
[0.1380, 0.1462, 0.3682, 0.3476],
[0.1440, 0.1508, 0.3607, 0.3444]])
Y = W @ X
print(Y)
tensor([[ 0.1841, 0.4460],
[ 0.1664, 0.4387],
[-0.4967, 0.1504],
[-0.4793, 0.1576]])
plt.figure(figsize=(3,3))
plt.scatter(X[:, 0], X[:, 1], color='blue', s=10, label='X')
plt.scatter(Y[:, 0], Y[:, 1], color='red', s=10, label='Y')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('Scatterplot of X and Y')
plt.legend()
plt.grid(True)
plt.show()
As we can see, the effect of this transformation is to make similar embeddings even more similar.
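One way to quantify this is to compare the pairwise cosine similarities of X and Y from the example above (the helper cosine_matrix is mine, added just for this check):
def cosine_matrix(M):
    M_norm = M / M.norm(dim=1, keepdim=True)
    return M_norm @ M_norm.T

print(cosine_matrix(X))  # pairwise similarities between the original embeddings
print(cosine_matrix(Y))  # after attention, similar rows are even more aligned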
Exercise
How does that help with learning?
If similar words become more similar, whatever we learn about one of them is more likely to also apply to the other. Moreover, this proximity depends on context: in other contexts, these words could be moved closer to other words.
Query, key, value¶
The attention mechanism was observed to be similar to a search with a query in a key-value database. In this type of database, we give relevance to values based on the similarity between a query and a key. Because of that, the output is actually calculated using three separate inputs, as in:
$$ Y = \text{softmax} (QK^T/\sqrt{D})V. $$
An important step here is to allow three different inputs, instead of forcing $Q=K=V$. Thus, we have inputs $X_q, X_k, X_v$. Also, the paper proposed making linear projections of the inputs to obtain $Q,K,V$, that is:
$$ Q = X_qW_q^T\\ K = X_kW_k^T\\ V = X_vW_v^T, $$
and this allows learning the weight matrices $W_q, W_k, W_v$.
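A minimal sketch of this projected attention (the class name and shapes are illustrative; PyTorch's nn.MultiheadAttention, used below, implements the same idea with multiple heads and an output projection):
class SingleHeadAttention(nn.Module):
    def __init__(self, d):
        super().__init__()
        # learnable projections playing the roles of W_q, W_k, W_v
        self.w_q = nn.Linear(d, d, bias=False)
        self.w_k = nn.Linear(d, d, bias=False)
        self.w_v = nn.Linear(d, d, bias=False)

    def forward(self, x_q, x_k, x_v):
        Q, K, V = self.w_q(x_q), self.w_k(x_k), self.w_v(x_v)
        W = torch.softmax(Q @ K.transpose(-2, -1) / Q.shape[-1] ** 0.5, dim=-1)
        return W @ V

attn = SingleHeadAttention(d=2)
X_demo = torch.randn(4, 2)             # T = 4 tokens with D = 2 dimensions
Y_demo = attn(X_demo, X_demo, X_demo)  # self-attention: X_q = X_k = X_v
print(Y_demo.shape)                    # torch.Size([4, 2])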
Exercise
How does that help in the classification process?
By applying these different linear transformations, we change the alignments between words, which in turn changes which words are "closer" to which, and hence which words are grouped together in the calculation of $Y$.
Example: when relative words matter¶
Check this example. Looking at this dataset, why does one classifier fail while the other succeeds?
# Dataset
X = torch.tensor([
[0, 1, 2],
[0, 3, 2],
[4, 3, 2],
[4, 1, 2],
])
y = torch.tensor([
[1.],
[0.],
[1.],
[0.],
])
class ClassifierWithMHAPositionEncoding(nn.Module):
    def __init__(self, vocab_size, seq_len, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
        self.position_embedding = nn.Embedding(
            num_embeddings=seq_len,  # one position embedding per index in the sequence
            embedding_dim=embedding_dim
        )
        self.mhe = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=1,
            batch_first=True
        )
        self.r1 = ResidualBlock(embedding_dim)
        self.clf = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        pos_enc = self.position_embedding(torch.arange(x.size(1)))
        x_embeddings = self.embedding(x)
        x = x_embeddings + pos_enc
        x_ = self.mhe(x, x, x)[0]
        x = x_ + x
        x = self.r1(x)
        x = torch.mean(x, dim=1)
        x = self.clf(x)
        return x
model_baseline = ClassifierWithPositionEncoding(vocab_size=5, seq_len=3, embedding_dim=2)
optimizer_baseline = optim.Adam(model_baseline.parameters(), lr=1e-2) # lr is the learning rate - this is our alpha
model_mha = ClassifierWithMHAPositionEncoding(vocab_size=5, seq_len=3, embedding_dim=2)
optimizer_mha = optim.Adam(model_mha.parameters(), lr=1e-2) # lr is the learning rate - this is our alpha
print("Entering loop")
# And now, this is the training loop:
losses_bl = []
losses_mha = []
for epoch in tqdm(range(1000)):
    loss1 = train_one_epoch(model_baseline, optimizer_baseline, X, y)
    loss2 = train_one_epoch(model_mha, optimizer_mha, X, y)
    losses_bl.append(loss1)
    losses_mha.append(loss2)
plt.figure(figsize=(5, 3))
plt.plot(range(1, len(losses_bl) + 1), losses_bl, 'r', label='Position')
plt.plot(range(1, len(losses_mha) + 1), losses_mha, 'g', label='Self-Attention')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss: data where relative order matters\nPosition encoding vs. Self-Attention')
plt.grid(True)
plt.legend()
plt.show()
Entering loop
100%|██████████| 1000/1000 [00:04<00:00, 202.39it/s]
Exercise: ablation study¶
Remove the positional encoding from the multihead attention. What happens to the loss? Why does that happen?
Case study¶
Take a look at the dataset below. Explain the results shown in the figure. Why does each model reach a different loss?
# Dataset
X = torch.tensor([
[0, 1, 2],
[0, 3, 2],
[4, 3, 2],
[4, 1, 2],
[0, 1, 2],
[2, 1, 0],
[1, 3, 4],
[4, 3, 1],
])
y = torch.tensor([
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
[1.],
[0.],
])
model_baseline = ClassifierWithPositionEncoding(vocab_size=5, seq_len=3, embedding_dim=2)
optimizer_baseline = optim.Adam(model_baseline.parameters(), lr=1e-2) # lr is the learning rate - this is our alpha
model_mha = ClassifierWithMHAPositionEncoding(vocab_size=5, seq_len=3, embedding_dim=2)
optimizer_mha = optim.Adam(model_mha.parameters(), lr=1e-2) # lr is the learning rate - this is our alpha
model_simple = MeanClassifier(vocab_size=5, embedding_dim=2)
optimizer_simple = optim.Adam(model_simple.parameters(), lr=1e-2) # lr is the learning rate - this is our alpha
print("Entering loop")
# And now, this is the training loop:
losses_bl = []
losses_mha = []
losses_simple = []
for epoch in tqdm(range(500)):
    loss1 = train_one_epoch(model_baseline, optimizer_baseline, X, y)
    loss2 = train_one_epoch(model_mha, optimizer_mha, X, y)
    loss3 = train_one_epoch(model_simple, optimizer_simple, X, y)
    losses_bl.append(loss1)
    losses_mha.append(loss2)
    losses_simple.append(loss3)
plt.figure(figsize=(5, 3))
plt.plot(range(1, len(losses_simple) + 1), losses_simple, 'b', label='Simple')
plt.plot(range(1, len(losses_bl) + 1), losses_bl, 'r', label='Position')
plt.plot(range(1, len(losses_mha) + 1), losses_mha, 'g', label='Self-Attention')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training losses for different language models\nData where relative order and word reversal matters')
plt.grid(True)
plt.legend()
plt.show()
Entering loop
100%|██████████| 500/500 [00:03<00:00, 147.91it/s]
Practice: Multi-head attention¶
It has been observed that using multiple attention mechanisms in parallel can improve the final results. Thus, the idea is to feed the same input to multiple attention mechanisms ("heads") and then concatenate their results. This is somewhat ingenious, as it allows attending to different parts of the input for different reasons.
The PyTorch API for a multihead attention layer is very simple:
mhe = nn.MultiheadAttention(embed_dim, num_heads)
where embed_dim must be divisible by num_heads.
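As a starting point (a sketch; the tensor sizes are just examples), a 2-head block can be instantiated and applied to a batch like this - internally, PyTorch concatenates the heads and applies an output projection:
mhe = nn.MultiheadAttention(embed_dim=2, num_heads=2, batch_first=True)
x = torch.randn(4, 3, 2)            # (batch, seq_len, embedding_dim)
out, attn_weights = mhe(x, x, x)    # self-attention: query = key = value = x
print(out.shape)                    # torch.Size([4, 3, 2])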
Use the documentation to add a curve to the case study with a 2-head attention block (instead of the single-head attention block we are using). How can we combine the attention produced by each of the heads?