# 인공지능 특화교육 - 자연어처리 실습 2

## 1. Transformer 구현
## 2. 구현한 Transformer 기반 Text Classification (Intent Classification)
## 3. 구현한 Transformer 기반 Text Classification + Contrastive learning



##### Author: 이성민(cap1232@jbnu.ac.kr)
###### github: Transformer(https://github.com/ZIZUN/pytorch_lightning_transformer),  CPFT(https://github.com/ZIZUN/CPFT)
###### references: https://arxiv.org/abs/1706.03762, https://arxiv.org/pdf/2109.06349.pdf

In [None]:
!pip3 install torch
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [112]:
import torch.nn as nn
import torch
import math

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        self.query_linear = nn.Linear(config.transformer_hidden_size, config.qkv_hidden_size * config.multi_head_num)
        self.key_linear = nn.Linear(config.transformer_hidden_size, config.qkv_hidden_size * config.multi_head_num)
        self.value_linear = nn.Linear(config.transformer_hidden_size, config.qkv_hidden_size * config.multi_head_num)
        
        self.softmax = nn.Softmax(dim=-1)
        self.mha_linear = nn.Linear(config.transformer_hidden_size, config.transformer_hidden_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input, attention_mask=None, encoder_output=None):
        q = self.query_linear(input)
        if encoder_output == None:
            k = self.key_linear(input)
            v = self.value_linear(input)
        else:
            k = self.key_linear(encoder_output)
            v = self.value_linear(encoder_output)
               
        bsz= q.size(0)
        seq_len = k.size(1)
        
        q = q.view(bsz, seq_len, self.config.multi_head_num, -1).transpose(1,2)
        k = k.view(bsz, seq_len, self.config.multi_head_num, -1).transpose(1,2)
        v = v.view(bsz, seq_len, self.config.multi_head_num, -1).transpose(1,2)
        
        qk_mul = torch.matmul(q, k.transpose(-2,-1)) / math.sqrt(q.size(-1))
        
        if attention_mask != None: # encoder att or decoder cross att
            mask = attention_mask.unsqueeze(1).expand(bsz, seq_len, seq_len).unsqueeze(1)
        elif encoder_output == None and attention_mask == None: # decoder masked att
            mask = torch.ones(bsz,seq_len,seq_len)
            mask = mask.triu(diagonal=1)
            mask = (mask==0).unsqueeze(1).to(qk_mul.device)   
        masked_qk_mul = qk_mul.masked_fill(mask == 0, -float('inf'))
        
        qk_score = self.softmax(masked_qk_mul) # divide by scaling factor
        attn_output = torch.matmul(qk_score, v)
        mha_output = self.mha_linear(attn_output.view(bsz, seq_len, -1))
        
        return self.dropout(mha_output)




class PosEncoding(nn.Module):
    def __init__(self, config):
        super().__init__()
        max_len = config.position_encoding_maxlen
        pos_encoding  = torch.ones(max_len, config.transformer_hidden_size)
        pos_encoding.requires_grad = False
        
        pos = torch.arange(0, max_len).unsqueeze(1)
        
        base_term = 10000 * torch.ones(int(config.transformer_hidden_size / 2))
        divide_term = torch.pow(base_term, torch.arange(0,config.transformer_hidden_size, 2) / config.transformer_hidden_size)
        
        pos_encoding[:,0::2] = torch.sin(pos / divide_term)
        pos_encoding[:,1::2] = torch.cos(pos / divide_term)
        
        pos_encoding = pos_encoding.unsqueeze(0)
        
        self.register_buffer('pos_encoding', pos_encoding)
        
    def forward(self, seq_len):       
        
        return self.pos_encoding[:,:seq_len,:]
    


        
class TransformerEncoder(nn.Module):
    def __init__(self, config, shared_word_embedding):
        super().__init__()
        self.config = config                
                
        self.word_embedding = shared_word_embedding
        self.pos_embedding = PosEncoding(config)
        self.encoder_layers = nn.ModuleList([TransformerEncoderLayer(config) for i in range(config.encoder_layer_num)])

    def forward(self, input_ids, attention_mask):
        
        input_repre = self.word_embedding(input_ids)
        input_repre += self.pos_embedding(input_repre.size(1))

        for layer in self.encoder_layers:
            input_repre = layer(input=input_repre, attention_mask=attention_mask)
            
        output = input_repre
        return output
    
    
class TransformerEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        self.multi_head_attention = MultiHeadAttention(config)
        self.layernorm = nn.LayerNorm(config.transformer_hidden_size)
        
        self.linear_1 = nn.Linear(config.transformer_hidden_size, config.transformer_hidden_size * 4)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(config.transformer_hidden_size * 4, config.transformer_hidden_size)

        self.dropout = nn.Dropout(0.1)
        
    def forward(self, input, attention_mask):
        mha_output = self.layernorm(input + self.multi_head_attention(input=input, attention_mask=attention_mask))
        layer_output = self.layernorm(mha_output + self.linear_2(self.relu(self.linear_1(mha_output))))
        
        return self.dropout(layer_output)
        
class TransformerDecoder(nn.Module):
    def __init__(self, config, shared_word_embedding):
        super().__init__()
        self.config = config                
                
        self.word_embedding = shared_word_embedding
        self.pos_embedding = PosEncoding(config)
        self.decoder_layers = nn.ModuleList([TransformerDecoderLayer(config) for i in range(config.encoder_layer_num)])
        


    def forward(self, input_ids, enc_output, enc_attention_mask):
        
        input_repre = self.word_embedding(input_ids)
        input_repre += self.pos_embedding(input_repre.size(1))
        

        for layer in self.decoder_layers:
            input_repre = layer(input=input_repre, enc_output=enc_output, enc_attention_mask=enc_attention_mask)
            
        output = input_repre    
        return output


class TransformerDecoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # input, attention_mask=None, encoder_output=None
        self.masked_attention = MultiHeadAttention(config)
        self.enc_dec_cross_attention = MultiHeadAttention(config)
        self.layernorm = nn.LayerNorm(config.transformer_hidden_size)
        
        self.linear_1 = nn.Linear(config.transformer_hidden_size, config.transformer_hidden_size * 4)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(config.transformer_hidden_size * 4, config.transformer_hidden_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input, enc_output, enc_attention_mask):
        
        masked_mha_output = self.layernorm(input + self.masked_attention(input=input, 
                                                                             attention_mask=None, 
                                                                             encoder_output=None))
        
        cross_mha_output = self.layernorm(masked_mha_output + self.enc_dec_cross_attention(input=masked_mha_output,
                                                                                        attention_mask=enc_attention_mask,
                                                                                        encoder_output=enc_output))
        layer_output = self.layernorm(cross_mha_output + self.linear_2(self.relu(self.linear_1(cross_mha_output))))
        
        return self.dropout(layer_output)




In [113]:

class TransformerConfig:
    def __init__(self):
        self.vocab_size = 50265
        self.transformer_hidden_size = 64
        self.multi_head_num = 4
        self.position_encoding_maxlen = 64
        
        self.qkv_hidden_size = 16
                
        self.encoder_layer_num = 6
        self.decoder_layer_num = 6

class Transformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config                

        self.shared_word_embedding = nn.Embedding(config.vocab_size, config.transformer_hidden_size)                
        self.encoder = TransformerEncoder(config, shared_word_embedding=self.shared_word_embedding)
        self.decoder = TransformerDecoder(config, shared_word_embedding=self.shared_word_embedding)

    def forward(self, enc_input_ids, enc_attention_mask, dec_input_ids):
        
        enc_output = self.encoder(input_ids=enc_input_ids, attention_mask=enc_attention_mask)
        dec_output = self.decoder(input_ids=dec_input_ids, enc_output=enc_output, enc_attention_mask=enc_attention_mask)
        
        return dec_output



In [72]:
import torch
  
# model sanity check


model_config = TransformerConfig()
model = Transformer(config=model_config)



enc_input_ids_rand = torch.randint(0, 10, (5, 30))
enc_attention_mask = torch.randint(0, 2, (5, 30))

dec_input_ids_rand = torch.randint(0, 10, (5, 30))


output = model(enc_input_ids=enc_input_ids_rand, 
               enc_attention_mask=enc_attention_mask,
               dec_input_ids=dec_input_ids_rand)


print(output.shape)

torch.Size([5, 30, 512])


In [73]:
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from tqdm import tqdm

class IntentExample:
    def __init__(self, text, label, do_lower_case):
        self.original_text = text
        self.text = text
        self.label = label

        if do_lower_case:
            self.text = self.text.lower()



def load_intent_examples(file_path, do_lower_case=True):
    examples = []
    
    labels_li = []
    with open('{}/seq.in'.format(file_path), 'r', encoding="utf-8") as f_text, open('{}/label'.format(file_path), 'r', encoding="utf-8") as f_label:
        for text, label in zip(f_text, f_label):
            text = text.strip()
            label = label.strip()
            
            if label not in labels_li:
                labels_li.append(label)
            
            e = IntentExample(text, label, do_lower_case)
            examples.append(e)
    return examples, labels_li
    

class LoadDataset(Dataset):
    def __init__(self, model_name, corpus_path, labels_li, seq_len):
        import os
        os.environ["TOKENIZERS_PARALLELISM"] = "true"
        self.seq_len = seq_len
        self.corpus_path = corpus_path
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.start = self.tokenizer.bos_token_id
        self.sep = self.tokenizer.eos_token_id
        self.padding = self.tokenizer.pad_token_id

        self.dataset, _ = load_intent_examples(file_path=corpus_path)
        self.labels_li = labels_li
        self.dataset_len = len(self.dataset)
        
        self.processed_dataset = []

        for data in tqdm(self.dataset):
            text = data.text
            label = data.label
            label = self.labels_li.index(label)

            text = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))

            if len(text) <= self.seq_len - 2:
                text = [self.start] + text + [self.sep]
                pad_length = self.seq_len - len(text)

                attention_mask = (len(text) * [1]) + (pad_length * [0])
                text = text + (pad_length * [self.padding])
            else:
                text = text[:self.seq_len - 2]
                text = [self.start] + text + [self.sep]
                attention_mask = len(text) * [1]

            model_input = text
            model_label = int(label)
            
            self.processed_dataset.append({"input_ids": model_input, 'attention_mask': attention_mask, "labels": model_label})

    def __len__(self):
        return len(self.processed_dataset)

    def __getitem__(self, item):
        output = self.processed_dataset[item]
        return {key: torch.tensor(value) for key, value in output.items()}

In [74]:
!git clone https://github.com/ZIZUN/pytorch_lightning_transformer

fatal: destination path 'pytorch_lightning_transformer' already exists and is not an empty directory.


In [88]:
class IntentCLSModule(nn.Module):
    def __init__(self, _config, num_labels=2):
        super().__init__()
        
        model_config = TransformerConfig()


        
        self.model = Transformer(config=model_config)
        
        self.classifier = nn.Linear(model_config.transformer_hidden_size, num_labels)
            
        # self.metric = Accuracy()

    def forward(self, input_ids, attention_mask, labels=None):
        
        outputs = self.model(enc_input_ids=input_ids, enc_attention_mask=attention_mask, dec_input_ids=input_ids)
        
        logits = self.classifier(outputs[:,0,:].squeeze(1))
        
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        
        return loss, logits

In [89]:
import torch.optim as optim


n_epoch = 30


train_dataset_path='/content/pytorch_lightning_transformer/data/HWU64/train_5'
model_name='roberta-base'
input_seq_len=50

_, train_labels_li = load_intent_examples(train_dataset_path)
# _, self.val_labels_li = load_intent_examples(self.val_dataset_path)
# _, self.test_labels_li = load_intent_examples(self.test_dataset_path)

train_dataset = LoadDataset(model_name, train_dataset_path, train_labels_li, seq_len=input_seq_len)        
# self.val_dataset = LoadDataset(self.model_name, self.val_dataset_path, self.val_labels_li, seq_len=self.input_seq_len)
# self.test_dataset = LoadDataset(self.model_name, self.test_dataset_path, self.test_labels_li, seq_len=self.input_seq_len)
        


loader = DataLoader(
    train_dataset,
    batch_size=4,
    num_workers=1,
)
# print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


losses = []
model = IntentCLSModule(model_config, len(train_labels_li))
# model.to(device)
# print(model)
optimizer = optim.SGD(model.parameters(), lr=0.002)

for epoch in range(n_epoch):
    total_loss = .0
    for batch_input in loader:
        batch_input = {key: value.to('cpu') for key, value in batch_input.items()}
        
        model.zero_grad()
        loss, logits = model(**batch_input)

        loss.backward()
        optimizer.step()
        
        total_loss += loss.data
        break
    losses.append(total_loss)
    
    average_loss = (total_loss/len(loader)).item()
    print('epcoh:', epoch, '\t>>\t average loss: ', average_loss)
        


100%|██████████| 320/320 [00:00<00:00, 10346.65it/s]


cuda
epcoh: 0 	>>	 average loss:  0.05352320522069931
epcoh: 1 	>>	 average loss:  0.04676540568470955
epcoh: 2 	>>	 average loss:  0.040562279522418976
epcoh: 3 	>>	 average loss:  0.03502581641077995
epcoh: 4 	>>	 average loss:  0.030163656920194626
epcoh: 5 	>>	 average loss:  0.02593984641134739
epcoh: 6 	>>	 average loss:  0.02236049249768257
epcoh: 7 	>>	 average loss:  0.01935606077313423
epcoh: 8 	>>	 average loss:  0.016865909099578857
epcoh: 9 	>>	 average loss:  0.014806771650910378
epcoh: 10 	>>	 average loss:  0.013101955875754356
epcoh: 11 	>>	 average loss:  0.011679882183670998
epcoh: 12 	>>	 average loss:  0.01049315370619297
epcoh: 13 	>>	 average loss:  0.009495769627392292
epcoh: 14 	>>	 average loss:  0.008649270981550217
epcoh: 15 	>>	 average loss:  0.007927282713353634
epcoh: 16 	>>	 average loss:  0.007306565530598164
epcoh: 17 	>>	 average loss:  0.006769813597202301
epcoh: 18 	>>	 average loss:  0.006301529705524445
epcoh: 19 	>>	 average loss:  0.00589104974

In [135]:
import torch.nn.functional as F

class IntentCLSContrastiveModule(nn.Module):
    def __init__(self, _config, num_labels=2):
        super().__init__()
        
        model_config = TransformerConfig()


        
        self.model = Transformer(config=model_config)
        
        self.classifier = nn.Linear(model_config.transformer_hidden_size, num_labels)
            
        self.sup_con_loss_fct = SupConLoss(temperature=0.1)
        self.ce_loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        
        outputs = self.model(enc_input_ids=input_ids, enc_attention_mask=attention_mask, dec_input_ids=input_ids)
        
        features = outputs[:,0,:].unsqueeze(1)
        features_for_classification = outputs[:,0,:]
        
        for i in [0, 0.06, 0.12, 0.18,0.24, ]:
            set_dropout_mf(self.model, i)
            
            pos_feature = self.model(enc_input_ids=input_ids, enc_attention_mask=attention_mask, dec_input_ids=input_ids)[:,0,:].unsqueeze(1)

            # print(features.shape, pos_feature.shape)

            features = torch.cat([features, pos_feature], dim=1)
        sup_con_loss = self.sup_con_loss_fct(F.normalize(features, p=2, dim=2), labels)
        
        
        logits = self.classifier(features_for_classification)
        # print(logits.shape, labels.shape)
        cls_loss = self.ce_loss_fct(logits, labels)
        

        loss = sup_con_loss  +  cls_loss 

        
        return loss, logits





def set_dropout_mf(
    model:nn, 
    w
    ):
    """Alters the dropouts in the embeddings.
    """
    # ------ set hidden dropout -------#
    if hasattr(model, 'module'):
        for i in model.encoder.encoder_layers:
            i.multi_head_attention.dropout.p = w
            i.multi_head_attention.dropout.p = w
            i.dropout.p = w
        for i in model.decoder.decoder_layers:
            i.masked_attention.dropout.p = w
            i.enc_dec_cross_attention.dropout.p = w
            i.dropout.p = w 
    else:
        for i in model.encoder.encoder_layers:
            i.multi_head_attention.dropout.p = w
            i.multi_head_attention.dropout.p = w
            i.dropout.p = w
        for i in model.decoder.decoder_layers:
            i.masked_attention.dropout.p = w
            i.enc_dec_cross_attention.dropout.p = w
            i.dropout.p = w 
        
    return model

class SupConLoss(nn.Module):
    """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf.
    It also supports the unsupervised contrastive loss in SimCLR"""
    def __init__(self, temperature=0.07, contrast_mode='all',
                 base_temperature=0.07):
        super(SupConLoss, self).__init__()
        self.temperature = temperature
        self.contrast_mode = contrast_mode
        self.base_temperature = base_temperature

    def forward(self, features, labels=None, mask=None):
        """Compute loss for model. If both `labels` and `mask` are None,
        it degenerates to SimCLR unsupervised loss:
        https://arxiv.org/pdf/2002.05709.pdf
        Args:
            features: hidden vector of shape [bsz, n_views, ...].
            labels: ground truth of shape [bsz].
            mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
                has the same class as sample i. Can be asymmetric.
        Returns:
            A loss scalar.
        """
        device = (torch.device('cuda') 
                  if features.is_cuda
                  else torch.device('cpu'))

        if len(features.shape) < 3:
            raise ValueError('`features` needs to be [bsz, n_views, ...],'
                             'at least 3 dimensions are required')
        if len(features.shape) > 3:
            features = features.view(features.shape[0], features.shape[1], -1)

        batch_size = features.shape[0]
        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32).to(device)
        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            if labels.shape[0] != batch_size:
                raise ValueError('Num of labels does not match num of features')
            mask = torch.eq(labels, labels.T).float().to(device)
        else:
            mask = mask.float().to(device)

        contrast_count = features.shape[1]
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0) # (bsz*view, hidden)
        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count # view
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        # compute logits
        anchor_dot_contrast = torch.div(
            torch.matmul(anchor_feature, contrast_feature.T),
            self.temperature)
        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach() # (bsz*view, bsz*view)

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count) # (bsz*view, bsz*view)
        # mask-out self-contrast cases
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count).view(-1, 1).to(device),
            0
        )
        
       
        # logits_mask -> tensor([
        #     [0., 1., 1., 1.],
        #     [1., 0., 1., 1.],
        #     [1., 1., 0., 1.],
        #     [1., 1., 1., 0.]])
        mask = mask * logits_mask
        


        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positive
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)

        # loss
        loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()

        return loss        
    

In [136]:
import torch.optim as optim


n_epoch = 30


train_dataset_path='/content/pytorch_lightning_transformer/data/HWU64/train_5'
model_name='roberta-base'
input_seq_len=50

_, train_labels_li = load_intent_examples(train_dataset_path)
# _, self.val_labels_li = load_intent_examples(self.val_dataset_path)
# _, self.test_labels_li = load_intent_examples(self.test_dataset_path)

train_dataset = LoadDataset(model_name, train_dataset_path, train_labels_li, seq_len=input_seq_len)        
# self.val_dataset = LoadDataset(self.model_name, self.val_dataset_path, self.val_labels_li, seq_len=self.input_seq_len)
# self.test_dataset = LoadDataset(self.model_name, self.test_dataset_path, self.test_labels_li, seq_len=self.input_seq_len)
        


loader = DataLoader(
    train_dataset,
    batch_size=4,
    num_workers=1,
)
# print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


losses = []
model = IntentCLSContrastiveModule(model_config, len(train_labels_li))
# model.to(device)
# print(model)
optimizer = optim.SGD(model.parameters(), lr=0.002)

for epoch in range(n_epoch):
    total_loss = .0
    for batch_input in loader:
        batch_input = {key: value.to('cpu') for key, value in batch_input.items()}
        
        model.zero_grad()
        loss, logits = model(**batch_input)

        loss.backward()
        optimizer.step()
        
        total_loss += loss.data
        break
    losses.append(total_loss)
    
    average_loss = (total_loss/len(loader)).item()
    print('epcoh:', epoch, '\t>>\t average loss: ', average_loss)
        


100%|██████████| 320/320 [00:00<00:00, 10828.38it/s]

cuda





epcoh: 0 	>>	 average loss:  0.1244221180677414
epcoh: 1 	>>	 average loss:  0.12693920731544495
epcoh: 2 	>>	 average loss:  0.12797527015209198
epcoh: 3 	>>	 average loss:  0.12507325410842896
epcoh: 4 	>>	 average loss:  0.11502696573734283
epcoh: 5 	>>	 average loss:  0.11924104392528534
epcoh: 6 	>>	 average loss:  0.11322568356990814
epcoh: 7 	>>	 average loss:  0.1136661022901535
epcoh: 8 	>>	 average loss:  0.10878562927246094
epcoh: 9 	>>	 average loss:  0.10821595042943954
epcoh: 10 	>>	 average loss:  0.10828202962875366
epcoh: 11 	>>	 average loss:  0.10166139900684357
epcoh: 12 	>>	 average loss:  0.10243082046508789
epcoh: 13 	>>	 average loss:  0.10519327968358994
epcoh: 14 	>>	 average loss:  0.09115318208932877
epcoh: 15 	>>	 average loss:  0.08803428709506989
epcoh: 16 	>>	 average loss:  0.09332285076379776
epcoh: 17 	>>	 average loss:  0.0884881541132927
epcoh: 18 	>>	 average loss:  0.08255601674318314
epcoh: 19 	>>	 average loss:  0.09259887784719467
epcoh: 20 	>>