Add medium baseline method (also pass strong baseline)

- select all features except participant state
birdmandayum0131 · Sep 25, 2022 · 265a542 · 265a542
1 parent 45d60cd
commit 265a542
Show file tree

Hide file tree

Showing 2 changed files with 313 additions and 0 deletions.
diff --git a/HW01/HW01_medium_baseline.py b/HW01/HW01_medium_baseline.py
@@ -0,0 +1,313 @@
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Import packages'''
+# Numerical Operations
+
+# Reading/Writing Data
+
+# For Progress Bar
+
+# Pytorch
+
+# For plotting learning curve
+#from torch.utils.tensorboard import SummaryWriter
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Some Utility Functions'''
+'''You do not need to modify this part'''
+
+
+
+
+import matplotlib.pyplot as plt
+from torch.utils.data import Dataset, DataLoader, random_split
+import torch.nn as nn
+import math
+import numpy as np
+import pandas as pd
+import os
+import csv
+from tqdm import tqdm
+import torch
+def same_seed(seed):
+    '''Seed the NumPy and PyTorch random generators for reproducible runs.
+
+    Also asks cuDNN for deterministic algorithms and switches off its
+    benchmark mode.
+
+    Args:
+        seed: Integer seed given to NumPy, to PyTorch and, when CUDA is
+            available, to every CUDA device generator.
+    '''
+    cudnn = torch.backends.cudnn
+    cudnn.deterministic, cudnn.benchmark = True, False
+
+    for seed_fn in (np.random.seed, torch.manual_seed):
+        seed_fn(seed)
+
+    if not torch.cuda.is_available():
+        return
+    torch.cuda.manual_seed_all(seed)
+
+
+def train_valid_split(data_set, valid_ratio, seed):
+    '''Randomly partition data_set into a training part and a validation part.
+
+    Args:
+        data_set: Indexable collection of samples (rows).
+        valid_ratio: Fraction of samples assigned to validation; the
+            validation size is int(valid_ratio * len(data_set)).
+        seed: Seed for the generator that drives the random partition.
+
+    Returns:
+        Tuple (train, valid) of NumPy arrays built from the two parts.
+    '''
+    n_total = len(data_set)
+    n_valid = int(valid_ratio * n_total)
+    n_train = n_total - n_valid
+
+    rng = torch.Generator().manual_seed(seed)
+    train_part, valid_part = random_split(
+        data_set, [n_train, n_valid], generator=rng)
+    return np.array(train_part), np.array(valid_part)
+
+
+def predict(test_loader, model, device):
+    '''Run model over every batch of test_loader and return the stacked outputs.
+
+    Args:
+        test_loader: Iterable yielding input batches (no targets).
+        model: Network to evaluate; it is switched to evaluation mode.
+        device: Device each batch is moved to before the forward pass.
+
+    Returns:
+        NumPy array of all batch outputs concatenated along dimension 0.
+    '''
+    model.eval()  # Set your model to evaluation mode.
+    batch_outputs = []
+    with torch.no_grad():  # inference only, no gradient tracking needed
+        for batch in tqdm(test_loader):
+            output = model(batch.to(device))
+            batch_outputs.append(output.detach().cpu())
+    return torch.cat(batch_outputs, dim=0).numpy()
+
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Dataset'''
+
+
+class COVID19Dataset(Dataset):
+    '''Dataset holding features, and optionally targets, as float tensors.
+
+    x: Features.
+    y: Targets; when None the dataset yields features only (prediction).
+    '''
+
+    def __init__(self, x, y=None):
+        self.x = torch.FloatTensor(x)
+        self.y = None if y is None else torch.FloatTensor(y)
+
+    def __getitem__(self, idx):
+        features = self.x[idx]
+        if self.y is None:
+            return features
+        return features, self.y[idx]
+
+    def __len__(self):
+        # Number of samples is the number of feature rows.
+        return len(self.x)
+
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Neural Network Model'''
+
+
+class My_Model(nn.Module):
+    '''Fully connected regressor: input_dim -> 16 -> 8 -> 1 with ReLU between layers.'''
+
+    def __init__(self, input_dim):
+        super().__init__()
+        # TODO: modify model's structure, be aware of dimensions.
+        widths = (input_dim, 16, 8, 1)
+        stack = []
+        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
+            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
+        # Drop the trailing ReLU: the output layer is left linear.
+        self.layers = nn.Sequential(*stack[:-1])
+
+    def forward(self, x):
+        return self.layers(x).squeeze(1)  # (B, 1) -> (B)
+
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Feature Selection'''
+
+
+def select_feat(train_data, valid_data, test_data, select_all=True):
+    '''Split off the targets and select the feature columns used for regression.
+
+    The last column of train_data and valid_data is taken as the target;
+    test_data has no target column.
+
+    Args:
+        train_data: 2-D array of training rows, target in the last column.
+        valid_data: 2-D array of validation rows, target in the last column.
+        test_data: 2-D array of test rows (features only).
+        select_all: When True keep every non-target column; otherwise keep
+            only the columns after the id and state columns.
+
+    Returns:
+        Tuple (x_train, x_valid, x_test, y_train, y_valid).
+    '''
+    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
+    raw_x_train, raw_x_valid, raw_x_test = (
+        train_data[:, :-1], valid_data[:, :-1], test_data)
+
+    n_cols = raw_x_train.shape[1]
+    if select_all:
+        feat_idx = list(range(n_cols))
+    else:
+        # Assumed column layout: 1 id column, then 37 state indicator
+        # columns, then the survey features. Starting at 38 drops the id and
+        # all 37 states; starting at 37 would keep the last state column.
+        n_id_cols, n_state_cols = 1, 37
+        # TODO: Select suitable feature columns.
+        feat_idx = list(range(n_id_cols + n_state_cols, n_cols))
+
+    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid
+
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Training Loop'''
+# Module-level training history; trainer() appends to / updates these so the
+# plotting code can read them after training.
+train_loss_list, val_loss_list = [], []
+train_best_loss, val_best_loss = math.inf, math.inf
+
+
+def trainer(train_loader, valid_loader, model, config, device):
+    '''Train model with SGD and keep the checkpoint with the lowest validation loss.
+
+    Per-epoch mean losses are appended to the module-level train_loss_list
+    and val_loss_list, and the running minima are kept in train_best_loss
+    and val_best_loss. Whenever the validation loss improves,
+    model.state_dict() is written to config['save_path']. Training returns
+    early once config['early_stop'] consecutive epochs pass without an
+    improvement.
+
+    Args:
+        train_loader: Iterable of (x, y) training batches.
+        valid_loader: Iterable of (x, y) validation batches.
+        model: Network to optimise; its parameters are updated in place.
+        config: Dict read for 'learning_rate', 'n_epochs', 'early_stop'
+            and 'save_path'.
+        device: Device each batch is moved to before the forward pass.
+    '''
+    global train_loss_list, val_loss_list
+    global train_best_loss, val_best_loss
+    # Define your loss function, do not modify this.
+    criterion = nn.MSELoss(reduction='mean')
+
+    # Define your optimization algorithm.
+    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
+    # TODO: L2 regularization (optimizer(weight decay...) or implement by your self).
+    optimizer = torch.optim.SGD(
+        model.parameters(), lr=config['learning_rate'], momentum=0.9)
+
+    # Create the directory the checkpoint is actually written to, so a
+    # save_path outside './models' does not fail at the first torch.save.
+    save_dir = os.path.dirname(config['save_path'])
+    if save_dir:
+        os.makedirs(save_dir, exist_ok=True)
+
+    n_epochs, early_stop_count = config['n_epochs'], 0
+
+    for epoch in range(n_epochs):
+        model.train()  # Set your model to train mode.
+        loss_record = []
+
+        # tqdm is a package to visualize your training progress.
+        train_pbar = tqdm(train_loader, position=0, leave=True)
+
+        for x, y in train_pbar:
+            optimizer.zero_grad()               # Set gradient to zero.
+            x, y = x.to(device), y.to(device)   # Move your data to device.
+            pred = model(x)
+            loss = criterion(pred, y)
+            # Compute gradient(backpropagation).
+            loss.backward()
+            optimizer.step()                    # Update parameters.
+            batch_loss = loss.detach().item()
+            loss_record.append(batch_loss)
+
+            # Display current epoch number and loss on tqdm progress bar.
+            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
+            train_pbar.set_postfix({'loss': batch_loss})
+
+        mean_train_loss = sum(loss_record)/len(loss_record)
+        train_loss_list.append(mean_train_loss)
+
+        model.eval()  # Set your model to evaluation mode.
+        loss_record = []
+        for x, y in valid_loader:
+            x, y = x.to(device), y.to(device)
+            with torch.no_grad():
+                pred = model(x)
+                loss = criterion(pred, y)
+
+            loss_record.append(loss.item())
+
+        mean_valid_loss = sum(loss_record)/len(loss_record)
+        print(
+            f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
+        val_loss_list.append(mean_valid_loss)
+
+        if mean_train_loss < train_best_loss:
+            train_best_loss = mean_train_loss
+
+        if mean_valid_loss < val_best_loss:
+            val_best_loss = mean_valid_loss
+            # Save your best model
+            torch.save(model.state_dict(), config['save_path'])
+            print('Saving model with loss {:.3f}...'.format(val_best_loss))
+            early_stop_count = 0
+        else:
+            early_stop_count += 1
+
+        if early_stop_count >= config['early_stop']:
+            print('\nModel is not improving, so we halt the training session.')
+            return
+
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Configurations'''
+# Run on the GPU when one is available, otherwise on the CPU.
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+config = dict(
+    seed=5201314,         # Your seed number, you can pick your lucky number. :)
+    select_all=False,     # Whether to use all features.
+    valid_ratio=0.2,      # validation_size = train_size * valid_ratio
+    n_epochs=3000,        # Number of epochs.
+    batch_size=256,
+    learning_rate=1e-5,
+    # If model has not improved for this many consecutive epochs, stop training.
+    early_stop=400,
+    save_path='./models/model.ckpt',  # Your model will be saved here.
+)
+
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Dataloader'''
+'''You do not need to modify this part'''
+# Set seed for reproducibility
+# Seed every random generator before any data is split or shuffled.
+same_seed(config['seed'])
+
+
+# train_data size: 2699 x 118 (id + 37 states + 16 features x 5 days)
+# test_data size: 1078 x 117 (without last day's positive rate)
+train_data, test_data = pd.read_csv(
+    './covid.train.csv').values, pd.read_csv('./covid.test.csv').values
+train_data, valid_data = train_valid_split(
+    train_data, config['valid_ratio'], config['seed'])
+
+# Print out the data size.
+print(f"""train_data size: {train_data.shape} 
+valid_data size: {valid_data.shape} 
+test_data size: {test_data.shape}""")
+
+# Select features
+x_train, x_valid, x_test, y_train, y_valid = select_feat(
+    train_data, valid_data, test_data, config['select_all'])
+
+# Print out the number of features.
+print(f'number of features: {x_train.shape[1]}')
+
+train_dataset, valid_dataset, test_dataset = COVID19Dataset(x_train, y_train), \
+    COVID19Dataset(x_valid, y_valid), \
+    COVID19Dataset(x_test)
+
+# Pytorch data loader loads pytorch dataset into batches.
+# Only the training loader shuffles. Validation keeps a fixed order so the
+# batch composition, and hence the mean-of-batch-means validation loss used
+# for checkpointing and early stopping, does not vary between epochs for
+# reasons unrelated to the model. Test order must match the row ids.
+train_loader = DataLoader(
+    train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
+valid_loader = DataLoader(
+    valid_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
+test_loader = DataLoader(
+    test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Start training'''
+# Build the network sized to the selected features, then fit it.
+model = My_Model(input_dim=x_train.shape[1])
+model = model.to(device)  # put your model and data on the same computation device.
+trainer(train_loader, valid_loader, model, config, device)
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Plot learning curves with matplotlib'''
+# Side-by-side panels: training loss on the left, validation loss on the
+# right, each titled with the lowest value reached.
+fig, (trainAxe, valAxe) = plt.subplots(1, 2, figsize=(22, 6))
+for axe, title, curve in (
+    (trainAxe, 'Loss/train #min: %f'%(train_best_loss), train_loss_list),
+    (valAxe, 'Loss/validate #min: %f'%(val_best_loss), val_loss_list),
+):
+    axe.set_title(title)
+    axe.plot(curve)
+# Shared axis labels for the whole figure.
+fig.supxlabel('Epoch')
+fig.supylabel('Loss')
+plt.show()
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
+
+'''Testing'''
+
+
+def save_pred(preds, file):
+    '''Save predictions to the specified CSV file.
+
+    Writes a header row ('id', 'tested_positive') followed by one row per
+    prediction, where id is the prediction's position in preds.
+
+    Args:
+        preds: Iterable of predicted values, in test-set order.
+        file: Path of the CSV file to create or overwrite.
+    '''
+    # newline='' lets the csv module control line endings; without it,
+    # platforms that translate '\n' write '\r\r\n' and show blank rows.
+    with open(file, 'w', newline='') as fp:
+        writer = csv.writer(fp)
+        writer.writerow(['id', 'tested_positive'])
+        writer.writerows(enumerate(preds))
+
+
+# Rebuild the architecture, restore the checkpoint written during training,
+# then write the test-set predictions to pred.csv.
+model = My_Model(input_dim=x_train.shape[1]).to(device)
+best_state = torch.load(config['save_path'])
+model.load_state_dict(best_state)
+preds = predict(test_loader, model, device)
+save_pred(preds, 'pred.csv')
+
+'''@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'''
diff --git a/HW01/medium_baseline.png b/HW01/medium_baseline.png