From b62011c0293b5882d5f9e5be81fd3fc5d7ad9172 Mon Sep 17 00:00:00 2001
From: Eduardo Cueto-Mendoza
Date: Wed, 15 Jan 2025 10:26:48 +0000
Subject: [PATCH] Add new experiments, with noise addition and better saving

---
 LICENSE                                   |   0
 amd_sample_draw.py                        |   9 +-
 arguments.py                              |  56 +++++---
 cpu_watt.sh                               |   2 +-
 gpu_power_func.py                         |  94 +++++++------
 main_bayesian.py                          | 161 ++++++++++++----------
 main_frequentist.py                       |  93 +++++++------
 mem_free.sh                               |   3 +-
 models/BayesianModels/Bayesian3Conv3FC.py |  29 ++--
 radeontop.sh                              |   2 +-
 stopping_crit.py                          |  68 +++++++--
 11 files changed, 309 insertions(+), 208 deletions(-)
 mode change 100644 => 100755 LICENSE

diff --git a/LICENSE b/LICENSE
old mode 100644
new mode 100755
diff --git a/amd_sample_draw.py b/amd_sample_draw.py
index 5e6e533..2366c6a 100755
--- a/amd_sample_draw.py
+++ b/amd_sample_draw.py
@@ -1,8 +1,9 @@
 import pickle
 from warnings import warn
+
 from gpu_power_func import get_sample_of_gpu
 
-with (open("configuration.pkl", "rb")) as file:
+with open("configuration.pkl", "rb") as file:
     while True:
         try:
             cfg = pickle.load(file)
@@ -16,14 +17,14 @@ with (open("configuration.pkl", "rb")) as file:
 # print(cfg)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     dataDump = []
     while True:
         try:
             dataDump.append(get_sample_of_gpu())
-            with open(cfg["pickle_path"], 'wb') as f:
+            with open(cfg["pickle_path"], "wb") as f:
                 pickle.dump(dataDump, f)
         except EOFError:
-            warn('Pickle ran out of space')
+            warn("Pickle ran out of space")
         finally:
             f.close()
diff --git a/arguments.py b/arguments.py
index ae46999..d55de32 100755
--- a/arguments.py
+++ b/arguments.py
@@ -6,22 +6,42 @@ all_args = argparse.ArgumentParser()
 
 
 def makeArguments(arguments: ArgumentParser) -> dict:
-    all_args.add_argument("-b", "--Bayesian", action="store", dest="b",
-                          type=int, choices=range(1, 8),
-                          help="Bayesian model of size x")
-    all_args.add_argument("-f", "--Frequentist", action="store", dest="f",
-                          type=int, choices=range(1, 8),
-                          help="Frequentist model of size x")
-    all_args.add_argument("-E", "--EarlyStopping", action="store_true",
-                          help="Early Stopping criteria")
-    all_args.add_argument("-e", "--EnergyBound", action="store_true",
-                          help="Energy Bound criteria")
-    all_args.add_argument("-a", "--AccuracyBound", action="store_true",
-                          help="Accuracy Bound criteria")
-    all_args.add_argument("-s", "--Save", action="store_true",
-                          help="Save model")
-    all_args.add_argument('--net_type', default='lenet', type=str,
-                          help='model = [lenet/AlexNet/3Conv3FC]')
-    all_args.add_argument('--dataset', default='CIFAR10', type=str,
-                          help='dataset = [MNIST/CIFAR10/CIFAR100]')
+    """Build the command-line arguments used to configure training."""
+    all_args.add_argument(
+        "-b",
+        "--Bayesian",
+        action="store",
+        dest="b",
+        type=int,
+        choices=range(1, 8),
+        help="Bayesian model of size x",
+    )
+    all_args.add_argument(
+        "-f",
+        "--Frequentist",
+        action="store",
+        dest="f",
+        type=int,
+        choices=range(1, 8),
+        help="Frequentist model of size x",
+    )
+    all_args.add_argument(
+        "-E", "--EarlyStopping", action="store_true", help="Early Stopping criteria"
+    )
+    all_args.add_argument(
+        "-e", "--EnergyBound", action="store_true", help="Energy Bound criteria"
+    )
+    all_args.add_argument(
+        "-a", "--AccuracyBound", action="store_true", help="Accuracy Bound criteria"
+    )
+    all_args.add_argument("-s", "--Save", action="store_true", help="Save model")
+    all_args.add_argument(
+        "--net_type", default="lenet", type=str, help="model = [lenet/AlexNet/3Conv3FC]"
+    )
+    all_args.add_argument(
+        "--dataset",
+        default="CIFAR10",
+        type=str,
+        help="dataset = [MNIST/CIFAR10/CIFAR100]",
+    )
     return vars(all_args.parse_args())
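
Note: the parser still returns a plain dict via vars(), keyed by each option's dest. A minimal sketch of what a caller sees (the invocation and values are illustrative only, not part of the patch):

    # e.g. `python main_bayesian.py -b 3 -E` yields, via makeArguments(all_args):
    # {"b": 3, "f": None, "EarlyStopping": True, "EnergyBound": False,
    #  "AccuracyBound": False, "Save": False, "net_type": "lenet",
    #  "dataset": "CIFAR10"}
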
help="dataset = [MNIST/CIFAR10/CIFAR100]", + ) return vars(all_args.parse_args()) diff --git a/cpu_watt.sh b/cpu_watt.sh index e35390f..09e1638 100755 --- a/cpu_watt.sh +++ b/cpu_watt.sh @@ -1,4 +1,4 @@ -#!/bin/env bash +#!/usr/bin/env bash powerstat -D -z 0.5 10000000 > $1 #powerstat -z 0.5 1000000 > $1 diff --git a/gpu_power_func.py b/gpu_power_func.py index 4f86933..ef7fc55 100755 --- a/gpu_power_func.py +++ b/gpu_power_func.py @@ -1,54 +1,58 @@ import os -import re import pickle +import re +import subprocess +from re import findall, sub +from subprocess import run + import numpy as np def get_sample_of_gpu(): - from re import sub, findall - import subprocess - from subprocess import run + no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running." + no_version = "Failed to initialize NVML: Driver/library version mismatch" + smi_string = run( + ["rocm-smi", "-P", "--showvoltage", "--showmemuse"], stdout=subprocess.PIPE + ) + smi_string = smi_string.stdout.decode("utf-8") + smi_string = smi_string.split("\n") + smi_string = list(filter(lambda x: x, smi_string)) + if smi_string[0] == no_graph: + raise Exception("It seems that no AMD GPU is installed") + elif smi_string[0] == no_version: + raise Exception("rocm-smi version mismatch") + else: + results = [] + gpuW0 = findall(r"[0-9]*\.[0-9]*", smi_string[2]) + gpuW1 = findall(r"[0-9]*\.[0-9]*", smi_string[3]) + gpuM0 = findall(r"[0-9]+", smi_string[6]) + gpuM1 = findall(r"[0-9]+", smi_string[10]) + gpuV0 = findall(r"[0-9]+", smi_string[16]) + gpuV1 = findall(r"[0-9]+", smi_string[17]) + results.append(float(gpuW0[0]) + float(gpuW1[0])) + if len(gpuM0) == 2 and len(gpuM1) == 2: + results.append(int(gpuM0[1]) + int(gpuM1[1])) + elif len(gpuM0) == 2: + results.append(gpuM0[1]) + elif len(gpuM1) == 2: + results.append(gpuM1[1]) + results.append(int(gpuV0[1]) + int(gpuV1[1])) + return results + # for l in smi_string: + # temp = findall("[0-9]*MiB | [0-9]*W",l) + # if temp: + # return temp - no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running." 
- no_version = "Failed to initialize NVML: Driver/library version mismatch" - smi_string = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE) - smi_string = smi_string.stdout.decode('utf-8') - smi_string = smi_string.split("\n") - smi_string = list(filter(lambda x: x, smi_string)) - if smi_string[0] == no_graph: - raise Exception("It seems that no AMD GPU is installed") - elif smi_string[0] == no_version: - raise Exception("rocm-smi version mismatch") - else: - results= [] - gpuW0 = findall("[0-9]*\.[0-9]*",smi_string[2]) - gpuW1 = findall("[0-9]*\.[0-9]*",smi_string[4]) - gpuM0 = findall("[0-9]+",smi_string[7]) - gpuM1 = findall("[0-9]+",smi_string[9]) - gpuV0 = findall("[0-9]+",smi_string[13]) - gpuV1 = findall("[0-9]+",smi_string[14]) - results.append(float(gpuW0[0]) + float(gpuW1[0])) - if len(gpuM0) == 2 and len(gpuM1) == 2: - results.append(int(gpuM0[1]) + int(gpuM1[1])) - elif len(gpuM0) == 2: - results.append(gpuM0[1]) - elif len(gpuM1) == 2: - results.append(gpuM1[1]) - results.append(int(gpuV0[1]) + int(gpuV1[1])) - return results - #for l in smi_string: - #temp = findall("[0-9]*MiB | [0-9]*W",l) - #if temp: - #return temp def total_watt_consumed(pickle_name): - with (open(pickle_name, "rb")) as file: - while True: - try: - x = pickle.load(file) - except EOFError: - break - x = np.array(x) - x = x[:,0] - y = [float(re.findall("\d+.\d+",xi)[0]) for xi in x] - return sum(y) \ No newline at end of file + with open(pickle_name, "rb") as file: + while True: + try: + x = pickle.load(file) + except EOFError: + break + x = np.array(x) + x = x[:, 0] + y = [float(re.findall("\d+.\d+", xi)[0]) for xi in x] + return sum(y) + diff --git a/main_bayesian.py b/main_bayesian.py index d2075f6..4b86484 100755 --- a/main_bayesian.py +++ b/main_bayesian.py @@ -1,21 +1,23 @@ from __future__ import print_function import os -import data -import utils -import torch import pickle -import metrics -import numpy as np from datetime import datetime + +import numpy as np +import torch from torch.nn import functional as F from torch.optim import Adam, lr_scheduler -from models.BayesianModels.BayesianLeNet import BBBLeNet -from models.BayesianModels.BayesianAlexNet import BBBAlexNet -from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC -from stopping_crit import earlyStopping, energyBound, accuracyBound -with (open("configuration.pkl", "rb")) as file: +import data +import metrics +import utils +from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC +from models.BayesianModels.BayesianAlexNet import BBBAlexNet +from models.BayesianModels.BayesianLeNet import BBBLeNet +from stopping_crit import accuracy_bound, e_stop, energy_bound + +with open("configuration.pkl", "rb") as file: while True: try: cfg = pickle.load(file) @@ -28,21 +30,37 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") def getModel(net_type, inputs, outputs, priors, layer_type, activation_type): - if (net_type == 'lenet'): - return BBBLeNet(outputs, inputs, priors, layer_type, activation_type, - wide=cfg["model"]["size"]) - elif (net_type == 'alexnet'): + print(net_type) + if net_type == "lenet": + return BBBLeNet( + outputs, + inputs, + priors, + layer_type, + activation_type, + wide=cfg["model"]["size"], + ) + elif net_type == "alexnet": return BBBAlexNet(outputs, inputs, priors, layer_type, activation_type) - elif (net_type == '3conv3fc'): - return BBB3Conv3FC(outputs, inputs, priors, layer_type, - activation_type) + elif net_type == "3conv3fc": + return 
 
 
-def validate_model(net, criterion, validloader, num_ens=1, beta_type=0.1,
-                   epoch=None, num_epochs=None):
+def validate_model(
+    net, criterion, validloader, num_ens=1, beta_type=0.1, epoch=None, num_epochs=None
+):
     """Calculate ensemble accuracy and NLL Loss"""
     net.train()
     valid_loss = 0.0
     accs = []
 
     for i, (inputs, labels) in enumerate(validloader):
         inputs, labels = inputs.to(device), labels.to(device)
-        outputs = torch.zeros(inputs.shape[0], net.num_classes,
-                              num_ens).to(device)
+        outputs = torch.zeros(inputs.shape[0], net.num_classes, num_ens).to(device)
         kl = 0.0
         for j in range(num_ens):
             net_out, _kl = net(inputs)
             kl += _kl
             outputs[:, :, j] = F.log_softmax(net_out, dim=1).data
 
         log_outputs = utils.logmeanexp(outputs, dim=2)
 
-        beta = metrics.get_beta(i-1, len(validloader), beta_type,
-                                epoch, num_epochs)
+        beta = metrics.get_beta(i - 1, len(validloader), beta_type, epoch, num_epochs)
         valid_loss += criterion(log_outputs, labels, kl, beta).item()
         accs.append(metrics.acc(log_outputs, labels))
 
-    return valid_loss/len(validloader), np.mean(accs)
+    return valid_loss / len(validloader), np.mean(accs)
 
 
 def run(dataset, net_type):
@@ -121,11 +136,13 @@ def run(dataset, net_type):
     trainset, testset, inputs, outputs = data.getDataset(dataset)
     train_loader, valid_loader, test_loader = data.getDataloader(
-        trainset, testset, valid_size, batch_size, num_workers)
-    net = getModel(net_type, inputs, outputs, priors, layer_type,
-                   activation_type).to(device)
+        trainset, testset, valid_size, batch_size, num_workers
+    )
+    net = getModel(net_type, inputs, outputs, priors, layer_type, activation_type).to(
+        device
+    )
 
-    ckpt_dir = f'checkpoints/{dataset}/bayesian'
+    ckpt_dir = f"checkpoints/{dataset}/bayesian"
     ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}_{layer_type}\
 _{activation_type}_{cfg["model"]["size"]}.pt'
 
@@ -137,66 +154,72 @@ def run(dataset, net_type):
     criterion = metrics.ELBO(len(trainset)).to(device)
     optimizer = Adam(net.parameters(), lr=lr_start)
-    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6,
-                                              verbose=True)
+    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
     # valid_loss_max = np.Inf
     # if stp == 2:
     early_stop = []
     train_data = []
     for epoch in range(n_epochs):  # loop over the dataset multiple times
-        train_loss, train_acc, train_kl = train_model(net, optimizer,
-                                                      criterion,
-                                                      train_loader,
-                                                      num_ens=train_ens,
-                                                      beta_type=beta_type,
-                                                      epoch=epoch,
-                                                      num_epochs=n_epochs)
-        valid_loss, valid_acc = validate_model(net, criterion, valid_loader,
-                                               num_ens=valid_ens,
-                                               beta_type=beta_type,
-                                               epoch=epoch,
-                                               num_epochs=n_epochs)
+        train_loss, train_acc, train_kl = train_model(
+            net,
+            optimizer,
+            criterion,
+            train_loader,
+            num_ens=train_ens,
+            beta_type=beta_type,
+            epoch=epoch,
+            num_epochs=n_epochs,
+        )
+        valid_loss, valid_acc = validate_model(
+            net,
+            criterion,
+            valid_loader,
+            num_ens=valid_ens,
+            beta_type=beta_type,
+            epoch=epoch,
+            num_epochs=n_epochs,
+        )
         lr_sched.step(valid_loss)
-        train_data.append([epoch, train_loss, train_acc, valid_loss,
-                           valid_acc])
-        print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy:\
-        {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy:\
-        {:.4f} \ttrain_kl_div: {:.4f}'.format(epoch, train_loss,
-                                              train_acc, valid_loss,
-                                              valid_acc, train_kl))
+        train_data.append([epoch, train_loss, train_acc, valid_loss, valid_acc])
+        print(
+            "Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} "
+            "\tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f} "
+            "\ttrain_kl_div: {:.4f}".format(
+                epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl
+            )
+        )
 
         if stp == 2:
-            print('Using early stopping')
-            if earlyStopping(early_stop, valid_acc, epoch,
-                             cfg["model"]["sens"]) == 1:
+            # print("Using early stopping")
+            if e_stop(early_stop, valid_acc, epoch + 1, 2, cfg["model"]["sens"]) == 1:
                 break
         elif stp == 3:
-            print('Using energy bound')
-            if energyBound(cfg["model"]["energy_thrs"]) == 1:
+            # print("Using energy bound")
+            if energy_bound(cfg["model"]["energy_thrs"]) == 1:
                 break
         elif stp == 4:
-            print('Using accuracy bound')
-            if accuracyBound(train_acc, cfg.acc_thrs) == 1:
+            # print("Using accuracy bound")
+            if accuracy_bound(train_acc, cfg["model"]["acc_thrs"]) == 1:
                 break
         else:
-            print('Training for {} epochs'.format(cfg["model"]["n_epochs"]))
+            print("Training for {} epochs".format(cfg["model"]["n_epochs"]))
 
         if sav == 1:
             # save model when finished
-            if epoch == cfg.n_epochs-1:
+            if epoch == n_epochs - 1:
                 torch.save(net.state_dict(), ckpt_name)
 
-    with open("bayes_exp_data_"+str(cfg["model"]["size"])+".pkl", 'wb') as f:
+    with open("bayes_exp_data_" + str(cfg["model"]["size"]) + ".pkl", "wb") as f:
         pickle.dump(train_data, f)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
     print("Initial Time =", current_time)
-    print("Using bayesian model of size: {}".format(cfg["model"]["size"]))
+    print(f"Using bayesian model of size: {cfg['model']['size']}")
     run(cfg["data"], cfg["model"]["net_type"])
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
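
Note: train_model/validate_model average num_ens stochastic forward passes in log space before the ELBO criterion sees them. A self-contained sketch of that aggregation step, using torch.logsumexp in place of the repo's utils.logmeanexp (which I assume computes the same quantity):

    import math

    import torch

    batch, classes, num_ens = 4, 10, 3
    # per-pass log-softmax scores, shaped like `outputs` in the loops above
    outputs = torch.randn(batch, classes, num_ens).log_softmax(dim=1)
    # log of the mean predictive probability across the ensemble:
    # log(1/E * sum_j p_j) = logsumexp_j(log p_j) - log(E)
    log_outputs = torch.logsumexp(outputs, dim=2) - math.log(num_ens)
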
diff --git a/main_frequentist.py b/main_frequentist.py
index e70d631..e489f03 100755
--- a/main_frequentist.py
+++ b/main_frequentist.py
@@ -1,19 +1,22 @@
 from __future__ import print_function
 
-import os
-import data
-import torch
-import pickle
-import metrics
-import numpy as np
-import torch.nn as nn
-from datetime import datetime
-from torch.optim import Adam, lr_scheduler
-from models.NonBayesianModels.LeNet import LeNet
-from models.NonBayesianModels.AlexNet import AlexNet
-from stopping_crit import earlyStopping, energyBound, accuracyBound
-from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC
 
-with (open("configuration.pkl", "rb")) as file:
+import os
+import pickle
+from datetime import datetime
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.optim import Adam, lr_scheduler
+
+import data
+import metrics
+from models.NonBayesianModels.AlexNet import AlexNet
+from models.NonBayesianModels.LeNet import LeNet
+from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC
+from stopping_crit import accuracy_bound, e_stop, energy_bound
+
+with open("configuration.pkl", "rb") as file:
     while True:
         try:
             cfg = pickle.load(file)
@@ -25,15 +28,17 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 
 def getModel(net_type, inputs, outputs, wide=cfg["model"]["size"]):
-    if (net_type == 'lenet'):
+    if net_type == "lenet":
         return LeNet(outputs, inputs, wide)
-    elif (net_type == 'alexnet'):
+    elif net_type == "alexnet":
         return AlexNet(outputs, inputs)
-    elif (net_type == '3conv3fc'):
+    elif net_type == "3conv3fc":
         return ThreeConvThreeFC(outputs, inputs)
     else:
-        raise ValueError('Network should be either [LeNet / AlexNet / \
-        3Conv3FC')
+        raise ValueError("Network should be either [LeNet / AlexNet / 3Conv3FC]")
 
 
 def train_model(net, optimizer, criterion, train_loader):
@@ -47,7 +52,7 @@
         loss = criterion(output, target)
         loss.backward()
         optimizer.step()
-        train_loss += loss.item()*data.size(0)
+        train_loss += loss.item() * data.size(0)
         accs.append(metrics.acc(output.detach(), target))
     return train_loss, np.mean(accs)
@@ -60,7 +65,7 @@
         data, target = datas.to(device), target.to(device)
         output = net(data)
         loss = criterion(output, target)
-        valid_loss += loss.item()*data.size(0)
+        valid_loss += loss.item() * data.size(0)
         accs.append(metrics.acc(output.detach(), target))
     return valid_loss, np.mean(accs)
@@ -76,10 +81,11 @@ def run(dataset, net_type):
     trainset, testset, inputs, outputs = data.getDataset(dataset)
     train_loader, valid_loader, test_loader = data.getDataloader(
-        trainset, testset, valid_size, batch_size, num_workers)
+        trainset, testset, valid_size, batch_size, num_workers
+    )
     net = getModel(net_type, inputs, outputs).to(device)
 
-    ckpt_dir = f'checkpoints/{dataset}/frequentist'
+    ckpt_dir = f"checkpoints/{dataset}/frequentist"
     ckpt_name = f'checkpoints/{dataset}/frequentist/model\
 _{net_type}_{cfg["model"]["size"]}.pt'
 
@@ -91,55 +97,54 @@ def run(dataset, net_type):
     criterion = nn.CrossEntropyLoss()
     optimizer = Adam(net.parameters(), lr=lr)
-    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6,
-                                              verbose=True)
+    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
     # valid_loss_min = np.Inf
     # if stp == 2:
     early_stop = []
     train_data = []
-    for epoch in range(1, n_epochs+1):
+    for epoch in range(1, n_epochs + 1):
 
-        train_loss, train_acc = train_model(net, optimizer, criterion,
-                                            train_loader)
+        train_loss, train_acc = train_model(net, optimizer, criterion, train_loader)
         valid_loss, valid_acc = validate_model(net, criterion, valid_loader)
         lr_sched.step(valid_loss)
 
-        train_loss = train_loss/len(train_loader.dataset)
-        valid_loss = valid_loss/len(valid_loader.dataset)
+        train_loss = train_loss / len(train_loader.dataset)
+        valid_loss = valid_loss / len(valid_loader.dataset)
 
-        train_data.append([epoch, train_loss, train_acc, valid_loss,
-                           valid_acc])
-        print('Epoch: {} \tTraining Loss: {: .4f} \tTraining Accuracy: {: .4f}\
-        \tValidation Loss: {: .4f} \tValidation Accuracy: {: .4f}\
-        '.format(epoch, train_loss, train_acc, valid_loss, valid_acc))
+        train_data.append([epoch, train_loss, train_acc, valid_loss, valid_acc])
+        print(
+            "Epoch: {} \tTraining Loss: {: .4f} \tTraining Accuracy: {: .4f} "
+            "\tValidation Loss: {: .4f} \tValidation Accuracy: {: .4f}".format(
                epoch, train_loss, train_acc, valid_loss, valid_acc
+            )
+        )
 
         if stp == 2:
-            # print('Using early stopping')
-            if earlyStopping(early_stop, valid_acc, epoch,
-                             cfg["model"]["sens"]) == 1:
+            # print("Using early stopping")
+            if e_stop(early_stop, valid_acc, epoch, 2, cfg["model"]["sens"]) == 1:
                 break
         elif stp == 3:
             # print('Using energy bound')
-            if energyBound(cfg["model"]["energy_thrs"]) == 1:
+            if energy_bound(cfg["model"]["energy_thrs"]) == 1:
                 break
         elif stp == 4:
             # print('Using accuracy bound')
-            if accuracyBound(train_acc,
-                             cfg["model"]["acc_thrs"]) == 1:
+            if accuracy_bound(train_acc, cfg["model"]["acc_thrs"]) == 1:
                 break
         else:
-            print('Training for {} epochs'.format(cfg["model"]["n_epochs"]))
+            print("Training for {} epochs".format(cfg["model"]["n_epochs"]))
 
         if sav == 1:
             # save model when finished
-            if epoch == n_epochs:
+            if epoch <= n_epochs:
                 torch.save(net.state_dict(), ckpt_name)
 
-    with open("freq_exp_data_"+str(cfg["model"]["size"])+".pkl", 'wb') as f:
+    with open("freq_exp_data_" + str(cfg["model"]["size"]) + ".pkl", "wb") as f:
         pickle.dump(train_data, f)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
     print("Initial Time =", current_time)
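
Note: both trainers now share the same stopping-criterion dispatch. For quick reference (the cfg keys are the ones used above; the summary itself is illustrative):

    # stp == 2 -> e_stop(...):         early stop on a window of recent accuracies
    # stp == 3 -> energy_bound(...):   stop once measured energy exceeds cfg["model"]["energy_thrs"]
    # stp == 4 -> accuracy_bound(...): stop once train_acc >= cfg["model"]["acc_thrs"]
    # otherwise -> train for the full cfg["model"]["n_epochs"]
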
diff --git a/mem_free.sh b/mem_free.sh
index 2be6c5c..d6643e0 100755
--- a/mem_free.sh
+++ b/mem_free.sh
@@ -1,5 +1,4 @@
-#!/bin/env bash
-
+#!/usr/bin/env bash
 
 while true
 do
diff --git a/models/BayesianModels/Bayesian3Conv3FC.py b/models/BayesianModels/Bayesian3Conv3FC.py
index f5a19ae..808a455 100755
--- a/models/BayesianModels/Bayesian3Conv3FC.py
+++ b/models/BayesianModels/Bayesian3Conv3FC.py
@@ -1,8 +1,16 @@
 import math
+
 import torch.nn as nn
-from layers import BBB_Linear, BBB_Conv2d
-from layers import BBB_LRT_Linear, BBB_LRT_Conv2d
-from layers import FlattenLayer, ModuleWrapper
+
+from layers import (
+    BBB_Conv2d,
+    BBB_Linear,
+    BBB_LRT_Conv2d,
+    BBB_LRT_Linear,
+    FlattenLayer,
+    ModuleWrapper,
+)
+
 
 class BBB3Conv3FC(ModuleWrapper):
     """
@@ -10,25 +18,28 @@ class BBB3Conv3FC(ModuleWrapper):
     Simple Neural Network having 3 Convolution
     and 3 FC layers with Bayesian layers.
     """
-    def __init__(self, outputs, inputs, priors, layer_type='lrt', activation_type='softplus'):
+
+    def __init__(
+        self, outputs, inputs, priors, layer_type="lrt", activation_type="softplus"
+    ):
         super(BBB3Conv3FC, self).__init__()
 
         self.num_classes = outputs
         self.layer_type = layer_type
         self.priors = priors
 
-        if layer_type=='lrt':
+        if layer_type == "lrt":
             BBBLinear = BBB_LRT_Linear
             BBBConv2d = BBB_LRT_Conv2d
-        elif layer_type=='bbb':
+        elif layer_type == "bbb":
             BBBLinear = BBB_Linear
             BBBConv2d = BBB_Conv2d
         else:
             raise ValueError("Undefined layer_type")
-
-        if activation_type=='softplus':
+
+        if activation_type == "softplus":
             self.act = nn.Softplus
-        elif activation_type=='relu':
+        elif activation_type == "relu":
             self.act = nn.ReLU
         else:
             raise ValueError("Only softplus or relu supported")
diff --git a/radeontop.sh b/radeontop.sh
index afc084d..c89c33f 100755
--- a/radeontop.sh
+++ b/radeontop.sh
@@ -1,3 +1,3 @@
-#!/bin/env bash
+#!/usr/bin/env bash
 
 radeontop -b 08 -d - > $1
""" - def __init__(self, outputs, inputs, priors, layer_type='lrt', activation_type='softplus'): + + def __init__( + self, outputs, inputs, priors, layer_type="lrt", activation_type="softplus" + ): super(BBB3Conv3FC, self).__init__() self.num_classes = outputs self.layer_type = layer_type self.priors = priors - if layer_type=='lrt': + if layer_type == "lrt": BBBLinear = BBB_LRT_Linear BBBConv2d = BBB_LRT_Conv2d - elif layer_type=='bbb': + elif layer_type == "bbb": BBBLinear = BBB_Linear BBBConv2d = BBB_Conv2d else: raise ValueError("Undefined layer_type") - - if activation_type=='softplus': + + if activation_type == "softplus": self.act = nn.Softplus - elif activation_type=='relu': + elif activation_type == "relu": self.act = nn.ReLU else: raise ValueError("Only softplus or relu supported") diff --git a/radeontop.sh b/radeontop.sh index afc084d..c89c33f 100755 --- a/radeontop.sh +++ b/radeontop.sh @@ -1,3 +1,3 @@ -#!/bin/env bash +#!/usr/bin/env bash radeontop -b 08 -d - > $1 diff --git a/stopping_crit.py b/stopping_crit.py index b1ddc64..77779b5 100755 --- a/stopping_crit.py +++ b/stopping_crit.py @@ -1,31 +1,69 @@ import pickle from time import sleep + from gpu_power_func import total_watt_consumed -with (open("configuration.pkl", "rb")) as file: +with open("configuration.pkl", "rb") as file: while True: try: cfg = pickle.load(file) except EOFError: break -def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivity: float=1e-9): + +def non_decreasing(L): + return all(x <= y for x, y in zip(L, L[1:])) + + +def non_increasing(L): + return all(x >= y for x, y in zip(L, L[1:])) + + +def monotonic(L): + return non_decreasing(L) or non_increasing(L) + + +def strictly_increasing(L): + return all(x < y for x, y in zip(L, L[1:])) + + +def strictly_decreasing(L): + return all(x > y for x, y in zip(L, L[1:])) + + +def strictly_monotonic(L): + return strictly_increasing(L) or strictly_decreasing(L) + + +def e_stop( + early_stopping: list, + train_acc: float, + epoch: int, + patience: int = 4, + sensitivity: float = 1e-9, +): early_stopping.append(train_acc) - if epoch % 4 == 0 and epoch > 0: - print("Value 1: {} > Value 2: {} > \ - Value 3: {}".format(early_stopping[0], \ - abs(early_stopping[1]-sensitivity), \ - abs(early_stopping[2]-sensitivity))) - if train_acc > 0.5: - if early_stopping[0] > abs(early_stopping[1]-sensitivity) and \ - early_stopping[1] > abs(early_stopping[2]-sensitivity): - print("Stopping Early") - return 1 + if patience in (0, 1): + print("Stopping Early") + return 1 + if epoch % patience == 0 and epoch > 0: + early_stopping = early_stopping[-patience : len(early_stopping)] + ini = early_stopping.pop(0) + early_stopping = list(map(lambda x: x - sensitivity, early_stopping)) + early_stopping.insert(0, ini) + values = "" + for i, v in enumerate(early_stopping): + values += f"Value {i+1}: {v} > " + print(values) + if (train_acc > 0.5) and not strictly_increasing(early_stopping): + print("Stopping Early") + return 1 del early_stopping[:] return 0 - -def energyBound(threshold: float=100000.0): + +def energy_bound(threshold: float = 100000.0): + """Stops training when a specified amount of energy is consumed""" try: energy = total_watt_consumed(cfg["pickle_path"]) except Exception as e: @@ -38,7 +76,7 @@ def energyBound(threshold: float=100000.0): return 0 -def accuracyBound(train_acc: float, threshold: float=0.99): +def accuracy_bound(train_acc: float, threshold: float = 0.99): if train_acc >= threshold: print("Accuracy bound achieved") return 1