Add new Experiments, with noise addition and better save

This commit is contained in:
Eduardo Cueto-Mendoza 2025-01-15 10:26:48 +00:00
parent c900129ec9
commit b62011c029
Signed by: TastyPancakes
GPG Key ID: 941DF56C7242C3F1
11 changed files with 309 additions and 208 deletions

LICENSE Normal file → Executable file
View File

View File

@@ -1,8 +1,9 @@
 import pickle
 from warnings import warn
 from gpu_power_func import get_sample_of_gpu

-with (open("configuration.pkl", "rb")) as file:
+with open("configuration.pkl", "rb") as file:
     while True:
         try:
             cfg = pickle.load(file)
@@ -16,14 +17,14 @@ with (open("configuration.pkl", "rb")) as file:
 # print(cfg)

-if __name__ == '__main__':
+if __name__ == "__main__":
     dataDump = []
     while True:
         try:
             dataDump.append(get_sample_of_gpu())
-            with open(cfg["pickle_path"], 'wb') as f:
+            with open(cfg["pickle_path"], "wb") as f:
                 pickle.dump(dataDump, f)
         except EOFError:
-            warn('Pickle ran out of space')
+            warn("Pickle ran out of space")
         finally:
             f.close()
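
A minimal sketch of reading this sampler's output back (it assumes the same cfg["pickle_path"] loaded above; because the daemon reopens the file with "wb" on every iteration, the file holds a single, ever-growing dump of all samples collected so far):

    import pickle

    samples = []
    with open(cfg["pickle_path"], "rb") as f:
        while True:
            try:
                samples = pickle.load(f)  # last successful load wins
            except EOFError:
                break
    print(len(samples), "GPU samples collected")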

View File

@@ -6,22 +6,42 @@ all_args = argparse.ArgumentParser()


 def makeArguments(arguments: ArgumentParser) -> dict:
-    all_args.add_argument("-b", "--Bayesian", action="store", dest="b",
-                          type=int, choices=range(1, 8),
-                          help="Bayesian model of size x")
-    all_args.add_argument("-f", "--Frequentist", action="store", dest="f",
-                          type=int, choices=range(1, 8),
-                          help="Frequentist model of size x")
-    all_args.add_argument("-E", "--EarlyStopping", action="store_true",
-                          help="Early Stopping criteria")
-    all_args.add_argument("-e", "--EnergyBound", action="store_true",
-                          help="Energy Bound criteria")
-    all_args.add_argument("-a", "--AccuracyBound", action="store_true",
-                          help="Accuracy Bound criteria")
-    all_args.add_argument("-s", "--Save", action="store_true",
-                          help="Save model")
-    all_args.add_argument('--net_type', default='lenet', type=str,
-                          help='model = [lenet/AlexNet/3Conv3FC]')
-    all_args.add_argument('--dataset', default='CIFAR10', type=str,
-                          help='dataset = [MNIST/CIFAR10/CIFAR100]')
+    """Training arguments to be passed to the model"""
+    all_args.add_argument(
+        "-b",
+        "--Bayesian",
+        action="store",
+        dest="b",
+        type=int,
+        choices=range(1, 8),
+        help="Bayesian model of size x",
+    )
+    all_args.add_argument(
+        "-f",
+        "--Frequentist",
+        action="store",
+        dest="f",
+        type=int,
+        choices=range(1, 8),
+        help="Frequentist model of size x",
+    )
+    all_args.add_argument(
+        "-E", "--EarlyStopping", action="store_true", help="Early Stopping criteria"
+    )
+    all_args.add_argument(
+        "-e", "--EnergyBound", action="store_true", help="Energy Bound criteria"
+    )
+    all_args.add_argument(
+        "-a", "--AccuracyBound", action="store_true", help="Accuracy Bound criteria"
+    )
+    all_args.add_argument("-s", "--Save", action="store_true", help="Save model")
+    all_args.add_argument(
+        "--net_type", default="lenet", type=str, help="model = [lenet/AlexNet/3Conv3FC]"
+    )
+    all_args.add_argument(
+        "--dataset",
+        default="CIFAR10",
+        type=str,
+        help="dataset = [MNIST/CIFAR10/CIFAR100]",
+    )
     return vars(all_args.parse_args())
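
A minimal usage sketch for the reformatted parser (the command line below is illustrative; makeArguments returns a plain dict via vars(), and flags without an explicit dest are keyed by their long option name):

    import sys

    # Simulate a command line for demonstration purposes only.
    sys.argv = ["main.py", "-b", "3", "-E", "--dataset", "CIFAR10"]
    args = makeArguments(all_args)
    print(args["b"])              # 3
    print(args["EarlyStopping"])  # True
    print(args["dataset"])        # CIFAR10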

View File

@@ -1,4 +1,4 @@
-#!/bin/env bash
+#!/usr/bin/env bash
 powerstat -D -z 0.5 10000000 > $1
 #powerstat -z 0.5 1000000 > $1

View File

@@ -1,18 +1,20 @@
 import os
-import re
 import pickle
+import re
+import subprocess
+from re import findall, sub
+from subprocess import run

 import numpy as np


 def get_sample_of_gpu():
-    from re import sub, findall
-    import subprocess
-    from subprocess import run
     no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
     no_version = "Failed to initialize NVML: Driver/library version mismatch"
-    smi_string = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE)
-    smi_string = smi_string.stdout.decode('utf-8')
+    smi_string = run(
+        ["rocm-smi", "-P", "--showvoltage", "--showmemuse"], stdout=subprocess.PIPE
+    )
+    smi_string = smi_string.stdout.decode("utf-8")
     smi_string = smi_string.split("\n")
     smi_string = list(filter(lambda x: x, smi_string))
     if smi_string[0] == no_graph:
@@ -20,13 +22,13 @@ def get_sample_of_gpu():
     elif smi_string[0] == no_version:
         raise Exception("rocm-smi version mismatch")
     else:
-        results= []
-        gpuW0 = findall("[0-9]*\.[0-9]*",smi_string[2])
-        gpuW1 = findall("[0-9]*\.[0-9]*",smi_string[4])
-        gpuM0 = findall("[0-9]+",smi_string[7])
-        gpuM1 = findall("[0-9]+",smi_string[9])
-        gpuV0 = findall("[0-9]+",smi_string[13])
-        gpuV1 = findall("[0-9]+",smi_string[14])
+        results = []
+        gpuW0 = findall(r"[0-9]*\.[0-9]*", smi_string[2])
+        gpuW1 = findall(r"[0-9]*\.[0-9]*", smi_string[3])
+        gpuM0 = findall(r"[0-9]+", smi_string[6])
+        gpuM1 = findall(r"[0-9]+", smi_string[10])
+        gpuV0 = findall(r"[0-9]+", smi_string[16])
+        gpuV1 = findall(r"[0-9]+", smi_string[17])
         results.append(float(gpuW0[0]) + float(gpuW1[0]))
         if len(gpuM0) == 2 and len(gpuM1) == 2:
             results.append(int(gpuM0[1]) + int(gpuM1[1]))
@@ -36,19 +38,21 @@ def get_sample_of_gpu():
             results.append(gpuM1[1])
         results.append(int(gpuV0[1]) + int(gpuV1[1]))
         return results
-        #for l in smi_string:
-            #temp = findall("[0-9]*MiB | [0-9]*W",l)
-            #if temp:
-                #return temp
+        # for l in smi_string:
+        # temp = findall("[0-9]*MiB | [0-9]*W",l)
+        # if temp:
+        # return temp


 def total_watt_consumed(pickle_name):
-    with (open(pickle_name, "rb")) as file:
+    with open(pickle_name, "rb") as file:
         while True:
             try:
                 x = pickle.load(file)
             except EOFError:
                 break
     x = np.array(x)
-    x = x[:,0]
-    y = [float(re.findall("\d+.\d+",xi)[0]) for xi in x]
+    x = x[:, 0]
+    y = [float(re.findall("\d+.\d+", xi)[0]) for xi in x]
     return sum(y)
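
As a rough sanity check for total_watt_consumed, one can feed it a pickle shaped like the power log it appears to expect (rows whose first column is a string containing a wattage; the file name and values here are made up):

    import pickle

    rows = [["12.5 W", "idle"], ["7.25 W", "busy"]]
    with open("demo_power.pkl", "wb") as f:
        pickle.dump(rows, f)
    # The regex pulls the leading float from each first-column string.
    print(total_watt_consumed("demo_power.pkl"))  # 12.5 + 7.25 = 19.75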

View File

@@ -1,21 +1,23 @@
 from __future__ import print_function
+
 import os
-import data
-import utils
-import torch
 import pickle
-import metrics
-import numpy as np
 from datetime import datetime
+
+import numpy as np
+import torch
 from torch.nn import functional as F
 from torch.optim import Adam, lr_scheduler
-from models.BayesianModels.BayesianLeNet import BBBLeNet
-from models.BayesianModels.BayesianAlexNet import BBBAlexNet
-from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
-from stopping_crit import earlyStopping, energyBound, accuracyBound

-with (open("configuration.pkl", "rb")) as file:
+import data
+import metrics
+import utils
+from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
+from models.BayesianModels.BayesianAlexNet import BBBAlexNet
+from models.BayesianModels.BayesianLeNet import BBBLeNet
+from stopping_crit import accuracy_bound, e_stop, energy_bound
+
+with open("configuration.pkl", "rb") as file:
     while True:
         try:
             cfg = pickle.load(file)
@@ -28,21 +30,37 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


 def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
-    if (net_type == 'lenet'):
-        return BBBLeNet(outputs, inputs, priors, layer_type, activation_type,
-                        wide=cfg["model"]["size"])
-    elif (net_type == 'alexnet'):
+    print(net_type)
+    if net_type == "lenet":
+        return BBBLeNet(
+            outputs,
+            inputs,
+            priors,
+            layer_type,
+            activation_type,
+            wide=cfg["model"]["size"],
+        )
+    elif net_type == "alexnet":
         return BBBAlexNet(outputs, inputs, priors, layer_type, activation_type)
-    elif (net_type == '3conv3fc'):
-        return BBB3Conv3FC(outputs, inputs, priors, layer_type,
-                           activation_type)
+    elif net_type == "3conv3fc":
+        return BBB3Conv3FC(outputs, inputs, priors, layer_type, activation_type)
     else:
-        raise ValueError('Network should be either [LeNet / AlexNet\
- / 3Conv3FC')
+        raise ValueError(
+            "Network should be either [LeNet / AlexNet\
+ / 3Conv3FC"
+        )


-def train_model(net, optimizer, criterion, trainloader, num_ens=1,
-                beta_type=0.1, epoch=None, num_epochs=None):
+def train_model(
+    net,
+    optimizer,
+    criterion,
+    trainloader,
+    num_ens=1,
+    beta_type=0.1,
+    epoch=None,
+    num_epochs=None,
+):
     net.train()
     training_loss = 0.0
     accs = []
@@ -52,8 +70,7 @@ def train_model(net, optimizer, criterion, trainloader, num_ens=1,
         optimizer.zero_grad()

         inputs, labels = inputs.to(device), labels.to(device)
-        outputs = torch.zeros(inputs.shape[0], net.num_classes,
-                              num_ens).to(device)
+        outputs = torch.zeros(inputs.shape[0], net.num_classes, num_ens).to(device)

         kl = 0.0
         for j in range(num_ens):
@@ -65,19 +82,19 @@ def train_model(net, optimizer, criterion, trainloader, num_ens=1,
             kl_list.append(kl.item())
         log_outputs = utils.logmeanexp(outputs, dim=2)

-        beta = metrics.get_beta(i-1, len(trainloader), beta_type,
-                                epoch, num_epochs)
+        beta = metrics.get_beta(i - 1, len(trainloader), beta_type, epoch, num_epochs)
         loss = criterion(log_outputs, labels, kl, beta)
         loss.backward()
         optimizer.step()

         accs.append(metrics.acc(log_outputs.data, labels))
         training_loss += loss.cpu().data.numpy()
-    return training_loss/len(trainloader), np.mean(accs), np.mean(kl_list)
+    return training_loss / len(trainloader), np.mean(accs), np.mean(kl_list)


-def validate_model(net, criterion, validloader, num_ens=1, beta_type=0.1,
-                   epoch=None, num_epochs=None):
+def validate_model(
+    net, criterion, validloader, num_ens=1, beta_type=0.1, epoch=None, num_epochs=None
+):
     """Calculate ensemble accuracy and NLL Loss"""
     net.train()
     valid_loss = 0.0
@@ -85,8 +102,7 @@ def validate_model(net, criterion, validloader, num_ens=1, beta_type=0.1,
     for i, (inputs, labels) in enumerate(validloader):
         inputs, labels = inputs.to(device), labels.to(device)
-        outputs = torch.zeros(inputs.shape[0], net.num_classes,
-                              num_ens).to(device)
+        outputs = torch.zeros(inputs.shape[0], net.num_classes, num_ens).to(device)
         kl = 0.0
         for j in range(num_ens):
             net_out, _kl = net(inputs)
@@ -95,12 +111,11 @@ def validate_model(net, criterion, validloader, num_ens=1, beta_type=0.1,
         log_outputs = utils.logmeanexp(outputs, dim=2)

-        beta = metrics.get_beta(i-1, len(validloader), beta_type,
-                                epoch, num_epochs)
+        beta = metrics.get_beta(i - 1, len(validloader), beta_type, epoch, num_epochs)
         valid_loss += criterion(log_outputs, labels, kl, beta).item()
         accs.append(metrics.acc(log_outputs, labels))

-    return valid_loss/len(validloader), np.mean(accs)
+    return valid_loss / len(validloader), np.mean(accs)


 def run(dataset, net_type):
@@ -121,11 +136,13 @@ def run(dataset, net_type):
     trainset, testset, inputs, outputs = data.getDataset(dataset)
     train_loader, valid_loader, test_loader = data.getDataloader(
-        trainset, testset, valid_size, batch_size, num_workers)
-    net = getModel(net_type, inputs, outputs, priors, layer_type,
-                   activation_type).to(device)
+        trainset, testset, valid_size, batch_size, num_workers
+    )
+    net = getModel(net_type, inputs, outputs, priors, layer_type, activation_type).to(
+        device
+    )

-    ckpt_dir = f'checkpoints/{dataset}/bayesian'
+    ckpt_dir = f"checkpoints/{dataset}/bayesian"
     ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}_{layer_type}\
 _{activation_type}_{cfg["model"]["size"]}.pt'
@@ -137,66 +154,72 @@ def run(dataset, net_type):
     criterion = metrics.ELBO(len(trainset)).to(device)
     optimizer = Adam(net.parameters(), lr=lr_start)
-    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6,
-                                              verbose=True)
+    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
     # valid_loss_max = np.Inf
     # if stp == 2:
     early_stop = []
     train_data = []
     for epoch in range(n_epochs):  # loop over the dataset multiple times

-        train_loss, train_acc, train_kl = train_model(net, optimizer,
-                                                      criterion,
-                                                      train_loader,
-                                                      num_ens=train_ens,
-                                                      beta_type=beta_type,
-                                                      epoch=epoch,
-                                                      num_epochs=n_epochs)
-        valid_loss, valid_acc = validate_model(net, criterion, valid_loader,
-                                               num_ens=valid_ens,
-                                               beta_type=beta_type,
-                                               epoch=epoch,
-                                               num_epochs=n_epochs)
+        train_loss, train_acc, train_kl = train_model(
+            net,
+            optimizer,
+            criterion,
+            train_loader,
+            num_ens=train_ens,
+            beta_type=beta_type,
+            epoch=epoch,
+            num_epochs=n_epochs,
+        )
+        valid_loss, valid_acc = validate_model(
+            net,
+            criterion,
+            valid_loader,
+            num_ens=valid_ens,
+            beta_type=beta_type,
+            epoch=epoch,
+            num_epochs=n_epochs,
+        )
         lr_sched.step(valid_loss)

-        train_data.append([epoch, train_loss, train_acc, valid_loss,
-                           valid_acc])
-        print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy:\
- {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy:\
- {:.4f} \ttrain_kl_div: {:.4f}'.format(epoch, train_loss,
-                                       train_acc, valid_loss,
-                                       valid_acc, train_kl))
+        train_data.append([epoch, train_loss, train_acc, valid_loss, valid_acc])
+        print(
+            "Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy:\
+ {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy:\
+ {:.4f} \ttrain_kl_div: {:.4f}".format(
+                epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl
+            )
+        )

         if stp == 2:
-            print('Using early stopping')
-            if earlyStopping(early_stop, valid_acc, epoch,
-                             cfg["model"]["sens"]) == 1:
+            # print("Using early stopping")
+            if e_stop(early_stop, valid_acc, epoch + 1, 2, cfg["model"]["sens"]) == 1:
                 break
         elif stp == 3:
-            print('Using energy bound')
-            if energyBound(cfg["model"]["energy_thrs"]) == 1:
+            # print("Using energy bound")
+            if energy_bound(cfg["model"]["energy_thrs"]) == 1:
                 break
         elif stp == 4:
-            print('Using accuracy bound')
-            if accuracyBound(train_acc, cfg.acc_thrs) == 1:
+            # print("Using accuracy bound")
+            if accuracy_bound(train_acc, cfg.acc_thrs) == 1:
                 break
         else:
-            print('Training for {} epochs'.format(cfg["model"]["n_epochs"]))
+            print("Training for {} epochs".format(cfg["model"]["n_epochs"]))

         if sav == 1:
             # save model when finished
-            if epoch == cfg.n_epochs-1:
+            if epoch == cfg.n_epochs - 1:
                 torch.save(net.state_dict(), ckpt_name)

-    with open("bayes_exp_data_"+str(cfg["model"]["size"])+".pkl", 'wb') as f:
+    with open("bayes_exp_data_" + str(cfg["model"]["size"]) + ".pkl", "wb") as f:
         pickle.dump(train_data, f)


-if __name__ == '__main__':
+if __name__ == "__main__":
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
     print("Initial Time =", current_time)
-    print("Using bayesian model of size: {}".format(cfg["model"]["size"]))
+    print(f"Using bayesian model of size: {cfg['model']['size']}")
     run(cfg["data"], cfg["model"]["net_type"])
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
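
The ensemble reduction in train_model/validate_model stacks num_ens stochastic forward passes on dim=2 and collapses them with a log-mean-exp. A standalone sketch of that step (assuming utils.logmeanexp behaves like the torch expression below):

    import torch

    batch, classes, num_ens = 4, 10, 3
    # Per-pass log-softmax outputs, stacked on the ensemble dimension.
    outputs = torch.randn(batch, classes, num_ens).log_softmax(dim=1)
    # log-mean-exp over dim=2: average the ensemble in probability space
    # while staying in log space for numerical stability.
    log_outputs = torch.logsumexp(outputs, dim=2) - torch.log(
        torch.tensor(float(num_ens))
    )
    print(log_outputs.shape)  # torch.Size([4, 10])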

View File

@@ -1,19 +1,22 @@
 from __future__ import print_function
-import os
-import data
-import torch
-import pickle
-import metrics
-import numpy as np
-import torch.nn as nn
-from datetime import datetime
-from torch.optim import Adam, lr_scheduler
-from models.NonBayesianModels.LeNet import LeNet
-from models.NonBayesianModels.AlexNet import AlexNet
-from stopping_crit import earlyStopping, energyBound, accuracyBound
-from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC

-with (open("configuration.pkl", "rb")) as file:
+import os
+import pickle
+from datetime import datetime
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.optim import Adam, lr_scheduler
+
+import data
+import metrics
+from models.NonBayesianModels.AlexNet import AlexNet
+from models.NonBayesianModels.LeNet import LeNet
+from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC
+from stopping_crit import accuracy_bound, e_stop, energy_bound
+
+with open("configuration.pkl", "rb") as file:
     while True:
         try:
             cfg = pickle.load(file)
@@ -25,15 +28,17 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


 def getModel(net_type, inputs, outputs, wide=cfg["model"]["size"]):
-    if (net_type == 'lenet'):
+    if net_type == "lenet":
         return LeNet(outputs, inputs, wide)
-    elif (net_type == 'alexnet'):
+    elif net_type == "alexnet":
         return AlexNet(outputs, inputs)
-    elif (net_type == '3conv3fc'):
+    elif net_type == "3conv3fc":
         return ThreeConvThreeFC(outputs, inputs)
     else:
-        raise ValueError('Network should be either [LeNet / AlexNet / \
-3Conv3FC')
+        raise ValueError(
+            "Network should be either [LeNet / AlexNet / \
+3Conv3FC"
+        )


 def train_model(net, optimizer, criterion, train_loader):
@@ -47,7 +52,7 @@ def train_model(net, optimizer, criterion, train_loader):
         loss = criterion(output, target)
         loss.backward()
         optimizer.step()
-        train_loss += loss.item()*data.size(0)
+        train_loss += loss.item() * data.size(0)
         accs.append(metrics.acc(output.detach(), target))
     return train_loss, np.mean(accs)
@@ -60,7 +65,7 @@ def validate_model(net, criterion, valid_loader):
         data, target = datas.to(device), target.to(device)
         output = net(data)
         loss = criterion(output, target)
-        valid_loss += loss.item()*data.size(0)
+        valid_loss += loss.item() * data.size(0)
         accs.append(metrics.acc(output.detach(), target))
     return valid_loss, np.mean(accs)
@@ -76,10 +81,11 @@ def run(dataset, net_type):
     trainset, testset, inputs, outputs = data.getDataset(dataset)
     train_loader, valid_loader, test_loader = data.getDataloader(
-        trainset, testset, valid_size, batch_size, num_workers)
+        trainset, testset, valid_size, batch_size, num_workers
+    )
     net = getModel(net_type, inputs, outputs).to(device)

-    ckpt_dir = f'checkpoints/{dataset}/frequentist'
+    ckpt_dir = f"checkpoints/{dataset}/frequentist"
     ckpt_name = f'checkpoints/{dataset}/frequentist/model\
 _{net_type}_{cfg["model"]["size"]}.pt'
@@ -91,55 +97,54 @@ def run(dataset, net_type):
     criterion = nn.CrossEntropyLoss()
     optimizer = Adam(net.parameters(), lr=lr)
-    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6,
-                                              verbose=True)
+    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
     # valid_loss_min = np.Inf
     # if stp == 2:
     early_stop = []
     train_data = []
-    for epoch in range(1, n_epochs+1):
+    for epoch in range(1, n_epochs + 1):

-        train_loss, train_acc = train_model(net, optimizer, criterion,
-                                            train_loader)
+        train_loss, train_acc = train_model(net, optimizer, criterion, train_loader)
         valid_loss, valid_acc = validate_model(net, criterion, valid_loader)
         lr_sched.step(valid_loss)

-        train_loss = train_loss/len(train_loader.dataset)
-        valid_loss = valid_loss/len(valid_loader.dataset)
+        train_loss = train_loss / len(train_loader.dataset)
+        valid_loss = valid_loss / len(valid_loader.dataset)

-        train_data.append([epoch, train_loss, train_acc, valid_loss,
-                           valid_acc])
-        print('Epoch: {} \tTraining Loss: {: .4f} \tTraining Accuracy: {: .4f}\
- \tValidation Loss: {: .4f} \tValidation Accuracy: {: .4f}\
-'.format(epoch, train_loss, train_acc, valid_loss, valid_acc))
+        train_data.append([epoch, train_loss, train_acc, valid_loss, valid_acc])
+        print(
+            "Epoch: {} \tTraining Loss: {: .4f} \tTraining Accuracy: {: .4f}\
+ \tValidation Loss: {: .4f} \tValidation Accuracy: {: .4f}\
+".format(
+                epoch, train_loss, train_acc, valid_loss, valid_acc
+            )
+        )

         if stp == 2:
-            # print('Using early stopping')
-            if earlyStopping(early_stop, valid_acc, epoch,
-                             cfg["model"]["sens"]) == 1:
+            # print("Using early stopping")
+            if e_stop(early_stop, valid_acc, epoch, 2, cfg["model"]["sens"]) == 1:
                 break
         elif stp == 3:
             # print('Using energy bound')
-            if energyBound(cfg["model"]["energy_thrs"]) == 1:
+            if energy_bound(cfg["model"]["energy_thrs"]) == 1:
                 break
         elif stp == 4:
             # print('Using accuracy bound')
-            if accuracyBound(train_acc,
-                             cfg["model"]["acc_thrs"]) == 1:
+            if accuracy_bound(train_acc, cfg["model"]["acc_thrs"]) == 1:
                 break
         else:
-            print('Training for {} epochs'.format(cfg["model"]["n_epochs"]))
+            print("Training for {} epochs".format(cfg["model"]["n_epochs"]))

         if sav == 1:
             # save model when finished
-            if epoch == n_epochs:
+            if epoch <= n_epochs:
                 torch.save(net.state_dict(), ckpt_name)

-    with open("freq_exp_data_"+str(cfg["model"]["size"])+".pkl", 'wb') as f:
+    with open("freq_exp_data_" + str(cfg["model"]["size"]) + ".pkl", "wb") as f:
         pickle.dump(train_data, f)


-if __name__ == '__main__':
+if __name__ == "__main__":
     now = datetime.now()
     current_time = now.strftime("%H:%M:%S")
     print("Initial Time =", current_time)
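
One detail in the hunks above is worth a worked check: train_loss accumulates loss.item() * data.size(0) (a per-sample sum, since loss.item() is a batch mean) and is divided by len(train_loader.dataset) after the epoch. This yields the true per-sample mean even when the final batch is smaller; a toy illustration with made-up numbers:

    batch_losses = [0.8, 0.6]        # per-batch mean losses
    batch_sizes = [4, 2]             # uneven final batch
    total = sum(l * n for l, n in zip(batch_losses, batch_sizes))
    print(total / sum(batch_sizes))  # 0.733..., not the naive (0.8+0.6)/2 = 0.7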

View File

@@ -1,5 +1,4 @@
-#!/bin/env bash
+#!/usr/bin/env bash
 while true
 do

View File

@@ -1,8 +1,16 @@
 import math
+
 import torch.nn as nn
-from layers import BBB_Linear, BBB_Conv2d
-from layers import BBB_LRT_Linear, BBB_LRT_Conv2d
-from layers import FlattenLayer, ModuleWrapper
+
+from layers import (
+    BBB_Conv2d,
+    BBB_Linear,
+    BBB_LRT_Conv2d,
+    BBB_LRT_Linear,
+    FlattenLayer,
+    ModuleWrapper,
+)


 class BBB3Conv3FC(ModuleWrapper):
     """
@@ -10,25 +18,28 @@ class BBB3Conv3FC(ModuleWrapper):
     Simple Neural Network having 3 Convolution
     and 3 FC layers with Bayesian layers.
     """
-    def __init__(self, outputs, inputs, priors, layer_type='lrt', activation_type='softplus'):
+
+    def __init__(
+        self, outputs, inputs, priors, layer_type="lrt", activation_type="softplus"
+    ):
         super(BBB3Conv3FC, self).__init__()

         self.num_classes = outputs
         self.layer_type = layer_type
         self.priors = priors

-        if layer_type=='lrt':
+        if layer_type == "lrt":
             BBBLinear = BBB_LRT_Linear
             BBBConv2d = BBB_LRT_Conv2d
-        elif layer_type=='bbb':
+        elif layer_type == "bbb":
             BBBLinear = BBB_Linear
             BBBConv2d = BBB_Conv2d
         else:
             raise ValueError("Undefined layer_type")

-        if activation_type=='softplus':
+        if activation_type == "softplus":
             self.act = nn.Softplus
-        elif activation_type=='relu':
+        elif activation_type == "relu":
             self.act = nn.ReLU
         else:
             raise ValueError("Only softplus or relu supported")

View File

@@ -1,3 +1,3 @@
-#!/bin/env bash
+#!/usr/bin/env bash
 radeontop -b 08 -d - > $1

View File

@@ -1,31 +1,69 @@
 import pickle
 from time import sleep
+
 from gpu_power_func import total_watt_consumed

-with (open("configuration.pkl", "rb")) as file:
+with open("configuration.pkl", "rb") as file:
     while True:
         try:
             cfg = pickle.load(file)
         except EOFError:
             break


-def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivity: float=1e-9):
+def non_decreasing(L):
+    return all(x <= y for x, y in zip(L, L[1:]))
+
+
+def non_increasing(L):
+    return all(x >= y for x, y in zip(L, L[1:]))
+
+
+def monotonic(L):
+    return non_decreasing(L) or non_increasing(L)
+
+
+def strictly_increasing(L):
+    return all(x < y for x, y in zip(L, L[1:]))
+
+
+def strictly_decreasing(L):
+    return all(x > y for x, y in zip(L, L[1:]))
+
+
+def strictly_monotonic(L):
+    return strictly_increasing(L) or strictly_decreasing(L)
+
+
+def e_stop(
+    early_stopping: list,
+    train_acc: float,
+    epoch: int,
+    patience: int = 4,
+    sensitivity: float = 1e-9,
+):
     early_stopping.append(train_acc)
-    if epoch % 4 == 0 and epoch > 0:
-        print("Value 1: {} > Value 2: {} > \
-              Value 3: {}".format(early_stopping[0], \
-                          abs(early_stopping[1]-sensitivity), \
-                          abs(early_stopping[2]-sensitivity)))
-        if train_acc > 0.5:
-            if early_stopping[0] > abs(early_stopping[1]-sensitivity) and \
-                    early_stopping[1] > abs(early_stopping[2]-sensitivity):
-                print("Stopping Early")
-                return 1
+    if patience in (0, 1):
+        print("Stopping Early")
+        return 1
+    if epoch % patience == 0 and epoch > 0:
+        early_stopping = early_stopping[-patience : len(early_stopping)]
+        ini = early_stopping.pop(0)
+        early_stopping = list(map(lambda x: x - sensitivity, early_stopping))
+        early_stopping.insert(0, ini)
+        values = ""
+        for i, v in enumerate(early_stopping):
+            values += f"Value {i+1}: {v} > "
+        print(values)
+        if (train_acc > 0.5) and not strictly_increasing(early_stopping):
+            print("Stopping Early")
+            return 1
         del early_stopping[:]
     return 0


-def energyBound(threshold: float=100000.0):
+def energy_bound(threshold: float = 100000.0):
+    """Stops training when a specified amount of energy is consumed"""
     try:
         energy = total_watt_consumed(cfg["pickle_path"])
     except Exception as e:
@@ -38,7 +76,7 @@ def energyBound(threshold: float=100000.0):
     return 0


-def accuracyBound(train_acc: float, threshold: float=0.99):
+def accuracy_bound(train_acc: float, threshold: float = 0.99):
     if train_acc >= threshold:
         print("Accuracy bound achieved")
         return 1
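
A small illustrative driver for the refactored e_stop (the accuracy values are made up; with patience=2 the last two readings are compared every second epoch, and training stops once they are no longer strictly increasing while accuracy is above 0.5):

    history = []
    for epoch, acc in enumerate([0.60, 0.61, 0.59, 0.58], start=1):
        if e_stop(history, acc, epoch, patience=2) == 1:
            # Stops at epoch 4: the window [0.59, 0.58] is not increasing.
            print(f"stopped at epoch {epoch}")
            break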