Solved killing pipes, renewed how training sampling works

This commit is contained in:
Eddie Cueto 2023-06-28 17:02:56 +01:00
parent 44272d52a7
commit 7fa9a14303
13 changed files with 132 additions and 203 deletions

1
.gitignore vendored
View File

@ -13,6 +13,7 @@ bayes_*
times_* times_*
freq_* freq_*
*.pkl *.pkl
*.txt
bay bay
frq frq
sav sav

88
amd_sample_draw.py Normal file → Executable file
View File

@ -1,86 +1,44 @@
import os
import re
import pickle import pickle
import numpy as np
from warnings import warn from warnings import warn
from gpu_power_func import get_sample_of_gpu
with open("frq", "r") as file: with (open("configuration.pkl", "rb")) as file:
frq = int(file.read()) while True:
try:
cfg = pickle.load(file)
except EOFError:
break
with open("bay", "r") as file: #with open("frq", "r") as file:
bay = int(file.read()) # frq = int(file.read())
if frq == 1: #with open("bay", "r") as file:
model_t = "freq" # bay = int(file.read())
with open("tmp", "r") as file:
size = float(file.read())
if bay == 1: #if frq == 1:
model_t = "bayes" # model_t = "freq"
with open("tmp", "r") as file: # with open("tmp", "r") as file:
size = int(file.read()) # size = float(file.read())
pickle_name = "{}_wattdata_{}.pkl".format(model_t,size) #if bay == 1:
print("GPU energy file config: {}".format(pickle_name)) # model_t = "bayes"
# with open("tmp", "r") as file:
# size = int(file.read())
def get_sample_of_gpu(): #pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
from re import sub, findall #print("GPU energy file config: {}".format(pickle_name))
import subprocess
from subprocess import run
no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running." #print(cfg)
no_version = "Failed to initialize NVML: Driver/library version mismatch"
smi_string = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE)
smi_string = smi_string.stdout.decode('utf-8')
smi_string = smi_string.split("\n")
smi_string = list(filter(lambda x: x, smi_string))
if smi_string[0] == no_graph:
raise Exception("It seems that no AMD GPU is installed")
elif smi_string[0] == no_version:
raise Exception("rocm-smi version mismatch")
else:
results= []
gpuW0 = findall("[0-9]*\.[0-9]*",smi_string[2])
gpuW1 = findall("[0-9]*\.[0-9]*",smi_string[4])
gpuM0 = findall("[0-9]+",smi_string[7])
gpuM1 = findall("[0-9]+",smi_string[9])
gpuV0 = findall("[0-9]+",smi_string[13])
gpuV1 = findall("[0-9]+",smi_string[14])
results.append(float(gpuW0[0]) + float(gpuW1[0]))
if len(gpuM0) == 2 and len(gpuM1) == 2:
results.append(int(gpuM0[1]) + int(gpuM1[1]))
elif len(gpuM0) == 2:
results.append(gpuM0[1])
elif len(gpuM1) == 2:
results.append(gpuM1[1])
results.append(int(gpuV0[1]) + int(gpuV1[1]))
return results
#for l in smi_string:
#temp = findall("[0-9]*MiB | [0-9]*W",l)
#if temp:
#return temp
def total_watt_consumed():
with (open(pickle_name, "rb")) as file:
while True:
try:
x = pickle.load(file)
except EOFError:
break
x = np.array(x)
x = x[:,0]
y = [float(re.findall("\d+.\d+",xi)[0]) for xi in x]
return sum(y)
if __name__ == '__main__': if __name__ == '__main__':
dataDump = [] dataDump = []
#var = True #var = True
#pickling_on = open("wattdata.pickle","wb") #pickling_on = open("wattdata.pickle","wb")
while True: while True:
#from run_service import retcode
try: try:
dataDump.append(get_sample_of_gpu()) dataDump.append(get_sample_of_gpu())
with open(pickle_name, 'wb') as f: with open(cfg["pickle_path"], 'wb') as f:
pickle.dump(dataDump, f) pickle.dump(dataDump, f)
except EOFError: except EOFError:
warn('Pickle ran out of space') warn('Pickle ran out of space')

2
arguments.py Normal file → Executable file
View File

@ -17,4 +17,6 @@ def makeArguments(arguments: ArgumentParser) -> dict:
all_args.add_argument("-a", "--AccuracyBound", action="store_true", all_args.add_argument("-a", "--AccuracyBound", action="store_true",
help="Accuracy Bound criteria") help="Accuracy Bound criteria")
all_args.add_argument("-s", "--Save", action="store_true", help="Save model") all_args.add_argument("-s", "--Save", action="store_true", help="Save model")
all_args.add_argument('--net_type', default='lenet', type=str, help='model = [lenet/AlexNet/3Conv3FC]')
all_args.add_argument('--dataset', default='CIFAR10', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
return vars(all_args.parse_args()) return vars(all_args.parse_args())

View File

@ -1,45 +0,0 @@
############### Configuration file for Bayesian ###############
# Module-level settings that the Bayesian training script reads as
# attributes of this module (imported there as `cfg`).
# NOTE(review): indentation is not visible in this view of the file, so the
# nesting of the file-reading statements at the bottom cannot be confirmed here.
import os
# Layer implementation forwarded to the Bayesian model constructors.
layer_type = 'lrt' # 'bbb' or 'lrt'
# Activation forwarded to the Bayesian model constructors.
activation_type = 'softplus' # 'softplus' or 'relu'
# Prior / initial-posterior parameters forwarded to the model constructors.
priors={
'prior_mu': 0,
'prior_sigma': 0.1,
'posterior_mu_initial': (0, 0.1), # (mean, std) normal_
'posterior_rho_initial': (-5, 0.1), # (mean, std) normal_
}
# Number of epochs reported by the "Training for {} epochs" message.
n_epochs = 100
# Sensitivity argument handed to earlyStopping().
sens = 1e-9
# Threshold handed to energyBound().
energy_thrs = 100000
# Threshold intended for accuracyBound().
acc_thrs = 0.99
# Learning rate given to the Adam optimizer.
lr_start = 0.001
# Data-loading settings — presumably consumed by data.getDataloader; TODO confirm.
num_workers = 4
valid_size = 0.2
batch_size = 256
# Ensemble sizes for training / validation — presumably the `num_ens`
# arguments of the train/validate helpers; TODO confirm.
train_ens = 1
valid_ens = 1
beta_type = 0.1 # 'Blundell', 'Standard', etc. Use float for const value
# The file "bay" holds an integer flag; when it is 1 the model width is read
# as an integer from the file "tmp".
with open("bay", "r") as file:
bay = int(file.read())
if bay == 1:
with open("tmp", "r") as file:
wide = int(file.read())
# Disabled clean-up of the "tmp" file (kept for reference).
#if os.path.exists("tmp"):
# os.remove("tmp")
#else:
# raise Exception("Tmp file not found")
# NOTE(review): if this print sits at module level, `wide` is undefined
# whenever the "bay" flag is not 1 — confirm the intended nesting.
print("Bayesian configured to run with width: {}".format(wide))
# Disabled clean-up of the "bay" file (kept for reference).
#if os.path.exists("bay"):
# os.remove("bay")
#else:
# raise Exception("Bay file not found")

View File

@ -1,32 +0,0 @@
############### Configuration file for Frequentist ###############
# Module-level settings for the frequentist training run.
# NOTE(review): indentation is not visible in this view of the file, so the
# nesting of the file-reading statements at the bottom cannot be confirmed here.
import os
# Number of epochs reported by the "Training for {} epochs" message.
n_epochs = 100
# Sensitivity — presumably the earlyStopping() sensitivity; TODO confirm.
sens = 1e-9
# Energy threshold — presumably the energyBound() threshold; TODO confirm.
energy_thrs = 10000
# Threshold handed to accuracyBound().
acc_thrs = 0.99
# Learning rate — presumably for the optimizer; TODO confirm.
lr = 0.001
# Data-loading settings — presumably consumed by the data loader; TODO confirm.
num_workers = 4
valid_size = 0.2
batch_size = 256
# The file "frq" holds an integer flag; when it is 1 the model width is read
# as an integer from the file "tmp".
with open("frq", "r") as file:
frq = int(file.read())
if frq == 1:
with open("tmp", "r") as file:
wide = int(file.read())
# Remove the "tmp" file once it exists on disk; otherwise abort.
# NOTE(review): whether this check is nested under `if frq == 1:` cannot be
# told from this view — confirm.
if os.path.exists("tmp"):
os.remove("tmp")
else:
raise Exception("Tmp file not found")
# NOTE(review): if this print sits at module level, `wide` is undefined
# whenever the "frq" flag is not 1 — confirm the intended nesting.
print("Frequentist configured to run with width: {}".format(wide))
# Disabled clean-up of the "frq" file (kept for reference).
#if os.path.exists("frq"):
# os.remove("frq")
#else:
# raise Exception("Frq file not found")

54
gpu_power_func.py Normal file
View File

@ -0,0 +1,54 @@
import os
import re
import pickle
import numpy as np
def get_sample_of_gpu():
    """Take one power / memory / voltage sample of two AMD GPUs via ``rocm-smi``.

    Runs ``rocm-smi -P --showvoltage --showmemuse`` and reads the values from
    fixed positions among its non-empty output lines, so the parsing is tied
    to that exact output layout.

    Returns:
        list: ``[power, memory, voltage]`` where

        * ``power`` is the sum (float) of the first decimal number found on
          output lines 2 and 4,
        * ``memory`` is the sum (int) of the second integer found on output
          lines 7 and 9, or the single value when only one of those lines
          carries two integers,
        * ``voltage`` is the sum (int) of the second integer found on output
          lines 13 and 14.

        NOTE(review): when neither memory line carries two integers no memory
        entry is added and the list has only two items (unchanged from the
        previous behaviour) — confirm whether a placeholder is wanted.

    Raises:
        Exception: if ``rocm-smi`` prints nothing, or its first line equals
            one of the known driver-failure messages.
        IndexError: if an expected line or number is missing from the output.
    """
    from re import findall
    import subprocess
    from subprocess import run

    no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
    no_version = "Failed to initialize NVML: Driver/library version mismatch"

    smi_string = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE)
    smi_string = smi_string.stdout.decode('utf-8')
    # Drop blank lines so the fixed indices below address content lines only.
    smi_string = [line for line in smi_string.split("\n") if line]

    if not smi_string:
        # Fail with a readable message instead of an IndexError on [0].
        raise Exception("rocm-smi produced no output")
    if smi_string[0] == no_graph:
        raise Exception("It seems that no AMD GPU is installed")
    if smi_string[0] == no_version:
        raise Exception("rocm-smi version mismatch")

    results = []
    # Raw strings keep the regex escapes intact without invalid-escape warnings.
    gpuW0 = findall(r"[0-9]*\.[0-9]*", smi_string[2])
    gpuW1 = findall(r"[0-9]*\.[0-9]*", smi_string[4])
    gpuM0 = findall(r"[0-9]+", smi_string[7])
    gpuM1 = findall(r"[0-9]+", smi_string[9])
    gpuV0 = findall(r"[0-9]+", smi_string[13])
    gpuV1 = findall(r"[0-9]+", smi_string[14])

    results.append(float(gpuW0[0]) + float(gpuW1[0]))

    # Always store the memory figure as an int so every sample row has a
    # consistent numeric type, whichever GPU reported a value.
    if len(gpuM0) == 2 and len(gpuM1) == 2:
        results.append(int(gpuM0[1]) + int(gpuM1[1]))
    elif len(gpuM0) == 2:
        results.append(int(gpuM0[1]))
    elif len(gpuM1) == 2:
        results.append(int(gpuM1[1]))

    results.append(int(gpuV0[1]) + int(gpuV1[1]))
    return results
def total_watt_consumed(pickle_name):
    """Sum the first column of the sample rows stored in a pickle file.

    The file may contain several pickled objects written back to back; the
    last one that loads completely is used. Each row's first entry is taken
    as the reading: numeric entries are used directly, string entries have
    their first ``<digits><any char><digits>`` match extracted and converted.

    Args:
        pickle_name: Path of the pickle file holding a sequence of sample rows.

    Returns:
        The sum of the first-column readings (``0`` for an empty sequence).

    Raises:
        ValueError: if the file contains no complete pickled object (for
            example it is empty or still being written).
        IndexError: if a string reading contains no matching number.
    """
    samples = None
    # NOTE: pickle.load can execute arbitrary code; only read files produced
    # by this project's own sampler.
    with open(pickle_name, "rb") as file:
        while True:
            try:
                samples = pickle.load(file)
            except EOFError:
                break

    if samples is None:
        # Raise explicitly so callers that retry on failure still do so,
        # instead of tripping over an unbound local variable.
        raise ValueError("No pickled samples found in {}".format(pickle_name))

    readings = []
    for row in samples:
        first = row[0]
        if isinstance(first, str):
            # Older sample files store the reading as text; keep the
            # original extraction pattern for those.
            first = re.findall(r"\d+.\d+", first)[0]
        # Numeric readings must not go through the regex: re.findall
        # rejects floats with a TypeError.
        readings.append(float(first))
    return sum(readings)

View File

@ -4,24 +4,6 @@ import pickle
import numpy as np import numpy as np
from warnings import warn from warnings import warn
with open("frq", "r") as file:
frq = int(file.read())
with open("bay", "r") as file:
bay = int(file.read())
if frq == 1:
model_t = "freq"
with open("tmp", "r") as file:
size = float(file.read())
if bay == 1:
model_t = "bayes"
with open("tmp", "r") as file:
size = int(file.read())
pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
#print("GPU energy file config: {}".format(pickle_name))
def get_sample_of_gpu(): def get_sample_of_gpu():
from re import sub, findall from re import sub, findall
@ -45,6 +27,7 @@ def get_sample_of_gpu():
#if temp: #if temp:
#return temp #return temp
def total_watt_consumed(): def total_watt_consumed():
with open(pickle_name, 'rb') as f: with open(pickle_name, 'rb') as f:
x = pickle.load(f) x = pickle.load(f)
@ -53,12 +36,12 @@ def total_watt_consumed():
y = [int(re.findall("\d+",xi)[0]) for xi in x] y = [int(re.findall("\d+",xi)[0]) for xi in x]
return sum(y) return sum(y)
if __name__ == '__main__': if __name__ == '__main__':
dataDump = [] dataDump = []
#var = True #var = True
#pickling_on = open("wattdata.pickle","wb") #pickling_on = open("wattdata.pickle","wb")
while True: while True:
#from run_service import retcode
try: try:
dataDump.append(get_sample_of_gpu()) dataDump.append(get_sample_of_gpu())
with open(pickle_name, 'wb') as f: with open(pickle_name, 'wb') as f:

View File

@ -6,24 +6,30 @@ import utils
import torch import torch
import pickle import pickle
import metrics import metrics
import argparse
import numpy as np import numpy as np
import amd_sample_draw
import config_bayesian as cfg
from datetime import datetime from datetime import datetime
from torch.nn import functional as F from torch.nn import functional as F
from torch.optim import Adam, lr_scheduler from torch.optim import Adam, lr_scheduler
from gpu_power_func import total_watt_consumed
from models.BayesianModels.BayesianLeNet import BBBLeNet from models.BayesianModels.BayesianLeNet import BBBLeNet
from models.BayesianModels.BayesianAlexNet import BBBAlexNet from models.BayesianModels.BayesianAlexNet import BBBAlexNet
from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
from stopping_crit import earlyStopping, energyBound, accuracyBound from stopping_crit import earlyStopping, energyBound, accuracyBound
with (open("configuration.pkl", "rb")) as file:
while True:
try:
cfg = pickle.load(file)
except EOFError:
break
# CUDA settings # CUDA settings
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
def getModel(net_type, inputs, outputs, priors, layer_type, activation_type): def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
if (net_type == 'lenet'): if (net_type == 'lenet'):
return BBBLeNet(outputs, inputs, priors, layer_type, activation_type,wide=cfg.wide) return BBBLeNet(outputs, inputs, priors, layer_type, activation_type,wide=cfg["model"]["size"])
elif (net_type == 'alexnet'): elif (net_type == 'alexnet'):
return BBBAlexNet(outputs, inputs, priors, layer_type, activation_type) return BBBAlexNet(outputs, inputs, priors, layer_type, activation_type)
elif (net_type == '3conv3fc'): elif (net_type == '3conv3fc'):
@ -91,18 +97,18 @@ def validate_model(net, criterion, validloader, num_ens=1, beta_type=0.1, epoch=
def run(dataset, net_type): def run(dataset, net_type):
# Hyper Parameter settings # Hyper Parameter settings
layer_type = cfg.layer_type layer_type = cfg["model"]["layer_type"]
activation_type = cfg.activation_type activation_type = cfg["model"]["activation_type"]
priors = cfg.priors priors = cfg["model"]["priors"]
train_ens = cfg.train_ens train_ens = cfg["model"]["train_ens"]
valid_ens = cfg.valid_ens valid_ens = cfg["model"]["valid_ens"]
n_epochs = cfg.n_epochs n_epochs = cfg["model"]["n_epochs"]
lr_start = cfg.lr_start lr_start = cfg["model"]["lr"]
num_workers = cfg.num_workers num_workers = cfg["model"]["num_workers"]
valid_size = cfg.valid_size valid_size = cfg["model"]["valid_size"]
batch_size = cfg.batch_size batch_size = cfg["model"]["batch_size"]
beta_type = cfg.beta_type beta_type = cfg["model"]["beta_type"]
trainset, testset, inputs, outputs = data.getDataset(dataset) trainset, testset, inputs, outputs = data.getDataset(dataset)
train_loader, valid_loader, test_loader = data.getDataloader( train_loader, valid_loader, test_loader = data.getDataloader(
@ -110,15 +116,13 @@ def run(dataset, net_type):
net = getModel(net_type, inputs, outputs, priors, layer_type, activation_type).to(device) net = getModel(net_type, inputs, outputs, priors, layer_type, activation_type).to(device)
ckpt_dir = f'checkpoints/{dataset}/bayesian' ckpt_dir = f'checkpoints/{dataset}/bayesian'
ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}_{layer_type}_{activation_type}_{cfg.wide}.pt' ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}_{layer_type}_{activation_type}_{cfg["model"]["size"]}.pt'
if not os.path.exists(ckpt_dir): if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir, exist_ok=True) os.makedirs(ckpt_dir, exist_ok=True)
with open("stp", "r") as file: stp = cfg["stopping_crit"]
stp = int(file.read()) sav = cfg["save"]
with open("sav", "r") as file:
sav = int(file.read())
criterion = metrics.ELBO(len(trainset)).to(device) criterion = metrics.ELBO(len(trainset)).to(device)
optimizer = Adam(net.parameters(), lr=lr_start) optimizer = Adam(net.parameters(), lr=lr_start)
@ -139,19 +143,19 @@ def run(dataset, net_type):
epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl)) epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl))
if stp == 2: if stp == 2:
#print('Using early stopping') print('Using early stopping')
if earlyStopping(early_stop,train_acc,epoch,cfg.sens) == 1: if earlyStopping(early_stop,valid_acc,epoch,cfg["model"]["sens"]) == 1:
break break
elif stp == 3: elif stp == 3:
#print('Using energy bound') print('Using energy bound')
if energyBound(cfg.energy_thrs) == 1: if energyBound(cfg["model"]["energy_thrs"]) == 1:
break break
elif stp == 4: elif stp == 4:
#print('Using accuracy bound') print('Using accuracy bound')
if accuracyBound(cfg.acc_thrs) == 1: if accuracyBound(train_acc,cfg.acc_thrs) == 1:
break break
else: else:
print('Training for {} epochs'.format(cfg.n_epochs)) print('Training for {} epochs'.format(cfg["model"]["n_epochs"]))
if sav == 1: if sav == 1:
# save model when finished # save model when finished
@ -159,18 +163,14 @@ def run(dataset, net_type):
torch.save(net.state_dict(), ckpt_name) torch.save(net.state_dict(), ckpt_name)
with open("bayes_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f: with open("bayes_exp_data_"+str(cfg["model"]["size"])+".pkl", 'wb') as f:
pickle.dump(train_data, f) pickle.dump(train_data, f)
if __name__ == '__main__': if __name__ == '__main__':
now = datetime.now() now = datetime.now()
current_time = now.strftime("%H:%M:%S") current_time = now.strftime("%H:%M:%S")
print("Initial Time =", current_time) print("Initial Time =", current_time)
parser = argparse.ArgumentParser(description = "PyTorch Bayesian Model Training") run(cfg["data"], cfg["model"]["net_type"])
parser.add_argument('--net_type', default='lenet', type=str, help='model')
parser.add_argument('--dataset', default='CIFAR10', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
args = parser.parse_args()
run(args.dataset, args.net_type)
now = datetime.now() now = datetime.now()
current_time = now.strftime("%H:%M:%S") current_time = now.strftime("%H:%M:%S")
print("Final Time =", current_time) print("Final Time =", current_time)

View File

@ -116,7 +116,7 @@ def run(dataset, net_type):
break break
elif stp == 4: elif stp == 4:
#print('Using accuracy bound') #print('Using accuracy bound')
if accuracyBound(train_acc,0.70) == 1: if accuracyBound(train_acc,cfg.acc_thrs) == 1:
break break
else: else:
print('Training for {} epochs'.format(cfg.n_epochs)) print('Training for {} epochs'.format(cfg.n_epochs))
@ -136,7 +136,7 @@ if __name__ == '__main__':
print("Initial Time =", current_time) print("Initial Time =", current_time)
parser = argparse.ArgumentParser(description = "PyTorch Frequentist Model Training") parser = argparse.ArgumentParser(description = "PyTorch Frequentist Model Training")
parser.add_argument('--net_type', default='lenet', type=str, help='model') parser.add_argument('--net_type', default='lenet', type=str, help='model')
parser.add_argument('--dataset', default='CIFAR10', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]') parser.add_argument('--dataset', default='MNIST', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
args = parser.parse_args() args = parser.parse_args()
run(args.dataset, args.net_type) run(args.dataset, args.net_type)
now = datetime.now() now = datetime.now()

3
radeontop.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Dump radeontop samples for the GPU on bus 08 into the file given as $1.
# Usage: radeontop.sh OUTPUT_FILE
#   -b 08  select the card by bus number
#   -d -   write the dump to stdout, which is redirected to OUTPUT_FILE
# The target is quoted so paths with spaces work, and a missing argument
# aborts with a usage message instead of an ambiguous-redirect error.
radeontop -b 08 -d - > "${1:?usage: radeontop.sh OUTPUT_FILE}"

2
read_pickle.py Normal file → Executable file
View File

@ -1,7 +1,7 @@
import pickle import pickle
gpu_data = [] gpu_data = []
with (open("freq_wattdata_1.0.pkl", "rb")) as openfile: with (open("bayesian_wattdata_3.pkl", "rb")) as openfile:
while True: while True:
try: try:
gpu_data = pickle.load(openfile) gpu_data = pickle.load(openfile)

14
stopping_crit.py Normal file → Executable file
View File

@ -1,6 +1,13 @@
import amd_sample_draw import pickle
from time import sleep from time import sleep
from gpu_power_func import total_watt_consumed
with (open("configuration.pkl", "rb")) as file:
while True:
try:
cfg = pickle.load(file)
except EOFError:
break
def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivity: float=1e-9): def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivity: float=1e-9):
early_stopping.append(train_acc) early_stopping.append(train_acc)
@ -20,16 +27,17 @@ def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivit
def energyBound(threshold: float=100000.0): def energyBound(threshold: float=100000.0):
try: try:
energy = amd_sample_draw.total_watt_consumed() energy = total_watt_consumed(cfg["pickle_path"])
except Exception as e: except Exception as e:
sleep(3) sleep(3)
energy = amd_sample_draw.total_watt_consumed() energy = total_watt_consumed(cfg["pickle_path"])
print("Energy used: {}".format(energy)) print("Energy used: {}".format(energy))
if energy > threshold: if energy > threshold:
print("Energy bound achieved") print("Energy bound achieved")
return 1 return 1
return 0 return 0
def accuracyBound(train_acc: float, threshold: float=0.99): def accuracyBound(train_acc: float, threshold: float=0.99):
if train_acc >= threshold: if train_acc >= threshold:
print("Accuracy bound achieved") print("Accuracy bound achieved")

View File

@ -3,9 +3,6 @@ import torch
import numpy as np import numpy as np
from torch.nn import functional as F from torch.nn import functional as F
import config_bayesian as cfg
# cifar10 classes # cifar10 classes
cifar10_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', cifar10_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
'dog', 'frog', 'horse', 'ship', 'truck'] 'dog', 'frog', 'horse', 'ship', 'truck']