Ignore save files

This commit is contained in:
Eddie Cueto 2023-06-01 09:20:51 +01:00
parent 403f5b52b1
commit 88c3f1c088
13 changed files with 275 additions and 77 deletions

9
.gitignore vendored Normal file → Executable file
View File

@ -7,3 +7,12 @@ experiment-power-draw/
**/__init__.py **/__init__.py
**/**/__pycache__/ **/**/__pycache__/
**/**/__init__.py **/**/__init__.py
stp
sav
bayes_*
freq_*
*.pkl
bay
frq
sav
tmp

96
amd_sample_draw.py Normal file
View File

@ -0,0 +1,96 @@
import os
import re
import pickle
import numpy as np
from warnings import warn
# Run configuration is read from small marker files expected in the current
# working directory: "frq" and "bay" each hold an integer flag, and "tmp"
# holds the model size used to name the energy log.
with open("frq", "r") as file:
    frq = int(file.read())
with open("bay", "r") as file:
    bay = int(file.read())

if frq != 1 and bay != 1:
    # Previously this case fell through and crashed further down with an
    # unrelated NameError on `model_t` / `size`.
    raise RuntimeError(
        "Neither the 'frq' nor the 'bay' flag file is set to 1; "
        "cannot decide how to name the GPU energy file")

with open("tmp", "r") as file:
    raw_size = file.read()

# The Bayesian flag takes precedence when both flags are set. The size is
# parsed as an int for Bayesian runs and as a float for frequentist runs,
# which is visible in the resulting file name (e.g. "1" versus "1.0").
if bay == 1:
    model_t = "bayes"
    size = int(raw_size)
else:
    model_t = "freq"
    size = float(raw_size)

pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
print("GPU energy file config: {}".format(pickle_name))
def get_sample_of_gpu(smi_output=None):
    """Take one GPU sample by parsing the text output of ``rocm-smi``.

    The parser relies on fixed line positions (after blank lines are
    removed): lines 2 and 4 for power, 7 and 9 for memory use, 13 and 14 for
    voltage. Two readings are combined per metric.
    NOTE(review): this presumably matches a two-GPU machine with the exact
    flags used below -- confirm against real ``rocm-smi`` output.

    Args:
        smi_output: Optional, already captured ``rocm-smi`` text. When
            ``None`` (the default) ``rocm-smi`` is executed to obtain it.

    Returns:
        A list ``[power, memory, voltage]`` where ``power`` is a float and
        the other entries are ints. The memory entry is left out when
        neither memory line yields exactly two numbers.

    Raises:
        Exception: If the output is empty or starts with one of the known
            driver failure messages.
    """
    from re import findall
    import subprocess

    # Failure banners carried over from the NVIDIA sampler.
    # NOTE(review): rocm-smi presumably reports failures with different
    # text -- confirm before relying on these checks.
    no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
    no_version = "Failed to initialize NVML: Driver/library version mismatch"

    if smi_output is None:
        completed = subprocess.run(
            ['rocm-smi', '-P', '--showvoltage', '--showmemuse'],
            stdout=subprocess.PIPE)
        smi_output = completed.stdout.decode('utf-8')

    # Drop empty lines so the positional indexes below stay stable.
    smi_string = [line for line in smi_output.split("\n") if line]
    if not smi_string:
        raise Exception("rocm-smi produced no output")
    if smi_string[0] == no_graph:
        raise Exception("It seems that no AMD GPU is installed")
    if smi_string[0] == no_version:
        raise Exception("rocm-smi version mismatch")

    # Raw strings keep the patterns identical while avoiding the invalid
    # "\." escape sequence of a normal string literal.
    gpuW0 = findall(r"[0-9]*\.[0-9]*", smi_string[2])
    gpuW1 = findall(r"[0-9]*\.[0-9]*", smi_string[4])
    gpuM0 = findall(r"[0-9]+", smi_string[7])
    gpuM1 = findall(r"[0-9]+", smi_string[9])
    gpuV0 = findall(r"[0-9]+", smi_string[13])
    gpuV1 = findall(r"[0-9]+", smi_string[14])

    results = [float(gpuW0[0]) + float(gpuW1[0])]

    # Index 0 of each match list is the first number on the line; the
    # reading itself is the second number.
    if len(gpuM0) == 2 and len(gpuM1) == 2:
        results.append(int(gpuM0[1]) + int(gpuM1[1]))
    elif len(gpuM0) == 2:
        # Converted to int so the entry has the same type in every branch.
        results.append(int(gpuM0[1]))
    elif len(gpuM1) == 2:
        results.append(int(gpuM1[1]))

    results.append(int(gpuV0[1]) + int(gpuV1[1]))
    return results
#for l in smi_string:
#temp = findall("[0-9]*MiB | [0-9]*W",l)
#if temp:
#return temp
def total_watt_consumed(path=None):
    """Add up the first (power) entry of every sample in a watt-data pickle.

    Each stored sample is a sequence whose first element is the power
    reading. Numeric readings are added as they are; string readings such
    as ``"35W"`` contribute the first run of digits they contain.

    Args:
        path: Pickle file to read. Defaults to the module-level
            ``pickle_name``.

    Returns:
        The sum of all power readings (``0`` when no samples are stored).

    Raises:
        IndexError: If a string reading contains no digits.
    """
    if path is None:
        path = pickle_name
    with open(path, 'rb') as f:
        samples = pickle.load(f)

    total = 0
    for sample in samples:
        reading = sample[0]
        if isinstance(reading, str):
            total += int(re.findall(r"\d+", reading)[0])
        else:
            # Numeric readings cannot go through the regex (re.findall
            # raises TypeError on non-strings), so add them directly.
            total += reading
    return total
if __name__ == '__main__':
    # Sample the GPU in an endless loop; nothing in here breaks out of it,
    # so the process has to be terminated from outside.
    dataDump = []
    while True:
        try:
            dataDump.append(get_sample_of_gpu())
            # The file is opened in 'wb' mode, so every iteration overwrites
            # it with the complete list of samples collected so far. The
            # `with` statement closes it; an extra close in a `finally`
            # clause used to raise NameError (hiding the real error) when
            # sampling failed before the file was ever opened.
            with open(pickle_name, 'wb') as f:
                pickle.dump(dataDump, f)
        except EOFError:
            warn('Pickle ran out of space')
            size += 0.01
#if retcode == 0:
#break
#pickle.dump(dataDump, pickling_on)
#pickling_on.close()

20
arguments.py Normal file
View File

@ -0,0 +1,20 @@
import argparse
from argparse import ArgumentParser
# Construct the module-level argument parser. Callers hand this object to
# makeArguments(), which registers the command-line options and parses them.
all_args = argparse.ArgumentParser()
def makeArguments(arguments: ArgumentParser) -> dict:
    """Register the experiment options on *arguments* and parse the command line.

    Args:
        arguments: Parser that receives the option definitions. The options
            are added to this object (previously the parameter was ignored
            and a module-level parser was used instead).

    Returns:
        A dict of the parsed values with the keys ``b``, ``f``,
        ``EarlyStopping``, ``EnergyBound``, ``AccuracyBound`` and ``Save``.
        ``b`` and ``f`` are ints in 1..6 or ``None``; the rest are bools.
    """
    # Model selection: each option takes the model size as its value.
    arguments.add_argument("-b", "--Bayesian", action="store", dest="b",
                           type=int, choices=range(1,7),
                           help="Bayesian model of size x")
    arguments.add_argument("-f", "--Frequentist", action="store", dest="f",
                           type=int, choices=range(1,7),
                           help="Frequentist model of size x")

    # Stopping criteria and model saving are plain on/off switches.
    arguments.add_argument("-E", "--EarlyStopping", action="store_true",
                           help="Early Stopping criteria")
    arguments.add_argument("-e", "--EnergyBound", action="store_true",
                           help="Energy Bound criteria")
    arguments.add_argument("-a", "--AccuracyBound", action="store_true",
                           help="Accuracy Bound criteria")
    arguments.add_argument("-s", "--Save", action="store_true",
                           help="Save model")

    return vars(arguments.parse_args())

View File

@ -10,7 +10,10 @@ priors={
'posterior_rho_initial': (-5, 0.1), # (mean, std) normal_ 'posterior_rho_initial': (-5, 0.1), # (mean, std) normal_
} }
n_epochs = 200 n_epochs = 100
sens = 1e-9
energy_thrs = 100000
acc_thrs = 0.99
lr_start = 0.001 lr_start = 0.001
num_workers = 4 num_workers = 4
valid_size = 0.2 valid_size = 0.2
@ -27,16 +30,16 @@ if bay == 1:
with open("tmp", "r") as file: with open("tmp", "r") as file:
wide = int(file.read()) wide = int(file.read())
if os.path.exists("tmp"): #if os.path.exists("tmp"):
os.remove("tmp") # os.remove("tmp")
else: #else:
raise Exception("Tmp file not found") # raise Exception("Tmp file not found")
print("Bayesian configured to run with width: {}".format(wide)) print("Bayesian configured to run with width: {}".format(wide))
if os.path.exists("bay"): #if os.path.exists("bay"):
os.remove("bay") # os.remove("bay")
else: #else:
raise Exception("Bay file not found") # raise Exception("Bay file not found")

View File

@ -2,6 +2,9 @@
import os import os
n_epochs = 100 n_epochs = 100
sens = 1e-9
energy_thrs = 100000
acc_thrs = 0.99
lr = 0.001 lr = 0.001
num_workers = 4 num_workers = 4
valid_size = 0.2 valid_size = 0.2
@ -23,8 +26,7 @@ if frq == 1:
if os.path.exists("frq"): #if os.path.exists("frq"):
os.remove("frq") # os.remove("frq")
else: #else:
raise Exception("Frq file not found") # raise Exception("Frq file not found")

View File

@ -1,4 +1,5 @@
#!/bin/bash #!/bin/env bash
powerstat -z 0.5 1000000 > $1 #powerstat -z 0.5 1000000 > $1
powerstat -D > $1

View File

@ -18,7 +18,7 @@ if frq == 1:
if bay == 1: if bay == 1:
model_t = "bayes" model_t = "bayes"
with open("tmp", "r") as file: with open("tmp", "r") as file:
wide = int(file.read()) size = int(file.read())
pickle_name = "{}_wattdata_{}.pkl".format(model_t,size) pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
#print("GPU energy file config: {}".format(pickle_name)) #print("GPU energy file config: {}".format(pickle_name))
@ -33,12 +33,13 @@ def get_sample_of_gpu():
smi_string = run(['nvidia-smi'], stdout=subprocess.PIPE) smi_string = run(['nvidia-smi'], stdout=subprocess.PIPE)
smi_string = smi_string.stdout.decode('utf-8') smi_string = smi_string.stdout.decode('utf-8')
smi_string = smi_string.split("\n") smi_string = smi_string.split("\n")
smi_string = list(filter(lambda x: x, smi_string))
if smi_string[0] == no_graph: if smi_string[0] == no_graph:
raise Exception("It seems that no NVIDIA GPU is installed") raise Exception("It seems that no NVIDIA GPU is installed")
elif smi_string[0] == no_version: elif smi_string[0] == no_version:
raise Exception("nvidia-smi version mismatch") raise Exception("nvidia-smi version mismatch")
else: else:
return findall("[0-9]*MiB | [0-9]*W",smi_string[9]) return findall("[0-9]*MiB | [0-9]*W",smi_string[6])
#for l in smi_string: #for l in smi_string:
#temp = findall("[0-9]*MiB | [0-9]*W",l) #temp = findall("[0-9]*MiB | [0-9]*W",l)
#if temp: #if temp:

View File

@ -12,13 +12,14 @@ import config_bayesian as cfg
from datetime import datetime from datetime import datetime
from torch.nn import functional as F from torch.nn import functional as F
from torch.optim import Adam, lr_scheduler from torch.optim import Adam, lr_scheduler
import gpu_sample_draw import amd_sample_draw
from models.BayesianModels.BayesianLeNet import BBBLeNet from models.BayesianModels.BayesianLeNet import BBBLeNet
from models.BayesianModels.BayesianAlexNet import BBBAlexNet from models.BayesianModels.BayesianAlexNet import BBBAlexNet
from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
from stopping_crit import earlyStopping, energyBound, accuracyBound
# CUDA settings # CUDA settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
def getModel(net_type, inputs, outputs, priors, layer_type, activation_type): def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
if (net_type == 'lenet'): if (net_type == 'lenet'):
@ -114,12 +115,17 @@ def run(dataset, net_type):
if not os.path.exists(ckpt_dir): if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir, exist_ok=True) os.makedirs(ckpt_dir, exist_ok=True)
with open("stp", "r") as file:
stp = int(file.read())
with open("sav", "r") as file:
sav = int(file.read())
criterion = metrics.ELBO(len(trainset)).to(device) criterion = metrics.ELBO(len(trainset)).to(device)
optimizer = Adam(net.parameters(), lr=lr_start) optimizer = Adam(net.parameters(), lr=lr_start)
lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True) lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
#valid_loss_max = np.Inf #valid_loss_max = np.Inf
#early_stop = [] if stp == 2:
#thrs=1e-9 early_stop = []
train_data = [] train_data = []
for epoch in range(n_epochs): # loop over the dataset multiple times for epoch in range(n_epochs): # loop over the dataset multiple times
@ -132,22 +138,25 @@ def run(dataset, net_type):
print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f} \ttrain_kl_div: {:.4f}'.format( print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f} \ttrain_kl_div: {:.4f}'.format(
epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl)) epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl))
#early_stop.append(valid_acc) if stp == 2:
#if epoch % 4 == 0 and epoch > 0: print('Using early stopping')
#print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs)) if earlyStopping(early_stop,train_acc,cfg.sens) == None:
#if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs: break
#break elif stp == 3:
#early_stop = [] print('Using energy bound')
if energyBound(cfg.energy_thrs) == None:
break
elif stp == 4:
print('Using accuracy bound')
if accuracyBound(cfg.acc_thrs) == None:
break
else:
print('Training for {} epochs'.format(cfg.n_epochs))
if train_acc >= 0.50: if sav == 1:
break # save model when finished
if epoch == n_epochs:
#if gpu_sample_draw.total_watt_consumed() > 100000: torch.save(net.state_dict(), ckpt_name)
#break
# save model on last epoch
#if epoch == (n_epochs-1):
#torch.save(net.state_dict(), ckpt_name)
with open("bayes_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f: with open("bayes_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
pickle.dump(train_data, f) pickle.dump(train_data, f)

View File

@ -8,17 +8,17 @@ import metrics
import argparse import argparse
import numpy as np import numpy as np
import torch.nn as nn import torch.nn as nn
import gpu_sample_draw import amd_sample_draw
from datetime import datetime from datetime import datetime
import config_frequentist as cfg import config_frequentist as cfg
from torch.optim import Adam, lr_scheduler from torch.optim import Adam, lr_scheduler
from models.NonBayesianModels.LeNet import LeNet from models.NonBayesianModels.LeNet import LeNet
from models.NonBayesianModels.AlexNet import AlexNet from models.NonBayesianModels.AlexNet import AlexNet
from stopping_crit import earlyStopping, energyBound, accuracyBound
from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC
# CUDA settings # CUDA settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
def getModel(net_type, inputs, outputs,wide=cfg.wide): def getModel(net_type, inputs, outputs,wide=cfg.wide):
@ -81,12 +81,17 @@ def run(dataset, net_type):
if not os.path.exists(ckpt_dir): if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir, exist_ok=True) os.makedirs(ckpt_dir, exist_ok=True)
with open("stp", "r") as file:
stp = int(file.read())
with open("sav", "r") as file:
sav = int(file.read())
criterion = nn.CrossEntropyLoss() criterion = nn.CrossEntropyLoss()
optimizer = Adam(net.parameters(), lr=lr) optimizer = Adam(net.parameters(), lr=lr)
lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True) lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
#valid_loss_min = np.Inf #valid_loss_min = np.Inf
#early_stop = [] if stp == 2:
#thrs=1e-9 early_stop = []
train_data = [] train_data = []
for epoch in range(1, n_epochs+1): for epoch in range(1, n_epochs+1):
@ -101,22 +106,22 @@ def run(dataset, net_type):
print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f}'.format( print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f}'.format(
epoch, train_loss, train_acc, valid_loss, valid_acc)) epoch, train_loss, train_acc, valid_loss, valid_acc))
#early_stop.append(valid_acc) if stp == 2:
#if epoch % 4 == 0 and epoch > 0: print('Using early stopping')
# print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs)) earlyStopping(early_stop,train_acc,cfg.sens)
# if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs: elif stp == 3:
# break print('Using energy bound')
# early_stop = [] energyBound(cfg.energy_thrs)
elif stp == 4:
print('Using accuracy bound')
accuracyBound(cfg.acc_thrs)
else:
print('Training for {} epochs'.format(cfg.n_epochs))
#if train_acc >= 0.99: if sav == 1:
# break # save model when finished
if epoch == n_epochs:
#if gpu_sample_draw.total_watt_consumed() > 100000: torch.save(net.state_dict(), ckpt_name)
# break
# save model when finished
#if epoch == n_epochs:
#torch.save(net.state_dict(), ckpt_name)
with open("freq_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f: with open("freq_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
pickle.dump(train_data, f) pickle.dump(train_data, f)

View File

@ -1,4 +1,4 @@
#!/bin/bash #!/bin/env bash
while true while true

17
read_pickle.py Normal file
View File

@ -0,0 +1,17 @@
import pickle
# Collect every object stored in the GPU watt-data file. pickle.load()
# raises EOFError once the file is exhausted, which ends the read loop.
gpu_data = []
with open("bayes_wattdata_1.pkl", "rb") as openfile:
    try:
        while True:
            gpu_data.append(pickle.load(openfile))
    except EOFError:
        pass

# Same procedure for the training-metrics file.
exp_data = []
with open("bayes_exp_data_1.pkl", "rb") as openfile:
    try:
        while True:
            exp_data.append(pickle.load(openfile))
    except EOFError:
        pass

View File

@ -1,16 +1,9 @@
import argparse import arguments
from time import sleep from time import sleep
import subprocess as sub import subprocess as sub
from arguments import makeArguments
# Construct an argument parser args = makeArguments(arguments.all_args)
all_args = argparse.ArgumentParser()
all_args.add_argument("-b", "--Value1", action="store", dest="b",
type=int, choices=range(1,6), help="Bayesian model of size x")
all_args.add_argument("-f", "--Value2", action="store", dest="f",
type=int, choices=range(1,6), help="Frequentist model of size x")
args = vars(all_args.parse_args())
check = list(args.values()) check = list(args.values())
if all(v is None for v in check): if all(v is None for v in check):
@ -29,6 +22,26 @@ wide = args["f"] or args["b"]
with open("tmp", "w") as file: with open("tmp", "w") as file:
file.write(str(wide)) file.write(str(wide))
if args['EarlyStopping']:
with open("stp", "w") as file:
file.write('2')
elif args['EnergyBound']:
with open("stp", "w") as file:
file.write('3')
elif args['AccuracyBound']:
with open("stp", "w") as file:
file.write('4')
else:
with open("stp", "w") as file:
file.write('1')
if args['Save']:
with open("sav", "w") as file:
file.write('1')
else:
with open("sav", "w") as file:
file.write('0')
sleep(3) sleep(3)
@ -44,7 +57,7 @@ elif cmd[1] == "main_bayesian.py":
cmd3 = ["./mem_free.sh", "bayes_{}_ram_use".format(wide)] cmd3 = ["./mem_free.sh", "bayes_{}_ram_use".format(wide)]
with open("bay", "w") as file: with open("bay", "w") as file:
file.write(str(1)) file.write(str(1))
with open("frw", "w") as file: with open("frq", "w") as file:
file.write(str(0)) file.write(str(0))
@ -52,7 +65,7 @@ path = sub.check_output(['pwd'])
path = path.decode() path = path.decode()
path = path.replace('\n', '') path = path.replace('\n', '')
#startWattCounter = 'python ' + path + '/gpu_sample_draw.py' startWattCounter = 'python ' + path + '/amd_sample_draw.py'
#test = startNODE.split() #test = startNODE.split()
#test.append(pythonEnd) #test.append(pythonEnd)
@ -60,11 +73,11 @@ path = path.replace('\n', '')
#startNODE = test #startNODE = test
##print(startNODE) #print(startNODE)
##print(startWattCounter) #print(startWattCounter)
p1 = sub.Popen(cmd) p1 = sub.Popen(cmd)
#p2 = sub.Popen(startWattCounter.split()) p2 = sub.Popen(startWattCounter.split())
p3 = sub.Popen(cmd2) p3 = sub.Popen(cmd2)
p4 = sub.Popen(cmd3) p4 = sub.Popen(cmd3)
@ -72,6 +85,6 @@ retcode = p1.wait()
print("Return code: {}".format(retcode)) print("Return code: {}".format(retcode))
p1.kill() p1.kill()
#p2.kill() p2.kill()
p3.kill() p3.kill()
p4.kill() p4.kill()

22
stopping_crit.py Normal file
View File

@ -0,0 +1,22 @@
def earlyStopping(early_stopping: list, train_acc: float, sensitivity: float=1e-9):
    """Decide whether training has stalled over a window of four calls.

    Every call records ``train_acc`` in ``early_stopping``. Once four values
    have been collected, the first three are compared against the newest
    one: training counts as stalled when none of them is more than
    ``sensitivity`` below it. The window is then emptied in place so that
    the caller's list starts collecting the next window.

    The window is driven by the list length because no epoch counter is
    passed in; the earlier implementation read an undefined ``epoch`` name.

    Args:
        early_stopping: Caller-owned history list; mutated in place.
        train_acc: Accuracy of the epoch that just finished.
        sensitivity: Tolerance subtracted from ``train_acc`` before comparing.

    Returns:
        ``None`` when training should stop (callers test ``== None``),
        ``True`` when it should continue.
    """
    early_stopping.append(train_acc)
    if len(early_stopping) < 4:
        return True

    target = train_acc - sensitivity
    print("Value 1: {} >= {}, Value 2: {} >= {}, Value 3: {} >= {}".format(
        early_stopping[0], target,
        early_stopping[1], target,
        early_stopping[2], target))
    stalled = all(abs(previous) >= target for previous in early_stopping[:3])

    # clear() empties the caller's list; rebinding the local name would not.
    early_stopping.clear()
    return None if stalled else True
def energyBound(threshold: float=100000.0, sampler=None):
    """Check whether the recorded GPU energy has exceeded ``threshold``.

    Args:
        threshold: Upper bound for the summed power readings.
        sampler: Object exposing ``total_watt_consumed()``. When ``None``
            the ``gpu_sample_draw`` module is imported and used.
            NOTE(review): the training scripts import ``amd_sample_draw``;
            confirm which sampler module should be the default here.

    Returns:
        ``None`` when the bound is exceeded and training should stop
        (callers test ``== None``), ``True`` otherwise.
    """
    if sampler is None:
        # Imported here because this module has no top-level import of the
        # sampler; the bare name used to raise NameError.
        import gpu_sample_draw as sampler

    if sampler.total_watt_consumed() > threshold:
        return None
    return True
def accuracyBound(train_acc: float, threshold: float=0.99):
    """Check whether the training accuracy has reached ``threshold``.

    Args:
        train_acc: Accuracy of the epoch that just finished.
        threshold: Accuracy at or above which training should stop.

    Returns:
        ``None`` when the threshold is reached and training should stop
        (callers test ``== None``), ``True`` otherwise. Without the explicit
        ``True`` both outcomes were ``None`` and could not be told apart.
    """
    if train_acc >= threshold:
        return None
    return True