Ignore save files

This commit is contained in:
Eddie Cueto 2023-06-01 09:20:51 +01:00
parent 403f5b52b1
commit 88c3f1c088
13 changed files with 275 additions and 77 deletions

11
.gitignore vendored Normal file → Executable file
View File

@ -6,4 +6,13 @@ experiment-power-draw/
**/__pycache__/
**/__init__.py
**/**/__pycache__/
**/**/__init__.py
**/**/__init__.py
stp
sav
bayes_*
freq_*
*.pkl
bay
frq
sav
tmp

96
amd_sample_draw.py Normal file
View File

@ -0,0 +1,96 @@
import os
import re
import pickle
import numpy as np
from warnings import warn
# Run configuration is read from small flag files in the working directory:
#   "frq" / "bay" hold 1 when the frequentist / Bayesian model is selected,
#   "tmp" holds the model size that is embedded in the energy-log file name.
with open("frq", "r") as file:
    frq = int(file.read())
with open("bay", "r") as file:
    bay = int(file.read())

if frq != 1 and bay != 1:
    # Without this guard `model_t` and `size` would never be bound and the
    # format() call below would die with an unhelpful NameError.
    raise ValueError(
        "Neither 'frq' nor 'bay' is set to 1; cannot choose a model type"
    )

if frq == 1:
    model_t = "freq"
    with open("tmp", "r") as file:
        # NOTE(review): parsed as float here but as int in the Bayesian
        # branch, so the generated file names differ in shape
        # (e.g. "..._1.0.pkl" vs "..._1.pkl") -- confirm this is intended.
        size = float(file.read())
if bay == 1:
    # Evaluated second, so the Bayesian setting wins if both flags are 1.
    model_t = "bayes"
    with open("tmp", "r") as file:
        size = int(file.read())

# Name of the pickle file the sampler writes and total_watt_consumed() reads.
pickle_name = "{}_wattdata_{}.pkl".format(model_t, size)
print("GPU energy file config: {}".format(pickle_name))
def get_sample_of_gpu():
    """Take one sample from ``rocm-smi`` and return it as a list of numbers.

    Runs ``rocm-smi -P --showvoltage --showmemuse`` and extracts numbers from
    fixed line positions of its (blank-line-stripped) output.

    Returns:
        A list ``[power, memory, voltage]`` where ``power`` is the float sum
        of the first decimal number on output lines 2 and 4, ``memory`` is
        the int sum of the second integer on lines 7 and/or 9, and
        ``voltage`` is the int sum of the second integer on lines 13 and 14.
        The ``memory`` entry is omitted when neither memory line contains
        exactly two integers.

    Raises:
        Exception: if the tool prints nothing or one of the known driver
            failure messages.
        IndexError: if the output has fewer lines or numbers than expected.
    """
    from re import findall
    import subprocess
    from subprocess import run

    no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
    no_version = "Failed to initialize NVML: Driver/library version mismatch"

    completed = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'],
                    stdout=subprocess.PIPE)
    # Drop empty lines so the fixed indices below are not shifted by spacing.
    lines = [line for line in completed.stdout.decode('utf-8').split("\n") if line]

    if not lines:
        raise Exception("rocm-smi produced no output")
    # NOTE(review): these are NVIDIA tool messages; it is unclear whether
    # rocm-smi can ever print them -- confirm or replace with ROCm wording.
    if lines[0] == no_graph:
        raise Exception("It seems that no AMD GPU is installed")
    if lines[0] == no_version:
        raise Exception("rocm-smi version mismatch")

    # The line indices assume the output layout of a two-GPU machine
    # (two readings are summed per metric) -- TODO confirm against the
    # rocm-smi version in use.
    power_0 = findall(r"[0-9]*\.[0-9]*", lines[2])
    power_1 = findall(r"[0-9]*\.[0-9]*", lines[4])
    memory_0 = findall(r"[0-9]+", lines[7])
    memory_1 = findall(r"[0-9]+", lines[9])
    voltage_0 = findall(r"[0-9]+", lines[13])
    voltage_1 = findall(r"[0-9]+", lines[14])

    results = [float(power_0[0]) + float(power_1[0])]

    # Element [1] is used on these lines; presumably element [0] is the GPU
    # index printed at the start of the line -- verify.
    if len(memory_0) == 2 and len(memory_1) == 2:
        results.append(int(memory_0[1]) + int(memory_1[1]))
    elif len(memory_0) == 2:
        # Converted to int so the entry has the same type on every path.
        results.append(int(memory_0[1]))
    elif len(memory_1) == 2:
        results.append(int(memory_1[1]))

    results.append(int(voltage_0[1]) + int(voltage_1[1]))
    return results
#for l in smi_string:
#temp = findall("[0-9]*MiB | [0-9]*W",l)
#if temp:
#return temp
def total_watt_consumed():
    """Sum the first entry of every sample stored in ``pickle_name``.

    The pickle file is expected to hold one list of samples, each sample
    being an indexable sequence whose first element is the power reading.
    Numeric readings are added as they are. String readings contribute the
    integer formed by their first run of digits.

    Returns:
        The sum of all readings; ``0`` when the log contains no samples.

    Raises:
        IndexError: if a string reading contains no digits.
    """
    # NOTE(review): the log may be rewritten by the sampler while it is read
    # here; a partially written file would make pickle.load raise -- confirm
    # whether callers need to tolerate that.
    with open(pickle_name, 'rb') as f:
        samples = pickle.load(f)

    total = 0
    for sample in samples:
        reading = sample[0]
        if isinstance(reading, str):
            total += int(re.findall(r"\d+", reading)[0])
        else:
            # Applying a regex to a number raises TypeError, so numeric
            # readings are summed directly.
            total += reading
    return total
if __name__ == '__main__':
    # Sample the GPU forever; the process is expected to be stopped from the
    # outside. After every sample the complete history is re-written to
    # `pickle_name`, so the file always holds a single pickled list.
    dataDump = []
    while True:
        try:
            dataDump.append(get_sample_of_gpu())
            # The `with` block closes the file on every path; an extra
            # close() in a `finally` would raise NameError (and hide the
            # real error) whenever sampling fails before `f` is bound.
            with open(pickle_name, 'wb') as f:
                pickle.dump(dataDump, f)
        except EOFError:
            warn('Pickle ran out of space')
            # NOTE(review): `pickle_name` was already built from `size`, so
            # this increment does not change the output file -- confirm
            # whether it is still needed.
            size += 0.01

20
arguments.py Normal file
View File

@ -0,0 +1,20 @@
import argparse
from argparse import ArgumentParser
# Construct an argument parser
all_args = argparse.ArgumentParser()
def makeArguments(arguments: ArgumentParser) -> dict:
    """Register the experiment options on *arguments* and parse the CLI.

    Args:
        arguments: The parser to populate. Options are added to this parser
            (not to a module-level one), so it must not already define them.

    Returns:
        A dict of parsed values with keys ``"b"``, ``"f"``,
        ``"EarlyStopping"``, ``"EnergyBound"``, ``"AccuracyBound"`` and
        ``"Save"``. ``"b"``/``"f"`` are ints in 1..6 or ``None``; the rest
        are booleans.
    """
    arguments.add_argument("-b", "--Bayesian", action="store", dest="b",
                           type=int, choices=range(1, 7),
                           help="Bayesian model of size x")
    arguments.add_argument("-f", "--Frequentist", action="store", dest="f",
                           type=int, choices=range(1, 7),
                           help="Frequentist model of size x")
    arguments.add_argument("-E", "--EarlyStopping", action="store_true",
                           help="Early Stopping criteria")
    arguments.add_argument("-e", "--EnergyBound", action="store_true",
                           help="Energy Bound criteria")
    arguments.add_argument("-a", "--AccuracyBound", action="store_true",
                           help="Accuracy Bound criteria")
    arguments.add_argument("-s", "--Save", action="store_true",
                           help="Save model")
    # parse_args() with no argument reads the options from sys.argv.
    return vars(arguments.parse_args())

View File

@ -10,7 +10,10 @@ priors={
'posterior_rho_initial': (-5, 0.1), # (mean, std) normal_
}
n_epochs = 200
n_epochs = 100
sens = 1e-9
energy_thrs = 100000
acc_thrs = 0.99
lr_start = 0.001
num_workers = 4
valid_size = 0.2
@ -27,16 +30,16 @@ if bay == 1:
with open("tmp", "r") as file:
wide = int(file.read())
if os.path.exists("tmp"):
os.remove("tmp")
else:
raise Exception("Tmp file not found")
#if os.path.exists("tmp"):
# os.remove("tmp")
#else:
# raise Exception("Tmp file not found")
print("Bayesian configured to run with width: {}".format(wide))
if os.path.exists("bay"):
os.remove("bay")
else:
raise Exception("Bay file not found")
#if os.path.exists("bay"):
# os.remove("bay")
#else:
# raise Exception("Bay file not found")

View File

@ -2,6 +2,9 @@
import os
n_epochs = 100
sens = 1e-9
energy_thrs = 100000
acc_thrs = 0.99
lr = 0.001
num_workers = 4
valid_size = 0.2
@ -23,8 +26,7 @@ if frq == 1:
if os.path.exists("frq"):
os.remove("frq")
else:
raise Exception("Frq file not found")
#if os.path.exists("frq"):
# os.remove("frq")
#else:
# raise Exception("Frq file not found")

View File

@ -1,4 +1,5 @@
#!/bin/bash
#!/bin/env bash
powerstat -z 0.5 1000000 > $1
#powerstat -z 0.5 1000000 > $1
powerstat -D > $1

View File

@ -18,7 +18,7 @@ if frq == 1:
if bay == 1:
model_t = "bayes"
with open("tmp", "r") as file:
wide = int(file.read())
size = int(file.read())
pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
#print("GPU energy file config: {}".format(pickle_name))
@ -33,12 +33,13 @@ def get_sample_of_gpu():
smi_string = run(['nvidia-smi'], stdout=subprocess.PIPE)
smi_string = smi_string.stdout.decode('utf-8')
smi_string = smi_string.split("\n")
smi_string = list(filter(lambda x: x, smi_string))
if smi_string[0] == no_graph:
raise Exception("It seems that no NVIDIA GPU is installed")
elif smi_string[0] == no_version:
raise Exception("nvidia-smi version mismatch")
else:
return findall("[0-9]*MiB | [0-9]*W",smi_string[9])
return findall("[0-9]*MiB | [0-9]*W",smi_string[6])
#for l in smi_string:
#temp = findall("[0-9]*MiB | [0-9]*W",l)
#if temp:

View File

@ -12,13 +12,14 @@ import config_bayesian as cfg
from datetime import datetime
from torch.nn import functional as F
from torch.optim import Adam, lr_scheduler
import gpu_sample_draw
import amd_sample_draw
from models.BayesianModels.BayesianLeNet import BBBLeNet
from models.BayesianModels.BayesianAlexNet import BBBAlexNet
from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
from stopping_crit import earlyStopping, energyBound, accuracyBound
# CUDA settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
if (net_type == 'lenet'):
@ -114,12 +115,17 @@ def run(dataset, net_type):
if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir, exist_ok=True)
with open("stp", "r") as file:
stp = int(file.read())
with open("sav", "r") as file:
sav = int(file.read())
criterion = metrics.ELBO(len(trainset)).to(device)
optimizer = Adam(net.parameters(), lr=lr_start)
lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
#valid_loss_max = np.Inf
#early_stop = []
#thrs=1e-9
if stp == 2:
early_stop = []
train_data = []
for epoch in range(n_epochs): # loop over the dataset multiple times
@ -132,22 +138,25 @@ def run(dataset, net_type):
print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f} \ttrain_kl_div: {:.4f}'.format(
epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl))
#early_stop.append(valid_acc)
#if epoch % 4 == 0 and epoch > 0:
#print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs))
#if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs:
#break
#early_stop = []
if stp == 2:
print('Using early stopping')
if earlyStopping(early_stop,train_acc,cfg.sens) == None:
break
elif stp == 3:
print('Using energy bound')
if energyBound(cfg.energy_thrs) == None:
break
elif stp == 4:
print('Using accuracy bound')
if accuracyBound(cfg.acc_thrs) == None:
break
else:
print('Training for {} epochs'.format(cfg.n_epochs))
if train_acc >= 0.50:
break
#if gpu_sample_draw.total_watt_consumed() > 100000:
#break
# save model on last epoch
#if epoch == (n_epochs-1):
#torch.save(net.state_dict(), ckpt_name)
if sav == 1:
# save model when finished
if epoch == n_epochs:
torch.save(net.state_dict(), ckpt_name)
with open("bayes_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
pickle.dump(train_data, f)

View File

@ -8,17 +8,17 @@ import metrics
import argparse
import numpy as np
import torch.nn as nn
import gpu_sample_draw
import amd_sample_draw
from datetime import datetime
import config_frequentist as cfg
from torch.optim import Adam, lr_scheduler
from models.NonBayesianModels.LeNet import LeNet
from models.NonBayesianModels.AlexNet import AlexNet
from stopping_crit import earlyStopping, energyBound, accuracyBound
from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC
# CUDA settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
def getModel(net_type, inputs, outputs,wide=cfg.wide):
@ -81,12 +81,17 @@ def run(dataset, net_type):
if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir, exist_ok=True)
with open("stp", "r") as file:
stp = int(file.read())
with open("sav", "r") as file:
sav = int(file.read())
criterion = nn.CrossEntropyLoss()
optimizer = Adam(net.parameters(), lr=lr)
lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
#valid_loss_min = np.Inf
#early_stop = []
#thrs=1e-9
if stp == 2:
early_stop = []
train_data = []
for epoch in range(1, n_epochs+1):
@ -100,23 +105,23 @@ def run(dataset, net_type):
train_data.append([epoch,train_loss,train_acc,valid_loss,valid_acc])
print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f}'.format(
epoch, train_loss, train_acc, valid_loss, valid_acc))
if stp == 2:
print('Using early stopping')
earlyStopping(early_stop,train_acc,cfg.sens)
elif stp == 3:
print('Using energy bound')
energyBound(cfg.energy_thrs)
elif stp == 4:
print('Using accuracy bound')
accuracyBound(cfg.acc_thrs)
else:
print('Training for {} epochs'.format(cfg.n_epochs))
#early_stop.append(valid_acc)
#if epoch % 4 == 0 and epoch > 0:
# print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs))
# if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs:
# break
# early_stop = []
#if train_acc >= 0.99:
# break
#if gpu_sample_draw.total_watt_consumed() > 100000:
# break
# save model when finished
#if epoch == n_epochs:
#torch.save(net.state_dict(), ckpt_name)
if sav == 1:
# save model when finished
if epoch == n_epochs:
torch.save(net.state_dict(), ckpt_name)
with open("freq_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
pickle.dump(train_data, f)

View File

@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/env bash
while true

17
read_pickle.py Normal file
View File

@ -0,0 +1,17 @@
import pickle
def _load_all_pickles(path):
    """Return every object pickled back-to-back in *path*, in file order."""
    loaded = []
    with open(path, "rb") as openfile:
        while True:
            try:
                loaded.append(pickle.load(openfile))
            except EOFError:
                # End of file reached: everything has been read.
                return loaded


# GPU samples and per-epoch training records of the size-1 Bayesian run.
gpu_data = _load_all_pickles("bayes_wattdata_1.pkl")
exp_data = _load_all_pickles("bayes_exp_data_1.pkl")

View File

@ -1,16 +1,9 @@
import argparse
import arguments
from time import sleep
import subprocess as sub
from arguments import makeArguments
# Construct an argument parser
all_args = argparse.ArgumentParser()
all_args.add_argument("-b", "--Value1", action="store", dest="b",
type=int, choices=range(1,6), help="Bayesian model of size x")
all_args.add_argument("-f", "--Value2", action="store", dest="f",
type=int, choices=range(1,6), help="Frequentist model of size x")
args = vars(all_args.parse_args())
args = makeArguments(arguments.all_args)
check = list(args.values())
if all(v is None for v in check):
@ -29,6 +22,26 @@ wide = args["f"] or args["b"]
with open("tmp", "w") as file:
file.write(str(wide))
if args['EarlyStopping']:
with open("stp", "w") as file:
file.write('2')
elif args['EnergyBound']:
with open("stp", "w") as file:
file.write('3')
elif args['AccuracyBound']:
with open("stp", "w") as file:
file.write('4')
else:
with open("stp", "w") as file:
file.write('1')
if args['Save']:
with open("sav", "w") as file:
file.write('1')
else:
with open("sav", "w") as file:
file.write('0')
sleep(3)
@ -44,7 +57,7 @@ elif cmd[1] == "main_bayesian.py":
cmd3 = ["./mem_free.sh", "bayes_{}_ram_use".format(wide)]
with open("bay", "w") as file:
file.write(str(1))
with open("frw", "w") as file:
with open("frq", "w") as file:
file.write(str(0))
@ -52,7 +65,7 @@ path = sub.check_output(['pwd'])
path = path.decode()
path = path.replace('\n', '')
#startWattCounter = 'python ' + path + '/gpu_sample_draw.py'
startWattCounter = 'python ' + path + '/amd_sample_draw.py'
#test = startNODE.split()
#test.append(pythonEnd)
@ -60,11 +73,11 @@ path = path.replace('\n', '')
#startNODE = test
##print(startNODE)
##print(startWattCounter)
#print(startNODE)
#print(startWattCounter)
p1 = sub.Popen(cmd)
#p2 = sub.Popen(startWattCounter.split())
p2 = sub.Popen(startWattCounter.split())
p3 = sub.Popen(cmd2)
p4 = sub.Popen(cmd3)
@ -72,6 +85,6 @@ retcode = p1.wait()
print("Return code: {}".format(retcode))
p1.kill()
#p2.kill()
p2.kill()
p3.kill()
p4.kill()

22
stopping_crit.py Normal file
View File

@ -0,0 +1,22 @@
def earlyStopping(early_stopping: list, train_acc: float, sensitivity: float=1e-9):
    """Decide whether training has plateaued over a window of four epochs.

    The caller passes the same ``early_stopping`` list on every epoch; this
    function appends ``train_acc`` to it. Once four values are collected,
    the first three are compared against ``train_acc - sensitivity``.

    Args:
        early_stopping: Accuracy history of the current window. It is
            mutated in place: appended to on every call and emptied after
            a window is evaluated without stopping.
        train_acc: Accuracy of the epoch that just finished.
        sensitivity: Margin subtracted from ``train_acc`` before comparing.

    Returns:
        ``None`` when training should stop (callers test ``== None``),
        ``True`` when training should continue.
    """
    early_stopping.append(train_acc)

    # The window is tracked through the list length; this module has no
    # epoch counter of its own.
    if len(early_stopping) < 4:
        return True

    floor = train_acc - sensitivity
    window = early_stopping[:3]
    print("Value 1: {} >= {}, Value 2: {} >= {}, Value 3: {} >= {}".format(
        window[0], floor, window[1], floor, window[2], floor))

    if all(abs(previous) >= floor for previous in window):
        # No earlier value in the window is below the current one: stop.
        return None

    # Empty the caller's list in place; rebinding the local name would leave
    # the caller's history untouched.
    early_stopping.clear()
    return True
def energyBound(threshold: float=100000.0):
    """Check the accumulated GPU reading against an upper bound.

    Args:
        threshold: Value that ``total_watt_consumed()`` must exceed for
            training to stop.

    Returns:
        ``None`` when the bound is exceeded (callers test ``== None``),
        ``True`` otherwise.
    """
    # Imported here because this module does not import the sampler at file
    # level, which made the original reference a NameError.
    # NOTE(review): the training scripts now import `amd_sample_draw`;
    # confirm which sampler module should be consulted here.
    import gpu_sample_draw

    if gpu_sample_draw.total_watt_consumed() > threshold:
        return None
    return True
def accuracyBound(train_acc: float, threshold: float=0.99):
    """Check whether the training accuracy has reached the target.

    Args:
        train_acc: Accuracy of the epoch that just finished.
        threshold: Accuracy at or above which training should stop.

    Returns:
        ``None`` when ``train_acc >= threshold`` (callers test ``== None``),
        ``True`` otherwise, so the two outcomes can be told apart.
    """
    if train_acc >= threshold:
        return None
    return True