Ignore save files
This commit is contained in:
parent
403f5b52b1
commit
88c3f1c088
|
@ -7,3 +7,12 @@ experiment-power-draw/
|
|||
**/__init__.py
|
||||
**/**/__pycache__/
|
||||
**/**/__init__.py
|
||||
stp
|
||||
sav
|
||||
bayes_*
|
||||
freq_*
|
||||
*.pkl
|
||||
bay
|
||||
frq
|
||||
sav
|
||||
tmp
|
|
@ -0,0 +1,96 @@
|
|||
import os
|
||||
import re
|
||||
import pickle
|
||||
import numpy as np
|
||||
from warnings import warn
|
||||
|
||||
# Run configuration is read from small marker files in the working
# directory: "frq" and "bay" each hold an integer flag, "tmp" holds the
# model size used to name the output pickle.
with open("frq", "r") as file:
    frq = int(file.read())

with open("bay", "r") as file:
    bay = int(file.read())

# Start unbound-by-value so a missing selection is reported explicitly
# instead of surfacing later as a NameError.
model_t = None
size = None

if frq == 1:
    model_t = "freq"
    with open("tmp", "r") as file:
        # The frequentist branch parses the size as a float (so "3" yields
        # "3.0" in the file name); kept as in the original.
        size = float(file.read())

if bay == 1:
    # When both flags are 1 the Bayesian settings win, as before.
    model_t = "bayes"
    with open("tmp", "r") as file:
        size = int(file.read())

if model_t is None:
    raise RuntimeError(
        "Neither 'frq' nor 'bay' contains 1; cannot choose the GPU energy file name"
    )

# Name of the pickle file the sampler writes and total_watt_consumed() reads.
pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
print("GPU energy file config: {}".format(pickle_name))
|
||||
|
||||
def get_sample_of_gpu(smi_output=None):
    """Take one power / memory / voltage sample from ``rocm-smi``.

    Args:
        smi_output: Optional text of an already captured
            ``rocm-smi -P --showvoltage --showmemuse`` report. When ``None``
            (the default) the command is executed and its stdout is parsed.

    Returns:
        list: ``[power, memory, voltage]``.

        * ``power`` (float): sum of the first decimal number found on
          non-empty report lines 2 and 4.
        * ``memory`` (int): sum of the second integer found on non-empty
          report lines 7 and 9, counting only lines that contain exactly two
          integers. If neither line qualifies the entry is omitted and the
          list has two elements.
        * ``voltage`` (int): sum of the second integer found on non-empty
          report lines 13 and 14.

    Raises:
        Exception: If the report is empty or starts with one of the known
            driver-failure messages.
        IndexError: If the report is shorter than expected or an expected
            number is missing from a line.
    """
    import subprocess
    from re import findall

    # Sentinel messages inherited from the nvidia-smi based sampler.
    # NOTE(review): rocm-smi presumably never prints these - confirm.
    no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
    no_version = "Failed to initialize NVML: Driver/library version mismatch"

    if smi_output is None:
        completed = subprocess.run(
            ['rocm-smi', '-P', '--showvoltage', '--showmemuse'],
            stdout=subprocess.PIPE,
        )
        smi_output = completed.stdout.decode('utf-8')

    # Only non-empty lines count towards the positions used below.
    report = [line for line in smi_output.split("\n") if line]
    if not report:
        raise Exception("rocm-smi produced no output")
    if report[0] == no_graph:
        raise Exception("It seems that no AMD GPU is installed")
    if report[0] == no_version:
        raise Exception("rocm-smi version mismatch")

    # Fixed line positions within the report.
    # NOTE(review): looks like a two-GPU layout is assumed - confirm against
    # the rocm-smi version in use.
    power_0 = findall(r"[0-9]*\.[0-9]*", report[2])
    power_1 = findall(r"[0-9]*\.[0-9]*", report[4])
    memory_0 = findall(r"[0-9]+", report[7])
    memory_1 = findall(r"[0-9]+", report[9])
    voltage_0 = findall(r"[0-9]+", report[13])
    voltage_1 = findall(r"[0-9]+", report[14])

    results = [float(power_0[0]) + float(power_1[0])]

    # Always store memory as an int, whether one or both GPUs reported it
    # (previously a lone reading was appended as a string).
    memory = [int(found[1]) for found in (memory_0, memory_1) if len(found) == 2]
    if memory:
        results.append(sum(memory))

    results.append(int(voltage_0[1]) + int(voltage_1[1]))
    return results
|
||||
|
||||
def total_watt_consumed():
    """Sum the power reading (first column) of every stored GPU sample.

    Reads the list of samples pickled in the module-level ``pickle_name``
    file. Each sample's first element may be a number, as produced by
    ``get_sample_of_gpu`` in this file, or a string, in which case the first
    run of digits in it is used as an integer.

    Returns:
        The sum of all readings: an ``int`` when every reading was a string,
        otherwise a ``float``. ``0`` when no samples are stored.

    Raises:
        IndexError: If a string reading contains no digits or a sample is
            empty.
    """
    # NOTE: pickle must only be used on files this project wrote itself;
    # unpickling untrusted data can execute arbitrary code.
    with open(pickle_name, 'rb') as f:
        samples = pickle.load(f)

    total = 0
    for sample in samples:
        reading = sample[0]
        if isinstance(reading, str):
            total += int(re.findall(r"\d+", reading)[0])
        else:
            # Numeric readings cannot be fed to re.findall, which raises
            # TypeError on non-string input.
            total += float(reading)
    return total
|
||||
|
||||
if __name__ == '__main__':
    # Sample the GPU in an endless loop and rewrite the whole pickle after
    # every sample, so the file on disk always holds the complete history.
    # The loop never exits by itself; the process has to be terminated from
    # outside.
    dataDump = []
    while True:
        try:
            dataDump.append(get_sample_of_gpu())
            # The with-statement closes the file. The former
            # "finally: f.close()" raised NameError, hiding the real error,
            # whenever sampling failed before f was ever bound.
            with open(pickle_name, 'wb') as f:
                pickle.dump(dataDump, f)
        except EOFError:
            warn('Pickle ran out of space')
            size += 0.01
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
import argparse
|
||||
from argparse import ArgumentParser
|
||||
|
||||
# Construct an argument parser
|
||||
all_args = argparse.ArgumentParser()
|
||||
|
||||
|
||||
def makeArguments(arguments: ArgumentParser) -> dict:
    """Register the experiment options on *arguments* and parse ``sys.argv``.

    Args:
        arguments: Parser that receives the option definitions. The options
            are added to this parser, not to the module-level one, so the
            function works with any parser the caller supplies.

    Returns:
        dict: Parsed values keyed by destination: ``b`` and ``f`` (model size
        1-6 or ``None``), and the booleans ``EarlyStopping``,
        ``EnergyBound``, ``AccuracyBound`` and ``Save``.
    """
    arguments.add_argument("-b", "--Bayesian", action="store", dest="b",
                           type=int, choices=range(1,7), help="Bayesian model of size x")
    arguments.add_argument("-f", "--Frequentist", action="store", dest="f",
                           type=int, choices=range(1,7), help="Frequentist model of size x")
    arguments.add_argument("-E", "--EarlyStopping", action="store_true",
                           help="Early Stopping criteria")
    arguments.add_argument("-e", "--EnergyBound", action="store_true",
                           help="Energy Bound criteria")
    arguments.add_argument("-a", "--AccuracyBound", action="store_true",
                           help="Accuracy Bound criteria")
    arguments.add_argument("-s", "--Save", action="store_true", help="Save model")
    return vars(arguments.parse_args())
|
|
@ -10,7 +10,10 @@ priors={
|
|||
'posterior_rho_initial': (-5, 0.1), # (mean, std) normal_
|
||||
}
|
||||
|
||||
n_epochs = 200
|
||||
n_epochs = 100
|
||||
sens = 1e-9
|
||||
energy_thrs = 100000
|
||||
acc_thrs = 0.99
|
||||
lr_start = 0.001
|
||||
num_workers = 4
|
||||
valid_size = 0.2
|
||||
|
@ -27,16 +30,16 @@ if bay == 1:
|
|||
with open("tmp", "r") as file:
|
||||
wide = int(file.read())
|
||||
|
||||
if os.path.exists("tmp"):
|
||||
os.remove("tmp")
|
||||
else:
|
||||
raise Exception("Tmp file not found")
|
||||
#if os.path.exists("tmp"):
|
||||
# os.remove("tmp")
|
||||
#else:
|
||||
# raise Exception("Tmp file not found")
|
||||
|
||||
print("Bayesian configured to run with width: {}".format(wide))
|
||||
|
||||
|
||||
if os.path.exists("bay"):
|
||||
os.remove("bay")
|
||||
else:
|
||||
raise Exception("Bay file not found")
|
||||
#if os.path.exists("bay"):
|
||||
# os.remove("bay")
|
||||
#else:
|
||||
# raise Exception("Bay file not found")
|
||||
|
|
@ -2,6 +2,9 @@
|
|||
|
||||
import os
|
||||
n_epochs = 100
|
||||
sens = 1e-9
|
||||
energy_thrs = 100000
|
||||
acc_thrs = 0.99
|
||||
lr = 0.001
|
||||
num_workers = 4
|
||||
valid_size = 0.2
|
||||
|
@ -23,8 +26,7 @@ if frq == 1:
|
|||
|
||||
|
||||
|
||||
if os.path.exists("frq"):
|
||||
os.remove("frq")
|
||||
else:
|
||||
raise Exception("Frq file not found")
|
||||
|
||||
#if os.path.exists("frq"):
|
||||
# os.remove("frq")
|
||||
#else:
|
||||
# raise Exception("Frq file not found")
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#!/bin/bash
|
||||
#!/bin/env bash
|
||||
|
||||
powerstat -z 0.5 1000000 > $1
|
||||
#powerstat -z 0.5 1000000 > $1
|
||||
powerstat -D > $1
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ if frq == 1:
|
|||
if bay == 1:
|
||||
model_t = "bayes"
|
||||
with open("tmp", "r") as file:
|
||||
wide = int(file.read())
|
||||
size = int(file.read())
|
||||
|
||||
pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
|
||||
#print("GPU energy file config: {}".format(pickle_name))
|
||||
|
@ -33,12 +33,13 @@ def get_sample_of_gpu():
|
|||
smi_string = run(['nvidia-smi'], stdout=subprocess.PIPE)
|
||||
smi_string = smi_string.stdout.decode('utf-8')
|
||||
smi_string = smi_string.split("\n")
|
||||
smi_string = list(filter(lambda x: x, smi_string))
|
||||
if smi_string[0] == no_graph:
|
||||
raise Exception("It seems that no NVIDIA GPU is installed")
|
||||
elif smi_string[0] == no_version:
|
||||
raise Exception("nvidia-smi version mismatch")
|
||||
else:
|
||||
return findall("[0-9]*MiB | [0-9]*W",smi_string[9])
|
||||
return findall("[0-9]*MiB | [0-9]*W",smi_string[6])
|
||||
#for l in smi_string:
|
||||
#temp = findall("[0-9]*MiB | [0-9]*W",l)
|
||||
#if temp:
|
||||
|
|
|
@ -12,13 +12,14 @@ import config_bayesian as cfg
|
|||
from datetime import datetime
|
||||
from torch.nn import functional as F
|
||||
from torch.optim import Adam, lr_scheduler
|
||||
import gpu_sample_draw
|
||||
import amd_sample_draw
|
||||
from models.BayesianModels.BayesianLeNet import BBBLeNet
|
||||
from models.BayesianModels.BayesianAlexNet import BBBAlexNet
|
||||
from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
|
||||
from stopping_crit import earlyStopping, energyBound, accuracyBound
|
||||
|
||||
# CUDA settings
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
|
||||
if (net_type == 'lenet'):
|
||||
|
@ -114,12 +115,17 @@ def run(dataset, net_type):
|
|||
if not os.path.exists(ckpt_dir):
|
||||
os.makedirs(ckpt_dir, exist_ok=True)
|
||||
|
||||
with open("stp", "r") as file:
|
||||
stp = int(file.read())
|
||||
with open("sav", "r") as file:
|
||||
sav = int(file.read())
|
||||
|
||||
criterion = metrics.ELBO(len(trainset)).to(device)
|
||||
optimizer = Adam(net.parameters(), lr=lr_start)
|
||||
lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
|
||||
#valid_loss_max = np.Inf
|
||||
#early_stop = []
|
||||
#thrs=1e-9
|
||||
if stp == 2:
|
||||
early_stop = []
|
||||
train_data = []
|
||||
for epoch in range(n_epochs): # loop over the dataset multiple times
|
||||
|
||||
|
@ -132,22 +138,25 @@ def run(dataset, net_type):
|
|||
print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f} \ttrain_kl_div: {:.4f}'.format(
|
||||
epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl))
|
||||
|
||||
#early_stop.append(valid_acc)
|
||||
#if epoch % 4 == 0 and epoch > 0:
|
||||
#print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs))
|
||||
#if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs:
|
||||
#break
|
||||
#early_stop = []
|
||||
if stp == 2:
|
||||
print('Using early stopping')
|
||||
if earlyStopping(early_stop,train_acc,cfg.sens) == None:
|
||||
break
|
||||
elif stp == 3:
|
||||
print('Using energy bound')
|
||||
if energyBound(cfg.energy_thrs) == None:
|
||||
break
|
||||
elif stp == 4:
|
||||
print('Using accuracy bound')
|
||||
if accuracyBound(cfg.acc_thrs) == None:
|
||||
break
|
||||
else:
|
||||
print('Training for {} epochs'.format(cfg.n_epochs))
|
||||
|
||||
if train_acc >= 0.50:
|
||||
break
|
||||
|
||||
#if gpu_sample_draw.total_watt_consumed() > 100000:
|
||||
#break
|
||||
|
||||
# save model on last epoch
|
||||
#if epoch == (n_epochs-1):
|
||||
#torch.save(net.state_dict(), ckpt_name)
|
||||
if sav == 1:
|
||||
# save model when finished
|
||||
if epoch == n_epochs:
|
||||
torch.save(net.state_dict(), ckpt_name)
|
||||
|
||||
with open("bayes_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
|
||||
pickle.dump(train_data, f)
|
||||
|
|
|
@ -8,17 +8,17 @@ import metrics
|
|||
import argparse
|
||||
import numpy as np
|
||||
import torch.nn as nn
|
||||
import gpu_sample_draw
|
||||
import amd_sample_draw
|
||||
from datetime import datetime
|
||||
import config_frequentist as cfg
|
||||
from torch.optim import Adam, lr_scheduler
|
||||
from models.NonBayesianModels.LeNet import LeNet
|
||||
from models.NonBayesianModels.AlexNet import AlexNet
|
||||
from stopping_crit import earlyStopping, energyBound, accuracyBound
|
||||
from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC
|
||||
|
||||
|
||||
# CUDA settings
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
def getModel(net_type, inputs, outputs,wide=cfg.wide):
|
||||
|
@ -81,12 +81,17 @@ def run(dataset, net_type):
|
|||
if not os.path.exists(ckpt_dir):
|
||||
os.makedirs(ckpt_dir, exist_ok=True)
|
||||
|
||||
with open("stp", "r") as file:
|
||||
stp = int(file.read())
|
||||
with open("sav", "r") as file:
|
||||
sav = int(file.read())
|
||||
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
optimizer = Adam(net.parameters(), lr=lr)
|
||||
lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
|
||||
#valid_loss_min = np.Inf
|
||||
#early_stop = []
|
||||
#thrs=1e-9
|
||||
if stp == 2:
|
||||
early_stop = []
|
||||
train_data = []
|
||||
for epoch in range(1, n_epochs+1):
|
||||
|
||||
|
@ -101,22 +106,22 @@ def run(dataset, net_type):
|
|||
print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f}'.format(
|
||||
epoch, train_loss, train_acc, valid_loss, valid_acc))
|
||||
|
||||
#early_stop.append(valid_acc)
|
||||
#if epoch % 4 == 0 and epoch > 0:
|
||||
# print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs))
|
||||
# if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs:
|
||||
# break
|
||||
# early_stop = []
|
||||
if stp == 2:
|
||||
print('Using early stopping')
|
||||
earlyStopping(early_stop,train_acc,cfg.sens)
|
||||
elif stp == 3:
|
||||
print('Using energy bound')
|
||||
energyBound(cfg.energy_thrs)
|
||||
elif stp == 4:
|
||||
print('Using accuracy bound')
|
||||
accuracyBound(cfg.acc_thrs)
|
||||
else:
|
||||
print('Training for {} epochs'.format(cfg.n_epochs))
|
||||
|
||||
#if train_acc >= 0.99:
|
||||
# break
|
||||
|
||||
#if gpu_sample_draw.total_watt_consumed() > 100000:
|
||||
# break
|
||||
|
||||
# save model when finished
|
||||
#if epoch == n_epochs:
|
||||
#torch.save(net.state_dict(), ckpt_name)
|
||||
if sav == 1:
|
||||
# save model when finished
|
||||
if epoch == n_epochs:
|
||||
torch.save(net.state_dict(), ckpt_name)
|
||||
|
||||
with open("freq_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
|
||||
pickle.dump(train_data, f)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/bin/bash
|
||||
#!/bin/env bash
|
||||
|
||||
|
||||
while true
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
import pickle
|
||||
|
||||
def _read_all_pickles(path):
    """Return every object pickled back-to-back in the file at *path*."""
    loaded = []
    with open(path, "rb") as stream:
        while True:
            try:
                loaded.append(pickle.load(stream))
            except EOFError:
                # End of file: nothing further to unpickle.
                return loaded


# Power samples written by the GPU sampler for the size-1 Bayesian run.
gpu_data = _read_all_pickles("bayes_wattdata_1.pkl")

# Training statistics written by the size-1 Bayesian training run.
exp_data = _read_all_pickles("bayes_exp_data_1.pkl")
|
|
@ -1,16 +1,9 @@
|
|||
import argparse
|
||||
import arguments
|
||||
from time import sleep
|
||||
import subprocess as sub
|
||||
from arguments import makeArguments
|
||||
|
||||
# Construct an argument parser
|
||||
all_args = argparse.ArgumentParser()
|
||||
|
||||
all_args.add_argument("-b", "--Value1", action="store", dest="b",
|
||||
type=int, choices=range(1,6), help="Bayesian model of size x")
|
||||
all_args.add_argument("-f", "--Value2", action="store", dest="f",
|
||||
type=int, choices=range(1,6), help="Frequentist model of size x")
|
||||
args = vars(all_args.parse_args())
|
||||
|
||||
args = makeArguments(arguments.all_args)
|
||||
|
||||
check = list(args.values())
|
||||
if all(v is None for v in check):
|
||||
|
@ -29,6 +22,26 @@ wide = args["f"] or args["b"]
|
|||
with open("tmp", "w") as file:
|
||||
file.write(str(wide))
|
||||
|
||||
if args['EarlyStopping']:
|
||||
with open("stp", "w") as file:
|
||||
file.write('2')
|
||||
elif args['EnergyBound']:
|
||||
with open("stp", "w") as file:
|
||||
file.write('3')
|
||||
elif args['AccuracyBound']:
|
||||
with open("stp", "w") as file:
|
||||
file.write('4')
|
||||
else:
|
||||
with open("stp", "w") as file:
|
||||
file.write('1')
|
||||
|
||||
if args['Save']:
|
||||
with open("sav", "w") as file:
|
||||
file.write('1')
|
||||
else:
|
||||
with open("sav", "w") as file:
|
||||
file.write('0')
|
||||
|
||||
sleep(3)
|
||||
|
||||
|
||||
|
@ -44,7 +57,7 @@ elif cmd[1] == "main_bayesian.py":
|
|||
cmd3 = ["./mem_free.sh", "bayes_{}_ram_use".format(wide)]
|
||||
with open("bay", "w") as file:
|
||||
file.write(str(1))
|
||||
with open("frw", "w") as file:
|
||||
with open("frq", "w") as file:
|
||||
file.write(str(0))
|
||||
|
||||
|
||||
|
@ -52,7 +65,7 @@ path = sub.check_output(['pwd'])
|
|||
path = path.decode()
|
||||
path = path.replace('\n', '')
|
||||
|
||||
#startWattCounter = 'python ' + path + '/gpu_sample_draw.py'
|
||||
startWattCounter = 'python ' + path + '/amd_sample_draw.py'
|
||||
|
||||
#test = startNODE.split()
|
||||
#test.append(pythonEnd)
|
||||
|
@ -60,11 +73,11 @@ path = path.replace('\n', '')
|
|||
|
||||
#startNODE = test
|
||||
|
||||
##print(startNODE)
|
||||
##print(startWattCounter)
|
||||
#print(startNODE)
|
||||
#print(startWattCounter)
|
||||
|
||||
p1 = sub.Popen(cmd)
|
||||
#p2 = sub.Popen(startWattCounter.split())
|
||||
p2 = sub.Popen(startWattCounter.split())
|
||||
p3 = sub.Popen(cmd2)
|
||||
p4 = sub.Popen(cmd3)
|
||||
|
||||
|
@ -72,6 +85,6 @@ retcode = p1.wait()
|
|||
print("Return code: {}".format(retcode))
|
||||
|
||||
p1.kill()
|
||||
#p2.kill()
|
||||
p2.kill()
|
||||
p3.kill()
|
||||
p4.kill()
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
def earlyStopping(early_stopping: list, train_acc: float, sensitivity: float=1e-9):
    """Decide whether training has plateaued.

    The current accuracy is appended to *early_stopping*. Once the list holds
    four values, the first three are compared against the current one:
    training counts as plateaued when none of them is lower than
    ``train_acc - sensitivity``. If the accuracy did improve, the list is
    emptied in place so the next window starts fresh.

    Args:
        early_stopping: Caller-owned history of accuracies; mutated in place.
        train_acc: Accuracy of the epoch that just finished.
        sensitivity: Improvement smaller than this is treated as none.

    Returns:
        ``None`` when training should stop (callers test ``== None``),
        ``True`` when it should continue.
    """
    early_stopping.append(train_acc)

    # The window is derived from the history length; the former check used
    # an ``epoch`` name that is not defined in this function.
    if len(early_stopping) < 4:
        return True

    floor = train_acc - sensitivity
    first, second, third = early_stopping[:3]
    print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(
        first, floor, second, floor, third, floor))

    if abs(first) >= floor and abs(second) >= floor and abs(third) >= floor:
        return None

    # Clear the caller's list; rebinding the local name would leave the
    # caller's history untouched.
    early_stopping.clear()
    return True
|
||||
|
||||
|
||||
def energyBound(threshold: float=100000.0, consumed: float=None):
    """Decide whether the energy budget has been exceeded.

    Args:
        threshold: Budget; training stops once consumption is strictly above
            it.
        consumed: Consumption measured so far. When ``None`` (the default)
            it is obtained from ``gpu_sample_draw.total_watt_consumed()``.
            Passing it explicitly lets a caller use a different sampler.

    Returns:
        ``None`` when training should stop (callers test ``== None``),
        ``True`` when it should continue.
    """
    if consumed is None:
        # Imported here because the name was otherwise unresolved in this
        # function, and only needed when no value is supplied.
        import gpu_sample_draw
        consumed = gpu_sample_draw.total_watt_consumed()

    if consumed > threshold:
        return None
    return True
|
||||
|
||||
|
||||
def accuracyBound(train_acc: float, threshold: float=0.99):
    """Decide whether the target accuracy has been reached.

    Args:
        train_acc: Accuracy of the epoch that just finished.
        threshold: Target accuracy; reaching or exceeding it stops training.

    Returns:
        ``None`` when training should stop (callers test ``== None``),
        ``True`` when it should continue.
    """
    if train_acc >= threshold:
        return None
    # A distinct value is needed here; falling off the end also yields None,
    # which made "continue" indistinguishable from "stop".
    return True
|
Loading…
Reference in New Issue