Ignore save files

2023-06-01 09:20:51 +01:00 · 2023-06-01 09:20:51 +01:00 · 88c3f1c088
parent 403f5b52b1
commit 88c3f1c088
13 changed files with 275 additions and 77 deletions
--- a/.gitignore
+++ b/.gitignore
@ -7,3 +7,12 @@ experiment-power-draw/
 **/__init__.py
 **/**/__pycache__/
 **/**/__init__.py
 stp
 sav
 bayes_*
 freq_*
 *.pkl
 bay
 frq
 sav
 tmp
--- a/amd_sample_draw.py
+++ b/amd_sample_draw.py
@ -0,0 +1,96 @@
 import os
 import re
 import pickle
 import numpy as np
 from warnings import warn
 # Import-time configuration: the run settings are read from small flag files
 # in the current working directory. "frq" and "bay" each hold one integer.
 with open("frq", "r") as file:
    frq = int(file.read())
 with open("bay", "r") as file:
    bay = int(file.read())
 # A flag value of 1 selects the model type; "tmp" holds the model size.
 # NOTE(review): the frequentist branch parses the size as float, the Bayesian
 # branch as int, so a "tmp" content of 1 yields "1.0" vs "1" in the file name
 # below -- confirm this difference is intended.
 if frq == 1:
  model_t = "freq"
  with open("tmp", "r") as file:
    size = float(file.read())
 if bay == 1:
  model_t = "bayes"
  with open("tmp", "r") as file:
    size = int(file.read())
 # NOTE(review): if neither flag equals 1, model_t and size are never assigned
 # and the next line raises NameError. If both equal 1 the Bayesian values win.
 # Name of the pickle file the sampling loop writes and total_watt_consumed reads.
 pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
 print("GPU energy file config: {}".format(pickle_name))
 def get_sample_of_gpu() -> list:
     """Take one power / memory / voltage reading from ``rocm-smi``.

     Runs ``rocm-smi -P --showvoltage --showmemuse``, drops empty output lines
     and reads the values from fixed line positions of the remaining output.
     The positions assume a two-GPU report.

     Returns:
         A list ``[power, memory, voltage]`` where ``power`` is the float sum
         of the first decimal number on lines 2 and 4, ``memory`` is the int
         sum of the second integer on lines 7 and 9 (or the value of the one
         line that has two integers), and ``voltage`` is the int sum of the
         second integer on lines 13 and 14. When neither memory line has
         exactly two integers the memory entry is omitted, so the list then
         has only two items.
         NOTE(review): units are whatever rocm-smi prints -- confirm.

     Raises:
         Exception: if the first output line is one of the known driver
             failure messages.
         IndexError: if the output is empty or too short to contain the
             expected lines, or a line lacks the expected numbers.
     """
     from re import findall
     import subprocess
     # NOTE(review): both messages are nvidia-smi failure texts kept from the
     # NVIDIA sampler; confirm what rocm-smi prints on failure.
     no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
     no_version = "Failed to initialize NVML: Driver/library version mismatch"
     completed = subprocess.run(
         ['rocm-smi', '-P', '--showvoltage', '--showmemuse'],
         stdout=subprocess.PIPE)
     smi_lines = [line for line in completed.stdout.decode('utf-8').split("\n") if line]
     if not smi_lines:
         raise IndexError("rocm-smi produced no output")
     if smi_lines[0] == no_graph:
         raise Exception("It seems that no AMD GPU is installed")
     if smi_lines[0] == no_version:
         raise Exception("rocm-smi version mismatch")
     # The highest line position read below is 14.
     if len(smi_lines) < 15:
         raise IndexError(
             "rocm-smi output has {} non-empty lines; expected at least 15".format(len(smi_lines)))
     # Raw strings keep the regex escape from being read as a string escape.
     decimal = r"[0-9]*\.[0-9]*"
     integer = r"[0-9]+"
     gpuW0 = findall(decimal, smi_lines[2])
     gpuW1 = findall(decimal, smi_lines[4])
     gpuM0 = findall(integer, smi_lines[7])
     gpuM1 = findall(integer, smi_lines[9])
     gpuV0 = findall(integer, smi_lines[13])
     gpuV1 = findall(integer, smi_lines[14])
     results = [float(gpuW0[0]) + float(gpuW1[0])]
     # On these lines the first integer is skipped and the second one is the value.
     if len(gpuM0) == 2 and len(gpuM1) == 2:
         results.append(int(gpuM0[1]) + int(gpuM1[1]))
     elif len(gpuM0) == 2:
         # Converted to int so every record holds numbers, never a str.
         results.append(int(gpuM0[1]))
     elif len(gpuM1) == 2:
         results.append(int(gpuM1[1]))
     results.append(int(gpuV0[1]) + int(gpuV1[1]))
     return results
 def total_watt_consumed(path=None):
     """Sum the power reading (first field) of every sample in a sample pickle.

     Args:
         path: pickle file to read. Defaults to the module-level
             ``pickle_name`` that the sampling loop writes.

     Returns:
         The sum of the first field of each stored sample. Numeric readings
         (as produced by ``get_sample_of_gpu``) are added as they are. String
         readings such as ``"45W"`` contribute their first run of digits as an
         int. An empty sample list gives ``0``.
     """
     if path is None:
         path = pickle_name
     with open(path, 'rb') as f:
         # NOTE: pickle.load executes arbitrary code from the file; only read
         # sample files written by this program.
         samples = pickle.load(f)
     total = 0
     for sample in samples:
         reading = sample[0]
         if isinstance(reading, str):
             # Text reading: keep the leading integer part only.
             total += int(re.findall(r"\d+", reading)[0])
         else:
             total += reading
     return total
 if __name__ == '__main__':
     # Sampling loop: take a GPU reading, append it to the in-memory history
     # and rewrite the whole history to pickle_name after every reading.
     # The loop has no exit condition; it runs until the process is stopped.
     dataDump = []
     while True:
         try:
             dataDump.append(get_sample_of_gpu())
             # The with-block closes the file itself. An extra close in a
             # finally clause would raise NameError whenever the very first
             # sample fails before the file variable is ever bound.
             with open(pickle_name, 'wb') as f:
                 pickle.dump(dataDump, f)
         except EOFError:
             warn('Pickle ran out of space')
             # NOTE(review): pickle_name was already built from the old size,
             # so this increment does not change the output file.
             size += 0.01
--- a/arguments.py
+++ b/arguments.py
@ -0,0 +1,20 @@
 import argparse
 from argparse import ArgumentParser
 # Construct an argument parser
 all_args = argparse.ArgumentParser()
 def makeArguments(arguments: ArgumentParser) -> dict:
     """Register the experiment options on *arguments* and parse the command line.

     Args:
         arguments: the parser to add the options to. The options are added to
             this parser, not to the module-level one, so a fresh parser can
             be passed on every call.

     Returns:
         The parsed command line as a dict with keys ``b``, ``f``,
         ``EarlyStopping``, ``EnergyBound``, ``AccuracyBound`` and ``Save``.
         ``b`` and ``f`` are ints from 1 to 6, or ``None`` when not given;
         the others are booleans.
     """
     arguments.add_argument("-b", "--Bayesian", action="store", dest="b",
         type=int, choices=range(1,7), help="Bayesian model of size x")
     arguments.add_argument("-f", "--Frequentist", action="store", dest="f",
         type=int, choices=range(1,7), help="Frequentist model of size x")
     arguments.add_argument("-E", "--EarlyStopping", action="store_true",
         help="Early Stopping criteria")
     arguments.add_argument("-e", "--EnergyBound", action="store_true",
         help="Energy Bound criteria")
     arguments.add_argument("-a", "--AccuracyBound", action="store_true",
         help="Accuracy Bound criteria")
     arguments.add_argument("-s", "--Save", action="store_true", help="Save model")
     return vars(arguments.parse_args())
--- a/config_bayesian.py
+++ b/config_bayesian.py
@ -10,7 +10,10 @@ priors={
    'posterior_rho_initial': (-5, 0.1),  # (mean, std) normal_
 }
-n_epochs = 200
+n_epochs = 100
 sens = 1e-9
 energy_thrs = 100000
 acc_thrs = 0.99
 lr_start = 0.001
 num_workers = 4
 valid_size = 0.2
@ -27,16 +30,16 @@ if bay == 1:
    with open("tmp", "r") as file:
        wide = int(file.read())
-    if os.path.exists("tmp"):
+    #if os.path.exists("tmp"):
-        os.remove("tmp")
+    #    os.remove("tmp")
-    else:
+    #else:
-        raise Exception("Tmp file not found")
+    #    raise Exception("Tmp file not found")
    print("Bayesian configured to run with width: {}".format(wide))
-if os.path.exists("bay"):
+#if os.path.exists("bay"): 
-    os.remove("bay")
+#    os.remove("bay")
-else:
+#else:
-    raise Exception("Bay file not found")
+#    raise Exception("Bay file not found")
--- a/config_frequentist.py
+++ b/config_frequentist.py
@ -2,6 +2,9 @@
 import os
 n_epochs = 100
 sens = 1e-9
 energy_thrs = 100000
 acc_thrs = 0.99
 lr = 0.001
 num_workers = 4
 valid_size = 0.2
@ -23,8 +26,7 @@ if frq == 1:
-if os.path.exists("frq"):
+#if os.path.exists("frq"):
-    os.remove("frq")
+#    os.remove("frq")
-else:
+#else:
-    raise Exception("Frq file not found")
+#    raise Exception("Frq file not found")
--- a/cpu_watt.sh
+++ b/cpu_watt.sh
@ -1,4 +1,5 @@
-#!/bin/bash
+#!/bin/env bash
-powerstat -z 0.5 1000000 > $1
+#powerstat -z 0.5 1000000 > $1
 powerstat -D > $1
--- a/gpu_sample_draw.py
+++ b/gpu_sample_draw.py
@ -18,7 +18,7 @@ if frq == 1:
 if bay == 1:
  model_t = "bayes"
  with open("tmp", "r") as file:
-    wide = int(file.read())
+    size = int(file.read())
 pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
 #print("GPU energy file config: {}".format(pickle_name))
@ -33,12 +33,13 @@ def get_sample_of_gpu():
  smi_string = run(['nvidia-smi'], stdout=subprocess.PIPE)
  smi_string = smi_string.stdout.decode('utf-8')
  smi_string = smi_string.split("\n")
  smi_string = list(filter(lambda x: x, smi_string))
  if smi_string[0] ==  no_graph:
    raise Exception("It seems that no NVIDIA GPU is installed")
  elif smi_string[0] ==  no_version:
    raise Exception("nvidia-smi version mismatch")
  else:
-    return findall("[0-9]*MiB | [0-9]*W",smi_string[9])
+    return findall("[0-9]*MiB | [0-9]*W",smi_string[6])
    #for l in smi_string:
        #temp = findall("[0-9]*MiB | [0-9]*W",l)
        #if temp:
--- a/main_bayesian.py
+++ b/main_bayesian.py
@ -12,13 +12,14 @@ import config_bayesian as cfg
 from datetime import datetime
 from torch.nn import functional as F
 from torch.optim import Adam, lr_scheduler
-import gpu_sample_draw
+import amd_sample_draw
 from models.BayesianModels.BayesianLeNet import BBBLeNet
 from models.BayesianModels.BayesianAlexNet import BBBAlexNet
 from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
 from stopping_crit import earlyStopping, energyBound, accuracyBound
 # CUDA settings
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
 def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
    if (net_type == 'lenet'):
@ -114,12 +115,17 @@ def run(dataset, net_type):
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)
    with open("stp", "r") as file:
        stp = int(file.read())
    with open("sav", "r") as file:
        sav = int(file.read())
    criterion = metrics.ELBO(len(trainset)).to(device)
    optimizer = Adam(net.parameters(), lr=lr_start)
    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
    #valid_loss_max = np.Inf
-    #early_stop = []
+    if stp == 2:
-    #thrs=1e-9
+        early_stop = []
    train_data = []
    for epoch in range(n_epochs):  # loop over the dataset multiple times
@ -132,22 +138,25 @@ def run(dataset, net_type):
        print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f} \ttrain_kl_div: {:.4f}'.format(
            epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl))
-        #early_stop.append(valid_acc)
+        if stp == 2:
-        #if epoch % 4 == 0 and epoch > 0:
+            print('Using early stopping')
-            #print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs))
+            if earlyStopping(early_stop,train_acc,cfg.sens) == None:
-            #if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs:
+                break
-                #break
+        elif stp == 3: 
-            #early_stop = []
+            print('Using energy bound')
            if energyBound(cfg.energy_thrs) == None:
                break
        elif stp == 4:
            print('Using accuracy bound')
            if accuracyBound(cfg.acc_thrs) == None:
                break
        else:
            print('Training for {} epochs'.format(cfg.n_epochs))
-        if train_acc >= 0.50:
+        if sav == 1:
-            break
+            # save model when finished
-
+            if epoch == n_epochs:
-        #if gpu_sample_draw.total_watt_consumed() > 100000:
+                torch.save(net.state_dict(), ckpt_name)
            #break
        # save model on last epoch
        #if epoch == (n_epochs-1):
            #torch.save(net.state_dict(), ckpt_name)
    with open("bayes_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
      pickle.dump(train_data, f)
--- a/main_frequentist.py
+++ b/main_frequentist.py
@ -8,17 +8,17 @@ import metrics
 import argparse
 import numpy as np
 import torch.nn as nn
-import gpu_sample_draw
+import amd_sample_draw
 from datetime import datetime
 import config_frequentist as cfg
 from torch.optim import Adam, lr_scheduler
 from models.NonBayesianModels.LeNet import LeNet
 from models.NonBayesianModels.AlexNet import AlexNet
 from stopping_crit import earlyStopping, energyBound, accuracyBound
 from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC
 # CUDA settings
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
 def getModel(net_type, inputs, outputs,wide=cfg.wide):
@ -81,12 +81,17 @@ def run(dataset, net_type):
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)
    with open("stp", "r") as file:
        stp = int(file.read())
    with open("sav", "r") as file:
        sav = int(file.read())
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(net.parameters(), lr=lr)
    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
    #valid_loss_min = np.Inf
-    #early_stop = []
+    if stp == 2:
-    #thrs=1e-9
+        early_stop = []
    train_data = []
    for epoch in range(1, n_epochs+1):
@ -101,22 +106,22 @@ def run(dataset, net_type):
        print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f}'.format(
            epoch, train_loss, train_acc, valid_loss, valid_acc))
-        #early_stop.append(valid_acc)
+        if stp == 2:
-        #if epoch % 4 == 0 and epoch > 0:
+            print('Using early stopping')
-        #    print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs))
+            earlyStopping(early_stop,train_acc,cfg.sens)
-        #    if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs:
+        elif stp == 3: 
-        #        break
+            print('Using energy bound')
-        #    early_stop = []
+            energyBound(cfg.energy_thrs)
        elif stp == 4:
            print('Using accuracy bound')
            accuracyBound(cfg.acc_thrs)
        else:
            print('Training for {} epochs'.format(cfg.n_epochs))
-        #if train_acc >= 0.99:
+        if sav == 1:
-        #    break
+            # save model when finished
-
+            if epoch == n_epochs:
-        #if gpu_sample_draw.total_watt_consumed() > 100000:
+                torch.save(net.state_dict(), ckpt_name)
        #    break
        # save model when finished
        #if epoch == n_epochs:
            #torch.save(net.state_dict(), ckpt_name)
    with open("freq_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
      pickle.dump(train_data, f)
--- a/mem_free.sh
+++ b/mem_free.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/env bash
 while true
--- a/read_pickle.py
+++ b/read_pickle.py
@ -0,0 +1,17 @@
 import pickle
 def _read_all_pickles(path):
     """Return every object pickled back-to-back in *path*, in file order."""
     loaded = []
     with open(path, "rb") as stream:
         while True:
             try:
                 loaded.append(pickle.load(stream))
             except EOFError:
                 # pickle.load reports the end of the file with EOFError.
                 return loaded
 # GPU samples first, then the per-epoch experiment data.
 gpu_data = _read_all_pickles("bayes_wattdata_1.pkl")
 exp_data = _read_all_pickles("bayes_exp_data_1.pkl")
--- a/run_service.py
+++ b/run_service.py
@ -1,16 +1,9 @@
-import argparse
+import arguments
 from time import sleep
 import subprocess as sub
 from arguments import makeArguments
-# Construct an argument parser
+args = makeArguments(arguments.all_args)
 all_args = argparse.ArgumentParser()
 all_args.add_argument("-b", "--Value1", action="store", dest="b",
   type=int, choices=range(1,6), help="Bayesian model of size x")
 all_args.add_argument("-f", "--Value2", action="store", dest="f",
   type=int, choices=range(1,6), help="Frequentist model of size x")
 args = vars(all_args.parse_args())
 check = list(args.values())
 if all(v is None for v in check):
@ -29,6 +22,26 @@ wide = args["f"] or args["b"]
 with open("tmp", "w") as file:
    file.write(str(wide))
 if args['EarlyStopping']:
    with open("stp", "w") as file:
        file.write('2')
 elif args['EnergyBound']:
    with open("stp", "w") as file:
        file.write('3')
 elif args['AccuracyBound']:
    with open("stp", "w") as file:
        file.write('4')
 else:
    with open("stp", "w") as file:
        file.write('1')
 if args['Save']:
    with open("sav", "w") as file:
        file.write('1')
 else:
    with open("sav", "w") as file:
        file.write('0')
 sleep(3)
@ -44,7 +57,7 @@ elif cmd[1] == "main_bayesian.py":
    cmd3 = ["./mem_free.sh", "bayes_{}_ram_use".format(wide)]
    with open("bay", "w") as file:
        file.write(str(1))
-    with open("frw", "w") as file:
+    with open("frq", "w") as file:
        file.write(str(0))
@ -52,7 +65,7 @@ path = sub.check_output(['pwd'])
 path = path.decode()
 path = path.replace('\n', '')
-#startWattCounter = 'python ' + path + '/gpu_sample_draw.py'
+startWattCounter = 'python ' + path + '/amd_sample_draw.py'
 #test = startNODE.split()
 #test.append(pythonEnd)
@ -60,11 +73,11 @@ path = path.replace('\n', '')
 #startNODE = test
-##print(startNODE)
+#print(startNODE)
-##print(startWattCounter)
+#print(startWattCounter)
 p1 = sub.Popen(cmd)
-#p2 = sub.Popen(startWattCounter.split())
+p2 = sub.Popen(startWattCounter.split())
 p3 = sub.Popen(cmd2)
 p4 = sub.Popen(cmd3)
@ -72,6 +85,6 @@ retcode = p1.wait()
 print("Return code: {}".format(retcode))
 p1.kill()
-#p2.kill()
+p2.kill()
 p3.kill()
 p4.kill()
--- a/stopping_crit.py
+++ b/stopping_crit.py
@ -0,0 +1,22 @@
 def earlyStopping(early_stopping: list, train_acc: float, sensitivity: float=1e-9, window: int=4):
     """Decide whether training has stopped improving.

     The accuracy is appended to *early_stopping* on every call. Once the list
     holds *window* values, the earlier ``window - 1`` values are compared with
     the newest one and the list is emptied in place, so the next window
     starts fresh.

     Args:
         early_stopping: caller-owned history list; mutated in place.
         train_acc: accuracy of the epoch that just finished.
         sensitivity: improvement smaller than this counts as no improvement.
         window: number of epochs per comparison window (at least 2).

     Returns:
         ``None`` when every earlier value in the window is at least
         ``train_acc - sensitivity`` (no improvement: stop training), which is
         the value callers test for. Otherwise ``True`` (keep training).

     Raises:
         ValueError: if *window* is smaller than 2.
     """
     if window < 2:
         raise ValueError("window must be at least 2")
     early_stopping.append(train_acc)
     # The window is tracked by list length, so no epoch counter is needed.
     if len(early_stopping) < window:
         return True
     floor = train_acc - sensitivity
     previous = early_stopping[:-1]
     print(", ".join(
         "Value {}: {} >= {}".format(pos, abs(acc), floor)
         for pos, acc in enumerate(previous, 1)))
     stalled = all(abs(acc) >= floor for acc in previous)
     # Clear in place: rebinding the name would leave the caller's list full.
     early_stopping.clear()
     if stalled:
         return None
     return True
 def energyBound(threshold: float=100000.0, sampler=None):
     """Check whether the recorded energy use has exceeded *threshold*.

     Args:
         threshold: stop once the recorded total is strictly greater than this.
         sampler: object or module providing ``total_watt_consumed()``.
             Defaults to the ``amd_sample_draw`` module.
             NOTE(review): pass ``gpu_sample_draw`` instead if the NVIDIA
             sampler is the one recording -- confirm the intended default.

     Returns:
         ``None`` when the total exceeds *threshold* (stop training), which is
         the value callers test for. Otherwise ``True`` (keep training).
     """
     if sampler is None:
         # Imported here, not at module level, so this module can be loaded
         # without the sampler module being importable.
         import amd_sample_draw as sampler
     if sampler.total_watt_consumed() > threshold:
         return None
     return True
 def accuracyBound(train_acc: float, threshold: float=0.99):
     """Check whether the training accuracy has reached *threshold*.

     Args:
         train_acc: accuracy of the epoch that just finished.
         threshold: accuracy at or above which training should stop.

     Returns:
         ``None`` when ``train_acc >= threshold`` (stop training), which is
         the value callers test for. Otherwise ``True`` (keep training).
     """
     if train_acc >= threshold:
         return None
     return True
`@ -1,4 +1,4 @@`
	`#!/bin/bash`	`#!/bin/env bash`


	`while true`	`while true`