Ignore save files

2023-06-01 09:20:51 +01:00 · 2023-06-01 09:20:51 +01:00 · 88c3f1c088
commit 88c3f1c088
parent 403f5b52b1
13 changed files with 275 additions and 77 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,4 +6,13 @@ experiment-power-draw/
 **/__pycache__/
 **/__init__.py
 **/**/__pycache__/
-**/**/__init__.py
+**/**/__init__.py
+# Control files written by run_service.py and data files produced by a run
+stp
+sav
+bayes_*
+freq_*
+*.pkl
+bay
+frq
+tmp
--- a/amd_sample_draw.py
+++ b/amd_sample_draw.py
@ -0,0 +1,96 @@
+import os
+import re
+import pickle
+import numpy as np
+from warnings import warn
+
+def _read_flag(path, cast=int):
+    """Return the content of control file *path* converted with *cast*."""
+    with open(path, "r") as file:
+        return cast(file.read())
+
+
+# "frq", "bay" and "tmp" are the control files run_service.py writes
+# before it launches this script.
+frq = _read_flag("frq")
+bay = _read_flag("bay")
+
+if frq == 1:
+    model_t = "freq"
+    # The frequentist size is read as a float, so it is rendered as e.g.
+    # "1.0" in the pickle name below.
+    # NOTE(review): the bayesian branch reads an int - confirm this
+    # difference is intended.
+    size = _read_flag("tmp", float)
+
+# Checked after the frequentist branch on purpose: if both flags are 1
+# the bayesian settings win, exactly as before.
+if bay == 1:
+    model_t = "bayes"
+    size = _read_flag("tmp", int)
+
+if frq != 1 and bay != 1:
+    # Previously this case surfaced as a NameError on model_t below.
+    raise RuntimeError("Neither 'frq' nor 'bay' is set to 1; no model selected")
+
+pickle_name = "{}_wattdata_{}.pkl".format(model_t, size)
+print("GPU energy file config: {}".format(pickle_name))
+
+def get_sample_of_gpu():
+    """Take one sample of power, memory use and voltage from ``rocm-smi``.
+
+    Runs ``rocm-smi -P --showvoltage --showmemuse``, drops empty output
+    lines and parses numbers from fixed line positions (presumably one
+    line per GPU for each metric - verify against the installed rocm-smi).
+
+    Returns:
+        A list ``[power, memory, voltage]``.  ``power`` is a float, the
+        other entries are ints.  The memory entry is left out when neither
+        memory line yields exactly two numbers, so the list may have only
+        two elements.
+
+    Raises:
+        Exception: if the tool prints nothing or one of the known driver
+            error messages.
+    """
+    from re import findall
+    import subprocess
+    from subprocess import run
+
+    # NOTE(review): both sentinels are NVIDIA messages; rocm-smi presumably
+    # never prints them, so these checks may be dead - confirm.
+    no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
+    no_version = "Failed to initialize NVML: Driver/library version mismatch"
+    smi_string = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE)
+    lines = [line for line in smi_string.stdout.decode('utf-8').split("\n") if line]
+    if not lines:
+        # Previously this surfaced as a bare IndexError on lines[0].
+        raise Exception("rocm-smi produced no output")
+    if lines[0] == no_graph:
+        raise Exception("It seems that no AMD GPU is installed")
+    if lines[0] == no_version:
+        raise Exception("rocm-smi version mismatch")
+
+    # Raw strings: "\." inside a normal string literal is an invalid escape.
+    decimal = r"[0-9]*\.[0-9]*"
+    integer = r"[0-9]+"
+    gpuW0 = findall(decimal, lines[2])
+    gpuW1 = findall(decimal, lines[4])
+    gpuM0 = findall(integer, lines[7])
+    gpuM1 = findall(integer, lines[9])
+    gpuV0 = findall(integer, lines[13])
+    gpuV1 = findall(integer, lines[14])
+
+    results = [float(gpuW0[0]) + float(gpuW1[0])]
+    # The second number on a memory line is the reading; the single-GPU
+    # branches now convert it to int like the two-GPU branch does.
+    if len(gpuM0) == 2 and len(gpuM1) == 2:
+        results.append(int(gpuM0[1]) + int(gpuM1[1]))
+    elif len(gpuM0) == 2:
+        results.append(int(gpuM0[1]))
+    elif len(gpuM1) == 2:
+        results.append(int(gpuM1[1]))
+    results.append(int(gpuV0[1]) + int(gpuV1[1]))
+    return results
+
+def total_watt_consumed():
+    """Return the sum of the first field of every sample in ``pickle_name``.
+
+    The first field is the power reading.  Samples written by this module
+    store it as a number; string readings are also accepted, in which case
+    the first run of digits is used, as the previous implementation did.
+    An empty sample list gives 0.
+
+    NOTE(review): the sampler rewrites the file after every sample, so a
+    read may presumably hit a partially written pickle - confirm whether
+    callers need to retry.
+    """
+    with open(pickle_name, 'rb') as f:
+        samples = pickle.load(f)
+    total = 0
+    for sample in samples:
+        reading = sample[0]
+        if isinstance(reading, str):
+            # re.findall only accepts text; numeric readings skip this.
+            reading = int(re.findall(r"\d+", reading)[0])
+        total += reading
+    return total
+
+if __name__ == '__main__':
+    # Sample in an endless loop; run_service.py starts this script as a
+    # subprocess and kills it once the training process has finished.
+    # The whole list is pickled again after every sample, so the file
+    # holds one list of samples.
+    dataDump = []
+    while True:
+        try:
+            dataDump.append(get_sample_of_gpu())
+            # The with-statement closes the file.  The former
+            # "finally: f.close()" raised NameError (hiding the real error)
+            # when the very first sample failed before f existed.
+            with open(pickle_name, 'wb') as f:
+                pickle.dump(dataDump, f)
+        except EOFError:
+            warn('Pickle ran out of space')
+            size += 0.01
+
+
+
+
+
--- a/arguments.py
+++ b/arguments.py
@ -0,0 +1,20 @@
+import argparse
+from argparse import ArgumentParser
+
+# Construct an argument parser
+all_args = argparse.ArgumentParser()
+
+
+def makeArguments(arguments: ArgumentParser) -> dict:
+    """Register the experiment options on *arguments* and parse ``sys.argv``.
+
+    The options are added to the parser that is passed in.  Earlier they
+    were always added to the module-level ``all_args``, which ignored the
+    parameter and made a second call fail with an argparse conflict.
+
+    Args:
+        arguments: parser to extend; ``run_service.py`` passes ``all_args``.
+
+    Returns:
+        The parsed options as a dict with keys ``b``, ``f``,
+        ``EarlyStopping``, ``EnergyBound``, ``AccuracyBound`` and ``Save``.
+    """
+    arguments.add_argument("-b", "--Bayesian", action="store", dest="b",
+       type=int, choices=range(1,7), help="Bayesian model of size x")
+    arguments.add_argument("-f", "--Frequentist", action="store", dest="f",
+       type=int, choices=range(1,7), help="Frequentist model of size x")
+    arguments.add_argument("-E", "--EarlyStopping", action="store_true",
+       help="Early Stopping criteria")
+    arguments.add_argument("-e", "--EnergyBound", action="store_true",
+       help="Energy Bound criteria")
+    arguments.add_argument("-a", "--AccuracyBound", action="store_true",
+       help="Accuracy Bound criteria")
+    arguments.add_argument("-s", "--Save", action="store_true", help="Save model")
+    return vars(arguments.parse_args())
--- a/config_bayesian.py
+++ b/config_bayesian.py
@ -10,7 +10,10 @@ priors={
    'posterior_rho_initial': (-5, 0.1),  # (mean, std) normal_
 }

-n_epochs = 200
+n_epochs = 100
+sens = 1e-9
+energy_thrs = 100000
+acc_thrs = 0.99
 lr_start = 0.001
 num_workers = 4
 valid_size = 0.2
@ -27,16 +30,16 @@ if bay == 1:
    with open("tmp", "r") as file:
        wide = int(file.read())

-    if os.path.exists("tmp"):
-        os.remove("tmp")
-    else:
-        raise Exception("Tmp file not found")
+    #if os.path.exists("tmp"):
+    #    os.remove("tmp")
+    #else:
+    #    raise Exception("Tmp file not found")

    print("Bayesian configured to run with width: {}".format(wide))


-if os.path.exists("bay"):
-    os.remove("bay")
-else:
-    raise Exception("Bay file not found")
-    
+#if os.path.exists("bay"): 
+#    os.remove("bay")
+#else:
+#    raise Exception("Bay file not found")
+    
--- a/config_frequentist.py
+++ b/config_frequentist.py
@ -2,6 +2,9 @@

 import os
 n_epochs = 100
+sens = 1e-9
+energy_thrs = 100000
+acc_thrs = 0.99
 lr = 0.001
 num_workers = 4
 valid_size = 0.2
@ -23,8 +26,7 @@ if frq == 1:



-if os.path.exists("frq"):
-    os.remove("frq")
-else:
-    raise Exception("Frq file not found")
-
+#if os.path.exists("frq"):
+#    os.remove("frq")
+#else:
+#    raise Exception("Frq file not found")
--- a/cpu_watt.sh
+++ b/cpu_watt.sh
@ -1,4 +1,5 @@
-#!/bin/bash
+#!/usr/bin/env bash

-powerstat -z 0.5 1000000 > $1
+#powerstat -z 0.5 1000000 > $1
+powerstat -D > "$1"

--- a/gpu_sample_draw.py
+++ b/gpu_sample_draw.py
@ -18,7 +18,7 @@ if frq == 1:
 if bay == 1:
  model_t = "bayes"
  with open("tmp", "r") as file:
-    wide = int(file.read())
+    size = int(file.read())

 pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
 #print("GPU energy file config: {}".format(pickle_name))
@ -33,12 +33,13 @@ def get_sample_of_gpu():
  smi_string = run(['nvidia-smi'], stdout=subprocess.PIPE)
  smi_string = smi_string.stdout.decode('utf-8')
  smi_string = smi_string.split("\n")
+  smi_string = list(filter(lambda x: x, smi_string))
  if smi_string[0] ==  no_graph:
    raise Exception("It seems that no NVIDIA GPU is installed")
  elif smi_string[0] ==  no_version:
    raise Exception("nvidia-smi version mismatch")
  else:
-    return findall("[0-9]*MiB | [0-9]*W",smi_string[9])
+    return findall("[0-9]*MiB | [0-9]*W",smi_string[6])
    #for l in smi_string:
        #temp = findall("[0-9]*MiB | [0-9]*W",l)
        #if temp:
--- a/main_bayesian.py
+++ b/main_bayesian.py
@ -12,13 +12,14 @@ import config_bayesian as cfg
 from datetime import datetime
 from torch.nn import functional as F
 from torch.optim import Adam, lr_scheduler
-import gpu_sample_draw
+import amd_sample_draw
 from models.BayesianModels.BayesianLeNet import BBBLeNet
 from models.BayesianModels.BayesianAlexNet import BBBAlexNet
 from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
+from stopping_crit import earlyStopping, energyBound, accuracyBound

 # CUDA settings
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

 def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
    if (net_type == 'lenet'):
@ -114,12 +115,17 @@ def run(dataset, net_type):
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)

+    with open("stp", "r") as file:
+        stp = int(file.read())
+    with open("sav", "r") as file:
+        sav = int(file.read())
+
    criterion = metrics.ELBO(len(trainset)).to(device)
    optimizer = Adam(net.parameters(), lr=lr_start)
    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
    #valid_loss_max = np.Inf
-    #early_stop = []
-    #thrs=1e-9
+    if stp == 2:
+        early_stop = []
    train_data = []
    for epoch in range(n_epochs):  # loop over the dataset multiple times

@ -132,22 +138,25 @@ def run(dataset, net_type):
        print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f} \ttrain_kl_div: {:.4f}'.format(
            epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl))

-        #early_stop.append(valid_acc)
-        #if epoch % 4 == 0 and epoch > 0:
-            #print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs))
-            #if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs:
-                #break
-            #early_stop = []
+        if stp == 2:
+            print('Using early stopping')
+            if earlyStopping(early_stop,train_acc,cfg.sens) == None:
+                break
+        elif stp == 3: 
+            print('Using energy bound')
+            if energyBound(cfg.energy_thrs) == None:
+                break
+        elif stp == 4:
+            print('Using accuracy bound')
+            if accuracyBound(cfg.acc_thrs) == None:
+                break
+        else:
+            print('Training for {} epochs'.format(cfg.n_epochs))

-        if train_acc >= 0.50:
-            break
-
-        #if gpu_sample_draw.total_watt_consumed() > 100000:
-            #break
-
-        # save model on last epoch
-        #if epoch == (n_epochs-1):
-            #torch.save(net.state_dict(), ckpt_name)
+        if sav == 1:
+            # save model when finished
+            if epoch == n_epochs:
+                torch.save(net.state_dict(), ckpt_name)

    with open("bayes_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
      pickle.dump(train_data, f)
--- a/main_frequentist.py
+++ b/main_frequentist.py
@ -8,17 +8,17 @@ import metrics
 import argparse
 import numpy as np
 import torch.nn as nn
-import gpu_sample_draw
+import amd_sample_draw
 from datetime import datetime
 import config_frequentist as cfg
 from torch.optim import Adam, lr_scheduler
 from models.NonBayesianModels.LeNet import LeNet
 from models.NonBayesianModels.AlexNet import AlexNet
+from stopping_crit import earlyStopping, energyBound, accuracyBound
 from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC

-
 # CUDA settings
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")


 def getModel(net_type, inputs, outputs,wide=cfg.wide):
@ -81,12 +81,17 @@ def run(dataset, net_type):
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)

+    with open("stp", "r") as file:
+        stp = int(file.read())
+    with open("sav", "r") as file:
+        sav = int(file.read())
+
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(net.parameters(), lr=lr)
    lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, verbose=True)
    #valid_loss_min = np.Inf
-    #early_stop = []
-    #thrs=1e-9
+    if stp == 2:
+        early_stop = []
    train_data = []
    for epoch in range(1, n_epochs+1):

@ -100,23 +105,23 @@ def run(dataset, net_type):
        train_data.append([epoch,train_loss,train_acc,valid_loss,valid_acc])
        print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f}'.format(
            epoch, train_loss, train_acc, valid_loss, valid_acc))
+        
+        if stp == 2:
+            print('Using early stopping')
+            earlyStopping(early_stop,train_acc,cfg.sens)
+        elif stp == 3: 
+            print('Using energy bound')
+            energyBound(cfg.energy_thrs)
+        elif stp == 4:
+            print('Using accuracy bound')
+            accuracyBound(cfg.acc_thrs)
+        else:
+            print('Training for {} epochs'.format(cfg.n_epochs))

-        #early_stop.append(valid_acc)
-        #if epoch % 4 == 0 and epoch > 0:
-        #    print("Value 1: {} >= {}, Value 2: {} >= {}, Value 2: {} >= {}".format(early_stop[0],valid_acc-thrs,early_stop[1],valid_acc-thrs,early_stop[2],valid_acc-thrs))
-        #    if abs(early_stop[0]) >= valid_acc-thrs and abs(early_stop[1]) >= valid_acc-thrs and abs(early_stop[2]) >= valid_acc-thrs:
-        #        break
-        #    early_stop = []
-
-        #if train_acc >= 0.99:
-        #    break
-
-        #if gpu_sample_draw.total_watt_consumed() > 100000:
-        #    break
-
-        # save model when finished
-        #if epoch == n_epochs:
-            #torch.save(net.state_dict(), ckpt_name)
+        if sav == 1:
+            # save model when finished
+            if epoch == n_epochs:
+                torch.save(net.state_dict(), ckpt_name)
    
    with open("freq_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
      pickle.dump(train_data, f)
--- a/mem_free.sh
+++ b/mem_free.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash


 while true
--- a/read_pickle.py
+++ b/read_pickle.py
@ -0,0 +1,17 @@
+import pickle
+
+def _load_all(path):
+    """Return every object pickled back-to-back in the file at *path*."""
+    objects = []
+    with open(path, "rb") as openfile:
+        while True:
+            try:
+                objects.append(pickle.load(openfile))
+            except EOFError:
+                # End of file reached: everything has been read.
+                return objects
+
+
+# GPU samples and per-epoch training records of the size-1 bayesian run.
+gpu_data = _load_all("bayes_wattdata_1.pkl")
+exp_data = _load_all("bayes_exp_data_1.pkl")
--- a/run_service.py
+++ b/run_service.py
@ -1,16 +1,9 @@
-import argparse
+import arguments
 from time import sleep
 import subprocess as sub
+from arguments import makeArguments

-# Construct an argument parser
-all_args = argparse.ArgumentParser()
-
-all_args.add_argument("-b", "--Value1", action="store", dest="b",
-   type=int, choices=range(1,6), help="Bayesian model of size x")
-all_args.add_argument("-f", "--Value2", action="store", dest="f",
-   type=int, choices=range(1,6), help="Frequentist model of size x")
-args = vars(all_args.parse_args())
-
+args = makeArguments(arguments.all_args)

 check = list(args.values())
 if all(v is None for v in check):
@ -29,6 +22,26 @@ wide = args["f"] or args["b"]
 with open("tmp", "w") as file:
    file.write(str(wide))

+# Pick the code the training scripts read back from "stp":
+# 2 = early stopping, 3 = energy bound, 4 = accuracy bound, 1 = none.
+if args['EarlyStopping']:
+    stop_code = '2'
+elif args['EnergyBound']:
+    stop_code = '3'
+elif args['AccuracyBound']:
+    stop_code = '4'
+else:
+    stop_code = '1'
+with open("stp", "w") as file:
+    file.write(stop_code)
+
+# "sav" holds 1 when the model should be saved and 0 otherwise.
+with open("sav", "w") as file:
+    file.write('1' if args['Save'] else '0')
+
 sleep(3)


@ -44,7 +57,7 @@ elif cmd[1] == "main_bayesian.py":
    cmd3 = ["./mem_free.sh", "bayes_{}_ram_use".format(wide)]
    with open("bay", "w") as file:
        file.write(str(1))
-    with open("frw", "w") as file:
+    with open("frq", "w") as file:
        file.write(str(0))


@ -52,7 +65,7 @@ path = sub.check_output(['pwd'])
 path = path.decode()
 path = path.replace('\n', '')

-#startWattCounter = 'python ' + path + '/gpu_sample_draw.py'
+startWattCounter = 'python ' + path + '/amd_sample_draw.py'

 #test = startNODE.split()
 #test.append(pythonEnd)
@ -60,11 +73,11 @@ path = path.replace('\n', '')

 #startNODE = test

-##print(startNODE)
-##print(startWattCounter)
+#print(startNODE)
+#print(startWattCounter)

 p1 = sub.Popen(cmd)
-#p2 = sub.Popen(startWattCounter.split())
+p2 = sub.Popen(startWattCounter.split())
 p3 = sub.Popen(cmd2)
 p4 = sub.Popen(cmd3)

@ -72,6 +85,6 @@ retcode = p1.wait()
 print("Return code: {}".format(retcode))

 p1.kill()
-#p2.kill()
+p2.kill()
 p3.kill()
 p4.kill()
--- a/stopping_crit.py
+++ b/stopping_crit.py
@ -0,0 +1,22 @@
+def earlyStopping(early_stopping: list, train_acc: float, sensitivity: float=1e-9):
+    """Decide whether training accuracy has stopped improving.
+
+    ``train_acc`` is appended to ``early_stopping``, the history list owned
+    by the caller.  Once the list holds four values, the three oldest are
+    compared with ``train_acc - sensitivity``.  If none of them is lower,
+    accuracy has plateaued.  Otherwise the history is emptied in place and
+    a new window of four starts.
+
+    The window is now driven by the length of the history.  The previous
+    code read an undefined ``epoch`` name and rebound the local list
+    instead of clearing the caller's.
+
+    Returns:
+        ``None`` when training should stop (callers test ``== None``),
+        ``True`` when it should continue.
+    """
+    early_stopping.append(train_acc)
+    if len(early_stopping) < 4:
+        return True
+    floor = train_acc - sensitivity
+    previous = early_stopping[:3]
+    print("Value 1: {} >= {}, Value 2: {} >= {}, Value 3: {} >= {}".format(
+        previous[0], floor, previous[1], floor, previous[2], floor))
+    if all(abs(acc) >= floor for acc in previous):
+        return None
+    # clear() empties the caller's list; assigning [] would only rebind
+    # the local name.
+    early_stopping.clear()
+    return True
+        
+
+def energyBound(threshold: float=100000.0):
+    """Decide whether the summed GPU power samples exceed ``threshold``.
+
+    Returns:
+        ``None`` when the total is above ``threshold`` and training should
+        stop (callers test ``== None``), ``True`` otherwise.
+    """
+    # Imported here because this file has no import section; the name used
+    # before (gpu_sample_draw) was never imported and raised NameError.
+    # amd_sample_draw is the sampler module the training scripts import.
+    # NOTE(review): confirm this is the sampler whose pickle should be read.
+    import amd_sample_draw
+    if amd_sample_draw.total_watt_consumed() > threshold:
+        return None
+    return True
+
+
+def accuracyBound(train_acc: float, threshold: float=0.99):
+    """Decide whether ``train_acc`` has reached ``threshold``.
+
+    Returns:
+        ``None`` when ``train_acc >= threshold`` and training should stop
+        (callers test ``== None``), ``True`` otherwise.  Before, ``None``
+        was returned on every path, so the two cases were
+        indistinguishable.
+    """
+    if train_acc >= threshold:
+        return None
+    return True