Solved killing pipes, renewed how training sampling works

2023-06-28 17:02:56 +01:00 · 2023-06-28 17:02:56 +01:00 · 7fa9a14303
commit 7fa9a14303
parent 44272d52a7
13 changed files with 132 additions and 203 deletions
--- a/.gitignore
+++ b/.gitignore
@ -13,6 +13,7 @@ bayes_*
 times_*
 freq_*
 *.pkl
+*.txt
 bay
 frq
 sav
--- a/amd_sample_draw.py
+++ b/amd_sample_draw.py
@ -1,86 +1,44 @@
-import os
-import re
 import pickle
-import numpy as np
 from warnings import warn
+from gpu_power_func import get_sample_of_gpu

-with open("frq", "r") as file:
-    frq = int(file.read())
-
-with open("bay", "r") as file:
-    bay = int(file.read())
-
-if frq == 1:
-  model_t = "freq"
-  with open("tmp", "r") as file:
-    size = float(file.read())
-
-if bay == 1:
-  model_t = "bayes"
-  with open("tmp", "r") as file:
-    size = int(file.read())
-
-pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
-print("GPU energy file config: {}".format(pickle_name))
-
-def get_sample_of_gpu():
-  from re import sub, findall
-  import subprocess
-  from subprocess import run
-
-  no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
-  no_version = "Failed to initialize NVML: Driver/library version mismatch"
-  smi_string = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE)
-  smi_string = smi_string.stdout.decode('utf-8')
-  smi_string = smi_string.split("\n")
-  smi_string = list(filter(lambda x: x, smi_string))
-  if smi_string[0] ==  no_graph:
-    raise Exception("It seems that no AMD GPU is installed")
-  elif smi_string[0] ==  no_version:
-    raise Exception("rocm-smi version mismatch")
-  else:
-    results= []
-    gpuW0 = findall("[0-9]*\.[0-9]*",smi_string[2]) 
-    gpuW1 = findall("[0-9]*\.[0-9]*",smi_string[4])
-    gpuM0 = findall("[0-9]+",smi_string[7]) 
-    gpuM1 = findall("[0-9]+",smi_string[9])
-    gpuV0 = findall("[0-9]+",smi_string[13]) 
-    gpuV1 = findall("[0-9]+",smi_string[14])
-    results.append(float(gpuW0[0]) + float(gpuW1[0]))
-    if len(gpuM0) == 2 and len(gpuM1) == 2:
-      results.append(int(gpuM0[1]) + int(gpuM1[1]))
-    elif len(gpuM0) == 2:
-      results.append(gpuM0[1])
-    elif len(gpuM1) == 2:
-      results.append(gpuM1[1])
-    results.append(int(gpuV0[1]) + int(gpuV1[1]))
-    return results
-    #for l in smi_string:
-        #temp = findall("[0-9]*MiB | [0-9]*W",l)
-        #if temp:
-           #return temp
-
-def total_watt_consumed():
-  with (open(pickle_name, "rb")) as file:
+with (open("configuration.pkl", "rb")) as file:
    while True:
        try:
-              x = pickle.load(file)
+            cfg = pickle.load(file)
        except EOFError:
            break
-  x = np.array(x)
-  x = x[:,0]
-  y = [float(re.findall("\d+.\d+",xi)[0]) for xi in x]
-  return sum(y)
+
+#with open("frq", "r") as file:
+#    frq = int(file.read())
+
+#with open("bay", "r") as file:
+#    bay = int(file.read())
+
+#if frq == 1:
+#  model_t = "freq"
+#  with open("tmp", "r") as file:
+#    size = float(file.read())
+
+#if bay == 1:
+#  model_t = "bayes"
+#  with open("tmp", "r") as file:
+#    size = int(file.read())
+
+#pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
+#print("GPU energy file config: {}".format(pickle_name))
+
+#print(cfg)
+

 if __name__ == '__main__':
  dataDump = []
  #var = True
  #pickling_on = open("wattdata.pickle","wb")
  while True:
-    #from run_service import retcode
    try:
      dataDump.append(get_sample_of_gpu())
-      with open(pickle_name, 'wb') as f:
+      with open(cfg["pickle_path"], 'wb') as f:
        pickle.dump(dataDump, f)
    except EOFError:
      warn('Pickle ran out of space')
--- a/arguments.py
+++ b/arguments.py
@ -17,4 +17,6 @@ def makeArguments(arguments: ArgumentParser) -> dict:
    all_args.add_argument("-a", "--AccuracyBound", action="store_true",
    help="Accuracy Bound criteria")
    all_args.add_argument("-s", "--Save", action="store_true", help="Save model")
+    all_args.add_argument('--net_type', default='lenet', type=str, help='model = [lenet/AlexNet/3Conv3FC]')
+    all_args.add_argument('--dataset', default='CIFAR10', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
    return vars(all_args.parse_args())
--- a/config_bayesian.py
+++ b/config_bayesian.py
@ -1,45 +0,0 @@
-############### Configuration file for Bayesian ###############
-
-import os
-layer_type = 'lrt'  # 'bbb' or 'lrt'
-activation_type = 'softplus'  # 'softplus' or 'relu'
-priors={
-    'prior_mu': 0,
-    'prior_sigma': 0.1,
-    'posterior_mu_initial': (0, 0.1),  # (mean, std) normal_
-    'posterior_rho_initial': (-5, 0.1),  # (mean, std) normal_
-}
-
-n_epochs = 100
-sens = 1e-9
-energy_thrs = 100000
-acc_thrs = 0.99
-lr_start = 0.001
-num_workers = 4
-valid_size = 0.2
-batch_size = 256
-train_ens = 1
-valid_ens = 1
-beta_type = 0.1  # 'Blundell', 'Standard', etc. Use float for const value
-
-
-with open("bay", "r") as file:
-    bay = int(file.read())
-
-if bay == 1:
-    with open("tmp", "r") as file:
-        wide = int(file.read())
-
-    #if os.path.exists("tmp"):
-    #    os.remove("tmp")
-    #else:
-    #    raise Exception("Tmp file not found")
-
-    print("Bayesian configured to run with width: {}".format(wide))
-
-
-#if os.path.exists("bay"): 
-#    os.remove("bay")
-#else:
-#    raise Exception("Bay file not found")
-    
--- a/config_frequentist.py
+++ b/config_frequentist.py
@ -1,32 +0,0 @@
-############### Configuration file for Frequentist ###############
-
-import os
-n_epochs = 100
-sens = 1e-9
-energy_thrs = 10000
-acc_thrs = 0.99
-lr = 0.001
-num_workers = 4
-valid_size = 0.2
-batch_size = 256
-
-with open("frq", "r") as file:
-    frq = int(file.read())
-
-if frq == 1:
-    with open("tmp", "r") as file:
-        wide = int(file.read())
-
-    if os.path.exists("tmp"):
-        os.remove("tmp")
-    else:
-        raise Exception("Tmp file not found")
-
-    print("Frequentist configured to run with width: {}".format(wide))
-
-
-
-#if os.path.exists("frq"):
-#    os.remove("frq")
-#else:
-#    raise Exception("Frq file not found")
--- a/gpu_power_func.py
+++ b/gpu_power_func.py
@ -0,0 +1,54 @@
+import os
+import re
+import pickle
+import numpy as np
+
+
+def get_sample_of_gpu():
+  """Take one reading from `rocm-smi` and return it as a short list.
+
+  Runs `rocm-smi -P --showvoltage --showmemuse`, drops empty output lines and
+  picks numbers out of fixed line positions of what remains.
+
+  Returns a list built in this order:
+    * the sum of the first decimal number found on lines 2 and 4,
+    * (only if at least one of lines 7 / 9 holds exactly two integers) the
+      second integer of those lines: summed as an int when both qualify,
+      otherwise the single value is appended as the raw string,
+    * the sum of the second integer found on lines 13 and 14.
+  Judging by the flags and variable names these are presumably power,
+  memory use and voltage of two GPUs -- TODO confirm against real output.
+
+  Raises Exception when the first output line equals one of the two known
+  failure messages; IndexError if the output has fewer lines or numbers
+  than the fixed positions below expect.
+  """
+  from re import sub, findall
+  import subprocess
+  from subprocess import run
+
+  # NOTE(review): both messages are worded for NVIDIA tooling; verify that
+  # rocm-smi can actually print them, otherwise these checks never fire.
+  no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
+  no_version = "Failed to initialize NVML: Driver/library version mismatch"
+  smi_string = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE)
+  smi_string = smi_string.stdout.decode('utf-8')
+  smi_string = smi_string.split("\n")
+  # Keep only non-empty lines, so the indices below count printed lines only.
+  smi_string = list(filter(lambda x: x, smi_string))
+  if smi_string[0] ==  no_graph:
+    raise Exception("It seems that no AMD GPU is installed")
+  elif smi_string[0] ==  no_version:
+    raise Exception("rocm-smi version mismatch")
+  else:
+    results= []
+    # Decimal numbers (digits around a dot) from lines 2 and 4.
+    gpuW0 = findall("[0-9]*\.[0-9]*",smi_string[2]) 
+    gpuW1 = findall("[0-9]*\.[0-9]*",smi_string[4])
+    # All integer runs from lines 7 / 9 and 13 / 14; index 1 is used below,
+    # presumably because index 0 is the GPU number in the line prefix.
+    gpuM0 = findall("[0-9]+",smi_string[7]) 
+    gpuM1 = findall("[0-9]+",smi_string[9])
+    gpuV0 = findall("[0-9]+",smi_string[13]) 
+    gpuV1 = findall("[0-9]+",smi_string[14])
+    results.append(float(gpuW0[0]) + float(gpuW1[0]))
+    if len(gpuM0) == 2 and len(gpuM1) == 2:
+      results.append(int(gpuM0[1]) + int(gpuM1[1]))
+    # NOTE(review): the two fallbacks append a str, not an int, and when
+    # neither line matches nothing is appended, so the result length varies.
+    elif len(gpuM0) == 2:
+      results.append(gpuM0[1])
+    elif len(gpuM1) == 2:
+      results.append(gpuM1[1])
+    results.append(int(gpuV0[1]) + int(gpuV1[1]))
+    return results
+    # Unreachable leftover of an earlier line-scanning approach.
+    #for l in smi_string:
+        #temp = findall("[0-9]*MiB | [0-9]*W",l)
+        #if temp:
+           #return temp
+
+def total_watt_consumed(pickle_name):
+  """Sum the first field of every GPU sample stored in a pickle file.
+
+  pickle_name: path of a file holding one or more pickled lists of samples.
+  When several objects are stored back to back only the last one is used.
+  Each sample is a sequence whose first entry is either a number or a string
+  containing one (for example "35.2 W").
+
+  Returns the sum of those first entries as a float (0.0 for no samples).
+
+  Raises EOFError when the file holds no complete pickle, e.g. because the
+  sampler is rewriting it at this very moment; callers may simply retry.
+  """
+  samples = None
+  # NOTE: pickle is only safe on files written by this project itself.
+  with open(pickle_name, "rb") as file:
+      while True:
+          try:
+              samples = pickle.load(file)
+          except EOFError:
+              break
+  if samples is None:
+      raise EOFError("no pickled samples found in {}".format(pickle_name))
+  # Raw string avoids the invalid "\d" escape; the fraction part is optional
+  # so plain integers such as "35W" are accepted too.
+  number = re.compile(r"\d+(?:\.\d+)?")
+  total = 0.0
+  for sample in samples:
+      reading = sample[0]
+      if isinstance(reading, (int, float)):
+          # Numeric readings need no parsing; feeding them to the regex
+          # would raise TypeError.
+          total += reading
+      else:
+          total += float(number.findall(str(reading))[0])
+  return total
--- a/gpu_sample_draw.py
+++ b/gpu_sample_draw.py
@ -4,24 +4,6 @@ import pickle
 import numpy as np
 from warnings import warn

-with open("frq", "r") as file:
-    frq = int(file.read())
-
-with open("bay", "r") as file:
-    bay = int(file.read())
-
-if frq == 1:
-  model_t = "freq"
-  with open("tmp", "r") as file:
-    size = float(file.read())
-
-if bay == 1:
-  model_t = "bayes"
-  with open("tmp", "r") as file:
-    size = int(file.read())
-
-pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
-#print("GPU energy file config: {}".format(pickle_name))

 def get_sample_of_gpu():
  from re import sub, findall
@ -45,6 +27,7 @@ def get_sample_of_gpu():
        #if temp:
           #return temp

+
 def total_watt_consumed():
    with open(pickle_name, 'rb') as f:
        x = pickle.load(f)
@ -53,12 +36,12 @@ def total_watt_consumed():
    y = [int(re.findall("\d+",xi)[0]) for xi in x]
    return sum(y)

+
 if __name__ == '__main__':
  dataDump = []
  #var = True
  #pickling_on = open("wattdata.pickle","wb")
  while True:
-    #from run_service import retcode
    try:
      dataDump.append(get_sample_of_gpu())
      with open(pickle_name, 'wb') as f:
--- a/main_bayesian.py
+++ b/main_bayesian.py
@ -6,24 +6,30 @@ import utils
 import torch
 import pickle
 import metrics
-import argparse
 import numpy as np
-import amd_sample_draw
-import config_bayesian as cfg
 from datetime import datetime
 from torch.nn import functional as F
 from torch.optim import Adam, lr_scheduler
+from gpu_power_func import total_watt_consumed
 from models.BayesianModels.BayesianLeNet import BBBLeNet
 from models.BayesianModels.BayesianAlexNet import BBBAlexNet
 from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
 from stopping_crit import earlyStopping, energyBound, accuracyBound

+with (open("configuration.pkl", "rb")) as file:
+    while True:
+        try:
+            cfg = pickle.load(file)
+        except EOFError:
+            break
+
+
 # CUDA settings
 device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

 def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
    if (net_type == 'lenet'):
-        return BBBLeNet(outputs, inputs, priors, layer_type, activation_type,wide=cfg.wide)
+        return BBBLeNet(outputs, inputs, priors, layer_type, activation_type,wide=cfg["model"]["size"])
    elif (net_type == 'alexnet'):
        return BBBAlexNet(outputs, inputs, priors, layer_type, activation_type)
    elif (net_type == '3conv3fc'):
@ -91,18 +97,18 @@ def validate_model(net, criterion, validloader, num_ens=1, beta_type=0.1, epoch=
 def run(dataset, net_type):

    # Hyper Parameter settings
-    layer_type = cfg.layer_type
-    activation_type = cfg.activation_type
-    priors = cfg.priors
+    layer_type = cfg["model"]["layer_type"]
+    activation_type = cfg["model"]["activation_type"]
+    priors = cfg["model"]["priors"]

-    train_ens = cfg.train_ens
-    valid_ens = cfg.valid_ens
-    n_epochs = cfg.n_epochs
-    lr_start = cfg.lr_start
-    num_workers = cfg.num_workers
-    valid_size = cfg.valid_size
-    batch_size = cfg.batch_size
-    beta_type = cfg.beta_type
+    train_ens = cfg["model"]["train_ens"]
+    valid_ens = cfg["model"]["valid_ens"]
+    n_epochs = cfg["model"]["n_epochs"]
+    lr_start = cfg["model"]["lr"]
+    num_workers = cfg["model"]["num_workers"]
+    valid_size = cfg["model"]["valid_size"]
+    batch_size = cfg["model"]["batch_size"]
+    beta_type = cfg["model"]["beta_type"]

    trainset, testset, inputs, outputs = data.getDataset(dataset)
    train_loader, valid_loader, test_loader = data.getDataloader(
@ -110,15 +116,13 @@ def run(dataset, net_type):
    net = getModel(net_type, inputs, outputs, priors, layer_type, activation_type).to(device)

    ckpt_dir = f'checkpoints/{dataset}/bayesian'
-    ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}_{layer_type}_{activation_type}_{cfg.wide}.pt'
+    ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}_{layer_type}_{activation_type}_{cfg["model"]["size"]}.pt'

    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)

-    with open("stp", "r") as file:
-        stp = int(file.read())
-    with open("sav", "r") as file:
-        sav = int(file.read())
+    stp = cfg["stopping_crit"]
+    sav = cfg["save"]

    criterion = metrics.ELBO(len(trainset)).to(device)
    optimizer = Adam(net.parameters(), lr=lr_start)
@ -139,19 +143,19 @@ def run(dataset, net_type):
            epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl))

        if stp == 2:
-            #print('Using early stopping')
-            if earlyStopping(early_stop,train_acc,epoch,cfg.sens) == 1:
+            print('Using early stopping')
+            if earlyStopping(early_stop,valid_acc,epoch,cfg["model"]["sens"]) == 1:
                break
        elif stp == 3: 
-            #print('Using energy bound')
-            if energyBound(cfg.energy_thrs) == 1:
+            print('Using energy bound')
+            if energyBound(cfg["model"]["energy_thrs"]) == 1:
                break
        elif stp == 4:
-            #print('Using accuracy bound')
-            if accuracyBound(cfg.acc_thrs) == 1:
+            print('Using accuracy bound')
+            if accuracyBound(train_acc,cfg["model"]["acc_thrs"]) == 1:  # cfg is a dict here, not a module
                break
        else:
-            print('Training for {} epochs'.format(cfg.n_epochs))
+            print('Training for {} epochs'.format(cfg["model"]["n_epochs"]))

        if sav == 1:
            # save model when finished
@ -159,18 +163,14 @@ def run(dataset, net_type):
                torch.save(net.state_dict(), ckpt_name)


-    with open("bayes_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
+    with open("bayes_exp_data_"+str(cfg["model"]["size"])+".pkl", 'wb') as f:
      pickle.dump(train_data, f)

 if __name__ == '__main__':
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Initial Time =", current_time)
-    parser = argparse.ArgumentParser(description = "PyTorch Bayesian Model Training")
-    parser.add_argument('--net_type', default='lenet', type=str, help='model')
-    parser.add_argument('--dataset', default='CIFAR10', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
-    args = parser.parse_args()
-    run(args.dataset, args.net_type)
+    run(cfg["data"], cfg["model"]["net_type"])
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Final Time =", current_time)
--- a/main_frequentist.py
+++ b/main_frequentist.py
@ -116,7 +116,7 @@ def run(dataset, net_type):
                break
        elif stp == 4:
            #print('Using accuracy bound')
-            if accuracyBound(train_acc,0.70) == 1:
+            if accuracyBound(train_acc,cfg.acc_thrs) == 1:
                break
        else:
            print('Training for {} epochs'.format(cfg.n_epochs))
@ -136,7 +136,7 @@ if __name__ == '__main__':
    print("Initial Time =", current_time)
    parser = argparse.ArgumentParser(description = "PyTorch Frequentist Model Training")
    parser.add_argument('--net_type', default='lenet', type=str, help='model')
-    parser.add_argument('--dataset', default='CIFAR10', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
+    parser.add_argument('--dataset', default='MNIST', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
    args = parser.parse_args()
    run(args.dataset, args.net_type)
    now = datetime.now()
--- a/radeontop.sh
+++ b/radeontop.sh
@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# Dump radeontop samples for the card on PCI bus 08 into the file named by $1.
+radeontop -b 08 -d - > "$1"
--- a/read_pickle.py
+++ b/read_pickle.py
@ -1,7 +1,7 @@
 import pickle

 gpu_data = []
-with (open("freq_wattdata_1.0.pkl", "rb")) as openfile:
+with (open("bayesian_wattdata_3.pkl", "rb")) as openfile:
    while True:
        try:
            gpu_data = pickle.load(openfile)
--- a/stopping_crit.py
+++ b/stopping_crit.py
@ -1,6 +1,13 @@
-import amd_sample_draw
+import pickle
 from time import sleep
+from gpu_power_func import total_watt_consumed

+with (open("configuration.pkl", "rb")) as file:
+    while True:
+        try:
+            cfg = pickle.load(file)
+        except EOFError:
+            break

 def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivity: float=1e-9):
    early_stopping.append(train_acc)
@ -20,16 +27,17 @@ def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivit

 def energyBound(threshold: float=100000.0):
    try:
-        energy = amd_sample_draw.total_watt_consumed()
+        energy = total_watt_consumed(cfg["pickle_path"])
    except Exception as e:
        sleep(3)
-        energy = amd_sample_draw.total_watt_consumed()
+        energy = total_watt_consumed(cfg["pickle_path"])
    print("Energy used: {}".format(energy))
    if energy > threshold:
        print("Energy bound achieved")
        return 1
    return 0

+
 def accuracyBound(train_acc: float, threshold: float=0.99):
    if train_acc >= threshold:
        print("Accuracy bound achieved")
--- a/utils.py
+++ b/utils.py
@ -3,9 +3,6 @@ import torch
 import numpy as np
 from torch.nn import functional as F

-import config_bayesian as cfg
-
-
 # cifar10 classes
 cifar10_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                   'dog', 'frog', 'horse', 'ship', 'truck']