Solved killing pipes, renewed how training sampling works

2023-06-28 17:02:56 +01:00 · 2023-06-28 17:02:56 +01:00 · 7fa9a14303
commit 7fa9a14303
parent 44272d52a7
13 changed files with 132 additions and 203 deletions
--- a/.gitignore
+++ b/.gitignore
@ -13,6 +13,7 @@ bayes_*
 times_*
 freq_*
 *.pkl
 *.txt
 bay
 frq
 sav
--- a/amd_sample_draw.py
+++ b/amd_sample_draw.py
@ -1,86 +1,44 @@
 import os
 import re
 import pickle
 import numpy as np
 from warnings import warn
 from gpu_power_func import get_sample_of_gpu
-with open("frq", "r") as file:
+with (open("configuration.pkl", "rb")) as file:
    frq = int(file.read())
 with open("bay", "r") as file:
    bay = int(file.read())
 if frq == 1:
  model_t = "freq"
  with open("tmp", "r") as file:
    size = float(file.read())
 if bay == 1:
  model_t = "bayes"
  with open("tmp", "r") as file:
    size = int(file.read())
 pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
 print("GPU energy file config: {}".format(pickle_name))
 def get_sample_of_gpu():
  from re import sub, findall
  import subprocess
  from subprocess import run
  no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
  no_version = "Failed to initialize NVML: Driver/library version mismatch"
  smi_string = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE)
  smi_string = smi_string.stdout.decode('utf-8')
  smi_string = smi_string.split("\n")
  smi_string = list(filter(lambda x: x, smi_string))
  if smi_string[0] ==  no_graph:
    raise Exception("It seems that no AMD GPU is installed")
  elif smi_string[0] ==  no_version:
    raise Exception("rocm-smi version mismatch")
  else:
    results= []
    gpuW0 = findall("[0-9]*\.[0-9]*",smi_string[2]) 
    gpuW1 = findall("[0-9]*\.[0-9]*",smi_string[4])
    gpuM0 = findall("[0-9]+",smi_string[7]) 
    gpuM1 = findall("[0-9]+",smi_string[9])
    gpuV0 = findall("[0-9]+",smi_string[13]) 
    gpuV1 = findall("[0-9]+",smi_string[14])
    results.append(float(gpuW0[0]) + float(gpuW1[0]))
    if len(gpuM0) == 2 and len(gpuM1) == 2:
      results.append(int(gpuM0[1]) + int(gpuM1[1]))
    elif len(gpuM0) == 2:
      results.append(gpuM0[1])
    elif len(gpuM1) == 2:
      results.append(gpuM1[1])
    results.append(int(gpuV0[1]) + int(gpuV1[1]))
    return results
    #for l in smi_string:
        #temp = findall("[0-9]*MiB | [0-9]*W",l)
        #if temp:
           #return temp
 def total_watt_consumed():
  with (open(pickle_name, "rb")) as file:
    while True:
        try:
-              x = pickle.load(file)
+            cfg = pickle.load(file)
        except EOFError:
            break
-  x = np.array(x)
+
-  x = x[:,0]
+#with open("frq", "r") as file:
-  y = [float(re.findall("\d+.\d+",xi)[0]) for xi in x]
+#    frq = int(file.read())
-  return sum(y)
+
 #with open("bay", "r") as file:
 #    bay = int(file.read())
 #if frq == 1:
 #  model_t = "freq"
 #  with open("tmp", "r") as file:
 #    size = float(file.read())
 #if bay == 1:
 #  model_t = "bayes"
 #  with open("tmp", "r") as file:
 #    size = int(file.read())
 #pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
 #print("GPU energy file config: {}".format(pickle_name))
 #print(cfg)
 if __name__ == '__main__':
  dataDump = []
  #var = True
  #pickling_on = open("wattdata.pickle","wb")
  while True:
    #from run_service import retcode
    try:
      dataDump.append(get_sample_of_gpu())
-      with open(pickle_name, 'wb') as f:
+      with open(cfg["pickle_path"], 'wb') as f:
        pickle.dump(dataDump, f)
    except EOFError:
      warn('Pickle ran out of space')
--- a/arguments.py
+++ b/arguments.py
@ -17,4 +17,6 @@ def makeArguments(arguments: ArgumentParser) -> dict:
    all_args.add_argument("-a", "--AccuracyBound", action="store_true",
    help="Accuracy Bound criteria")
    all_args.add_argument("-s", "--Save", action="store_true", help="Save model")
    all_args.add_argument('--net_type', default='lenet', type=str, help='model = [lenet/AlexNet/3Conv3FC]')
    all_args.add_argument('--dataset', default='CIFAR10', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
    return vars(all_args.parse_args())
--- a/config_bayesian.py
+++ b/config_bayesian.py
@ -1,45 +0,0 @@
 ############### Configuration file for Bayesian ###############
 import os
 layer_type = 'lrt'  # 'bbb' or 'lrt'
 activation_type = 'softplus'  # 'softplus' or 'relu'
 priors={
    'prior_mu': 0,
    'prior_sigma': 0.1,
    'posterior_mu_initial': (0, 0.1),  # (mean, std) normal_
    'posterior_rho_initial': (-5, 0.1),  # (mean, std) normal_
 }
 n_epochs = 100
 sens = 1e-9
 energy_thrs = 100000
 acc_thrs = 0.99
 lr_start = 0.001
 num_workers = 4
 valid_size = 0.2
 batch_size = 256
 train_ens = 1
 valid_ens = 1
 beta_type = 0.1  # 'Blundell', 'Standard', etc. Use float for const value
 with open("bay", "r") as file:
    bay = int(file.read())
 if bay == 1:
    with open("tmp", "r") as file:
        wide = int(file.read())
    #if os.path.exists("tmp"):
    #    os.remove("tmp")
    #else:
    #    raise Exception("Tmp file not found")
    print("Bayesian configured to run with width: {}".format(wide))
 #if os.path.exists("bay"): 
 #    os.remove("bay")
 #else:
 #    raise Exception("Bay file not found")
--- a/config_frequentist.py
+++ b/config_frequentist.py
@ -1,32 +0,0 @@
 ############### Configuration file for Frequentist ###############
 import os
 n_epochs = 100
 sens = 1e-9
 energy_thrs = 10000
 acc_thrs = 0.99
 lr = 0.001
 num_workers = 4
 valid_size = 0.2
 batch_size = 256
 with open("frq", "r") as file:
    frq = int(file.read())
 if frq == 1:
    with open("tmp", "r") as file:
        wide = int(file.read())
    if os.path.exists("tmp"):
        os.remove("tmp")
    else:
        raise Exception("Tmp file not found")
    print("Frequentist configured to run with width: {}".format(wide))
 #if os.path.exists("frq"):
 #    os.remove("frq")
 #else:
 #    raise Exception("Frq file not found")
--- a/gpu_power_func.py
+++ b/gpu_power_func.py
@ -0,0 +1,54 @@
 import os
 import re
 import pickle
 import numpy as np
 def get_sample_of_gpu():
     """Take one sample from ``rocm-smi`` and return it as a list of numbers.

     Runs ``rocm-smi -P --showvoltage --showmemuse``, drops empty output
     lines and parses fixed line positions of what is left.

     Returns:
         A list ``[power, memory, voltage]`` where ``power`` is the float sum
         of the first decimal number on non-empty lines 2 and 4, ``memory``
         is an int built from the second integer on lines 7 and/or 9 (the
         entry is omitted when neither line has exactly two integers), and
         ``voltage`` is the int sum of the second integer on lines 13 and 14.

     Raises:
         RuntimeError: If ``rocm-smi`` printed nothing.
         Exception: If the first output line equals one of the two driver
             error messages below.
         IndexError: If the output has fewer lines or numbers than expected.
     """
     # Imported locally, as before, so the function stays self-contained.
     import subprocess
     from re import findall
     # NOTE(review): these sentinels are NVIDIA-SMI messages; presumably
     # rocm-smi never prints them - TODO confirm and replace with the real
     # rocm-smi failure text.
     no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
     no_version = "Failed to initialize NVML: Driver/library version mismatch"
     completed = subprocess.run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE)
     lines = [line for line in completed.stdout.decode('utf-8').split("\n") if line]
     if not lines:
         # Previously surfaced as a bare IndexError on lines[0].
         raise RuntimeError("rocm-smi produced no output")
     if lines[0] == no_graph:
         raise Exception("It seems that no AMD GPU is installed")
     if lines[0] == no_version:
         raise Exception("rocm-smi version mismatch")
     # NOTE(review): the line positions are tied to one rocm-smi layout with
     # two GPUs (judging by the W/M/V variable names and the CLI flags) -
     # verify against the installed rocm-smi version.
     gpuW0 = findall(r"[0-9]*\.[0-9]*", lines[2])
     gpuW1 = findall(r"[0-9]*\.[0-9]*", lines[4])
     gpuM0 = findall(r"[0-9]+", lines[7])
     gpuM1 = findall(r"[0-9]+", lines[9])
     gpuV0 = findall(r"[0-9]+", lines[13])
     gpuV1 = findall(r"[0-9]+", lines[14])
     results = [float(gpuW0[0]) + float(gpuW1[0])]
     if len(gpuM0) == 2 and len(gpuM1) == 2:
         results.append(int(gpuM0[1]) + int(gpuM1[1]))
     elif len(gpuM0) == 2:
         # Converted to int so every sample row has the same numeric types.
         results.append(int(gpuM0[1]))
     elif len(gpuM1) == 2:
         results.append(int(gpuM1[1]))
     results.append(int(gpuV0[1]) + int(gpuV1[1]))
     return results
 def total_watt_consumed(pickle_name):
     """Sum the first field of every sample stored in a pickle file.

     The file is read until EOF and the last pickled object wins. That
     object is iterated as a sequence of samples; element 0 of each sample
     is added to the total. Numeric readings are used as they are, text
     readings contribute the first number found in the text.

     Args:
         pickle_name: Path of the pickle file to read.

     Returns:
         The sum of all first-field readings as a float (0.0 for an empty
         sample list).

     Raises:
         ValueError: If the file contains no pickled object, or a text
             reading contains no number. Raised explicitly so callers that
             retry on any exception keep doing so for a not-yet-written file.
     """
     samples = None
     # NOTE(review): pickle.load executes arbitrary code from the file; only
     # point this at files written by the sampler itself.
     with open(pickle_name, "rb") as file:
         while True:
             try:
                 samples = pickle.load(file)
             except EOFError:
                 break
     if samples is None:
         raise ValueError("no samples found in {}".format(pickle_name))
     total = 0.0
     for sample in samples:
         reading = sample[0]
         if isinstance(reading, str):
             # Text readings: take the first number in the string.
             found = re.search(r"\d+(?:\.\d+)?", reading)
             if found is None:
                 raise ValueError("no number in reading {!r}".format(reading))
             reading = found.group(0)
         # Numeric readings (as produced by get_sample_of_gpu) go straight in;
         # the old code ran a regex on them and raised TypeError.
         total += float(reading)
     return total
--- a/gpu_sample_draw.py
+++ b/gpu_sample_draw.py
@ -4,24 +4,6 @@ import pickle
 import numpy as np
 from warnings import warn
 with open("frq", "r") as file:
    frq = int(file.read())
 with open("bay", "r") as file:
    bay = int(file.read())
 if frq == 1:
  model_t = "freq"
  with open("tmp", "r") as file:
    size = float(file.read())
 if bay == 1:
  model_t = "bayes"
  with open("tmp", "r") as file:
    size = int(file.read())
 pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
 #print("GPU energy file config: {}".format(pickle_name))
 def get_sample_of_gpu():
  from re import sub, findall
@ -45,6 +27,7 @@ def get_sample_of_gpu():
        #if temp:
           #return temp
 def total_watt_consumed():
    with open(pickle_name, 'rb') as f:
        x = pickle.load(f)
@ -53,12 +36,12 @@ def total_watt_consumed():
    y = [int(re.findall("\d+",xi)[0]) for xi in x]
    return sum(y)
 if __name__ == '__main__':
  dataDump = []
  #var = True
  #pickling_on = open("wattdata.pickle","wb")
  while True:
    #from run_service import retcode
    try:
      dataDump.append(get_sample_of_gpu())
      with open(pickle_name, 'wb') as f:
--- a/main_bayesian.py
+++ b/main_bayesian.py
@ -6,24 +6,30 @@ import utils
 import torch
 import pickle
 import metrics
 import argparse
 import numpy as np
 import amd_sample_draw
 import config_bayesian as cfg
 from datetime import datetime
 from torch.nn import functional as F
 from torch.optim import Adam, lr_scheduler
 from gpu_power_func import total_watt_consumed
 from models.BayesianModels.BayesianLeNet import BBBLeNet
 from models.BayesianModels.BayesianAlexNet import BBBAlexNet
 from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
 from stopping_crit import earlyStopping, energyBound, accuracyBound
 with (open("configuration.pkl", "rb")) as file:
    while True:
        try:
            cfg = pickle.load(file)
        except EOFError:
            break
 # CUDA settings
 device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
 def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
    if (net_type == 'lenet'):
-        return BBBLeNet(outputs, inputs, priors, layer_type, activation_type,wide=cfg.wide)
+        return BBBLeNet(outputs, inputs, priors, layer_type, activation_type,wide=cfg["model"]["size"])
    elif (net_type == 'alexnet'):
        return BBBAlexNet(outputs, inputs, priors, layer_type, activation_type)
    elif (net_type == '3conv3fc'):
@ -91,18 +97,18 @@ def validate_model(net, criterion, validloader, num_ens=1, beta_type=0.1, epoch=
 def run(dataset, net_type):
    # Hyper Parameter settings
-    layer_type = cfg.layer_type
+    layer_type = cfg["model"]["layer_type"]
-    activation_type = cfg.activation_type
+    activation_type = cfg["model"]["activation_type"]
-    priors = cfg.priors
+    priors = cfg["model"]["priors"]
-    train_ens = cfg.train_ens
+    train_ens = cfg["model"]["train_ens"]
-    valid_ens = cfg.valid_ens
+    valid_ens = cfg["model"]["valid_ens"]
-    n_epochs = cfg.n_epochs
+    n_epochs = cfg["model"]["n_epochs"]
-    lr_start = cfg.lr_start
+    lr_start = cfg["model"]["lr"]
-    num_workers = cfg.num_workers
+    num_workers = cfg["model"]["num_workers"]
-    valid_size = cfg.valid_size
+    valid_size = cfg["model"]["valid_size"]
-    batch_size = cfg.batch_size
+    batch_size = cfg["model"]["batch_size"]
-    beta_type = cfg.beta_type
+    beta_type = cfg["model"]["beta_type"]
    trainset, testset, inputs, outputs = data.getDataset(dataset)
    train_loader, valid_loader, test_loader = data.getDataloader(
@ -110,15 +116,13 @@ def run(dataset, net_type):
    net = getModel(net_type, inputs, outputs, priors, layer_type, activation_type).to(device)
    ckpt_dir = f'checkpoints/{dataset}/bayesian'
-    ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}_{layer_type}_{activation_type}_{cfg.wide}.pt'
+    ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}_{layer_type}_{activation_type}_{cfg["model"]["size"]}.pt'
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)
-    with open("stp", "r") as file:
+    stp = cfg["stopping_crit"]
-        stp = int(file.read())
+    sav = cfg["save"]
    with open("sav", "r") as file:
        sav = int(file.read())
    criterion = metrics.ELBO(len(trainset)).to(device)
    optimizer = Adam(net.parameters(), lr=lr_start)
@ -139,19 +143,19 @@ def run(dataset, net_type):
            epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl))
        if stp == 2:
-            #print('Using early stopping')
+            print('Using early stopping')
-            if earlyStopping(early_stop,train_acc,epoch,cfg.sens) == 1:
+            if earlyStopping(early_stop,valid_acc,epoch,cfg["model"]["sens"]) == 1:
                break
        elif stp == 3: 
-            #print('Using energy bound')
+            print('Using energy bound')
-            if energyBound(cfg.energy_thrs) == 1:
+            if energyBound(cfg["model"]["energy_thrs"]) == 1:
                break
        elif stp == 4:
-            #print('Using accuracy bound')
+            print('Using accuracy bound')
-            if accuracyBound(cfg.acc_thrs) == 1:
+            if accuracyBound(train_acc,cfg.acc_thrs) == 1:
                break
        else:
-            print('Training for {} epochs'.format(cfg.n_epochs))
+            print('Training for {} epochs'.format(cfg["model"]["n_epochs"]))
        if sav == 1:
            # save model when finished
@ -159,18 +163,14 @@ def run(dataset, net_type):
                torch.save(net.state_dict(), ckpt_name)
-    with open("bayes_exp_data_"+str(cfg.wide)+".pkl", 'wb') as f:
+    with open("bayes_exp_data_"+str(cfg["model"]["size"])+".pkl", 'wb') as f:
      pickle.dump(train_data, f)
 if __name__ == '__main__':
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Initial Time =", current_time)
-    parser = argparse.ArgumentParser(description = "PyTorch Bayesian Model Training")
+    run(cfg["data"], cfg["model"]["net_type"])
    parser.add_argument('--net_type', default='lenet', type=str, help='model')
    parser.add_argument('--dataset', default='CIFAR10', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
    args = parser.parse_args()
    run(args.dataset, args.net_type)
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Final Time =", current_time)
--- a/main_frequentist.py
+++ b/main_frequentist.py
@ -116,7 +116,7 @@ def run(dataset, net_type):
                break
        elif stp == 4:
            #print('Using accuracy bound')
-            if accuracyBound(train_acc,0.70) == 1:
+            if accuracyBound(train_acc,cfg.acc_thrs) == 1:
                break
        else:
            print('Training for {} epochs'.format(cfg.n_epochs))
@ -136,7 +136,7 @@ if __name__ == '__main__':
    print("Initial Time =", current_time)
    parser = argparse.ArgumentParser(description = "PyTorch Frequentist Model Training")
    parser.add_argument('--net_type', default='lenet', type=str, help='model')
-    parser.add_argument('--dataset', default='CIFAR10', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
+    parser.add_argument('--dataset', default='MNIST', type=str, help='dataset = [MNIST/CIFAR10/CIFAR100]')
    args = parser.parse_args()
    run(args.dataset, args.net_type)
    now = datetime.now()
--- a/radeontop.sh
+++ b/radeontop.sh
@ -0,0 +1,3 @@
 #!/bin/env bash
 radeontop -b 08 -d - > $1
--- a/read_pickle.py
+++ b/read_pickle.py
@ -1,7 +1,7 @@
 import pickle
 gpu_data = []
-with (open("freq_wattdata_1.0.pkl", "rb")) as openfile:
+with (open("bayesian_wattdata_3.pkl", "rb")) as openfile:
    while True:
        try:
            gpu_data = pickle.load(openfile)
--- a/stopping_crit.py
+++ b/stopping_crit.py
@ -1,6 +1,13 @@
-import amd_sample_draw
+import pickle
 from time import sleep
 from gpu_power_func import total_watt_consumed
 with (open("configuration.pkl", "rb")) as file:
    while True:
        try:
            cfg = pickle.load(file)
        except EOFError:
            break
 def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivity: float=1e-9):
    early_stopping.append(train_acc)
@ -20,16 +27,17 @@ def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivit
 def energyBound(threshold: float=100000.0):
    try:
-        energy = amd_sample_draw.total_watt_consumed()
+        energy = total_watt_consumed(cfg["pickle_path"])
    except Exception as e:
        sleep(3)
-        energy = amd_sample_draw.total_watt_consumed()
+        energy = total_watt_consumed(cfg["pickle_path"])
    print("Energy used: {}".format(energy))
    if energy > threshold:
        print("Energy bound achieved")
        return 1
    return 0
 def accuracyBound(train_acc: float, threshold: float=0.99):
    if train_acc >= threshold:
        print("Accuracy bound achieved")
--- a/utils.py
+++ b/utils.py
@ -3,9 +3,6 @@ import torch
 import numpy as np
 from torch.nn import functional as F
 import config_bayesian as cfg
 # cifar10 classes
 cifar10_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                   'dog', 'frog', 'horse', 'ship', 'truck']
		`@ -0,0 +1,3 @@`
							`#!/bin/env bash`

							`radeontop -b 08 -d - > $1`