Committed code from 2023

This commit is contained in:
Eduardo Cueto-Mendoza 2024-05-10 10:59:24 +01:00
parent 4c9dd542ae
commit c900129ec9
36 changed files with 2763 additions and 1 deletion

21
.gitignore vendored Executable file

@@ -0,0 +1,21 @@
**/**/__pycache__/
**/**/__init__.py
**/__pycache__/
**/__init__.py
checkpoints/
__pycache__/
data_budget/
bayes_*
times_*
.vscode
freq_*
data/
.idea
*.pkl
*.txt
stp
sav
bay
frq
sav
tmp

8
Mixtures/config_mixtures.py Executable file

@@ -0,0 +1,8 @@
############### Configuration file for Training of SplitMNIST and Mixtures ###############
n_epochs = 5
lr_start = 0.001
num_workers = 4
valid_size = 0.2
batch_size = 256
train_ens = 15
valid_ens = 10

305
Mixtures/gmm.py Executable file

@@ -0,0 +1,305 @@
# Courtesy of https://github.com/ldeecke/gmm-torch
import torch
import numpy as np
from math import pi
class GaussianMixture(torch.nn.Module):
"""
Fits a mixture of k=1,..,K Gaussians to the input data. Input tensors are expected to be flat with dimensions (n: number of samples, d: number of features).
The model then extends them to (n, k: number of components, d).
The model parametrization (mu, sigma) is stored as (1, k, d), and probabilities are shaped (n, k, 1) if they relate to an individual sample, or (1, k, 1) if they assign membership probabilities to one of the mixture components.
"""
def __init__(self, n_components, n_features, mu_init=None, var_init=None, eps=1.e-6):
"""
Initializes the model and brings all tensors into their required shape. The class expects data to be fed as a flat tensor in (n, d). The class owns:
x: torch.Tensor (n, k, d)
mu: torch.Tensor (1, k, d)
var: torch.Tensor (1, k, d)
pi: torch.Tensor (1, k, 1)
eps: float
n_components: int
n_features: int
score: float
args:
n_components: int
n_features: int
mu_init: torch.Tensor (1, k, d)
var_init: torch.Tensor (1, k, d)
eps: float
"""
super(GaussianMixture, self).__init__()
self.eps = eps
self.n_components = n_components
self.n_features = n_features
self.log_likelihood = -np.inf
self.mu_init = mu_init
self.var_init = var_init
self._init_params()
def _init_params(self):
if self.mu_init is not None:
assert self.mu_init.size() == (1, self.n_components, self.n_features), "Input mu_init does not have required tensor dimensions (1, %i, %i)" % (self.n_components, self.n_features)
# (1, k, d)
self.mu = torch.nn.Parameter(self.mu_init, requires_grad=False)
else:
self.mu = torch.nn.Parameter(torch.randn(1, self.n_components, self.n_features), requires_grad=False)
if self.var_init is not None:
assert self.var_init.size() == (1, self.n_components, self.n_features), "Input var_init does not have required tensor dimensions (1, %i, %i)" % (self.n_components, self.n_features)
# (1, k, d)
self.var = torch.nn.Parameter(self.var_init, requires_grad=False)
else:
self.var = torch.nn.Parameter(torch.ones(1, self.n_components, self.n_features), requires_grad=False)
# (1, k, 1)
self.pi = torch.nn.Parameter(torch.Tensor(1, self.n_components, 1), requires_grad=False).fill_(1./self.n_components)
self.params_fitted = False
def bic(self, x):
"""
Bayesian information criterion for samples x.
args:
x: torch.Tensor (n, d) or (n, k, d)
returns:
bic: float
"""
n = x.shape[0]
if len(x.size()) == 2:
# (n, d) --> (n, k, d)
x = x.unsqueeze(1).expand(n, self.n_components, x.size(1))
bic = -2. * self.__score(self.pi, self.__p_k(x, self.mu, self.var), sum_data=True) * n + self.n_components * np.log(n)
return bic
def fit(self, x, warm_start=False, delta=1e-8, n_iter=1000):
"""
Public method that fits data to the model.
args:
x: torch.Tensor (n, d) or (n, k, d)
warm_start: bool
n_iter: int
delta: float
"""
if not warm_start and self.params_fitted:
self._init_params()
if len(x.size()) == 2:
# (n, d) --> (n, k, d)
x = x.unsqueeze(1).expand(x.size(0), self.n_components, x.size(1))
i = 0
j = np.inf
while (i <= n_iter) and (j >= delta):
log_likelihood_old = self.log_likelihood
mu_old = self.mu
var_old = self.var
self.__em(x)
self.log_likelihood = self.__score(self.pi, self.__p_k(x, self.mu, self.var))
if (self.log_likelihood.abs() == float("Inf")) or torch.isnan(self.log_likelihood):
# when the log-likelihood takes invalid values (inf or NaN), reinitialize the model
self.__init__(self.n_components, self.n_features)
i += 1
j = self.log_likelihood - log_likelihood_old
if j <= delta:
# when the score decreases, revert to old parameters
self.__update_mu(mu_old)
self.__update_var(var_old)
self.params_fitted = True
def predict(self, x, probs=False):
"""
Assigns input data to one of the mixture components by evaluating the likelihood under each. If probs=True returns normalized probabilities of class membership instead.
args:
x: torch.Tensor (n, d) or (n, k, d)
probs: bool
returns:
y: torch.LongTensor (n)
"""
if len(x.size()) == 2:
# (n, d) --> (n, k, d)
x = x.unsqueeze(1).expand(x.size(0), self.n_components, x.size(1))
p_k = self.__p_k(x, self.mu, self.var)
if probs:
return p_k / (p_k.sum(1, keepdim=True) + self.eps)
else:
_, predictions = torch.max(p_k, 1)
return torch.squeeze(predictions).type(torch.LongTensor)
def predict_proba(self, x):
"""
Returns normalized probabilities of class membership.
args:
x: torch.Tensor (n, d) or (n, k, d)
returns:
y: torch.Tensor (n, k, 1), normalized membership probabilities
"""
return self.predict(x, probs=True)
def score_samples(self, x):
"""
Computes log-likelihood of data (x) under the current model.
args:
x: torch.Tensor (n, d) or (n, k, d)
returns:
score: torch.Tensor (n, 1), per-sample log-likelihood
"""
if len(x.size()) == 2:
# (n, d) --> (n, k, d)
x = x.unsqueeze(1).expand(x.size(0), self.n_components, x.size(1))
score = self.__score(self.pi, self.__p_k(x, self.mu, self.var), sum_data=False)
return score
def __p_k(self, x, mu, var):
"""
Returns a tensor with dimensions (n, k, 1) indicating the likelihood of data belonging to the k-th Gaussian.
args:
x: torch.Tensor (n, k, d)
mu: torch.Tensor (1, k, d)
var: torch.Tensor (1, k, d)
returns:
p_k: torch.Tensor (n, k, 1)
"""
# (1, k, d) --> (n, k, d)
mu = mu.expand(x.size(0), self.n_components, self.n_features)
var = var.expand(x.size(0), self.n_components, self.n_features)
# (n, k, d) --> (n, k, 1)
exponent = torch.exp(-.5 * torch.sum((x - mu) * (x - mu) / var, 2, keepdim=True))
# (n, k, d) --> (n, k, 1)
prefactor = torch.rsqrt(((2. * pi) ** self.n_features) * torch.prod(var, dim=2, keepdim=True) + self.eps)
return prefactor * exponent
def __e_step(self, pi, p_k):
"""
Computes weights that indicate the probabilistic belief that a data point was generated by one of the k mixture components. This is the so-called expectation step of the EM-algorithm.
args:
pi: torch.Tensor (1, k, 1)
p_k: torch.Tensor (n, k, 1)
returns:
weights: torch.Tensor (n, k, 1)
"""
weights = pi * p_k
return torch.div(weights, torch.sum(weights, 1, keepdim=True) + self.eps)
def __m_step(self, x, weights):
"""
Updates the model's parameters. This is the maximization step of the EM-algorithm.
args:
x: torch.Tensor (n, k, d)
weights: torch.Tensor (n, k, 1)
returns:
pi_new: torch.Tensor (1, k, 1)
mu_new: torch.Tensor (1, k, d)
var_new: torch.Tensor (1, k, d)
"""
# (n, k, 1) --> (1, k, 1)
n_k = torch.sum(weights, 0, keepdim=True)
pi_new = torch.div(n_k, torch.sum(n_k, 1, keepdim=True) + self.eps)
# (n, k, d) --> (1, k, d)
mu_new = torch.div(torch.sum(weights * x, 0, keepdim=True), n_k + self.eps)
# (n, k, d) --> (1, k, d)
var_new = torch.div(torch.sum(weights * (x - mu_new) * (x - mu_new), 0, keepdim=True), n_k + self.eps)
return pi_new, mu_new, var_new
def __em(self, x):
"""
Performs one iteration of the expectation-maximization algorithm by calling the respective subroutines.
args:
x: torch.Tensor (n, k, d)
"""
weights = self.__e_step(self.pi, self.__p_k(x, self.mu, self.var))
pi_new, mu_new, var_new = self.__m_step(x, weights)
self.__update_pi(pi_new)
self.__update_mu(mu_new)
self.__update_var(var_new)
def __score(self, pi, p_k, sum_data=True):
"""
Computes the log-likelihood of the data under the model.
args:
pi: torch.Tensor (1, k, 1)
p_k: torch.Tensor (n, k, 1)
"""
weights = pi * p_k
if sum_data:
return torch.sum(torch.log(torch.sum(weights, 1) + self.eps))
else:
return torch.log(torch.sum(weights, 1) + self.eps)
def __update_mu(self, mu):
"""
Updates mean to the provided value.
args:
mu: torch.FloatTensor
"""
assert mu.size() in [(self.n_components, self.n_features), (1, self.n_components, self.n_features)], "Input mu does not have required tensor dimensions (%i, %i) or (1, %i, %i)" % (self.n_components, self.n_features, self.n_components, self.n_features)
if mu.size() == (self.n_components, self.n_features):
self.mu = mu.unsqueeze(0)
elif mu.size() == (1, self.n_components, self.n_features):
self.mu.data = mu
def __update_var(self, var):
"""
Updates variance to the provided value.
args:
var: torch.FloatTensor
"""
assert var.size() in [(self.n_components, self.n_features), (1, self.n_components, self.n_features)], "Input var does not have required tensor dimensions (%i, %i) or (1, %i, %i)" % (self.n_components, self.n_features, self.n_components, self.n_features)
if var.size() == (self.n_components, self.n_features):
self.var = var.unsqueeze(0)
elif var.size() == (1, self.n_components, self.n_features):
self.var.data = var
def __update_pi(self, pi):
"""
Updates pi to the provided value.
args:
pi: torch.FloatTensor
"""
assert pi.size() in [(1, self.n_components, 1)], "Input pi does not have required tensor dimensions (%i, %i, %i)" % (1, self.n_components, 1)
self.pi.data = pi
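
A minimal usage sketch of the class above (illustrative only, not part of the committed files); the data is random and the shapes follow the docstrings:

# Illustrative usage of Mixtures/gmm.py -- not part of the commit.
import torch
import gmm

x = torch.randn(500, 2)                                # flat (n, d) input
model = gmm.GaussianMixture(n_components=3, n_features=2)
model.fit(x)                                           # EM until convergence or n_iter
assignments = model.predict(x)                         # (n,) LongTensor of component indices
membership = model.predict_proba(x)                    # (n, k, 1) normalized probabilities
print("BIC:", model.bic(x))                            # useful for choosing n_components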

117
Mixtures/main.py Executable file

@@ -0,0 +1,117 @@
import sys
sys.path.append('..')
import os
import torch
import numpy as np
from collections import OrderedDict
from torch.nn import functional as F
import gmm
import utils
import metrics
import utils_mixture
import config_bayesian as cfg
# CUDA settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def feedforward_and_save_mean_var(net, dataloader, task_no, num_ens=1):
cfg.mean_var_dir = "Mixtures/mean_vars/task-{}/".format(task_no)
if not os.path.exists(cfg.mean_var_dir):
os.makedirs(cfg.mean_var_dir, exist_ok=True)
cfg.record_mean_var = True
cfg.record_layers = None # All layers
cfg.record_now = True
cfg.curr_batch_no = 0 # Not required
cfg.curr_epoch_no = 0 # Not required
net.train() # To get distribution of mean and var
accs = []
for i, (inputs, labels) in enumerate(dataloader):
inputs, labels = inputs.to(device), labels.to(device)
outputs = torch.zeros(inputs.shape[0], net.num_classes, num_ens).to(device)
kl = 0.0
for j in range(num_ens):
net_out, _kl = net(inputs)
kl += _kl
outputs[:, :, j] = F.log_softmax(net_out, dim=1).data
log_outputs = utils.logmeanexp(outputs, dim=2)
accs.append(metrics.acc(log_outputs, labels))
return np.mean(accs)
def _get_ordered_layer_name(mean_var_path):
# Order files according to creation time
files = os.listdir(mean_var_path)
files = [os.path.join(mean_var_path, f) for f in files]
files.sort(key=os.path.getctime)
layer_names = [f.split('/')[-1].split('.')[0] for f in files]
return layer_names
def _get_layer_wise_mean_var_per_task(mean_var_path):
# Order files according to creation time
# To get the correct model architecture
files = os.listdir(mean_var_path)
files = [os.path.join(mean_var_path, f) for f in files]
files.sort(key=os.path.getctime)
layer_names = [f.split('/')[-1].split('.')[0] for f in files]
mean_var = OrderedDict()
for i in range(len(files)):
data = {}
mean, var = utils.load_mean_std_from_file(files[i])
mean, var = np.vstack(mean), np.vstack(var) # shape is (len(trainset), output shape)
data['mean'] = mean
data['var'] = var
data['mean.mu'] = np.mean(mean, axis=0)
data['var.mu'] = np.mean(var, axis=0)
data['mean.var'] = np.var(mean, axis=0)
data['var.var'] = np.var(var, axis=0)
mean_var[layer_names[i]] = data
return mean_var
def get_mean_vars_for_all_tasks(mean_var_dir):
all_tasks = {}
for task in os.listdir(mean_var_dir):
path_to_task = os.path.join(mean_var_dir, task)
mean_var_per_task = _get_layer_wise_mean_var_per_task(path_to_task)
all_tasks[task] = mean_var_per_task
return all_tasks
def fit_to_gmm(num_tasks, layer_name, data_type, all_tasks):
data = np.vstack([all_tasks[f'task-{i}'][layer_name][data_type] for i in range(1, num_tasks+1)])
data = torch.tensor(data).float()
#data_mu = torch.cat([torch.tensor(all_tasks[f'task-{i}'][layer_name][data_type+'.mu']).unsqueeze(0) for i in range(1, num_tasks+1)], dim=0).float().unsqueeze(0)
#data_var = torch.cat([torch.tensor(all_tasks[f'task-{i}'][layer_name][data_type+'.var']).unsqueeze(0) for i in range(1, num_tasks+1)], dim=0).float().unsqueeze(0)
model = gmm.GaussianMixture(n_components=num_tasks, n_features=np.prod(data.shape[1:]))#, mu_init=data_mu, var_init=data_var)
data = data[torch.randperm(data.size()[0])] # Shuffling of data
model.fit(data)
return model.predict(data)
def main():
num_tasks = 2
weights_dir = "checkpoints/MNIST/bayesian/splitted/2-tasks/"
loader_task1, loader_task2 = utils_mixture.get_splitmnist_dataloaders(num_tasks)
train_loader_task1 = loader_task1[0]
train_loader_task2 = loader_task2[0]
net_task1, net_task2 = utils_mixture.get_splitmnist_models(num_tasks, bayesian=True, pretrained=True, weights_dir=weights_dir)
net_task1.cuda()
net_task2.cuda()
print("Task-1 Accuracy:", feedforward_and_save_mean_var(net_task1, train_loader_task1, task_no=1))
print("Task-2 Accuracy:", feedforward_and_save_mean_var(net_task2, train_loader_task2, task_no=2))
mean_vars_all_tasks = get_mean_vars_for_all_tasks("Mixtures/mean_vars/")
if __name__ == '__main__':
all_tasks = get_mean_vars_for_all_tasks("Mixtures/mean_vars/")
y = fit_to_gmm(2, 'fc3', 'mean', all_tasks)
print("Cluster0", (1-y).sum())
print("Cluster1", y.sum())

171
Mixtures/mixture_experiment.py Executable file

@@ -0,0 +1,171 @@
import sys
sys.path.append('..')
import os
import datetime
import torch
import contextlib
from utils_mixture import *
from layers.BBBLinear import BBBLinear
@contextlib.contextmanager
def print_to_logfile(file):
# capture all outputs to a log file while still printing it
class Logger:
def __init__(self, file):
self.terminal = sys.stdout
self.log = file
def write(self, message):
self.terminal.write(message)
self.log.write(message)
def __getattr__(self, attr):
return getattr(self.terminal, attr)
logger = Logger(file)
_stdout = sys.stdout
sys.stdout = logger
try:
yield logger.log
finally:
sys.stdout = _stdout
def initiate_experiment(experiment):
def decorator(*args, **kwargs):
log_file_dir = "experiments/mixtures/"
log_file = log_file_dir + experiment.__name__ + ".txt"
if not os.path.exists(log_file):
os.makedirs(log_file_dir, exist_ok=True)
with print_to_logfile(open(log_file, 'a')):
print("Performing experiment:", experiment.__name__)
print("Date-Time:", datetime.datetime.now())
print("\n", end="")
print("Args:", args)
print("Kwargs:", kwargs)
print("\n", end="")
experiment(*args, **kwargs)
print("\n\n", end="")
return decorator
@initiate_experiment
def experiment_regular_prediction_bayesian(weights_dir=None, num_ens=10):
num_tasks = 2
weights_dir = "checkpoints/MNIST/bayesian/splitted/2-tasks/" if weights_dir is None else weights_dir
loaders1, loaders2 = get_splitmnist_dataloaders(num_tasks)
net1, net2 = get_splitmnist_models(num_tasks, bayesian=True, pretrained=True, weights_dir=weights_dir)
net1.cuda()
net2.cuda()
print("Model-1, Task-1-Dataset=> Accuracy:", predict_regular(net1, loaders1[1], bayesian=True, num_ens=num_ens))
print("Model-2, Task-2-Dataset=> Accuracy:", predict_regular(net2, loaders2[1], bayesian=True, num_ens=num_ens))
@initiate_experiment
def experiment_regular_prediction_frequentist(weights_dir=None):
num_tasks = 2
weights_dir = "checkpoints/MNIST/frequentist/splitted/2-tasks/" if weights_dir is None else weights_dir
loaders1, loaders2 = get_splitmnist_dataloaders(num_tasks)
net1, net2 = get_splitmnist_models(num_tasks, bayesian=False, pretrained=True, weights_dir=weights_dir)
net1.cuda()
net2.cuda()
print("Model-1, Task-1-Dataset=> Accuracy:", predict_regular(net1, loaders1[1], bayesian=False))
print("Model-2, Task-2-Dataset=> Accuracy:", predict_regular(net2, loaders2[1], bayesian=False))
@initiate_experiment
def experiment_simultaneous_without_mixture_model_with_uncertainty(uncertainty_type="epistemic_softmax", T=25, weights_dir=None):
num_tasks = 2
weights_dir = "checkpoints/MNIST/bayesian/splitted/2-tasks/" if weights_dir is None else weights_dir
loaders1, loaders2 = get_splitmnist_dataloaders(num_tasks)
net1, net2 = get_splitmnist_models(num_tasks, True, True, weights_dir)
net1.cuda()
net2.cuda()
print("Both Models, Task-1-Dataset=> Accuracy: {:.3}\tModel-1-Preferred: {:.3}\tModel-2-Preferred: {:.3}\t" \
"Task-1-Dataset-Uncertainty: {:.3}\tTask-2-Dataset-Uncertainty: {:.3}".format(
*predict_using_uncertainty_separate_models(net1, net2, loaders1[1], uncertainty_type=uncertainty_type, T=T)))
print("Both Models, Task-2-Dataset=> Accuracy: {:.3}\tModel-1-Preferred: {:.3}\tModel-2-Preferred: {:.3}\t" \
"Task-1-Dataset-Uncertainty: {:.3}\tTask-2-Dataset-Uncertainty: {:.3}".format(
*predict_using_uncertainty_separate_models(net1, net2, loaders2[1], uncertainty_type=uncertainty_type, T=T)))
@initiate_experiment
def experiment_simultaneous_without_mixture_model_with_confidence(weights_dir=None):
num_tasks = 2
weights_dir = "checkpoints/MNIST/frequentist/splitted/2-tasks/" if weights_dir is None else weights_dir
loaders1, loaders2 = get_splitmnist_dataloaders(num_tasks)
net1, net2 = get_splitmnist_models(num_tasks, False, True, weights_dir)
net1.cuda()
net2.cuda()
print("Both Models, Task-1-Dataset=> Accuracy: {:.3}\tModel-1-Preferred: {:.3}\tModel-2-Preferred: {:.3}".format(
*predict_using_confidence_separate_models(net1, net2, loaders1[1])))
print("Both Models, Task-2-Dataset=> Accuracy: {:.3}\tModel-1-Preferred: {:.3}\tModel-2-Preferred: {:.3}".format(
*predict_using_confidence_separate_models(net1, net2, loaders2[1])))
@initiate_experiment
def wip_experiment_average_weights_mixture_model():
num_tasks = 2
weights_dir = "checkpoints/MNIST/bayesian/splitted/2-tasks/"
loaders1, loaders2 = get_splitmnist_dataloaders(num_tasks)
net1, net2 = get_splitmnist_models(num_tasks, bayesian=True, pretrained=True, weights_dir=weights_dir)
net1.cuda()
net2.cuda()
net_mix = get_mixture_model(num_tasks, weights_dir, include_last_layer=True)
net_mix.cuda()
print("Model-1, Loader-1:", calculate_accuracy(net1, loaders1[1]))
print("Model-2, Loader-2:", calculate_accuracy(net2, loaders2[1]))
print("Model-1, Loader-2:", calculate_accuracy(net1, loaders2[1]))
print("Model-2, Loader-1:", calculate_accuracy(net2, loaders1[1]))
print("Model-Mix, Loader-1:", calculate_accuracy(net_mix, loaders1[1]))
print("Model-Mix, Loader-2:", calculate_accuracy(net_mix, loaders2[1]))
@initiate_experiment
def wip_experiment_simultaneous_average_weights_mixture_model_with_uncertainty():
num_tasks = 2
weights_dir = "checkpoints/MNIST/bayesian/splitted/2-tasks/"
loaders1, loaders2 = get_splitmnist_dataloaders(num_tasks)
net1, net2 = get_splitmnist_models(num_tasks, bayesian=True, pretrained=True, weights_dir=weights_dir)
net1.cuda()
net2.cuda()
net_mix = get_mixture_model(num_tasks, weights_dir, include_last_layer=False)
net_mix.cuda()
# Creating 2 sets of last layer
fc3_1 = BBBLinear(84, 5, name='fc3_1') # hardcoded for lenet
weights_1 = torch.load(weights_dir + "model_lenet_2.1.pt")
fc3_1.W = torch.nn.Parameter(weights_1['fc3.W'])
fc3_1.log_alpha = torch.nn.Parameter(weights_1['fc3.log_alpha'])
fc3_2 = BBBLinear(84, 5, name='fc3_2') # hardcoded for lenet
weights_2 = torch.load(weights_dir + "model_lenet_2.2.pt")
fc3_2.W = torch.nn.Parameter(weights_2['fc3.W'])
fc3_2.log_alpha = torch.nn.Parameter(weights_2['fc3.log_alpha'])
fc3_1, fc3_2 = fc3_1.cuda(), fc3_2.cuda()
print("Model-1, Loader-1:", calculate_accuracy(net1, loaders1[1]))
print("Model-2, Loader-2:", calculate_accuracy(net2, loaders2[1]))
print("Model-Mix, Loader-1:", predict_using_epistemic_uncertainty_with_mixture_model(net_mix, fc3_1, fc3_2, loaders1[1]))
print("Model-Mix, Loader-2:", predict_using_epistemic_uncertainty_with_mixture_model(net_mix, fc3_1, fc3_2, loaders2[1]))
if __name__ == '__main__':
experiment_simultaneous_without_mixture_model_with_confidence()

50
Mixtures/temp_gmm.py Executable file

@@ -0,0 +1,50 @@
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import gmm
def create_synthetic_data(num_gaussians, num_features, num_samples, means, vars):
assert len(means[0]) == len(vars[0]) == num_features
samples = []
for g in range(num_gaussians):
loc = torch.tensor(means[g]).float()
covariance_matrix = torch.eye(num_features).float() * torch.tensor(vars[g]).float()
dist = torch.distributions.multivariate_normal.MultivariateNormal(
loc = loc, covariance_matrix=covariance_matrix)
for i in range(num_samples//num_gaussians):
sample = dist.sample()
samples.append(sample.unsqueeze(0))
samples = torch.cat(samples, axis=0)
return samples
def plot_data(data, y=None):
if y is not None:
for sample, target in zip(data, y):
if target==0:
plt.scatter(*sample, color='blue')
elif target==1:
plt.scatter(*sample, color='red')
elif target==2:
plt.scatter(*sample, color='green')
else:
for sample in data:
plt.scatter(*sample, color='black')
plt.show(block=False)
plt.pause(2)
plt.close()
means = [[1, 4], [5, 5], [2, -1]] # per-task means (one value per feature)
vars = [[0.1, 0.1], [0.05, 0.4], [0.5, 0.2]] # per-task variances (one value per feature)
data = create_synthetic_data(3, 2, 600, means, vars) # shape: (total_samples, num_features)
plot_data(data)
model = gmm.GaussianMixture(3, 2) # (num_gaussians, num_features)
model.fit(data)
y = model.predict(data)
plot_data(data, y)

83
Mixtures/train_splitted.py Executable file

@@ -0,0 +1,83 @@
import sys
sys.path.append('..')
import os
import argparse
import torch
from torch import nn
import numpy as np
from torch.optim import Adam
import utils
import metrics
import config_mixtures as cfg
import utils_mixture as mix_utils
from main_bayesian import train_model as train_bayesian, validate_model as validate_bayesian
from main_frequentist import train_model as train_frequentist, validate_model as validate_frequentist
# CUDA settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def train_splitted(num_tasks, bayesian=True, net_type='lenet'):
assert 10 % num_tasks == 0
# Hyper Parameter settings
train_ens = cfg.train_ens
valid_ens = cfg.valid_ens
n_epochs = cfg.n_epochs
lr_start = cfg.lr_start
if bayesian:
ckpt_dir = f"checkpoints/MNIST/bayesian/splitted/{num_tasks}-tasks/"
else:
ckpt_dir = f"checkpoints/MNIST/frequentist/splitted/{num_tasks}-tasks/"
if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir, exist_ok=True)
loaders, datasets = mix_utils.get_splitmnist_dataloaders(num_tasks, return_datasets=True)
models = mix_utils.get_splitmnist_models(num_tasks, bayesian=bayesian, pretrained=False, net_type=net_type)
for task in range(1, num_tasks + 1):
print(f"Training task-{task}..")
trainset, testset, _, _ = datasets[task-1]
train_loader, valid_loader, _ = loaders[task-1]
net = models[task-1]
net = net.to(device)
ckpt_name = ckpt_dir + f"model_{net_type}_{num_tasks}.{task}.pt"
criterion = (metrics.ELBO(len(trainset)) if bayesian else nn.CrossEntropyLoss()).to(device)
optimizer = Adam(net.parameters(), lr=lr_start)
valid_loss_max = np.Inf
for epoch in range(n_epochs): # loop over the dataset multiple times
utils.adjust_learning_rate(optimizer, metrics.lr_linear(epoch, 0, n_epochs, lr_start))
if bayesian:
train_loss, train_acc, train_kl = train_bayesian(net, optimizer, criterion, train_loader, num_ens=train_ens)
valid_loss, valid_acc = validate_bayesian(net, criterion, valid_loader, num_ens=valid_ens)
print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f} \ttrain_kl_div: {:.4f}'.format(
epoch, train_loss, train_acc, valid_loss, valid_acc, train_kl))
else:
train_loss, train_acc = train_frequentist(net, optimizer, criterion, train_loader)
valid_loss, valid_acc = validate_frequentist(net, criterion, valid_loader)
print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f}'.format(
epoch, train_loss, train_acc, valid_loss, valid_acc))
# save model if validation loss has decreased
if valid_loss <= valid_loss_max:
print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(
valid_loss_max, valid_loss))
torch.save(net.state_dict(), ckpt_name)
valid_loss_max = valid_loss
print(f"Done training task-{task}")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description = "PyTorch Bayesian Split Model Training")
parser.add_argument('--num_tasks', default=2, type=int, help='number of tasks')
parser.add_argument('--net_type', default='lenet', type=str, help='model')
parser.add_argument('--bayesian', default=1, type=int, help='is_bayesian_model(0/1)')
args = parser.parse_args()
train_splitted(args.num_tasks, bool(args.bayesian), args.net_type)

259
Mixtures/utils_mixture.py Executable file

@@ -0,0 +1,259 @@
import sys
sys.path.append('..')
import os
import torch
import numpy as np
import torch.nn as nn
from torch.nn import functional as F
import data
import utils
import metrics
from main_bayesian import getModel as getBayesianModel
from main_frequentist import getModel as getFrequentistModel
import config_mixtures as cfg
import uncertainty_estimation as ue
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class Pass(nn.Module):
def __init__(self):
super(Pass, self).__init__()
def forward(self, x):
return x
def _get_splitmnist_datasets(num_tasks):
datasets = []
for i in range(1, num_tasks + 1):
name = 'SplitMNIST-{}.{}'.format(num_tasks, i)
datasets.append(data.getDataset(name))
return datasets
def get_splitmnist_dataloaders(num_tasks, return_datasets=False):
loaders = []
datasets = _get_splitmnist_datasets(num_tasks)
for i in range(1, num_tasks + 1):
trainset, testset, _, _ = datasets[i-1]
curr_loaders = data.getDataloader(
trainset, testset, cfg.valid_size, cfg.batch_size, cfg.num_workers)
loaders.append(curr_loaders) # (train_loader, valid_loader, test_loader)
if return_datasets:
return loaders, datasets
return loaders
def get_splitmnist_models(num_tasks, bayesian=True, pretrained=False, weights_dir=None, net_type='lenet'):
inputs = 1
outputs = 10 // num_tasks
models = []
if pretrained:
assert weights_dir
for i in range(1, num_tasks + 1):
if bayesian:
model = getBayesianModel(net_type, inputs, outputs)
else:
model = getFrequentistModel(net_type, inputs, outputs)
models.append(model)
if pretrained:
weight_path = weights_dir + f"model_{net_type}_{num_tasks}.{i}.pt"
models[-1].load_state_dict(torch.load(weight_path))
return models
def get_mixture_model(num_tasks, weights_dir, net_type='lenet', include_last_layer=True):
"""
Current implementation is based on average value of weights
"""
net = getBayesianModel(net_type, 1, 5)
if not include_last_layer:
net.fc3 = Pass()
task_weights = []
for i in range(1, num_tasks + 1):
weight_path = weights_dir + f"model_{net_type}_{num_tasks}.{i}.pt"
task_weights.append(torch.load(weight_path))
mixture_weights = net.state_dict().copy()
layer_list = list(mixture_weights.keys())
for key in mixture_weights:
if key in layer_list:
concat_weights = torch.cat([w[key].unsqueeze(0) for w in task_weights] , dim=0)
average_weight = torch.mean(concat_weights, dim=0)
mixture_weights[key] = average_weight
net.load_state_dict(mixture_weights)
return net
def predict_regular(net, validloader, bayesian=True, num_ens=10):
"""
For both Bayesian and Frequentist models
"""
net.eval()
accs = []
for i, (inputs, labels) in enumerate(validloader):
inputs, labels = inputs.to(device), labels.to(device)
if bayesian:
outputs = torch.zeros(inputs.shape[0], net.num_classes, num_ens).to(device)
for j in range(num_ens):
net_out, _ = net(inputs)
outputs[:, :, j] = F.log_softmax(net_out, dim=1).data
log_outputs = utils.logmeanexp(outputs, dim=2)
accs.append(metrics.acc(log_outputs, labels))
else:
output = net(inputs)
accs.append(metrics.acc(output.detach(), labels))
return np.mean(accs)
def predict_using_uncertainty_separate_models(net1, net2, valid_loader, uncertainty_type='epistemic_softmax', T=25):
"""
For Bayesian models
"""
accs = []
total_u1 = 0.0
total_u2 = 0.0
set1_selected = 0
set2_selected = 0
epi_or_ale, soft_or_norm = uncertainty_type.split('_')
soft_or_norm = True if soft_or_norm=='normalized' else False
for i, (inputs, labels) in enumerate(valid_loader):
inputs, labels = inputs.to(device), labels.to(device)
pred1, epi1, ale1 = ue.get_uncertainty_per_batch(net1, inputs, T=T, normalized=soft_or_norm)
pred2, epi2, ale2 = ue.get_uncertainty_per_batch(net2, inputs, T=T, normalized=soft_or_norm)
if epi_or_ale=='epistemic':
u1 = np.sum(epi1, axis=1)
u2 = np.sum(epi2, axis=1)
elif epi_or_ale=='aleatoric':
u1 = np.sum(ale1, axis=1)
u2 = np.sum(ale2, axis=1)
elif epi_or_ale=='both':
u1 = np.sum(epi1, axis=1) + np.sum(ale1, axis=1)
u2 = np.sum(epi2, axis=1) + np.sum(ale2, axis=1)
else:
raise ValueError("Not correct uncertainty type")
total_u1 += np.sum(u1).item()
total_u2 += np.sum(u2).item()
set1_preferred = u2 > u1 # idx where set1 has less uncertainty
set1_preferred = np.expand_dims(set1_preferred, 1)
preds = np.where(set1_preferred, pred1, pred2)
set1_selected += np.sum(set1_preferred)
set2_selected += np.sum(~set1_preferred)
accs.append(metrics.acc(torch.tensor(preds), labels))
return np.mean(accs), set1_selected/(set1_selected + set2_selected), \
set2_selected/(set1_selected + set2_selected), total_u1, total_u2
def predict_using_confidence_separate_models(net1, net2, valid_loader):
"""
For Frequentist models
"""
accs = []
set1_selected = 0
set2_selected = 0
for i, (inputs, labels) in enumerate(valid_loader):
inputs, labels = inputs.to(device), labels.to(device)
pred1 = F.softmax(net1(inputs), dim=1)
pred2 = F.softmax(net2(inputs), dim=1)
set1_preferred = pred1.max(dim=1)[0] > pred2.max(dim=1)[0] # idx where set1 has more confidence
preds = torch.where(set1_preferred.unsqueeze(1), pred1, pred2)
set1_selected += torch.sum(set1_preferred).float().item()
set2_selected += torch.sum(~set1_preferred).float().item()
accs.append(metrics.acc(preds.detach(), labels))
return np.mean(accs), set1_selected/(set1_selected + set2_selected), \
set2_selected/(set1_selected + set2_selected)
def wip_predict_using_epistemic_uncertainty_with_mixture_model(model, fc3_1, fc3_2, valid_loader, T=10):
accs = []
total_epistemic_1 = 0.0
total_epistemic_2 = 0.0
set_1_selected = 0
set_2_selected = 0
for i, (inputs, labels) in enumerate(valid_loader):
inputs, labels = inputs.to(device), labels.to(device)
outputs = []
for i in range(inputs.shape[0]): # loop over batch
input_image = inputs[i].unsqueeze(0)
p_hat_1 = []
p_hat_2 = []
preds_1 = []
preds_2 = []
for t in range(T):
net_out_mix, _ = model(input_image)
# set_1
net_out_1 = fc3_1(net_out_mix)
preds_1.append(net_out_1)
prediction = F.softplus(net_out_1)
prediction = prediction / torch.sum(prediction, dim=1)
p_hat_1.append(prediction.cpu().detach())
# set_2
net_out_2 = fc3_2(net_out_mix)
preds_2.append(net_out_2)
prediction = F.softplus(net_out_2)
prediction = prediction / torch.sum(prediction, dim=1)
p_hat_2.append(prediction.cpu().detach())
# set_1
p_hat = torch.cat(p_hat_1, dim=0).numpy()
p_bar = np.mean(p_hat, axis=0)
preds = torch.cat(preds_1, dim=0)
pred_set_1 = torch.sum(preds, dim=0).unsqueeze(0)
temp = p_hat - np.expand_dims(p_bar, 0)
epistemic = np.dot(temp.T, temp) / T
epistemic_set_1 = np.sum(np.diag(epistemic)).item()
total_epistemic_1 += epistemic_set_1
# set_2
p_hat = torch.cat(p_hat_2, dim=0).numpy()
p_bar = np.mean(p_hat, axis=0)
preds = torch.cat(preds_2, dim=0)
pred_set_2 = torch.sum(preds, dim=0).unsqueeze(0)
temp = p_hat - np.expand_dims(p_bar, 0)
epistemic = np.dot(temp.T, temp) / T
epistemic_set_2 = np.sum(np.diag(epistemic)).item()
total_epistemic_2 += epistemic_set_2
if epistemic_set_1 > epistemic_set_2:
set_2_selected += 1
outputs.append(pred_set_2)
else:
set_1_selected += 1
outputs.append(pred_set_1)
outputs = torch.cat(outputs, dim=0)
accs.append(metrics.acc(outputs.detach(), labels))
return np.mean(accs), set_1_selected/(set_1_selected + set_2_selected), \
set_2_selected/(set_1_selected + set_2_selected), total_epistemic_1, total_epistemic_2

3
README.md Normal file → Executable file

@@ -1,2 +1,3 @@
# bayesiancnn
# Energy efficiency comparison
This experiment compares a Frequentist CNN model against a Bayesian CNN model.

29
amd_sample_draw.py Executable file

@@ -0,0 +1,29 @@
import pickle
from warnings import warn
from gpu_power_func import get_sample_of_gpu
with (open("configuration.pkl", "rb")) as file:
while True:
try:
cfg = pickle.load(file)
except EOFError:
break
# pickle_name = "{}_wattdata_{}.pkl".format(model_t,size)
# print("GPU energy file config: {}".format(pickle_name))
# print(cfg)
if __name__ == '__main__':
dataDump = []
while True:
try:
dataDump.append(get_sample_of_gpu())
with open(cfg["pickle_path"], 'wb') as f:
pickle.dump(dataDump, f)
except EOFError:
warn('Pickle ran out of space')
finally:
f.close()

27
arguments.py Executable file

@@ -0,0 +1,27 @@
import argparse
from argparse import ArgumentParser
# Construct an argument parser
all_args = argparse.ArgumentParser()
def makeArguments(arguments: ArgumentParser) -> dict:
all_args.add_argument("-b", "--Bayesian", action="store", dest="b",
type=int, choices=range(1, 8),
help="Bayesian model of size x")
all_args.add_argument("-f", "--Frequentist", action="store", dest="f",
type=int, choices=range(1, 8),
help="Frequentist model of size x")
all_args.add_argument("-E", "--EarlyStopping", action="store_true",
help="Early Stopping criteria")
all_args.add_argument("-e", "--EnergyBound", action="store_true",
help="Energy Bound criteria")
all_args.add_argument("-a", "--AccuracyBound", action="store_true",
help="Accuracy Bound criteria")
all_args.add_argument("-s", "--Save", action="store_true",
help="Save model")
all_args.add_argument('--net_type', default='lenet', type=str,
help='model = [lenet/AlexNet/3Conv3FC]')
all_args.add_argument('--dataset', default='CIFAR10', type=str,
help='dataset = [MNIST/CIFAR10/CIFAR100]')
return vars(all_args.parse_args())
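
A short usage sketch (the driver script name and flag values below are hypothetical; only the flags themselves come from the parser above):

# Hypothetical driver, e.g.:  python train_driver.py -f 3 -E --net_type lenet --dataset MNIST
from arguments import makeArguments, all_args

args = makeArguments(all_args)
frequentist_size = args["f"]              # 3 for the command above; None if -f was not given
use_early_stopping = args["EarlyStopping"]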

5
cpu_watt.sh Executable file

@@ -0,0 +1,5 @@
#!/usr/bin/env bash
powerstat -D -z 0.5 10000000 > $1
#powerstat -z 0.5 1000000 > $1
#sudo powerstat -D -z 0.5 10000000 > $1

54
gpu_power_func.py Executable file

@@ -0,0 +1,54 @@
import os
import re
import pickle
import numpy as np
def get_sample_of_gpu():
from re import sub, findall
import subprocess
from subprocess import run
no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
no_version = "Failed to initialize NVML: Driver/library version mismatch"
smi_string = run(['rocm-smi', '-P', '--showvoltage', '--showmemuse'], stdout=subprocess.PIPE)
smi_string = smi_string.stdout.decode('utf-8')
smi_string = smi_string.split("\n")
smi_string = list(filter(lambda x: x, smi_string))
if smi_string[0] == no_graph:
raise Exception("It seems that no AMD GPU is installed")
elif smi_string[0] == no_version:
raise Exception("rocm-smi version mismatch")
else:
results = []
gpuW0 = findall(r"[0-9]*\.[0-9]*", smi_string[2])
gpuW1 = findall(r"[0-9]*\.[0-9]*", smi_string[4])
gpuM0 = findall(r"[0-9]+", smi_string[7])
gpuM1 = findall(r"[0-9]+", smi_string[9])
gpuV0 = findall(r"[0-9]+", smi_string[13])
gpuV1 = findall(r"[0-9]+", smi_string[14])
results.append(float(gpuW0[0]) + float(gpuW1[0]))
if len(gpuM0) == 2 and len(gpuM1) == 2:
results.append(int(gpuM0[1]) + int(gpuM1[1]))
elif len(gpuM0) == 2:
results.append(int(gpuM0[1]))
elif len(gpuM1) == 2:
results.append(int(gpuM1[1]))
results.append(int(gpuV0[1]) + int(gpuV1[1]))
return results
#for l in smi_string:
#temp = findall("[0-9]*MiB | [0-9]*W",l)
#if temp:
#return temp
def total_watt_consumed(pickle_name):
with (open(pickle_name, "rb")) as file:
while True:
try:
x = pickle.load(file)
except EOFError:
break
x = np.array(x)
x = x[:,0]
y = [float(re.findall(r"\d+\.\d+", xi)[0]) for xi in x]
return sum(y)
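
total_watt_consumed only sums the sampled power readings; converting that sum into an energy figure requires the sampling period, which is not stored in the pickle. A sketch under that assumption (the pickle name and the 0.5 s period are made up):

# Assumption-laden sketch -- not part of the commit.
from gpu_power_func import total_watt_consumed

power_sum_watts = total_watt_consumed("bayes_wattdata_3.pkl")  # hypothetical pickle path
sample_period_s = 0.5                                          # assumed sampling interval in seconds
print("approx. energy:", power_sum_watts * sample_period_s, "J")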

64
gpu_sample_draw.py Executable file

@@ -0,0 +1,64 @@
import os
import re
import pickle
import numpy as np
from warnings import warn
def get_sample_of_gpu():
from re import sub, findall
import subprocess
from subprocess import run
no_graph = "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running."
no_version = "Failed to initialize NVML: Driver/library version mismatch"
smi_string = run(['nvidia-smi'], stdout=subprocess.PIPE)
smi_string = smi_string.stdout.decode('utf-8')
smi_string = smi_string.split("\n")
smi_string = list(filter(lambda x: x, smi_string))
if smi_string[0] == no_graph:
raise Exception("It seems that no NVIDIA GPU is installed")
elif smi_string[0] == no_version:
raise Exception("nvidia-smi version mismatch")
else:
return findall("[0-9]*MiB | [0-9]*W",smi_string[6])
#for l in smi_string:
#temp = findall("[0-9]*MiB | [0-9]*W",l)
#if temp:
#return temp
def total_watt_consumed():
with open(pickle_name, 'rb') as f:
x = pickle.load(f)
x = np.array(x)
x = x[:,0]
y = [int(re.findall("\d+",xi)[0]) for xi in x]
return sum(y)
if __name__ == '__main__':
dataDump = []
#var = True
#pickling_on = open("wattdata.pickle","wb")
while True:
try:
dataDump.append(get_sample_of_gpu())
with open(pickle_name, 'wb') as f:
pickle.dump(dataDump, f)
except EOFError:
warn('Pickle ran out of space')
size += 0.01
finally:
f.close()
#if retcode == 0:
#break
#pickle.dump(dataDump, pickling_on)
#pickling_on.close()

83
layers/BBB/BBBConv.py Executable file

@@ -0,0 +1,83 @@
import sys
sys.path.append("..")
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from metrics import calculate_kl as KL_DIV
from ..misc import ModuleWrapper
class BBBConv2d(ModuleWrapper):
def __init__(self, in_channels, out_channels, kernel_size,
stride=1, padding=0, dilation=1, bias=True, priors=None):
super(BBBConv2d, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
self.stride = stride
self.padding = padding
self.dilation = dilation
self.groups = 1
self.use_bias = bias
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if priors is None:
priors = {
'prior_mu': 0,
'prior_sigma': 0.1,
'posterior_mu_initial': (0, 0.1),
'posterior_rho_initial': (-3, 0.1),
}
self.prior_mu = priors['prior_mu']
self.prior_sigma = priors['prior_sigma']
self.posterior_mu_initial = priors['posterior_mu_initial']
self.posterior_rho_initial = priors['posterior_rho_initial']
self.W_mu = Parameter(torch.empty((out_channels, in_channels, *self.kernel_size), device=self.device))
self.W_rho = Parameter(torch.empty((out_channels, in_channels, *self.kernel_size), device=self.device))
if self.use_bias:
self.bias_mu = Parameter(torch.empty((out_channels), device=self.device))
self.bias_rho = Parameter(torch.empty((out_channels), device=self.device))
else:
self.register_parameter('bias_mu', None)
self.register_parameter('bias_rho', None)
self.reset_parameters()
def reset_parameters(self):
self.W_mu.data.normal_(*self.posterior_mu_initial)
self.W_rho.data.normal_(*self.posterior_rho_initial)
if self.use_bias:
self.bias_mu.data.normal_(*self.posterior_mu_initial)
self.bias_rho.data.normal_(*self.posterior_rho_initial)
def forward(self, input, sample=True):
if self.training or sample:
W_eps = torch.empty(self.W_mu.size()).normal_(0, 1).to(self.device)
self.W_sigma = torch.log1p(torch.exp(self.W_rho))
weight = self.W_mu + W_eps * self.W_sigma
if self.use_bias:
bias_eps = torch.empty(self.bias_mu.size()).normal_(0, 1).to(self.device)
self.bias_sigma = torch.log1p(torch.exp(self.bias_rho))
bias = self.bias_mu + bias_eps * self.bias_sigma
else:
bias = None
else:
weight = self.W_mu
bias = self.bias_mu if self.use_bias else None
return F.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
def kl_loss(self):
kl = KL_DIV(self.prior_mu, self.prior_sigma, self.W_mu, self.W_sigma)
if self.use_bias:
kl += KL_DIV(self.prior_mu, self.prior_sigma, self.bias_mu, self.bias_sigma)
return kl

76
layers/BBB/BBBLinear.py Executable file

@@ -0,0 +1,76 @@
import sys
sys.path.append("..")
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from metrics import calculate_kl as KL_DIV
from ..misc import ModuleWrapper
class BBBLinear(ModuleWrapper):
def __init__(self, in_features, out_features, bias=True, priors=None):
super(BBBLinear, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.use_bias = bias
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if priors is None:
priors = {
'prior_mu': 0,
'prior_sigma': 0.1,
'posterior_mu_initial': (0, 0.1),
'posterior_rho_initial': (-3, 0.1),
}
self.prior_mu = priors['prior_mu']
self.prior_sigma = priors['prior_sigma']
self.posterior_mu_initial = priors['posterior_mu_initial']
self.posterior_rho_initial = priors['posterior_rho_initial']
self.W_mu = Parameter(torch.empty((out_features, in_features), device=self.device))
self.W_rho = Parameter(torch.empty((out_features, in_features), device=self.device))
if self.use_bias:
self.bias_mu = Parameter(torch.empty((out_features), device=self.device))
self.bias_rho = Parameter(torch.empty((out_features), device=self.device))
else:
self.register_parameter('bias_mu', None)
self.register_parameter('bias_rho', None)
self.reset_parameters()
def reset_parameters(self):
self.W_mu.data.normal_(*self.posterior_mu_initial)
self.W_rho.data.normal_(*self.posterior_rho_initial)
if self.use_bias:
self.bias_mu.data.normal_(*self.posterior_mu_initial)
self.bias_rho.data.normal_(*self.posterior_rho_initial)
def forward(self, input, sample=True):
if self.training or sample:
W_eps = torch.empty(self.W_mu.size()).normal_(0, 1).to(self.device)
self.W_sigma = torch.log1p(torch.exp(self.W_rho))
weight = self.W_mu + W_eps * self.W_sigma
if self.use_bias:
bias_eps = torch.empty(self.bias_mu.size()).normal_(0, 1).to(self.device)
self.bias_sigma = torch.log1p(torch.exp(self.bias_rho))
bias = self.bias_mu + bias_eps * self.bias_sigma
else:
bias = None
else:
weight = self.W_mu
bias = self.bias_mu if self.use_bias else None
return F.linear(input, weight, bias)
def kl_loss(self):
kl = KL_DIV(self.prior_mu, self.prior_sigma, self.W_mu, self.W_sigma)
if self.use_bias:
kl += KL_DIV(self.prior_mu, self.prior_sigma, self.bias_mu, self.bias_sigma)
return kl
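
For reference, both BBB layers above (BBBConv2d and BBBLinear) draw a fresh weight sample on every stochastic forward pass using the standard reparameterization

\sigma = \log(1 + e^{\rho}) = \mathrm{softplus}(\rho), \qquad \epsilon \sim \mathcal{N}(0, 1), \qquad W = \mu + \sigma \odot \epsilon,

and kl_loss() sums the closed-form Gaussian KL from calculate_kl in metrics.py over the weight (and bias) distributions.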

86
layers/BBB_LRT/BBBConv.py Executable file

@@ -0,0 +1,86 @@
import sys
sys.path.append("..")
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import utils
from metrics import calculate_kl as KL_DIV
from ..misc import ModuleWrapper
class BBBConv2d(ModuleWrapper):
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
padding=0, dilation=1, bias=True, priors=None):
super(BBBConv2d, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = (kernel_size, kernel_size)
self.stride = stride
self.padding = padding
self.dilation = dilation
self.groups = 1
self.use_bias = bias
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if priors is None:
priors = {
'prior_mu': 0,
'prior_sigma': 0.1,
'posterior_mu_initial': (0, 0.1),
'posterior_rho_initial': (-3, 0.1),
}
self.prior_mu = priors['prior_mu']
self.prior_sigma = priors['prior_sigma']
self.posterior_mu_initial = priors['posterior_mu_initial']
self.posterior_rho_initial = priors['posterior_rho_initial']
self.W_mu = Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size))
self.W_rho = Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size))
if self.use_bias:
self.bias_mu = Parameter(torch.Tensor(out_channels))
self.bias_rho = Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias_mu', None)
self.register_parameter('bias_rho', None)
self.reset_parameters()
def reset_parameters(self):
self.W_mu.data.normal_(*self.posterior_mu_initial)
self.W_rho.data.normal_(*self.posterior_rho_initial)
if self.use_bias:
self.bias_mu.data.normal_(*self.posterior_mu_initial)
self.bias_rho.data.normal_(*self.posterior_rho_initial)
def forward(self, x, sample=True):
self.W_sigma = torch.log1p(torch.exp(self.W_rho))
if self.use_bias:
self.bias_sigma = torch.log1p(torch.exp(self.bias_rho))
bias_var = self.bias_sigma ** 2
else:
self.bias_sigma = bias_var = None
act_mu = F.conv2d(
x, self.W_mu, self.bias_mu, self.stride, self.padding, self.dilation, self.groups)
act_var = 1e-16 + F.conv2d(
x ** 2, self.W_sigma ** 2, bias_var, self.stride, self.padding, self.dilation, self.groups)
act_std = torch.sqrt(act_var)
if self.training or sample:
eps = torch.empty(act_mu.size()).normal_(0, 1).to(self.device)
return act_mu + act_std * eps
else:
return act_mu
def kl_loss(self):
kl = KL_DIV(self.prior_mu, self.prior_sigma, self.W_mu, self.W_sigma)
if self.use_bias:
kl += KL_DIV(self.prior_mu, self.prior_sigma, self.bias_mu, self.bias_sigma)
return kl

78
layers/BBB_LRT/BBBLinear.py Executable file

@@ -0,0 +1,78 @@
import sys
sys.path.append("..")
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import utils
from metrics import calculate_kl as KL_DIV
from ..misc import ModuleWrapper
class BBBLinear(ModuleWrapper):
def __init__(self, in_features, out_features, bias=True, priors=None):
super(BBBLinear, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.use_bias = bias
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if priors is None:
priors = {
'prior_mu': 0,
'prior_sigma': 0.1,
'posterior_mu_initial': (0, 0.1),
'posterior_rho_initial': (-3, 0.1),
}
self.prior_mu = priors['prior_mu']
self.prior_sigma = priors['prior_sigma']
self.posterior_mu_initial = priors['posterior_mu_initial']
self.posterior_rho_initial = priors['posterior_rho_initial']
self.W_mu = Parameter(torch.Tensor(out_features, in_features))
self.W_rho = Parameter(torch.Tensor(out_features, in_features))
if self.use_bias:
self.bias_mu = Parameter(torch.Tensor(out_features))
self.bias_rho = Parameter(torch.Tensor(out_features))
else:
self.register_parameter('bias_mu', None)
self.register_parameter('bias_rho', None)
self.reset_parameters()
def reset_parameters(self):
self.W_mu.data.normal_(*self.posterior_mu_initial)
self.W_rho.data.normal_(*self.posterior_rho_initial)
if self.use_bias:
self.bias_mu.data.normal_(*self.posterior_mu_initial)
self.bias_rho.data.normal_(*self.posterior_rho_initial)
def forward(self, x, sample=True):
self.W_sigma = torch.log1p(torch.exp(self.W_rho))
if self.use_bias:
self.bias_sigma = torch.log1p(torch.exp(self.bias_rho))
bias_var = self.bias_sigma ** 2
else:
self.bias_sigma = bias_var = None
act_mu = F.linear(x, self.W_mu, self.bias_mu)
act_var = 1e-16 + F.linear(x ** 2, self.W_sigma ** 2, bias_var)
act_std = torch.sqrt(act_var)
if self.training or sample:
eps = torch.empty(act_mu.size()).normal_(0, 1).to(self.device)
return act_mu + act_std * eps
else:
return act_mu
def kl_loss(self):
kl = KL_DIV(self.prior_mu, self.prior_sigma, self.W_mu, self.W_sigma)
if self.use_bias:
kl += KL_DIV(self.prior_mu, self.prior_sigma, self.bias_mu, self.bias_sigma)
return kl
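
The BBB_LRT layers above use the local reparameterization trick instead: the noise is injected into the pre-activations rather than the weights. For the linear layer the forward pass computes

\mu_a = x W_\mu^{\top} + b_\mu, \qquad \sigma_a^2 = x^{2} (\sigma_W^{2})^{\top} + \sigma_b^{2} + 10^{-16}, \qquad a = \mu_a + \sigma_a \odot \epsilon, \quad \epsilon \sim \mathcal{N}(0, 1),

with conv2d in place of the linear map in BBBConv2d; the KL term is the same closed-form Gaussian KL as in the BBB layers.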

35
layers/misc.py Executable file

@@ -0,0 +1,35 @@
from torch import nn
class ModuleWrapper(nn.Module):
"""Wrapper for nn.Module with support for arbitrary flags and a universal forward pass"""
def __init__(self):
super(ModuleWrapper, self).__init__()
def set_flag(self, flag_name, value):
setattr(self, flag_name, value)
for m in self.children():
if hasattr(m, 'set_flag'):
m.set_flag(flag_name, value)
def forward(self, x):
for module in self.children():
x = module(x)
kl = 0.0
for module in self.modules():
if hasattr(module, 'kl_loss'):
kl = kl + module.kl_loss()
return x, kl
class FlattenLayer(ModuleWrapper):
def __init__(self, num_features):
super(FlattenLayer, self).__init__()
self.num_features = num_features
def forward(self, x):
return x.view(-1, self.num_features)

203
main_bayesian.py Executable file

@@ -0,0 +1,203 @@
from __future__ import print_function
import os
import data
import utils
import torch
import pickle
import metrics
import numpy as np
from datetime import datetime
from torch.nn import functional as F
from torch.optim import Adam, lr_scheduler
from models.BayesianModels.BayesianLeNet import BBBLeNet
from models.BayesianModels.BayesianAlexNet import BBBAlexNet
from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
from stopping_crit import earlyStopping, energyBound, accuracyBound
with (open("configuration.pkl", "rb")) as file:
while True:
try:
cfg = pickle.load(file)
except EOFError:
break
# CUDA settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def getModel(net_type, inputs, outputs, priors, layer_type, activation_type):
if (net_type == 'lenet'):
return BBBLeNet(outputs, inputs, priors, layer_type, activation_type,
wide=cfg["model"]["size"])
elif (net_type == 'alexnet'):
return BBBAlexNet(outputs, inputs, priors, layer_type, activation_type)
elif (net_type == '3conv3fc'):
return BBB3Conv3FC(outputs, inputs, priors, layer_type,
activation_type)
else:
raise ValueError('Network should be either [LeNet / AlexNet\
/ 3Conv3FC]')
def train_model(net, optimizer, criterion, trainloader, num_ens=1,
beta_type=0.1, epoch=None, num_epochs=None):
net.train()
training_loss = 0.0
accs = []
kl_list = []
for i, (inputs, labels) in enumerate(trainloader, 1):
optimizer.zero_grad()
inputs, labels = inputs.to(device), labels.to(device)
outputs = torch.zeros(inputs.shape[0], net.num_classes,
num_ens).to(device)
kl = 0.0
for j in range(num_ens):
net_out, _kl = net(inputs)
kl += _kl
outputs[:, :, j] = F.log_softmax(net_out, dim=1)
kl = kl / num_ens
kl_list.append(kl.item())
log_outputs = utils.logmeanexp(outputs, dim=2)
beta = metrics.get_beta(i-1, len(trainloader), beta_type,
epoch, num_epochs)
loss = criterion(log_outputs, labels, kl, beta)
loss.backward()
optimizer.step()
accs.append(metrics.acc(log_outputs.data, labels))
training_loss += loss.cpu().data.numpy()
return training_loss/len(trainloader), np.mean(accs), np.mean(kl_list)
def validate_model(net, criterion, validloader, num_ens=1, beta_type=0.1,
epoch=None, num_epochs=None):
"""Calculate ensemble accuracy and NLL Loss"""
net.train()
valid_loss = 0.0
accs = []
for i, (inputs, labels) in enumerate(validloader):
inputs, labels = inputs.to(device), labels.to(device)
outputs = torch.zeros(inputs.shape[0], net.num_classes,
num_ens).to(device)
kl = 0.0
for j in range(num_ens):
net_out, _kl = net(inputs)
kl += _kl
outputs[:, :, j] = F.log_softmax(net_out, dim=1).data
log_outputs = utils.logmeanexp(outputs, dim=2)
beta = metrics.get_beta(i-1, len(validloader), beta_type,
epoch, num_epochs)
valid_loss += criterion(log_outputs, labels, kl, beta).item()
accs.append(metrics.acc(log_outputs, labels))
return valid_loss/len(validloader), np.mean(accs)
def run(dataset, net_type):
# Hyper Parameter settings
layer_type = cfg["model"]["layer_type"]
activation_type = cfg["model"]["activation_type"]
priors = cfg["model"]["priors"]
train_ens = cfg["model"]["train_ens"]
valid_ens = cfg["model"]["valid_ens"]
n_epochs = cfg["model"]["n_epochs"]
lr_start = cfg["model"]["lr"]
num_workers = cfg["model"]["num_workers"]
valid_size = cfg["model"]["valid_size"]
batch_size = cfg["model"]["batch_size"]
beta_type = cfg["model"]["beta_type"]
trainset, testset, inputs, outputs = data.getDataset(dataset)
train_loader, valid_loader, test_loader = data.getDataloader(
trainset, testset, valid_size, batch_size, num_workers)
net = getModel(net_type, inputs, outputs, priors, layer_type,
activation_type).to(device)
ckpt_dir = f'checkpoints/{dataset}/bayesian'
ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}_{layer_type}\
_{activation_type}_{cfg["model"]["size"]}.pt'
if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir, exist_ok=True)
stp = cfg["stopping_crit"]
sav = cfg["save"]
criterion = metrics.ELBO(len(trainset)).to(device)
optimizer = Adam(net.parameters(), lr=lr_start)
lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6,
verbose=True)
# valid_loss_max = np.Inf
# if stp == 2:
early_stop = []
train_data = []
for epoch in range(n_epochs): # loop over the dataset multiple times
train_loss, train_acc, train_kl = train_model(net, optimizer,
criterion,
train_loader,
num_ens=train_ens,
beta_type=beta_type,
epoch=epoch,
num_epochs=n_epochs)
valid_loss, valid_acc = validate_model(net, criterion, valid_loader,
num_ens=valid_ens,
beta_type=beta_type,
epoch=epoch,
num_epochs=n_epochs)
lr_sched.step(valid_loss)
train_data.append([epoch, train_loss, train_acc, valid_loss,
valid_acc])
print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy:\
{:.4f} \tValidation Loss: {:.4f} \tValidation Accuracy:\
{:.4f} \ttrain_kl_div: {:.4f}'.format(epoch, train_loss,
train_acc, valid_loss,
valid_acc, train_kl))
if stp == 2:
print('Using early stopping')
if earlyStopping(early_stop, valid_acc, epoch,
cfg["model"]["sens"]) == 1:
break
elif stp == 3:
print('Using energy bound')
if energyBound(cfg["model"]["energy_thrs"]) == 1:
break
elif stp == 4:
print('Using accuracy bound')
if accuracyBound(train_acc, cfg["model"]["acc_thrs"]) == 1:
break
else:
print('Training for {} epochs'.format(cfg["model"]["n_epochs"]))
if sav == 1:
# save model when finished
if epoch == cfg["model"]["n_epochs"]-1:
torch.save(net.state_dict(), ckpt_name)
with open("bayes_exp_data_"+str(cfg["model"]["size"])+".pkl", 'wb') as f:
pickle.dump(train_data, f)
if __name__ == '__main__':
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Initial Time =", current_time)
print("Using bayesian model of size: {}".format(cfg["model"]["size"]))
run(cfg["data"], cfg["model"]["net_type"])
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Final Time =", current_time)

150
main_frequentist.py Executable file

@@ -0,0 +1,150 @@
from __future__ import print_function
import os
import data
import torch
import pickle
import metrics
import numpy as np
import torch.nn as nn
from datetime import datetime
from torch.optim import Adam, lr_scheduler
from models.NonBayesianModels.LeNet import LeNet
from models.NonBayesianModels.AlexNet import AlexNet
from stopping_crit import earlyStopping, energyBound, accuracyBound
from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC
with (open("configuration.pkl", "rb")) as file:
while True:
try:
cfg = pickle.load(file)
except EOFError:
break
# CUDA settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def getModel(net_type, inputs, outputs, wide=cfg["model"]["size"]):
if (net_type == 'lenet'):
return LeNet(outputs, inputs, wide)
elif (net_type == 'alexnet'):
return AlexNet(outputs, inputs)
elif (net_type == '3conv3fc'):
return ThreeConvThreeFC(outputs, inputs)
else:
raise ValueError('Network should be either [LeNet / AlexNet / \
3Conv3FC]')
def train_model(net, optimizer, criterion, train_loader):
train_loss = 0.0
net.train()
accs = []
for datas, target in train_loader:
data, target = datas.to(device), target.to(device)
optimizer.zero_grad()
output = net(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
train_loss += loss.item()*data.size(0)
accs.append(metrics.acc(output.detach(), target))
return train_loss, np.mean(accs)
def validate_model(net, criterion, valid_loader):
valid_loss = 0.0
net.eval()
accs = []
for datas, target in valid_loader:
data, target = datas.to(device), target.to(device)
output = net(data)
loss = criterion(output, target)
valid_loss += loss.item()*data.size(0)
accs.append(metrics.acc(output.detach(), target))
return valid_loss, np.mean(accs)
def run(dataset, net_type):
# Hyper Parameter settings
n_epochs = cfg["model"]["n_epochs"]
lr = cfg["model"]["lr"]
num_workers = cfg["model"]["num_workers"]
valid_size = cfg["model"]["valid_size"]
batch_size = cfg["model"]["batch_size"]
trainset, testset, inputs, outputs = data.getDataset(dataset)
train_loader, valid_loader, test_loader = data.getDataloader(
trainset, testset, valid_size, batch_size, num_workers)
net = getModel(net_type, inputs, outputs).to(device)
ckpt_dir = f'checkpoints/{dataset}/frequentist'
ckpt_name = f'checkpoints/{dataset}/frequentist/model\
_{net_type}_{cfg["model"]["size"]}.pt'
if not os.path.exists(ckpt_dir):
os.makedirs(ckpt_dir, exist_ok=True)
stp = cfg["stopping_crit"]
sav = cfg["save"]
criterion = nn.CrossEntropyLoss()
optimizer = Adam(net.parameters(), lr=lr)
lr_sched = lr_scheduler.ReduceLROnPlateau(optimizer, patience=6,
verbose=True)
# valid_loss_min = np.Inf
# if stp == 2:
early_stop = []
train_data = []
for epoch in range(1, n_epochs+1):
train_loss, train_acc = train_model(net, optimizer, criterion,
train_loader)
valid_loss, valid_acc = validate_model(net, criterion, valid_loader)
lr_sched.step(valid_loss)
train_loss = train_loss/len(train_loader.dataset)
valid_loss = valid_loss/len(valid_loader.dataset)
train_data.append([epoch, train_loss, train_acc, valid_loss,
valid_acc])
print('Epoch: {} \tTraining Loss: {: .4f} \tTraining Accuracy: {: .4f}'
' \tValidation Loss: {: .4f} \tValidation Accuracy: {: .4f}'.format(
epoch, train_loss, train_acc, valid_loss, valid_acc))
if stp == 2:
# print('Using early stopping')
if earlyStopping(early_stop, valid_acc, epoch,
cfg["model"]["sens"]) == 1:
break
elif stp == 3:
# print('Using energy bound')
if energyBound(cfg["model"]["energy_thrs"]) == 1:
break
elif stp == 4:
# print('Using accuracy bound')
if accuracyBound(train_acc,
cfg["model"]["acc_thrs"]) == 1:
break
else:
print('Training for {} epochs'.format(cfg["model"]["n_epochs"]))
if sav == 1:
# save model when finished
if epoch == n_epochs:
torch.save(net.state_dict(), ckpt_name)
with open("freq_exp_data_"+str(cfg["model"]["size"])+".pkl", 'wb') as f:
pickle.dump(train_data, f)
if __name__ == '__main__':
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Initial Time =", current_time)
print("Using frequentist model of size: {}".format(cfg["model"]["size"]))
run(cfg["data"], cfg["model"]["net_type"])
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Final Time =", current_time)

8
mem_free.sh Executable file
View File

@ -0,0 +1,8 @@
#!/usr/bin/env bash
while true
do
free -h >> $1
done

46
metrics.py Executable file
View File

@ -0,0 +1,46 @@
import numpy as np
import torch.nn.functional as F
from torch import nn
import torch
class ELBO(nn.Module):
def __init__(self, train_size):
super(ELBO, self).__init__()
self.train_size = train_size
def forward(self, input, target, kl, beta):
assert not target.requires_grad
return F.nll_loss(input, target, reduction='mean') * self.train_size + beta * kl
# def lr_linear(epoch_num, decay_start, total_epochs, start_value):
# if epoch_num < decay_start:
# return start_value
# return start_value*float(total_epochs-epoch_num)/float(total_epochs-decay_start)
def acc(outputs, targets):
return np.mean(outputs.cpu().numpy().argmax(axis=1) == targets.data.cpu().numpy())
def calculate_kl(mu_q, sig_q, mu_p, sig_p):
kl = 0.5 * (2 * torch.log(sig_p / sig_q) - 1 + (sig_q / sig_p).pow(2) + ((mu_p - mu_q) / sig_p).pow(2)).sum()
return kl
def get_beta(batch_idx, m, beta_type, epoch, num_epochs):
if type(beta_type) is float:
return beta_type
if beta_type == "Blundell":
beta = 2 ** (m - (batch_idx + 1)) / (2 ** m - 1)
elif beta_type == "Soenderby":
if epoch is None or num_epochs is None:
raise ValueError('Soenderby method requires both epoch and num_epochs to be passed.')
beta = min(epoch / (num_epochs // 4), 1)
elif beta_type == "Standard":
beta = 1 / m
else:
beta = 0
return beta
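For context, a minimal sketch of how ELBO and get_beta fit together in one training epoch. It assumes a Bayesian net that returns (logits, kl), as the BBB models below do, and that log_softmax is applied before the NLL term, since ELBO uses F.nll_loss; this is a sketch, not the training loop used in this repo.

# Sketch only, under the assumptions stated above.
import torch.nn.functional as F
from metrics import ELBO, get_beta

def train_one_epoch(net, optimizer, train_loader, epoch, num_epochs, beta_type="Blundell"):
    criterion = ELBO(len(train_loader.dataset))
    m = len(train_loader)  # number of minibatches
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        logits, kl = net(data)
        log_probs = F.log_softmax(logits, dim=1)
        # Blundell weighting: beta_i = 2^(m-i) / (2^m - 1) for minibatch i = batch_idx + 1
        beta = get_beta(batch_idx, m, beta_type, epoch, num_epochs)
        loss = criterion(log_probs, target, kl, beta)
        loss.backward()
        optimizer.step()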

55
models/BayesianModels/Bayesian3Conv3FC.py Executable file
View File

@ -0,0 +1,55 @@
import math
import torch.nn as nn
from layers import BBB_Linear, BBB_Conv2d
from layers import BBB_LRT_Linear, BBB_LRT_Conv2d
from layers import FlattenLayer, ModuleWrapper
class BBB3Conv3FC(ModuleWrapper):
"""
Simple Neural Network having 3 Convolution
and 3 FC layers with Bayesian layers.
"""
def __init__(self, outputs, inputs, priors, layer_type='lrt', activation_type='softplus'):
super(BBB3Conv3FC, self).__init__()
self.num_classes = outputs
self.layer_type = layer_type
self.priors = priors
if layer_type=='lrt':
BBBLinear = BBB_LRT_Linear
BBBConv2d = BBB_LRT_Conv2d
elif layer_type=='bbb':
BBBLinear = BBB_Linear
BBBConv2d = BBB_Conv2d
else:
raise ValueError("Undefined layer_type")
if activation_type=='softplus':
self.act = nn.Softplus
elif activation_type=='relu':
self.act = nn.ReLU
else:
raise ValueError("Only softplus or relu supported")
self.conv1 = BBBConv2d(inputs, 32, 5, padding=2, bias=True, priors=self.priors)
self.act1 = self.act()
self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
self.conv2 = BBBConv2d(32, 64, 5, padding=2, bias=True, priors=self.priors)
self.act2 = self.act()
self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
self.conv3 = BBBConv2d(64, 128, 5, padding=1, bias=True, priors=self.priors)
self.act3 = self.act()
self.pool3 = nn.MaxPool2d(kernel_size=3, stride=2)
self.flatten = FlattenLayer(2 * 2 * 128)
self.fc1 = BBBLinear(2 * 2 * 128, 1000, bias=True, priors=self.priors)
self.act4 = self.act()
self.fc2 = BBBLinear(1000, 1000, bias=True, priors=self.priors)
self.act5 = self.act()
self.fc3 = BBBLinear(1000, outputs, bias=True, priors=self.priors)
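A quick shape check for the FlattenLayer(2 * 2 * 128) size used above, assuming 32x32 inputs, the priors dictionary format from run_service.py below, and that ModuleWrapper makes forward() return (logits, kl); a sketch only, since the layer classes come from this repo's layers package.

# Sketch: instantiate the Bayesian 3Conv3FC on a CIFAR-sized batch.
import torch

priors = {'prior_mu': 0, 'prior_sigma': 0.1,
          'posterior_mu_initial': (0, 0.1), 'posterior_rho_initial': (-5, 0.1)}
net = BBB3Conv3FC(outputs=10, inputs=3, priors=priors)
x = torch.randn(4, 3, 32, 32)      # CIFAR-10 sized batch
logits, kl = net(x)
print(logits.shape)                 # expected: torch.Size([4, 10])
# Spatial sizes: 32 -> conv(5,p2) 32 -> pool(3,2) 15 -> conv(5,p2) 15 -> pool 7
#                   -> conv(5,p1) 5 -> pool 2, hence 2 * 2 * 128 features.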

53
models/BayesianModels/BayesianAlexNet.py Executable file
View File

@ -0,0 +1,53 @@
import torch.nn as nn
import math
from layers import BBB_Linear, BBB_Conv2d
from layers import BBB_LRT_Linear, BBB_LRT_Conv2d
from layers import FlattenLayer, ModuleWrapper
class BBBAlexNet(ModuleWrapper):
'''The architecture of AlexNet with Bayesian Layers'''
def __init__(self, outputs, inputs, priors, layer_type='lrt', activation_type='softplus'):
super(BBBAlexNet, self).__init__()
self.num_classes = outputs
self.layer_type = layer_type
self.priors = priors
if layer_type=='lrt':
BBBLinear = BBB_LRT_Linear
BBBConv2d = BBB_LRT_Conv2d
elif layer_type=='bbb':
BBBLinear = BBB_Linear
BBBConv2d = BBB_Conv2d
else:
raise ValueError("Undefined layer_type")
if activation_type=='softplus':
self.act = nn.Softplus
elif activation_type=='relu':
self.act = nn.ReLU
else:
raise ValueError("Only softplus or relu supported")
self.conv1 = BBBConv2d(inputs, 64, 11, stride=4, padding=5, bias=True, priors=self.priors)
self.act1 = self.act()
self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv2 = BBBConv2d(64, 192, 5, padding=2, bias=True, priors=self.priors)
self.act2 = self.act()
self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv3 = BBBConv2d(192, 384, 3, padding=1, bias=True, priors=self.priors)
self.act3 = self.act()
self.conv4 = BBBConv2d(384, 256, 3, padding=1, bias=True, priors=self.priors)
self.act4 = self.act()
self.conv5 = BBBConv2d(256, 128, 3, padding=1, bias=True, priors=self.priors)
self.act5 = self.act()
self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.flatten = FlattenLayer(1 * 1 * 128)
self.classifier = BBBLinear(1 * 1 * 128, outputs, bias=True, priors=self.priors)

49
models/BayesianModels/BayesianLeNet.py Executable file
View File

@ -0,0 +1,49 @@
import math
import torch.nn as nn
from layers import BBB_Linear, BBB_Conv2d
from layers import BBB_LRT_Linear, BBB_LRT_Conv2d
from layers import FlattenLayer, ModuleWrapper
class BBBLeNet(ModuleWrapper):
'''The architecture of LeNet with Bayesian Layers'''
def __init__(self, outputs, inputs, priors, layer_type='lrt', activation_type='softplus',wide=2):
super(BBBLeNet, self).__init__()
self.num_classes = outputs
self.layer_type = layer_type
self.priors = priors
if layer_type=='lrt':
BBBLinear = BBB_LRT_Linear
BBBConv2d = BBB_LRT_Conv2d
elif layer_type=='bbb':
BBBLinear = BBB_Linear
BBBConv2d = BBB_Conv2d
else:
raise ValueError("Undefined layer_type")
if activation_type=='softplus':
self.act = nn.Softplus
elif activation_type=='relu':
self.act = nn.ReLU
else:
raise ValueError("Only softplus or relu supported")
self.conv1 = BBBConv2d(inputs, 6*wide, 5, padding=0, bias=True, priors=self.priors)
self.act1 = self.act()
self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv2 = BBBConv2d(6*wide, 16*wide, 5, padding=0, bias=True, priors=self.priors)
self.act2 = self.act()
self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.flatten = FlattenLayer(5 * 5 * 16*wide)
self.fc1 = BBBLinear(5 * 5 * 16*wide, 120*wide, bias=True, priors=self.priors)
self.act3 = self.act()
self.fc2 = BBBLinear(120*wide, 84, bias=True, priors=self.priors)
self.act4 = self.act()
self.fc3 = BBBLinear(84, outputs, bias=True, priors=self.priors)
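The wide argument scales this LeNet uniformly (integer values assumed here; the frequentist LeNet further down wraps the products in ceil() and so also tolerates fractional widths). A small sketch of the sizes it implies:

# Sketch: layer sizes implied by the constructor above for a few widths.
for wide in (1, 2, 3):
    conv1_ch, conv2_ch = 6 * wide, 16 * wide
    fc1_in, fc1_out = 5 * 5 * 16 * wide, 120 * wide
    print(wide, conv1_ch, conv2_ch, fc1_in, fc1_out)
# Input 32x32: conv(5) 28 -> pool 14 -> conv(5) 10 -> pool 5,
# which is where FlattenLayer(5 * 5 * 16 * wide) comes from.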

40
models/NonBayesianModels/AlexNet.py Executable file
View File

@ -0,0 +1,40 @@
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
def conv_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
#nn.init.xavier_uniform(m.weight, gain=np.sqrt(2))
nn.init.normal_(m.weight, mean=0, std=1)
nn.init.constant_(m.bias, 0)
class AlexNet(nn.Module):
def __init__(self, num_classes, inputs=3):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(inputs, 64, kernel_size=11, stride=4, padding=5),
nn.ReLU(inplace=True),
nn.Dropout(p=0.5),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(64, 192, kernel_size=5, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(192, 384, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Dropout(p=0.5),
nn.Conv2d(384, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Dropout(p=0.5),
nn.MaxPool2d(kernel_size=2, stride=2),
)
self.classifier = nn.Linear(256, num_classes)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
return x

40
models/NonBayesianModels/LeNet.py Executable file
View File

@ -0,0 +1,40 @@
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from math import ceil
def conv_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
#nn.init.xavier_uniform(m.weight, gain=np.sqrt(2))
nn.init.normal_(m.weight, mean=0, std=1)
nn.init.constant_(m.bias, 0)
class LeNet(nn.Module):
def __init__(self, num_classes, inputs=3, wide=2):
super(LeNet, self).__init__()
self.conv1 = nn.Conv2d(inputs, ceil(6*wide), 5)
self.conv2 = nn.Conv2d(ceil(6*wide), ceil(16*wide), 5)
self.fc1 = nn.Linear(ceil(16*5*5*wide), ceil(120*wide))
self.fc2 = nn.Linear(ceil(120*wide), 84)
self.fc3 = nn.Linear(84, num_classes)
def forward(self, x):
out = F.relu(self.conv1(x))
#print(out.size())
out = F.max_pool2d(out, 2)
#print(out.size())
out = F.relu(self.conv2(out))
#print(out.size())
out = F.max_pool2d(out, 2)
#print(out.size())
out = out.view(out.size(0), -1)
#print(out.size())
out = F.relu(self.fc1(out))
#print(out.size())
out = F.relu(self.fc2(out))
#print(out.size())
out = self.fc3(out)
#print(out.size())
#print("END")
return out

42
models/NonBayesianModels/ThreeConvThreeFC.py Executable file
View File

@ -0,0 +1,42 @@
import torch.nn as nn
from layers.misc import FlattenLayer
def conv_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
#nn.init.xavier_uniform(m.weight, gain=np.sqrt(2))
nn.init.normal_(m.weight, mean=0, std=1)
nn.init.constant_(m.bias, 0)
class ThreeConvThreeFC(nn.Module):
"""
To train on CIFAR-10:
https://arxiv.org/pdf/1207.0580.pdf
"""
def __init__(self, outputs, inputs):
super(ThreeConvThreeFC, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(inputs, 32, 5, stride=1, padding=2),
nn.Softplus(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(32, 64, 5, stride=1, padding=2),
nn.Softplus(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(64, 128, 5, stride=1, padding=1),
nn.Softplus(),
nn.MaxPool2d(kernel_size=3, stride=2),
)
self.classifier = nn.Sequential(
FlattenLayer(2 * 2 * 128),
nn.Linear(2 * 2 * 128, 1000),
nn.Softplus(),
nn.Linear(1000, 1000),
nn.Softplus(),
nn.Linear(1000, outputs)
)
def forward(self, x):
x = self.features(x)
x = self.classifier(x)
return x

3
radeontop.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env bash
radeontop -b 08 -d - > $1

17
read_pickle.py Executable file
View File

@ -0,0 +1,17 @@
import pickle
gpu_data = []
with (open("bayesian_wattdata_3.pkl", "rb")) as openfile:
while True:
try:
gpu_data = pickle.load(openfile)
except EOFError:
break
#exp_data = []
#with (open("bayes_exp_data_6.pkl", "rb")) as openfile:
# while True:
# try:
# exp_data = pickle.load(openfile)
# except EOFError:
# break

136
run_service.py Executable file
View File

@ -0,0 +1,136 @@
import psutil
import pickle
import arguments
from time import sleep
#from pathlib import Path
import subprocess as sub
from arguments import makeArguments
def kill(proc_pid):
process = psutil.Process(proc_pid)
for proc in process.children(recursive=True):
proc.kill()
process.kill()
cfg = {
"model": {"net_type": None, "type": None, "size": None, "layer_type":
"lrt", "activation_type": "softplus", "priors": {
'prior_mu': 0,
'prior_sigma': 0.1,
'posterior_mu_initial': (0, 0.1), # (mean,std) normal_
'posterior_rho_initial': (-5, 0.1), # (mean,std) normal_
},
"n_epochs": 100,
"sens": 1e-9,
"energy_thrs": 10000,
"acc_thrs": 0.99,
"lr": 0.001,
"num_workers": 4,
"valid_size": 0.2,
"batch_size": 256,
"train_ens": 1,
"valid_ens": 1,
"beta_type": 0.1, # 'Blundell','Standard',etc.
# Use float for const value
},
"data": None,
"stopping_crit": None,
"save": None,
"pickle_path": None,
}
args = makeArguments(arguments.all_args)
check = list(args.values())
if all(v is None for v in check):
raise Exception("One argument required")
elif None in check:
if args['f'] is not None:
cmd = ["python", "main_frequentist.py"]
cfg["model"]["type"] = "freq"
elif args['b'] is not None:
cmd = ["python", "main_bayesian.py"]
cfg["model"]["type"] = "bayes"
else:
raise Exception("Only one argument allowed")
wide = args["f"] or args["b"]
cfg["model"]["size"] = wide
cfg["data"] = args["dataset"]
cfg["model"]["net_type"] = args["net_type"]
if args['EarlyStopping']:
cfg["stopping_crit"] = 2
elif args['EnergyBound']:
cfg["stopping_crit"] = 3
elif args['AccuracyBound']:
cfg["stopping_crit"] = 4
else:
cfg["stopping_crit"] = 1
if args['Save']:
cfg["save"] = 1
else:
cfg["save"] = 0
cfg["pickle_path"] = "{}_wattdata_{}.pkl".format(cfg["model"]["type"],
cfg["model"]["size"])
with open("configuration.pkl", "wb") as f:
pickle.dump(cfg, f)
# print(args)
# print(cfg)
sleep(3)
cpu_watt = "cpu_watt.sh"
ram = "mem_free.sh"
gpu = "radeontop.sh"
#path_cpu_watt = Path(cpu_watt)
#path_ram = Path(ram)
#path_gpu = Path(gpu)
#path_cpu_watt = str(Path(cpu_watt).absolute()) + '/' + cpu_watt
#path_ram = str(Path(ram).absolute()) + '/' + ram
#path_gpu = str(Path(gpu).absolute()) + '/' + gpu
if cmd[1] == "main_frequentist.py":
cmd2 = ['./'+cpu_watt, "freq_{}_cpu_watts".format(wide)]
cmd3 = ['./'+ram, "freq_{}_ram_use".format(wide)]
cmd4 = ['./'+gpu, "freq_{}_flop_app".format(wide)]
elif cmd[1] == "main_bayesian.py":
cmd2 = ['./'+cpu_watt, "bayes_{}_cpu_watts".format(wide)]
cmd3 = ['./'+ram, "bayes_{}_ram_use".format(wide)]
cmd4 = ['./'+gpu, "bayes_{}_flop_app".format(wide)]
path = sub.check_output(['pwd'])
path = path.decode()
path = path.replace('\n', '')
startWattCounter = 'python ' + path + '/amd_sample_draw.py'
p1 = sub.Popen(cmd)
p2 = sub.Popen(startWattCounter.split(), stdin=sub.PIPE, stdout=sub.PIPE,
stderr=sub.PIPE)
p3 = sub.Popen(cmd2, stdin=sub.PIPE, stdout=sub.PIPE, stderr=sub.PIPE)
p4 = sub.Popen(cmd3, stdin=sub.PIPE, stdout=sub.PIPE, stderr=sub.PIPE)
p5 = sub.Popen(cmd4, stdin=sub.PIPE, stdout=sub.PIPE, stderr=sub.PIPE)
retcode = p1.wait()
print("Return code: {}".format(retcode))
p1.kill()
kill(p2.pid)
kill(p3.pid)
kill(p4.pid)
kill(p5.pid)

45
stopping_crit.py Executable file
View File

@ -0,0 +1,45 @@
import pickle
from time import sleep
from gpu_power_func import total_watt_consumed
with (open("configuration.pkl", "rb")) as file:
while True:
try:
cfg = pickle.load(file)
except EOFError:
break
def earlyStopping(early_stopping: list, train_acc: float, epoch: int, sensitivity: float=1e-9):
early_stopping.append(train_acc)
if epoch % 4 == 0 and epoch > 0:
print("Value 1: {} > Value 2: {} > \
Value 3: {}".format(early_stopping[0], \
abs(early_stopping[1]-sensitivity), \
abs(early_stopping[2]-sensitivity)))
if train_acc > 0.5:
if early_stopping[0] > abs(early_stopping[1]-sensitivity) and \
early_stopping[1] > abs(early_stopping[2]-sensitivity):
print("Stopping Early")
return 1
del early_stopping[:]
return 0
def energyBound(threshold: float=100000.0):
try:
energy = total_watt_consumed(cfg["pickle_path"])
except Exception as e:
sleep(3)
energy = total_watt_consumed(cfg["pickle_path"])
print("Energy used: {}".format(energy))
if energy > threshold:
print("Energy bound achieved")
return 1
return 0
def accuracyBound(train_acc: float, threshold: float=0.99):
if train_acc >= threshold:
print("Accuracy bound achieved")
return 1
return 0
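To make the earlyStopping() window concrete, a small trace using the function above: the list is appended every epoch and inspected (then cleared) every fourth epoch, so a plateau within the sensitivity triggers the stop. The accuracy sequence below is invented for illustration.

# Sketch: hypothetical accuracy sequence fed through earlyStopping().
window = []
accuracies = [0.60, 0.62, 0.61, 0.61,   # epochs 1-4: still improving, no stop
              0.61, 0.61, 0.61, 0.61]   # epochs 5-8: flat within sensitivity
for epoch, acc in enumerate(accuracies, start=1):
    if earlyStopping(window, acc, epoch, sensitivity=1e-9) == 1:
        print("stopped at epoch", epoch)   # expected: stopped at epoch 8
        break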

98
tests/test_models.py Executable file
View File

@ -0,0 +1,98 @@
import os
import sys
import math
import pytest
import unittest
import numpy as np
import torch
import torch.nn as nn
import utils
from models.BayesianModels.Bayesian3Conv3FC import BBB3Conv3FC
from models.BayesianModels.BayesianAlexNet import BBBAlexNet
from models.BayesianModels.BayesianLeNet import BBBLeNet
from models.NonBayesianModels.AlexNet import AlexNet
from models.NonBayesianModels.LeNet import LeNet
from models.NonBayesianModels.ThreeConvThreeFC import ThreeConvThreeFC
from layers.BBBConv import BBBConv2d
from layers.BBBLinear import BBBLinear
from layers.misc import FlattenLayer
cuda_available = torch.cuda.is_available()
bayesian_models = [BBBLeNet, BBBAlexNet, BBB3Conv3FC]
non_bayesian_models = [LeNet, AlexNet, ThreeConvThreeFC]
class TestModelForwardpass:
@pytest.mark.parametrize("model", bayesian_models)
def test_cpu_bayesian(self, model):
batch_size = np.random.randint(1, 256)
batch = torch.randn((batch_size, 3, 32, 32))
net = model(10, 3, priors=None)
out = net(batch)
assert out[0].shape[0]==batch_size
@pytest.mark.parametrize("model", non_bayesian_models)
def test_cpu_frequentist(self, model):
batch_size = np.random.randint(1, 256)
batch = torch.randn((batch_size, 3, 32, 32))
net = model(10, 3)
out = net(batch)
assert out.shape[0]==batch_size
@pytest.mark.skipif(not cuda_available, reason="CUDA not available")
@pytest.mark.parametrize("model", bayesian_models)
def test_gpu_bayesian(self, model):
batch_size = np.random.randint(1, 256)
batch = torch.randn((batch_size, 3, 32, 32))
net = model(10, 3, priors=None)
if cuda_available:
net = net.cuda()
batch = batch.cuda()
out = net(batch)
assert out[0].shape[0]==batch_size
@pytest.mark.skipif(not cuda_available, reason="CUDA not available")
@pytest.mark.parametrize("model", non_bayesian_models)
def test_gpu_frequentist(self, model):
batch_size = np.random.randint(1, 256)
batch = torch.randn((batch_size, 3, 32, 32))
net = model(10, 3)
if cuda_available:
net = net.cuda()
batch = batch.cuda()
out = net(batch)
assert out.shape[0]==batch_size
class TestBayesianLayers:
def test_flatten(self):
batch_size = np.random.randint(1, 256)
batch = torch.randn((batch_size, 64, 4, 4))
layer = FlattenLayer(4 * 4 * 64)
batch = layer(batch)
assert batch.shape[0]==batch_size
assert batch.shape[1]==(4 * 4 *64)
def test_conv(self):
batch_size = np.random.randint(1, 256)
batch = torch.randn((batch_size, 16, 24, 24))
layer = BBBConv2d(16, 6, 4, alpha_shape=(1,1), padding=0, bias=False)
batch = layer(batch)
assert batch.shape[0]==batch_size
assert batch.shape[1]==6
def test_linear(self):
batch_size = np.random.randint(1, 256)
batch = torch.randn((batch_size, 128))
layer = BBBLinear(128, 64, alpha_shape=(1,1), bias=False)
batch = layer(batch)
assert batch.shape[0]==batch_size
assert batch.shape[1]==64

184
uncertainty_estimation.py Executable file
View File

@ -0,0 +1,184 @@
import argparse
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
import torchvision
from torch.nn import functional as F
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import data
from main_bayesian import getModel
import config_bayesian as cfg
# CUDA settings
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
mnist_set = None
notmnist_set = None
transform = transforms.Compose([
transforms.ToPILImage(),
transforms.Resize((32, 32)),
transforms.ToTensor(),
])
def init_dataset(notmnist_dir):
global mnist_set
global notmnist_set
mnist_set, _, _, _ = data.getDataset('MNIST')
notmnist_set = torchvision.datasets.ImageFolder(root=notmnist_dir)
def get_uncertainty_per_image(model, input_image, T=15, normalized=False):
input_image = input_image.unsqueeze(0)
input_images = input_image.repeat(T, 1, 1, 1)
net_out, _ = model(input_images)
pred = torch.mean(net_out, dim=0).cpu().detach().numpy()
if normalized:
prediction = F.softplus(net_out)
p_hat = prediction / torch.sum(prediction, dim=1).unsqueeze(1)
else:
p_hat = F.softmax(net_out, dim=1)
p_hat = p_hat.detach().cpu().numpy()
p_bar = np.mean(p_hat, axis=0)
temp = p_hat - np.expand_dims(p_bar, 0)
epistemic = np.dot(temp.T, temp) / T
epistemic = np.diag(epistemic)
aleatoric = np.diag(p_bar) - (np.dot(p_hat.T, p_hat) / T)
aleatoric = np.diag(aleatoric)
return pred, epistemic, aleatoric
def get_uncertainty_per_batch(model, batch, T=15, normalized=False):
batch_predictions = []
net_outs = []
batches = batch.unsqueeze(0).repeat(T, 1, 1, 1, 1)
preds = []
epistemics = []
aleatorics = []
for i in range(T): # for T batches
net_out, _ = model(batches[i].to(device))
net_outs.append(net_out)
if normalized:
prediction = F.softplus(net_out)
prediction = prediction / torch.sum(prediction, dim=1).unsqueeze(1)
else:
prediction = F.softmax(net_out, dim=1)
batch_predictions.append(prediction)
for sample in range(batch.shape[0]):
# for each sample in a batch
pred = torch.cat([a_batch[sample].unsqueeze(0) for a_batch in net_outs], dim=0)
pred = torch.mean(pred, dim=0)
preds.append(pred)
p_hat = torch.cat([a_batch[sample].unsqueeze(0) for a_batch in batch_predictions], dim=0).detach().cpu().numpy()
p_bar = np.mean(p_hat, axis=0)
temp = p_hat - np.expand_dims(p_bar, 0)
epistemic = np.dot(temp.T, temp) / T
epistemic = np.diag(epistemic)
epistemics.append(epistemic)
aleatoric = np.diag(p_bar) - (np.dot(p_hat.T, p_hat) / T)
aleatoric = np.diag(aleatoric)
aleatorics.append(aleatoric)
epistemic = np.vstack(epistemics) # (batch_size, categories)
aleatoric = np.vstack(aleatorics) # (batch_size, categories)
preds = torch.cat([i.unsqueeze(0) for i in preds]).cpu().detach().numpy() # (batch_size, categories)
return preds, epistemic, aleatoric
def get_sample(dataset, sample_type='mnist'):
idx = np.random.randint(len(dataset.targets))
if sample_type=='mnist':
sample = dataset.data[idx]
truth = dataset.targets[idx]
else:
path, truth = dataset.samples[idx]
sample = torch.from_numpy(np.array(Image.open(path)))
sample = sample.unsqueeze(0)
sample = transform(sample)
return sample.to(device), truth
def run(net_type, weight_path, notmnist_dir):
init_dataset(notmnist_dir)
layer_type = cfg.layer_type
activation_type = cfg.activation_type
net = getModel(net_type, 1, 10, priors=None, layer_type=layer_type, activation_type=activation_type)
net.load_state_dict(torch.load(weight_path, map_location=device))
net.train()
net.to(device)
fig = plt.figure()
fig.suptitle('Uncertainty Estimation', fontsize='x-large')
mnist_img = fig.add_subplot(321)
notmnist_img = fig.add_subplot(322)
epi_stats_norm = fig.add_subplot(323)
ale_stats_norm = fig.add_subplot(324)
epi_stats_soft = fig.add_subplot(325)
ale_stats_soft = fig.add_subplot(326)
sample_mnist, truth_mnist = get_sample(mnist_set)
pred_mnist, epi_mnist_norm, ale_mnist_norm = get_uncertainty_per_image(net, sample_mnist, T=25, normalized=True)
pred_mnist, epi_mnist_soft, ale_mnist_soft = get_uncertainty_per_image(net, sample_mnist, T=25, normalized=False)
mnist_img.imshow(sample_mnist.squeeze().cpu(), cmap='gray')
mnist_img.axis('off')
mnist_img.set_title('MNIST Truth: {} Prediction: {}'.format(int(truth_mnist), int(np.argmax(pred_mnist))))
sample_notmnist, truth_notmnist = get_sample(notmnist_set, sample_type='notmnist')
pred_notmnist, epi_notmnist_norm, ale_notmnist_norm = get_uncertainty_per_image(net, sample_notmnist, T=25, normalized=True)
pred_notmnist, epi_notmnist_soft, ale_notmnist_soft = get_uncertainty_per_image(net, sample_notmnist, T=25, normalized=False)
notmnist_img.imshow(sample_notmnist.squeeze().cpu(), cmap='gray')
notmnist_img.axis('off')
notmnist_img.set_title('notMNIST Truth: {}({}) Prediction: {}({})'.format(
int(truth_notmnist), chr(65 + truth_notmnist), int(np.argmax(pred_notmnist)), chr(65 + np.argmax(pred_notmnist))))
x = list(range(10))
data = pd.DataFrame({
'epistemic_norm': np.hstack([epi_mnist_norm, epi_notmnist_norm]),
'aleatoric_norm': np.hstack([ale_mnist_norm, ale_notmnist_norm]),
'epistemic_soft': np.hstack([epi_mnist_soft, epi_notmnist_soft]),
'aleatoric_soft': np.hstack([ale_mnist_soft, ale_notmnist_soft]),
'category': np.hstack([x, x]),
'dataset': np.hstack([['MNIST']*10, ['notMNIST']*10])
})
print(data)
sns.barplot(x='category', y='epistemic_norm', hue='dataset', data=data, ax=epi_stats_norm)
sns.barplot(x='category', y='aleatoric_norm', hue='dataset', data=data, ax=ale_stats_norm)
epi_stats_norm.set_title('Epistemic Uncertainty (Normalized)')
ale_stats_norm.set_title('Aleatoric Uncertainty (Normalized)')
sns.barplot(x='category', y='epistemic_soft', hue='dataset', data=data, ax=epi_stats_soft)
sns.barplot(x='category', y='aleatoric_soft', hue='dataset', data=data, ax=ale_stats_soft)
epi_stats_soft.set_title('Epistemic Uncertainty (Softmax)')
ale_stats_soft.set_title('Aleatoric Uncertainty (Softmax)')
plt.show()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description = "PyTorch Uncertainty Estimation b/w MNIST and notMNIST")
parser.add_argument('--net_type', default='lenet', type=str, help='model')
parser.add_argument('--weights_path', default='checkpoints/MNIST/bayesian/model_lenet.pt', type=str, help='weights for model')
parser.add_argument('--notmnist_dir', default='data/notMNIST_small/', type=str, help='path to the notMNIST data directory')
args = parser.parse_args()
run(args.net_type, args.weights_path, args.notmnist_dir)
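The epistemic/aleatoric quantities above come from decomposing the covariance of the T stochastic forward passes. A small self-contained check of the same numpy formulas, with made-up probabilities for a single sample:

# Sketch only, with invented softmax outputs; mirrors the formulas used above.
import numpy as np

p_hat = np.array([[0.7, 0.2, 0.1],
                  [0.6, 0.3, 0.1],
                  [0.8, 0.1, 0.1]])             # (T, classes), T = 3 passes
T = p_hat.shape[0]
p_bar = p_hat.mean(axis=0)                      # mean predictive distribution
temp = p_hat - p_bar                            # per-pass deviations
epistemic = np.diag(np.dot(temp.T, temp) / T)   # variance across passes
aleatoric = np.diag(np.diag(p_bar) - np.dot(p_hat.T, p_hat) / T)
print(p_bar, epistemic, aleatoric)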

41
utils.py Executable file
View File

@ -0,0 +1,41 @@
import os
import torch
import numpy as np
from torch.nn import functional as F
# cifar10 classes
cifar10_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
'dog', 'frog', 'horse', 'ship', 'truck']
def logmeanexp(x, dim=None, keepdim=False):
"""Stable computation of log(mean(exp(x))"""
if dim is None:
x, dim = x.view(-1), 0
x_max, _ = torch.max(x, dim, keepdim=True)
x = x_max + torch.log(torch.mean(torch.exp(x - x_max), dim, keepdim=True))
return x if keepdim else x.squeeze(dim)
# check if dimension is correct
# def dimension_check(x, dim=None, keepdim=False):
# if dim is None:
# x, dim = x.view(-1), 0
# return x if keepdim else x.squeeze(dim)
def adjust_learning_rate(optimizer, lr):
"""Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
for param_group in optimizer.param_groups:
param_group['lr'] = lr
def save_array_to_file(numpy_array, filename):
file = open(filename, 'a')
shape = " ".join(map(str, numpy_array.shape))
np.savetxt(file, numpy_array.flatten(), newline=" ", fmt="%.3f")
file.write("\n")
file.close()