Merge pull request #56 from ambroggi/dev
Here is item save file
bayegaspard authored Oct 16, 2023
2 parents ef393d2 + 3ef3700 commit 8a0d76e
Showing 7 changed files with 173 additions and 131 deletions.
3 changes: 2 additions & 1 deletion Tests/test_fullRun.py
@@ -70,7 +70,8 @@ def testLoadDataset():
assert torch.all(x[0] == y[0])

def testLoadDatasetfromSave():
main.Config.parameters["attemptLoad"][0] = 1
main.Config.parameters["attemptLoadModel"][0] = 1
main.Config.parameters["attemptLoadData"][0] = 1
main.torch.manual_seed(1)
train1, test1, val1 = FileHandling.checkAttempLoad("")
main.torch.manual_seed(1)
2 changes: 1 addition & 1 deletion build_number.txt
@@ -1 +1 @@
504
506
132 changes: 70 additions & 62 deletions src/main/Config.py
@@ -13,7 +13,7 @@
# pass
# raise doubleImport
print(f"A POSSIBLE PROBLEM HAS OCCURED, Config was loaded improperly, from {__name__} instead of directly\
this might break some global variables by having two copies",file=sys.stderr)
this might break some global variables by having two copies", file=sys.stderr)

#TODO: Rework config so that it is less janky and uses less bad practices of global variables.
# Possibly by moving HelperFunctions Loop functions to outside of the program
@@ -41,22 +41,23 @@ def loopOverUnknowns(unknownlist=False):
if len(knownVals)<2:
print("Too few knowns, things might break")
parameters["Unknowns"] = f"{len(unknownlist)} Unknowns"
parameters["Unknowns_clss"] = [unknownlist,"Values used for testing"]
parameters["Knowns_clss"] = [knownVals,"Values used for training"]
parameters["Unknowns_clss"] = [unknownlist, "Values used for testing"]
parameters["Knowns_clss"] = [knownVals, "Values used for training"]
return knownVals

#This is the different optimization functions
opt_func = {"Adam":torch.optim.Adam,"SGD":torch.optim.SGD, "RMSprop":torch.optim.RMSprop}
opt_func = {"Adam":torch.optim.Adam, "SGD":torch.optim.SGD, "RMSprop":torch.optim.RMSprop}


#Here are all of the parameters for the model.
parameters = {
#These parameters are organized like this:
#"ParamName":[Value,"Description",[possible values]]
#"ParamName":[Value, "Description", [possible values]]
#for a parameter called "ParamName" with a value of Value
"batch_size":[100000, "Number of items per batch"],
"num_workers":[14, "Number of threads working on building batches"],
"attemptLoad":[0, "0: do not use saves\n1:use saves"],
"attemptLoadModel":[0, "0: do not use saves for the model\n1:use saves for the model"],
"attemptLoadData":[0, "0: do not use saves for the dataset\n1:use saves for the dataset"],
"testlength":[1/4, "[0,1) percentage of training to test with"],
"Mix unknowns and validation": [1,"0 or 1, 0 means that the test set is purely unknowns and 1 means that the testset is the validation set plus unknowns (for testing)"],
"MaxPerClass": [1000, "Maximum number of samples per class\n if Dataloader_Variation is Cluster and this value is a float it interprets it as the maximum percentage of the class instead."],
@@ -83,46 +84,47 @@ def loopOverUnknowns(unknownlist=False):
"4: Loop through predefined hyperparameters found in datasets/hyperparamList.csv"],
"Dataset": ["Payload_data_CICIDS2017", "This is what dataset we are using,", ["Payload_data_CICIDS2017","Payload_data_UNSW"]],
"SchedulerStepSize": [10, "This is how often the scheduler takes a step, 3 means every third epoch"],
"SchedulerStep": [0.9,"This is how big a step the scheduler takes, leave 0 for no step"]
"SchedulerStep": [0.9,"This is how big a step the scheduler takes, leave 0 for no step"],
"ApplyPrelimSoft": [1, "This says to use a preliminary softmax and only use unknown detection on things that fail the softmax unknown detection"]
}


#Argparse tutorial: https://docs.python.org/3/howto/argparse.html
parser = argparse.ArgumentParser()
for x in parameters.keys():
if x in ["batch_size","num_workers","MaxPerClass","num_epochs","Degree of Overcompleteness","Number of Layers","Nodes","SchedulerStepSize"]:
parser.add_argument(f"--{x}",type=int,default=parameters[x][0],help=parameters[x][1],required=False)
if x in ["testlength","learningRate","threshold","Dropout","Temperature","SchedulerStep"]:
parser.add_argument(f"--{x}",type=float,default=parameters[x][0],help=parameters[x][1],required=False)
if x in ["attemptLoad","Mix unknowns and validation"]:
parser.add_argument(f"--{x}",type=int,choices=[1,0],default=parameters[x][0],help=parameters[x][1],required=False)
if x in ["batch_size", "num_workers", "MaxPerClass", "num_epochs", "Degree of Overcompleteness", "Number of Layers", "Nodes", "SchedulerStepSize"]:
parser.add_argument(f"--{x}", type=int, default=parameters[x][0], help=parameters[x][1], required=False)
if x in ["testlength", "learningRate", "threshold", "Dropout", "Temperature", "SchedulerStep"]:
parser.add_argument(f"--{x}", type=float, default=parameters[x][0], help=parameters[x][1], required=False)
if x in ["attemptLoadModel", "attemptLoadData","Mix unknowns and validation","ApplyPrelimSoft"]:
parser.add_argument(f"--{x}", type=int, choices=[1, 0], default=parameters[x][0], help=parameters[x][1], required=False)
if x in ["LOOP"]:
parser.add_argument(f"--{x}",type=int,choices=[0,1,2,3,4],default=parameters[x][0],help=parameters[x][1],required=False)
if x in ["model","OOD Type","Dataloader_Variation","Activation","Dataset"]:
parser.add_argument(f"--{x}",choices=parameters[x].pop(),default=parameters[x][0],help=parameters[x][1],required=False)
parser.add_argument(f"--{x}", type=int, choices=[0, 1, 2, 3, 4], default=parameters[x][0], help=parameters[x][1], required=False)
if x in ["model", "OOD Type", "Dataloader_Variation", "Activation", "Dataset"]:
parser.add_argument(f"--{x}", choices=parameters[x].pop(), default=parameters[x][0], help=parameters[x][1], required=False)
if x in ["Unknowns_clss"]:
parser.add_argument(f"--{x}",default=f"{parameters[x][0]}",help=parameters[x][1],required=False)
if "pytest" not in sys.modules: #The argument parser appears to have issues with the pytest tests. I have no idea why.
parser.add_argument(f"--{x}", default=f"{parameters[x][0]}", help=parameters[x][1], required=False)
if "pytest" not in sys.modules: # The argument parser appears to have issues with the pytest tests. I have no idea why.
args = parser.parse_args()
for x in args._get_kwargs():
parameters[x[0]][0] = x[1]

if isinstance(parameters["Unknowns_clss"][0],str):
if isinstance(parameters["Unknowns_clss"][0], str):
if len(parameters["Unknowns_clss"][0])>0 and len(parameters["Unknowns_clss"][0])!=2: #Not sure why I need this specifier but it breaks if the default is []
# print(len(parameters["Unknowns_clss"][0]))
parameters["Unknowns_clss"][0] = [int(y) for y in parameters["Unknowns_clss"][0].removesuffix("]").removeprefix("[").split(sep=",")]
parameters["Unknowns_clss"][0] = [int(y) for y in parameters["Unknowns_clss"][0].removesuffix("]").removeprefix("[").split(sep=", ")]
else:
parameters["Unknowns_clss"][0] = []


DOC_kernels = [3,4,5]
DOC_kernels = [3, 4, 5]

#Set Number of classes:
if parameters["Dataset"][0] == "Payload_data_UNSW":
parameters["CLASSES"][0] = 10
UnusedClasses = []
else:
UnusedClasses = [8,9,10]
UnusedClasses = [8, 9, 10]
UnusedClasses = []

#Dendrogram chunk uses a slightly different output on the model structure.
@@ -140,34 +142,34 @@ def loopOverUnknowns(unknownlist=False):


#This is to test all of the algorithms one after the other. (Loop 1 values)
alg = ["Soft","Open","Energy","COOL","DOC","iiMod"]
batch = [100,1000,10000,100000]
datapoints_per_class = [10,100,1000]
thresholds = [0.1,1,10]
thresholds = [30,20,15,5]
alg = ["Soft", "Open", "Energy", "COOL", "DOC", "iiMod"]
batch = [100, 1000, 10000, 100000]
datapoints_per_class = [10, 100, 1000]
thresholds = [0.1, 1, 10]
thresholds = [30, 20, 15, 5]
thresholds = [parameters["threshold"][0]]
learning_rates = [0.1,0.01,0.001,0.0001]
activation = ["ReLU", "Tanh", "Sigmoid","Leaky"]
groups = [[],[2],[2,3],[2,3,4],[2,3,4,5],[2,3,4,5,6],[2,3,4,5,6,7],[1,2,3,4,5,6,7],[1,2,3,4,5,6,7,8]]
#groups = [[7,8,9]]
learning_rates = [0.1, 0.01, 0.001, 0.0001]
activation = ["ReLU", "Tanh", "Sigmoid", "Leaky"]
groups = [[], [2], [2, 3], [2, 3, 4], [2, 3, 4, 5], [2, 3, 4, 5, 6], [2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7, 8]]
#groups = [[7, 8, 9]]
if parameters["Dataset"][0] == "Payload_data_CICIDS2017":
incGroups = [[2,3,4,5,6,7,8,9,10,11,12,13,14],[3,4,5,6,7,8,9,10,11,12,13,14],[4,5,6,7,8,9,10,11,12,13,14],[5,6,7,8,9,10,11,12,13,14],[6,7,8,9,10,11,12,13,14],[7,8,9,10,11,12,13,14],[8,9,10,11,12,13,14],[9,10,11,12,13,14],[10,11,12,13,14],[11,12,13,14],[12,13,14],[13,14],[14]]
incGroups = [[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14], [6, 7, 8, 9, 10, 11, 12, 13, 14], [7, 8, 9, 10, 11, 12, 13, 14], [8, 9, 10, 11, 12, 13, 14], [9, 10, 11, 12, 13, 14], [10, 11, 12, 13, 14], [11, 12, 13, 14], [12, 13, 14], [13, 14], [14]]
#This one list is for loop 2. Note: array size should be decreasing.
else:
incGroups = [[2,3,4,5,6,7,8,9],[3,4,5,6,7,8,9],[4,5,6,7,8,9],[5,6,7,8,9],[6,7,8,9],[7,8,9],[8,9],[9]]
incGroups = [[2, 3, 4, 5, 6, 7, 8, 9], [3, 4, 5, 6, 7, 8, 9], [4, 5, 6, 7, 8, 9], [5, 6, 7, 8, 9], [6, 7, 8, 9], [7, 8, 9], [8, 9], [9]]
epochs= []
epochs = [1,10,100,150]
epochs = [1, 10, 100, 150]


# groups = [list(range(2,parameters["CLASSES"][0]))]
# groups = [list(range(2, parameters["CLASSES"][0]))]
# #Little bit of code that generates incremental numbers of unknowns.
# while len(groups[0])>2:
# new = groups[0].copy()
# new.pop(0)
# new.pop(0)
# groups.insert(0,new)
# groups.insert(0, new)
# #Little bit of code that generates decrementing numbers of unknowns.
# incGroups = [list(range(2,parameters["CLASSES"][0]))]
# incGroups = [list(range(2, parameters["CLASSES"][0]))]
# while len(incGroups[-1])>1:
# new = incGroups[-1].copy()
# new.pop(0)
Expand All @@ -194,29 +196,29 @@ def loopOverUnknowns(unknownlist=False):
#Adds in everything in config:

# #learning_rates.remove(Config.parameters["learningRate"][0])
# learning_rates.insert(0,parameters["learningRate"][0])
# learning_rates.insert(0, parameters["learningRate"][0])
# #epochs.remove(Config.parameters["num_epochs"][0])
# epochs.insert(0,parameters["num_epochs"][0])
# groups.insert(0,helper_variables["unknowns_clss"])
# epochs.insert(0, parameters["num_epochs"][0])
# groups.insert(0, helper_variables["unknowns_clss"])

# #Always starts with the configured activation type
# alg.remove(parameters["OOD Type"][0])
# alg.insert(0,parameters["OOD Type"][0])
# alg.insert(0, parameters["OOD Type"][0])

#This is an array to loop through everything more easily.
loops = [batch,learning_rates,activation,["Standard","Cluster"],groups]
loops = [batch, learning_rates, activation, ["Standard", "Cluster"], groups]
# loops = [groups]
loops2 = ["batch_size","learningRate","Activation","Dataloader_Variation","Unknowns"]
loops2 = ["batch_size", "learningRate", "Activation", "Dataloader_Variation", "Unknowns"]
# loops2 = ["Unknowns"]
for i in range(len(loops)):
if loops2[i] == "Unknowns":
loops[i].insert(0,parameters["Unknowns_clss"][0])
loops[i].insert(0, parameters["Unknowns_clss"][0])
elif loops2[i] == "optimizer":
loops[i].insert(0,parameters[loops2[i]])
loops[i].insert(0, parameters[loops2[i]])
elif loops2[i] == "None":
pass
else:
loops[i].insert(0,parameters[loops2[i]][0])
loops[i].insert(0, parameters[loops2[i]][0])

#Override the unknowns because model is kept
if parameters["LOOP"][0] == 2:
Expand All @@ -237,30 +239,36 @@ def algorithmSpecificSettings(alg="None"):


# match alg:
if alg == "Soft":
pass
if alg == "Open":
parameters["threshold"][0] = 0.8
if alg == "Energy":
parameters["threshold"][0] = 0.474
if alg == "COOL":
parameters["threshold"][0] = 0.516034961
if alg == "DOC":
parameters["threshold"][0] = 0.06449493
if alg == "iiMod":
parameters["threshold"][0] = 102064.4453
# if alg == "Soft":
# pass
# if alg == "Open":
# parameters["threshold"][0] = 0.8
# if alg == "Energy":
# parameters["threshold"][0] = 0.474
# if alg == "COOL":
# parameters["threshold"][0] = 0.516034961
# if alg == "DOC":
# parameters["threshold"][0] = 0.06449493
# if alg == "iiMod":
# parameters["threshold"][0] = 102064.4453

if parameters["LOOP"][0] == 3:
print("Warning: Unknowns may have been changed due to LOOP 3 percentages file")
import pandas as pd
# parameters["num_epochs"][0] = 0
parameters["loopLevel"] = [0,"What percentages the model is on"]
parameters["loopLevel"] = [0, "What percentages the model is on"]
parameters["MaxSamples"] = [parameters["MaxPerClass"][0], "Max number of samples total"]
file = pd.read_csv("datasets/percentages.csv", index_col=None).to_numpy()
zeros = file[0][:parameters["CLASSES"][0]] == 0
unknownClasses = zeros.nonzero()[0]
parameters["Unknowns_clss"][0] = unknownClasses.tolist()


#Getting version number
#https://gist.github.com/sg-s/2ddd0fe91f6037ffb1bce28be0e74d4e
f = open("build_number.txt","r")
parameters["Version"] = [f.read(),"The version number"]
f = open("build_number.txt", "r")
parameters["Version"] = [f.read(), "The version number"]

save_as_tensorboard = True
datasetRandomOffset =True
datasetRandomOffset = True
dataparallel = True
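For context, this commit splits the old attemptLoad switch into two independent flags, attemptLoadModel and attemptLoadData, so saved models and saved dataset splits can be reused separately. A minimal sketch of setting the new flags, assuming src/main/main.py is the entry point (the flag names come from the diff above; everything else is illustrative):

    # from the command line, via the argparse options registered in Config.py:
    #   python src/main/main.py --attemptLoadData 1 --attemptLoadModel 0
    # or directly in code, as the updated test in Tests/test_fullRun.py does:
    import Config
    Config.parameters["attemptLoadData"][0] = 1   # reuse the saved train/test/val split from Saves/
    Config.parameters["attemptLoadModel"][0] = 0  # but train a new model instead of loading one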
53 changes: 45 additions & 8 deletions src/main/FileHandling.py
@@ -59,7 +59,7 @@ def getDatagroup():
def checkAttempLoad(root_path=""):
"""
Creates the training, testing, and validation datasets and saves them in Saves/Data.pt, Saves/DataTest.pt, and Saves/DataVal.pt.
if Config's "attemptLoad" is true it instead loads the datasets from the files and does not create them.
if Config's "attemptLoadData" is true it instead loads the datasets from the files and does not create them.
This is so that the validation and testing data does not get mixed up which would invalidate the validation data.
"""
# get the data and create a test set and train set
@@ -80,7 +80,7 @@ def checkAttempLoad(root_path=""):
if Config.unit_test_mode:
return train, test, val

if Config.parameters["attemptLoad"][0] and os.path.exists(os.path.join(root_path,"Saves","Data.pt")):
if Config.parameters["attemptLoadData"][0] and os.path.exists(os.path.join(root_path,"Saves","Data.pt")):
print("Found prior dataset to load")
try:
train = torch.load(os.path.join(root_path,"Saves","Data.pt"))
@@ -101,7 +101,7 @@ def checkAttempLoad(root_path=""):
torch.save(train,os.path.join(root_path,"Saves","Data.pt"))
torch.save(test,os.path.join(root_path,"Saves","DataTest.pt"))
torch.save(val,os.path.join(root_path,"Saves","DataVal.pt"))
if Config.parameters["attemptLoad"][0]:
if Config.parameters["attemptLoadData"][0]:
print("No model train and test checkpoint was found, saving datacheckpoints ...")
return train, test, val

@@ -302,7 +302,7 @@ def create_params_Fscore(path, score, threshold = None):

class Score_saver():

def __init__(self,path="Scoresall.csv"):
def __init__(self,path="Scoresall.csv",Record_Hyperparams=True):
"""
Score_saver() is a class to consolidate the saving of data to the csv files.
When a Score_saver() object is initialized then it creates a new row onto the file,
@@ -312,13 +312,16 @@ def __init__(self,path="Scoresall.csv"):
self.writer = None
self.path = path #unused at the moment
self.name_all = {path:0}
self.create_params_All()
if Record_Hyperparams:
self.create_params_All()
else:
pd.DataFrame().to_csv(self.path)
if Config.save_as_tensorboard:
self.tensorboard_start()



def __call__(self,name:str,val,path="",fileName=None):
def __call__(self,name:str,val,path="",fileName=None, recursiveList=False):
"""
Adds the measurement to the file. The __call__ version allows the Score_saver to be called like this:
scoresaver = Score_saver()
@@ -331,7 +334,7 @@ def __call__(self,name:str,val,path="",fileName=None):
"""
if fileName is None:
fileName = self.path
self.addMeasurement(name,val,path,fileName)
self.addMeasurement(name,val,path,fileName,recursiveList=recursiveList)

def create_params_All(self,name=None):
"""
@@ -389,7 +392,7 @@ def create_loop_history(self,name:str):
#hist = hist.transpose()
hist.to_csv(os.path.join("Saves",name))

def addMeasurement(self,name:str,val,path="",fileName=None,step=0):
def addMeasurement(self,name:str,val,path="",fileName=None,step=0, recursiveList=0):
"""
Adds a measurement to the LATEST line in the Scoresall.csv file. This may cause problems if you are running two versions at once.
we recommend only running one version at a time.
@@ -405,6 +408,10 @@ def addMeasurement(self,name:str,val,path="",fileName=None,step=0):
"""
if Config.unit_test_mode:
return
if recursiveList>0 and (hasattr(val, '__iter__')):
for num,v in enumerate(val):
self.addMeasurement(name+f"_{num}", v, path, fileName, step, recursiveList-1)
return
if fileName is None:
fileName = self.path
if self.writer is not None:
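For context on the recursiveList branch added above: when it is greater than zero and the value is iterable, each element is written as its own measurement with an index suffix. A minimal usage sketch, with an illustrative metric name and values (Score_saver is the class defined in this file):

    saver = Score_saver()                         # starts a new row in Scoresall.csv
    saver("F1_per_class", [0.91, 0.88, 0.73], recursiveList=1)
    # equivalent to three separate measurements on the current row:
    #   F1_per_class_0 = 0.91, F1_per_class_1 = 0.88, F1_per_class_2 = 0.73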
@@ -490,3 +497,33 @@ def start(self):
self.writer = None
self.tensorboard_start()
self.create_params_All()

class items_with_classes_record():
def __init__(self, labels:torch.Tensor):
self.labels = labels.unsqueeze(dim=-1)
self.items = None
self.predict = None

def __call__(self, items:torch.Tensor, file = "Saves/items.csv"):
self.storeItems(items)
self.useItems(file)

def storeItems(self, items:torch.Tensor):
self.items = items

def useItems(self, file = "Saves/items.csv"):
index_names = [f"Logit{x}" for x in range(len(self.items[0]))]
if self.predict is None:
items_with_labels = torch.concat([self.items,self.labels],dim=1)
else:
items_with_labels = torch.concat([self.items,self.predict,self.labels],dim=1)
index_names.append("Prediction")
index_names.append("Label")
df = pd.DataFrame(items_with_labels.T,index=index_names).T
df.to_csv(file,mode="a",header=(not os.path.exists(file)))
self.items = None
self.predict = None

def storePredictions(self, predictions:torch.Tensor):
assert predictions.dim() == 1
self.predict = predictions.unsqueeze(dim=-1)
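For context, the new items_with_classes_record class collects per-sample logits (and optionally predictions) and appends them, together with the labels, to a CSV. A minimal usage sketch, assuming the Saves/ directory exists and that the import path is adjusted to wherever the caller lives; the tensors are illustrative:

    import torch
    from FileHandling import items_with_classes_record

    labels = torch.tensor([0., 1., 1.])                     # one class label per sample
    logits = torch.rand(3, 15)                              # one row of model outputs per sample
    record = items_with_classes_record(labels)
    record.storePredictions(logits.argmax(dim=1).float())   # optional; adds a Prediction column
    record(logits)                                          # appends Logit0..Logit14, Prediction, Label to Saves/items.csv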