Commit 65a60dc: Merge pull request #52 from ambroggi/dev

Added Argument parser

bayegaspard authored Jul 24, 2023
2 parents b5f563b + e48cff2
Showing 15 changed files with 13,588 additions and 145 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,9 +19,11 @@ Saves/unknown/*
!Saves/roc/
!Saves/models
Saves/models/*
!Saves/Scoresall-ArchivedForPaper.csv
*.json
src/CodeFromImplementations/Ignore.py
!datasets/percentages.csv
test concat scores.py
test copy.py
test concat duplication problems.py
log.txt
20 changes: 17 additions & 3 deletions README.md
@@ -26,10 +26,11 @@ source opensetperf/bin/activate
- Navigate up one directory (`cd ..`) into the root directory of the repo.
- You may need to create some folders in the Saves directory, namely `OpenSetPerf\Saves\conf`, `OpenSetPerf\Saves\models`, and `OpenSetPerf\Saves\roc`.
- Run the model using `python3 src\main\main.py`.
- Get hyperparameter information by using `python3 src\main\main.py -h`.
- Alternatively, you can run `chmod +x ./threeRuns.sh` and then `./threeRuns.sh` to run the model three times.
- Saves and model outputs will generate in the `Saves` folder.

- Edit the `src/main/Config.py` file to change the hyperparameters for the model. More information in `src/main/README.md`
- Edit the `src/main/Config.py` file to change the hyperparameters for the model. More information in `src/main/README.md`. You can also edit the hyperparameters using command line parameters, see `python3 src\main\main.py -h` for more details.
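  For example (a sketch; the flag names mirror the keys in `src/main/Config.py`, and the values shown are only illustrative):

  ```
  python3 src/main/main.py --num_epochs 10 --learningRate 0.001 --model Convolutional
  ```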



@@ -51,6 +52,7 @@ source opensetperf/bin/activate
- `EndLayer.py` - This file works with the folder `CodeFromImplementations` to implement each of the different algorithms.
- `plots.py` - This file generates 4 PNG files of different matplotlib graphs.
- `GPU.py` - This file helps run the model on different GPUs or move tensors from one GPU to the CPU.
- `GenerateImages.py` - This file reads the save file and generates images in `Saves/images/` to visualize the data.
- `helperFunctions.py` - This file contains all other functions that are not contained in another file.

- `CodeFromImplementations`
@@ -68,7 +70,8 @@ source opensetperf/bin/activate
- It has many different types of files, such as:
- Data/DataTest - This saves the specific dataloaders from the last time the model was run, including the train/test split, so as not to contaminate the model if it is run again.
- Scoresall.csv - This file saves the model state and all standard metrics from the model.
- EpochXXX - These save the parameters of the PyTorch model at each of the epochs. NOTE: if you decrease the number of epochs in the model, it will not delete the old epoch files, which may cause problems.
- models - This folder stores the most trained models for each of the algorithms.
- images - This folder stores graphical representations of `Scoresall.csv`.
- ROC_data - These files store the receiver operating characteristic data, which can be used to inform threshold settings. The first line is false positives, the second is true positives, and the third is the thresholds that achieve those positives (see the sketch after this list).
- The following files are still generated but are not used:
- fscore - this saves the Config parameters and the f-score that those parameters achieved.
@@ -78,6 +81,14 @@ source opensetperf/bin/activate
- hyperparam - saves the Config of the last time the file was run.
- unknown - saves which classes were unknown from the last time the file was run.
- batch - saves information about each batch that has been run. NOTE: this file can break if it is saved to too many times; you may need to delete it and allow it to regenerate.
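A minimal sketch of reading one of the ROC files, assuming the three-line layout described above with comma-separated values (the exact file name varies per run, so the path here is hypothetical):

```python
import matplotlib.pyplot as plt

# Line 1: false positives, line 2: true positives, line 3: thresholds
with open("Saves/roc/ROC_data_example.csv") as f:  # hypothetical file name
    lines = [line for line in f if line.strip()]
fpr, tpr, thresholds = ([float(v) for v in line.split(",")] for line in lines[:3])

plt.plot(fpr, tpr)
plt.xlabel("False positives")
plt.ylabel("True positives")
plt.title("ROC curve")
plt.show()
```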

- `Tests`
- This is a folder of pytest tests to ensure the model works properly.
- test_endlayer.py - Tests that the endlayer outputs correctly given a valid input.
- test_fullRun.py - Runs the whole model on minimal settings in no-save mode so that it does not mess up the save file.
- test_LongTests.py - Runs the code through the whole loop. Takes a long time, so it is normally skipped. Currently does not work.
- test_loops.py - Tests whether the loop functions in main.py work.
- test_other.py - Tests things not covered by the other categories.
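With `pytest` installed, the suite can be run from the repository root (note that `Config.py` skips command-line parsing when it detects pytest):

```
python3 -m pytest Tests/
```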

- `datasets`
- We place the NIDS dataset in this folder.
@@ -87,4 +98,7 @@ source opensetperf/bin/activate
- This is a number that is included in `Saves/Scoresall.csv` to record which version of the code was used to generate the outputs.

- `threeRuns.sh`
- This is a simple and small shell script to run the model three times.

### Data Used:
- The data located at `Saves/Scoresall-ArchivedForPaper.csv` is the data generated for a paper. Not all of the data in the file was used for the paper, as some of it is old. Look at the Version column: rows with a version number greater than or equal to 422 were used with the `src/main/GenerateImages.py` file to create the data for the paper.
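A minimal sketch of reproducing that selection with pandas (the `Version` column and the 422 cutoff come from the description above; everything else is illustrative):

```python
import pandas as pd

df = pd.read_csv("Saves/Scoresall-ArchivedForPaper.csv", index_col=0)
paper_rows = df[pd.to_numeric(df["Version"], errors="coerce") >= 422]
print(f"{len(paper_rows)} of {len(df)} rows were candidates for the paper figures")
```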
6,705 changes: 6,705 additions & 0 deletions Saves/Scoresall-ArchivedForPaper.csv

Large diffs are not rendered by default.

6,705 changes: 6,705 additions & 0 deletions Saves/Scoresall.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Tests/test_fullRun.py
@@ -16,6 +16,7 @@
from torch.utils.data import DataLoader

main.Config.parameters["num_epochs"][0] = 1
main.Config.parameters["num_workers"][0] = 0
main.Config.parameters["MaxPerClass"][0] = 10
main.Config.parameters["LOOP"][0] = 0

1 change: 1 addition & 0 deletions Tests/test_loops.py
@@ -33,6 +33,7 @@ def addtoLoopNames(itemDescription,item):

main.Config.parameters["LOOP"][0] = 1
main.loopType1(nothing,addtoLoopNames)
main.FileHandling.addMeasurement("For Unit Test","True",fileName="LoopRan.csv")

def testLoop2Uniqueness():
"""
2 changes: 1 addition & 1 deletion build_number.txt
@@ -1 +1 @@
437
449
65 changes: 57 additions & 8 deletions src/main/Config.py
@@ -1,6 +1,25 @@
import torch
import pandas as pd
import os
import sys
import argparse


if __name__ != "Config":
# if "Config" in sys.modules:
# class doubleImport(ImportError):
# """
# Config was imported using a different path after it has already been imported.
# This causes problems when Config is modified.
# """
# pass
# raise doubleImport
print(f"A POSSIBLE PROBLEM HAS OCCURED, Config was loaded improperly, from {__name__} instead of directly\
this might break some global variables by having two copies",file=sys.stderr)

#TODO: Rework config so that it is less janky and relies less on the bad practice of global variables.
# Possibly by moving HelperFunctions Loop functions to outside of the program
# and just using the command line parser for the individual sections.

#This config file is mainly used as global variables for the rest of the program.
#It should only be modified by the loop commands in helperfunctions
@@ -40,7 +59,7 @@ def loopOverUnknowns(unknownlist):
#Here are all of the parameters for the model.
parameters = {
#These parameters are organized like this:
#"ParamName":[Value,"Description"]
#"ParamName":[Value,"Description",[possible values]]
#for a parameter called "ParamName" with a value of Value
"batch_size":[100000, "Number of items per batch"],
"num_workers":[14, "Number of threads working on building batches"],
@@ -51,26 +70,49 @@ def loopOverUnknowns(unknownlist):
"num_epochs":[150,"Number of times it trains on the whole trainset"],
"learningRate":[0.01, "a modifier for training"],
"threshold":[0.5,"When to declare something to be unknown"],
"model":["Convolutional","Model type [Fully_Connected,Convolutional]"],
"OOD Type":["Soft","type of out of distribution detection [Soft,Open,Energy,COOL,DOC,iiMod]"],
"model":["Convolutional","Model type",["Fully_Connected","Convolutional"]],
"OOD Type":["Soft","type of out of distribution detection", ["Soft","Open","Energy","COOL","DOC","iiMod"]],
"Dropout":[0.01,"percent of nodes that are skipped per run, larger numbers for more complex models [0,1)"],
"Datagrouping":["Dendrogramlimit","Datagroup type [ClassChunk,Dendrogramlimit]"],
"Datagrouping":["Dendrogramlimit","Datagroup type", ["ClassChunk","Dendrogramlimit"]],
"optimizer":opt_func["Adam"],
"Unknowns":"refer to unknowns.CSV",
"CLASSES":[15,"Number of classes, do not change"],
"Temperature":[1,"Energy OOD scaling parameter"],
"Degree of Overcompleteness": [3,"Parameter for Fitted Learning"],
"Number of Layers": [2,"Number of layers to add to the base model"],
"Nodes": [512,"The number of nodes per added layer"],
"Activation": ["Leaky","The type of activation function to use"],
"Activation": ["Leaky","The type of activation function to use",["ReLU", "Tanh", "Sigmoid","Leaky"]],
"LOOP": [1,"This is a parameter that determines if we want to loop over the algorithms.\n "\
"0: no loop, 1:loop through variations of algorithms,thresholds,learning rates, groups and numbers of epochs, \n"\
"2: Loop while adding more unknowns into the training data (making them knowns) without resetting the model"],
"Dataset": ["Payload_data_CICIDS2017", "This is what dataset we are using, [Payload_data_CICIDS2017,Payload_data_UNSW]"],
"2: Loop while adding more unknowns into the training data (making them knowns) without resetting the model, \n"\
"3: Loop through different data distributions without training the model."],
"Dataset": ["Payload_data_CICIDS2017", "This is what dataset we are using,", ["Payload_data_CICIDS2017","Payload_data_UNSW"]],
"SchedulerStepSize": [10, "This is how often the scheduler takes a step, 3 means every third epoch"],
"SchedulerStep": [0.8,"This is how big a step the scheduler takes, leave 0 for no step"]
}

#Argparse tutorial: https://docs.python.org/3/howto/argparse.html
parser = argparse.ArgumentParser()
for x in parameters.keys():
if x in ["batch_size","num_workers","MaxPerClass","num_epochs","Degree of Overcompleteness","Number of Layers","Nodes","SchedulerStepSize"]:
parser.add_argument(f"--{x}",type=int,default=parameters[x][0],help=parameters[x][1],required=False)
if x in ["testlength","learningRate","threshold","Dropout","Temperature","SchedulerStep"]:
parser.add_argument(f"--{x}",type=float,default=parameters[x][0],help=parameters[x][1],required=False)
if x in ["attemptLoad","Mix unknowns and validation"]:
parser.add_argument(f"--{x}",choices=[True,False],default=parameters[x][0],help=parameters[x][1],required=False)
if x in ["LOOP"]:
parser.add_argument(f"--{x}",type=int,choices=[0,1,2,3],default=parameters[x][0],help=parameters[x][1],required=False)
if x in ["model","OOD Type","Datagrouping","Activation","Dataset"]:
parser.add_argument(f"--{x}",choices=parameters[x].pop(),default=parameters[x][0],help=parameters[x][1],required=False)
if "pytest" not in sys.modules: #The argument parser appears to have issues with the pytest tests. I have no idea why.
args = parser.parse_args()
for x in args._get_kwargs():
parameters[x[0]][0] = x[1]
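# A standalone sketch of the override mechanism above, trimmed to one parameter
# (parse_args is fed an explicit list here to simulate a command line):
#
#     import argparse
#     parameters = {"num_epochs": [150, "Number of times it trains on the whole trainset"]}
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--num_epochs", type=int, default=parameters["num_epochs"][0],
#                         help=parameters["num_epochs"][1], required=False)
#     args = parser.parse_args(["--num_epochs", "10"])
#     for key, val in args._get_kwargs():
#         parameters[key][0] = val   # parameters["num_epochs"][0] is now 10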





DOC_kernels = [3,4,5]

#Set Number of classes:
@@ -103,7 +145,7 @@ def loopOverUnknowns(unknownlist):
thresholds = [30,20,15,5]
learning_rates = [0.1,0.01,0.001,0.0001]
activation = ["ReLU", "Tanh", "Sigmoid","Leaky"]
groups = [[2],[2,3],[2,3,4],[2,3,4,5],[2,3,4,5,6],[2,3,4,5,6,7],[1,2,3,4,5,6,7],[1,2,3,4,5,6,7,8]]
groups = [[],[2],[2,3],[2,3,4],[2,3,4,5],[2,3,4,5,6],[2,3,4,5,6,7],[1,2,3,4,5,6,7],[1,2,3,4,5,6,7,8]]
#groups = [[7,8,9]]
if parameters["Dataset"][0] == "Payload_data_CICIDS2017":
incGroups = [[2,3,4,5,6,7,8,9,10,11,12,13,14],[3,4,5,6,7,8,9,10,11,12,13,14],[4,5,6,7,8,9,10,11,12,13,14],[5,6,7,8,9,10,11,12,13,14],[6,7,8,9,10,11,12,13,14],[7,8,9,10,11,12,13,14],[8,9,10,11,12,13,14],[9,10,11,12,13,14],[10,11,12,13,14],[11,12,13,14],[12,13,14],[13,14],[14]]
@@ -136,6 +178,11 @@ def loopOverUnknowns(unknownlist):
# alg.remove("DOC")
# alg.remove("iiMod")

#It causes problems if you don't start at the start of the loop.
if parameters["LOOP"][0] == 1:
parameters["OOD Type"][0] = alg[0]


#Optimizer has been removed from the list of things we are changing
optim = [opt_func["Adam"], opt_func["SGD"], opt_func["RMSprop"]]
optim = [opt_func["Adam"]]
@@ -210,3 +257,5 @@ def algorithmSpecificSettings(alg="None"):
#https://gist.github.com/sg-s/2ddd0fe91f6037ffb1bce28be0e74d4e
f = open("build_number.txt","r")
parameters["Version"] = [f.read(),"The version number"]


31 changes: 24 additions & 7 deletions src/main/FileHandling.py
@@ -300,7 +300,7 @@ def create_params_Fscore(path, score, threshold = None):

hist.to_csv(os.path.join(path,"Saves","fscore.csv"),index=False)

def create_params_All(path=""):
def create_params_All(path="",name="Scoresall.csv"):
"""
Generates a new line of the file scoresAll.csv that we use to store the scores from the run.
The new line contains all of the Config values that we are using.
@@ -313,16 +313,32 @@ def create_params_All(path=""):
params = pd.DataFrame(Config.parameters,columns=Config.parameters.keys())


if os.path.exists(os.path.join(path,"Saves","Scoresall.csv")):
hist = pd.read_csv(os.path.join(path,"Saves","Scoresall.csv"),index_col=0)
if os.path.exists(os.path.join(path,"Saves",name)):
hist = pd.read_csv(os.path.join(path,"Saves",name),index_col=0)
hist = pd.concat([hist,params.iloc[[0]]],axis=0,ignore_index=True)
else:
hist = params.iloc[[0]]

#hist = hist.transpose()
hist.to_csv(os.path.join(path,"Saves","Scoresall.csv"))
hist.to_csv(os.path.join(path,"Saves",name))

def addMeasurement(name:str,val,path=""):
def create_loop_history(name:str,path=""):
if Config.unit_test_mode:
return
params = pd.DataFrame([Config.loops],columns=Config.loops2)
params["Version"] = Config.parameters["Version"][0]


if os.path.exists(os.path.join(path,"Saves",name)):
hist = pd.read_csv(os.path.join(path,"Saves",name),index_col=0)
hist = pd.concat([hist,params.iloc[[0]]],axis=0,ignore_index=True)
else:
hist = params.iloc[[0]]

#hist = hist.transpose()
hist.to_csv(os.path.join(path,"Saves",name))

def addMeasurement(name:str,val,path="",fileName="Scoresall.csv"):
"""
Adds a measurement to the LATEST line in the Scoresall.csv file. This may cause problems if you are running two versions at once.
We recommend running only one version at a time.
@@ -333,10 +349,11 @@ def addMeasurement(name:str,val,path=""):
"""
if Config.unit_test_mode:
return
total = pd.read_csv(os.path.join(path,"Saves","Scoresall.csv"),index_col=0)
total = pd.read_csv(os.path.join(path,"Saves",fileName),index_col=0)
#print(f"last valid index = {total.last_valid_index()} item name= {name}, item value={val}")
if name in total and not (pd.isnull(total.at[total.last_valid_index(),name]) or name in ["Number Of Failures"]):
total.at[total.last_valid_index(),"A spot has already been filled?"] = "An error has occured"
total.at[total.last_valid_index(),name] = val
total.to_csv(os.path.join(path,"Saves","Scoresall.csv"))
total.to_csv(os.path.join(path,"Saves",fileName))
return total.last_valid_index()
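# A minimal usage sketch of the functions above (assuming the repo's Saves/ folder
# exists and Config.unit_test_mode is False; the metric name and value are illustrative):
#
#     import FileHandling
#     FileHandling.create_params_All()                # appends a row of current Config values
#     FileHandling.addMeasurement("Test_F1", 0.87)    # fills that row's Test_F1 column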

69 changes: 45 additions & 24 deletions src/main/GenerateImages.py
@@ -218,12 +218,23 @@ def keepFirstThree(df:pd.DataFrame):
return final


def main(save=True,show=False, minimumVersion=None, bysection=False):
def main(save=True,show=False, minimumVersion=None, bysection=False, latex=False):
"""
Creates pivot tables based on the data in Scoresall.csv. It then displays the data in up to three different ways.
minimumVersion determines which rows of the data table to use, based on the Version column.
If no version is specified then the current version of the code base is used.
If save is true then main() creates graphs based on the data and saves the graphs in Saves/images/
If show is true then main() opens the graphs made by save and shows them in a browser window
If latex is true then main() will save the pivot table as a LaTeX table in Saves/images/
"""
if minimumVersion is None:
#Getting version number
#https://gist.github.com/sg-s/2ddd0fe91f6037ffb1bce28be0e74d4e
f = open("build_number.txt","r")
minimumVersion = int(f.read().strip())

if not os.path.exists("Saves/images/"):
os.mkdir("Saves/images")

@@ -249,40 +260,50 @@
if graphTabel(part_tabel2,show=show,save=bysection) == -1:
print(f"{z1}-{z2} was unable to find samples for graphs")

graphTabel(whole_table,show=show,save=save)

whole_table[whole_table[valueLocations["Convolutional"]]=="Convolutional"]
whole_table[whole_table[valueLocations["Payload_data_CICIDS2017"]]=="Payload_data_CICIDS2017"]
graphTabel(whole_table,show=show,save=save,latex=latex)




def graphTabel(df:pd.DataFrame,show=False,save=True,extrapath=""):
def graphTabel(df:pd.DataFrame,show=False,save=True,latex=False,extrapath=""):
if len(df) <2:
print("Dataframe not enough values")
return -1

for y in ["Test_F1","Val_F1","Test_Found_Unknowns"]:
for x in set(df["Type of modification"]):
part_table = pd.pivot_table(df[df["Type of modification"]==x],values=y,index=[f"{x}"],columns=["OOD Type"],aggfunc=np.mean)
# print(part_table)

if x in ["Activation"]:
fig = px.scatter(part_table)
elif x in ["Datagrouping","Unknowns"]:
fig = px.line(part_table,markers=True)
else:
fig = px.line(part_table,markers=True,log_x=True)
fig.update_layout(yaxis_title=y,xaxis_title=x,paper_bgcolor="rgba(0,0,0,0)",plot_bgcolor="rgba(0,0,0,0)",font={"size":18,"color":"rgba(0,0,0,255)"},legend_title_text='Algorithm')
fig.update_yaxes(range=[0, 1],gridcolor="rgba(200,200,200,50)",zerolinecolor="rgba(200,200,200,50)",zerolinewidth=1)
fig.update_xaxes(gridcolor="rgba(200,200,200,50)",zerolinecolor="rgba(200,200,200,50)",zerolinewidth=1,exponentformat='power')

fig.for_each_trace(traceLines)
for prefix in ["","AUTOTHRESHOLD_","AUTOTHRESHOLD2_"]:
for y in [f"Test_F1",f"Val_F1",f"Test_Found_Unknowns"]:
for x in set(df["Type of modification"]):

if show:
fig.show()
if save:
fig.write_image(f"Saves/images/{extrapath}{y}{x}.png",scale=4)
part_table = pd.pivot_table(df[df["Type of modification"]==x],values=f"{prefix}{y}",index=[f"{x}"],columns=["OOD Type"],aggfunc=np.mean)
# print(part_table)

if x in ["Activation"]:
fig = px.scatter(part_table)
elif x in ["Datagrouping","Unknowns"]:
fig = px.line(part_table,markers=True)
else:
fig = px.line(part_table,markers=True,log_x=True)
xaxis = x
if xaxis == "MaxPerClass":
xaxis = "Datapoints per class"
fig.update_layout(yaxis_title=y,xaxis_title=xaxis,paper_bgcolor="rgba(0,0,0,0)",plot_bgcolor="rgba(0,0,0,0)",font={"size":18,"color":"rgba(0,0,0,255)"},legend_title_text='Algorithm')
fig.update_yaxes(range=[0, 1],gridcolor="rgba(200,200,200,50)",zerolinecolor="rgba(200,200,200,50)",zerolinewidth=1)
fig.update_xaxes(gridcolor="rgba(200,200,200,50)",zerolinecolor="rgba(200,200,200,50)",zerolinewidth=1,exponentformat='power')

fig.for_each_trace(traceLines)

if show:
fig.show()
if save:
fig.write_image(f"Saves/images/{extrapath}{prefix}{y}{x}.png",scale=4)
if latex:
part_table.to_latex(f"Saves/images/{extrapath}{prefix}{y}{x}",float_format="%.2f")

if __name__ == '__main__':
main(minimumVersion=422)
main(minimumVersion=422, latex=True)
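# Note: main() reads build_number.txt via a relative path, so this script is
# expected to be run from the repository root, e.g.:
#
#     python3 src/main/GenerateImages.py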


