Commit

updated eval_recall.py to use Youden index for optimal sensitivity and specificity values.
Axel Montout committed Jul 9, 2024
1 parent 84e724d commit dfec59f
Showing 4 changed files with 119 additions and 55 deletions.
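Note: the eval_recall.py change named in the commit message is not among the four diffs shown below. For context, the Youden index is J = sensitivity + specificity - 1 = TPR - FPR, and the "optimal" operating point is the ROC threshold at which J peaks. Below is a minimal sketch of that selection, assuming scikit-learn is available; the function and variable names are illustrative, not the repository's actual code.

# Hedged sketch (not the repository's code): pick the ROC threshold that
# maximises Youden's J = sensitivity + specificity - 1 = TPR - FPR.
import numpy as np
from sklearn.metrics import roc_curve

def youden_optimal_point(y_true, y_score):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    j = tpr - fpr                    # Youden's J at each candidate threshold
    best = int(np.argmax(j))         # index of the maximum J
    # returns (threshold, sensitivity, specificity) at the optimal point
    return thresholds[best], tpr[best], 1.0 - fpr[best]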
28 changes: 25 additions & 3 deletions boot_roc_curve.py
@@ -19,7 +19,9 @@
# Set matplotlib to use Times New Roman
rcParams['font.family'] = 'serif'
rcParams['font.serif'] = ['Times New Roman']

import scipy
+from scipy.stats import ttest_rel
+import scipy.stats as stats

class AnyObjectHandler(HandlerBase):
def create_artists(
@@ -245,15 +247,15 @@ def main(path=None, n_bootstrap=100, n_job=6):
    )

    try:
-        xaxis_train_ = random.sample(xaxis_train, 100)
+        xaxis_train_ = random.sample(xaxis_train, 10)
    except ValueError as e:
        print(e)
        xaxis_train_ = xaxis_train

    for fpr, tpr in xaxis_train_:
        ax_roc_merge.plot(fpr, tpr, color="tab:purple", alpha=0.3, linewidth=1)

-    xaxis_test_ = random.sample(xaxis_test, 100)
+    xaxis_test_ = random.sample(xaxis_test, 10)
    for fpr, tpr in xaxis_test_:
        ax_roc_merge.plot(fpr, tpr, color="tab:blue", alpha=0.3, linewidth=1)

@@ -533,6 +535,8 @@ def boostrap_auc_peak(results, out_dir):
    n_job = int(sys.argv[3])
else:
    res_folder = Path("E:/Cats/paper_debug_regularisation_8/")
+    n_bootstrap = 20
+    n_job = 2

results = []
folders = [
@@ -541,6 +545,9 @@ def boostrap_auc_peak(results, out_dir):
if x.is_dir()
]
for i, item in enumerate(folders):
+
+    if i > 4:
+        break
    print(f"{i}/{len(folders)}...")
    print(item)
    res = main(item, n_bootstrap=n_bootstrap, n_job=n_job)
@@ -583,3 +590,18 @@ def boostrap_auc_peak(results, out_dir):
df_ = df_.head(20)
print(df_.to_latex(index=False))
df.to_csv("cat_result_table.csv", index=False)
+
+df_noproc = df[df["Pre-processing"] == '']
+df_noproc = df_noproc.sort_values("N peaks")
+df_noproc_auc = df_noproc["median_auc_test"].values
+
+df_l1 = df[df["Pre-processing"] == 'L1']
+df_l1 = df_l1.sort_values("N peaks")
+df_l1_auc = df_l1["median_auc_test"].values
+
+# First, conduct the Wilcoxon signed-rank test
+wilcoxon_p_value = stats.wilcoxon(df_l1_auc, df_noproc_auc, alternative='less').pvalue
+print(f"Wilcoxon Signed-Rank Test: p-value = {wilcoxon_p_value}")
+
+t_stat, p_value = ttest_rel(df_l1_auc, df_noproc_auc)
+print(f"Paired T-Test: t-statistic = {t_stat}, p-value = {p_value}")
123 changes: 71 additions & 52 deletions build_dataset.py
@@ -15,7 +15,7 @@
from plotnine import ggplot, aes, geom_jitter, stat_summary, theme, element_text
from utils._anscombe import anscombe
from utils.utils import time_of_day_
-import pytest
+

from matplotlib import rcParams
# Set matplotlib to use Times New Roman
@@ -445,22 +445,37 @@ def get_cat_data(data_dir, bin, subset=None):
    # files = new

    dfs = []
+    gender = ""
    for i, file in enumerate(files):
-        print(f"progress[{i}/{len(files)}]...")
-        print(f"reading file: {file}")
-
-        cat_id = int(file.stem.split("_")[0])
-        cat_name = file.stem.split("_")[1]
-        individual_to_ignore = ["MrDudley", "Oliver_F", "Lucy"]
-        if cat_name in individual_to_ignore:
-            continue
-        cat_meta = get_cat_meta(data_dir, cat_id, individual_to_ignore=individual_to_ignore)
-        df = pd.read_csv(file, sep=",", skiprows=range(0, 23), header=None)
-        df = format_raw_data(df, bin)
-        df["health"] = cat_meta["health"]
-        df["age"] = cat_meta["age"]
-        df["cat_id"] = cat_id
-        dfs.append(df)
+        try:
+            print(f"progress[{i}/{len(files)}]...")
+            print(f"reading file: {file}")
+            cat_id = int(file.stem.split("_")[0])
+            cat_name = file.stem.split("_")[1]
+
+            if "maisie" not in str(file).lower():
+                continue
+
+            individual_to_ignore = ["MrDudley", "Oliver_F", "Lucy"]
+            if cat_name in individual_to_ignore:
+                continue
+            cat_meta = get_cat_meta(data_dir, cat_id, individual_to_ignore=individual_to_ignore)
+            df = pd.read_csv(file, sep=",", nrows=1, header=None)
+
+            df_ = pd.read_csv(file, sep=",", nrows=23, header=1, error_bad_lines=False)
+            gender = df_[df_["Filename:"] == "Gender:"].values[0][1]
+            weight = df_[df_["Filename:"] == "Weight:"].values[0][1]
+            #df = format_raw_data(df, bin)
+            df["health"] = cat_meta["health"]
+            df["age"] = cat_meta["age"]
+            df["cat_id"] = cat_id
+            df["mob_score"] = cat_meta['mobility_score']
+            df["gender"] = gender
+            df["weight"] = weight
+            df = df[["cat_id", "age", "gender", "mob_score", "health", "weight"]]
+            dfs.append(df)
+        except Exception as e:
+            print(e)
    return dfs
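The two-stage read above implies that each raw activity CSV starts with a roughly 23-row "key:,value" preamble (including Gender: and Weight: fields) ahead of the data. A hedged, self-contained sketch of reading such a preamble directly, with the layout inferred from the calls above rather than documented:

# Hedged sketch: parse a "Key:,value" CSV preamble of the kind the reads
# above appear to assume (layout inferred, not confirmed by the repo).
def read_preamble(path, n_lines=23):
    meta = {}
    with open(path, newline="") as fh:
        for _, line in zip(range(n_lines), fh):
            parts = [p.strip() for p in line.split(",")]
            if len(parts) >= 2 and parts[0].endswith(":"):
                meta[parts[0].rstrip(":")] = parts[1]
    return meta

# e.g. meta = read_preamble(file); meta.get("Gender"), meta.get("Weight")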


@@ -471,7 +486,7 @@ def run(
    out_dir: Path = typer.Option(
        ..., exists=False, file_okay=False, dir_okay=True, resolve_path=True
    ),
-    dataset_path: Path = Path("dataset.csv"),
+    dataset_path: Path = Path("dataset_test8.csv"),
    bin: str = "S",
    w_size: List[int] = [15],
    threshs: List[int] = [10],
@@ -509,8 +524,11 @@ def run(
    cat_data = get_cat_data(data_dir, bin)
    #dataset_path = f"{dataset_path.name}_{bin}.csv"
    print(f"saving {dataset_path}...")
-    pd.concat(cat_data).to_csv(dataset_path, index=True)
-    #print("done.")
+    df_meta = pd.concat(cat_data)
+    df_meta.to_csv(dataset_path, index=True)
+    df_meta = df_meta.drop("weight", axis=1)
+    print(df_meta.to_latex())
+    print("done.")

    datasets = []
    for t in threshs:
@@ -541,40 +559,41 @@ def run(
    return datasets


-def test():
-
-    n_peak = 7
-    rois = []
-    for i in range(10):
-        rois.append([f"sample {i+1}"])
-    rois = np.array(rois)
-
-    idxs_peaks = np.arange(len(rois))
-    combinat = list(combinations(idxs_peaks, n_peak))
-    try:
-        rois_idxs = random.sample(combinat, k=100)
-    except ValueError as e:
-        print(e)
-        print(f"There are less samples than max_sample={100}")
-        rois_idxs = combinat
-
-    #build augmented sample by concatenating permutations of peaks
-    n_peak_samples = []
-    for idxs in rois_idxs:
-        new_samples = []
-        for i in idxs:
-            sample = rois[i]
-            new_samples.append(sample)
-        activity = np.concatenate(new_samples)
-        s = activity.tolist()
-        n_peak_samples.append(s)
-    n_peak_samples = np.array(n_peak_samples)
-    print(f"{len(rois)} samples before combination")
-    #print(rois)
-    print(f"{len(n_peak_samples)} samples after combination")
-    #print(n_peak_samples)
+# def test():
+#
+#     n_peak = 7
+#     rois = []
+#     for i in range(10):
+#         rois.append([f"sample {i+1}"])
+#     rois = np.array(rois)
+#
+#     idxs_peaks = np.arange(len(rois))
+#     combinat = list(combinations(idxs_peaks, n_peak))
+#     try:
+#         rois_idxs = random.sample(combinat, k=100)
+#     except ValueError as e:
+#         print(e)
+#         print(f"There are less samples than max_sample={100}")
+#         rois_idxs = combinat
+#
+#     #build augmented sample by concatenating permutations of peaks
+#     n_peak_samples = []
+#     for idxs in rois_idxs:
+#         new_samples = []
+#         for i in idxs:
+#             sample = rois[i]
+#             new_samples.append(sample)
+#         activity = np.concatenate(new_samples)
+#         s = activity.tolist()
+#         n_peak_samples.append(s)
+#     n_peak_samples = np.array(n_peak_samples)
+#     print(f"{len(rois)} samples before combination")
+#     #print(rois)
+#     print(f"{len(n_peak_samples)} samples after combination")
+#     #print(n_peak_samples)


if __name__ == "__main__":
-    typer.run(run)
+    run(Path("E:\Cats"), Path("E:\Cats"))
+    #typer.run(run)

4 changes: 4 additions & 0 deletions crepuscular.py
@@ -87,7 +87,11 @@ def ml(samples_dir, n_bootstrap=100, n_job=5):
    cat_meta = get_cat_meta(data_dir, None)

    #Get data from raw csv
+    dataset_path = "meta_data.csv"
    cat_data = get_cat_data(data_dir, "S")
+    print(f"saving {dataset_path}...")
+    pd.concat(cat_data).to_csv(dataset_path, index=True)
+
    num_ticks = 6
    p = 0.95
    w_size = 30
19 changes: 19 additions & 0 deletions utils/del_pkl.py
@@ -0,0 +1,19 @@
+import os
+import glob
+
+# Define the directory path
+directory = r"E:\Cats\paper_debug_regularisation_36"
+
+# Construct the search pattern
+pattern = os.path.join(directory, "*.pkl")
+
+# Get the list of all .pkl files in the directory
+pkl_files = glob.glob(pattern)
+
+# Delete each .pkl file
+for file_path in pkl_files:
+    try:
+        os.remove(file_path)
+        print(f"Deleted: {file_path}")
+    except Exception as e:
+        print(f"Error deleting {file_path}: {e}")
