From 8b81a0510d6356c5696777210957bc9290125cdd Mon Sep 17 00:00:00 2001
From: Carsten Behring <carsten.behring@gmail.com>
Date: Mon, 6 May 2024 18:30:35 +0200
Subject: [PATCH] added some tests

---
 .gitignore                      |  4 ++++
 notebooks/noj_book/ml_basic.clj | 40 ++++++++++++++++++++++++++++++---
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2f81350..4f2df8d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,7 @@ temp
 *.qmd
 book
 
+/.RData
+/.Rhistory
+/.Rprofile
+/.clj-kondo/
diff --git a/notebooks/noj_book/ml_basic.clj b/notebooks/noj_book/ml_basic.clj
index 7df0cf2..8044cb2 100644
--- a/notebooks/noj_book/ml_basic.clj
+++ b/notebooks/noj_book/ml_basic.clj
@@ -11,9 +11,13 @@
   (:require [tablecloth.api :as tc]
             [scicloj.metamorph.ml.toydata :as data]
             [tech.v3.dataset :as ds]
-            [scicloj.metamorph.ml :as ml]
+            [camel-snake-kebab.core :as csk]
+            [scicloj.kindly.v4.kind :as kind]
             [scicloj.kindly.v4.api :as kindly]))
 
+
+
+
 ;; ## Inspect data
 ;;
 ;;  The titanic data is part of `metamorph.ml` and in the form of a
@@ -91,8 +95,27 @@
    (ds-cat/fit-categorical-map relevant-titanic-data :pclass [0 1 2] :float64)
    (ds-cat/fit-categorical-map relevant-titanic-data :embarked ["S" "Q" "C"] :float64)])
 
+
 cat-maps
 
+(kind/test-last (fn [cat-maps] ; NOTE(review): presumably receives the value of the preceding `cat-maps` form — confirm against Kindly docs
+                  (every?
+                   true?
+                   (map
+                    #(.equals %1 %2) ; NOTE(review): Java .equals, not `=` — presumably so record-typed mappings match plain maps; confirm before "simplifying"
+                    cat-maps ; compared pairwise with the expected maps below; `map` stops at the shorter sequence
+                    [
+                     {:lookup-table {"male" 0, "female" 1},
+                      :src-column :sex,
+                      :result-datatype :float64}
+                     {:lookup-table {0 0, 1 1, 2 2, 3 3},
+                      :src-column :pclass,
+                      :result-datatype :float64}
+                     {:lookup-table {"S" 0, "Q" 1, "C" 2},
+                      :src-column :embarked,
+                      :result-datatype :float64}]))))
+
+
 ;; After the mappings are applied, we have a numeric dataset, as expected
 ;; by most models.
 (def numeric-titanic-data
@@ -103,6 +126,16 @@ cat-maps
 (tc/head
  numeric-titanic-data)
 
+
+(kind/test-last (fn [ds] ; NOTE(review): presumably applied to the (tc/head numeric-titanic-data) result above — confirm
+                  (=
+                   [[0.0 3.0 0.0 0.0] ; expected row vectors, all values numeric
+                    [1.0 1.0 2.0 1.0]
+                    [1.0 3.0 0.0 1.0]
+                    [1.0 1.0 0.0 1.0]
+                    [0.0 3.0 0.0 0.0]]
+                   (ds/rowvecs ds)))) ; actual rows, as returned by ds/rowvecs
+
 ;; Split data into train and test set
 ;;  Now we split the data into train and test. By we use
 ;;  a :holdout strategy, so will get a single split in training an test data.
@@ -143,7 +176,7 @@ split
 (loss/classification-accuracy
  (:survived (ds-cat/reverse-map-categorical-xforms (:test split)))
  (:survived (ds-cat/reverse-map-categorical-xforms dummy-prediction)))
-;;  It's performance is poor, even worse the coin flip.
+;;  Its performance is poor, even worse than a coin flip.
 
 
 ;; ## Logistic regression
@@ -166,7 +199,8 @@ split
  (:survived (ds-cat/reverse-map-categorical-xforms (:test split)))
  (:survived (ds-cat/reverse-map-categorical-xforms lreg-prediction)))
 
-;; Its performance is  better, 60 %
+(kind/test-last [= 0.7373737373737373]) ; NOTE(review): exact float match on the accuracy above — presumably relies on a deterministic (seeded) split; confirm
+;; Its performance is better, 73 %
 
 ;; ## Random forest
 ;; Next is random forest