From 8b81a0510d6356c5696777210957bc9290125cdd Mon Sep 17 00:00:00 2001 From: Carsten Behring Date: Mon, 6 May 2024 18:30:35 +0200 Subject: [PATCH] added some tests --- .gitignore | 4 ++++ notebooks/noj_book/ml_basic.clj | 40 ++++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 2f81350..4f2df8d 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,7 @@ temp *.qmd book +/.RData +/.Rhistory +/.Rprofile +/.clj-kondo/ diff --git a/notebooks/noj_book/ml_basic.clj b/notebooks/noj_book/ml_basic.clj index 7df0cf2..8044cb2 100644 --- a/notebooks/noj_book/ml_basic.clj +++ b/notebooks/noj_book/ml_basic.clj @@ -11,9 +11,13 @@ (:require [tablecloth.api :as tc] [scicloj.metamorph.ml.toydata :as data] [tech.v3.dataset :as ds] - [scicloj.metamorph.ml :as ml] + [camel-snake-kebab.core :as csk] + [scicloj.kindly.v4.kind :as kind] [scicloj.kindly.v4.api :as kindly])) + + + ;; ## Inspect data ;; ;; The titanic data is part of `metamorph.ml` and in the form of a @@ -91,8 +95,27 @@ (ds-cat/fit-categorical-map relevant-titanic-data :pclass [0 1 2] :float64) (ds-cat/fit-categorical-map relevant-titanic-data :embarked ["S" "Q" "C"] :float64)]) + cat-maps +(kind/test-last (fn [cat-maps] + (every? + true? + (map + #(.equals %1 %2) + cat-maps + [ + {:lookup-table {"male" 0, "female" 1}, + :src-column :sex, + :result-datatype :float64} + {:lookup-table {0 0, 1 1, 2 2, 3 3}, + :src-column :pclass, + :result-datatype :float64} + {:lookup-table {"S" 0, "Q" 1, "C" 2}, + :src-column :embarked, + :result-datatype :float64}])))) + + ;; After the mappings are applied, we have a numeric dataset, as expected ;; by most models. 
(def numeric-titanic-data @@ -103,6 +126,16 @@ cat-maps (tc/head numeric-titanic-data) + +(kind/test-last (fn [ds] + (= + [[0.0 3.0 0.0 0.0] + [1.0 1.0 2.0 1.0] + [1.0 3.0 0.0 1.0] + [1.0 1.0 0.0 1.0] + [0.0 3.0 0.0 0.0]] + (ds/rowvecs ds)))) + ;; Split data into train and test set ;; Now we split the data into train and test. By we use ;; a :holdout strategy, so will get a single split in training an test data. @@ -143,7 +176,7 @@ split (loss/classification-accuracy (:survived (ds-cat/reverse-map-categorical-xforms (:test split))) (:survived (ds-cat/reverse-map-categorical-xforms dummy-prediction))) -;; It's performance is poor, even worse the coin flip. +;; Its performance is poor, even worse than a coin flip. ;; ## Logistic regression @@ -166,7 +199,8 @@ split (:survived (ds-cat/reverse-map-categorical-xforms (:test split))) (:survived (ds-cat/reverse-map-categorical-xforms lreg-prediction))) -;; Its performance is better, 60 % +(kind/test-last [= 0.7373737373737373]) +;; Its performance is better, 73 % ;; ## Random forest ;; Next is random forest