From 665869e91f51ca5c4598a3ddfcf81d110b79b318 Mon Sep 17 00:00:00 2001 From: "Michael (Mikhail) Yudelson" Date: Thu, 17 Jul 2014 17:06:40 -0500 Subject: [PATCH] Pushed changes from the master version, phase 2, updated copyright --- FitBit.cpp | 7 +- FitBit.h | 4 +- HMMProblem.cpp | 2 +- HMMProblem.h | 2 +- InputUtil.cpp | 2 +- InputUtil.h | 4 +- StripedArray.cpp | 2 +- StripedArray.h | 2 +- inputconvert.cpp | 2 +- predicthmm.cpp | 7 +- trainhmm.cpp | 557 ++++++++++++++++++++++++++++++++++++----------- utils.cpp | 2 +- utils.h | 2 +- 13 files changed, 450 insertions(+), 145 deletions(-) diff --git a/FitBit.cpp b/FitBit.cpp index 2c96196..ebc25fd 100644 --- a/FitBit.cpp +++ b/FitBit.cpp @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. Redistribution and use in source and binary forms, with or without @@ -302,7 +302,8 @@ void FitBit::add(enum FIT_BIT_SLOT sourse_fbs, enum FIT_BIT_SLOT target_fbs) { add(soursePI, sourseA, sourseB, targetPI, targetA, targetB); } -bool FitBit::checkConvergence() { +bool FitBit::checkConvergence(FitResult *fr) { + NUMBER critetion = 0; for(NPAR i=0; i<this->nS; i++) { @@ -315,6 +316,8 @@ bool FitBit::checkConvergence() { } } return sqrt(critetion) < this->tol; // double the truth or false + +// return (fr->pOmid - fr->pO) < this->tol; } void FitBit::doLog10ScaleGentle(enum FIT_BIT_SLOT fbs) { diff --git a/FitBit.h b/FitBit.h index 0c37dde..04f6b4e 100644 --- a/FitBit.h +++ b/FitBit.h @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -84,7 +84,7 @@ class FitBit { void destroy(enum FIT_BIT_SLOT fbs); void copy(enum FIT_BIT_SLOT sourse_fbs, enum FIT_BIT_SLOT target_fbs); void add(enum FIT_BIT_SLOT sourse_fbs, enum FIT_BIT_SLOT target_fbs); - bool checkConvergence(); + bool checkConvergence(FitResult *fr); void doLog10ScaleGentle(enum FIT_BIT_SLOT fbs); private: NUMBER tol; diff --git a/HMMProblem.cpp b/HMMProblem.cpp index 7ac9f5f..65f7c01 100644 --- a/HMMProblem.cpp +++ b/HMMProblem.cpp @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/HMMProblem.h b/HMMProblem.h index 8d70a53..0e6b3b3 100644 --- a/HMMProblem.h +++ b/HMMProblem.h @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/InputUtil.cpp b/InputUtil.cpp index 7223ebf..5cc463a 100644 --- a/InputUtil.cpp +++ b/InputUtil.cpp @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/InputUtil.h b/InputUtil.h index 4c56304..a7362a6 100644 --- a/InputUtil.h +++ b/InputUtil.h @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -39,7 +39,7 @@ #include "utils.h" //#define bin_input_file_verstion 1 -#define bin_input_file_verstion 2 // increase number of skills students to 4 bytes +#define bin_input_file_verstion 2 // increase number of skills/students to a 4 byte integer class InputUtil { public: diff --git a/StripedArray.cpp b/StripedArray.cpp index aec77db..4d7c0cd 100644 --- a/StripedArray.cpp +++ b/StripedArray.cpp @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/StripedArray.h b/StripedArray.h index bd98d68..de5a5af 100644 --- a/StripedArray.h +++ b/StripedArray.h @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/inputconvert.cpp b/inputconvert.cpp index 810784d..a88f5f6 100644 --- a/inputconvert.cpp +++ b/inputconvert.cpp @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/predicthmm.cpp b/predicthmm.cpp index 0b1e55d..9f59cb8 100644 --- a/predicthmm.cpp +++ b/predicthmm.cpp @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -110,14 +110,13 @@ int main (int argc, char ** argv) { // predict(predict_file, hmm); if(param.quiet == 0) printf("predicting is done in %8.6f seconds\n",(NUMBER)(clock()-tm)/CLOCKS_PER_SEC); - // THERE IS NO METRICS, WE PREDICT UNKNOWN, however, if we force prediction of all we do -// if( param.predictions>0 ) { + //if( param.predictions>0 ) { printf("trained model LL=%15.7f (%15.7f), AIC=%8.6f, BIC=%8.6f, RMSE=%8.6f (%8.6f), Acc=%8.6f (%8.6f)\n", metrics[0], metrics[1], // ll's 2*hmm->getNparams() + 2*metrics[0], hmm->getNparams()*safelog(param.N) + 2*metrics[0], metrics[2], metrics[3], // rmse's metrics[4], metrics[5]); // acc's -// } + //} free(metrics); destroy_input_data(&param); diff --git a/trainhmm.cpp b/trainhmm.cpp index f994b5d..a998013 100644 --- a/trainhmm.cpp +++ b/trainhmm.cpp @@ -1,6 +1,6 @@ /* - Copyright (c) 2012, Michael (Mikhail) Yudelson + Copyright (c) 2012-2014, Michael (Mikhail) Yudelson All rights reserved. Redistribution and use in source and binary forms, with or without @@ -52,27 +52,40 @@ void cross_validate_nstrat(NUMBER* metrics, const char *filename, clock_t *tm_fi int main (int argc, char ** argv) { - clock_t tm0 = clock(); + clock_t tm_all = clock();//overall time //SEQ char input_file[1024]; char output_file[1024]; char predict_file[1024]; set_param_defaults(&param); - parse_arguments(argc, argv, input_file, output_file, predict_file); - if(!param.quiet) printf("trainhmm starting...\n"); - if( ! read_and_structure_data(input_file) ) + + clock_t tm_read = clock();//overall time //SEQ + int red_ok = read_and_structure_data(input_file); + tm_read = (NUMBER)(clock()-tm_read);//SEQ + + if( ! 
red_ok ) return 0; + // now we know the real data + parse_arguments(argc, argv, input_file, output_file, predict_file); + // to reflect upon number of states and observations if those are not 2 and 2 respectively + reset_param_defaults(&param); + + +// write_pLo_irt(); + + if(!param.quiet) printf("input read, nO=%d, nG=%d, nK=%d, nI=%d\n",param.nO, param.nG, param.nK, param.nI); // erase blocking labels zeroLabels(&param); - clock_t tm; //SEQ + clock_t tm_fit; //SEQ + clock_t tm_predict; //SEQ if(param.cv_folds==0) { // not cross-validation // create problem @@ -84,11 +97,10 @@ int main (int argc, char ** argv) { hmm = new HMMProblem(&param); break; } - clock_t tm = clock(); //SEQ + tm_fit = clock(); //SEQ hmm->fit(); - - printf("fitting is done in %8.6f seconds\n",(NUMBER)(clock()-tm)/CLOCKS_PER_SEC); //SEQ + tm_fit = clock()-tm_fit;//SEQ // write model hmm->toFile(output_file); @@ -97,7 +109,10 @@ int main (int argc, char ** argv) { NUMBER* metrics = Calloc(NUMBER, (size_t)7); // LL, AIC, BIC, RMSE, RMSEnonull, Acc, Acc_nonull; // takes care of predictions and metrics, writes predictions if param.predictions==1 + tm_predict = clock(); //SEQ hmm->predict(metrics, predict_file, param.dat_obs, param.dat_group, param.dat_skill, param.dat_multiskill, false/*all, not only unlabelled*/); + tm_predict = clock()-tm_predict;//SEQ + if( param.metrics>0 /*&& !param.quiet*/) { printf("trained model LL=%15.7f (%15.7f), AIC=%8.6f, BIC=%8.6f, RMSE=%8.6f (%8.6f), Acc=%8.6f (%8.6f)\n", metrics[0], metrics[1], // ll's @@ -110,33 +125,31 @@ int main (int argc, char ** argv) { delete hmm; } else { // cross-validation - tm = clock(); //SEQ NUMBER* metrics = Calloc(NUMBER, (size_t)7); // AIC, BIC, RMSE, RMSE no null switch (param.cv_strat) { case CV_GROUP: - cross_validate(metrics, predict_file); + cross_validate(metrics, predict_file, &tm_fit, &tm_predict);//SEQ break; case CV_ITEM: - cross_validate_item(metrics, predict_file); + cross_validate_item(metrics, predict_file, &tm_fit, 
&tm_predict);//SEQ break; case CV_NSTR: - cross_validate_nstrat(metrics, predict_file); + cross_validate_nstrat(metrics, predict_file, &tm_fit, &tm_predict);//SEQ break; default: break; } - - printf("%d-fold cross-validation: LL=%15.7f, AIC=%8.6f, BIC=%8.6f, RMSE=%8.6f (%8.6f), Acc=%8.6f (%8.6f) computed in %8.6f seconds\n",param.cv_folds, metrics[0], metrics[1], metrics[2], metrics[3], metrics[4], metrics[5], metrics[6], (NUMBER)(clock()-tm)/CLOCKS_PER_SEC); //SEQ - + if(!param.quiet) { + printf("%d-fold cross-validation: LL=%15.7f, AIC=%8.6f, BIC=%8.6f, RMSE=%8.6f (%8.6f), Acc=%8.6f (%8.6f)\n",param.cv_folds, metrics[0], metrics[1], metrics[2], metrics[3], metrics[4], metrics[5], metrics[6]); //SEQ + } free(metrics); } // free data destroy_input_data(&param); - - printf("overall time running is %8.6f seconds\n",(NUMBER)(clock()-tm0)/CLOCKS_PER_SEC); //SEQ - +// if(param.quiet == 0) + printf("timing: overall %f seconds, read %f, fit %f, predict %f\n",(NUMBER)((clock()-tm_all)/CLOCKS_PER_SEC), (NUMBER)tm_read/CLOCKS_PER_SEC, (NUMBER)tm_fit/CLOCKS_PER_SEC, (NUMBER)tm_predict/CLOCKS_PER_SEC);//SEQ return 0; } @@ -169,17 +182,20 @@ void exit_with_help() { " specify observation for which metrics to be reported, list it after ','.\n" " For example '-m 0', '-m 1' (by default, observation 1 is assumed), '-m 1,2'\n" " (compute metrics for observation 2). Incompatible with-v option.\n" - "-v : cross-validation folds and target state to validate against, perform\n" - " subject-stratified cross-validation, default 0 (no cross-validation),\n" - " examples '-v 5,2' - 5 fold, predict state 2, '-v 10' - 10-fold predict\n" - " state 1 by default.\n" + "-v : cross-validation folds, stratification, and target state to validate\n" + " against, default 0 (no cross-validation),\n" + " examples '-v 5,i,2' - 5 fold, item-stratified c.-v., predict state 2,\n" + " '-v 10' - 10-fold subject-stratified c.-v. 
predict state 1 by default,\n" + " alternatively '-v 10,g,1', and finally '-v 5,n,2,' - 5-fold unstratified\n" + " c.-v. predicting state 1.\n" "-p : report model predictions on the train set 0-no (default), 1-yes; 2-yes,\n" " plus output state probability; works with -v and -m parameters.\n" "-d : delimiter for multiple skills per observation; 0-single skill per\n" " observation (default), otherwise -- delimiter character, e.g. '-d ~'.\n" "-b : treat input file as binary input file (specifications TBA).\n" - "-B : Block PI (prior), A (transition), or B (observation) parameters from being\n" - " fit. E.g., '-B 0,0,0 (default) blocks none, '-B 1,0,0' blocks PI (priors).\n" + "-B : block re-estimation of prior, transitions, or emissions parameters\n" + " respectively (defailt is '-B 0,0,0'), to block re-estimation of transition\n" + " probabilities specify '-B 0,1,0'.\n" ); exit(1); } @@ -187,9 +203,14 @@ void exit_with_help() { void parse_arguments(int argc, char **argv, char *input_file_name, char *output_file_name, char *predict_file_name) { // parse command line options, starting from 1 (0 is path to executable) // go in pairs, looking at whether first in pair starts with '-', if not, stop parsing arguments + + // at this time we should know nO -- the number of observations int i; int n; char *ch, *ch2; + bool init_specd = false; // init parameters specified + bool lims_specd = false; // parameter limits specified + bool stat_specd_gt2 = false; // number of states specified to be >2 for(i=1;i