From 8a59b384060adaf1c3379dd36c02cc2e1c55fe67 Mon Sep 17 00:00:00 2001 From: Nicu Tofan Date: Sun, 25 Aug 2013 23:14:40 +0300 Subject: [PATCH] Covering some parts of the cogutil in doxydocs; reformatted cluster.h & .c to make description visible to doxygen --- doc/doxydoc/libcogutil.dox | 26 +- doc/doxydoc/progcogserver.dox | 6 +- doc/doxygen.conf.in | 2 +- opencog/util/algorithm.h | 48 +- opencog/util/ansi.h | 9 +- opencog/util/cluster.c | 850 +++++----------------------------- opencog/util/cluster.h | 638 ++++++++++++++++++++++++- 7 files changed, 797 insertions(+), 782 deletions(-) diff --git a/doc/doxydoc/libcogutil.dox b/doc/doxydoc/libcogutil.dox index fe242c608cf..b9d06fcc442 100644 --- a/doc/doxydoc/libcogutil.dox +++ b/doc/doxydoc/libcogutil.dox @@ -1,3 +1,4 @@ +namespace opencog { /** \page libcogutil cogutil library @@ -8,7 +9,30 @@ found no place elsewhere. To build it use: make cogutil @endcode +\section sect_files Files +\subsection ssect_algorithm_h algorithm.h +Hosts small templated functions like for_each(), +accumulate2d(), etc. Sets utilities are also present. + +\subsection ssect_ansi ansi (.h & .cc) +ANSI codes for colored output in terminals that support this feature. +Instead of directly using the codes, make use of the ansi_code() +function that checks the \b ANSI_ENABLED configuration variable +and either appends the code or an empty string. + +\subsection ssect_basedvar based_variant.h +Defines a simple variant with base; seems to be used only in defining +moses::disc_knob type. + +\section sect_libs Libraries + +\subsection ssect_clustering The C Clustering Library +This library (version 1.49) was written at the Laboratory of DNA Information Analysis, +Human Genome Center, Institute of Medical Science, University of Tokyo +by Michiel Jan Laurens de Hoon. + +Methods such as clusterdistance(), getclustercentroids(), kcluster() are provided. \if MARKER_TREE_START ignored by doxygen; used as markers for update-links.py; @@ -23,4 +47,4 @@ ignored by doxygen; used as markers for update-links.py; ignored by doxygen; used as markers for update-links.py; \endif */ - +} //~namespace opencog diff --git a/doc/doxydoc/progcogserver.dox b/doc/doxydoc/progcogserver.dox index 981345ab545..2f09545ac37 100644 --- a/doc/doxydoc/progcogserver.dox +++ b/doc/doxydoc/progcogserver.dox @@ -1,7 +1,11 @@ /** - \page progcogserver cogserver program +To build it use: +@code +make cogserver +@endcode + \if MARKER_TREE_START ignored by doxygen; used as markers for update-links.py; \endif diff --git a/doc/doxygen.conf.in b/doc/doxygen.conf.in index 3bcbfb411bd..fc9009cbb8e 100644 --- a/doc/doxygen.conf.in +++ b/doc/doxygen.conf.in @@ -251,7 +251,7 @@ IDL_PROPERTY_SUPPORT = YES # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. -DISTRIBUTE_GROUP_DOC = NO +DISTRIBUTE_GROUP_DOC = YES # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a diff --git a/opencog/util/algorithm.h b/opencog/util/algorithm.h index f2966540be1..5b15b871576 100644 --- a/opencog/util/algorithm.h +++ b/opencog/util/algorithm.h @@ -13,7 +13,7 @@ namespace opencog { -//these needs to be changed for non-gcc +/// @todo these needs to be changed for non-gcc using __gnu_cxx::copy_n; using __gnu_cxx::lexicographical_compare_3way; using __gnu_cxx::random_sample_n; @@ -21,7 +21,7 @@ using __gnu_cxx::random_sample; using __gnu_cxx::is_heap; using __gnu_cxx::is_sorted; -//binary and ternary and quaternary for_each +//! binary for_each template F for_each(It1 from1, It1 to1, It2 from2, F f) { @@ -29,6 +29,7 @@ F for_each(It1 from1, It1 to1, It2 from2, F f) f(*from1, *from2); return f; } +//! ternary for_each template F for_each(It1 from1, It1 to1, It2 from2, It3 from3, F f) { @@ -36,6 +37,7 @@ F for_each(It1 from1, It1 to1, It2 from2, It3 from3, F f) f(*from1, *from2, *from3); return f; } +//! quaternary for_each template F for_each(It1 from1, It1 to1, It2 from2, It3 from3, It4 from4, F f) { @@ -44,7 +46,7 @@ F for_each(It1 from1, It1 to1, It2 from2, It3 from3, It4 from4, F f) return f; } -//accumulate over a range of containers +//! accumulate over a range of containers template T accumulate2d(iter first, iter last, T init) { @@ -53,14 +55,14 @@ T accumulate2d(iter first, iter last, T init) return init; } -// appends b at the end of a, that is a = a@b +//! appends b at the end of a, that is a = a@b template void append(T& a, const T& b) { a.insert(a.end(), b.begin(), b.end()); } -//erase the intersection of sorted ranges [from1,to1) and [from2,to2) from c, -//leaving the difference +//! erase the intersection of sorted ranges [from1,to1) and [from2,to2) from c, +//! leaving the difference template void erase_set_intersection(Erase erase, It1 from1, It1 to1, It2 from2, It2 to2, Comp comp) @@ -81,8 +83,8 @@ void erase_set_intersection(Erase erase, It1 from1, It1 to1, } } -//erase the difference of sorted ranges [from1,to1) and [from2,to2) from c, -//leaving the intersection +//! erase the difference of sorted ranges [from1,to1) and [from2,to2) from c, +//! leaving the intersection template void erase_set_difference(Erase erase, It1 from1, It1 to1, It2 from2, It2 to2, Comp comp) @@ -105,9 +107,9 @@ void erase_set_difference(Erase erase, It1 from1, It1 to1, erase(from1++); } -//insert [from2,to2)-[from1,to1) with inserter -//i.e., if insert inserts into the container holding [from1,to1), -//it will now hold the union +//! insert [from2,to2)-[from1,to1) with inserter +//! i.e., if insert inserts into the container holding [from1,to1), +//! it will now hold the union template void insert_set_complement(Insert insert, It1 from1, It1 to1, It2 from2, It2 to2, Comp comp) @@ -133,6 +135,7 @@ void insert_set_complement(Insert insert, It1 from1, It1 to1, } } +//! determine if the intersection of two sets is the empty set template bool has_empty_intersection(It1 from1, It1 to1, It2 from2, It2 to2, Comp comp) @@ -152,6 +155,7 @@ bool has_empty_intersection(It1 from1, It1 to1, return true; } +//! determine if the intersection of two sets is the empty set template bool has_empty_intersection(const Set& ls, const Set& rs) { return has_empty_intersection(ls.begin(), ls.end(), @@ -192,7 +196,7 @@ bool is_disjoint(const Set1 &set1, const Set2 &set2) } /** - * return a singleton set with the input given in it + * \return a singleton set with the input given in it */ template Set make_singleton_set(const typename Set::value_type& v) { @@ -202,7 +206,7 @@ Set make_singleton_set(const typename Set::value_type& v) { } /** - * Return s1 union s2 + * \return s1 union s2 * s1 and s2 being std::set or similar concept */ template @@ -213,7 +217,7 @@ Set set_union(const Set& s1, const Set& s2) { } /** - * Return s1 inter s2 + * \return s1 inter s2 * s1 and s2 must be sorted */ template @@ -225,7 +229,7 @@ Set set_intersection(const Set& s1, const Set& s2) { } /** - * Returns s1 - s2 + * \return s1 - s2 * s1 and s2 must be sorted */ template @@ -236,8 +240,8 @@ Set set_difference(const Set& s1, const Set& s2) { return res; } -// Predicate maps to the range [0, n) -// n-1 values (the pivots) are copied to out +//! Predicate maps to the range [0, n) +//! n-1 values (the pivots) are copied to out template Out n_way_partition(It begin, It end, const Pred p, int n, Out out) { @@ -248,16 +252,13 @@ Out n_way_partition(It begin, It end, const Pred p, int n, Out out) } /** - * return the power set ps of s such that all elements of ps are + * \return the power set ps of s such that all elements of ps are * subsets of size n or below. * * @param s the input set - * * @param n the size of the largest subset of s. If n is larger than * |s| then the size of the largest subset of s will be |s| - * * @param exact if true then do not include subsets of size below n - * * @return the power set of s with subsets up to size n */ template std::set powerset(const Set& s, size_t n, @@ -280,8 +281,9 @@ template std::set powerset(const Set& s, size_t n, res.insert(Set()); return res; } + /** - * return the power set of s. + * \return the power set of s. */ template std::set powerset(const Set& s) { @@ -302,7 +304,7 @@ Seq seq_filtered(const Seq& seq, const Indices& indices) { res.push_back(seq[idx]); return res; } - + } //~namespace opencog diff --git a/opencog/util/ansi.h b/opencog/util/ansi.h index 337ad3d64c2..d5dec980ab3 100644 --- a/opencog/util/ansi.h +++ b/opencog/util/ansi.h @@ -26,6 +26,8 @@ #include namespace opencog { +//!@{ +//! color codes in ANSI static const char* const COLOR_OFF = "\033[0m"; static const char* const BRIGHT = "\033[1m"; static const char* const RED = "\033[31m"; @@ -35,11 +37,15 @@ static const char* const BLUE = "\033[34m"; static const char* const MAGENTA = "\033[35m"; static const char* const CYAN = "\033[36m"; static const char* const WHITE = "\033[37m"; - +//!@} + +//! inserts the code only if \b ANSI_ENABLED variable is set inline void ansi_code(std::string &s,const std::string &code) { if (config().get_bool("ANSI_ENABLED")) s.append(code); } +//!@{ +//! appends the code if \b ANSI_ENABLED variable is set inline void ansi_off(std::string &s) { ansi_code(s,COLOR_OFF); } inline void ansi_bright(std::string &s) { ansi_code(s,BRIGHT); } inline void ansi_red(std::string &s) { ansi_code(s,RED); } @@ -49,5 +55,6 @@ inline void ansi_blue(std::string &s) { ansi_code(s,BLUE); } inline void ansi_magenta(std::string &s) { ansi_code(s,MAGENTA); } inline void ansi_cyan(std::string &s) { ansi_code(s,CYAN); } inline void ansi_white(std::string &s) { ansi_code(s,WHITE); } +//!@} } diff --git a/opencog/util/cluster.c b/opencog/util/cluster.c index 839d7bdfb1d..c61d4779ffc 100644 --- a/opencog/util/cluster.c +++ b/opencog/util/cluster.c @@ -64,13 +64,6 @@ double mean(int n, double x[]) /* ************************************************************************ */ double median (int n, double x[]) -/* -Find the median of X(1), ... , X(N), using as much of the quicksort -algorithm as is needed to isolate it. -N.B. On exit, the array X is partially ordered. -Based on Alan J. Miller's median.f90 routine. -*/ - { int i, j; int nr = n / 2; int nl = nr - 1; @@ -159,15 +152,20 @@ Based on Alan J. Miller's median.f90 routine. /* ********************************************************************** */ -static const double* sortdata = NULL; /* used in the quicksort algorithm */ +//! used in the quicksort algorithm +//! +static const double* sortdata = NULL; /* ---------------------------------------------------------------------- */ +/** + * \internal + * Helper function for sort. Previously, this was a nested function under + * sort, which is not allowed under ANSI C. + * + */ static int compare(const void* a, const void* b) -/* Helper function for sort. Previously, this was a nested function under - * sort, which is not allowed under ANSI C. - */ { const int i1 = *(const int*)a; const int i2 = *(const int*)b; const double term1 = sortdata[i1]; @@ -180,10 +178,6 @@ int compare(const void* a, const void* b) /* ---------------------------------------------------------------------- */ void sort(int n, const double data[], int index[]) -/* Sets up an index table given the data, such that data[index[]] is in - * increasing order. Sorting is done on the indices; the array data - * is unchanged. - */ { int i; sortdata = data; for (i = 0; i < n; i++) index[i] = i; @@ -192,13 +186,16 @@ void sort(int n, const double data[], int index[]) /* ********************************************************************** */ -static double* getrank (int n, double data[]) -/* Calculates the ranks of the elements in the array data. Two elements with +/** + * \internal + * Calculates the ranks of the elements in the array data. Two elements with * the same value get the same rank, equal to the average of the ranks had the * elements different values. The ranks are returned as a newly allocated * array that should be freed by the calling routine. If getrank fails due to * a memory allocation error, it returns NULL. + * */ +static double* getrank (int n, double data[]) { int i; double* rank; int* index; @@ -284,9 +281,9 @@ freedatamask(int n, double** data, int** mask) /* ---------------------------------------------------------------------- */ -static -double find_closest_pair(int n, double** distmatrix, int* ip, int* jp) -/* +/** +\internal + This function searches the distance matrix to find the pair with the shortest distance between them. The indices of the pair are returned in ip and jp; the distance itself is returned by the function. @@ -305,7 +302,11 @@ the shortest distance. jp (output) int* A pointer to the integer that is to receive the second index of the pair with the shortest distance. + + */ +static +double find_closest_pair(int n, double** distmatrix, int* ip, int* jp) { int i, j; double temp; double distance = distmatrix[1][0]; @@ -326,8 +327,9 @@ the shortest distance. /* ********************************************************************* */ -static int svd(int m, int n, double** u, double w[], double** vt) -/* +/** + * \internal + * * This subroutine is a translation of the Algol procedure svd, * Num. Math. 14, 403-420(1970) by Golub and Reinsch. * Handbook for Auto. Comp., Vol II-Linear Algebra, 134-151(1971). @@ -390,7 +392,10 @@ static int svd(int m, int n, double** u, double w[], double** vt) * internally in this routine, instead of passing it as an * argument. If the allocation fails, svd returns -1. * 2003.06.05 + * + * */ +static int svd(int m, int n, double** u, double w[], double** vt) { int i, j, k, i1, k1, l1, its; double c,f,h,s,x,y,z; int l = 0; @@ -809,63 +814,7 @@ static int svd(int m, int n, double** u, double w[], double** vt) /* ********************************************************************* */ int pca(int nrows, int ncolumns, double** u, double** v, double* w) -/* -Purpose -======= - -This subroutine uses the singular value decomposition to perform principal -components analysis of a real nrows by ncolumns rectangular matrix. - -Arguments -========= - -nrows (input) int -The number of rows in the matrix u. -ncolumns (input) int -The number of columns in the matrix v. - -u (input) double[nrows][ncolumns] -On input, the array containing the data to which the principal component -analysis should be applied. The function assumes that the mean has already been -subtracted of each column, and hence that the mean of each column is zero. -On output, see below. - -v (input) double[n][n], where n = min(nrows, ncolumns) -Not used on input. - -w (input) double[n], where n = min(nrows, ncolumns) -Not used on input. - - -Return value -============ - -On output: - -If nrows >= ncolumns, then - -u contains the coordinates with respect to the principal components; -v contains the principal component vectors. - -The dot product u . v reproduces the data that were passed in u. - - -If nrows < ncolumns, then - -u contains the principal component vectors; -v contains the coordinates with respect to the principal components. - -The dot product v . u reproduces the data that were passed in u. - -The eigenvalues of the covariance matrix are returned in w. - -The arrays u, v, and w are sorted according to eigenvalue, with the largest -eigenvalues appearing first. - -The function returns 0 if successful, -1 if memory allocation fails, and a -positive integer if the singular value decomposition fails to converge. -*/ { int i; int j; @@ -932,11 +881,10 @@ positive integer if the singular value decomposition fails to converge. /* ********************************************************************* */ -static -double euclid (int n, double** data1, double** data2, int** mask1, int** mask2, - const double weight[], int index1, int index2, int transpose) - -/* + +/** +\internal + Purpose ======= @@ -977,9 +925,13 @@ transpose (input) int If transpose==0, the distance between two rows in the matrix is calculated. Otherwise, the distance between two columns in the matrix is calculated. -============================================================================ + */ -{ double result = 0.; +static +double euclid (int n, double** data1, double** data2, int** mask1, int** mask2, + const double weight[], int index1, int index2, int transpose) +{ + double result = 0.; double tweight = 0; int i; if (transpose==0) /* Calculate the distance between two rows */ @@ -1007,13 +959,10 @@ Otherwise, the distance between two columns in the matrix is calculated. /* ********************************************************************* */ -static -double cityblock (int n, double** data1, double** data2, int** mask1, - int** mask2, const double weight[], int index1, int index2, int transpose) -/* -Purpose -======= +/** +\internal + The cityblock routine calculates the weighted "City Block" distance between two rows or columns in a matrix. City Block distance is defined as the @@ -1053,8 +1002,10 @@ Index of the second row or column. transpose (input) int If transpose==0, the distance between two rows in the matrix is calculated. Otherwise, the distance between two columns in the matrix is calculated. - -============================================================================ */ +*/ +static +double cityblock (int n, double** data1, double** data2, int** mask1, + int** mask2, const double weight[], int index1, int index2, int transpose) { double result = 0.; double tweight = 0; int i; @@ -1086,7 +1037,10 @@ Otherwise, the distance between two columns in the matrix is calculated. static double correlation (int n, double** data1, double** data2, int** mask1, int** mask2, const double weight[], int index1, int index2, int transpose) -/* +/** +\internal + + Purpose ======= @@ -1130,7 +1084,6 @@ Index of the second row or column. transpose (input) int If transpose==0, the distance between two rows in the matrix is calculated. Otherwise, the distance between two columns in the matrix is calculated. -============================================================================ */ { double result = 0.; double sum1 = 0.; @@ -1186,7 +1139,9 @@ Otherwise, the distance between two columns in the matrix is calculated. static double acorrelation (int n, double** data1, double** data2, int** mask1, int** mask2, const double weight[], int index1, int index2, int transpose) -/* +/** +\internal + Purpose ======= @@ -1229,7 +1184,7 @@ Index of the second row or column. transpose (input) int If transpose==0, the distance between two rows in the matrix is calculated. Otherwise, the distance between two columns in the matrix is calculated. -============================================================================ + */ { double result = 0.; double sum1 = 0.; @@ -1285,7 +1240,9 @@ Otherwise, the distance between two columns in the matrix is calculated. static double ucorrelation (int n, double** data1, double** data2, int** mask1, int** mask2, const double weight[], int index1, int index2, int transpose) -/* +/** +\internal + Purpose ======= @@ -1330,7 +1287,7 @@ Index of the second row or column. transpose (input) int If transpose==0, the distance between two rows in the matrix is calculated. Otherwise, the distance between two columns in the matrix is calculated. -============================================================================ + */ { double result = 0.; double denom1 = 0.; @@ -1380,7 +1337,9 @@ Otherwise, the distance between two columns in the matrix is calculated. static double uacorrelation (int n, double** data1, double** data2, int** mask1, int** mask2, const double weight[], int index1, int index2, int transpose) -/* +/** +\internal + Purpose ======= @@ -1425,7 +1384,7 @@ Index of the second row or column. transpose (input) int If transpose==0, the distance between two rows in the matrix is calculated. Otherwise, the distance between two columns in the matrix is calculated. -============================================================================ + */ { double result = 0.; double denom1 = 0.; @@ -1475,7 +1434,9 @@ Otherwise, the distance between two columns in the matrix is calculated. static double spearman (int n, double** data1, double** data2, int** mask1, int** mask2, const double weight[], int index1, int index2, int transpose) -/* +/** +\internal + Purpose ======= @@ -1517,7 +1478,7 @@ Index of the second row or column. transpose (input) int If transpose==0, the distance between two rows in the matrix is calculated. Otherwise, the distance between two columns in the matrix is calculated. -============================================================================ + */ { int i; int m = 0; @@ -1603,7 +1564,9 @@ Otherwise, the distance between two columns in the matrix is calculated. static double kendall (int n, double** data1, double** data2, int** mask1, int** mask2, const double weight[], int index1, int index2, int transpose) -/* +/** +\internal + Purpose ======= @@ -1644,7 +1607,7 @@ Index of the second row or column. transpose (input) int If transpose==0, the distance between two rows in the matrix is calculated. Otherwise, the distance between two columns in the matrix is calculated. -============================================================================ + */ { int con = 0; int dis = 0; @@ -1730,7 +1693,9 @@ static double(*setmetric(char dist)) /* ********************************************************************* */ static double uniform(void) -/* +/** +\internal + Purpose ======= @@ -1758,7 +1723,7 @@ Return value ============ A double-precison number between 0.0 and 1.0. -============================================================================ + */ { int z; static const int m1 = 2147483563; @@ -1793,7 +1758,9 @@ A double-precison number between 0.0 and 1.0. /* ************************************************************************ */ static int binomial(int n, double p) -/* +/** +\internal + Purpose ======= @@ -1822,7 +1789,7 @@ Return value An integer drawn from a binomial distribution with parameters (p, n). -============================================================================ + */ { const double q = 1 - p; if (n*p < 30.0) /* Algorithm BINV */ @@ -1936,7 +1903,9 @@ An integer drawn from a binomial distribution with parameters (p, n). /* ************************************************************************ */ static void randomassign (int nclusters, int nelements, int clusterid[]) -/* +/** +\internal + Purpose ======= @@ -1959,7 +1928,7 @@ to be clustered). clusterid (output) int[nelements] The cluster number to which an element was assigned. -============================================================================ + */ { int i, j; int k = 0; @@ -1995,7 +1964,9 @@ The cluster number to which an element was assigned. static void getclustermeans(int nclusters, int nrows, int ncolumns, double** data, int** mask, int clusterid[], double** cdata, int** cmask, int transpose) -/* +/** +\internal + Purpose ======= @@ -2044,7 +2015,7 @@ transpose (input) int If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of columns (microarrays) are specified. -======================================================================== + */ { int i, j, k; if (transpose==0) @@ -2105,7 +2076,9 @@ static void getclustermedians(int nclusters, int nrows, int ncolumns, double** data, int** mask, int clusterid[], double** cdata, int** cmask, int transpose, double cache[]) -/* +/** +\internal + Purpose ======= @@ -2160,7 +2133,7 @@ This array should be allocated before calling getclustermedians; its contents on input is not relevant. This array is used as a temporary storage space when calculating the medians. -======================================================================== + */ { int i, j, k; if (transpose==0) @@ -2212,70 +2185,6 @@ calculating the medians. int getclustercentroids(int nclusters, int nrows, int ncolumns, double** data, int** mask, int clusterid[], double** cdata, int** cmask, int transpose, char method) -/* -Purpose -======= - -The getclustercentroids routine calculates the cluster centroids, given to -which cluster each element belongs. Depending on the argument method, the -centroid is defined as either the mean or the median for each dimension over -all elements belonging to a cluster. - -Arguments -========= - -nclusters (input) int -The number of clusters. - -nrows (input) int -The number of rows in the gene expression data matrix, equal to the number of -genes. - -ncolumns (input) int -The number of columns in the gene expression data matrix, equal to the number of -microarrays. - -data (input) double[nrows][ncolumns] -The array containing the gene expression data. - -mask (input) int[nrows][ncolumns] -This array shows which data values are missing. If mask[i][j]==0, then -data[i][j] is missing. - -clusterid (output) int[nrows] if transpose==0 - int[ncolumns] if transpose==1 -The cluster number to which each element belongs. If transpose==0, then the -dimension of clusterid is equal to nrows (the number of genes). Otherwise, it -is equal to ncolumns (the number of microarrays). - -cdata (output) double[nclusters][ncolumns] if transpose==0 - double[nrows][nclusters] if transpose==1 -On exit of getclustercentroids, this array contains the cluster centroids. - -cmask (output) int[nclusters][ncolumns] if transpose==0 - int[nrows][nclusters] if transpose==1 -This array shows which data values of are missing for each centroid. If -cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for -a centroid if all corresponding data values of the cluster members are missing. - -transpose (input) int -If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of -columns (microarrays) are specified. - -method (input) char -For method=='a', the centroid is defined as the mean over all elements -belonging to a cluster for each dimension. -For method=='m', the centroid is defined as the median over all elements -belonging to a cluster for each dimension. - -Return value -============ - -The function returns an integer to indicate success or failure. If a -memory error occurs, or if method is not 'm' or 'a', getclustercentroids -returns 0. If successful, getclustercentroids returns 1. -======================================================================== -*/ { switch(method) { case 'm': { const int nelements = (transpose==0) ? nrows : ncolumns; @@ -2299,41 +2208,6 @@ returns 0. If successful, getclustercentroids returns 1. void getclustermedoids(int nclusters, int nelements, double** distance, int clusterid[], int centroids[], double errors[]) -/* -Purpose -======= - -The getclustermedoids routine calculates the cluster centroids, given to which -cluster each element belongs. The centroid is defined as the element with the -smallest sum of distances to the other elements. - -Arguments -========= - -nclusters (input) int -The number of clusters. - -nelements (input) int -The total number of elements. - -distmatrix (input) double array, ragged - (number of rows is nelements, number of columns is equal to the row number) -The distance matrix. To save space, the distance matrix is given in the -form of a ragged array. The distance matrix is symmetric and has zeros -on the diagonal. See distancematrix for a description of the content. - -clusterid (output) int[nelements] -The cluster number to which each element belongs. - -centroid (output) int[nclusters] -The index of the element that functions as the centroid for each cluster. - -errors (output) double[nclusters] -The within-cluster sum of distances between the items and the cluster -centroid. - -======================================================================== -*/ { int i, j, k; for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX; for (i = 0; i < nelements; i++) @@ -2567,87 +2441,6 @@ void kcluster (int nclusters, int nrows, int ncolumns, double** data, int** mask, double weight[], int transpose, int npass, char method, char dist, int clusterid[], double* error, int* ifound) -/* -Purpose -======= - -The kcluster routine performs k-means or k-median clustering on a given set of -elements, using the specified distance measure. The number of clusters is given -by the user. Multiple passes are being made to find the optimal clustering -solution, each time starting from a different initial clustering. - - -Arguments -========= - -nclusters (input) int -The number of clusters to be found. - -data (input) double[nrows][ncolumns] -The array containing the data of the elements to be clustered (i.e., the gene -expression data). - -mask (input) int[nrows][ncolumns] -This array shows which data values are missing. If -mask[i][j] == 0, then data[i][j] is missing. - -nrows (input) int -The number of rows in the data matrix, equal to the number of genes. - -ncolumns (input) int -The number of columns in the data matrix, equal to the number of microarrays. - -weight (input) double[n] -The weights that are used to calculate the distance. - -transpose (input) int -If transpose==0, the rows of the matrix are clustered. Otherwise, columns -of the matrix are clustered. - -npass (input) int -The number of times clustering is performed. Clustering is performed npass -times, each time starting from a different (random) initial assignment of -genes to clusters. The clustering solution with the lowest within-cluster sum -of distances is chosen. -If npass==0, then the clustering algorithm will be run once, where the initial -assignment of elements to clusters is taken from the clusterid array. - -method (input) char -Defines whether the arithmetic mean (method=='a') or the median -(method=='m') is used to calculate the cluster center. - -dist (input) char -Defines which distance measure is used, as given by the table: -dist=='e': Euclidean distance -dist=='b': City-block distance -dist=='c': correlation -dist=='a': absolute value of the correlation -dist=='u': uncentered correlation -dist=='x': absolute uncentered correlation -dist=='s': Spearman's rank correlation -dist=='k': Kendall's tau -For other values of dist, the default (Euclidean distance) is used. - -clusterid (output; input) int[nrows] if transpose==0 - int[ncolumns] if transpose==1 -The cluster number to which a gene or microarray was assigned. If npass==0, -then on input clusterid contains the initial clustering assignment from which -the clustering algorithm starts. On output, it contains the clustering solution -that was found. - -error (output) double* -The sum of distances to the cluster center of each item in the optimal k-means -clustering solution that was found. - -ifound (output) int* -The number of times the optimal clustering solution was -found. The value of ifound is at least 1; its maximum value is npass. If the -number of clusters is larger than the number of elements being clustered, -*ifound is set to 0 as an error code. If a memory allocation error occurs, -*ifound is set to -1. - -======================================================================== -*/ { const int nelements = (transpose==0) ? nrows : ncolumns; const int ndata = (transpose==0) ? ncolumns : nrows; @@ -2731,61 +2524,6 @@ number of clusters is larger than the number of elements being clustered, void kmedoids (int nclusters, int nelements, double** distmatrix, int npass, int clusterid[], double* error, int* ifound) -/* -Purpose -======= - -The kmedoids routine performs k-medoids clustering on a given set of elements, -using the distance matrix and the number of clusters passed by the user. -Multiple passes are being made to find the optimal clustering solution, each -time starting from a different initial clustering. - - -Arguments -========= - -nclusters (input) int -The number of clusters to be found. - -nelements (input) int -The number of elements to be clustered. - -distmatrix (input) double array, ragged - (number of rows is nelements, number of columns is equal to the row number) -The distance matrix. To save space, the distance matrix is given in the -form of a ragged array. The distance matrix is symmetric and has zeros -on the diagonal. See distancematrix for a description of the content. - -npass (input) int -The number of times clustering is performed. Clustering is performed npass -times, each time starting from a different (random) initial assignment of genes -to clusters. The clustering solution with the lowest within-cluster sum of -distances is chosen. -If npass==0, then the clustering algorithm will be run once, where the initial -assignment of elements to clusters is taken from the clusterid array. - -clusterid (output; input) int[nelements] -On input, if npass==0, then clusterid contains the initial clustering assignment -from which the clustering algorithm starts; all numbers in clusterid should be -between zero and nelements-1 inclusive. If npass!=0, clusterid is ignored on -input. -On output, clusterid contains the clustering solution that was found: clusterid -contains the number of the cluster to which each item was assigned. On output, -the number of a cluster is defined as the item number of the centroid of the -cluster. - -error (output) double -The sum of distances to the cluster center of each item in the optimal k-medoids -clustering solution that was found. - -ifound (output) int -If kmedoids is successful: the number of times the optimal clustering solution -was found. The value of ifound is at least 1; its maximum value is npass. -If the user requested more clusters than elements available, ifound is set -to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1. - -======================================================================== -*/ { int i, j, icluster; int* tclusterid; int* saved; @@ -2907,67 +2645,6 @@ to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1. double** distancematrix (int nrows, int ncolumns, double** data, int** mask, double weights[], char dist, int transpose) -/* -Purpose -======= - -The distancematrix routine calculates the distance matrix between genes or -microarrays using their measured gene expression data. Several distance measures -can be used. The routine returns a pointer to a ragged array containing the -distances between the genes. As the distance matrix is symmetric, with zeros on -the diagonal, only the lower triangular half of the distance matrix is saved. -The distancematrix routine allocates space for the distance matrix. If the -parameter transpose is set to a nonzero value, the distances between the columns -(microarrays) are calculated, otherwise distances between the rows (genes) are -calculated. -If sufficient space in memory cannot be allocated to store the distance matrix, -the routine returns a NULL pointer, and all memory allocated so far for the -distance matrix is freed. - - -Arguments -========= - -nrows (input) int -The number of rows in the gene expression data matrix (i.e., the number of -genes) - -ncolumns (input) int -The number of columns in the gene expression data matrix (i.e., the number of -microarrays) - -data (input) double[nrows][ncolumns] -The array containing the gene expression data. - -mask (input) int[nrows][ncolumns] -This array shows which data values are missing. If mask[i][j]==0, then -data[i][j] is missing. - -weight (input) double[n] -The weights that are used to calculate the distance. The length of this vector -is equal to the number of columns if the distances between genes are calculated, -or the number of rows if the distances between microarrays are calculated. - -dist (input) char -Defines which distance measure is used, as given by the table: -dist=='e': Euclidean distance -dist=='b': City-block distance -dist=='c': correlation -dist=='a': absolute value of the correlation -dist=='u': uncentered correlation -dist=='x': absolute uncentered correlation -dist=='s': Spearman's rank correlation -dist=='k': Kendall's tau -For other values of dist, the default (Euclidean distance) is used. - -transpose (input) int -If transpose is equal to zero, the distances between the rows is -calculated. Otherwise, the distances between the columns is calculated. -The former is needed when genes are being clustered; the latter is used -when microarrays are being clustered. - -======================================================================== -*/ { /* First determine the size of the distance matrix */ const int n = (transpose==0) ? nrows : ncolumns; const int ndata = (transpose==0) ? ncolumns : nrows; @@ -3009,73 +2686,6 @@ when microarrays are being clustered. double* calculate_weights(int nrows, int ncolumns, double** data, int** mask, double weights[], int transpose, char dist, double cutoff, double exponent) -/* -Purpose -======= - -This function calculates the weights using the weighting scheme proposed by -Michael Eisen: -w[i] = 1.0 / sum_{j where d[i][j]= ncolumns, then + +u contains the coordinates with respect to the principal components; +v contains the principal component vectors. + +The dot product u . v reproduces the data that were passed in u. + + +If nrows < ncolumns, then + +u contains the principal component vectors; +v contains the coordinates with respect to the principal components. + +The dot product v . u reproduces the data that were passed in u. + +The eigenvalues of the covariance matrix are returned in w. + +The arrays u, v, and w are sorted according to eigenvalue, with the largest +eigenvalues appearing first. + +The function returns 0 if successful, -1 if memory allocation fails, and a +positive integer if the singular value decomposition fails to converge. +*/ int pca(int m, int n, double** u, double** v, double* w); -/* Utility routines, currently undocumented */ +/** Sets up an index table given the data, such that data[index[]] is in + * increasing order. Sorting is done on the indices; the array data + * is unchanged. + */ void sort(int n, const double data[], int index[]); + +//! compute the mean of the numbers in list double mean(int n, double x[]); + +/** +Find the median of X(1), ... , X(N), using as much of the quicksort +algorithm as is needed to isolate it. +N.B. On exit, the array X is partially ordered. +Based on Alan J. Miller's median.f90 routine. +*/ double median (int n, double x[]); +/** +This function calculates the weights using the weighting scheme proposed by +Michael Eisen: +w[i] = 1.0 / sum_{j where d[i][j]