Skip to content

Commit

Permalink
Adding the pop_size module
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewdavidsmith committed Sep 23, 2020
1 parent f374ba7 commit 6e4a080
Showing 1 changed file with 248 additions and 10 deletions.
258 changes: 248 additions & 10 deletions src/preseq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,19 @@ median_from_sorted_vector (const vector<T> sorted_data,
}

template<typename T> T
quantile_from_sorted_vector (const vector<T> sorted_data,
const size_t stride, const size_t n,
const double f) {
quantile_from_sorted_vector (const vector<T> sorted_data,
const size_t stride, const size_t n,
const double f) {
const double index = f * (n - 1);
const size_t lhs = (int)index;
const double delta = index - lhs;

if (n == 0 || sorted_data.empty()) return 0.0;

if (lhs == n - 1) return sorted_data[lhs * stride];
return (1 - delta) * sorted_data[lhs * stride]
+ delta * sorted_data[(lhs + 1) * stride];

return (1 - delta) * sorted_data[lhs * stride]
+ delta * sorted_data[(lhs + 1) * stride];
}

// Confidence interval stuff
Expand Down Expand Up @@ -202,8 +202,8 @@ factorial (double x) {

double Ag = lanczos[0];

for (size_t k=1; k < lanczos.size(); k++)
Ag += lanczos[k] / (x + k);
for (size_t k=1; k < lanczos.size(); k++)
Ag += lanczos[k] / (x + k);

double term1 = (x + 0.5) * log((x + 7.5) / Euler);
double term2 = LogRootTwoPi + log(Ag);
Expand All @@ -229,7 +229,7 @@ resample_hist(std::mt19937 &gen, const vector<size_t> &vals_hist_distinct_counts

unsigned int distinct =
accumulate(begin(distinct_counts_hist), end(distinct_counts_hist), 0.0);

multinomial(gen, distinct_counts_hist, distinct,
sample_distinct_counts_hist);

Expand Down Expand Up @@ -781,6 +781,239 @@ lc_extrap(const int argc, const char **argv) {
return EXIT_SUCCESS;
}

////////////////////////////////////////////////////////////////////////
// POP_SIZE
static int
pop_size(const int argc, const char **argv) {

try {

static const size_t min_required_counts = 4;
static const string min_required_counts_error_message =
"max count before zero is less than min required count (" +
to_string(min_required_counts) + ") duplicates removed";

string outfile;

size_t orig_max_terms = 100;
double max_extrap = 1.0e10;
size_t n_bootstraps = 100;
int diagonal = 0;
double c_level = 0.95;
unsigned long int seed = 408;

/* FLAGS */
bool VERBOSE = false;
bool VALS_INPUT = false;
bool PAIRED_END = false;
bool HIST_INPUT = false;
bool SINGLE_ESTIMATE = false;
bool allow_defects = false;

#ifdef HAVE_HTSLIB
bool BAM_FORMAT_INPUT = false;
size_t MAX_SEGMENT_LENGTH = 5000;
#endif

const string description =
"Needs writing";

/********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
opt_parse.add_opt("output", 'o', "output file (default: stdout)",
false , outfile);
opt_parse.add_opt("extrap",'e',"maximum extrapolation", false, max_extrap);
opt_parse.add_opt("boots",'n',"number of bootstraps", false, n_bootstraps);
opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level);
opt_parse.add_opt("terms",'x',"maximum terms in estimator", false, orig_max_terms);
opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE);
#ifdef HAVE_HTSLIB
opt_parse.add_opt("bam", 'B', "input is in BAM format",
false, BAM_FORMAT_INPUT);
opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging "
"paired end bam reads",
false, MAX_SEGMENT_LENGTH);
#endif
opt_parse.add_opt("pe", 'P', "input is paired end read file",
false, PAIRED_END);
opt_parse.add_opt("vals", 'V',
"input is a text file containing only the observed counts",
false, VALS_INPUT);
opt_parse.add_opt("hist", 'H',
"input is a text file containing the observed histogram",
false, HIST_INPUT);
opt_parse.add_opt("quick", 'Q',
"quick mode (no bootstraps) for confidence intervals",
false, SINGLE_ESTIMATE);
opt_parse.add_opt("defects", 'D', "no testing for defects", false, allow_defects);
opt_parse.add_opt("seed", 'r', "seed for random number generator",
false, seed);
opt_parse.set_show_defaults();
vector<string> leftover_args;

// ADS: suspect bug below; "-about" isn't working.
opt_parse.parse(argc-1, argv+1, leftover_args);
if (argc == 2 || opt_parse.help_requested()) {
cerr << opt_parse.help_message() << endl;
return EXIT_SUCCESS;
}
if (opt_parse.about_requested()) {
cerr << opt_parse.about_message() << endl;
return EXIT_SUCCESS;
}
if (opt_parse.option_missing()) {
cerr << opt_parse.option_missing_message() << endl;
return EXIT_SUCCESS;
}
if (leftover_args.empty()) {
cerr << opt_parse.help_message() << endl;
return EXIT_SUCCESS;
}
const string input_file_name = leftover_args.front();
/******************************************************************/

throw runtime_error("NOT YET IMPLEMENTED");

vector<double> counts_hist;
size_t n_reads = 0;

/************ loading input ***************************************/
if (HIST_INPUT) {
if (VERBOSE)
cerr << "HIST_INPUT" << endl;
n_reads = load_histogram(input_file_name, counts_hist);
}
else if (VALS_INPUT) {
if (VERBOSE)
cerr << "VALS_INPUT" << endl;
n_reads = load_counts(input_file_name, counts_hist);
}
#ifdef HAVE_HTSLIB
else if (BAM_FORMAT_INPUT && PAIRED_END) {
if (VERBOSE)
cerr << "PAIRED_END_BAM_INPUT" << endl;
const size_t MAX_READS_TO_HOLD = 5000000;
size_t n_paired = 0;
size_t n_mates = 0;
n_reads = load_counts_BAM_pe(VERBOSE, input_file_name,
MAX_SEGMENT_LENGTH,
MAX_READS_TO_HOLD, n_paired,
n_mates, counts_hist);
if (VERBOSE) {
cerr << "MERGED PAIRED END READS = " << n_paired << endl;
cerr << "MATES PROCESSED = " << n_mates << endl;
}
}
else if (BAM_FORMAT_INPUT) {
if (VERBOSE)
cerr << "BAM_INPUT" << endl;
n_reads = load_counts_BAM_se(input_file_name, counts_hist);
}
#endif
else if (PAIRED_END) {
if (VERBOSE)
cerr << "PAIRED_END_BED_INPUT" << endl;
n_reads = load_counts_BED_pe(input_file_name, counts_hist);
}
else { // default is single end bed file
if (VERBOSE)
cerr << "BED_INPUT" << endl;
n_reads = load_counts_BED_se(input_file_name, counts_hist);
}
/************ done loading input **********************************/

const size_t max_observed_count = counts_hist.size() - 1;
const double distinct_reads =
accumulate(begin(counts_hist), end(counts_hist), 0.0);

// ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
size_t first_zero = 1;
while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0)
++first_zero;

orig_max_terms = std::min(orig_max_terms, first_zero - 1);
orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1);

const size_t distinct_counts =
std::count_if(begin(counts_hist), end(counts_hist),
[](const double x) {return x > 0.0;});

if (VERBOSE)
cerr << "TOTAL READS = " << n_reads << endl
<< "DISTINCT READS = " << distinct_reads << endl
<< "DISTINCT COUNTS = " << distinct_counts << endl
<< "MAX COUNT = " << max_observed_count << endl
<< "COUNTS OF 1 = " << counts_hist[1] << endl
<< "MAX TERMS = " << orig_max_terms << endl;

// ADS: step size to be set based on the original number of
// observations
const double step_size = n_reads;

if (VERBOSE) {
// OUTPUT THE ORIGINAL HISTOGRAM
cerr << "OBSERVED COUNTS (" << counts_hist.size() << ")" << endl;
for (size_t i = 0; i < counts_hist.size(); i++)
if (counts_hist[i] > 0)
cerr << i << '\t' << static_cast<size_t>(counts_hist[i]) << endl;
cerr << endl;
}

// check to make sure library is not overly saturated
const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist);
if (two_fold_extrap < 0.0)
throw runtime_error("Saturation expected at double initial sample size."
" Unable to extrapolate");

// const size_t total_reads = get_counts_from_hist(counts_hist);

//assert(total_reads == n_reads); // ADS: why commented out?

// check that min required count is satisfied
if (orig_max_terms < min_required_counts)
throw runtime_error(min_required_counts_error_message);

if (VERBOSE)
cerr << "[ESTIMATING YIELD CURVE]" << endl;
vector<double> yield_estimates;

if (VERBOSE)
cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl;

const size_t max_iter = 100*n_bootstraps;

vector<vector <double> > bootstrap_estimates;
extrap_bootstrap(VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
orig_max_terms, diagonal, step_size, max_extrap,
max_iter, bootstrap_estimates);

if (VERBOSE)
cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl;
// yield ci
vector<double> yield_upper_ci_lognorm, yield_lower_ci_lognorm;

vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates,
yield_lower_ci_lognorm, yield_upper_ci_lognorm);

/////////////////////////////////////////////////////////////////////
if (VERBOSE)
cerr << "[WRITING OUTPUT]" << endl;

write_predicted_complexity_curve(outfile, c_level, step_size,
yield_estimates, yield_lower_ci_lognorm,
yield_upper_ci_lognorm);
}
catch (runtime_error &e) {
cerr << "ERROR:\t" << e.what() << endl;
return EXIT_FAILURE;
}
catch (std::bad_alloc &ba) {
cerr << "ERROR: could not allocate memory" << endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}


///////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -1475,7 +1708,7 @@ bound_pop(const int argc, const char **argv) {
bootstrap_moments.push_back(exp(factorial(i + 3) +
log(sample_hist[i + 2]) -
log(sample_hist[1])) );
}
}

size_t n_points = 0;
n_points = ensure_pos_def_mom_seq(bootstrap_moments, tolerance, VERBOSE);
Expand Down Expand Up @@ -1606,6 +1839,11 @@ main(const int argc, const char **argv) {

return bound_pop(argc, argv);

}
else if (strcmp(argv[1], "pop_size") == 0) {

return pop_size(argc, argv);

}
else {
cerr << "unrecognized command: " << argv[1] << endl
Expand Down

0 comments on commit 6e4a080

Please sign in to comment.