Skip to content

Commit

Permalink
better MIN_DP compression ; version hanlding
Browse files Browse the repository at this point in the history
  • Loading branch information
divonlan committed May 6, 2020
1 parent 65ddf6e commit a2f820a
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 34 deletions.
4 changes: 2 additions & 2 deletions move_to_front.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ uint32_t mtf_get_next_snip (VBlock *vb, MtfContext *ctx,
ASSERT (override_iterator || iterator->next_b250 <= LASTENT (uint8_t, ctx->b250), "Error while reconstrucing line %u vb_i=%u: iterator for %s reached end of data",
txt_line, vb->vblock_i, err_dict_id (ctx->dict_id));

uint32_t word_index = z_file->genozip_version >= 2 ? base250_decode (&iterator->next_b250) // if this line has no non-GT subfields, it will not have a ctx
: v1_base250_decode (&iterator->next_b250);
uint32_t word_index = is_v2_or_above ? base250_decode (&iterator->next_b250) // if this line has no non-GT subfields, it will not have a ctx
: v1_base250_decode (&iterator->next_b250);

// case: a subfield snip is missing - either the genotype data has less subfields than declared in FORMAT, or not provided at all for some (or all) samples.
if (word_index == WORD_INDEX_MISSING_SF) {
Expand Down
4 changes: 2 additions & 2 deletions piz.c
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ bool piz_dispatcher (const char *z_basename, unsigned max_threads,

if (data_type == DT_NONE) goto finish;

if (z_file->genozip_version < 2) enforce_v1_limitations (is_first_component); // genozip_version will be 0 for v1, bc we haven't read the vcf header yet
if (!is_v2_or_above) enforce_v1_limitations (is_first_component); // genozip_version will be 0 for v1, bc we haven't read the vcf header yet

// read and write txt header. in split mode this also opens txt_file
piz_successful = (data_type != DT_VCF_V1) ? header_genozip_to_txt (&original_file_digest)
Expand All @@ -316,7 +316,7 @@ bool piz_dispatcher (const char *z_basename, unsigned max_threads,
if (!dispatcher_is_input_exhausted (dispatcher) && dispatcher_has_free_thread (dispatcher)) {

bool still_more_data = false, grepped_out = false;
if (z_file->genozip_version > 1) {
if (is_v2_or_above) {

bool skipped_vb;
static Buffer region_ra_intersection_matrix = EMPTY_BUFFER; // we will move the data to the VB when we get it
Expand Down
27 changes: 14 additions & 13 deletions piz_vcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "samples.h"
#include "gtshark_vcf.h"
#include "dict_id.h"
#include "strings.h"

#define DATA_LINE(i) ENT (PizDataLineVCF, vb->lines, i)

Expand Down Expand Up @@ -174,7 +175,7 @@ static inline void piz_vcf_reconstruct_info (VBlockVCF *vb, const SubfieldMapper

// starting v5, END is encoded as a delta vs. POS (sharing the POS/END delta stream)
char end_str[50];
if (z_file->genozip_version >= 5 && did_i == vb->end_did_i) {
if (is_v5_or_above && did_i == vb->end_did_i) {
vb->last_pos = piz_decode_pos (vb->last_pos, info_sf_value_snip[sf_i], info_sf_value_snip_len[sf_i],
NULL, end_str, &info_sf_value_snip_len[sf_i]);
info_sf_value_snip[sf_i] = end_str;
Expand Down Expand Up @@ -273,7 +274,7 @@ static bool piz_vcf_reconstruct_fields (VBlockVCF *vb, unsigned vb_line_i,
// info subfield eg "info1=value1;info2=value2" - "info1=", "info2=" are the name snips
// while "value1" and "value2" are the value snips - we merge them here
if (f == VCF_INFO && !flag_strip) {
(z_file->genozip_version >= 4 ? piz_vcf_reconstruct_info : v2v3_piz_vcf_reconstruct_info)
(is_v4_or_above ? piz_vcf_reconstruct_info : v2v3_piz_vcf_reconstruct_info)
(vb, iname_mapper, snip[VCF_INFO], info_sf_value_snip, info_sf_value_snip_len, *has_13);

if (snip_len[VCF_FORMAT]) // if INFO isn't the last field - add a tab
Expand Down Expand Up @@ -371,7 +372,7 @@ static void piz_vcf_reconstruct_genotype_data_line (VBlockVCF *vb, unsigned vb_l

const char *snip = NULL; // will be set to a pointer into a dictionary

const char *dp_snip=NULL; unsigned dp_snip_len=0; // used for copying DP to MIN_DP, if MIN_DP is empty
uint32_t dp_value=0; // used for calculating MIN_DP = dp_value + delta.

for (unsigned sf_i=0; sf_i < line_format_info->num_subfields; sf_i++) {

Expand All @@ -388,14 +389,14 @@ static void piz_vcf_reconstruct_genotype_data_line (VBlockVCF *vb, unsigned vb_l
mtf_get_next_snip ((VBlockP)vb, sf_ctx, &sample_iterator[sample_i], &snip, &snip_len, vb->first_line + vb_line_i);

// handle MIN_DP : if its a DP, store it...
if (sf_ctx && sf_ctx->dict_id.num == dict_id_FORMAT_DP) {
dp_snip = snip;
dp_snip_len = snip_len;
}
// ...and if its an empty MIN_DP, copy the DP instead
else if (sf_ctx && sf_ctx->dict_id.num == dict_id_FORMAT_MIN_DP && !snip_len && dp_snip_len) {
snip = dp_snip;
snip_len = dp_snip_len;
char min_dp[30];
if (sf_ctx && snip_len && sf_ctx->dict_id.num == dict_id_FORMAT_DP)
dp_value = atoi (snip);
// ...and if its an MIN_DP - calculate it
else if (sf_ctx && sf_ctx->dict_id.num == dict_id_FORMAT_MIN_DP && snip_len && is_v5_or_above) {
int32_t delta = atoi (snip);
str_int (dp_value - delta, min_dp, &snip_len); // note: we dp_value==0 if no DP subfield preceeds DP_MIN, that's fine
snip = min_dp;
}

if (snip && snip_len && is_line_included) { // it can be a valid empty subfield if snip="" and snip_len=0
Expand Down Expand Up @@ -801,7 +802,7 @@ static void piz_vcf_uncompress_all_sections (VBlockVCF *vb)
zfile_uncompress_section ((VBlockP)vb, header, &ctx->b250, "mtf_ctx.b250", SEC_VCF_INFO_SF_B250);
}

if (z_file->genozip_version >= 5) {
if (is_v5_or_above) {
SectionHeaderBase250 *header = (SectionHeaderBase250 *)(vb->z_data.data + section_index[section_i++]);

zfile_uncompress_section ((VBlockP)vb, header, &vb->id_numeric_data, "id_numeric_data", SEC_NUMERIC_ID_DATA);
Expand Down Expand Up @@ -898,7 +899,7 @@ void piz_vcf_uncompress_one_vb (VBlock *vb_)

VBlockVCF *vb = (VBlockVCF *)vb_;

if (z_file->genozip_version > 1) {
if (is_v2_or_above) {
piz_vcf_uncompress_all_sections (vb);

// combine all the sections of a variant block to regenerate the variant_data, haplotype_data,
Expand Down
22 changes: 12 additions & 10 deletions seg_vcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -435,19 +435,16 @@ static int seg_vcf_genotype_area (VBlockVCF *vb, ZipDataLineVCF *dl,

bool end_of_cell = !cell_gt_data_len;

const char *dp_sf = NULL;
unsigned dp_sf_len = 0;
int32_t dp_value = 0;

for (unsigned sf=0; sf < format_mapper->num_subfields; sf++) { // iterate on the order as in the line

// move next to the beginning of the subfield data, if there is any
unsigned len = end_of_cell ? 0 : seg_snip_len_tnc (cell_gt_data, has_13);
MtfContext *ctx = MAPPER_CTX (format_mapper, sf);

if (ctx->dict_id.num == dict_id_FORMAT_DP) {
dp_sf = cell_gt_data;
dp_sf_len = len;
}
if (ctx->dict_id.num == dict_id_FORMAT_DP)
dp_value = atoi (cell_gt_data); // an integer terminated by : \t or \n

MtfNode *node;
uint32_t node_index;
Expand All @@ -463,10 +460,15 @@ static int seg_vcf_genotype_area (VBlockVCF *vb, ZipDataLineVCF *dl,
optimized_cell_gt_data_len -= (int)len - (int)optimized_snip_len;
}

// if case MIN_DP subfield is the same as DP (very common) - we store MIN_DP as WORD_INDEX_EMPTY_SF
// note: we count on GATK never actually having an empty MIN_DP word, as it would PIZ incorrectly to be a copy of DP
else if (cell_gt_data && ctx->dict_id.num == dict_id_FORMAT_MIN_DP && dp_sf_len == len && !memcmp (cell_gt_data, dp_sf, len))
node_index = mtf_evaluate_snip_seg ((VBlockP)vb, ctx, ":", 0, &node, NULL); // force a WORD_INDEX_EMPTY_SF
// if case MIN_DP subfield - it is slightly smaller and usually equal to DP - we store MIN_DP as the delta DP-MIN_DP
// note: the delta is vs. the DP field that preceeds MIN_DP - we take the DP as 0 there is no DP that preceeds
else if (cell_gt_data && ctx->dict_id.num == dict_id_FORMAT_MIN_DP) {
int32_t min_dp_value = atoi (cell_gt_data); // an integer terminated by : \t or \n
int32_t delta = dp_value - min_dp_value; // expected to be 0 or positive integer (may be negative if no DP preceeds)
char delta_str[30]; unsigned delta_str_len;
str_int (delta, delta_str, &delta_str_len);
node_index = mtf_evaluate_snip_seg ((VBlockP)vb, ctx, delta_str, delta_str_len, &node, NULL);
}

else
node_index = mtf_evaluate_snip_seg ((VBlockP)vb, ctx, cell_gt_data, len, &node, NULL);
Expand Down
2 changes: 1 addition & 1 deletion test-file.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
# 23. DP and MIN_DP - FORMAT subfields - case when they're identical and case when not
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Person1 Person2 Person3 Person4 Person5
12 0 rs1 G A 100 PASS I12345678=A;I2=a GT:GL:GP:PL 1:-0.5999,-.1111,-.2222:-0.995,-0.1234578,-2.999:300,500,2 0:-0.995,-0.1234578,-2.999 0 17 18
1 207237234 rs123 G A 100 PASS I12345678=A;I2=b GT:DP:MIN_DP:GL 0|0:0,-.1,-2.:5:5 10|4:5:6 0|. 1 10|4
1 207237234 rs123 G A 100 PASS I12345678=A;I2=b GT:DP:MIN_DP:GL 0|0:5:5:0,-.1,-2. 10|4:5:6 0|. 1 10|4
13 207237235 no;numbers T T 100 PASS I2=a;NOVALUE;I3=x;VQSLOD=-4.19494 GT ./99 0|.|0 0|.|11 10|4|55 10|4|55
13 14 12345678 A C 100 PASS Info3 ABCDEFGHJ:GL xx
13 207237510 123456789012345 C A 100 PASS Info1 GT ./99/92 0/./0 0/./11 10/4/55 10/4/55
Expand Down
20 changes: 14 additions & 6 deletions zfile.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#include "piz.h"
#include "license.h"

bool is_v2_or_above=0, is_v3_or_above=0, is_v4_or_above=0, is_v5_or_above=0;

static const char *password_test_string = "WhenIThinkBackOnAllTheCrapIlearntInHighschool";

void zfile_show_header (const SectionHeader *header, VBlock *vb /* optional if output to buffer */)
Expand Down Expand Up @@ -142,7 +144,7 @@ void zfile_uncompress_section (VBlock *vb,
CompressionAlg comp_alg = (CompressionAlg)section_header->sec_compression_alg;

// prior to version 5, the algorithm was hard coded, and this field was called "unused"
if (z_file->genozip_version < 5) {
if (!is_v5_or_above) {
if (expected_section_type == SEC_HT_GTSHARK_DB_DB || expected_section_type == SEC_HT_GTSHARK_DB_GT)
comp_alg = COMP_PLN;
else
Expand All @@ -158,7 +160,7 @@ void zfile_uncompress_section (VBlock *vb,

// decrypt data (in-place) if needed
if (data_encrypted_len) {
if (z_file->genozip_version > 1)
if (is_v2_or_above)
crypt_do (vb, (uint8_t*)section_header + compressed_offset, data_encrypted_len, vblock_i, section_header->section_type, false);
else
v1_crypt_do (vb, (uint8_t*)section_header + compressed_offset, data_encrypted_len, vblock_i, section_i);
Expand Down Expand Up @@ -352,7 +354,7 @@ int zfile_read_section (VBlock *vb,
// note: the first section is always read by zfile_read_section(). if it is a v1, or
// if it is not readable (perhaps v1 encrypted) - we assume its a v1 VCF header
// in v2+ the first section is always an unencrypted SectionHeaderGenozipHeader
ASSERT0 (z_file->genozip_version != 1, "Error: zfile_read_section cannot read v1 data");
ASSERT0 (is_v2_or_above || expected_sec_type==SEC_GENOZIP_HEADER, "Error: zfile_read_section cannot read v1 data");

bool is_encrypted = (expected_sec_type != SEC_GENOZIP_HEADER) &&
crypt_get_encrypted_len (&header_size, NULL); // update header size if encrypted
Expand Down Expand Up @@ -538,6 +540,12 @@ int16_t zfile_read_genozip_header (Md5Hash *digest) // out
z_file->num_components = BGEN32 (header->num_components);
*digest = header->md5_hash_concat;

// global bools to help testing
is_v2_or_above = (z_file->genozip_version >= 2);
is_v3_or_above = (z_file->genozip_version >= 3);
is_v4_or_above = (z_file->genozip_version >= 4);
is_v5_or_above = (z_file->genozip_version >= 5);

zfile_uncompress_section (evb, header, &z_file->section_list_buf, "z_file->section_list_buf", SEC_GENOZIP_HEADER);
z_file->section_list_buf.len /= sizeof (SectionListEntry); // fix len
BGEN_sections_list();
Expand Down Expand Up @@ -672,7 +680,7 @@ bool zfile_get_genozip_header (uint64_t *uncompressed_data_size,
*num_items_concat = BGEN64 (header.num_items_concat);
*md5_hash_concat = header.md5_hash_concat;

if (header.genozip_version >= 5)
if (header.genozip_version >= 5) // is_v5_or_above is for genols
*license_hash = header.license_hash;
else
license_hash->ulls[0] = license_hash->ulls[1] = 0;
Expand Down Expand Up @@ -951,7 +959,7 @@ void zfile_vcf_read_one_vb (VBlock *vb_)

if (vb->vblock_i == 1) {
buf_free (&global_iname_mapper_buf); // in case it was used to piz a previous file
(z_file->genozip_version >= 4 ? piz_vcf_map_iname_subfields : v2v3_piz_vcf_map_iname_subfields)(&global_iname_mapper_buf);
(is_v4_or_above ? piz_vcf_map_iname_subfields : v2v3_piz_vcf_map_iname_subfields)(&global_iname_mapper_buf);
buf_set_overlayable (&global_iname_mapper_buf);
}

Expand All @@ -978,7 +986,7 @@ void zfile_vcf_read_one_vb (VBlock *vb_)
READ_SECTION (SEC_VCF_INFO_SF_B250, SectionHeaderBase250);

// read the numberic data of the ID field (the non-numeric part is in SEC_ID_B250)
if (z_file->genozip_version >= 5)
if (is_v5_or_above)
READ_SECTION (SEC_NUMERIC_ID_DATA, SectionHeader);

// read the sample data
Expand Down
2 changes: 2 additions & 0 deletions zfile.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,6 @@ extern void zfile_me23_read_one_vb (VBlockP vb);
extern void zfile_me23_compress_vb_header (VBlockP vb);
extern void zfile_me23_update_compressed_vb_header (VBlockP vb, uint32_t vcf_first_line_i);

extern bool is_v2_or_above, is_v3_or_above, is_v4_or_above, is_v5_or_above;

#endif

0 comments on commit a2f820a

Please sign in to comment.