added GFF3, GVF, refactored VCF piz, iname ++

knmkr · May 8, 2020 · 3fd3b22 · 3fd3b22
1 parent f05a002
commit 3fd3b22
Show file tree

Hide file tree

Showing 37 changed files with 1,203 additions and 813 deletions.
diff --git a/Makefile b/Makefile
@@ -34,9 +34,9 @@ else
 endif
 
 MY_SRCS = genozip.c base250.c move_to_front.c header.c strings.c stats.c arch.c license.c \
-          zip.c zip_vcf.c zip_sam.c zip_fast.c zip_me23.c \
-		  piz.c piz_vcf.c piz_sam.c piz_fast.c piz_me23.c \
-		  seg.c seg_vcf.c seg_sam.c seg_fast.c seg_me23.c \
+          zip.c zip_vcf.c zip_sam.c zip_fast.c zip_gff3.c zip_me23.c \
+		  piz.c piz_vcf.c piz_sam.c piz_fast.c piz_gff3.c piz_me23.c \
+		  seg.c seg_vcf.c seg_sam.c seg_fast.c seg_gff3.c seg_me23.c \
           gloptimize_vcf.c buffer.c random_access.c sections.c compressor.c \
 	      txtfile.c squeeze_vcf.c zfile.c profiler.c file.c dispatcher.c crypt.c aes.c md5.c \
 		  vblock.c regions.c samples.c optimize.c dict_id.c hash.c gtshark_vcf.c stream.c url.c
@@ -53,7 +53,7 @@ CONDA_DEVS = Makefile .gitignore test-file.vcf
 
 CONDA_DOCS = LICENSE.non-commercial.txt LICENSE.commercial.txt AUTHORS README.md
 
-CONDA_INCS = aes.h dispatcher.h gloptimize_vcf.h optimize.h profiler.h dict_id.h txtfile.h zip.h v1_vcf.c v2v3_vcf.c \
+CONDA_INCS = aes.h dispatcher.h gloptimize_vcf.h optimize.h profiler.h dict_id.h txtfile.h zip.h v1_vcf.c \
              base250.h endianness.h md5.h sections.h text_help.h header.h strings.h hash.h stream.h url.h \
              buffer.h file.h move_to_front.h seg.h text_license.h version.h gtshark_vcf.h compressor.h stats.h \
              crypt.h genozip.h piz.h squeeze_vcf.h vblock.h zfile.h random_access.h regions.h samples.h \

diff --git a/dict_id.c b/dict_id.c
@@ -15,7 +15,7 @@ uint64_t dict_id_fields[MAX_NUM_FIELDS_PER_DATA_TYPE];
 // VCF stuff
 uint64_t dict_id_FORMAT_PL=0, dict_id_FORMAT_GL=0, dict_id_FORMAT_GP=0, dict_id_FORMAT_DP=0, dict_id_FORMAT_MIN_DP=0, 
          dict_id_INFO_AC=0, dict_id_INFO_AF=0, dict_id_INFO_AN=0, dict_id_INFO_DP=0, dict_id_INFO_VQSLOD=0,
-         dict_id_INFO_END=0, dict_id_INFO_13=0;
+         dict_id_INFO_END=0;
 
 // SAM stuff
 uint64_t dict_id_OPTION_AM=0, dict_id_OPTION_AS=0, dict_id_OPTION_CM=0, dict_id_OPTION_LB=0, dict_id_OPTION_FI=0, dict_id_OPTION_H0=0,
@@ -27,7 +27,16 @@ uint64_t dict_id_OPTION_AM=0, dict_id_OPTION_AS=0, dict_id_OPTION_CM=0, dict_id_
          dict_id_OPTION_XG=0, dict_id_OPTION_XS=0, dict_id_OPTION_XE=0,
          dict_id_OPTION_mc=0, dict_id_OPTION_ms=0,
          dict_id_OPTION_BD=0, dict_id_OPTION_BI=0,
-         dict_id_OPTION_STRAND=0;
+         dict_id_OPTION_STRAND=0; // private genozip dict
+
+// GVF stuff
+uint64_t dict_id_ATTR_ID=0, dict_id_ATTR_Variant_seq=0, dict_id_ATTR_Reference_seq=0,
+         dict_id_ATTR_Dbxref=0, // from GRCh37/38 - example: "dbSNP_151:rs1282280967"
+         dict_id_ATTR_ancestral_allele=0, // from GRCh37/38 - example: ancestral_allele=GTTA
+         dict_id_ATTR_SEQ=0; // private genozip dict
+
+// our stuff used in multiple data types
+uint64_t dict_id_WindowsEOL=0;         
 
 DictIdType DICT_ID_NONE = {0};
 
@@ -40,6 +49,8 @@ void dict_id_initialize (void)
         dict_id_fields[f] = dict_id_field (dict_id_make (field_name, strlen (field_name))).num; 
     }
 
+    dict_id_WindowsEOL = dict_id_type_1 (dict_id_make ("#", 1)).num; 
+
     switch (z_file->data_type) { 
     case DT_VCF:
         dict_id_FORMAT_PL     = dict_id_vcf_format_sf (dict_id_make ("PL", 2)).num;
@@ -58,7 +69,6 @@ void dict_id_initialize (void)
         dict_id_FORMAT_MIN_DP = dict_id_vcf_format_sf (dict_id_make ("MIN_DP", 6)).num;
 
         // This appears if the VCF line has a Windows-style \r\n line ending
-        dict_id_INFO_13       = dict_id_vcf_info_sf   (dict_id_make ("#", 1)).num; 
         break;
 
     case DT_SAM:
@@ -113,6 +123,20 @@ void dict_id_initialize (void)
 
         break;
 
+    case DT_GFF3:
+        // standard GVF fields (ID is also a standard GFF3 field)
+        dict_id_ATTR_ID               = dict_id_gff3_attr_sf (dict_id_make ("ID", 2)).num;
+        dict_id_ATTR_Variant_seq      = dict_id_gff3_attr_sf (dict_id_make ("Variant_", 8)).num;
+        dict_id_ATTR_Reference_seq    = dict_id_gff3_attr_sf (dict_id_make ("Referenc", 8)).num;
+
+        // fields added in the GVFs of GRCh37/38
+        dict_id_ATTR_Dbxref           = dict_id_gff3_attr_sf (dict_id_make ("Dbxref", 6)).num;
+        dict_id_ATTR_ancestral_allele = dict_id_gff3_attr_sf (dict_id_make ("ancestra", 8)).num;
+
+        // our own dictionary where we store Variant_seq, Reference_seq and ancestral_allele together
+        dict_id_ATTR_SEQ              = dict_id_gff3_attr_sf (dict_id_make ("SEQ", 3)).num;
+        break;
+
     default:
         break; // no special fields for the other data types
     }

diff --git a/dict_id.h b/dict_id.h
@@ -54,19 +54,23 @@ static inline DictIdType dict_id_type_2(DictIdType dict_id) { return dict_id; }
 
 // FASTQ/FASTA field types 
 #define dict_id_is_fast_desc_sf dict_id_is_type_2
-
 #define dict_id_fast_desc_sf dict_id_type_2
 
+// GFF3 field types
+#define dict_id_is_gff3_attr_sf dict_id_is_type_1
+#define dict_id_gff3_attr_sf dict_id_type_1
+
 static inline DictIdType dict_id_printable(DictIdType dict_id) { dict_id.id[0] = (dict_id.id[0] & 0x7f) | 0x40; return dict_id; } // set 2 Msb to 01
 
 extern DictIdType DICT_ID_NONE;
 extern DictIdType dict_id_show_one_b250, dict_id_show_one_dict; // arguments of --show-b250-one and --show-dict-one (defined in genozip.c)
 extern DictIdType dict_id_dump_one_b250;                        // arguments of --dump-b250-one (defined in genozip.c)
 
 extern uint64_t dict_id_fields[MAX_NUM_FIELDS_PER_DATA_TYPE],
+
                 dict_id_FORMAT_PL, dict_id_FORMAT_GL, dict_id_FORMAT_GP, dict_id_FORMAT_DP, dict_id_FORMAT_MIN_DP, // some VCF FORMAT subfields
                 dict_id_INFO_AC, dict_id_INFO_AF, dict_id_INFO_AN, dict_id_INFO_DP, dict_id_INFO_VQSLOD, // some VCF INFO subfields
-                dict_id_INFO_END, dict_id_INFO_13,
+                dict_id_INFO_END, dict_id_WindowsEOL,
 
                 // standard tags, see here: https://samtools.github.io/hts-specs/SAMtags.pdf
                 dict_id_OPTION_AM, dict_id_OPTION_AS, dict_id_OPTION_CM, dict_id_OPTION_E2, dict_id_OPTION_LB, dict_id_OPTION_FI, 
@@ -85,7 +89,18 @@ extern uint64_t dict_id_fields[MAX_NUM_FIELDS_PER_DATA_TYPE],
                 // GATK tags
                 dict_id_OPTION_BD, dict_id_OPTION_BI,
 
-                dict_id_OPTION_STRAND;
+                // our own
+                dict_id_OPTION_STRAND,
+
+                // GVF attributes - standard
+                dict_id_ATTR_ID, dict_id_ATTR_Variant_seq, dict_id_ATTR_Reference_seq,
+
+                // GVF attributes - from GRCh37/38 etc
+                dict_id_ATTR_Dbxref, // example: "dbSNP_151:rs1282280967"
+                dict_id_ATTR_ancestral_allele,
+
+                dict_id_ATTR_SEQ; // private genozip dict
+
 
 extern void dict_id_initialize (void);
 

diff --git a/file.h b/file.h
@@ -81,12 +81,19 @@
 #define FNA_XZ_         ".fna.xz"
 #define FNA_GENOZIP_    ".fna" GENOZIP_EXT
 
+// GFF3 file variations (currently only GVF subtype, but maybe others in the future)
+#define GVF_            ".gvf"
+#define GVF_GZ_         ".gvf.gz"
+#define GVF_BZ2_        ".gvf.bz2"
+#define GVF_XZ_         ".gvf.xz"
+#define GVF_GENOZIP_    ".gvf" GENOZIP_EXT
+
 // 23andMe file variations
 // note: 23andMe files come as a .txt, and therefore the user must specify --input to compress them. we have this
 // made-up file extension here to avoid needing special cases throughout the code
-#define ME23_          ".txt" // our made up extension - natively, 23andMe files come as a zip container containing a txt file
-#define ME23_ZIP_      ".zip" 
-#define ME23_GENOZIP_  ".txt" GENOZIP_EXT
+#define ME23_           ".txt" // our made-up extension - natively, 23andMe files come as a zip container containing a txt file
+#define ME23_ZIP_       ".zip" 
+#define ME23_GENOZIP_   ".txt" GENOZIP_EXT
 
 typedef enum {TXT_FILE, Z_FILE} FileSupertype; 
 
@@ -101,6 +108,7 @@ typedef enum      { UNKNOWN_FILE_TYPE,
                     FFN,   FFN_GZ,   FFN_BZ2,   FFN_XZ,   FFN_GENOZIP,
                     FNN,   FNN_GZ,   FNN_BZ2,   FNN_XZ,   FNN_GENOZIP,
                     FNA,   FNA_GZ,   FNA_BZ2,   FNA_XZ,   FNA_GENOZIP,
+                    GVF,   GVF_GZ,   GVF_BZ2,   GVF_XZ,   GVF_GENOZIP,
                     ME23,  ME23_ZIP,                      ME23_GENOZIP, 
                     AFTER_LAST_FILE_TYPE } FileType;
 
@@ -115,6 +123,7 @@ typedef enum      { UNKNOWN_FILE_TYPE,
                    FFN_,   FFN_GZ_,   FFN_BZ2_,   FFN_XZ_,   FFN_GENOZIP_, \
                    FNN_,   FNN_GZ_,   FNN_BZ2_,   FNN_XZ_,   FNN_GENOZIP_, \
                    FNA_,   FNA_GZ_,   FNA_BZ2_,   FNA_XZ_,   FNA_GENOZIP_, \
+                   GVF_,   GVF_GZ_,   GVF_BZ2_,   GVF_XZ_,   GVF_GENOZIP_, \
                    ME23_,  ME23_ZIP_,                        ME23_GENOZIP_,\
                    "stdin", "stdout" }
 extern const char *file_exts[];
@@ -149,6 +158,8 @@ typedef enum { COMP_UNKNOWN=-1, COMP_PLN=0 /* plain - no compression */,
                              { FNA_BZ2,   COMP_BZ2, FNA_GENOZIP   }, { FNA_XZ,   COMP_XZ,  FNA_GENOZIP   },\
                              { FA,        COMP_PLN, FA_GENOZIP    }, { FA_GZ,    COMP_GZ,  FA_GENOZIP    },\
                              { FA_BZ2,    COMP_BZ2, FA_GENOZIP    }, { FA_XZ,    COMP_XZ,  FA_GENOZIP    }, { 0, 0, 0 } },\
+                           { { GVF,       COMP_PLN, GVF_GENOZIP   }, { GVF_GZ,   COMP_GZ,  GVF_GENOZIP   },\
+                             { GVF_BZ2,   COMP_BZ2, GVF_GENOZIP   }, { GVF_XZ,   COMP_XZ,  GVF_GENOZIP   }, { 0, 0, 0 } },\
                            { { ME23,      COMP_PLN, ME23_GENOZIP  }, { ME23_ZIP, COMP_ZIP, ME23_GENOZIP  }, { 0, 0, 0 } } }
 
 // plain file MUST appear first on the list - this will be the default output when redirecting
@@ -157,6 +168,7 @@ typedef enum { COMP_UNKNOWN=-1, COMP_PLN=0 /* plain - no compression */,
                            { SAM,  BAM, 0 },                      \
                            { FASTQ, FASTQ_GZ, FQ, FQ_GZ, 0 },\
                            { FASTA, FASTA_GZ, FA, FA_GZ, FAA, FAA_GZ, FFN, FFN_GZ, FNN, FNN_GZ, FNA, FNA_GZ, 0 },\
+                           { GVF, GVF_GZ, 0 },\
                            { ME23, ME23_ZIP, 0 } }                        
 
 // txt file types and their corresponding genozip file types for each data type
@@ -165,6 +177,7 @@ typedef enum { COMP_UNKNOWN=-1, COMP_PLN=0 /* plain - no compression */,
                      { SAM_GENOZIP, 0 }, \
                      { FASTQ_GENOZIP, FQ_GENOZIP, 0 }, \
                      { FASTA_GENOZIP, FA_GENOZIP, FAA_GENOZIP, FFN_GENOZIP, FNN_GENOZIP, FNA_GENOZIP, 0 }, \
+                     { GVF_GENOZIP, 0 }, \
                      { ME23_GENOZIP, 0 } } 
 
 typedef const char *FileMode;

diff --git a/genozip.h b/genozip.h
@@ -64,6 +64,8 @@ typedef struct VBlockSAM *VBlockSAMP;
 typedef const struct VBlockSAM *ConstVBlockSAMP;
 typedef struct VBlockFAST *VBlockFASTP;
 typedef const struct VBlockFAST *ConstVBlockFASTP;
+typedef struct VBlockGFF3 *VBlockGFF3P;
+typedef const struct VBlockGFF3 *ConstVBlockGFF3P;
 typedef struct VBlockME23 *VBlockME23P;
 typedef const struct VBlockME23 *ConstVBlockME23P;
 

diff --git a/hash.c b/hash.c
@@ -166,6 +166,19 @@ void hash_alloc_local (VBlock *segging_vb, MtfContext *vb_ctx)
             vb_ctx->local_hash_prime = hash_next_size_up(500);
         break;
 
+    case DT_GFF3:
+        if (vb_ctx->dict_id.num == dict_id_fields[GFF3_SEQID] ||
+            vb_ctx->dict_id.num == dict_id_fields[GFF3_SOURCE] ||
+            vb_ctx->dict_id.num == dict_id_fields[GFF3_TYPE] ||
+            vb_ctx->dict_id.num == dict_id_fields[GFF3_END] ||
+            vb_ctx->dict_id.num == dict_id_fields[GFF3_SCORE] ||
+            vb_ctx->dict_id.num == dict_id_fields[GFF3_STRAND] ||
+            vb_ctx->dict_id.num == dict_id_fields[GFF3_PHASE] ||
+            vb_ctx->dict_id.num == dict_id_fields[GFF3_ATTRS])
+
+            vb_ctx->local_hash_prime = hash_next_size_up(500);
+        break;
+
     case DT_ME23:
         if (vb_ctx->dict_id.num == dict_id_fields[ME23_CHROM])
 

diff --git a/header.c b/header.c
@@ -26,8 +26,8 @@ static bool is_first_txt = true;
 // (names are not longer than 8=DICT_ID_LEN as the code assumes it)
 const char *field_names[NUM_DATATYPES][MAX_NUM_FIELDS_PER_DATA_TYPE] = FIELD_NAMES;
 
-const unsigned datatype_last_field[NUM_DATATYPES]      = DATATYPE_LAST_FIELD;
-const unsigned chrom_did_i_by_dt[NUM_DATATYPES]        = CHROM_DID_I_BY_DT; 
+const unsigned datatype_last_field[NUM_DATATYPES] = DATATYPE_LAST_FIELD;
+const uint8_t chrom_did_i_by_dt[NUM_DATATYPES]    = CHROM_DID_I_BY_DT; 
 
 // -----------
 // VCF stuff

diff --git a/header.h b/header.h
@@ -10,58 +10,51 @@
 #include "md5.h"
 
 // IMPORTANT: DATATYPES GO INTO THE FILE FORMAT - THEY CANNOT BE CHANGED
-#define NUM_DATATYPES 5
+#define NUM_DATATYPES 6
 typedef enum { DT_VCF_V1=-2, DT_NONE=-1, // these values are used in the code logic, they are never written to the file
                DT_VCF=0, DT_SAM=1, 
-               DT_FASTQ=2, DT_FASTA=3, 
-               DT_ME23=4 } DataType; // these values go into SectionHeaderGenozipHeader.data_type
-#define DATATYPE_NAMES { "VCF", "SAM", "FASTQ", "FASTA", "23ANDME" } // index in array matches values in DataType
+               DT_FASTQ=2, DT_FASTA=3, DT_GFF3=4,
+               DT_ME23=5 } DataType; // these values go into SectionHeaderGenozipHeader.data_type
+#define DATATYPE_NAMES { "VCF", "SAM", "FASTQ", "FASTA", "GVF" /* actually GFF3, but for now we can call it GVF */, \
+                         "23ANDME" } // index in array matches values in DataType
 
-#define DATATYPE_LAST_FIELD { VCF_FORMAT, SAM_OPTIONAL, FAST_LINEMETA, FAST_LINEMETA, ME23_ID }
-extern const unsigned datatype_last_field[NUM_DATATYPES];
-
-#define CHROM_DID_I_BY_DT   { VCF_CHROM, SAM_RNAME, -1, -1, ME23_CHROM } // -1 if DATATYPE_HAS_RANDOM_ACCESS is false
-extern const unsigned chrom_did_i_by_dt[NUM_DATATYPES];  // used for random access data
-
-#define DATATYPE_HAS_RANDOM_ACCESS { true, true, false, false, true }
+#define DATATYPE_HAS_RANDOM_ACCESS { true, true, false, false, true, true }
 
 typedef void (*ComputeFunc)(VBlockP);
 #define COMPRESS_FUNC_BY_DT { zip_vcf_compress_one_vb, zip_sam_compress_one_vb,  \
-                              zip_fast_compress_one_vb, zip_fast_compress_one_vb, zip_me23_compress_one_vb }
+                              zip_fast_compress_one_vb, zip_fast_compress_one_vb, zip_gff3_compress_one_vb, zip_me23_compress_one_vb }
 
 #define UNCOMPRESS_FUNC_BY_DT { piz_vcf_uncompress_one_vb, piz_sam_uncompress_one_vb, \
                                 piz_fast_uncompress_one_vb, piz_fast_uncompress_one_vb, \
-                                piz_me23_uncompress_one_vb }
+                                piz_gff3_uncompress_one_vb, piz_me23_uncompress_one_vb }
 
 typedef void (*UpdateHeaderFunc) (VBlockP vb, uint32_t vcf_first_line_i);
 #define UPDATE_HEADER_FUNC_BY_DT { zfile_vcf_update_compressed_vb_header,     \
                                    zfile_update_compressed_vb_header, \
                                    zfile_update_compressed_vb_header, \
                                    zfile_update_compressed_vb_header, \
+                                   zfile_update_compressed_vb_header, \
                                    zfile_update_compressed_vb_header  }         
 
 typedef void (*IOFunc) (VBlockP vb);
 #define READ_ONE_VB_FUNC_BY_DT { zfile_vcf_read_one_vb,  zfile_sam_read_one_vb,   \
                                  zfile_fast_read_one_vb, zfile_fast_read_one_vb, \
-                                 zfile_me23_read_one_vb }
+                                 zfile_gff3_read_one_vb, zfile_me23_read_one_vb  }
 
-#define FIRST_FIELD_DICT_SECTION { SEC_CHROM_DICT, SEC_SAM_QNAME_DICT, \
-                                   SEC_FAST_DESC_DICT, SEC_FAST_DESC_DICT, SEC_CHROM_DICT }
-
-// the chrom fields used by --regions for subsetting
-#define CHROM_FIELD_DICT_SECTION { SEC_CHROM_DICT, SEC_SAM_RNAME_DICT, -1, -1, SEC_CHROM_DICT }
+// by data type - related to the header of the txt file of each data type
+#define TXT_HEADER_IS_ALLOWED      { true, true, false, false, true, true } // is it possible to have a header in this data_type
+#define TXT_HEADER_IS_REQUIRED     { true, false, false, false, false, false } // should we error if the header is missing
+#define TXT_HEADER_LINE_FIRST_CHAR { '#', '@', -1, -1, '#', '#' }; // first character in each line in the text file header (-1 if TXT_HEADER_IS_ALLOWED is false)
 
-// related to the header of the txt file of each data type
-#define TXT_HEADER_IS_ALLOWED      { true, true, false, false, true } // is it possible to have a header in this data_type
-#define TXT_HEADER_IS_REQUIRED     { true, false , false , false , false } // should we error if the header is missing
-#define TXT_HEADER_LINE_FIRST_CHAR { '#', '@', -1, -1, '#' }; // first character in each line in the text file header (-1 if TXT_HEADER_IS_ALLOWED is false)
-
-#define STAT_SHOW_SECTIONS_LINE_NAME { "Variants", "Alignment lines", "Entries", "Lines", "SNPs" }
+// by data type - the header displayed in --show-sections
+#define STAT_SHOW_SECTIONS_LINE_NAME { "Variants", "Alignment lines", "Entries", "Lines", "Sequences", "SNPs" }
 
+// by data type - the dictionary type displayed in --show-sections
 #define STAT_DICT_TYPES { { "FIELD", "INFO",   "FORMAT" }, \
                           { "FIELD", "QNAME",  "OPTION" }, \
                           { "FIELD", "ERROR!", "DESC"   }, \
                           { "FIELD", "ERROR!", "DESC"   }, \
+                          { "FIELD", "ATTRS",  "ERROR!" }, \
                           { "FIELD", "ERROR!", "ERROR!" } };
 
 // VCF related global parameters - set before any thread is created, and never change
@@ -85,17 +78,37 @@ typedef enum { SAM_QNAME, SAM_FLAG, SAM_RNAME, SAM_POS, SAM_MAPQ, SAM_CIGAR, SAM
 #define NUM_FAST_FIELDS 2
 typedef enum { FAST_DESC, FAST_LINEMETA } FastqFields;
 
+#define NUM_GFF3_FIELDS 10 // https://m.ensembl.org/info/website/upload/gff3.html
+typedef enum { GFF3_SEQID, GFF3_SOURCE, GFF3_TYPE, GFF3_START, GFF3_END, GFF3_SCORE, GFF3_STRAND, GFF3_PHASE, GVF_SEQ, GFF3_ATTRS } Gff3Fields;
+
 // 23ANDME fields
 #define NUM_ME23_FIELDS 3
 typedef enum { ME23_CHROM, ME23_POS, ME23_ID } Me23Fields; // same order as VCF
 
-#define MAX_NUM_FIELDS_PER_DATA_TYPE 9 // maximum between NUM_*_FIELDS
+#define MAX_NUM_FIELDS_PER_DATA_TYPE 10 // maximum of all the NUM_*_FIELDS values
+
+#define DATATYPE_LAST_FIELD { NUM_VCF_FIELDS-1, NUM_SAM_FIELDS-1, NUM_FAST_FIELDS-1, NUM_FAST_FIELDS-1, NUM_GFF3_FIELDS-1, NUM_ME23_FIELDS-1 }
+extern const unsigned datatype_last_field[NUM_DATATYPES];
+
+#define CHROM_DID_I_BY_DT   { VCF_CHROM, SAM_RNAME, -1, -1, GFF3_SEQID, ME23_CHROM } // -1 if DATATYPE_HAS_RANDOM_ACCESS is false
+extern const uint8_t chrom_did_i_by_dt[NUM_DATATYPES];  // used for random access data
+
+#define INFO_DID_I_BY_DT { VCF_INFO, -1, -1, -1, GFF3_ATTRS, -1 }
+
+#define FIRST_FIELD_DICT_SECTION { SEC_CHROM_DICT, SEC_SAM_QNAME_DICT, \
+                                   SEC_FAST_DESC_DICT, SEC_FAST_DESC_DICT, SEC_GFF3_SEQID_DICT, SEC_CHROM_DICT }
+
+// by data type - the chrom fields used by --regions for subsetting
+#define CHROM_FIELD_DICT_SECTION { SEC_CHROM_DICT, SEC_SAM_RNAME_DICT, -1, -1, SEC_GFF3_SEQID_DICT, SEC_CHROM_DICT }
+#define INFO_FIELD_DICT_SECTION  { SEC_VCF_INFO_DICT, -1, -1, -1, SEC_GFF3_ATTRS_DICT, -1 }
+#define INFO_SF_DICT_SECTION     { SEC_VCF_INFO_SF_DICT, -1, -1, -1, SEC_GFF3_ATTRS_SF_DICT, -1 }
 
 #define FIELD_NAMES /* max 8 chars per name */ \
     { { "CHROM", "POS", "ID", "REF+ALT", "QUAL", "FILTER", "INFO", "FORMAT" },\
       { "QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR", "PNEXT", "TLEN", "OPTIONAL" },\
       { "DESC", "LINEMETA" },\
       { "DESC", "LINEMETA" },\
+      { "SEQID", "SOURCE", "TYPE", "START", "END", "SCORE", "STRAND", "PHASE", "SEQ", "ATTRS" },\
       { "CHROM", "POS", "ID" }\
     };
 extern const char *field_names[NUM_DATATYPES][MAX_NUM_FIELDS_PER_DATA_TYPE];