Skip to content

Commit

Permalink
Support consuming compressed files.
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Baker committed Jul 20, 2019
1 parent fe80417 commit 6e54227
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 33 deletions.
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ CFLAGS += -pedantic -Wall -O3
LFLAGS = -lm $(LDFLAGS)

TARGET = prodigal
ZTARGET = zprodigal
SOURCES = $(shell echo *.c)
HEADERS = $(shell echo *.h)
OBJECTS = $(SOURCES:.c=.o)
ZOBJECTS = $(SOURCES:.c=.oz)

INSTALLDIR = /usr/local/bin

Expand All @@ -36,9 +38,15 @@ all: $(TARGET)
$(TARGET): $(OBJECTS)
$(CC) $(CFLAGS) -o $@ $^ $(LFLAGS)

$(ZTARGET): $(ZOBJECTS)
$(CC) $(CFLAGS) -o $@ $^ $(LFLAGS) -lz -DSUPPORT_GZIP_COMPRESSED

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

%.oz: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $< -DSUPPORT_GZIP_COMPRESSED

install: $(TARGET)
install -d -m 0755 $(INSTALLDIR)
install -m 0755 $(TARGET) $(INSTALLDIR)
Expand All @@ -47,7 +55,7 @@ uninstall:
-rm $(INSTALLDIR)/$(TARGET)

clean:
-rm -f $(OBJECTS)
-rm -f $(OBJECTS) $(ZOBJECTS)

distclean: clean
-rm -f $(TARGET)
Expand Down
17 changes: 17 additions & 0 deletions fptr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#ifndef SGC_H__
#define SGC_H__
#ifdef SUPPORT_GZIP_COMPRESSED
#include "zlib.h"
#define fptr gzFile
#define INPUT_OPEN gzopen
#define INPUT_SEEK gzseek
#define INPUT_CLOSE gzclose
#define INPUT_GETS(str, n, stream) gzgets(stream, str, n)
#else
#define fptr FILE *
#define INPUT_OPEN fopen
#define INPUT_SEEK fseek
#define INPUT_CLOSE fclose
#define INPUT_GETS(str, n, stream) fgets(str, n, stream)
#endif
#endif
33 changes: 22 additions & 11 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,16 @@
#include "node.h"
#include "dprog.h"
#include "gene.h"
#include "fptr.h"


#define VERSION "2.6.3"
#define DATE "February, 2016"

#define MIN_SINGLE_GENOME 20000
#define IDEAL_SINGLE_GENOME 100000


void version();
void usage(char *);
void help();
Expand All @@ -47,7 +50,8 @@ int main(int argc, char *argv[]) {
char *train_file, *start_file, *trans_file, *nuc_file;
char *input_file, *output_file, input_copy[MAX_LINE];
char cur_header[MAX_LINE], new_header[MAX_LINE], short_header[MAX_LINE];
FILE *input_ptr, *output_ptr, *start_ptr, *trans_ptr, *nuc_ptr;
FILE *output_ptr, *start_ptr, *trans_ptr, *nuc_ptr;
fptr input_ptr = NULL;
struct stat fbuf;
pid_t pid;
struct _node *nodes;
Expand Down Expand Up @@ -89,7 +93,7 @@ int main(int argc, char *argv[]) {
start_file = NULL; trans_file = NULL; nuc_file = NULL;
start_ptr = stdout; trans_ptr = stdout; nuc_ptr = stdout;
input_file = NULL; output_file = NULL; piped = 0;
input_ptr = stdin; output_ptr = stdout; max_slen = 0;
output_ptr = stdout; max_slen = 0;
output = 0; closed = 0; do_mask = 0; force_nonsd = 0;

/* Filename for input copy if needed */
Expand Down Expand Up @@ -249,7 +253,14 @@ int main(int argc, char *argv[]) {

/* Check i/o files (if specified) and prepare them for reading/writing */
if(input_file != NULL) {
input_ptr = fopen(input_file, "r");
input_ptr = INPUT_OPEN(input_file, "r");
if(input_ptr == NULL) {
fprintf(stderr, "\nError: can't open input file %s.\n\n", input_file);
exit(5);
}
}
if(input_ptr == NULL) {
input_ptr = INPUT_OPEN("/dev/stdin", "r");
if(input_ptr == NULL) {
fprintf(stderr, "\nError: can't open input file %s.\n\n", input_file);
exit(5);
Expand Down Expand Up @@ -423,7 +434,7 @@ int main(int argc, char *argv[]) {

/* Rewind input file */
if(quiet == 0) fprintf(stderr, "-------------------------------------\n");
if(fseek(input_ptr, 0, SEEK_SET) == -1) {
if(INPUT_SEEK(input_ptr, 0, SEEK_SET) == -1) {
fprintf(stderr, "\nError: could not rewind input file.\n");
exit(13);
}
Expand Down Expand Up @@ -600,15 +611,15 @@ int main(int argc, char *argv[]) {
}

/* Free all memory */
if(seq != NULL) free(seq);
if(rseq != NULL) free(rseq);
if(useq != NULL) free(useq);
if(nodes != NULL) free(nodes);
if(genes != NULL) free(genes);
for(i = 0; i < NUM_META; i++) if(meta[i].tinf != NULL) free(meta[i].tinf);
free(seq);
free(rseq);
free(useq);
free(nodes);
free(genes);
for(i = 0; i < NUM_META; i++) free(meta[i].tinf);

/* Close all the filehandles and exit */
if(input_ptr != stdin) fclose(input_ptr);
INPUT_CLOSE(input_ptr);
if(output_ptr != stdout) fclose(output_ptr);
if(start_ptr != stdout) fclose(start_ptr);
if(trans_ptr != stdout) fclose(trans_ptr);
Expand Down
38 changes: 19 additions & 19 deletions sequence.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,20 @@
Read the sequence for training purposes. If we encounter multiple
sequences, we insert TTAATTAATTAA between each one to force stops in all
six frames. When we hit MAX_SEQ bp, we stop and return what we've got so
far for training. This routine reads in FASTA, and has a very 'loose'
Genbank and Embl parser, but, to be safe, FASTA should generally be
far for training. This routine reads in FASTA, and has a very 'loose'
Genbank and Embl parser, but, to be safe, FASTA should generally be
preferred.
*******************************************************************************/

int read_seq_training(FILE *fp, unsigned char *seq, unsigned char *useq,
int read_seq_training(fptr fp, unsigned char *seq, unsigned char *useq,
double *gc, int do_mask, mask *mlist, int *nm) {
char line[MAX_LINE+1];
int hdr = 0, fhdr = 0, bctr = 0, len = 0, wrn = 0;
int gc_cont = 0, mask_beg = -1;
unsigned int i, gapsize = 0;

line[MAX_LINE] = '\0';
while(fgets(line, MAX_LINE, fp) != NULL) {
while(INPUT_GETS(line, MAX_LINE, fp) != NULL) {
if(hdr == 0 && line[strlen(line)-1] != '\n' && wrn == 0) {
wrn = 1;
fprintf(stderr, "\n\nWarning: saw non-sequence line longer than ");
Expand All @@ -58,7 +58,7 @@ int read_seq_training(FILE *fp, unsigned char *seq, unsigned char *useq,
else if(hdr == 1 && (line[0] == '/' && line[1] == '/')) hdr = 0;
else if(hdr == 1) {
if(strstr(line, "Expand") != NULL && strstr(line, "gap") != NULL) {
sscanf(strstr(line, "gap")+4, "%u", &gapsize);
sscanf(strstr(line, "gap")+4, "%u", &gapsize);
if(gapsize < 1 || gapsize > MAX_LINE) {
fprintf(stderr, "Error: gap size in gbk file can't exceed line");
fprintf(stderr, " size.\n");
Expand All @@ -73,7 +73,7 @@ int read_seq_training(FILE *fp, unsigned char *seq, unsigned char *useq,
if(len - mask_beg >= MASK_SIZE) {
if(*nm == MAX_MASKS) {
fprintf(stderr, "Error: saw too many regions of 'N''s in the ");
fprintf(stderr, "sequence.\n");
fprintf(stderr, "sequence.\n");
exit(52);
}
mlist[*nm].begin = mask_beg;
Expand All @@ -93,8 +93,8 @@ int read_seq_training(FILE *fp, unsigned char *seq, unsigned char *useq,
set(seq, bctr+1);
gc_cont++;
}
else if(line[i] != 'a' && line[i] != 'A') {
set(seq, bctr+1);
else if(line[i] != 'a' && line[i] != 'A') {
set(seq, bctr+1);
set(useq, len);
}
bctr+=2; len++;
Expand All @@ -119,7 +119,7 @@ int read_seq_training(FILE *fp, unsigned char *seq, unsigned char *useq,

/* This routine reads in the next sequence in a FASTA/GB/EMBL file */

int next_seq_multi(FILE *fp, unsigned char *seq, unsigned char *useq,
int next_seq_multi(fptr fp, unsigned char *seq, unsigned char *useq,
int *sctr, double *gc, int do_mask, mask *mlist, int *nm,
char *cur_hdr, char *new_hdr) {
char line[MAX_LINE+1];
Expand All @@ -131,7 +131,7 @@ int next_seq_multi(FILE *fp, unsigned char *seq, unsigned char *useq,

if(*sctr > 0) reading_seq = 1;
line[MAX_LINE] = '\0';
while(fgets(line, MAX_LINE, fp) != NULL) {
while(INPUT_GETS(line, MAX_LINE, fp) != NULL) {
if(reading_seq == 0 && line[strlen(line)-1] != '\n' && wrn == 0) {
wrn = 1;
fprintf(stderr, "\n\nWarning: saw non-sequence line longer than ");
Expand Down Expand Up @@ -169,7 +169,7 @@ int next_seq_multi(FILE *fp, unsigned char *seq, unsigned char *useq,
}
else if(reading_seq == 1) {
if(strstr(line, "Expand") != NULL && strstr(line, "gap") != NULL) {
sscanf(strstr(line, "gap")+4, "%u", &gapsize);
sscanf(strstr(line, "gap")+4, "%u", &gapsize);
if(gapsize < 1 || gapsize > MAX_LINE) {
fprintf(stderr, "Error: gap size in gbk file can't exceed line");
fprintf(stderr, " size.\n");
Expand All @@ -184,7 +184,7 @@ int next_seq_multi(FILE *fp, unsigned char *seq, unsigned char *useq,
if(len - mask_beg >= MASK_SIZE) {
if(*nm == MAX_MASKS) {
fprintf(stderr, "Error: saw too many regions of 'N''s in the ");
fprintf(stderr, "sequence.\n");
fprintf(stderr, "sequence.\n");
exit(55);
}
mlist[*nm].begin = mask_beg;
Expand All @@ -204,8 +204,8 @@ int next_seq_multi(FILE *fp, unsigned char *seq, unsigned char *useq,
set(seq, bctr+1);
gc_cont++;
}
else if(line[i] != 'a' && line[i] != 'A') {
set(seq, bctr+1);
else if(line[i] != 'a' && line[i] != 'A') {
set(seq, bctr+1);
set(useq, len);
}
bctr+=2; len++;
Expand Down Expand Up @@ -240,14 +240,14 @@ void calc_short_header(char *header, char *short_header, int sctr) {

/* Takes rseq and fills it up with the rev complement of seq */

void rcom_seq(unsigned char *seq, unsigned char *rseq, unsigned char *useq,
void rcom_seq(unsigned char *seq, unsigned char *rseq, unsigned char *useq,
int len) {
int i, slen=len*2;
for(i = 0; i < slen; i++)
if(test(seq, i) == 0) set(rseq, slen-i-1+(i%2==0?-1:1));
for(i = 0; i < len; i++) {
if(test(useq, i) == 1) {
toggle(rseq, slen-1-i*2);
toggle(rseq, slen-1-i*2);
toggle(rseq, slen-2-i*2);
}
}
Expand Down Expand Up @@ -564,7 +564,7 @@ int max_fr(int n1, int n2, int n3) {
}

/*******************************************************************************
Creates a GC frame plot for a given sequence. This is simply a string with
Creates a GC frame plot for a given sequence. This is simply a string with
the highest GC content frame for a window centered on position for every
position in the sequence.
*******************************************************************************/
Expand Down Expand Up @@ -654,7 +654,7 @@ void calc_mer_bg(int len, unsigned char *seq, unsigned char *rseq, int slen,
}

/*******************************************************************************
Finds the highest-scoring region similar to AGGAGG in a given stretch of
Finds the highest-scoring region similar to AGGAGG in a given stretch of
sequence upstream of a start.
*******************************************************************************/

Expand Down Expand Up @@ -730,7 +730,7 @@ int shine_dalgarno_exact(unsigned char *seq, int pos, int start, double *rwt) {
}

/*******************************************************************************
Finds the highest-scoring region similar to AGGAGG in a given stretch of
Finds the highest-scoring region similar to AGGAGG in a given stretch of
sequence upstream of a start. Only considers 5/6-mers with 1 mismatch.
*******************************************************************************/

Expand Down
5 changes: 3 additions & 2 deletions sequence.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <math.h>
#include "bitmap.h"
#include "training.h"
#include "fptr.h"

#define MAX_SEQ 32000000
#define MAX_LINE 10000
Expand All @@ -44,9 +45,9 @@ typedef struct _mask {
int end;
} mask;

int read_seq_training(FILE *, unsigned char *, unsigned char *, double *, int,
int read_seq_training(fptr, unsigned char *, unsigned char *, double *, int,
mask *, int *);
int next_seq_multi(FILE *, unsigned char *, unsigned char *, int *, double *,
int next_seq_multi(fptr, unsigned char *, unsigned char *, int *, double *,
int, mask *, int *, char *, char *);
void rcom_seq(unsigned char *, unsigned char *, unsigned char *, int);

Expand Down
1 change: 1 addition & 0 deletions training.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <stdlib.h>
#include <string.h>


struct _training {
double gc; /* GC Content */
int trans_table; /* 11 = Standard Microbial, NCBI Trans Table to
Expand Down

0 comments on commit 6e54227

Please sign in to comment.