diff --git a/doc/external/kegg/getKEGGModelForOrganism.html b/doc/external/kegg/getKEGGModelForOrganism.html index 20e67bc1..5d08bbe7 100644 --- a/doc/external/kegg/getKEGGModelForOrganism.html +++ b/doc/external/kegg/getKEGGModelForOrganism.html @@ -755,693 +755,702 @@

SOURCE CODE ^%If no FASTA file is supplied, then just remove all genes which are not for 0440 %the given organism ID 0441 if isempty(fastaFile) -0442 fprintf(['Pruning the model from <strong>non-' organismID '</strong> genes... ']); -0443 if ismember(organismID,{'eukaryotes','prokaryotes'}) -0444 phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); -0445 if strcmp(organismID,'eukaryotes') -0446 proxyid='hsa'; -0447 %Use H. sapiens here -0448 else -0449 proxyid='eco'; -0450 %Use E. coli here -0451 end -0452 [~, phylDistId]=ismember(proxyid,phylDists.ids); -0453 idsToKeep=phylDists.ids(~isinf(phylDists.distMat(phylDistId,:))); -0454 taxIDs=cellfun(@(x) x{1},cellfun(@(x) strsplit(x,':'),model.genes,'UniformOutput',false),'UniformOutput',false); -0455 I=ismember(upper(taxIDs),upper(idsToKeep)); -0456 else -0457 %KEGG organism IDs may have three or four letters -0458 organismID=strcat(organismID,':'); -0459 %Add colon for accurate matching -0460 if length(organismID)==4 -0461 I=cellfun(@(x) strcmpi(x(1:4),organismID),model.genes); -0462 elseif length(organismID)==5 -0463 I=cellfun(@(x) strcmpi(x(1:5),organismID),model.genes); -0464 end -0465 end -0466 %Remove those genes -0467 model.genes=model.genes(I); -0468 model.rxnGeneMat=model.rxnGeneMat(:,I); -0469 fprintf('COMPLETE\n'); -0470 end -0471 -0472 %First remove all reactions without genes -0473 if keepSpontaneous==true -0474 fprintf('Removing non-spontaneous reactions without GPR rules... '); -0475 load(fullfile(ravenPath,'external','kegg','keggRxns.mat'),'isSpontaneous'); -0476 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); -0477 spontRxnsWithGenes=model.rxns(any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous)); -0478 else -0479 fprintf('Removing reactions without GPR rules... '); -0480 I=~any(model.rxnGeneMat,2); -0481 end -0482 model=removeReactions(model,I,true); -0483 fprintf('COMPLETE\n'); -0484 -0485 %Clean gene names -0486 fprintf('Fixing gene names in the model... '); -0487 %Get rid of the prefix organism id -0488 model.genes=regexprep(model.genes,'^\w+?:',''); -0489 fprintf('COMPLETE\n'); -0490 -0491 %If no FASTA file is supplied, then we are done here -0492 if isempty(fastaFile) -0493 %Create grRules -0494 fprintf('Constructing GPR associations and annotations for the model... '); -0495 model.grRules=cell(numel(model.rxns),1); -0496 model.grRules(:)={''}; -0497 %Add the gene associations as 'or' -0498 for i=1:numel(model.rxns) -0499 %Find the involved genes -0500 I=find(model.rxnGeneMat(i,:)); -0501 if any(I) -0502 model.grRules{i}=['(' model.genes{I(1)}]; -0503 for j=2:numel(I) -0504 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; -0505 end -0506 model.grRules{i}=[model.grRules{i} ')']; -0507 end -0508 end -0509 %Fix grRules and reconstruct rxnGeneMat -0510 [grRules,rxnGeneMat] = standardizeGrRules(model); %Give detailed output -0511 model.grRules = grRules; -0512 model.rxnGeneMat = rxnGeneMat; -0513 %Add geneMiriams, assuming that it follows the syntax -0514 %kegg.genes/organismID:geneName -0515 model.geneMiriams=''; -0516 for i=1:numel(model.genes) -0517 model.geneMiriams{i,1}.name{1,1}='kegg.genes'; -0518 model.geneMiriams{i,1}.value{1,1}=strcat(lower(organismID),model.genes{i,1}); -0519 end -0520 %Add the description to the reactions -0521 for i=1:numel(model.rxns) -0522 if ~isempty(model.rxnNotes{i}) -0523 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (without HMMs).',model.rxnNotes(i)); -0524 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); -0525 else -0526 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (without HMMs)'}; -0527 end +0442 %Check if organismID can be found in KEGG species list or is +0443 %set to "eukaryotes" or "prokaryotes" +0444 phylDistsFull=getPhylDist(fullfile(dataDir,'keggdb'),true); +0445 if ~ismember(organismID,[phylDistsFull.ids 'eukaryotes' 'prokaryotes']) +0446 EM='Provided organismID is incorrect. Only species abbreviations from KEGG Species List or "eukaryotes"/"prokaryotes" are allowed.'; +0447 disp(EM); +0448 error('Fatal error occured. See the details above'); +0449 end +0450 +0451 fprintf(['Pruning the model from <strong>non-' organismID '</strong> genes... ']); +0452 if ismember(organismID,{'eukaryotes','prokaryotes'}) +0453 phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); +0454 if strcmp(organismID,'eukaryotes') +0455 proxyid='hsa'; +0456 %Use H. sapiens here +0457 else +0458 proxyid='eco'; +0459 %Use E. coli here +0460 end +0461 [~, phylDistId]=ismember(proxyid,phylDists.ids); +0462 idsToKeep=phylDists.ids(~isinf(phylDists.distMat(phylDistId,:))); +0463 taxIDs=cellfun(@(x) x{1},cellfun(@(x) strsplit(x,':'),model.genes,'UniformOutput',false),'UniformOutput',false); +0464 I=ismember(upper(taxIDs),upper(idsToKeep)); +0465 else +0466 %KEGG organism IDs may have three or four letters +0467 organismID=strcat(organismID,':'); +0468 %Add colon for accurate matching +0469 if length(organismID)==4 +0470 I=cellfun(@(x) strcmpi(x(1:4),organismID),model.genes); +0471 elseif length(organismID)==5 +0472 I=cellfun(@(x) strcmpi(x(1:5),organismID),model.genes); +0473 end +0474 end +0475 %Remove those genes +0476 model.genes=model.genes(I); +0477 model.rxnGeneMat=model.rxnGeneMat(:,I); +0478 fprintf('COMPLETE\n'); +0479 end +0480 +0481 %First remove all reactions without genes +0482 if keepSpontaneous==true +0483 fprintf('Removing non-spontaneous reactions without GPR rules... '); +0484 load(fullfile(ravenPath,'external','kegg','keggRxns.mat'),'isSpontaneous'); +0485 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); +0486 spontRxnsWithGenes=model.rxns(any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous)); +0487 else +0488 fprintf('Removing reactions without GPR rules... '); +0489 I=~any(model.rxnGeneMat,2); +0490 end +0491 model=removeReactions(model,I,true); +0492 fprintf('COMPLETE\n'); +0493 +0494 %Clean gene names +0495 fprintf('Fixing gene names in the model... '); +0496 %Get rid of the prefix organism id +0497 model.genes=regexprep(model.genes,'^\w+?:',''); +0498 fprintf('COMPLETE\n'); +0499 +0500 %If no FASTA file is supplied, then we are done here +0501 if isempty(fastaFile) +0502 %Create grRules +0503 fprintf('Constructing GPR associations and annotations for the model... '); +0504 model.grRules=cell(numel(model.rxns),1); +0505 model.grRules(:)={''}; +0506 %Add the gene associations as 'or' +0507 for i=1:numel(model.rxns) +0508 %Find the involved genes +0509 I=find(model.rxnGeneMat(i,:)); +0510 if any(I) +0511 model.grRules{i}=['(' model.genes{I(1)}]; +0512 for j=2:numel(I) +0513 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; +0514 end +0515 model.grRules{i}=[model.grRules{i} ')']; +0516 end +0517 end +0518 %Fix grRules and reconstruct rxnGeneMat +0519 [grRules,rxnGeneMat] = standardizeGrRules(model); %Give detailed output +0520 model.grRules = grRules; +0521 model.rxnGeneMat = rxnGeneMat; +0522 %Add geneMiriams, assuming that it follows the syntax +0523 %kegg.genes/organismID:geneName +0524 model.geneMiriams=''; +0525 for i=1:numel(model.genes) +0526 model.geneMiriams{i,1}.name{1,1}='kegg.genes'; +0527 model.geneMiriams{i,1}.value{1,1}=strcat(lower(organismID),model.genes{i,1}); 0528 end -0529 fprintf('COMPLETE\n\n'); -0530 fprintf('*** Model reconstruction complete ***\n'); -0531 return; -0532 end -0533 -0534 %Create a phylogenetic distance structure -0535 phylDistStruct=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); -0536 [~, phylDistId]=ismember(model.id,phylDistStruct.ids); -0537 -0538 %Calculate the real maximal distance now. An abitary large number of 1000 -0539 %is used for the "all in kingdom" or "all sequences" options. This is a bit -0540 %inconvenient way to do it, but it is to make it fit with some older code -0541 if isinf(maxPhylDist) || maxPhylDist==-1 -0542 maxPhylDist=1000; -0543 end -0544 -0545 %Get the KO ids for which files have been generated. Maybe not the neatest -0546 %way.. -0547 fastaFiles=listFiles(fullfile(dataDir,'fasta','*.fa')); -0548 alignedFiles=listFiles(fullfile(dataDir,'aligned','*.fa')); -0549 alignedWorking=listFiles(fullfile(dataDir,'aligned','*.faw')); -0550 hmmFiles=listFiles(fullfile(dataDir,'hmms','*.hmm')); -0551 outFiles=listFiles(fullfile(outDir,'*.out')); -0552 -0553 %Check if multi-FASTA files should be generated. This should only be -0554 %performed if there are IDs in the KOModel structure that haven't been -0555 %parsed yet -0556 missingFASTA=setdiff(KOModel.rxns,[fastaFiles;alignedFiles;hmmFiles;outFiles]); -0557 -0558 if ~isempty(missingFASTA) -0559 if ~exist(fullfile(dataDir,'keggdb','genes.pep'),'file') -0560 EM=['The file ''genes.pep'' cannot be located at ' strrep(dataDir,'\','/') '/ and should be downloaded from the KEGG FTP.\n']; -0561 dispEM(EM); -0562 end -0563 %Only construct models for KOs which don't have files already -0564 fastaModel=removeReactions(KOModel,setdiff(KOModel.rxns,missingFASTA),true,true); -0565 %Permute the order of the KOs in the model so that constructMultiFasta -0566 %can be run on several processors at once -0567 fastaModel=permuteModel(fastaModel,randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(fastaModel.rxns)),'rxns'); -0568 constructMultiFasta(fastaModel,fullfile(dataDir,'keggdb','genes.pep'),fullfile(dataDir,'fasta')); -0569 else -0570 fprintf('Generating the KEGG Orthology specific multi-FASTA files... COMPLETE\n'); -0571 end -0572 -0573 if isunix -0574 if ismac -0575 binEnd='.mac'; -0576 else -0577 binEnd=''; -0578 end -0579 elseif ispc -0580 binEnd=''; -0581 else -0582 EM='Unknown OS, exiting.'; -0583 disp(EM); -0584 return -0585 end -0586 -0587 %Check if alignment of FASTA files should be performed -0588 missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]); -0589 if ~isempty(missingAligned) -0590 if seqIdentity==-1 -0591 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... '); -0592 else -0593 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... '); -0594 end -0595 missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned))); -0596 progressFlag=0; -0597 %Update fastaFiles. This is needed once rebuilding KEGG from FTP dump -0598 %files for more accurate progress reporting -0599 fastaFiles=listFiles(fullfile(dataDir,'fasta','*.fa')); -0600 %Align all sequences using MAFFT -0601 for i=1:numel(missingAligned) -0602 %This is checked here because it could be that it is created by a -0603 %parallel process. The faw-files are saved as temporary files to -0604 %kept track of which files are being worked on -0605 if ~exist(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'file') &&... -0606 ~exist(fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'file') -0607 %Check that the multi-FASTA file exists. It should do so since -0608 %we are saving empty files as well. Print a warning and -0609 %continue if not -0610 if ~exist(fullfile(dataDir,'fasta',[missingAligned{i} '.fa']),'file') -0611 EM=['WARNING: The multi-FASTA file for ' missingAligned{i} ' does not exist']; -0612 dispEM(EM,false); -0613 continue; -0614 end -0615 -0616 %If the multi-FASTA file is empty then save an empty aligned -0617 %file and continue -0618 s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); -0619 if s.bytes<=0 -0620 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'w'); -0621 fclose(fid); +0529 %Add the description to the reactions +0530 for i=1:numel(model.rxns) +0531 if ~isempty(model.rxnNotes{i}) +0532 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (without HMMs).',model.rxnNotes(i)); +0533 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); +0534 else +0535 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (without HMMs)'}; +0536 end +0537 end +0538 fprintf('COMPLETE\n\n'); +0539 fprintf('*** Model reconstruction complete ***\n'); +0540 return; +0541 end +0542 +0543 %Create a phylogenetic distance structure +0544 phylDistStruct=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); +0545 [~, phylDistId]=ismember(model.id,phylDistStruct.ids); +0546 +0547 %Calculate the real maximal distance now. An abitary large number of 1000 +0548 %is used for the "all in kingdom" or "all sequences" options. This is a bit +0549 %inconvenient way to do it, but it is to make it fit with some older code +0550 if isinf(maxPhylDist) || maxPhylDist==-1 +0551 maxPhylDist=1000; +0552 end +0553 +0554 %Get the KO ids for which files have been generated. Maybe not the neatest +0555 %way.. +0556 fastaFiles=listFiles(fullfile(dataDir,'fasta','*.fa')); +0557 alignedFiles=listFiles(fullfile(dataDir,'aligned','*.fa')); +0558 alignedWorking=listFiles(fullfile(dataDir,'aligned','*.faw')); +0559 hmmFiles=listFiles(fullfile(dataDir,'hmms','*.hmm')); +0560 outFiles=listFiles(fullfile(outDir,'*.out')); +0561 +0562 %Check if multi-FASTA files should be generated. This should only be +0563 %performed if there are IDs in the KOModel structure that haven't been +0564 %parsed yet +0565 missingFASTA=setdiff(KOModel.rxns,[fastaFiles;alignedFiles;hmmFiles;outFiles]); +0566 +0567 if ~isempty(missingFASTA) +0568 if ~exist(fullfile(dataDir,'keggdb','genes.pep'),'file') +0569 EM=['The file ''genes.pep'' cannot be located at ' strrep(dataDir,'\','/') '/ and should be downloaded from the KEGG FTP.\n']; +0570 dispEM(EM); +0571 end +0572 %Only construct models for KOs which don't have files already +0573 fastaModel=removeReactions(KOModel,setdiff(KOModel.rxns,missingFASTA),true,true); +0574 %Permute the order of the KOs in the model so that constructMultiFasta +0575 %can be run on several processors at once +0576 fastaModel=permuteModel(fastaModel,randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(fastaModel.rxns)),'rxns'); +0577 constructMultiFasta(fastaModel,fullfile(dataDir,'keggdb','genes.pep'),fullfile(dataDir,'fasta')); +0578 else +0579 fprintf('Generating the KEGG Orthology specific multi-FASTA files... COMPLETE\n'); +0580 end +0581 +0582 if isunix +0583 if ismac +0584 binEnd='.mac'; +0585 else +0586 binEnd=''; +0587 end +0588 elseif ispc +0589 binEnd=''; +0590 else +0591 EM='Unknown OS, exiting.'; +0592 disp(EM); +0593 return +0594 end +0595 +0596 %Check if alignment of FASTA files should be performed +0597 missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]); +0598 if ~isempty(missingAligned) +0599 if seqIdentity==-1 +0600 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... '); +0601 else +0602 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... '); +0603 end +0604 missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned))); +0605 progressFlag=0; +0606 %Update fastaFiles. This is needed once rebuilding KEGG from FTP dump +0607 %files for more accurate progress reporting +0608 fastaFiles=listFiles(fullfile(dataDir,'fasta','*.fa')); +0609 %Align all sequences using MAFFT +0610 for i=1:numel(missingAligned) +0611 %This is checked here because it could be that it is created by a +0612 %parallel process. The faw-files are saved as temporary files to +0613 %kept track of which files are being worked on +0614 if ~exist(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'file') &&... +0615 ~exist(fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'file') +0616 %Check that the multi-FASTA file exists. It should do so since +0617 %we are saving empty files as well. Print a warning and +0618 %continue if not +0619 if ~exist(fullfile(dataDir,'fasta',[missingAligned{i} '.fa']),'file') +0620 EM=['WARNING: The multi-FASTA file for ' missingAligned{i} ' does not exist']; +0621 dispEM(EM,false); 0622 continue; 0623 end 0624 -0625 %Create an empty file to prevent other threads to start to work -0626 %on the same alignment -0627 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w'); -0628 fclose(fid); -0629 -0630 %First load the FASTA file, then select up to nSequences -0631 %sequences of the most closely related species, apply any -0632 %constraints from maxPhylDist, and save it as a temporary file, -0633 %and create the model from that -0634 -0635 fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); -0636 phylDist=inf(numel(fastaStruct),1); -0637 for j=1:numel(fastaStruct) -0638 %Get the organism abbreviation -0639 index=strfind(fastaStruct(j).Header,':'); -0640 if any(index) -0641 abbrev=fastaStruct(j).Header(1:index(1)-1); -0642 [~, index]=ismember(abbrev,phylDistStruct.ids); -0643 if any(index) -0644 phylDist(j)=phylDistStruct.distMat(index(1),phylDistId); -0645 end -0646 end -0647 end -0648 -0649 %Inf means that it should not be included -0650 phylDist(phylDist>maxPhylDist)=[]; -0651 -0652 %Sort based on phylDist -0653 [~, order]=sort(phylDist); -0654 -0655 %Save the first nSequences hits to a temporary FASTA file -0656 if nSequences<=numel(fastaStruct) -0657 fastaStruct=fastaStruct(order(1:nSequences)); -0658 else -0659 fastaStruct=fastaStruct(order); -0660 end -0661 -0662 %Do the clustering and alignment if there are more than one -0663 %sequences, otherwise just save the sequence (or an empty file) -0664 if numel(fastaStruct)>1 -0665 if seqIdentity==0.9 -0666 cdhitInp100=tempname; -0667 fastawrite(cdhitInp100,fastaStruct); -0668 cdhitInp90=tempname; -0669 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp100 '" -o "' cdhitInp90 '" -c 1.0 -n 5 -M 2000']); -0670 if status~=0 -0671 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; -0672 dispEM(EM); -0673 end -0674 %Remove the old tempfile -0675 if exist(cdhitInp100, 'file') -0676 delete([cdhitInp100 '*']); -0677 end -0678 tmpFile=tempname; -0679 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp90 '" -o "' tmpFile '" -c 0.9 -n 5 -M 2000 -aL 0.8']); -0680 if status~=0 -0681 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; -0682 dispEM(EM); -0683 end -0684 %Remove the old tempfile -0685 if exist(cdhitInp90, 'file') -0686 delete([cdhitInp90 '*']); -0687 end -0688 elseif seqIdentity==0.5 -0689 cdhitInp100=tempname; -0690 fastawrite(cdhitInp100,fastaStruct); -0691 cdhitInp90=tempname; -0692 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp100 '" -o "' cdhitInp90 '" -c 1.0 -n 5 -M 2000']); -0693 if status~=0 -0694 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; -0695 dispEM(EM); +0625 %If the multi-FASTA file is empty then save an empty aligned +0626 %file and continue +0627 s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); +0628 if s.bytes<=0 +0629 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'w'); +0630 fclose(fid); +0631 continue; +0632 end +0633 +0634 %Create an empty file to prevent other threads to start to work +0635 %on the same alignment +0636 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w'); +0637 fclose(fid); +0638 +0639 %First load the FASTA file, then select up to nSequences +0640 %sequences of the most closely related species, apply any +0641 %constraints from maxPhylDist, and save it as a temporary file, +0642 %and create the model from that +0643 +0644 fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); +0645 phylDist=inf(numel(fastaStruct),1); +0646 for j=1:numel(fastaStruct) +0647 %Get the organism abbreviation +0648 index=strfind(fastaStruct(j).Header,':'); +0649 if any(index) +0650 abbrev=fastaStruct(j).Header(1:index(1)-1); +0651 [~, index]=ismember(abbrev,phylDistStruct.ids); +0652 if any(index) +0653 phylDist(j)=phylDistStruct.distMat(index(1),phylDistId); +0654 end +0655 end +0656 end +0657 +0658 %Inf means that it should not be included +0659 phylDist(phylDist>maxPhylDist)=[]; +0660 +0661 %Sort based on phylDist +0662 [~, order]=sort(phylDist); +0663 +0664 %Save the first nSequences hits to a temporary FASTA file +0665 if nSequences<=numel(fastaStruct) +0666 fastaStruct=fastaStruct(order(1:nSequences)); +0667 else +0668 fastaStruct=fastaStruct(order); +0669 end +0670 +0671 %Do the clustering and alignment if there are more than one +0672 %sequences, otherwise just save the sequence (or an empty file) +0673 if numel(fastaStruct)>1 +0674 if seqIdentity==0.9 +0675 cdhitInp100=tempname; +0676 fastawrite(cdhitInp100,fastaStruct); +0677 cdhitInp90=tempname; +0678 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp100 '" -o "' cdhitInp90 '" -c 1.0 -n 5 -M 2000']); +0679 if status~=0 +0680 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; +0681 dispEM(EM); +0682 end +0683 %Remove the old tempfile +0684 if exist(cdhitInp100, 'file') +0685 delete([cdhitInp100 '*']); +0686 end +0687 tmpFile=tempname; +0688 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp90 '" -o "' tmpFile '" -c 0.9 -n 5 -M 2000 -aL 0.8']); +0689 if status~=0 +0690 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; +0691 dispEM(EM); +0692 end +0693 %Remove the old tempfile +0694 if exist(cdhitInp90, 'file') +0695 delete([cdhitInp90 '*']); 0696 end -0697 %Remove the old tempfile -0698 if exist(cdhitInp100, 'file') -0699 delete([cdhitInp100 '*']); -0700 end -0701 cdhitInp50=tempname; -0702 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp90 '" -o "' cdhitInp50 '" -c 0.9 -n 5 -M 2000 -aL 0.8']); -0703 if status~=0 -0704 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; -0705 dispEM(EM); -0706 end -0707 %Remove the old tempfile -0708 if exist(cdhitInp90, 'file') -0709 delete([cdhitInp90 '*']); -0710 end -0711 tmpFile=tempname; -0712 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp50 '" -o "' tmpFile '" -c 0.5 -n 3 -M 2000 -aL 0.8']); -0713 if status~=0 -0714 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; -0715 dispEM(EM); -0716 end -0717 %Remove the old tempfile -0718 if exist(cdhitInp50, 'file') -0719 delete([cdhitInp50 '*']); -0720 end -0721 elseif seqIdentity~=-1 -0722 cdhitInpCustom=tempname; -0723 fastawrite(cdhitInpCustom,fastaStruct); -0724 tmpFile=tempname; -0725 if seqIdentity<=1 && seqIdentity>0.7 -0726 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n 5 -M 2000']); -0727 elseif seqIdentity>0.6 -0728 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n 4 -M 2000']); -0729 elseif seqidentity>0.5 -0730 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n 3 -M 2000']); -0731 elseif seqidentity>0.4 -0732 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n 2 -M 2000']); -0733 else -0734 EM='The provided seqIdentity must be between 0 and 1\n'; -0735 dispEM(EM); -0736 end -0737 if status~=0 -0738 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; -0739 dispEM(EM); -0740 end -0741 %Remove the old tempfile -0742 if exist(cdhitInpCustom, 'file') -0743 delete([cdhitInpCustom '*']); -0744 end -0745 else -0746 %This means that CD-HIT should be skipped since -0747 %seqIdentity is equal to -1 -0748 tmpFile=tempname; -0749 fastawrite(tmpFile,fastaStruct); -0750 end -0751 %Do the alignment for this file -0752 if ismac -0753 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-mac','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); -0754 elseif isunix -0755 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); -0756 elseif ispc -0757 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-win','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); -0758 end -0759 if status~=0 -0760 %It could be that alignment failed because only one -0761 %sequence was left after clustering. If that is the -0762 %case, then the clustered file is just copied as 'faw' -0763 %file -0764 if any(regexp(output,'Only 1 sequence found')) -0765 movefile(tmpFile,fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'f'); -0766 else -0767 EM=['Error when performing alignment of ' missingAligned{i} ':\n' output]; -0768 dispEM(EM); -0769 end -0770 end -0771 %Remove the old tempfile -0772 if exist(tmpFile, 'file') -0773 delete([tmpFile '*']); -0774 end -0775 else -0776 %If there is only one sequence then it's not possible to do -0777 %a multiple alignment. Just print the sequence instead. An -0778 %empty file was written previously so that doesn't have to -0779 %be dealt with -0780 if numel(fastaStruct)==1 -0781 fastawrite(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fastaStruct); -0782 end -0783 end -0784 %Move the temporary file to the real one -0785 movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f'); -0786 -0787 %Print the progress: no need to update this for every -0788 %iteration, just report once 25%, 50% and 75% are done -0789 if progressFlag==0 && i>numel(missingAligned)*0.25 -0790 fprintf('%*.*f%% complete',5,2,(numel(listFiles(fullfile(dataDir,'*.fa')))/numel(fastaFiles))*100); -0791 progressFlag=progressFlag+1; -0792 elseif (progressFlag==1 && i>=numel(missingAligned)*0.5) || (progressFlag==2 && i>=numel(missingAligned)*0.75) -0793 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b%*.*f%% complete',5,2,(numel(listFiles(fullfile(dataDir,'*.fa')))/numel(fastaFiles))*100); -0794 progressFlag=progressFlag+1; -0795 end -0796 end -0797 end -0798 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0799 else -0800 if seqIdentity==-1 -0801 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); -0802 else -0803 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); -0804 end -0805 end -0806 -0807 %Check if training of Hidden Markov models should be performed -0808 missingHMMs=setdiff(KOModel.rxns,[hmmFiles;outFiles]); -0809 if ~isempty(missingHMMs) -0810 fprintf('Training the KEGG Orthology specific HMMs... '); -0811 missingHMMs=missingHMMs(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingHMMs))); -0812 progressFlag=0; -0813 %Update alignedFiles. This is needed once rebuilding KEGG from FTP dump -0814 %files for more accurate progress reporting -0815 alignedFiles=listFiles(fullfile(dataDir,'aligned','*.fa')); -0816 %Train models for all missing KOs -0817 for i=1:numel(missingHMMs) -0818 %This is checked here because it could be that it is created by a -0819 %parallel process -0820 if ~exist(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']),'file') && ~exist(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'file') -0821 %Check that the aligned FASTA file exists. It could be that it -0822 %is still being worked on by some other instance of the program -0823 %(the .faw file should then exist). This should not happen on a -0824 %single computer. It doesn't throw an error, because it should -0825 %finalize the ones it can -0826 if ~exist(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']),'file') -0827 EM=['The aligned FASTA file for ' missingHMMs{i} ' does not exist']; -0828 dispEM(EM,false); -0829 continue; -0830 end -0831 -0832 %If the multi-FASTA file is empty then save an empty aligned -0833 %file and continue -0834 s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); -0835 if s.bytes<=0 -0836 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']),'w'); -0837 fclose(fid); +0697 elseif seqIdentity==0.5 +0698 cdhitInp100=tempname; +0699 fastawrite(cdhitInp100,fastaStruct); +0700 cdhitInp90=tempname; +0701 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp100 '" -o "' cdhitInp90 '" -c 1.0 -n 5 -M 2000']); +0702 if status~=0 +0703 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; +0704 dispEM(EM); +0705 end +0706 %Remove the old tempfile +0707 if exist(cdhitInp100, 'file') +0708 delete([cdhitInp100 '*']); +0709 end +0710 cdhitInp50=tempname; +0711 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp90 '" -o "' cdhitInp50 '" -c 0.9 -n 5 -M 2000 -aL 0.8']); +0712 if status~=0 +0713 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; +0714 dispEM(EM); +0715 end +0716 %Remove the old tempfile +0717 if exist(cdhitInp90, 'file') +0718 delete([cdhitInp90 '*']); +0719 end +0720 tmpFile=tempname; +0721 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInp50 '" -o "' tmpFile '" -c 0.5 -n 3 -M 2000 -aL 0.8']); +0722 if status~=0 +0723 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; +0724 dispEM(EM); +0725 end +0726 %Remove the old tempfile +0727 if exist(cdhitInp50, 'file') +0728 delete([cdhitInp50 '*']); +0729 end +0730 elseif seqIdentity~=-1 +0731 cdhitInpCustom=tempname; +0732 fastawrite(cdhitInpCustom,fastaStruct); +0733 tmpFile=tempname; +0734 if seqIdentity<=1 && seqIdentity>0.7 +0735 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n 5 -M 2000']); +0736 elseif seqIdentity>0.6 +0737 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n 4 -M 2000']); +0738 elseif seqidentity>0.5 +0739 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n 3 -M 2000']); +0740 elseif seqidentity>0.4 +0741 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n 2 -M 2000']); +0742 else +0743 EM='The provided seqIdentity must be between 0 and 1\n'; +0744 dispEM(EM); +0745 end +0746 if status~=0 +0747 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; +0748 dispEM(EM); +0749 end +0750 %Remove the old tempfile +0751 if exist(cdhitInpCustom, 'file') +0752 delete([cdhitInpCustom '*']); +0753 end +0754 else +0755 %This means that CD-HIT should be skipped since +0756 %seqIdentity is equal to -1 +0757 tmpFile=tempname; +0758 fastawrite(tmpFile,fastaStruct); +0759 end +0760 %Do the alignment for this file +0761 if ismac +0762 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-mac','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); +0763 elseif isunix +0764 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); +0765 elseif ispc +0766 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-win','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); +0767 end +0768 if status~=0 +0769 %It could be that alignment failed because only one +0770 %sequence was left after clustering. If that is the +0771 %case, then the clustered file is just copied as 'faw' +0772 %file +0773 if any(regexp(output,'Only 1 sequence found')) +0774 movefile(tmpFile,fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'f'); +0775 else +0776 EM=['Error when performing alignment of ' missingAligned{i} ':\n' output]; +0777 dispEM(EM); +0778 end +0779 end +0780 %Remove the old tempfile +0781 if exist(tmpFile, 'file') +0782 delete([tmpFile '*']); +0783 end +0784 else +0785 %If there is only one sequence then it's not possible to do +0786 %a multiple alignment. Just print the sequence instead. An +0787 %empty file was written previously so that doesn't have to +0788 %be dealt with +0789 if numel(fastaStruct)==1 +0790 fastawrite(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fastaStruct); +0791 end +0792 end +0793 %Move the temporary file to the real one +0794 movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f'); +0795 +0796 %Print the progress: no need to update this for every +0797 %iteration, just report once 25%, 50% and 75% are done +0798 if progressFlag==0 && i>numel(missingAligned)*0.25 +0799 fprintf('%*.*f%% complete',5,2,(numel(listFiles(fullfile(dataDir,'*.fa')))/numel(fastaFiles))*100); +0800 progressFlag=progressFlag+1; +0801 elseif (progressFlag==1 && i>=numel(missingAligned)*0.5) || (progressFlag==2 && i>=numel(missingAligned)*0.75) +0802 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b%*.*f%% complete',5,2,(numel(listFiles(fullfile(dataDir,'*.fa')))/numel(fastaFiles))*100); +0803 progressFlag=progressFlag+1; +0804 end +0805 end +0806 end +0807 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0808 else +0809 if seqIdentity==-1 +0810 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); +0811 else +0812 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); +0813 end +0814 end +0815 +0816 %Check if training of Hidden Markov models should be performed +0817 missingHMMs=setdiff(KOModel.rxns,[hmmFiles;outFiles]); +0818 if ~isempty(missingHMMs) +0819 fprintf('Training the KEGG Orthology specific HMMs... '); +0820 missingHMMs=missingHMMs(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingHMMs))); +0821 progressFlag=0; +0822 %Update alignedFiles. This is needed once rebuilding KEGG from FTP dump +0823 %files for more accurate progress reporting +0824 alignedFiles=listFiles(fullfile(dataDir,'aligned','*.fa')); +0825 %Train models for all missing KOs +0826 for i=1:numel(missingHMMs) +0827 %This is checked here because it could be that it is created by a +0828 %parallel process +0829 if ~exist(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']),'file') && ~exist(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'file') +0830 %Check that the aligned FASTA file exists. It could be that it +0831 %is still being worked on by some other instance of the program +0832 %(the .faw file should then exist). This should not happen on a +0833 %single computer. It doesn't throw an error, because it should +0834 %finalize the ones it can +0835 if ~exist(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']),'file') +0836 EM=['The aligned FASTA file for ' missingHMMs{i} ' does not exist']; +0837 dispEM(EM,false); 0838 continue; 0839 end -0840 %Create a temporary file to indicate that it is working on the -0841 %KO. This is because hmmbuild cannot overwrite existing files -0842 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w'); -0843 fclose(fid); -0844 -0845 %Create HMM -0846 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); -0847 if status~=0 -0848 EM=['Error when training HMM for ' missingHMMs{i} ':\n' output]; -0849 dispEM(EM); -0850 end -0851 -0852 %Delete the temporary file -0853 delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])); -0854 -0855 %Print the progress: no need to update this for every -0856 %iteration, just report once 25%, 50% and 75% are done -0857 if progressFlag==0 && i>numel(missingHMMs)*0.25 -0858 fprintf('%*.*f%% complete',5,2,(numel(listFiles(fullfile(dataDir,'*.hmm')))/numel(alignedFiles))*100); -0859 progressFlag=progressFlag+1; -0860 elseif (progressFlag==1 && i>=numel(missingHMMs)*0.5) || (progressFlag==2 && i>=numel(missingHMMs)*0.75) -0861 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b%*.*f%% complete',5,2,(numel(listFiles(fullfile(dataDir,'*.hmm')))/numel(alignedFiles))*100); -0862 progressFlag=progressFlag+1; -0863 end -0864 end -0865 end -0866 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0867 else -0868 fprintf('Training the KEGG Orthology specific HMMs... COMPLETE\n'); -0869 end -0870 -0871 %Check which new .out files that should be generated. Check if training of -0872 %Hidden Markov models should be performed -0873 missingOUT=setdiff(KOModel.rxns,outFiles); -0874 if ~isempty(missingOUT) -0875 fprintf(['Querying <strong>' strrep(fastaFile,'\','/') '</strong> against the KEGG Orthology specific HMMs... ']); -0876 missingOUT=missingOUT(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingOUT))); -0877 progressFlag=0; -0878 %Update hmmFiles. This is needed once rebuilding KEGG from FTP dump -0879 %files for more accurate progress reporting -0880 hmmFiles=listFiles(fullfile(dataDir,'hmms','*.hmm')); -0881 for i=1:numel(missingOUT) -0882 %This is checked here because it could be that it is created by a -0883 %parallel process -0884 if ~exist(fullfile(outDir,[missingOUT{i} '.out']),'file') -0885 %Check that the HMM file exists. It should do so since %we are -0886 %saving empty files as well. Print a warning and continue if -0887 %not -0888 if ~exist(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']),'file') -0889 EM=['The HMM file for ' missingOUT{i} ' does not exist']; -0890 dispEM(EM,false); -0891 continue; -0892 end -0893 -0894 %Save an empty file to prevent several threads working on the -0895 %same file -0896 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); -0897 fclose(fid); -0898 -0899 %If the HMM file is empty then save an out file and continue -0900 s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); -0901 if s.bytes<=0 -0902 continue; -0903 end -0904 -0905 %Check each gene in the input file against this model -0906 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); -0907 if status~=0 -0908 EM=['Error when querying HMM for ' missingOUT{i} ':\n' output]; -0909 dispEM(EM); -0910 end -0911 -0912 %Save the output to a file -0913 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); -0914 fwrite(fid,output); -0915 fclose(fid); -0916 -0917 %Print the progress: no need to update this for every -0918 %iteration, just report once 25%, 50% and 75% are done -0919 if progressFlag==0 && i>numel(missingOUT)*0.25 -0920 fprintf('%*.*f%% complete',5,2,(numel(listFiles(fullfile(outDir,'*.out')))/numel(hmmFiles))*100); -0921 progressFlag=progressFlag+1; -0922 elseif (progressFlag==1 && i>=numel(missingOUT)*0.5) || (progressFlag==2 && i>=numel(missingOUT)*0.75) -0923 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b%*.*f%% complete',5,2,(numel(listFiles(fullfile(outDir,'*.out')))/numel(hmmFiles))*100); -0924 progressFlag=progressFlag+1; -0925 end -0926 end -0927 end -0928 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0929 else -0930 fprintf(['Querying <strong>' strrep(fastaFile,'\','/') '</strong> against the KEGG Orthology specific HMMs... COMPLETE\n']); -0931 end -0932 -0933 -0934 %***Begin retrieving the output and putting together the resulting model -0935 -0936 fprintf('Parsing the HMM search results... '); -0937 %Retrieve matched genes from the HMMs -0938 koGeneMat=zeros(numel(KOModel.rxns),3000); %Make room for 3000 genes -0939 genes=cell(3000,1); -0940 %Store the best score for a gene in a hash list (since it will be searching -0941 %many times) -0942 hTable = java.util.Hashtable; -0943 -0944 geneCounter=0; -0945 for i=1:numel(KOModel.rxns) -0946 if exist(fullfile(outDir,[KOModel.rxns{i} '.out']), 'file') -0947 fid=fopen(fullfile(outDir,[KOModel.rxns{i} '.out']),'r'); -0948 beginMatches=false; -0949 while 1 -0950 %Get the next line -0951 tline = fgetl(fid); -0952 -0953 %Abort at end of file -0954 if ~ischar(tline) -0955 break; -0956 end -0957 -0958 if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) -0959 break; -0960 end +0840 +0841 %If the multi-FASTA file is empty then save an empty aligned +0842 %file and continue +0843 s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); +0844 if s.bytes<=0 +0845 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']),'w'); +0846 fclose(fid); +0847 continue; +0848 end +0849 %Create a temporary file to indicate that it is working on the +0850 %KO. This is because hmmbuild cannot overwrite existing files +0851 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w'); +0852 fclose(fid); +0853 +0854 %Create HMM +0855 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); +0856 if status~=0 +0857 EM=['Error when training HMM for ' missingHMMs{i} ':\n' output]; +0858 dispEM(EM); +0859 end +0860 +0861 %Delete the temporary file +0862 delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])); +0863 +0864 %Print the progress: no need to update this for every +0865 %iteration, just report once 25%, 50% and 75% are done +0866 if progressFlag==0 && i>numel(missingHMMs)*0.25 +0867 fprintf('%*.*f%% complete',5,2,(numel(listFiles(fullfile(dataDir,'*.hmm')))/numel(alignedFiles))*100); +0868 progressFlag=progressFlag+1; +0869 elseif (progressFlag==1 && i>=numel(missingHMMs)*0.5) || (progressFlag==2 && i>=numel(missingHMMs)*0.75) +0870 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b%*.*f%% complete',5,2,(numel(listFiles(fullfile(dataDir,'*.hmm')))/numel(alignedFiles))*100); +0871 progressFlag=progressFlag+1; +0872 end +0873 end +0874 end +0875 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0876 else +0877 fprintf('Training the KEGG Orthology specific HMMs... COMPLETE\n'); +0878 end +0879 +0880 %Check which new .out files that should be generated. Check if training of +0881 %Hidden Markov models should be performed +0882 missingOUT=setdiff(KOModel.rxns,outFiles); +0883 if ~isempty(missingOUT) +0884 fprintf(['Querying <strong>' strrep(fastaFile,'\','/') '</strong> against the KEGG Orthology specific HMMs... ']); +0885 missingOUT=missingOUT(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingOUT))); +0886 progressFlag=0; +0887 %Update hmmFiles. This is needed once rebuilding KEGG from FTP dump +0888 %files for more accurate progress reporting +0889 hmmFiles=listFiles(fullfile(dataDir,'hmms','*.hmm')); +0890 for i=1:numel(missingOUT) +0891 %This is checked here because it could be that it is created by a +0892 %parallel process +0893 if ~exist(fullfile(outDir,[missingOUT{i} '.out']),'file') +0894 %Check that the HMM file exists. It should do so since %we are +0895 %saving empty files as well. Print a warning and continue if +0896 %not +0897 if ~exist(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']),'file') +0898 EM=['The HMM file for ' missingOUT{i} ' does not exist']; +0899 dispEM(EM,false); +0900 continue; +0901 end +0902 +0903 %Save an empty file to prevent several threads working on the +0904 %same file +0905 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); +0906 fclose(fid); +0907 +0908 %If the HMM file is empty then save an out file and continue +0909 s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); +0910 if s.bytes<=0 +0911 continue; +0912 end +0913 +0914 %Check each gene in the input file against this model +0915 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); +0916 if status~=0 +0917 EM=['Error when querying HMM for ' missingOUT{i} ':\n' output]; +0918 dispEM(EM); +0919 end +0920 +0921 %Save the output to a file +0922 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); +0923 fwrite(fid,output); +0924 fclose(fid); +0925 +0926 %Print the progress: no need to update this for every +0927 %iteration, just report once 25%, 50% and 75% are done +0928 if progressFlag==0 && i>numel(missingOUT)*0.25 +0929 fprintf('%*.*f%% complete',5,2,(numel(listFiles(fullfile(outDir,'*.out')))/numel(hmmFiles))*100); +0930 progressFlag=progressFlag+1; +0931 elseif (progressFlag==1 && i>=numel(missingOUT)*0.5) || (progressFlag==2 && i>=numel(missingOUT)*0.75) +0932 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b%*.*f%% complete',5,2,(numel(listFiles(fullfile(outDir,'*.out')))/numel(hmmFiles))*100); +0933 progressFlag=progressFlag+1; +0934 end +0935 end +0936 end +0937 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0938 else +0939 fprintf(['Querying <strong>' strrep(fastaFile,'\','/') '</strong> against the KEGG Orthology specific HMMs... COMPLETE\n']); +0940 end +0941 +0942 +0943 %***Begin retrieving the output and putting together the resulting model +0944 +0945 fprintf('Parsing the HMM search results... '); +0946 %Retrieve matched genes from the HMMs +0947 koGeneMat=zeros(numel(KOModel.rxns),3000); %Make room for 3000 genes +0948 genes=cell(3000,1); +0949 %Store the best score for a gene in a hash list (since it will be searching +0950 %many times) +0951 hTable = java.util.Hashtable; +0952 +0953 geneCounter=0; +0954 for i=1:numel(KOModel.rxns) +0955 if exist(fullfile(outDir,[KOModel.rxns{i} '.out']), 'file') +0956 fid=fopen(fullfile(outDir,[KOModel.rxns{i} '.out']),'r'); +0957 beginMatches=false; +0958 while 1 +0959 %Get the next line +0960 tline = fgetl(fid); 0961 -0962 if beginMatches==false -0963 %This is how the listing of matches begins -0964 if any(strfind(tline,'E-value ')) -0965 %Read one more line that is only padding -0966 tline = fgetl(fid); -0967 beginMatches=true; -0968 end -0969 else -0970 %If matches should be read -0971 if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) -0972 elements=regexp(tline,' ','split'); -0973 elements=elements(cellfun(@any,elements)); -0974 -0975 %Check if the match is below the treshhold -0976 score=str2double(elements{1}); -0977 gene=elements{9}; -0978 if score<=cutOff -0979 %If the score is exactly 0, change it to a very -0980 %small value to avoid NaN -0981 if score==0 -0982 score=10^-250; -0983 end -0984 %Check if the gene is added already and, is so, get -0985 %the best score for it -0986 I=hTable.get(gene); -0987 if any(I) -0988 koGeneMat(i,I)=score; -0989 else -0990 geneCounter=geneCounter+1; -0991 %The gene was not present yet so add it -0992 hTable.put(gene,geneCounter); -0993 genes{geneCounter}=gene; -0994 koGeneMat(i,geneCounter)=score; -0995 end -0996 end -0997 else -0998 break; -0999 end -1000 end -1001 end -1002 fclose(fid); -1003 end -1004 end -1005 fprintf('COMPLETE\n'); -1006 -1007 fprintf('Removing gene, KEGG Orthology associations below minScoreRatioKO, minScoreRatioG... '); -1008 koGeneMat=koGeneMat(:,1:geneCounter); -1009 -1010 %Remove the genes for each KO that are below minScoreRatioKO. -1011 for i=1:size(koGeneMat,1) -1012 J=find(koGeneMat(i,:)); -1013 if any(J) -1014 koGeneMat(i,J(log(koGeneMat(i,J))/log(min(koGeneMat(i,J)))<minScoreRatioKO))=0; -1015 end -1016 end -1017 -1018 %Remove the KOs for each gene that are below minScoreRatioG -1019 for i=1:size(koGeneMat,2) -1020 J=find(koGeneMat(:,i)); -1021 if any(J) -1022 koGeneMat(J(log(koGeneMat(J,i))/log(min(koGeneMat(J,i)))<minScoreRatioG),i)=0; -1023 end -1024 end -1025 fprintf('COMPLETE\n'); +0962 %Abort at end of file +0963 if ~ischar(tline) +0964 break; +0965 end +0966 +0967 if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) +0968 break; +0969 end +0970 +0971 if beginMatches==false +0972 %This is how the listing of matches begins +0973 if any(strfind(tline,'E-value ')) +0974 %Read one more line that is only padding +0975 tline = fgetl(fid); +0976 beginMatches=true; +0977 end +0978 else +0979 %If matches should be read +0980 if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) +0981 elements=regexp(tline,' ','split'); +0982 elements=elements(cellfun(@any,elements)); +0983 +0984 %Check if the match is below the treshhold +0985 score=str2double(elements{1}); +0986 gene=elements{9}; +0987 if score<=cutOff +0988 %If the score is exactly 0, change it to a very +0989 %small value to avoid NaN +0990 if score==0 +0991 score=10^-250; +0992 end +0993 %Check if the gene is added already and, is so, get +0994 %the best score for it +0995 I=hTable.get(gene); +0996 if any(I) +0997 koGeneMat(i,I)=score; +0998 else +0999 geneCounter=geneCounter+1; +1000 %The gene was not present yet so add it +1001 hTable.put(gene,geneCounter); +1002 genes{geneCounter}=gene; +1003 koGeneMat(i,geneCounter)=score; +1004 end +1005 end +1006 else +1007 break; +1008 end +1009 end +1010 end +1011 fclose(fid); +1012 end +1013 end +1014 fprintf('COMPLETE\n'); +1015 +1016 fprintf('Removing gene, KEGG Orthology associations below minScoreRatioKO, minScoreRatioG... '); +1017 koGeneMat=koGeneMat(:,1:geneCounter); +1018 +1019 %Remove the genes for each KO that are below minScoreRatioKO. +1020 for i=1:size(koGeneMat,1) +1021 J=find(koGeneMat(i,:)); +1022 if any(J) +1023 koGeneMat(i,J(log(koGeneMat(i,J))/log(min(koGeneMat(i,J)))<minScoreRatioKO))=0; +1024 end +1025 end 1026 -1027 fprintf('Adding gene annotations to the model... '); -1028 %Create the new model -1029 model.genes=genes(1:geneCounter); -1030 model.grRules=cell(numel(model.rxns),1); -1031 model.grRules(:)={''}; -1032 model.rxnGeneMat=sparse(numel(model.rxns),numel(model.genes)); -1033 -1034 %Loop through the reactions and add the corresponding genes -1035 for i=1:numel(model.rxns) -1036 if isstruct(model.rxnMiriams{i}) -1037 %Get all KOs -1038 I=find(strcmpi(model.rxnMiriams{i}.name,'kegg.orthology')); -1039 KOs=model.rxnMiriams{i}.value(I); -1040 %Find the KOs and the corresponding genes -1041 J=ismember(KOModel.rxns,KOs); -1042 [~, K]=find(koGeneMat(J,:)); -1043 -1044 if any(K) -1045 model.rxnGeneMat(i,K)=1; -1046 %Also delete KOs for which no genes were found. If no genes at -1047 %all were matched to the reaction it will be deleted later -1048 L=sum(koGeneMat(J,:),2)==0; -1049 model.rxnMiriams{i}.value(I(L))=[]; -1050 model.rxnMiriams{i}.name(I(L))=[]; -1051 end -1052 end -1053 end -1054 fprintf('COMPLETE\n'); -1055 -1056 %Find and delete all reactions without genes. This also removes genes that -1057 %are not used (which could happen because minScoreRatioG and -1058 %minScoreRatioKO). If keepSpontaneous==true, the spontaneous reactions -1059 %without genes are kept in the model. Spontaneous reactions with original -1060 %gene associations are treated in the same way, like the rest of the -1061 %reactions - if gene associations were removed during HMM search, such -1062 %reactions are deleted from the model -1063 if keepSpontaneous==true -1064 %Not the most comprise way to delete reactions without genes, but this -1065 %makes the code easier to understand. Firstly the non-spontaneous -1066 %reactions without genes are removed. After that, the second deletion -1067 %step removes spontaneous reactions, which had gene associations before -1068 %HMM search, but no longer have after it -1069 fprintf('Removing non-spontaneous reactions which after HMM search no longer have GPR rules... '); -1070 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); -1071 model=removeReactions(model,I,true,true); -1072 I=~any(model.rxnGeneMat,2)&ismember(model.rxns,spontRxnsWithGenes); -1073 model=removeReactions(model,I,true,true); -1074 else -1075 %Just simply check for any new reactions without genes and remove -1076 %it -1077 fprintf('Removing reactions which after HMM search no longer have GPR rules... '); -1078 I=~any(model.rxnGeneMat,2); -1079 model=removeReactions(model,I,true,true); -1080 end -1081 fprintf('COMPLETE\n'); -1082 -1083 fprintf('Constructing GPR rules and finalizing the model... '); -1084 %Add the gene associations as 'or' -1085 for i=1:numel(model.rxns) -1086 %Find the involved genes -1087 I=find(model.rxnGeneMat(i,:)); -1088 if any(I) -1089 model.grRules{i}=['(' model.genes{I(1)}]; -1090 for j=2:numel(I) -1091 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; -1092 end -1093 model.grRules{i}=[model.grRules{i} ')']; -1094 end -1095 end -1096 -1097 %Fix grRules and reconstruct rxnGeneMat -1098 [grRules,rxnGeneMat] = standardizeGrRules(model,false); %Give detailed output -1099 model.grRules = grRules; -1100 model.rxnGeneMat = rxnGeneMat; -1101 -1102 %Add the description to the reactions -1103 for i=1:numel(model.rxns) -1104 if ~isempty(model.rxnNotes{i}) -1105 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (using HMMs).',model.rxnNotes(i)); -1106 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); -1107 else -1108 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (using HMMs)'}; -1109 end -1110 end -1111 %Remove the temp fasta file -1112 delete(fastaFile) -1113 fprintf('COMPLETE\n\n*** Model reconstruction complete ***\n'); -1114 end -1115 -1116 function files=listFiles(directory) -1117 %Supporter function to list the files in a directory and return them as a -1118 %cell array -1119 temp=dir(directory); -1120 files=cell(numel(temp),1); -1121 for i=1:numel(temp) -1122 files{i}=temp(i,1).name; +1027 %Remove the KOs for each gene that are below minScoreRatioG +1028 for i=1:size(koGeneMat,2) +1029 J=find(koGeneMat(:,i)); +1030 if any(J) +1031 koGeneMat(J(log(koGeneMat(J,i))/log(min(koGeneMat(J,i)))<minScoreRatioG),i)=0; +1032 end +1033 end +1034 fprintf('COMPLETE\n'); +1035 +1036 fprintf('Adding gene annotations to the model... '); +1037 %Create the new model +1038 model.genes=genes(1:geneCounter); +1039 model.grRules=cell(numel(model.rxns),1); +1040 model.grRules(:)={''}; +1041 model.rxnGeneMat=sparse(numel(model.rxns),numel(model.genes)); +1042 +1043 %Loop through the reactions and add the corresponding genes +1044 for i=1:numel(model.rxns) +1045 if isstruct(model.rxnMiriams{i}) +1046 %Get all KOs +1047 I=find(strcmpi(model.rxnMiriams{i}.name,'kegg.orthology')); +1048 KOs=model.rxnMiriams{i}.value(I); +1049 %Find the KOs and the corresponding genes +1050 J=ismember(KOModel.rxns,KOs); +1051 [~, K]=find(koGeneMat(J,:)); +1052 +1053 if any(K) +1054 model.rxnGeneMat(i,K)=1; +1055 %Also delete KOs for which no genes were found. If no genes at +1056 %all were matched to the reaction it will be deleted later +1057 L=sum(koGeneMat(J,:),2)==0; +1058 model.rxnMiriams{i}.value(I(L))=[]; +1059 model.rxnMiriams{i}.name(I(L))=[]; +1060 end +1061 end +1062 end +1063 fprintf('COMPLETE\n'); +1064 +1065 %Find and delete all reactions without genes. This also removes genes that +1066 %are not used (which could happen because minScoreRatioG and +1067 %minScoreRatioKO). If keepSpontaneous==true, the spontaneous reactions +1068 %without genes are kept in the model. Spontaneous reactions with original +1069 %gene associations are treated in the same way, like the rest of the +1070 %reactions - if gene associations were removed during HMM search, such +1071 %reactions are deleted from the model +1072 if keepSpontaneous==true +1073 %Not the most comprise way to delete reactions without genes, but this +1074 %makes the code easier to understand. Firstly the non-spontaneous +1075 %reactions without genes are removed. After that, the second deletion +1076 %step removes spontaneous reactions, which had gene associations before +1077 %HMM search, but no longer have after it +1078 fprintf('Removing non-spontaneous reactions which after HMM search no longer have GPR rules... '); +1079 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); +1080 model=removeReactions(model,I,true,true); +1081 I=~any(model.rxnGeneMat,2)&ismember(model.rxns,spontRxnsWithGenes); +1082 model=removeReactions(model,I,true,true); +1083 else +1084 %Just simply check for any new reactions without genes and remove +1085 %it +1086 fprintf('Removing reactions which after HMM search no longer have GPR rules... '); +1087 I=~any(model.rxnGeneMat,2); +1088 model=removeReactions(model,I,true,true); +1089 end +1090 fprintf('COMPLETE\n'); +1091 +1092 fprintf('Constructing GPR rules and finalizing the model... '); +1093 %Add the gene associations as 'or' +1094 for i=1:numel(model.rxns) +1095 %Find the involved genes +1096 I=find(model.rxnGeneMat(i,:)); +1097 if any(I) +1098 model.grRules{i}=['(' model.genes{I(1)}]; +1099 for j=2:numel(I) +1100 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; +1101 end +1102 model.grRules{i}=[model.grRules{i} ')']; +1103 end +1104 end +1105 +1106 %Fix grRules and reconstruct rxnGeneMat +1107 [grRules,rxnGeneMat] = standardizeGrRules(model,false); %Give detailed output +1108 model.grRules = grRules; +1109 model.rxnGeneMat = rxnGeneMat; +1110 +1111 %Add the description to the reactions +1112 for i=1:numel(model.rxns) +1113 if ~isempty(model.rxnNotes{i}) +1114 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (using HMMs).',model.rxnNotes(i)); +1115 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); +1116 else +1117 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (using HMMs)'}; +1118 end +1119 end +1120 %Remove the temp fasta file +1121 delete(fastaFile) +1122 fprintf('COMPLETE\n\n*** Model reconstruction complete ***\n'); 1123 end -1124 files=strrep(files,'.fa',''); -1125 files=strrep(files,'.hmm',''); -1126 files=strrep(files,'.out',''); -1127 files=strrep(files,'.faw',''); -1128 end +1124 +1125 function files=listFiles(directory) +1126 %Supporter function to list the files in a directory and return them as a +1127 %cell array +1128 temp=dir(directory); +1129 files=cell(numel(temp),1); +1130 for i=1:numel(temp) +1131 files{i}=temp(i,1).name; +1132 end +1133 files=strrep(files,'.fa',''); +1134 files=strrep(files,'.hmm',''); +1135 files=strrep(files,'.out',''); +1136 files=strrep(files,'.faw',''); +1137 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/external/kegg/getKEGGModelForOrganism.m b/external/kegg/getKEGGModelForOrganism.m index 33fb19a6..0ef80540 100755 --- a/external/kegg/getKEGGModelForOrganism.m +++ b/external/kegg/getKEGGModelForOrganism.m @@ -408,6 +408,13 @@ %If no FASTA file is supplied, then just remove all genes which are not for %the given organism ID if isempty(fastaFile) + %Check if organismID can be found in KEGG species list or is + %set to "eukaryotes" or "prokaryotes" + phylDistsFull=getPhylDist(fullfile(dataDir,'keggdb'),true); + if ~ismember(organismID,[phylDistsFull.ids 'eukaryotes' 'prokaryotes']) + error('Provided organismID is incorrect. Only species abbreviations from KEGG Species List or "eukaryotes"/"prokaryotes" are allowed.'); + end + fprintf(['Pruning the model from non-' organismID ' genes... ']); if ismember(organismID,{'eukaryotes','prokaryotes'}) phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1);