Commit

update citation
CSU-KangHu committed Jul 6, 2024
1 parent ad5ba92 commit 1ccfc59
Showing 7 changed files with 122 additions and 67 deletions.
37 changes: 36 additions & 1 deletion .idea/deployment.xml

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
@@ -342,4 +342,4 @@ You may want to check out this [Wiki](https://github.com/CSU-KangHu/HiTE/wiki) p
## Citations
Please cite our paper if you find `HiTE` useful:

Hu, K., Xu, M., Zou, Y. & Wang, J.✉ (2023). HiTE: An accurate dynamic boundary adjustment approach for full-length Transposable Elements detection and annotation in Genome Assemblies. [bioRxiv](https://doi.org/10.1101/2023.05.23.541879).
Hu, K., Ni, P., Xu, M. et al. HiTE: a fast and accurate dynamic boundary adjustment approach for full-length transposable element detection and annotation. Nat Commun 15, 5573 (2024). [https://doi.org/10.1038/s41467-024-49912-8](https://doi.org/10.1038/s41467-024-49912-8)
2 changes: 1 addition & 1 deletion main.py
@@ -58,7 +58,7 @@
parser.add_argument('--outdir', required=True, metavar='output_dir', help='The path of output directory; It is recommended to use a new directory to avoid automatic deletion of important files.')

parser.add_argument('--thread', metavar='thread_num', help='Input thread num, default = [ '+str(default_threads)+' ]')
parser.add_argument('--chunk_size', metavar='chunk_size', help='The chunk size of large genome, default = [ ' + str(default_chunk_size) + ' MB ]')
parser.add_argument('--chunk_size', metavar='chunk_size', help='The chunk size of genome, default = [ ' + str(default_chunk_size) + ' MB ]')
parser.add_argument('--miu', metavar='miu', help='The neutral mutation rate (per bp per ya), default = [ ' + str(default_miu) + ' ]')
parser.add_argument('--plant', metavar='is_plant', help='Is it a plant genome, 1: true, 0: false. default = [ ' + str(default_plant) + ' ]')
# parser.add_argument('--classified', metavar='is_classified', help='Whether to classify TE models, HiTE uses RepeatClassifier from RepeatModeler to classify TEs, 1: true, 0: false. default = [ ' + str(default_classified) + ' ]')
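The only functional change to main.py above is the --chunk_size help text, which now describes the chunk size of the genome rather than of a "large genome". As a rough, hypothetical illustration of what a chunk-size parameter like this typically controls (this is not HiTE's actual splitting code, and split_into_chunks is not a HiTE function), a genome dictionary could be cut into pieces of roughly chunk_size megabases:

```python
# Hypothetical illustration only: split_into_chunks is not part of HiTE, and HiTE's real
# splitting logic is not shown in this commit.
def split_into_chunks(contigs, chunk_size_mb=400):
    """Group {name: sequence} entries into chunks of at most chunk_size_mb megabases each."""
    limit = chunk_size_mb * 1_000_000
    chunks, current, current_len = [], {}, 0
    for name, seq in contigs.items():
        # Start a new chunk when adding this contig would exceed the limit;
        # a single oversized contig still becomes its own chunk.
        if current and current_len + len(seq) > limit:
            chunks.append(current)
            current, current_len = {}, 0
        current[name] = seq
        current_len += len(seq)
    if current:
        chunks.append(current)
    return chunks
```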
114 changes: 66 additions & 48 deletions module/HiTE_module_test.py
@@ -71,23 +71,24 @@ def filter_repbase_nonTE():

def generate_repbases():
# rice
repbase_dir = '/public/home/hpc194701009/KmerRepFinder_test/library/curated_lib/only_TE/repbase/'
repbase_path = repbase_dir + '/potato.ref'
repbase_dir = '/homeb/hukang/KmerRepFinder_test/library/curated_lib/Repbase_28.06/dmel'
repbase_path = repbase_dir + '/drorep.ref'
repbase_names, repbase_contigs = read_fasta_v1(repbase_path)
tags = set()
for name in repbase_names:
if not name.__contains__('Solanum tuberosum'):
continue
tag = name.split('\t')[1]
tags.add(tag)
print(tags)
print(len(tags))
tmp_out_dir = repbase_dir + '/dmel'
# tags = set()
# for name in repbase_names:
# if not name.__contains__('Solanum tuberosum'):
# continue
# tag = name.split('\t')[1]
# tags.add(tag)
# print(tags)
# print(len(tags))

ltr_tags = ['Gypsy', 'Copia', 'LTR Retrotransposon', 'BEL', 'LTR', 'Endogenous Retrovirus', 'Caulimoviridae']
tir_tags = ['Mariner/Tc1', 'DNA transposon', 'EnSpm/CACTA', 'MuDR', 'hAT', 'Harbinger', 'Transib', 'piggyBac', 'P', 'DNA', 'Sola2', 'Kolobok', ]
helitron_tags = ['Helitron', 'MINIME_DN']
non_ltr_tags = ['L1', 'SINE2/tRNA', 'Non-LTR Retrotransposon', 'SINE', 'R1', 'Jockey', 'CR1', 'R2', 'RTEX', 'Hero', 'RTE', 'tRNA']
tmp_out_dir = repbase_dir + '/potato'

if not os.path.exists(tmp_out_dir):
os.makedirs(tmp_out_dir)
ltr_repbase_path = tmp_out_dir + '/ltr.repbase.ref'
@@ -102,8 +103,8 @@ def generate_repbases():
helitron_contigs = {}
non_ltr_contigs = {}
for name in repbase_names:
if not name.__contains__('Solanum tuberosum'):
continue
# if not name.__contains__('Solanum tuberosum'):
# continue
tag = name.split('\t')[1]
if tag in ltr_tags:
ltr_contigs[name] = repbase_contigs[name]
@@ -113,6 +114,8 @@ def generate_repbases():
helitron_contigs[name] = repbase_contigs[name]
elif tag in non_ltr_tags:
non_ltr_contigs[name] = repbase_contigs[name]
else:
print(tag)
all_contigs[name] = repbase_contigs[name]
store_fasta(ltr_contigs, ltr_repbase_path)
store_fasta(tir_contigs, tir_repbase_path)
@@ -122,29 +125,41 @@ def generate_rm2():

def generate_rm2():
# rice
repbase_dir = '/public/home/hpc194701009/KmerRepFinder_test/library/rm2_run_lib/potato'
repbase_path = repbase_dir + '/potato-families.fa'
repbase_dir = '/homeb/hukang/KmerRepFinder_test/library/curated_lib/Repbase_28.06/zebrafish'
repbase_path = repbase_dir + '/zebrafish.lib'
tmp_out_dir = repbase_dir + '/zebrafish'
repbase_names, repbase_contigs = read_fasta(repbase_path)
# Repbase files may contain duplicates; remove entries whose sequences are exactly identical
seq_set = set()
for name in repbase_names:
seq = repbase_contigs[name]
if seq in seq_set:
del repbase_contigs[name]
else:
seq_set.add(seq)
store_fasta(repbase_contigs, repbase_path)
repbase_names, repbase_contigs = read_fasta(repbase_path)

tags = set()
for name in repbase_names:
tag = name.split('#')[1]
tags.add(tag)
print(tags)
print(len(tags))

ltr_tags = ['LTR/Gypsy', 'LTR/Copia', 'LTR/Pao', 'LTR/Cassandra', 'LTR', 'LTR/ERVK', 'LTR/ERV1', 'LTR/Unknown', 'LTR/Caulimovirus']
tir_tags = ['Mariner/Tc1', 'DNA transposon', 'DNA/TcMar-Stowaway', 'DNA/TcMar', 'DNA/Maverick', 'DNA/TcMar-Pogo', 'DNA/hAT-Charlie', 'DNA/CMC-EnSpm', 'DNA/CMC', 'DNA/MULE-MuDR', 'DNA/hAT-Tag1', 'DNA/hAT-Ac', 'DNA/hAT-Tip100', 'DNA/PIF-Harbinger', 'Transib', 'piggyBac', 'DNA/P', 'DNA', 'Sola2', 'Kolobok', ]
helitron_tags = ['RC/Helitron', 'MINIME_DN']
non_ltr_tags = ['LINE/L1', 'LINE/RTE-BovB', 'Retroposon', 'Retroposon/L1-derived', 'SINE/tRNA', 'SINE/tRNA-RTE', 'SINE/ID', 'LINE/Rex-Babar', 'SINE', 'R1', 'Jockey', 'CR1', 'R2', 'RTEX', 'Hero', 'RTE']
unknown_tags = ['Unknown']
tmp_out_dir = repbase_dir + '/potato'
# ltr_tags = ['LTR/Gypsy', 'LTR/Copia', 'LTR/Pao', 'LTR/Cassandra', 'LTR', 'LTR/ERVK', 'LTR/ERV1', 'LTR/Unknown', 'LTR/Caulimovirus']
# tir_tags = ['Mariner/Tc1', 'DNA transposon', 'DNA/TcMar-Stowaway', 'DNA/TcMar', 'DNA/Maverick', 'DNA/TcMar-Pogo', 'DNA/hAT-Charlie', 'DNA/CMC-EnSpm', 'DNA/CMC', 'DNA/MULE-MuDR', 'DNA/hAT-Tag1', 'DNA/hAT-Ac', 'DNA/hAT-Tip100', 'DNA/PIF-Harbinger', 'Transib', 'piggyBac', 'DNA/P', 'DNA', 'Sola2', 'Kolobok', ]
# helitron_tags = ['RC/Helitron', 'MINIME_DN']
# non_ltr_tags = ['LINE/L1', 'LINE/RTE-BovB', 'Retroposon', 'Retroposon/L1-derived', 'SINE/tRNA', 'SINE/tRNA-RTE', 'SINE/ID', 'LINE/Rex-Babar', 'SINE', 'R1', 'Jockey', 'CR1', 'R2', 'RTEX', 'Hero', 'RTE']
# unknown_tags = ['Unknown']

if not os.path.exists(tmp_out_dir):
os.makedirs(tmp_out_dir)
ltr_repbase_path = tmp_out_dir + '/ltr.rm2.ref'
tir_repbase_path = tmp_out_dir + '/tir.rm2.ref'
helitron_repbase_path = tmp_out_dir + '/helitron.rm2.ref'
non_ltr_repbase_path = tmp_out_dir + '/non_ltr.rm2.ref'
unknown_repbase_path = tmp_out_dir + '/unknown.rm2.ref'
ltr_repbase_path = tmp_out_dir + '/ltr.ref'
tir_repbase_path = tmp_out_dir + '/tir.ref'
helitron_repbase_path = tmp_out_dir + '/helitron.ref'
non_ltr_repbase_path = tmp_out_dir + '/non_ltr.ref'
unknown_repbase_path = tmp_out_dir + '/unknown.ref'

ltr_contigs = {}
tir_contigs = {}
@@ -153,16 +168,18 @@ def generate_rm2():
unknown_contigs = {}
for name in repbase_names:
tag = name.split('#')[1]
if tag in ltr_tags:
if 'LTR' in tag:
ltr_contigs[name] = repbase_contigs[name]
elif tag in tir_tags:
elif 'DNA' in tag:
tir_contigs[name] = repbase_contigs[name]
elif tag in helitron_tags:
elif 'Helitron' in tag:
helitron_contigs[name] = repbase_contigs[name]
elif tag in non_ltr_tags:
elif 'LINE' in tag or 'SINE' in tag:
non_ltr_contigs[name] = repbase_contigs[name]
elif tag in unknown_tags:
elif 'Unknown' in tag:
unknown_contigs[name] = repbase_contigs[name]
else:
print(tag)
store_fasta(ltr_contigs, ltr_repbase_path)
store_fasta(tir_contigs, tir_repbase_path)
store_fasta(helitron_contigs, helitron_repbase_path)
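The classification in generate_rm2() now keys on substrings of the label after '#' (LTR, DNA, Helitron, LINE/SINE, Unknown) instead of the earlier exhaustive tag lists, and prints any label that matches none of them. A minimal, self-contained sketch of that idea (classify_te_label and bucket_by_label are hypothetical helper names, not HiTE functions):

```python
# Sketch of substring-based bucketing of RepeatMasker-style headers ("name#Label"),
# mirroring the new logic above; classify_te_label and bucket_by_label are hypothetical helpers.
def classify_te_label(label):
    if 'LTR' in label:
        return 'ltr'
    if 'DNA' in label:
        return 'tir'
    if 'Helitron' in label:
        return 'helitron'
    if 'LINE' in label or 'SINE' in label:
        return 'non_ltr'
    if 'Unknown' in label:
        return 'unknown'
    return None  # unrecognized labels fall through, as in the diff's else: print(tag)

def bucket_by_label(contigs):
    buckets = {'ltr': {}, 'tir': {}, 'helitron': {}, 'non_ltr': {}, 'unknown': {}}
    for name, seq in contigs.items():
        label = name.split('#')[1]           # the part after '#' is the classification label
        group = classify_te_label(label)
        if group is None:
            print(label)                     # report labels that match no known group
            continue
        buckets[group][name] = seq
    return buckets
```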
@@ -892,6 +909,7 @@ def draw_dist(input_file):
query_copy_num.append(float(line))

y = list(query_copy_num)
print(len(y))
x = pd.Series(y, name="copy number")
sns.set_theme(style="ticks", font='Times New Roman', font_scale=1.4)
sns.set_context("paper")
@@ -3639,21 +3657,21 @@ def get_logo_seq(ltr_copies):
log = Logger(work_dir + '/HiTE_Non_LTR.log', level='debug')

if __name__ == '__main__':
# Extract the non-ltr entries from RepeatModeler2
work_dir = '/homeb/hukang/KmerRepFinder_test/library/curated_lib/Repbase_28.06/human'
te_path = work_dir + '/human.lib'
names, contigs = read_fasta(te_path)
label_set = set()
for name in names:
label = name.split('#')[1]
label_set.add(label)
print(label_set)
non_ltr_path = work_dir + '/non_ltr.fa'
non_ltr_contigs = {}
for name in names:
if 'LINE' in name or 'SINE' in name:
non_ltr_contigs[name] = contigs[name]
store_fasta(non_ltr_contigs, non_ltr_path)
# # Extract the non-ltr entries from RepeatModeler2
# work_dir = '/homeb/hukang/KmerRepFinder_test/library/curated_lib/Repbase_28.06/human'
# te_path = work_dir + '/human.lib'
# names, contigs = read_fasta(te_path)
# label_set = set()
# for name in names:
# label = name.split('#')[1]
# label_set.add(label)
# print(label_set)
# non_ltr_path = work_dir + '/non_ltr.fa'
# non_ltr_contigs = {}
# for name in names:
# if 'LINE' in name or 'SINE' in name:
# non_ltr_contigs[name] = contigs[name]
# store_fasta(non_ltr_contigs, non_ltr_path)

# align_file = '/home/hukang/test/HiTE/demo/test1/non_ltr_copies_0_0/chr_0:13880942-13881327.blast.bed.fa.maf.fa'
# debug = 1
@@ -4108,7 +4126,7 @@ def get_logo_seq(ltr_copies):
#
# column_data.to_csv('/homeb/hukang/KmerRepFinder_test/library/nextflow_test2/rice/novel_tir/data.csv', index=False)
#
# draw_dist('/homeb/hukang/KmerRepFinder_test/library/nextflow_test2/rice/novel_tir/data.csv')
draw_dist('/homeb/hukang/KmerRepFinder_test/library/nextflow_test2/rice/novel_tir/data.csv')

# Get the novel TIR transposons and obtain their multiple sequence alignments and protein structure information
# tmp_output_dir = '/homeb/hukang/KmerRepFinder_test/library/nextflow_test4/rice'
@@ -4175,7 +4193,7 @@ def get_logo_seq(ltr_copies):
#tmp_output_dir = '/homeb/hukang/KmerRepFinder_test/library/all_tools_run_lib/rice_v7/HiTE'
#generate_zebrafish_repbases()
#generate_repbases()
#generate_rm2()
# generate_rm2()
#generate_EDTA()
# generate_HiTE()
# input = '/public/home/hpc194701009/WebTE_Lib/New_cash_crops/Solanum_tuberosum/GCF_000226075.1_SolTub_3.0_genomic.fna'
28 changes: 15 additions & 13 deletions module/Util.py
@@ -5375,7 +5375,7 @@ def generate_full_length_out(BlastnOut, full_length_out, TE_lib, reference, tmp_
full_length_threshold,
search_struct, tools_dir)

lines = []
lines = set()
for query_name in full_length_annotations.keys():
query_name = str(query_name)
for copy_annotation in full_length_annotations[query_name]:
@@ -5387,15 +5387,9 @@ def generate_full_length_out(BlastnOut, full_length_out, TE_lib, reference, tmp_
chr_start = int(chr_pos_parts[0]) + 1
chr_end = int(chr_pos_parts[1])
new_line = (query_name, chr_name, chr_start, chr_end)
lines.append(new_line)
sorted_lines = sorted(lines, key=lambda x: (x[1], x[2], x[3]))

with open(full_length_out, 'w') as f_save:
for line in sorted_lines:
new_line = line[0] + '\t' + line[1] + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + str(line[2]) + '\t' + str(line[3]) + '\t' + '-1' + '\t' + '-1' + '\n'
f_save.write(new_line)
lines.add(new_line)

return sorted_lines
return lines

def mask_genome_intactTE(TE_lib, genome_path, work_dir, thread, ref_index):
tmp_blast_dir = work_dir + '/mask_tmp_' + str(ref_index)
@@ -11095,10 +11089,10 @@ def multiple_alignment_blast_v1(repeats_path, tools_dir, coverage_threshold, cat
os.system(align_command)

# invoke the function to retrieve the full-length copies.
generate_full_length_out(blastn2Results_path, full_length_out, split_repeats_path, genome_path, tmp_dir, tools_dir,
lines = generate_full_length_out(blastn2Results_path, full_length_out, split_repeats_path, genome_path, tmp_dir, tools_dir,
coverage_threshold, category)

return full_length_out
return lines

def multi_process_align_v1(query_path, subject_path, blastnResults_path, tmp_blast_dir, threads, coverage_threshold, category, is_removed_dir=True):
tools_dir = ''
@@ -11170,9 +11164,17 @@ def multi_process_align_v1(query_path, subject_path, blastnResults_path, tmp_bla
jobs.append(job)
ex.shutdown(wait=True)

lines = set()
for job in as_completed(jobs):
cur_full_length_out = job.result()
os.system('cat ' + cur_full_length_out + ' >> ' + blastnResults_path)
cur_lines = job.result()
lines.update(cur_lines)
lines = list(lines)
sorted_lines = sorted(lines, key=lambda x: (x[1], x[2], x[3]))

with open(blastnResults_path, 'w') as f_save:
for line in sorted_lines:
new_line = line[0] + '\t' + line[1] + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + str(line[2]) + '\t' + str(line[3]) + '\t' + '-1' + '\t' + '-1' + '\n'
f_save.write(new_line)

if is_removed_dir:
os.system('rm -rf ' + tmp_blast_dir)
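Taken together, the Util.py changes make generate_full_length_out() return a set of (query_name, chr_name, chr_start, chr_end) tuples, which multi_process_align_v1() now aggregates across worker jobs, deduplicates, sorts, and writes once. A compact sketch of that collect-then-write pattern (write_full_length_records is a hypothetical name; the 12-column layout follows the diff above):

```python
# Sketch of the aggregate/deduplicate/sort/write pattern introduced above; the tuple layout
# and the 12-column output follow the diff, but write_full_length_records is a hypothetical name.
def write_full_length_records(worker_results, out_path):
    records = set()                          # a set removes duplicate copies reported by different workers
    for result in worker_results:            # each worker returns tuples of (query_name, chr_name, chr_start, chr_end)
        records.update(result)
    ordered = sorted(records, key=lambda r: (r[1], r[2], r[3]))  # sort by chromosome, then coordinates
    with open(out_path, 'w') as f_save:
        for query_name, chr_name, chr_start, chr_end in ordered:
            fields = [query_name, chr_name] + ['-1'] * 6 + [str(chr_start), str(chr_end), '-1', '-1']
            f_save.write('\t'.join(fields) + '\n')
```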
4 changes: 2 additions & 2 deletions nextflow.config
@@ -14,8 +14,8 @@ params {
conda_name = null // sample: /opt/conda/envs/HiTE
conda_cache = 'local_conda_cache'

docker_name = "kanghu/hite:2.0.4"
singularity_name = "docker://kanghu/hite:2.0.4"
docker_name = "kanghu/hite:3.2.0"
singularity_name = "docker://kanghu/hite:3.2.0"
singularity_cache = 'local_singularity_cache'

// Specify your pipeline's command line flags
2 changes: 1 addition & 1 deletion nextflow_base.config
@@ -1,6 +1,6 @@
/*
========================================================================================
longmethyl Nextflow base config file
HiTE Nextflow base config file
========================================================================================
A 'blank slate' config file, appropriate for general use on most high performance
compute environments. Assumes that all software is installed and available on
