Commit

update citation
CSU-KangHu committed Jul 6, 2024
1 parent ad5ba92 commit 1ccfc59
Showing 7 changed files with 122 additions and 67 deletions.
37 changes: 36 additions & 1 deletion .idea/deployment.xml

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
@@ -342,4 +342,4 @@ You may want to check out this [Wiki](https://github.com/CSU-KangHu/HiTE/wiki) p
## Citations
Please cite our paper if you find `HiTE` useful:

Hu, K., Xu, M., Zou, Y. & Wang, J.✉ (2023). HiTE: An accurate dynamic boundary adjustment approach for full-length Transposable Elements detection and annotation in Genome Assemblies. [bioRxiv](https://doi.org/10.1101/2023.05.23.541879).
Hu, K., Ni, P., Xu, M. et al. HiTE: a fast and accurate dynamic boundary adjustment approach for full-length transposable element detection and annotation. Nat Commun 15, 5573 (2024). [https://doi.org/10.1038/s41467-024-49912-8](https://doi.org/10.1038/s41467-024-49912-8)
2 changes: 1 addition & 1 deletion main.py
@@ -58,7 +58,7 @@
parser.add_argument('--outdir', required=True, metavar='output_dir', help='The path of output directory; It is recommended to use a new directory to avoid automatic deletion of important files.')

parser.add_argument('--thread', metavar='thread_num', help='Input thread num, default = [ '+str(default_threads)+' ]')
parser.add_argument('--chunk_size', metavar='chunk_size', help='The chunk size of large genome, default = [ ' + str(default_chunk_size) + ' MB ]')
parser.add_argument('--chunk_size', metavar='chunk_size', help='The chunk size of genome, default = [ ' + str(default_chunk_size) + ' MB ]')
parser.add_argument('--miu', metavar='miu', help='The neutral mutation rate (per bp per ya), default = [ ' + str(default_miu) + ' ]')
parser.add_argument('--plant', metavar='is_plant', help='Is it a plant genome, 1: true, 0: false. default = [ ' + str(default_plant) + ' ]')
# parser.add_argument('--classified', metavar='is_classified', help='Whether to classify TE models, HiTE uses RepeatClassifier from RepeatModeler to classify TEs, 1: true, 0: false. default = [ ' + str(default_classified) + ' ]')
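The only functional change to main.py above is the --chunk_size help text, which now describes the chunk size of the genome rather than of a "large genome". As a rough, hypothetical illustration of what a chunk-size parameter like this typically controls (this is not HiTE's actual splitting code, and split_into_chunks is not a HiTE function), a genome dictionary could be cut into pieces of roughly chunk_size megabases:

```python
# Hypothetical illustration only: split_into_chunks is not part of HiTE, and HiTE's real
# splitting logic is not shown in this commit.
def split_into_chunks(contigs, chunk_size_mb=400):
    """Group {name: sequence} entries into chunks of at most chunk_size_mb megabases each."""
    limit = chunk_size_mb * 1_000_000
    chunks, current, current_len = [], {}, 0
    for name, seq in contigs.items():
        # Start a new chunk when adding this contig would exceed the limit;
        # a single oversized contig still becomes its own chunk.
        if current and current_len + len(seq) > limit:
            chunks.append(current)
            current, current_len = {}, 0
        current[name] = seq
        current_len += len(seq)
    if current:
        chunks.append(current)
    return chunks
```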
114 changes: 66 additions & 48 deletions module/HiTE_module_test.py
@@ -71,23 +71,24 @@ def filter_repbase_nonTE():

def generate_repbases():
# rice
repbase_dir = '/public/home/hpc194701009/KmerRepFinder_test/library/curated_lib/only_TE/repbase/'
repbase_path = repbase_dir + '/potato.ref'
repbase_dir = '/homeb/hukang/KmerRepFinder_test/library/curated_lib/Repbase_28.06/dmel'
repbase_path = repbase_dir + '/drorep.ref'
repbase_names, repbase_contigs = read_fasta_v1(repbase_path)
tags = set()
for name in repbase_names:
if not name.__contains__('Solanum tuberosum'):
continue
tag = name.split('\t')[1]
tags.add(tag)
print(tags)
print(len(tags))
tmp_out_dir = repbase_dir + '/dmel'
# tags = set()
# for name in repbase_names:
# if not name.__contains__('Solanum tuberosum'):
# continue
# tag = name.split('\t')[1]
# tags.add(tag)
# print(tags)
# print(len(tags))

ltr_tags = ['Gypsy', 'Copia', 'LTR Retrotransposon', 'BEL', 'LTR', 'Endogenous Retrovirus', 'Caulimoviridae']
tir_tags = ['Mariner/Tc1', 'DNA transposon', 'EnSpm/CACTA', 'MuDR', 'hAT', 'Harbinger', 'Transib', 'piggyBac', 'P', 'DNA', 'Sola2', 'Kolobok', ]
helitron_tags = ['Helitron', 'MINIME_DN']
non_ltr_tags = ['L1', 'SINE2/tRNA', 'Non-LTR Retrotransposon', 'SINE', 'R1', 'Jockey', 'CR1', 'R2', 'RTEX', 'Hero', 'RTE', 'tRNA']
tmp_out_dir = repbase_dir + '/potato'

if not os.path.exists(tmp_out_dir):
os.makedirs(tmp_out_dir)
ltr_repbase_path = tmp_out_dir + '/ltr.repbase.ref'
@@ -102,8 +103,8 @@ def generate_repbases():
helitron_contigs = {}
non_ltr_contigs = {}
for name in repbase_names:
if not name.__contains__('Solanum tuberosum'):
continue
# if not name.__contains__('Solanum tuberosum'):
# continue
tag = name.split('\t')[1]
if tag in ltr_tags:
ltr_contigs[name] = repbase_contigs[name]
@@ -113,6 +114,8 @@ def generate_repbases():
helitron_contigs[name] = repbase_contigs[name]
elif tag in non_ltr_tags:
non_ltr_contigs[name] = repbase_contigs[name]
else:
print(tag)
all_contigs[name] = repbase_contigs[name]
store_fasta(ltr_contigs, ltr_repbase_path)
store_fasta(tir_contigs, tir_repbase_path)
@@ -122,29 +125,41 @@ def generate_rm2():

def generate_rm2():
# rice
repbase_dir = '/public/home/hpc194701009/KmerRepFinder_test/library/rm2_run_lib/potato'
repbase_path = repbase_dir + '/potato-families.fa'
repbase_dir = '/homeb/hukang/KmerRepFinder_test/library/curated_lib/Repbase_28.06/zebrafish'
repbase_path = repbase_dir + '/zebrafish.lib'
tmp_out_dir = repbase_dir + '/zebrafish'
repbase_names, repbase_contigs = read_fasta(repbase_path)
# Repbase files may contain duplicates; remove entries whose sequences are exactly identical
seq_set = set()
for name in repbase_names:
seq = repbase_contigs[name]
if seq in seq_set:
del repbase_contigs[name]
else:
seq_set.add(seq)
store_fasta(repbase_contigs, repbase_path)
repbase_names, repbase_contigs = read_fasta(repbase_path)

tags = set()
for name in repbase_names:
tag = name.split('#')[1]
tags.add(tag)
print(tags)
print(len(tags))

ltr_tags = ['LTR/Gypsy', 'LTR/Copia', 'LTR/Pao', 'LTR/Cassandra', 'LTR', 'LTR/ERVK', 'LTR/ERV1', 'LTR/Unknown', 'LTR/Caulimovirus']
tir_tags = ['Mariner/Tc1', 'DNA transposon', 'DNA/TcMar-Stowaway', 'DNA/TcMar', 'DNA/Maverick', 'DNA/TcMar-Pogo', 'DNA/hAT-Charlie', 'DNA/CMC-EnSpm', 'DNA/CMC', 'DNA/MULE-MuDR', 'DNA/hAT-Tag1', 'DNA/hAT-Ac', 'DNA/hAT-Tip100', 'DNA/PIF-Harbinger', 'Transib', 'piggyBac', 'DNA/P', 'DNA', 'Sola2', 'Kolobok', ]
helitron_tags = ['RC/Helitron', 'MINIME_DN']
non_ltr_tags = ['LINE/L1', 'LINE/RTE-BovB', 'Retroposon', 'Retroposon/L1-derived', 'SINE/tRNA', 'SINE/tRNA-RTE', 'SINE/ID', 'LINE/Rex-Babar', 'SINE', 'R1', 'Jockey', 'CR1', 'R2', 'RTEX', 'Hero', 'RTE']
unknown_tags = ['Unknown']
tmp_out_dir = repbase_dir + '/potato'
# ltr_tags = ['LTR/Gypsy', 'LTR/Copia', 'LTR/Pao', 'LTR/Cassandra', 'LTR', 'LTR/ERVK', 'LTR/ERV1', 'LTR/Unknown', 'LTR/Caulimovirus']
# tir_tags = ['Mariner/Tc1', 'DNA transposon', 'DNA/TcMar-Stowaway', 'DNA/TcMar', 'DNA/Maverick', 'DNA/TcMar-Pogo', 'DNA/hAT-Charlie', 'DNA/CMC-EnSpm', 'DNA/CMC', 'DNA/MULE-MuDR', 'DNA/hAT-Tag1', 'DNA/hAT-Ac', 'DNA/hAT-Tip100', 'DNA/PIF-Harbinger', 'Transib', 'piggyBac', 'DNA/P', 'DNA', 'Sola2', 'Kolobok', ]
# helitron_tags = ['RC/Helitron', 'MINIME_DN']
# non_ltr_tags = ['LINE/L1', 'LINE/RTE-BovB', 'Retroposon', 'Retroposon/L1-derived', 'SINE/tRNA', 'SINE/tRNA-RTE', 'SINE/ID', 'LINE/Rex-Babar', 'SINE', 'R1', 'Jockey', 'CR1', 'R2', 'RTEX', 'Hero', 'RTE']
# unknown_tags = ['Unknown']

if not os.path.exists(tmp_out_dir):
os.makedirs(tmp_out_dir)
ltr_repbase_path = tmp_out_dir + '/ltr.rm2.ref'
tir_repbase_path = tmp_out_dir + '/tir.rm2.ref'
helitron_repbase_path = tmp_out_dir + '/helitron.rm2.ref'
non_ltr_repbase_path = tmp_out_dir + '/non_ltr.rm2.ref'
unknown_repbase_path = tmp_out_dir + '/unknown.rm2.ref'
ltr_repbase_path = tmp_out_dir + '/ltr.ref'
tir_repbase_path = tmp_out_dir + '/tir.ref'
helitron_repbase_path = tmp_out_dir + '/helitron.ref'
non_ltr_repbase_path = tmp_out_dir + '/non_ltr.ref'
unknown_repbase_path = tmp_out_dir + '/unknown.ref'

ltr_contigs = {}
tir_contigs = {}
@@ -153,16 +168,18 @@ def generate_rm2():
unknown_contigs = {}
for name in repbase_names:
tag = name.split('#')[1]
if tag in ltr_tags:
if 'LTR' in tag:
ltr_contigs[name] = repbase_contigs[name]
elif tag in tir_tags:
elif 'DNA' in tag:
tir_contigs[name] = repbase_contigs[name]
elif tag in helitron_tags:
elif 'Helitron' in tag:
helitron_contigs[name] = repbase_contigs[name]
elif tag in non_ltr_tags:
elif 'LINE' in tag or 'SINE' in tag:
non_ltr_contigs[name] = repbase_contigs[name]
elif tag in unknown_tags:
elif 'Unknown' in tag:
unknown_contigs[name] = repbase_contigs[name]
else:
print(tag)
store_fasta(ltr_contigs, ltr_repbase_path)
store_fasta(tir_contigs, tir_repbase_path)
store_fasta(helitron_contigs, helitron_repbase_path)
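The classification in generate_rm2() now keys on substrings of the label after '#' (LTR, DNA, Helitron, LINE/SINE, Unknown) instead of the earlier exhaustive tag lists, and prints any label that matches none of them. A minimal, self-contained sketch of that idea (classify_te_label and bucket_by_label are hypothetical helper names, not HiTE functions):

```python
# Sketch of substring-based bucketing of RepeatMasker-style headers ("name#Label"),
# mirroring the new logic above; classify_te_label and bucket_by_label are hypothetical helpers.
def classify_te_label(label):
    if 'LTR' in label:
        return 'ltr'
    if 'DNA' in label:
        return 'tir'
    if 'Helitron' in label:
        return 'helitron'
    if 'LINE' in label or 'SINE' in label:
        return 'non_ltr'
    if 'Unknown' in label:
        return 'unknown'
    return None  # unrecognized labels fall through, as in the diff's else: print(tag)

def bucket_by_label(contigs):
    buckets = {'ltr': {}, 'tir': {}, 'helitron': {}, 'non_ltr': {}, 'unknown': {}}
    for name, seq in contigs.items():
        label = name.split('#')[1]           # the part after '#' is the classification label
        group = classify_te_label(label)
        if group is None:
            print(label)                     # report labels that match no known group
            continue
        buckets[group][name] = seq
    return buckets
```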
@@ -892,6 +909,7 @@ def draw_dist(input_file):
query_copy_num.append(float(line))

y = list(query_copy_num)
print(len(y))
x = pd.Series(y, name="copy number")
sns.set_theme(style="ticks", font='Times New Roman', font_scale=1.4)
sns.set_context("paper")
@@ -3639,21 +3657,21 @@ def get_logo_seq(ltr_copies):
log = Logger(work_dir + '/HiTE_Non_LTR.log', level='debug')

if __name__ == '__main__':
# Extract the non-ltr entries from RepeatModeler2
work_dir = '/homeb/hukang/KmerRepFinder_test/library/curated_lib/Repbase_28.06/human'
te_path = work_dir + '/human.lib'
names, contigs = read_fasta(te_path)
label_set = set()
for name in names:
label = name.split('#')[1]
label_set.add(label)
print(label_set)
non_ltr_path = work_dir + '/non_ltr.fa'
non_ltr_contigs = {}
for name in names:
if 'LINE' in name or 'SINE' in name:
non_ltr_contigs[name] = contigs[name]
store_fasta(non_ltr_contigs, non_ltr_path)
# # Extract the non-ltr entries from RepeatModeler2
# work_dir = '/homeb/hukang/KmerRepFinder_test/library/curated_lib/Repbase_28.06/human'
# te_path = work_dir + '/human.lib'
# names, contigs = read_fasta(te_path)
# label_set = set()
# for name in names:
# label = name.split('#')[1]
# label_set.add(label)
# print(label_set)
# non_ltr_path = work_dir + '/non_ltr.fa'
# non_ltr_contigs = {}
# for name in names:
# if 'LINE' in name or 'SINE' in name:
# non_ltr_contigs[name] = contigs[name]
# store_fasta(non_ltr_contigs, non_ltr_path)

# align_file = '/home/hukang/test/HiTE/demo/test1/non_ltr_copies_0_0/chr_0:13880942-13881327.blast.bed.fa.maf.fa'
# debug = 1
@@ -4108,7 +4126,7 @@ def get_logo_seq(ltr_copies):
#
# column_data.to_csv('/homeb/hukang/KmerRepFinder_test/library/nextflow_test2/rice/novel_tir/data.csv', index=False)
#
# draw_dist('/homeb/hukang/KmerRepFinder_test/library/nextflow_test2/rice/novel_tir/data.csv')
draw_dist('/homeb/hukang/KmerRepFinder_test/library/nextflow_test2/rice/novel_tir/data.csv')

# Get the novel TIR transposons and obtain their multiple sequence alignments and protein structure information
# tmp_output_dir = '/homeb/hukang/KmerRepFinder_test/library/nextflow_test4/rice'
@@ -4175,7 +4193,7 @@ def get_logo_seq(ltr_copies):
#tmp_output_dir = '/homeb/hukang/KmerRepFinder_test/library/all_tools_run_lib/rice_v7/HiTE'
#generate_zebrafish_repbases()
#generate_repbases()
#generate_rm2()
# generate_rm2()
#generate_EDTA()
# generate_HiTE()
# input = '/public/home/hpc194701009/WebTE_Lib/New_cash_crops/Solanum_tuberosum/GCF_000226075.1_SolTub_3.0_genomic.fna'
28 changes: 15 additions & 13 deletions module/Util.py
@@ -5375,7 +5375,7 @@ def generate_full_length_out(BlastnOut, full_length_out, TE_lib, reference, tmp_
full_length_threshold,
search_struct, tools_dir)

lines = []
lines = set()
for query_name in full_length_annotations.keys():
query_name = str(query_name)
for copy_annotation in full_length_annotations[query_name]:
@@ -5387,15 +5387,9 @@ def generate_full_length_out(BlastnOut, full_length_out, TE_lib, reference, tmp_
chr_start = int(chr_pos_parts[0]) + 1
chr_end = int(chr_pos_parts[1])
new_line = (query_name, chr_name, chr_start, chr_end)
lines.append(new_line)
sorted_lines = sorted(lines, key=lambda x: (x[1], x[2], x[3]))

with open(full_length_out, 'w') as f_save:
for line in sorted_lines:
new_line = line[0] + '\t' + line[1] + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + str(line[2]) + '\t' + str(line[3]) + '\t' + '-1' + '\t' + '-1' + '\n'
f_save.write(new_line)
lines.add(new_line)

return sorted_lines
return lines

def mask_genome_intactTE(TE_lib, genome_path, work_dir, thread, ref_index):
tmp_blast_dir = work_dir + '/mask_tmp_' + str(ref_index)
@@ -11095,10 +11089,10 @@ def multiple_alignment_blast_v1(repeats_path, tools_dir, coverage_threshold, cat
os.system(align_command)

# invoke the function to retrieve the full-length copies.
generate_full_length_out(blastn2Results_path, full_length_out, split_repeats_path, genome_path, tmp_dir, tools_dir,
lines = generate_full_length_out(blastn2Results_path, full_length_out, split_repeats_path, genome_path, tmp_dir, tools_dir,
coverage_threshold, category)

return full_length_out
return lines

def multi_process_align_v1(query_path, subject_path, blastnResults_path, tmp_blast_dir, threads, coverage_threshold, category, is_removed_dir=True):
tools_dir = ''
@@ -11170,9 +11164,17 @@ def multi_process_align_v1(query_path, subject_path, blastnResults_path, tmp_bla
jobs.append(job)
ex.shutdown(wait=True)

lines = set()
for job in as_completed(jobs):
cur_full_length_out = job.result()
os.system('cat ' + cur_full_length_out + ' >> ' + blastnResults_path)
cur_lines = job.result()
lines.update(cur_lines)
lines = list(lines)
sorted_lines = sorted(lines, key=lambda x: (x[1], x[2], x[3]))

with open(blastnResults_path, 'w') as f_save:
for line in sorted_lines:
new_line = line[0] + '\t' + line[1] + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + '-1' + '\t' + str(line[2]) + '\t' + str(line[3]) + '\t' + '-1' + '\t' + '-1' + '\n'
f_save.write(new_line)

if is_removed_dir:
os.system('rm -rf ' + tmp_blast_dir)
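Taken together, the Util.py changes make generate_full_length_out() return a set of (query_name, chr_name, chr_start, chr_end) tuples, which multi_process_align_v1() now aggregates across worker jobs, deduplicates, sorts, and writes once. A compact sketch of that collect-then-write pattern (write_full_length_records is a hypothetical name; the 12-column layout follows the diff above):

```python
# Sketch of the aggregate/deduplicate/sort/write pattern introduced above; the tuple layout
# and the 12-column output follow the diff, but write_full_length_records is a hypothetical name.
def write_full_length_records(worker_results, out_path):
    records = set()                          # a set removes duplicate copies reported by different workers
    for result in worker_results:            # each worker returns tuples of (query_name, chr_name, chr_start, chr_end)
        records.update(result)
    ordered = sorted(records, key=lambda r: (r[1], r[2], r[3]))  # sort by chromosome, then coordinates
    with open(out_path, 'w') as f_save:
        for query_name, chr_name, chr_start, chr_end in ordered:
            fields = [query_name, chr_name] + ['-1'] * 6 + [str(chr_start), str(chr_end), '-1', '-1']
            f_save.write('\t'.join(fields) + '\n')
```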
4 changes: 2 additions & 2 deletions nextflow.config
@@ -14,8 +14,8 @@ params {
conda_name = null // sample: /opt/conda/envs/HiTE
conda_cache = 'local_conda_cache'

docker_name = "kanghu/hite:2.0.4"
singularity_name = "docker://kanghu/hite:2.0.4"
docker_name = "kanghu/hite:3.2.0"
singularity_name = "docker://kanghu/hite:3.2.0"
singularity_cache = 'local_singularity_cache'

// Specify your pipeline's command line flags
2 changes: 1 addition & 1 deletion nextflow_base.config
@@ -1,6 +1,6 @@
/*
========================================================================================
longmethyl Nextflow base config file
HiTE Nextflow base config file
========================================================================================
A 'blank slate' config file, appropriate for general use on most high performance
compute environments. Assumes that all software is installed and available on
