-
Notifications
You must be signed in to change notification settings - Fork 68
/
Copy pathtest_ucsc_gtf.py
105 lines (92 loc) · 3.75 KB
/
test_ucsc_gtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from pyensembl import Genome, Database
from .common import TemporaryDirectory, eq_
from .data import data_path
UCSC_GENCODE_PATH = data_path("gencode.ucsc.small.gtf")
UCSC_REFSEQ_PATH = data_path("refseq.ucsc.small.gtf")
def test_ucsc_gencode_gtf():
with TemporaryDirectory() as tmpdir:
db = Database(UCSC_GENCODE_PATH, cache_directory_path=tmpdir)
df = db._load_gtf_as_dataframe()
exons = df[df["feature"] == "exon"]
# expect 12 exons from the dataframe
assert len(exons) == 12, "Expected 12 exons, got %d: %s" % (len(exons), exons)
def test_ucsc_gencode_genome():
"""
Testing with a small GENCODE GTF file downloaded from
http://genome.ucsc.edu/cgi-bin/hgTables
"""
with TemporaryDirectory() as tmpdir:
genome = Genome(
reference_name="GRCh38",
annotation_name="ucsc_test",
gtf_path_or_url=UCSC_GENCODE_PATH,
cache_directory_path=tmpdir,
)
genome.index()
genes = genome.genes()
for gene in genes:
assert gene.id, "Gene with missing ID in %s" % (genome,)
assert len(genes) == 7, "Expected 7 genes, got %d: %s" % (len(genes), genes)
transcripts = genome.transcripts()
for transcript in transcripts:
assert transcript.id, "Transcript with missing ID in %s" % (genome,)
assert len(transcripts) == 7, "Expected 7 transcripts, got %d: %s" % (
len(transcripts),
transcripts,
)
gene_uc001aak4 = genome.gene_by_id("uc001aak.4")
eq_(gene_uc001aak4.id, "uc001aak.4")
eq_(gene_uc001aak4.name, None)
eq_(gene_uc001aak4.biotype, None)
gene_1_17369 = genome.genes_at_locus("chr1", 17369)
eq_(gene_1_17369[0].id, "uc031tla.1")
transcript_1_30564 = genome.transcripts_at_locus("chr1", 30564)
eq_(transcript_1_30564[0].id, "uc057aty.1")
def test_ucsc_refseq_gtf():
"""
Test GTF object with a small RefSeq GTF file downloaded from
http://genome.ucsc.edu/cgi-bin/hgTables
"""
with TemporaryDirectory() as tmpdir:
db = Database(UCSC_REFSEQ_PATH, cache_directory_path=tmpdir)
df = db._load_gtf_as_dataframe()
exons = df[df["feature"] == "exon"]
# expect 16 exons from the GTF
assert len(exons) == 16, "Expected 16 exons, got %d: %s" % (len(exons), exons)
def test_ucsc_refseq_genome():
"""
Test Genome object with a small RefSeq GTF file downloaded from
http://genome.ucsc.edu/cgi-bin/hgTables
"""
with TemporaryDirectory() as tmpdir:
genome = Genome(
reference_name="GRCh38",
annotation_name="ucsc_test",
gtf_path_or_url=UCSC_REFSEQ_PATH,
cache_directory_path=tmpdir,
)
genome.index()
genes = genome.genes()
for gene in genes:
assert gene.id, "Gene with missing ID in %s" % (
genome.db._load_gtf_as_dataframe(),
)
assert len(genes) == 2, "Expected 2 genes, got %d: %s" % (len(genes), genes)
transcripts = genome.transcripts()
for transcript in transcripts:
assert transcript.id, "Transcript with missing ID in %s" % (
genome.db._load_gtf_as_dataframe(),
)
assert len(transcripts) == 2, "Expected 2 transcripts, got %d: %s" % (
len(transcripts),
transcripts,
)
genes_at_locus = genome.genes_at_locus("chr1", 67092176)
assert (
len(genes_at_locus) == 2
), "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
len(genes_at_locus),
genes_at_locus,
)
ids = set([gene.id for gene in genes_at_locus])
eq_(set(["NM_001276352", "NR_075077"]), ids)