forked from hemberg-lab/scRNA.seq.course
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessing-raw-scrna-seq-data.html
851 lines (792 loc) · 106 KB
/
processing-raw-scrna-seq-data.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Encoding and legacy-IE rendering mode -->
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<!-- Page identity and default social-card metadata -->
<title>Analysis of single cell RNA-seq data</title>
<meta name="description" content="Analysis of single cell RNA-seq data">
<meta name="generator" content="bookdown 0.7 and GitBook 2.6.7">
<meta property="og:title" content="Analysis of single cell RNA-seq data">
<meta property="og:type" content="book">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Analysis of single cell RNA-seq data">
<meta name="author" content="Vladimir Kiselev (wikiselev), Tallulah Andrews, Jennifer Westoby (Jenni_Westoby), Davis McCarthy (davisjmcc), Maren Büttner (marenbuettner) and Martin Hemberg (m_hemberg)">
<meta name="date" content="2018-05-29">
<!-- Mobile viewport and iOS home-screen behaviour -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<!-- Neighbouring chapters -->
<link rel="prev" href="introduction-to-single-cell-rna-seq.html">
<link rel="next" href="construction-of-expression-matrix.html">
<!-- jQuery and the GitBook theme stylesheets -->
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link rel="stylesheet" href="libs/gitbook-2.6.7/css/style.css">
<link rel="stylesheet" href="libs/gitbook-2.6.7/css/plugin-bookdown.css">
<link rel="stylesheet" href="libs/gitbook-2.6.7/css/plugin-highlight.css">
<link rel="stylesheet" href="libs/gitbook-2.6.7/css/plugin-search.css">
<link rel="stylesheet" href="libs/gitbook-2.6.7/css/plugin-fontsettings.css">
<!-- Open Graph metadata (Facebook) -->
<meta property="og:url" content="http://hemberg-lab.github.io/scRNA.seq.course/">
<meta property="og:description" content="In this course we will be surveying the existing problems as well as the available computational and statistical frameworks available for the analysis of scRNA-seq. The course is taught through the University of Cambridge Bioinformatics training unit, but the material found on these pages is meant to be used for anyone interested in learning about computational analysis of scRNA-seq data.">
<meta property="og:image" content="http://hemberg-lab.github.io/scRNA.seq.course/figures/RNA-Seq_workflow-5.pdf.jpg">
<!-- Twitter card metadata; NOTE(review): repeats twitter:card and twitter:title from the block above with different values - confirm which pair is intended -->
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="Analysis of single-cell RNA-seq data">
<meta name="twitter:description" content="In this course we will be surveying the existing problems as well as the available computational and statistical frameworks available for the analysis of scRNA-seq. The course is taught through the University of Cambridge Bioinformatics training unit, but the material found on these pages is meant to be used for anyone interested in learning about computational analysis of scRNA-seq data.">
<meta name="twitter:image" content="http://hemberg-lab.github.io/scRNA.seq.course/figures/RNA-Seq_workflow-5.pdf.jpg">
<!-- Google Analytics -->
<script>
// Google Analytics (analytics.js) bootstrap.
// The IIFE records the global name 'ga' on window, installs a stub under that
// name that queues its arguments in ga.q, stamps the load time in ga.l, and
// inserts an async <script> for the tracker before the first <script> element
// in the document.
// The tracker URL uses an explicit https: scheme. A protocol-relative URL
// would inherit the page's scheme and resolve to file:// when this page is
// opened from disk, or to plain http on a non-TLS host.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Queue the tracker setup for property UA-71525309-1, then a pageview hit.
ga('create', 'UA-71525309-1', 'auto');
ga('send', 'pageview');
</script>
<style type="text/css">
/* Styling for highlighted code listings. */

/* Listing containers and the line-number table layout. */
div.sourceCode {
  overflow-x: auto;
}
table.sourceCode,
tr.sourceCode,
td.lineNumbers,
td.sourceCode {
  margin: 0;
  padding: 0;
  vertical-align: baseline;
  border: none;
}
table.sourceCode {
  width: 100%;
  line-height: 100%;
}
td.lineNumbers {
  text-align: right;
  padding-right: 4px;
  padding-left: 4px;
  color: #aaaaaa;
  border-right: 1px solid #aaaaaa;
}
td.sourceCode {
  padding-left: 5px;
}

/* Token classes. Rule order is kept as generated; only adjacent rules with
   identical declarations share a selector list. */

/* Keyword */
code > span.kw {
  color: #007020;
  font-weight: bold;
}
/* DataType */
code > span.dt {
  color: #902000;
}
/* DecVal, BaseN, Float */
code > span.dv,
code > span.bn,
code > span.fl {
  color: #40a070;
}
/* Char, String */
code > span.ch,
code > span.st {
  color: #4070a0;
}
/* Comment */
code > span.co {
  color: #60a0b0;
  font-style: italic;
}
/* Other */
code > span.ot {
  color: #007020;
}
/* Alert */
code > span.al {
  color: #ff0000;
  font-weight: bold;
}
/* Function */
code > span.fu {
  color: #06287e;
}
/* Error */
code > span.er {
  color: #ff0000;
  font-weight: bold;
}
/* Warning */
code > span.wa {
  color: #60a0b0;
  font-weight: bold;
  font-style: italic;
}
/* Constant */
code > span.cn {
  color: #880000;
}
/* SpecialChar, VerbatimString */
code > span.sc,
code > span.vs {
  color: #4070a0;
}
/* SpecialString */
code > span.ss {
  color: #bb6688;
}
/* Import (intentionally unstyled) */
code > span.im {
}
/* Variable */
code > span.va {
  color: #19177c;
}
/* ControlFlow */
code > span.cf {
  color: #007020;
  font-weight: bold;
}
/* Operator */
code > span.op {
  color: #666666;
}
/* BuiltIn, Extension (intentionally unstyled) */
code > span.bu,
code > span.ex {
}
/* Preprocessor */
code > span.pp {
  color: #bc7a00;
}
/* Attribute */
code > span.at {
  color: #7d9029;
}
/* Documentation */
code > span.do {
  color: #ba2121;
  font-style: italic;
}
/* Annotation, CommentVar, Information */
code > span.an,
code > span.cv,
code > span.in {
  color: #60a0b0;
  font-weight: bold;
  font-style: italic;
}
</style>
<link rel="stylesheet" href="style.css" type="text/css" />
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><a href="index.html">Table of Contents</a></li>
<li class="divider"></li>
<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> About the course</a><ul>
<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#video"><i class="fa fa-check"></i><b>1.1</b> Video</a></li>
<li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#registration"><i class="fa fa-check"></i><b>1.2</b> Registration</a></li>
<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#github"><i class="fa fa-check"></i><b>1.3</b> GitHub</a></li>
<li class="chapter" data-level="1.4" data-path="index.html"><a href="index.html#docker-image-rstudio"><i class="fa fa-check"></i><b>1.4</b> Docker image (RStudio)</a></li>
<li class="chapter" data-level="1.5" data-path="index.html"><a href="index.html#manual-installation"><i class="fa fa-check"></i><b>1.5</b> Manual installation</a></li>
<li class="chapter" data-level="1.6" data-path="index.html"><a href="index.html#license"><i class="fa fa-check"></i><b>1.6</b> License</a></li>
<li class="chapter" data-level="1.7" data-path="index.html"><a href="index.html#prerequisites"><i class="fa fa-check"></i><b>1.7</b> Prerequisites</a></li>
<li class="chapter" data-level="1.8" data-path="index.html"><a href="index.html#contact"><i class="fa fa-check"></i><b>1.8</b> Contact</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html"><i class="fa fa-check"></i><b>2</b> Introduction to single-cell RNA-seq</a><ul>
<li class="chapter" data-level="2.1" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#bulk-rna-seq"><i class="fa fa-check"></i><b>2.1</b> Bulk RNA-seq</a></li>
<li class="chapter" data-level="2.2" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#scrna-seq"><i class="fa fa-check"></i><b>2.2</b> scRNA-seq</a></li>
<li class="chapter" data-level="2.3" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#workflow"><i class="fa fa-check"></i><b>2.3</b> Workflow</a></li>
<li class="chapter" data-level="2.4" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#computational-analysis"><i class="fa fa-check"></i><b>2.4</b> Computational Analysis</a></li>
<li class="chapter" data-level="2.5" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#challenges"><i class="fa fa-check"></i><b>2.5</b> Challenges</a></li>
<li class="chapter" data-level="2.6" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#experimental-methods"><i class="fa fa-check"></i><b>2.6</b> Experimental methods</a></li>
<li class="chapter" data-level="2.7" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#what-platform-to-use-for-my-experiment"><i class="fa fa-check"></i><b>2.7</b> What platform to use for my experiment?</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html"><i class="fa fa-check"></i><b>3</b> Processing Raw scRNA-seq Data</a><ul>
<li class="chapter" data-level="3.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#fastqc"><i class="fa fa-check"></i><b>3.1</b> FastQC</a><ul>
<li class="chapter" data-level="3.1.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#solution-and-downloading-the-report"><i class="fa fa-check"></i><b>3.1.1</b> Solution and Downloading the Report</a></li>
</ul></li>
<li class="chapter" data-level="3.2" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#trimming-reads"><i class="fa fa-check"></i><b>3.2</b> Trimming Reads</a><ul>
<li class="chapter" data-level="3.2.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#solution"><i class="fa fa-check"></i><b>3.2.1</b> Solution</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#file-formats"><i class="fa fa-check"></i><b>3.3</b> File formats</a><ul>
<li class="chapter" data-level="3.3.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#fastq"><i class="fa fa-check"></i><b>3.3.1</b> FastQ</a></li>
<li class="chapter" data-level="3.3.2" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#bam"><i class="fa fa-check"></i><b>3.3.2</b> BAM</a></li>
<li class="chapter" data-level="3.3.3" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#cram"><i class="fa fa-check"></i><b>3.3.3</b> CRAM</a></li>
<li class="chapter" data-level="3.3.4" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#mannually-inspecting-files"><i class="fa fa-check"></i><b>3.3.4</b> Manually Inspecting files</a></li>
<li class="chapter" data-level="3.3.5" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#genome-fasta-gtf"><i class="fa fa-check"></i><b>3.3.5</b> Genome (FASTA, GTF)</a></li>
</ul></li>
<li class="chapter" data-level="3.4" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#demultiplexing"><i class="fa fa-check"></i><b>3.4</b> Demultiplexing</a><ul>
<li class="chapter" data-level="3.4.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#identifying-cell-containing-dropletsmicrowells"><i class="fa fa-check"></i><b>3.4.1</b> Identifying cell-containing droplets/microwells</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#using-star-to-align-reads"><i class="fa fa-check"></i><b>3.5</b> Using STAR to Align Reads</a><ul>
<li class="chapter" data-level="3.5.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#solution-for-star-alignment"><i class="fa fa-check"></i><b>3.5.1</b> Solution for STAR Alignment</a></li>
</ul></li>
<li class="chapter" data-level="3.6" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#kallisto-and-pseudo-alignment"><i class="fa fa-check"></i><b>3.6</b> Kallisto and Pseudo-Alignment</a><ul>
<li class="chapter" data-level="3.6.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#what-is-a-k-mer"><i class="fa fa-check"></i><b>3.6.1</b> What is a k-mer?</a></li>
<li class="chapter" data-level="3.6.2" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#why-map-k-mers-rather-than-reads"><i class="fa fa-check"></i><b>3.6.2</b> Why map k-mers rather than reads?</a></li>
<li class="chapter" data-level="3.6.3" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#kallistos-pseudo-mode"><i class="fa fa-check"></i><b>3.6.3</b> Kallisto’s pseudo mode</a></li>
<li class="chapter" data-level="3.6.4" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#solution-to-kallisto-pseudo-alignment"><i class="fa fa-check"></i><b>3.6.4</b> Solution to Kallisto Pseudo-Alignment</a></li>
<li class="chapter" data-level="3.6.5" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#understanding-the-output-of-kallisto-pseudo-alignment"><i class="fa fa-check"></i><b>3.6.5</b> Understanding the Output of Kallisto Pseudo-Alignment</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="4" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html"><i class="fa fa-check"></i><b>4</b> Construction of expression matrix</a><ul>
<li class="chapter" data-level="4.1" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#reads-qc"><i class="fa fa-check"></i><b>4.1</b> Reads QC</a></li>
<li class="chapter" data-level="4.2" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#reads-alignment"><i class="fa fa-check"></i><b>4.2</b> Reads alignment</a></li>
<li class="chapter" data-level="4.3" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#alignment-example"><i class="fa fa-check"></i><b>4.3</b> Alignment example</a></li>
<li class="chapter" data-level="4.4" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#mapping-qc"><i class="fa fa-check"></i><b>4.4</b> Mapping QC</a></li>
<li class="chapter" data-level="4.5" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#reads-quantification"><i class="fa fa-check"></i><b>4.5</b> Reads quantification</a></li>
<li class="chapter" data-level="4.6" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#umichapter"><i class="fa fa-check"></i><b>4.6</b> Unique Molecular Identifiers (UMIs)</a><ul>
<li class="chapter" data-level="4.6.1" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#introduction"><i class="fa fa-check"></i><b>4.6.1</b> Introduction</a></li>
<li class="chapter" data-level="4.6.2" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#mapping-barcodes"><i class="fa fa-check"></i><b>4.6.2</b> Mapping Barcodes</a></li>
<li class="chapter" data-level="4.6.3" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#counting-barcodes"><i class="fa fa-check"></i><b>4.6.3</b> Counting Barcodes</a></li>
<li class="chapter" data-level="4.6.4" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#correcting-for-errors"><i class="fa fa-check"></i><b>4.6.4</b> Correcting for Errors</a></li>
<li class="chapter" data-level="4.6.5" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#downstream-analysis"><i class="fa fa-check"></i><b>4.6.5</b> Downstream Analysis</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html"><i class="fa fa-check"></i><b>5</b> Introduction to R/Bioconductor</a><ul>
<li class="chapter" data-level="5.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#installing-packages"><i class="fa fa-check"></i><b>5.1</b> Installing packages</a><ul>
<li class="chapter" data-level="5.1.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#cran"><i class="fa fa-check"></i><b>5.1.1</b> CRAN</a></li>
<li class="chapter" data-level="5.1.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#github-1"><i class="fa fa-check"></i><b>5.1.2</b> Github</a></li>
<li class="chapter" data-level="5.1.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#bioconductor"><i class="fa fa-check"></i><b>5.1.3</b> Bioconductor</a></li>
<li class="chapter" data-level="5.1.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#source"><i class="fa fa-check"></i><b>5.1.4</b> Source</a></li>
</ul></li>
<li class="chapter" data-level="5.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#installation-instructions"><i class="fa fa-check"></i><b>5.2</b> Installation instructions:</a></li>
<li class="chapter" data-level="5.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#data-typesclasses"><i class="fa fa-check"></i><b>5.3</b> Data-types/classes</a><ul>
<li class="chapter" data-level="5.3.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#numeric"><i class="fa fa-check"></i><b>5.3.1</b> Numeric</a></li>
<li class="chapter" data-level="5.3.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#characterstring"><i class="fa fa-check"></i><b>5.3.2</b> Character/String</a></li>
<li class="chapter" data-level="5.3.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#logical"><i class="fa fa-check"></i><b>5.3.3</b> Logical</a></li>
<li class="chapter" data-level="5.3.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#factors"><i class="fa fa-check"></i><b>5.3.4</b> Factors</a></li>
<li class="chapter" data-level="5.3.5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#checking-classtype"><i class="fa fa-check"></i><b>5.3.5</b> Checking class/type</a></li>
</ul></li>
<li class="chapter" data-level="5.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#basic-data-structures"><i class="fa fa-check"></i><b>5.4</b> Basic data structures</a></li>
<li class="chapter" data-level="5.5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#more-information"><i class="fa fa-check"></i><b>5.5</b> More information</a></li>
<li class="chapter" data-level="5.6" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#data-types"><i class="fa fa-check"></i><b>5.6</b> Data Types</a><ul>
<li class="chapter" data-level="5.6.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#what-is-tidy-data"><i class="fa fa-check"></i><b>5.6.1</b> What is Tidy Data?</a></li>
<li class="chapter" data-level="5.6.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#what-is-rich-data"><i class="fa fa-check"></i><b>5.6.2</b> What is Rich Data?</a></li>
<li class="chapter" data-level="5.6.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#what-is-bioconductor"><i class="fa fa-check"></i><b>5.6.3</b> What is Bioconductor?</a></li>
<li class="chapter" data-level="5.6.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#singlecellexperiment-class"><i class="fa fa-check"></i><b>5.6.4</b> <code>SingleCellExperiment</code> class</a></li>
<li class="chapter" data-level="5.6.5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#scater-package"><i class="fa fa-check"></i><b>5.6.5</b> <code>scater</code> package</a></li>
</ul></li>
<li class="chapter" data-level="5.7" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#bioconductor-singlecellexperiment-and-scater"><i class="fa fa-check"></i><b>5.7</b> Bioconductor, <code>SingleCellExperiment</code> and <code>scater</code></a><ul>
<li class="chapter" data-level="5.7.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#bioconductor-1"><i class="fa fa-check"></i><b>5.7.1</b> Bioconductor</a></li>
<li class="chapter" data-level="5.7.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#singlecellexperiment-class-1"><i class="fa fa-check"></i><b>5.7.2</b> <code>SingleCellExperiment</code> class</a></li>
<li class="chapter" data-level="5.7.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#scater-package-1"><i class="fa fa-check"></i><b>5.7.3</b> <code>scater</code> package</a></li>
</ul></li>
<li class="chapter" data-level="5.8" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#an-introduction-to-ggplot2"><i class="fa fa-check"></i><b>5.8</b> An Introduction to ggplot2</a><ul>
<li class="chapter" data-level="5.8.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#what-is-ggplot2"><i class="fa fa-check"></i><b>5.8.1</b> What is ggplot2?</a></li>
<li class="chapter" data-level="5.8.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#principles-of-ggplot2"><i class="fa fa-check"></i><b>5.8.2</b> Principles of ggplot2</a></li>
<li class="chapter" data-level="5.8.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#using-the-aes-mapping-function"><i class="fa fa-check"></i><b>5.8.3</b> Using the <code>aes</code> mapping function</a></li>
<li class="chapter" data-level="5.8.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#geoms"><i class="fa fa-check"></i><b>5.8.4</b> Geoms</a></li>
<li class="chapter" data-level="5.8.5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#plotting-data-from-more-than-2-cells"><i class="fa fa-check"></i><b>5.8.5</b> Plotting data from more than 2 cells</a></li>
<li class="chapter" data-level="5.8.6" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#plotting-heatmaps"><i class="fa fa-check"></i><b>5.8.6</b> Plotting heatmaps</a></li>
<li class="chapter" data-level="5.8.7" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#principle-component-analysis"><i class="fa fa-check"></i><b>5.8.7</b> Principal Component Analysis</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="6" data-path="tabula-muris.html"><a href="tabula-muris.html"><i class="fa fa-check"></i><b>6</b> Tabula Muris</a><ul>
<li class="chapter" data-level="6.1" data-path="tabula-muris.html"><a href="tabula-muris.html#introduction-1"><i class="fa fa-check"></i><b>6.1</b> Introduction</a></li>
<li class="chapter" data-level="6.2" data-path="tabula-muris.html"><a href="tabula-muris.html#downloading-the-data"><i class="fa fa-check"></i><b>6.2</b> Downloading the data</a></li>
<li class="chapter" data-level="6.3" data-path="tabula-muris.html"><a href="tabula-muris.html#reading-the-data-smartseq2"><i class="fa fa-check"></i><b>6.3</b> Reading the data (Smartseq2)</a></li>
<li class="chapter" data-level="6.4" data-path="tabula-muris.html"><a href="tabula-muris.html#building-a-scater-object"><i class="fa fa-check"></i><b>6.4</b> Building a scater object</a></li>
<li class="chapter" data-level="6.5" data-path="tabula-muris.html"><a href="tabula-muris.html#reading-the-data-10x"><i class="fa fa-check"></i><b>6.5</b> Reading the data (10X)</a></li>
<li class="chapter" data-level="6.6" data-path="tabula-muris.html"><a href="tabula-muris.html#building-a-scater-object-1"><i class="fa fa-check"></i><b>6.6</b> Building a scater object</a></li>
<li class="chapter" data-level="6.7" data-path="tabula-muris.html"><a href="tabula-muris.html#advanced-exercise"><i class="fa fa-check"></i><b>6.7</b> Advanced Exercise</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html"><i class="fa fa-check"></i><b>7</b> Cleaning the Expression Matrix</a><ul>
<li class="chapter" data-level="7.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#exprs-qc"><i class="fa fa-check"></i><b>7.1</b> Expression QC (UMI)</a><ul>
<li class="chapter" data-level="7.1.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-2"><i class="fa fa-check"></i><b>7.1.1</b> Introduction</a></li>
<li class="chapter" data-level="7.1.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#tung-dataset"><i class="fa fa-check"></i><b>7.1.2</b> Tung dataset</a></li>
<li class="chapter" data-level="7.1.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#cell-qc"><i class="fa fa-check"></i><b>7.1.3</b> Cell QC</a></li>
<li class="chapter" data-level="7.1.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#cell-filtering"><i class="fa fa-check"></i><b>7.1.4</b> Cell filtering</a></li>
<li class="chapter" data-level="7.1.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#compare-filterings"><i class="fa fa-check"></i><b>7.1.5</b> Compare filterings</a></li>
<li class="chapter" data-level="7.1.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#gene-analysis"><i class="fa fa-check"></i><b>7.1.6</b> Gene analysis</a></li>
<li class="chapter" data-level="7.1.7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#save-the-data"><i class="fa fa-check"></i><b>7.1.7</b> Save the data</a></li>
<li class="chapter" data-level="7.1.8" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#big-exercise"><i class="fa fa-check"></i><b>7.1.8</b> Big Exercise</a></li>
<li class="chapter" data-level="7.1.9" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo"><i class="fa fa-check"></i><b>7.1.9</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#expression-qc-reads"><i class="fa fa-check"></i><b>7.2</b> Expression QC (Reads)</a></li>
<li class="chapter" data-level="7.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#data-visualization"><i class="fa fa-check"></i><b>7.3</b> Data visualization</a><ul>
<li class="chapter" data-level="7.3.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-3"><i class="fa fa-check"></i><b>7.3.1</b> Introduction</a></li>
<li class="chapter" data-level="7.3.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#visual-pca"><i class="fa fa-check"></i><b>7.3.2</b> PCA plot</a></li>
<li class="chapter" data-level="7.3.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#visual-tsne"><i class="fa fa-check"></i><b>7.3.3</b> tSNE map</a></li>
<li class="chapter" data-level="7.3.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#big-exercise-1"><i class="fa fa-check"></i><b>7.3.4</b> Big Exercise</a></li>
<li class="chapter" data-level="7.3.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo-1"><i class="fa fa-check"></i><b>7.3.5</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#data-visualization-reads"><i class="fa fa-check"></i><b>7.4</b> Data visualization (Reads)</a></li>
<li class="chapter" data-level="7.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#identifying-confounding-factors"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors</a><ul>
<li class="chapter" data-level="7.5.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-4"><i class="fa fa-check"></i><b>7.5.1</b> Introduction</a></li>
<li class="chapter" data-level="7.5.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#correlations-with-pcs"><i class="fa fa-check"></i><b>7.5.2</b> Correlations with PCs</a></li>
<li class="chapter" data-level="7.5.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#explanatory-variables"><i class="fa fa-check"></i><b>7.5.3</b> Explanatory variables</a></li>
<li class="chapter" data-level="7.5.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#other-confounders"><i class="fa fa-check"></i><b>7.5.4</b> Other confounders</a></li>
<li class="chapter" data-level="7.5.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#exercise"><i class="fa fa-check"></i><b>7.5.5</b> Exercise</a></li>
<li class="chapter" data-level="7.5.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo-2"><i class="fa fa-check"></i><b>7.5.6</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.6</b> Identifying confounding factors (Reads)</a></li>
<li class="chapter" data-level="7.7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalization-theory"><i class="fa fa-check"></i><b>7.7</b> Normalization theory</a><ul>
<li class="chapter" data-level="7.7.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-5"><i class="fa fa-check"></i><b>7.7.1</b> Introduction</a></li>
<li class="chapter" data-level="7.7.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#library-size-1"><i class="fa fa-check"></i><b>7.7.2</b> Library size</a></li>
<li class="chapter" data-level="7.7.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalisations"><i class="fa fa-check"></i><b>7.7.3</b> Normalisations</a></li>
<li class="chapter" data-level="7.7.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#effectiveness"><i class="fa fa-check"></i><b>7.7.4</b> Effectiveness</a></li>
</ul></li>
<li class="chapter" data-level="7.8" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalization-practice-umi"><i class="fa fa-check"></i><b>7.8</b> Normalization practice (UMI)</a><ul>
<li class="chapter" data-level="7.8.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#raw"><i class="fa fa-check"></i><b>7.8.1</b> Raw</a></li>
<li class="chapter" data-level="7.8.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#cpm-1"><i class="fa fa-check"></i><b>7.8.2</b> CPM</a></li>
<li class="chapter" data-level="7.8.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#size-factor-rle"><i class="fa fa-check"></i><b>7.8.3</b> Size-factor (RLE)</a></li>
<li class="chapter" data-level="7.8.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#upperquantile"><i class="fa fa-check"></i><b>7.8.4</b> Upperquantile</a></li>
<li class="chapter" data-level="7.8.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#tmm-1"><i class="fa fa-check"></i><b>7.8.5</b> TMM</a></li>
<li class="chapter" data-level="7.8.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#scran-1"><i class="fa fa-check"></i><b>7.8.6</b> scran</a></li>
<li class="chapter" data-level="7.8.7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#downsampling-1"><i class="fa fa-check"></i><b>7.8.7</b> Downsampling</a></li>
<li class="chapter" data-level="7.8.8" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalisation-for-genetranscript-length"><i class="fa fa-check"></i><b>7.8.8</b> Normalisation for gene/transcript length</a></li>
<li class="chapter" data-level="7.8.9" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#exercise-1"><i class="fa fa-check"></i><b>7.8.9</b> Exercise</a></li>
<li class="chapter" data-level="7.8.10" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.8.10</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.9" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalization-practice-reads"><i class="fa fa-check"></i><b>7.9</b> Normalization practice (Reads)</a></li>
<li class="chapter" data-level="7.10" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.10</b> Dealing with confounders</a><ul>
<li class="chapter" data-level="7.10.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-6"><i class="fa fa-check"></i><b>7.10.1</b> Introduction</a></li>
<li class="chapter" data-level="7.10.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#remove-unwanted-variation"><i class="fa fa-check"></i><b>7.10.2</b> Remove Unwanted Variation</a></li>
<li class="chapter" data-level="7.10.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#combat"><i class="fa fa-check"></i><b>7.10.3</b> Combat</a></li>
<li class="chapter" data-level="7.10.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#mnncorrect"><i class="fa fa-check"></i><b>7.10.4</b> mnnCorrect</a></li>
<li class="chapter" data-level="7.10.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#glm"><i class="fa fa-check"></i><b>7.10.5</b> GLM</a></li>
<li class="chapter" data-level="7.10.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.10.6</b> How to evaluate and compare confounder removal strategies</a></li>
<li class="chapter" data-level="7.10.7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#big-exercise-2"><i class="fa fa-check"></i><b>7.10.7</b> Big Exercise</a></li>
<li class="chapter" data-level="7.10.8" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.10.8</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.11" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#dealing-with-confounders-reads"><i class="fa fa-check"></i><b>7.11</b> Dealing with confounders (Reads)</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="biological-analysis.html"><a href="biological-analysis.html"><i class="fa fa-check"></i><b>8</b> Biological Analysis</a><ul>
<li class="chapter" data-level="8.1" data-path="biological-analysis.html"><a href="biological-analysis.html#clustering-introduction"><i class="fa fa-check"></i><b>8.1</b> Clustering Introduction</a><ul>
<li class="chapter" data-level="8.1.1" data-path="biological-analysis.html"><a href="biological-analysis.html#introduction-7"><i class="fa fa-check"></i><b>8.1.1</b> Introduction</a></li>
<li class="chapter" data-level="8.1.2" data-path="biological-analysis.html"><a href="biological-analysis.html#dimensionality-reductions"><i class="fa fa-check"></i><b>8.1.2</b> Dimensionality reductions</a></li>
<li class="chapter" data-level="8.1.3" data-path="biological-analysis.html"><a href="biological-analysis.html#clustering-methods"><i class="fa fa-check"></i><b>8.1.3</b> Clustering methods</a></li>
<li class="chapter" data-level="8.1.4" data-path="biological-analysis.html"><a href="biological-analysis.html#challenges-in-clustering"><i class="fa fa-check"></i><b>8.1.4</b> Challenges in clustering</a></li>
<li class="chapter" data-level="8.1.5" data-path="biological-analysis.html"><a href="biological-analysis.html#tools-for-scrna-seq-data"><i class="fa fa-check"></i><b>8.1.5</b> Tools for scRNA-seq data</a></li>
<li class="chapter" data-level="8.1.6" data-path="biological-analysis.html"><a href="biological-analysis.html#comparing-clustering"><i class="fa fa-check"></i><b>8.1.6</b> Comparing clustering</a></li>
</ul></li>
<li class="chapter" data-level="8.2" data-path="biological-analysis.html"><a href="biological-analysis.html#clust-methods"><i class="fa fa-check"></i><b>8.2</b> Clustering example</a><ul>
<li class="chapter" data-level="8.2.1" data-path="biological-analysis.html"><a href="biological-analysis.html#deng-dataset"><i class="fa fa-check"></i><b>8.2.1</b> Deng dataset</a></li>
<li class="chapter" data-level="8.2.2" data-path="biological-analysis.html"><a href="biological-analysis.html#sc3-1"><i class="fa fa-check"></i><b>8.2.2</b> SC3</a></li>
<li class="chapter" data-level="8.2.3" data-path="biological-analysis.html"><a href="biological-analysis.html#pcareduce-1"><i class="fa fa-check"></i><b>8.2.3</b> pcaReduce</a></li>
<li class="chapter" data-level="8.2.4" data-path="biological-analysis.html"><a href="biological-analysis.html#tsne-kmeans"><i class="fa fa-check"></i><b>8.2.4</b> tSNE + kmeans</a></li>
<li class="chapter" data-level="8.2.5" data-path="biological-analysis.html"><a href="biological-analysis.html#snn-cliq-1"><i class="fa fa-check"></i><b>8.2.5</b> SNN-Cliq</a></li>
<li class="chapter" data-level="8.2.6" data-path="biological-analysis.html"><a href="biological-analysis.html#sincera-1"><i class="fa fa-check"></i><b>8.2.6</b> SINCERA</a></li>
<li class="chapter" data-level="8.2.7" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-5"><i class="fa fa-check"></i><b>8.2.7</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.3" data-path="biological-analysis.html"><a href="biological-analysis.html#feature-selection"><i class="fa fa-check"></i><b>8.3</b> Feature Selection</a><ul>
<li class="chapter" data-level="8.3.1" data-path="biological-analysis.html"><a href="biological-analysis.html#identifying-genes-vs-a-null-model"><i class="fa fa-check"></i><b>8.3.1</b> Identifying Genes vs a Null Model</a></li>
<li class="chapter" data-level="8.3.2" data-path="biological-analysis.html"><a href="biological-analysis.html#correlated-expression"><i class="fa fa-check"></i><b>8.3.2</b> Correlated Expression</a></li>
<li class="chapter" data-level="8.3.3" data-path="biological-analysis.html"><a href="biological-analysis.html#comparing-methods"><i class="fa fa-check"></i><b>8.3.3</b> Comparing Methods</a></li>
<li class="chapter" data-level="8.3.4" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-6"><i class="fa fa-check"></i><b>8.3.4</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.4" data-path="biological-analysis.html"><a href="biological-analysis.html#pseudotime-analysis"><i class="fa fa-check"></i><b>8.4</b> Pseudotime analysis</a><ul>
<li class="chapter" data-level="8.4.1" data-path="biological-analysis.html"><a href="biological-analysis.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>8.4.1</b> First look at Deng data</a></li>
<li class="chapter" data-level="8.4.2" data-path="biological-analysis.html"><a href="biological-analysis.html#tscan"><i class="fa fa-check"></i><b>8.4.2</b> TSCAN</a></li>
<li class="chapter" data-level="8.4.3" data-path="biological-analysis.html"><a href="biological-analysis.html#monocle"><i class="fa fa-check"></i><b>8.4.3</b> monocle</a></li>
<li class="chapter" data-level="8.4.4" data-path="biological-analysis.html"><a href="biological-analysis.html#diffusion-maps"><i class="fa fa-check"></i><b>8.4.4</b> Diffusion maps</a></li>
<li class="chapter" data-level="8.4.5" data-path="biological-analysis.html"><a href="biological-analysis.html#slicer"><i class="fa fa-check"></i><b>8.4.5</b> SLICER</a></li>
<li class="chapter" data-level="8.4.6" data-path="biological-analysis.html"><a href="biological-analysis.html#ouija"><i class="fa fa-check"></i><b>8.4.6</b> Ouija</a></li>
<li class="chapter" data-level="8.4.7" data-path="biological-analysis.html"><a href="biological-analysis.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>8.4.7</b> Comparison of the methods</a></li>
<li class="chapter" data-level="8.4.8" data-path="biological-analysis.html"><a href="biological-analysis.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>8.4.8</b> Expression of genes through time</a></li>
<li class="chapter" data-level="8.4.9" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-7"><i class="fa fa-check"></i><b>8.4.9</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.5" data-path="biological-analysis.html"><a href="biological-analysis.html#imputation"><i class="fa fa-check"></i><b>8.5</b> Imputation</a><ul>
<li class="chapter" data-level="8.5.1" data-path="biological-analysis.html"><a href="biological-analysis.html#scimpute"><i class="fa fa-check"></i><b>8.5.1</b> scImpute</a></li>
<li class="chapter" data-level="8.5.2" data-path="biological-analysis.html"><a href="biological-analysis.html#magic"><i class="fa fa-check"></i><b>8.5.2</b> MAGIC</a></li>
<li class="chapter" data-level="8.5.3" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-8"><i class="fa fa-check"></i><b>8.5.3</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.6" data-path="biological-analysis.html"><a href="biological-analysis.html#dechapter"><i class="fa fa-check"></i><b>8.6</b> Differential Expression (DE) analysis</a><ul>
<li class="chapter" data-level="8.6.1" data-path="biological-analysis.html"><a href="biological-analysis.html#bulk-rna-seq-1"><i class="fa fa-check"></i><b>8.6.1</b> Bulk RNA-seq</a></li>
<li class="chapter" data-level="8.6.2" data-path="biological-analysis.html"><a href="biological-analysis.html#single-cell-rna-seq"><i class="fa fa-check"></i><b>8.6.2</b> Single cell RNA-seq</a></li>
<li class="chapter" data-level="8.6.3" data-path="biological-analysis.html"><a href="biological-analysis.html#differences-in-distribution"><i class="fa fa-check"></i><b>8.6.3</b> Differences in Distribution</a></li>
<li class="chapter" data-level="8.6.4" data-path="biological-analysis.html"><a href="biological-analysis.html#models-of-single-cell-rnaseq-data"><i class="fa fa-check"></i><b>8.6.4</b> Models of single-cell RNASeq data</a></li>
</ul></li>
<li class="chapter" data-level="8.7" data-path="biological-analysis.html"><a href="biological-analysis.html#de-in-a-real-dataset"><i class="fa fa-check"></i><b>8.7</b> DE in a real dataset</a><ul>
<li class="chapter" data-level="8.7.1" data-path="biological-analysis.html"><a href="biological-analysis.html#introduction-8"><i class="fa fa-check"></i><b>8.7.1</b> Introduction</a></li>
<li class="chapter" data-level="8.7.2" data-path="biological-analysis.html"><a href="biological-analysis.html#kolmogorov-smirnov-test"><i class="fa fa-check"></i><b>8.7.2</b> Kolmogorov-Smirnov test</a></li>
<li class="chapter" data-level="8.7.3" data-path="biological-analysis.html"><a href="biological-analysis.html#wilcoxmann-whitney-u-test"><i class="fa fa-check"></i><b>8.7.3</b> Wilcox/Mann-Whitney-U Test</a></li>
<li class="chapter" data-level="8.7.4" data-path="biological-analysis.html"><a href="biological-analysis.html#edger"><i class="fa fa-check"></i><b>8.7.4</b> edgeR</a></li>
<li class="chapter" data-level="8.7.5" data-path="biological-analysis.html"><a href="biological-analysis.html#monocle-1"><i class="fa fa-check"></i><b>8.7.5</b> Monocle</a></li>
<li class="chapter" data-level="8.7.6" data-path="biological-analysis.html"><a href="biological-analysis.html#mast"><i class="fa fa-check"></i><b>8.7.6</b> MAST</a></li>
<li class="chapter" data-level="8.7.7" data-path="biological-analysis.html"><a href="biological-analysis.html#slow-methods-1h-to-run"><i class="fa fa-check"></i><b>8.7.7</b> Slow Methods (>1h to run)</a></li>
<li class="chapter" data-level="8.7.8" data-path="biological-analysis.html"><a href="biological-analysis.html#bpsc"><i class="fa fa-check"></i><b>8.7.8</b> BPSC</a></li>
<li class="chapter" data-level="8.7.9" data-path="biological-analysis.html"><a href="biological-analysis.html#scde"><i class="fa fa-check"></i><b>8.7.9</b> SCDE</a></li>
<li class="chapter" data-level="8.7.10" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-9"><i class="fa fa-check"></i><b>8.7.10</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.8" data-path="biological-analysis.html"><a href="biological-analysis.html#comparingcombining-scrnaseq-datasets"><i class="fa fa-check"></i><b>8.8</b> Comparing/Combining scRNASeq datasets</a><ul>
<li class="chapter" data-level="8.8.1" data-path="biological-analysis.html"><a href="biological-analysis.html#introduction-9"><i class="fa fa-check"></i><b>8.8.1</b> Introduction</a></li>
<li class="chapter" data-level="8.8.2" data-path="biological-analysis.html"><a href="biological-analysis.html#datasets"><i class="fa fa-check"></i><b>8.8.2</b> Datasets</a></li>
<li class="chapter" data-level="8.8.3" data-path="biological-analysis.html"><a href="biological-analysis.html#projecting-cells-onto-annotated-cell-types-scmap"><i class="fa fa-check"></i><b>8.8.3</b> Projecting cells onto annotated cell-types (scmap)</a></li>
<li class="chapter" data-level="8.8.4" data-path="biological-analysis.html"><a href="biological-analysis.html#cell-to-cell-mapping"><i class="fa fa-check"></i><b>8.8.4</b> Cell-to-Cell mapping</a></li>
<li class="chapter" data-level="8.8.5" data-path="biological-analysis.html"><a href="biological-analysis.html#metaneighbour"><i class="fa fa-check"></i><b>8.8.5</b> Metaneighbour</a></li>
<li class="chapter" data-level="8.8.6" data-path="biological-analysis.html"><a href="biological-analysis.html#mnncorrect-1"><i class="fa fa-check"></i><b>8.8.6</b> mnnCorrect</a></li>
<li class="chapter" data-level="8.8.7" data-path="biological-analysis.html"><a href="biological-analysis.html#cannonical-correlation-analysis-seurat"><i class="fa fa-check"></i><b>8.8.7</b> Canonical Correlation Analysis (Seurat)</a></li>
<li class="chapter" data-level="8.8.8" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-10"><i class="fa fa-check"></i><b>8.8.8</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.9" data-path="biological-analysis.html"><a href="biological-analysis.html#search-scrna-seq-data"><i class="fa fa-check"></i><b>8.9</b> Search scRNA-Seq data</a><ul>
<li class="chapter" data-level="8.9.1" data-path="biological-analysis.html"><a href="biological-analysis.html#about"><i class="fa fa-check"></i><b>8.9.1</b> About</a></li>
<li class="chapter" data-level="8.9.2" data-path="biological-analysis.html"><a href="biological-analysis.html#dataset"><i class="fa fa-check"></i><b>8.9.2</b> Dataset</a></li>
<li class="chapter" data-level="8.9.3" data-path="biological-analysis.html"><a href="biological-analysis.html#gene-index"><i class="fa fa-check"></i><b>8.9.3</b> Gene Index</a></li>
<li class="chapter" data-level="8.9.4" data-path="biological-analysis.html"><a href="biological-analysis.html#marker-genes"><i class="fa fa-check"></i><b>8.9.4</b> Marker genes</a></li>
<li class="chapter" data-level="8.9.5" data-path="biological-analysis.html"><a href="biological-analysis.html#search-cells-by-a-gene-list"><i class="fa fa-check"></i><b>8.9.5</b> Search cells by a gene list</a></li>
<li class="chapter" data-level="8.9.6" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-11"><i class="fa fa-check"></i><b>8.9.6</b> sessionInfo()</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="9" data-path="seurat-chapter.html"><a href="seurat-chapter.html"><i class="fa fa-check"></i><b>9</b> Seurat</a><ul>
<li class="chapter" data-level="9.1" data-path="seurat-chapter.html"><a href="seurat-chapter.html#seurat-object-class"><i class="fa fa-check"></i><b>9.1</b> <code>Seurat</code> object class</a></li>
<li class="chapter" data-level="9.2" data-path="seurat-chapter.html"><a href="seurat-chapter.html#expression-qc"><i class="fa fa-check"></i><b>9.2</b> Expression QC</a></li>
<li class="chapter" data-level="9.3" data-path="seurat-chapter.html"><a href="seurat-chapter.html#normalization"><i class="fa fa-check"></i><b>9.3</b> Normalization</a></li>
<li class="chapter" data-level="9.4" data-path="seurat-chapter.html"><a href="seurat-chapter.html#highly-variable-genes-1"><i class="fa fa-check"></i><b>9.4</b> Highly variable genes</a></li>
<li class="chapter" data-level="9.5" data-path="seurat-chapter.html"><a href="seurat-chapter.html#dealing-with-confounders-1"><i class="fa fa-check"></i><b>9.5</b> Dealing with confounders</a></li>
<li class="chapter" data-level="9.6" data-path="seurat-chapter.html"><a href="seurat-chapter.html#linear-dimensionality-reduction"><i class="fa fa-check"></i><b>9.6</b> Linear dimensionality reduction</a></li>
<li class="chapter" data-level="9.7" data-path="seurat-chapter.html"><a href="seurat-chapter.html#significant-pcs"><i class="fa fa-check"></i><b>9.7</b> Significant PCs</a></li>
<li class="chapter" data-level="9.8" data-path="seurat-chapter.html"><a href="seurat-chapter.html#clustering-cells"><i class="fa fa-check"></i><b>9.8</b> Clustering cells</a></li>
<li class="chapter" data-level="9.9" data-path="seurat-chapter.html"><a href="seurat-chapter.html#marker-genes-1"><i class="fa fa-check"></i><b>9.9</b> Marker genes</a></li>
<li class="chapter" data-level="9.10" data-path="seurat-chapter.html"><a href="seurat-chapter.html#sessioninfo-12"><i class="fa fa-check"></i><b>9.10</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="10" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><i class="fa fa-check"></i><b>10</b> “Ideal” scRNAseq pipeline (as of Oct 2017)</a><ul>
<li class="chapter" data-level="10.1" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html#experimental-design"><i class="fa fa-check"></i><b>10.1</b> Experimental Design</a></li>
<li class="chapter" data-level="10.2" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html#processing-reads"><i class="fa fa-check"></i><b>10.2</b> Processing Reads</a></li>
<li class="chapter" data-level="10.3" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html#preparing-expression-matrix"><i class="fa fa-check"></i><b>10.3</b> Preparing Expression Matrix</a></li>
<li class="chapter" data-level="10.4" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html#biological-interpretation"><i class="fa fa-check"></i><b>10.4</b> Biological Interpretation</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="advanced-exercises.html"><a href="advanced-exercises.html"><i class="fa fa-check"></i><b>11</b> Advanced exercises</a></li>
<li class="chapter" data-level="12" data-path="resources.html"><a href="resources.html"><i class="fa fa-check"></i><b>12</b> Resources</a><ul>
<li class="chapter" data-level="12.1" data-path="resources.html"><a href="resources.html#scrna-seq-protocols"><i class="fa fa-check"></i><b>12.1</b> scRNA-seq protocols</a></li>
<li class="chapter" data-level="12.2" data-path="resources.html"><a href="resources.html#external-rna-control-consortium-ercc"><i class="fa fa-check"></i><b>12.2</b> External RNA Control Consortium (ERCC)</a></li>
<li class="chapter" data-level="12.3" data-path="resources.html"><a href="resources.html#scrna-seq-analysis-tools"><i class="fa fa-check"></i><b>12.3</b> scRNA-seq analysis tools</a></li>
<li class="chapter" data-level="12.4" data-path="resources.html"><a href="resources.html#scrna-seq-public-datasets"><i class="fa fa-check"></i><b>12.4</b> scRNA-seq public datasets</a></li>
</ul></li>
<li class="chapter" data-level="13" data-path="references.html"><a href="references.html"><i class="fa fa-check"></i><b>13</b> References</a></li>
<li class="divider"></li>
<li><a href="http://www.sanger.ac.uk/science/groups/hemberg-group" target="_blank" rel="noopener">Hemberg Lab</a></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Analysis of single cell RNA-seq data</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="processing-raw-scrna-seq-data" class="section level1">
<h1><span class="header-section-number">3</span> Processing Raw scRNA-seq Data</h1>
<div id="fastqc" class="section level2">
<h2><span class="header-section-number">3.1</span> FastQC</h2>
<p>Once you’ve obtained your single-cell RNA-seq data, the first thing you need to do with it is check the quality of the reads you have sequenced. For this task, today we will be using a tool called FastQC. FastQC is a quality control tool for sequencing data, which can be used for both bulk and single-cell RNA-seq data. FastQC takes sequencing data as input and returns a report on read quality. Copy and paste this link into your browser to visit the FastQC website:</p>
<p><a href="https://www.bioinformatics.babraham.ac.uk/projects/fastqc/" class="uri">https://www.bioinformatics.babraham.ac.uk/projects/fastqc/</a></p>
<p>This website contains links to download and install FastQC and documentation on the reports produced. Fortunately we have already installed FastQC for you today, so instead we will take a look at the documentation. Scroll down the webpage to ‘Example Reports’ and click ‘Good Illumina Data’. This gives an example of what an ideal report should look like for high quality Illumina reads data.</p>
<p>Now let’s make a FastQC report ourselves.</p>
<p>Today we will be performing our analysis using a single cell from an mESC dataset produced by <span class="citation">(Kolodziejczyk et al. <a href="#ref-Kolodziejczyk2015-xy">2015</a>)</span>. The cells were sequenced using the SMART-seq2 library preparation protocol and the reads are paired end. The files are located in <code>Share</code>.</p>
<p><strong>Note</strong> The current text of the course is written for an AWS server for people who attend our course in person. You will have to download the files (both <code>ERR522959_1.fastq</code> and <code>ERR522959_2.fastq</code>) and create the <code>Share</code> directory yourself to run the commands. You can find the files here: <a href="https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-2600/samples/" class="uri">https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-2600/samples/</a></p>
<p>Now let’s look at the files:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">less</span> Share/ERR522959_1.fastq
<span class="fu">less</span> Share/ERR522959_2.fastq</code></pre></div>
<p>Task 1: Try to work out what command you should use to produce the FastQC report. Hint: Try executing</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="ex">fastqc</span> -h</code></pre></div>
<p>This command will tell you what options are available to pass to FastQC. Feel free to ask for help if you get stuck! If you are successful, you should generate a .zip and a .html file for both the forwards and the reverse reads files. Once you have been successful, feel free to have a go at the next section.</p>
<div id="solution-and-downloading-the-report" class="section level3">
<h3><span class="header-section-number">3.1.1</span> Solution and Downloading the Report</h3>
<p>If you haven’t done so already, generate the FastQC report using the commands below:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> fastqc_results
<span class="ex">fastqc</span> -o fastqc_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq</code></pre></div>
<p>Once the command has finished executing, you should have a total of four files - one zip file for each of the paired end reads, and one html file for each of the paired end reads. The report is in the html file. To view it, we will need to get it off AWS and onto your computer using either filezilla or scp. Ask an instructor if you are having difficulties.</p>
<p>Once the file is on your computer, click on it. Your FastQC report should open. Have a look through the file. Remember to look at both the forwards and the reverse end read reports! How good is the quality of the reads? Is there anything we should be concerned about? How might we address those concerns?</p>
<p>Feel free to chat to one of the instructors about your ideas.</p>
</div>
</div>
<div id="trimming-reads" class="section level2">
<h2><span class="header-section-number">3.2</span> Trimming Reads</h2>
<p>Fortunately there is software available for read trimming. Today we will be using Trim Galore!. Trim Galore! is a wrapper for the reads trimming software cutadapt.</p>
<p>Read trimming software can be used to trim sequencing adapters and/or low quality bases from the ends of reads. Given we noticed there was some adapter contamination in our FastQC report, it is a good idea to trim adapters from our data.</p>
<p>Task 2: What type of adapters were used in our data? Hint: Look at the FastQC report ‘Adapter Content’ plot.</p>
<p>Now let’s try to use Trim Galore! to remove those problematic adapters. It’s a good idea to check read quality again after trimming, so after you have trimmed your reads you should use FastQC to produce another report.</p>
<p>Task 3: Work out the command you should use to trim the adapters from our data. Hint 1: You can use</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="ex">trim_galore</span> -h</code></pre></div>
<p>To find out what options you can pass to Trim Galore. Hint 2: Read through the output of the above command carefully. The adaptor used in this experiment is quite common. Do you need to know the actual sequence of the adaptor to remove it?</p>
<p>Task 4: Produce a FastQC report for your trimmed reads files. Is the adapter contamination gone?</p>
<p>Once you think you have successfully trimmed your reads and have confirmed this by checking the FastQC report, feel free to check your results using the next section.</p>
<div id="solution" class="section level3">
<h3><span class="header-section-number">3.2.1</span> Solution</h3>
<p>You can use the command(s) below to trim the Nextera sequencing adapters:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> fastqc_trimmed_results
<span class="ex">trim_galore</span> --nextera -o fastqc_trimmed_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq</code></pre></div>
<p>Remember to generate new FastQC reports for your trimmed reads files! FastQC should now show that your reads pass the ‘Adaptor Content’ plot. Feel free to ask one of the instructors if you have any questions.</p>
<p>Congratulations! You have now generated reads quality reports and performed adaptor trimming. In the next lab, we will use STAR and Kallisto to align our trimmed and quality-checked reads to a reference transcriptome.</p>
</div>
</div>
<div id="file-formats" class="section level2">
<h2><span class="header-section-number">3.3</span> File formats</h2>
<div id="fastq" class="section level3">
<h3><span class="header-section-number">3.3.1</span> FastQ</h3>
<p>FastQ is the most raw form of scRNASeq data you will encounter. All scRNASeq protocols are sequenced with paired-end sequencing. Barcode sequences may occur in one or both reads depending on the protocol employed. However, protocols using unique molecular identifiers (UMIs) will generally contain one read with the cell and UMI barcodes plus adapters but without any transcript sequence. Thus reads will be mapped as if they are single-end sequenced despite actually being paired end.</p>
<p>FastQ files have the format:</p>
<pre class="eval"><code>@ReadID
READ SEQUENCE
+
SEQUENCING QUALITY SCORES</code></pre>
</div>
<div id="bam" class="section level3">
<h3><span class="header-section-number">3.3.2</span> BAM</h3>
<p>BAM file format stores mapped reads in a standard and efficient manner. The human-readable version is called a SAM file, while the BAM file is the highly compressed version. BAM/SAM files contain a header which typically includes<br />
information on the sample preparation, sequencing and mapping; and a tab-separated row for each individual alignment of each read.</p>
<p>Alignment rows employ a standard format with the following columns:</p>
<ol style="list-style-type: decimal">
<li><p>QNAME : read name (generally will include UMI barcode if applicable)</p></li>
<li><p>FLAG : number tag indicating the “type” of alignment, <a href="https://broadinstitute.github.io/picard/explain-flags.html">link</a> to explanation of all possible “types”</p></li>
<li><p>RNAME : reference sequence name (i.e. chromosome read is mapped to).</p></li>
<li><p>POS : leftmost mapping position</p></li>
<li><p>MAPQ : Mapping quality</p></li>
<li><p>CIGAR : string indicating the matching/mismatching parts of the read (may include soft-clipping).</p></li>
<li><p>RNEXT : reference name of the mate/next read</p></li>
<li><p>PNEXT : POS for mate/next read</p></li>
<li><p>TLEN : Template length (signed length of the reference region spanned by the read and its mate; 0 if unavailable)</p></li>
<li><p>SEQ : read sequence</p></li>
<li><p>QUAL : read quality</p></li>
</ol>
<p>BAM/SAM files can be converted to the other format using ‘samtools’:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="ex">samtools</span> view -S -b file.sam <span class="op">></span> file.bam
<span class="ex">samtools</span> view -h file.bam <span class="op">></span> file.sam</code></pre></div>
<p>Some sequencing facilities will automatically map your reads to a standard genome and deliver either BAM or CRAM formatted files. Generally they will not have included ERCC sequences in the genome thus no ERCC reads will be mapped in the BAM/CRAM file. To quantify ERCCs (or any other genetic alterations) or if you just want to use a different alignment algorithm than whatever is in the generic pipeline (often outdated), then you will need to convert the BAM/CRAM files back to FastQs:</p>
<p>BAM files can be converted to FastQ using bedtools. To ensure a single copy for multi-mapping reads first sort by read name and remove secondary alignments using samtools. <a href="https://broadinstitute.github.io/picard/index.html">Picard</a> also contains a method for converting BAM to FastQ files.</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="co"># sort reads by name</span>
<span class="ex">samtools</span> sort -n original.bam -o sorted_by_name.bam
<span class="co"># remove secondary alignments</span>
<span class="ex">samtools</span> view -b -F 256 sorted_by_name.bam -o primary_alignment_only.bam
<span class="co"># convert to fastq</span>
<span class="ex">bedtools</span> bamtofastq -i primary_alignment_only.bam -fq read1.fq -fq2 read2.fq</code></pre></div>
</div>
<div id="cram" class="section level3">
<h3><span class="header-section-number">3.3.3</span> CRAM</h3>
<p><a href="https://www.ebi.ac.uk/ena/software/cram-usage">CRAM</a> files are similar to BAM files only they contain information in the header about the reference genome used in the mapping. This allows the bases in each read that are identical to the reference to be further compressed. CRAM also supports some lossy data compression approaches to further optimize storage compared to BAMs. CRAMs are mainly used by the Sanger/EBI sequencing facility.</p>
<p>CRAM and BAM files can be interchanged using the latest version of samtools (>=v1.0). However, this conversion may require downloading the reference genome into cache. Alternatively, you may pre-download the correct reference either from metadata in the header of the CRAM file, or from talking to whomever generated the CRAM and specify that file using ‘-T’. Thus we recommend setting a specific cache location prior to doing this:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="bu">export</span> <span class="va">REF_CACHE=</span>/path_to/cache_directory_for_reference_genome
<span class="ex">samtools</span> view -b -h -T reference_genome.fasta file.cram -o file.bam
<span class="ex">samtools</span> view -C -h -T reference_genome.fasta file.bam -o file.cram</code></pre></div>
</div>
<div id="mannually-inspecting-files" class="section level3">
<h3><span class="header-section-number">3.3.4</span> Manually Inspecting files</h3>
<p>At times it may be useful to manually inspect files, for example to check the metadata in headers to confirm that the files are from the correct sample. ‘less’ and ‘more’ can be used to inspect any text files from the command line. By “pipe-ing” the output of samtools view into these commands using ‘|’ we can check each of these file types without having to save multiple copies of each file.</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">less</span> file.txt
<span class="fu">more</span> file.txt
<span class="co"># counts the number of lines in file.txt</span>
<span class="fu">wc</span> -l file.txt
<span class="ex">samtools</span> view -h file.[cram/bam] <span class="kw">|</span> <span class="fu">more</span>
<span class="co"># counts the number of lines in the samtools output</span>
<span class="ex">samtools</span> view -h file.[cram/bam] <span class="kw">|</span> <span class="fu">wc</span> -l</code></pre></div>
<p><strong>Exercises</strong></p>
<p>You have been provided with a small cram file: EXAMPLE.cram</p>
<p>Task 1: How was this file aligned? What software was used? What was used as the genome? (Hint: check the header)</p>
<p>Task 2: How many reads are unmapped/mapped? How many total reads are there? How many secondary alignments are present? (Hint: use the FLAG)</p>
<p>Task 3: Convert the CRAM into two Fastq files. Did you get exactly one copy of each read? (name these files “10cells_read1.fastq” “10cells_read2.fastq”)</p>
<p>If you get stuck, help information for each piece of software can be displayed by running the command “naked” - e.g. ‘samtools view’, ‘bedtools’</p>
<p><strong>Answer</strong></p>
</div>
<div id="genome-fasta-gtf" class="section level3">
<h3><span class="header-section-number">3.3.5</span> Genome (FASTA, GTF)</h3>
<p>To map your reads you will also need the reference genome and in many cases the genome annotation file (in either GTF or GFF format). These can be downloaded for model organisms from any of the main genomics databases: <a href="http://www.ensembl.org/info/data/ftp/index.html">Ensembl</a>, <a href="ftp://ftp.ncbi.nih.gov/genomes/">NCBI</a>, or <a href="http://hgdownload.soe.ucsc.edu/downloads.html">UCSC Genome Browser</a>.</p>
<p>GTF files contain annotations of genes, transcripts, and exons. They must contain: (1) seqname : chromosome/scaffold (2) source : where this annotation came from (3) feature : what kind of feature is this? (e.g. gene, transcript, exon) (4) start : start position (bp) (5) end : end position (bp) (6) score : a number (7) strand : + (forward) or - (reverse) (8) frame : if CDS indicates which base is the first base of the first codon (0 = first base, 1 = second base, etc..) (9) attribute : semicolon-separated list of tag-value pairs of extra information (e.g. names/IDs, biotype)</p>
<p>Empty fields are marked with “.”</p>
<p>In our experience Ensembl is the easiest of these to use, and has the largest set of annotations. NCBI tends to be more strict in including only high confidence gene annotations. Whereas UCSC contains multiple geneset annotations that use different criteria.</p>
<p>If your experimental system includes non-standard sequences these must be added to both the genome fasta and gtf to quantify their expression. Most commonly this is done for the ERCC spike-ins, although the same must be done for CRISPR-related sequences or other overexpression/reporter constructs.</p>
<p>For maximum utility/flexibility we recommend creating complete and detailed entries for any non-standard sequences added.</p>
<p>There is no standardized way to do this. So below is our custom perl script for creating a gtf and fasta file for ERCCs which can be appended to the genome. You may also need to alter a gtf file to deal with repetitive elements in introns when/if you want to quantify intronic reads. Any scripting language or even ‘awk’ and/or some text editors can be used to do this relatively efficiently, but they are beyond the scope of this course.</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="co"># Converts the Annotation file from </span>
<span class="co"># https://www.thermofisher.com/order/catalog/product/4456740 to </span>
<span class="co"># gtf and fasta files that can be added to existing genome fasta & gtf files.</span>
<span class="ex">my</span> @FASTAlines = ();
<span class="ex">my</span> @GTFlines = ();
<span class="ex">open</span> (my <span class="va">$ifh</span>, <span class="st">"ERCC_Controls_Annotation.txt"</span>) <span class="ex">or</span> die <span class="va">$!</span><span class="kw">;</span>
<span class="op"><</span><span class="va">$ifh</span><span class="op">></span>; <span class="co">#header</span>
<span class="kw">while</span> <span class="kw">(</span><span class="op"><</span><span class="va">$ifh</span><span class="op">></span><span class="kw">)</span> <span class="kw">{</span>
<span class="co"># Do all the important stuff</span>
<span class="ex">chomp</span><span class="kw">;</span>
<span class="ex">my</span> @record = split(/\t/);
<span class="ex">my</span> <span class="va">$sequence</span> = <span class="va">$record[4]</span><span class="kw">;</span>
<span class="va">$sequence</span> =<span class="ex">~</span> <span class="ex">s</span>/\<span class="ex">s+//g</span>; <span class="co"># get rid of any preceeding/tailing white space</span>
<span class="va">$sequence</span> = <span class="va">$sequence</span><span class="ex">.</span><span class="st">"NNNN"</span><span class="kw">;</span>
<span class="ex">my</span> <span class="va">$name</span> = <span class="va">$record[0]</span><span class="kw">;</span>
<span class="ex">my</span> <span class="va">$genbank</span> = <span class="va">$record[1]</span><span class="kw">;</span>
<span class="ex">push</span>(@FASTAlines, <span class="st">"></span><span class="va">$name</span><span class="st">\n</span><span class="va">$sequence</span><span class="st">\n"</span>);
<span class="co"># is GTF 1 indexed or 0 indexed? -> it is 1 indexed</span>
<span class="co"># + or - strand?</span>
<span class="ex">push</span>(@GTFlines, <span class="st">"</span><span class="va">$name</span><span class="st">\tERCC\tgene\t1\t"</span>.(length(<span class="va">$sequence</span>)<span class="ex">-2</span>)<span class="ex">.</span><span class="st">"\t.\t+\t.\tgene_id </span><span class="dt">\"</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\"</span><span class="st">; transcript_id </span><span class="dt">\"</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\"</span><span class="st">; exon_number </span><span class="dt">\"</span><span class="st">1</span><span class="dt">\"</span><span class="st">; gene_name </span><span class="dt">\"</span><span class="st">ERCC </span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\"</span><span class="st">\n"</span>);
<span class="ex">push</span>(@GTFlines, <span class="st">"</span><span class="va">$name</span><span class="st">\tERCC\ttranscript\t1\t"</span>.(length(<span class="va">$sequence</span>)<span class="ex">-2</span>)<span class="ex">.</span><span class="st">"\t.\t+\t.\tgene_id </span><span class="dt">\"</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\"</span><span class="st">; transcript_id </span><span class="dt">\"</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\"</span><span class="st">; exon_number </span><span class="dt">\"</span><span class="st">1</span><span class="dt">\"</span><span class="st">; gene_name </span><span class="dt">\"</span><span class="st">ERCC </span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\"</span><span class="st">\n"</span>);
<span class="ex">push</span>(@GTFlines, <span class="st">"</span><span class="va">$name</span><span class="st">\tERCC\texon\t1\t"</span>.(length(<span class="va">$sequence</span>)<span class="ex">-2</span>)<span class="ex">.</span><span class="st">"\t.\t+\t.\tgene_id </span><span class="dt">\"</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\"</span><span class="st">; transcript_id </span><span class="dt">\"</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\"</span><span class="st">; exon_number </span><span class="dt">\"</span><span class="st">1</span><span class="dt">\"</span><span class="st">; gene_name </span><span class="dt">\"</span><span class="st">ERCC </span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\"</span><span class="st">\n"</span>);
<span class="kw">}</span> <span class="ex">close</span>(<span class="va">$ifh</span>);
<span class="co"># Write output</span>
<span class="ex">open</span>(my <span class="va">$ofh</span>, <span class="st">">"</span>, <span class="st">"ERCC_Controls.fa"</span>) <span class="ex">or</span> die <span class="va">$!</span><span class="kw">;</span>
<span class="ex">foreach</span> my <span class="va">$line</span> (@FASTAlines) <span class="kw">{</span>
<span class="ex">print</span> <span class="va">$ofh</span> <span class="va">$line</span><span class="kw">;</span>
<span class="kw">}</span> <span class="ex">close</span> (<span class="va">$ofh</span>);
<span class="ex">open</span>(<span class="va">$ofh</span>, <span class="st">">"</span>, <span class="st">"ERCC_Controls.gtf"</span>) <span class="ex">or</span> die <span class="va">$!</span><span class="kw">;</span>
<span class="ex">foreach</span> my <span class="va">$line</span> (@GTFlines) <span class="kw">{</span>
<span class="ex">print</span> <span class="va">$ofh</span> <span class="va">$line</span><span class="kw">;</span>
<span class="kw">}</span> <span class="ex">close</span> (<span class="va">$ofh</span>);</code></pre></div>
</div>
</div>
<div id="demultiplexing" class="section level2">
<h2><span class="header-section-number">3.4</span> Demultiplexing</h2>
<p>Demultiplexing is done differently depending on the protocol used and the particular pipeline you are using. The most flexible demultiplexing pipeline we are aware of is <a href="https://github.com/sdparekh/zUMIs/wiki/Usage">zUMIs</a> which can be used to demultiplex and map most UMI-based protocols. For Smartseq2 or other paired-end full transcript protocols the data will usually already be demultiplexed. Public repositories such as GEO or ArrayExpress require small-scale/plate-based scRNASeq data to be demultiplexed prior to upload, and many sequencing facilities will automatically demultiplex data before returning it to you. If you aren’t using a published pipeline and the data was not demultiplexed by the sequencing facility you will have to demultiplex it yourself. This usually requires writing a custom script since barcodes may be of different lengths and different locations in the reads depending on the protocols used.</p>
<p>For all data types, “demultiplexing” involves identifying and removing the cell-barcode sequence from one or both reads. If the expected cell-barcodes are known ahead of time, i.e. the data is from a PCR-plate-based protocol, all that is necessary is to compare each cell-barcode to the expected barcodes and assign the associated reads to the closest cell-barcode (with maximum mismatches of 1 or 2 depending on the design of the cell-barcodes). These data are generally demultiplexed prior to mapping as an easy way of parallelizing the mapping step.</p>
<p>We have <a href="https://github.com/tallulandrews/scRNASeqPipeline">publicly available</a> perl scripts capable of demultiplexing any scRNASeq data with a single cell-barcode with or without UMIs for plate-based protocols. These can be used as so:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">perl</span> 1_Flexible_UMI_Demultiplexing.pl 10cells_read1.fq 10cells_read2.fq <span class="st">"C12U8"</span> 10cells_barcodes.txt 2 Ex</code></pre></div>
<pre><code>##
## Doesn't match any cell: 0
## Ambiguous: 0
## Exact Matches: 400
## Contain mismatches: 0
## Input Reads: 400
## Output Reads: 400
## Barcode Structure: 12 bp CellID followed by 8 bp UMI</code></pre>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">perl</span> 1_Flexible_FullTranscript_Demultiplexing.pl 10cells_read1.fq 10cells_read2.fq <span class="st">"start"</span> 12 10cells_barcodes.txt 2 Ex</code></pre></div>
<pre><code>##
## Doesn't match any cell: 0
## Ambiguous: 0
## Exact Matches: 400
## Contain Mismatches: 0
## Input Reads: 400
## Output Reads: 400</code></pre>
<p>For UMI containing data, demultiplexing includes attaching the UMI code to the read name of the gene-body containing read. If the data are from a droplet-based protocol or SeqWell where the number of expected barcodes is much higher than the expected number of cells, then usually the cell-barcode will also be attached to the read name to avoid generating a very large number of files. In these cases, demultiplexing will happen during the quantification step to facilitate the identification of cell-barcodes which correspond to intact cells rather than background noise.</p>
<div id="identifying-cell-containing-dropletsmicrowells" class="section level3">
<h3><span class="header-section-number">3.4.1</span> Identifying cell-containing droplets/microwells</h3>
<p>For droplet based methods only a fraction of droplets contain both beads and an intact cell. However, biology experiments are messy and some RNA will leak out of dead/damaged cells. So droplets without an intact cell are likely to capture a small amount of the ambient RNA which will end up in the sequencing library and contribute reads to the final sequencing output. The variation in droplet size, amplification efficiency, and sequencing will lead both “background” and real cells to have a wide range of library sizes. Various approaches have been used to try to distinguish those cell barcodes which correspond to real cells.</p>
<p>Most methods use the total molecules (could be applied to total reads) per barcode and try to find a “break point” between bigger libraries which are cells + some background and smaller libraries assumed to be purely background. Let’s load some example simulated data which contain both large and small cells:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">umi_per_barcode <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"droplet_id_example_per_barcode.txt.gz"</span>)
truth <-<span class="st"> </span><span class="kw">read.delim</span>(<span class="st">"droplet_id_example_truth.gz"</span>, <span class="dt">sep=</span><span class="st">","</span>)</code></pre></div>
<p><strong>Exercise</strong> How many unique barcodes were detected? How many true cells are present in the data? To simplify calculations for this section exclude all barcodes with fewer than 10 total molecules.</p>
<p><strong>Answer</strong></p>
<p>One approach is to look for the inflection point where the total molecules per barcode suddenly drops:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">barcode_rank <-<span class="st"> </span><span class="kw">rank</span>(<span class="op">-</span>umi_per_barcode[,<span class="dv">2</span>])
<span class="kw">plot</span>(barcode_rank, umi_per_barcode[,<span class="dv">2</span>], <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-11-1.png" width="672" /></p>
<p>Here we can see a roughly exponential curve of library sizes, so to make things simpler let’s log-transform them.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">log_lib_size <-<span class="st"> </span><span class="kw">log10</span>(umi_per_barcode[,<span class="dv">2</span>])
<span class="kw">plot</span>(barcode_rank, log_lib_size, <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-12-1.png" width="672" /> That’s better, the “knee” in the distribution is much more pronounced. We could manually estimate where the “knee” is but it is much more reproducible to algorithmically identify this point.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># inflection point</span>
o <-<span class="st"> </span><span class="kw">order</span>(barcode_rank)
log_lib_size <-<span class="st"> </span>log_lib_size[o]
barcode_rank <-<span class="st"> </span>barcode_rank[o]
rawdiff <-<span class="st"> </span><span class="kw">diff</span>(log_lib_size)<span class="op">/</span><span class="kw">diff</span>(barcode_rank)
inflection <-<span class="st"> </span><span class="kw">which</span>(rawdiff <span class="op">==</span><span class="st"> </span><span class="kw">min</span>(rawdiff[<span class="dv">100</span><span class="op">:</span><span class="kw">length</span>(rawdiff)], <span class="dt">na.rm=</span><span class="ot">TRUE</span>))
<span class="kw">plot</span>(barcode_rank, log_lib_size, <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))
<span class="kw">abline</span>(<span class="dt">v=</span>inflection, <span class="dt">col=</span><span class="st">"red"</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-13-1.png" width="672" /></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">threshold <-<span class="st"> </span><span class="dv">10</span><span class="op">^</span>log_lib_size[inflection]
cells <-<span class="st"> </span>umi_per_barcode[umi_per_barcode[,<span class="dv">2</span>] <span class="op">></span><span class="st"> </span>threshold,<span class="dv">1</span>]
TPR <-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(cells)
Recall <-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(truth[,<span class="dv">1</span>])
<span class="kw">c</span>(TPR, Recall)</code></pre></div>
<pre><code>## [1] 1.0000000 0.7831707</code></pre>
<p>Another is to fit a mixture model and find where the higher and lower distributions intersect. However, data may not fit the assumed distributions very well:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">set.seed</span>(<span class="op">-</span><span class="dv">92497</span>)
<span class="co"># mixture model</span>
<span class="kw">require</span>(<span class="st">"mixtools"</span>)</code></pre></div>
<pre><code>## Loading required package: mixtools</code></pre>
<pre><code>## mixtools package, version 1.1.0, Released 2017-03-10
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">mix <-<span class="st"> </span><span class="kw">normalmixEM</span>(log_lib_size)</code></pre></div>
<pre><code>## number of iterations= 43</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">plot</span>(mix, <span class="dt">which=</span><span class="dv">2</span>, <span class="dt">xlab2=</span><span class="st">"log(mol per cell)"</span>)</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-14-1.png" width="672" /></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">p1 <-<span class="st"> </span><span class="kw">dnorm</span>(log_lib_size, <span class="dt">mean=</span>mix<span class="op">$</span>mu[<span class="dv">1</span>], <span class="dt">sd=</span>mix<span class="op">$</span>sigma[<span class="dv">1</span>])
p2 <-<span class="st"> </span><span class="kw">dnorm</span>(log_lib_size, <span class="dt">mean=</span>mix<span class="op">$</span>mu[<span class="dv">2</span>], <span class="dt">sd=</span>mix<span class="op">$</span>sigma[<span class="dv">2</span>])
<span class="cf">if</span> (mix<span class="op">$</span>mu[<span class="dv">1</span>] <span class="op"><</span><span class="st"> </span>mix<span class="op">$</span>mu[<span class="dv">2</span>]) {
split <-<span class="st"> </span><span class="kw">min</span>(log_lib_size[p2 <span class="op">></span><span class="st"> </span>p1])
} <span class="cf">else</span> {
split <-<span class="st"> </span><span class="kw">min</span>(log_lib_size[p1 <span class="op">></span><span class="st"> </span>p2])
}</code></pre></div>
<p><strong>Exercise</strong> Identify cells using this split point and calculate the TPR and Recall.</p>
<p><strong>Answer</strong></p>
<p>A third, used by CellRanger, assumes a ~10-fold range of library sizes for real cells and estimates this range using the expected number of cells.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">n_cells <-<span class="st"> </span><span class="kw">length</span>(truth[,<span class="dv">1</span>])
<span class="co"># CellRanger</span>
totals <-<span class="st"> </span>umi_per_barcode[,<span class="dv">2</span>]
totals <-<span class="st"> </span><span class="kw">sort</span>(totals, <span class="dt">decreasing =</span> <span class="ot">TRUE</span>)
<span class="co"># 99th percentile of top n_cells divided by 10</span>
thresh =<span class="st"> </span>totals[<span class="kw">round</span>(<span class="fl">0.01</span><span class="op">*</span>n_cells)]<span class="op">/</span><span class="dv">10</span>
<span class="kw">plot</span>(totals, <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))
<span class="kw">abline</span>(<span class="dt">h=</span>thresh, <span class="dt">col=</span><span class="st">"red"</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-16-1.png" width="672" /> <strong>Exercise</strong> Identify cells using this threshold and calculate the TPR and Recall.</p>
<p><strong>Answer</strong></p>
<p>Finally <a href="https://github.com/MarioniLab/DropletUtils">EmptyDrops</a>, which is currently in beta testing, uses the full genes x cells molecule count matrix for all droplets and estimates the profile of “background” RNA from those droplets with extremely low counts, then looks for cells with gene-expression profiles which differ from the background. This is combined with an inflection point method since background RNA often looks very similar to the expression profile of the largest cells in a population. As such EmptyDrops is the only method able to identify barcodes for very small cells in highly diverse samples.</p>
<p>Below we have provided code for how this method is currently run: (We will update this page when the method is officially released)</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">require</span>(<span class="st">"Matrix"</span>)
raw.counts <-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">"droplet_id_example.rds"</span>)
<span class="kw">require</span>(<span class="st">"DropletUtils"</span>)
<span class="co"># emptyDrops</span>
<span class="kw">set.seed</span>(<span class="dv">100</span>)
e.out <-<span class="st"> </span><span class="kw">emptyDrops</span>(raw.counts)
is.cell <-<span class="st"> </span>e.out<span class="op">$</span>FDR <span class="op"><=</span><span class="st"> </span><span class="fl">0.01</span>
<span class="kw">sum</span>(is.cell, <span class="dt">na.rm=</span><span class="ot">TRUE</span>)
<span class="kw">plot</span>(e.out<span class="op">$</span>Total, <span class="op">-</span>e.out<span class="op">$</span>LogProb, <span class="dt">col=</span><span class="kw">ifelse</span>(is.cell, <span class="st">"red"</span>, <span class="st">"black"</span>),
<span class="dt">xlab=</span><span class="st">"Total UMI count"</span>, <span class="dt">ylab=</span><span class="st">"-Log Probability"</span>)
cells <-<span class="st"> </span><span class="kw">colnames</span>(raw.counts)[is.cell]
TPR <-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(cells)
Recall <-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(truth[,<span class="dv">1</span>])
<span class="kw">c</span>(TPR, Recall)</code></pre></div>
</div>
</div>
<div id="using-star-to-align-reads" class="section level2">
<h2><span class="header-section-number">3.5</span> Using STAR to Align Reads</h2>
<p>Now we have trimmed our reads and established that they are of good quality, we would like to map them to a reference genome. This process is known as alignment. Some form of alignment is generally required if we want to quantify gene expression or find genes which are differentially expressed between samples.</p>
<p>Many tools have been developed for read alignment, but today we will focus on two. The first tool we will consider is STAR <span class="citation">(Dobin et al. 2013)</span>. For each read in our reads data, STAR tries to find the longest possible sequence which matches one or more sequences in the reference genome. For example, in the figure below, we have a read (blue) which spans two exons and an alternative splicing junction (purple). STAR finds that the first part of the read is the same as the sequence of the first exon, whilst the second part of the read matches the sequence in the second exon. Because STAR is able to recognise splicing events in this way, it is described as a ‘splice aware’ aligner.</p>
<div class="figure">
<img src="L1-images/STAR_explanation.png" alt="Figure 1: Diagram of how STAR performs alignments, taken from Dobin et al." />
<p class="caption">Figure 1: Diagram of how STAR performs alignments, taken from Dobin et al.</p>
</div>
<p>Usually STAR aligns reads to a reference genome, potentially allowing it to detect novel splicing events or chromosomal rearrangements. However, one issue with STAR is that it needs a lot of RAM, especially if your reference genome is large (e.g. mouse and human). To speed up our analysis today, we will use STAR to align reads to a reference transcriptome of 2000 transcripts. Note that this is NOT normal or recommended practice; we only do it here for reasons of time. We recommend that normally you should align to a reference genome.</p>
<p>Two steps are required to perform STAR alignment. In the first step, the user provides STAR with reference genome sequences (FASTA) and annotations (GTF), which STAR uses to create a genome index. In the second step, STAR maps the user’s reads data to the genome index.</p>
<p>Let’s create the index now. Remember, for reasons of time we are aligning to a transcriptome rather than a genome today, meaning we only need to provide STAR with the sequences of the transcripts we will be aligning reads to. You can obtain transcriptomes for many model organisms from Ensembl (<a href="https://www.ensembl.org/info/data/ftp/index.html" class="uri">https://www.ensembl.org/info/data/ftp/index.html</a>).</p>
<p>Task 1: Execute the commands below to create the index:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> indices
<span class="fu">mkdir</span> indices/STAR
<span class="ex">STAR</span> --runThreadN 4 --runMode genomeGenerate --genomeDir indices/STAR --genomeFastaFiles Share/2000_reference.transcripts.fa</code></pre></div>
<p>Task 2: What does each of the options we used do? Hint: Use the STAR manual to help you (<a href="https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf" class="uri">https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf</a>)</p>
<p>Task 3: How would the command we used in Task 1 be different if we were aligning to the genome rather than the transcriptome?</p>
<p>Now that we have created the index, we can perform the mapping step.</p>
<p>Task 4: Try to work out what command you should use to map our trimmed reads (from ERR522959) to the index you created. Use the STAR manual to help you. Once you think you know the answer, check whether it matches the solution in the next section and execute the alignment.</p>
<p>Task 5: Try to understand the output of your alignment. Talk to one of the instructors if you need help!</p>
<div id="solution-for-star-alignment" class="section level3">
<h3><span class="header-section-number">3.5.1</span> Solution for STAR Alignment</h3>
<p>You can use the following commands to perform the mapping step:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> results
<span class="fu">mkdir</span> results/STAR
<span class="ex">STAR</span> --runThreadN 4 --genomeDir indices/STAR --readFilesIn Share/ERR522959_1.fastq Share/ERR522959_2.fastq --outFileNamePrefix results/STAR/</code></pre></div>
</div>
</div>
<div id="kallisto-and-pseudo-alignment" class="section level2">
<h2><span class="header-section-number">3.6</span> Kallisto and Pseudo-Alignment</h2>
<p>STAR is a reads aligner, whereas Kallisto is a pseudo-aligner <span class="citation">(Bray et al. <a href="#ref-bray_2016">2016</a>)</span>. The main difference between aligners and pseudo-aligners is that whereas aligners map reads to a reference, pseudo-aligners map k-mers to a reference.</p>
<div id="what-is-a-k-mer" class="section level3">
<h3><span class="header-section-number">3.6.1</span> What is a k-mer?</h3>
<p>A k-mer is a sequence of length k derived from a read. For example, imagine we have a read with the sequence ATCCCGGGTTAT and we want to make 7-mers from it. To do this, we would find the first 7-mer by counting the first seven bases of the read. We would find the second 7-mer by moving one base along, then counting the next seven bases. Figure 2 shows all the 7-mers that could be derived from our read:</p>
<div class="figure">
<img src="L1-images/Kmers.png" alt="Figure 2: The 7-mers derived from an example read" />
<p class="caption">Figure 2: The 7-mers derived from an example read</p>
</div>
</div>
<div id="why-map-k-mers-rather-than-reads" class="section level3">
<h3><span class="header-section-number">3.6.2</span> Why map k-mers rather than reads?</h3>
<p>There are two main reasons:</p>
<ol style="list-style-type: decimal">
<li><p>Pseudo-aligners use k-mers and a computational trick to make pseudo-alignment much faster than traditional alignment. If you are interested in how this is achieved, see Bray et al. (<a href="#ref-bray_2016">2016</a>) for details.</p></li>
<li><p>Under some circumstances, pseudo-aligners may be able to cope better with sequencing errors than traditional aligners. For example, imagine there was a sequencing error in the first base of the read above and the A was actually a T. This would impact on the pseudo-aligner’s ability to map the first 7-mer but none of the following 7-mers.</p></li>
</ol>
</div>
<div id="kallistos-pseudo-mode" class="section level3">
<h3><span class="header-section-number">3.6.3</span> Kallisto’s pseudo mode</h3>
<p>Kallisto has a specially designed mode for pseudo-aligning reads from single-cell RNA-seq experiments. Unlike STAR, Kallisto pseudo-aligns to a reference transcriptome rather than a reference genome. This means Kallisto maps reads to splice isoforms rather than genes. Mapping reads to isoforms rather than genes is especially challenging for single-cell RNA-seq for the following reasons:</p>
<ul>
<li>Single-cell RNA-seq is lower coverage than bulk RNA-seq, meaning the total amount of information available from reads is reduced.</li>
<li>Many single-cell RNA-seq protocols have 3’ coverage bias, meaning if two isoforms differ only at their 5’ end, it might not be possible to work out which isoform the read came from.</li>
<li>Some single-cell RNA-seq protocols have short read lengths, which can also mean it is not possible to work out which isoform the read came from.</li>
</ul>
<p>Kallisto’s pseudo mode takes a slightly different approach to pseudo-alignment. Instead of aligning to isoforms, Kallisto aligns to equivalence classes. Essentially, this means if a read maps to multiple isoforms, Kallisto records the read as mapping to an equivalence class containing all the isoforms it maps to. Instead of using gene or isoform expression estimates in downstream analysis such as clustering, equivalence class counts can be used instead. Figure 3 shows a diagram which helps explain this.</p>
<div class="figure">
<img src="L1-images/TCC.jpg" alt="Figure 3: A diagram explaining Kallisto’s Equivalence Classes, taken from https://pachterlab.github.io/kallisto/singlecell.html." />
<p class="caption">Figure 3: A diagram explaining Kallisto’s Equivalence Classes, taken from <a href="https://pachterlab.github.io/kallisto/singlecell.html" class="uri">https://pachterlab.github.io/kallisto/singlecell.html</a>.</p>
</div>
<p>Today we will just perform pseudo-alignment with one cell, but Kallisto is also capable of pseudo-aligning multiple cells simultaneously and using information from UMIs. See <a href="https://pachterlab.github.io/kallisto/manual" class="uri">https://pachterlab.github.io/kallisto/manual</a> for details.</p>
<p>As for STAR, you will need to produce an index for Kallisto before the pseudo-alignment step.</p>
<p>Task 6: Use the below command to produce the Kallisto index. Use the Kallisto manual (<a href="https://pachterlab.github.io/kallisto/manual" class="uri">https://pachterlab.github.io/kallisto/manual</a>) to work out what the options do in this command.</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> indices/Kallisto
<span class="ex">kallisto</span> index -i indices/Kallisto/transcripts.idx Share/2000_reference.transcripts.fa</code></pre></div>
<p>Task 7: Use the Kallisto manual to work out what command to use to perform pseudo-alignment. Once you think you know the answer, check whether it matches the solution in the next section and execute the pseudo-alignment.</p>
</div>
<div id="solution-to-kallisto-pseudo-alignment" class="section level3">
<h3><span class="header-section-number">3.6.4</span> Solution to Kallisto Pseudo-Alignment</h3>
<p>Use the command below to perform pseudo-alignment:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> results/Kallisto
<span class="ex">kallisto</span> pseudo -i indices/Kallisto/transcripts.idx -o results/Kallisto -b batch.txt </code></pre></div>
<p>See <a href="https://pachterlab.github.io/kallisto/manual" class="uri">https://pachterlab.github.io/kallisto/manual</a> for instructions on creating batch.txt, or ask an instructor if you get stuck.</p>
</div>
<div id="understanding-the-output-of-kallisto-pseudo-alignment" class="section level3">
<h3><span class="header-section-number">3.6.5</span> Understanding the Output of Kallisto Pseudo-Alignment</h3>
<p>The command above should produce 4 files - matrix.cells, matrix.ec, matrix.tsv and run_info.json.</p>
<ul>
<li>matrix.cells contains a list of cell IDs. As we only used one cell, this file should just contain “ERR522959”</li>
<li>matrix.ec contains information about the equivalence classes used. The first number in each row is the equivalence class ID. The second number(s) correspond to the transcript ID(s) in that equivalence class. For example “10 1,2,3” would mean that equivalence class 10 contains transcript IDs 1,2 and 3. The ID numbers correspond to the order that the transcripts appear in 2000_reference.transcripts.fa. Zero indexing is used, meaning transcript IDs 1,2 and 3 correspond to the second, third and fourth transcripts in 2000_reference.transcripts.fa.</li>
<li>matrix.tsv contains information about how many reads in each cell map to each equivalence class. The first number is the equivalence class ID, as defined in matrix.ec. The second number is the cell ID, where the cell ID corresponds to the order that the cell came in the matrix.cells file. The third number is the number of reads which fall into that equivalence class. For example, “5 1 3” means that 3 reads from cell 1 map to equivalence class 5. Note that zero indexing is used, so cell 1 corresponds to the second line of matrix.cells.</li>
<li>run_info.json contains information about how Kallisto was executed and can be ignored.</li>
</ul>
</div>
</div>
</div>
<h3> References</h3>
<div id="refs" class="references">
<div id="ref-Kolodziejczyk2015-xy">
<p>Kolodziejczyk, Aleksandra A., Jong Kyoung Kim, Valentine Svensson, John C. Marioni, and Sarah A. Teichmann. 2015. “The Technology and Biology of Single-Cell RNA Sequencing.” <em>Molecular Cell</em> 58 (4). Elsevier BV: 610–20. doi:<a href="https://doi.org/10.1016/j.molcel.2015.04.005">10.1016/j.molcel.2015.04.005</a>.</p>
</div>
<div id="ref-bray_2016">
<p>Bray, Nicolas L, Harold Pimentel, Páll Melsted, and Lior Pachter. 2016. “Near-Optimal Probabilistic RNA-Seq Quantification.” <em>Nat Biotechnol</em> 34 (5): 525–27. doi:<a href="https://doi.org/10.1038/nbt.3519">10.1038/nbt.3519</a>.</p>
</div>
</div>
</section>
</div>
</div>
</div>
<a href="introduction-to-single-cell-rna-seq.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="construction-of-expression-matrix.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script>
// Boot the GitBook front-end once its core module is available, passing the
// per-book configuration for the plugins loaded via the <script> tags above.
gitbook.require(["gitbook"], function(gitbook) {
  gitbook.start({
    // Sharing toolbar: each boolean toggles one site's button; "all" lists
    // the sites offered in the sharing drop-down menu.
    "sharing": {
      "github": false,
      "facebook": true,
      "twitter": true,
      "google": false,
      "linkedin": false,
      "weibo": false,
      // Was misspelled "instapper", which did not match the "instapaper"
      // site name used in the "all" list below.
      "instapaper": false,
      "vk": false,
      "all": ["facebook", "google", "twitter", "linkedin", "weibo", "instapaper"]
    },
    // Initial reader font settings.
    "fontsettings": {
      "theme": "white",
      "family": "sans",
      "size": 2
    },
    // No "edit this page" link is configured for this book.
    "edit": {
      "link": null,
      "text": null
    },
    // Files offered through the toolbar download button.
    "download": ["scRNA-seq-course.pdf"],
    "toc": {
      "collapse": "section"
    },
    "search": true
  });
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
// Inject the MathJax <script> element into <head> at runtime.
(function () {
  var script = document.createElement("script");
  script.type = "text/javascript";
  // Empty (or the literal "true") means "use the default CDN build".
  var src = "";
  if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML";
  // Always fetch over HTTPS. The previous code stripped the scheme to make a
  // protocol-relative URL, which loads the script over plain http whenever
  // the page itself is served over http. Upgrading http: to https: is also
  // safe on https pages, where an http script would be blocked as mixed content.
  src = src.replace(/^http:/, "https:");
  script.src = src;
  document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
</body>
</html>