docs/processing-raw-scrna-seq-data.html

<!DOCTYPE html>
<html lang="en">
<!-- lang declares the page language so screen readers, hyphenation and translation tools treat the English text correctly. -->

<head>

  <meta charset="UTF-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <title>Analysis of single cell RNA-seq data</title>
  <meta name="description" content="Analysis of single cell RNA-seq data">
  <meta name="generator" content="bookdown 0.7 and GitBook 2.6.7">

  <!-- Open Graph basics. og:url, og:description and og:image are declared in the Facebook block later in this head. -->
  <meta property="og:title" content="Analysis of single cell RNA-seq data">
  <meta property="og:type" content="book">

  <!-- twitter:card and twitter:title are deliberately not declared here: the Twitter block later in this head already
       defines them (summary_large_image), and a second, conflicting pair would leave the card type ambiguous. -->

  <meta name="author" content="Vladimir Kiselev (wikiselev), Tallulah Andrews, Jennifer Westoby (Jenni_Westoby), Davis McCarthy (davisjmcc), Maren Büttner (marenbuettner) and Martin Hemberg (m_hemberg)">
  <meta name="date" content="2018-05-29">

  <!-- Mobile / web-app presentation hints -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black">

  <!-- Neighbouring chapters in reading order -->
  <link rel="prev" href="introduction-to-single-cell-rna-seq.html">
  <link rel="next" href="construction-of-expression-matrix.html">

  <!-- jQuery is loaded synchronously here; NOTE(review): presumably required by GitBook scripts later in the page, so it is not deferred. -->
  <script src="libs/jquery-2.2.3/jquery.min.js"></script>

  <!-- GitBook theme and plugin stylesheets -->
  <link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet">
  <link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet">
  <link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet">
  <link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet">
  <link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet">


<!-- Open Graph tags (read by Facebook link previews) -->
<meta property="og:url" content="http://hemberg-lab.github.io/scRNA.seq.course/">
<meta property="og:description" content="In this course we will be surveying the existing problems as well as the available computational and statistical frameworks available for the analysis of scRNA-seq. The course is taught through the University of Cambridge Bioinformatics training unit, but the material found on these pages is meant to be used for anyone interested in learning about computational analysis of scRNA-seq data.">
<meta property="og:image" content="http://hemberg-lab.github.io/scRNA.seq.course/figures/RNA-Seq_workflow-5.pdf.jpg">

<!-- Twitter card tags: large-image summary card -->
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="Analysis of single-cell RNA-seq data">
<meta name="twitter:description" content="In this course we will be surveying the existing problems as well as the available computational and statistical frameworks available for the analysis of scRNA-seq. The course is taught through the University of Cambridge Bioinformatics training unit, but the material found on these pages is meant to be used for anyone interested in learning about computational analysis of scRNA-seq data.">
<meta name="twitter:image" content="http://hemberg-lab.github.io/scRNA.seq.course/figures/RNA-Seq_workflow-5.pdf.jpg">

<!-- Google Analytics -->
<script>
  // Google Analytics (analytics.js) loader.
  // The IIFE records the global name 'ga' on window, installs a stub function that
  // queues its arguments in ga.q until the real library arrives, stamps ga.l with
  // the current time, then creates an async <script> element and inserts it before
  // the first <script> already in the document.
  // The library URL is pinned to https: a protocol-relative URL would resolve to
  // file:// when this page is opened from disk and to plain HTTP on HTTP hosts.
  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

  // Queue tracker creation for this property, then report the page view.
  ga('create', 'UA-71525309-1', 'auto');
  ga('send', 'pageview');

</script>


<style>
/* Layout of rendered source-code blocks and their optional line-number gutter. */
div.sourceCode {
  overflow-x: auto;
}
table.sourceCode,
tr.sourceCode,
td.lineNumbers,
td.sourceCode {
  margin: 0;
  padding: 0;
  vertical-align: baseline;
  border: none;
}
table.sourceCode {
  width: 100%;
  line-height: 100%;
}
td.lineNumbers {
  text-align: right;
  padding-right: 4px;
  padding-left: 4px;
  color: #aaaaaa;
  border-right: 1px solid #aaaaaa;
}
td.sourceCode {
  padding-left: 5px;
}

/* Syntax-highlighting colours, keyed by token class. Rule order matches the
   original sheet; only adjacent rules with identical declarations are merged. */
code > span.kw { /* Keyword */
  color: #007020;
  font-weight: bold;
}
code > span.dt { /* DataType */
  color: #902000;
}
code > span.dv, /* DecVal */
code > span.bn, /* BaseN */
code > span.fl { /* Float */
  color: #40a070;
}
code > span.ch, /* Char */
code > span.st { /* String */
  color: #4070a0;
}
code > span.co { /* Comment */
  color: #60a0b0;
  font-style: italic;
}
code > span.ot { /* Other */
  color: #007020;
}
code > span.al { /* Alert */
  color: #ff0000;
  font-weight: bold;
}
code > span.fu { /* Function */
  color: #06287e;
}
code > span.er { /* Error */
  color: #ff0000;
  font-weight: bold;
}
code > span.wa { /* Warning */
  color: #60a0b0;
  font-weight: bold;
  font-style: italic;
}
code > span.cn { /* Constant */
  color: #880000;
}
code > span.sc, /* SpecialChar */
code > span.vs { /* VerbatimString */
  color: #4070a0;
}
code > span.ss { /* SpecialString */
  color: #bb6688;
}
code > span.im { /* Import: intentionally unstyled */
}
code > span.va { /* Variable */
  color: #19177c;
}
code > span.cf { /* ControlFlow */
  color: #007020;
  font-weight: bold;
}
code > span.op { /* Operator */
  color: #666666;
}
code > span.bu, /* BuiltIn: intentionally unstyled */
code > span.ex { /* Extension: intentionally unstyled */
}
code > span.pp { /* Preprocessor */
  color: #bc7a00;
}
code > span.at { /* Attribute */
  color: #7d9029;
}
code > span.do { /* Documentation */
  color: #ba2121;
  font-style: italic;
}
code > span.an, /* Annotation */
code > span.cv, /* CommentVar */
code > span.in { /* Information */
  color: #60a0b0;
  font-weight: bold;
  font-style: italic;
}
</style>

<!-- Site-specific overrides; loaded after the inline rules above. -->
<link rel="stylesheet" href="style.css">
</head>

<body>


  <div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">

    <div class="book-summary">
      <nav role="navigation">

<ul class="summary">
<li><a href="index.html">Table of Contents</a></li>

<li class="divider"></li>
<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> About the course</a><ul>
<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#video"><i class="fa fa-check"></i><b>1.1</b> Video</a></li>
<li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#registration"><i class="fa fa-check"></i><b>1.2</b> Registration</a></li>
<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#github"><i class="fa fa-check"></i><b>1.3</b> GitHub</a></li>
<li class="chapter" data-level="1.4" data-path="index.html"><a href="index.html#docker-image-rstudio"><i class="fa fa-check"></i><b>1.4</b> Docker image (RStudio)</a></li>
<li class="chapter" data-level="1.5" data-path="index.html"><a href="index.html#manual-installation"><i class="fa fa-check"></i><b>1.5</b> Manual installation</a></li>
<li class="chapter" data-level="1.6" data-path="index.html"><a href="index.html#license"><i class="fa fa-check"></i><b>1.6</b> License</a></li>
<li class="chapter" data-level="1.7" data-path="index.html"><a href="index.html#prerequisites"><i class="fa fa-check"></i><b>1.7</b> Prerequisites</a></li>
<li class="chapter" data-level="1.8" data-path="index.html"><a href="index.html#contact"><i class="fa fa-check"></i><b>1.8</b> Contact</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html"><i class="fa fa-check"></i><b>2</b> Introduction to single-cell RNA-seq</a><ul>
<li class="chapter" data-level="2.1" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#bulk-rna-seq"><i class="fa fa-check"></i><b>2.1</b> Bulk RNA-seq</a></li>
<li class="chapter" data-level="2.2" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#scrna-seq"><i class="fa fa-check"></i><b>2.2</b> scRNA-seq</a></li>
<li class="chapter" data-level="2.3" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#workflow"><i class="fa fa-check"></i><b>2.3</b> Workflow</a></li>
<li class="chapter" data-level="2.4" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#computational-analysis"><i class="fa fa-check"></i><b>2.4</b> Computational Analysis</a></li>
<li class="chapter" data-level="2.5" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#challenges"><i class="fa fa-check"></i><b>2.5</b> Challenges</a></li>
<li class="chapter" data-level="2.6" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#experimental-methods"><i class="fa fa-check"></i><b>2.6</b> Experimental methods</a></li>
<li class="chapter" data-level="2.7" data-path="introduction-to-single-cell-rna-seq.html"><a href="introduction-to-single-cell-rna-seq.html#what-platform-to-use-for-my-experiment"><i class="fa fa-check"></i><b>2.7</b> What platform to use for my experiment?</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html"><i class="fa fa-check"></i><b>3</b> Processing Raw scRNA-seq Data</a><ul>
<li class="chapter" data-level="3.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#fastqc"><i class="fa fa-check"></i><b>3.1</b> FastQC</a><ul>
<li class="chapter" data-level="3.1.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#solution-and-downloading-the-report"><i class="fa fa-check"></i><b>3.1.1</b> Solution and Downloading the Report</a></li>
</ul></li>
<li class="chapter" data-level="3.2" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#trimming-reads"><i class="fa fa-check"></i><b>3.2</b> Trimming Reads</a><ul>
<li class="chapter" data-level="3.2.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#solution"><i class="fa fa-check"></i><b>3.2.1</b> Solution</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#file-formats"><i class="fa fa-check"></i><b>3.3</b> File formats</a><ul>
<li class="chapter" data-level="3.3.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#fastq"><i class="fa fa-check"></i><b>3.3.1</b> FastQ</a></li>
<li class="chapter" data-level="3.3.2" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#bam"><i class="fa fa-check"></i><b>3.3.2</b> BAM</a></li>
<li class="chapter" data-level="3.3.3" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#cram"><i class="fa fa-check"></i><b>3.3.3</b> CRAM</a></li>
<li class="chapter" data-level="3.3.4" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#mannually-inspecting-files"><i class="fa fa-check"></i><b>3.3.4</b> Manually Inspecting files</a></li>
<li class="chapter" data-level="3.3.5" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#genome-fasta-gtf"><i class="fa fa-check"></i><b>3.3.5</b> Genome (FASTA, GTF)</a></li>
</ul></li>
<li class="chapter" data-level="3.4" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#demultiplexing"><i class="fa fa-check"></i><b>3.4</b> Demultiplexing</a><ul>
<li class="chapter" data-level="3.4.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#identifying-cell-containing-dropletsmicrowells"><i class="fa fa-check"></i><b>3.4.1</b> Identifying cell-containing droplets/microwells</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#using-star-to-align-reads"><i class="fa fa-check"></i><b>3.5</b> Using STAR to Align Reads</a><ul>
<li class="chapter" data-level="3.5.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#solution-for-star-alignment"><i class="fa fa-check"></i><b>3.5.1</b> Solution for STAR Alignment</a></li>
</ul></li>
<li class="chapter" data-level="3.6" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#kallisto-and-pseudo-alignment"><i class="fa fa-check"></i><b>3.6</b> Kallisto and Pseudo-Alignment</a><ul>
<li class="chapter" data-level="3.6.1" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#what-is-a-k-mer"><i class="fa fa-check"></i><b>3.6.1</b> What is a k-mer?</a></li>
<li class="chapter" data-level="3.6.2" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#why-map-k-mers-rather-than-reads"><i class="fa fa-check"></i><b>3.6.2</b> Why map k-mers rather than reads?</a></li>
<li class="chapter" data-level="3.6.3" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#kallistos-pseudo-mode"><i class="fa fa-check"></i><b>3.6.3</b> Kallisto’s pseudo mode</a></li>
<li class="chapter" data-level="3.6.4" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#solution-to-kallisto-pseudo-alignment"><i class="fa fa-check"></i><b>3.6.4</b> Solution to Kallisto Pseudo-Alignment</a></li>
<li class="chapter" data-level="3.6.5" data-path="processing-raw-scrna-seq-data.html"><a href="processing-raw-scrna-seq-data.html#understanding-the-output-of-kallisto-pseudo-alignment"><i class="fa fa-check"></i><b>3.6.5</b> Understanding the Output of Kallisto Pseudo-Alignment</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="4" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html"><i class="fa fa-check"></i><b>4</b> Construction of expression matrix</a><ul>
<li class="chapter" data-level="4.1" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#reads-qc"><i class="fa fa-check"></i><b>4.1</b> Reads QC</a></li>
<li class="chapter" data-level="4.2" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#reads-alignment"><i class="fa fa-check"></i><b>4.2</b> Reads alignment</a></li>
<li class="chapter" data-level="4.3" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#alignment-example"><i class="fa fa-check"></i><b>4.3</b> Alignment example</a></li>
<li class="chapter" data-level="4.4" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#mapping-qc"><i class="fa fa-check"></i><b>4.4</b> Mapping QC</a></li>
<li class="chapter" data-level="4.5" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#reads-quantification"><i class="fa fa-check"></i><b>4.5</b> Reads quantification</a></li>
<li class="chapter" data-level="4.6" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#umichapter"><i class="fa fa-check"></i><b>4.6</b> Unique Molecular Identifiers (UMIs)</a><ul>
<li class="chapter" data-level="4.6.1" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#introduction"><i class="fa fa-check"></i><b>4.6.1</b> Introduction</a></li>
<li class="chapter" data-level="4.6.2" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#mapping-barcodes"><i class="fa fa-check"></i><b>4.6.2</b> Mapping Barcodes</a></li>
<li class="chapter" data-level="4.6.3" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#counting-barcodes"><i class="fa fa-check"></i><b>4.6.3</b> Counting Barcodes</a></li>
<li class="chapter" data-level="4.6.4" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#correcting-for-errors"><i class="fa fa-check"></i><b>4.6.4</b> Correcting for Errors</a></li>
<li class="chapter" data-level="4.6.5" data-path="construction-of-expression-matrix.html"><a href="construction-of-expression-matrix.html#downstream-analysis"><i class="fa fa-check"></i><b>4.6.5</b> Downstream Analysis</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html"><i class="fa fa-check"></i><b>5</b> Introduction to R/Bioconductor</a><ul>
<li class="chapter" data-level="5.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#installing-packages"><i class="fa fa-check"></i><b>5.1</b> Installing packages</a><ul>
<li class="chapter" data-level="5.1.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#cran"><i class="fa fa-check"></i><b>5.1.1</b> CRAN</a></li>
<li class="chapter" data-level="5.1.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#github-1"><i class="fa fa-check"></i><b>5.1.2</b> Github</a></li>
<li class="chapter" data-level="5.1.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#bioconductor"><i class="fa fa-check"></i><b>5.1.3</b> Bioconductor</a></li>
<li class="chapter" data-level="5.1.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#source"><i class="fa fa-check"></i><b>5.1.4</b> Source</a></li>
</ul></li>
<li class="chapter" data-level="5.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#installation-instructions"><i class="fa fa-check"></i><b>5.2</b> Installation instructions:</a></li>
<li class="chapter" data-level="5.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#data-typesclasses"><i class="fa fa-check"></i><b>5.3</b> Data-types/classes</a><ul>
<li class="chapter" data-level="5.3.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#numeric"><i class="fa fa-check"></i><b>5.3.1</b> Numeric</a></li>
<li class="chapter" data-level="5.3.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#characterstring"><i class="fa fa-check"></i><b>5.3.2</b> Character/String</a></li>
<li class="chapter" data-level="5.3.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#logical"><i class="fa fa-check"></i><b>5.3.3</b> Logical</a></li>
<li class="chapter" data-level="5.3.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#factors"><i class="fa fa-check"></i><b>5.3.4</b> Factors</a></li>
<li class="chapter" data-level="5.3.5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#checking-classtype"><i class="fa fa-check"></i><b>5.3.5</b> Checking class/type</a></li>
</ul></li>
<li class="chapter" data-level="5.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#basic-data-structures"><i class="fa fa-check"></i><b>5.4</b> Basic data structures</a></li>
<li class="chapter" data-level="5.5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#more-information"><i class="fa fa-check"></i><b>5.5</b> More information</a></li>
<li class="chapter" data-level="5.6" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#data-types"><i class="fa fa-check"></i><b>5.6</b> Data Types</a><ul>
<li class="chapter" data-level="5.6.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#what-is-tidy-data"><i class="fa fa-check"></i><b>5.6.1</b> What is Tidy Data?</a></li>
<li class="chapter" data-level="5.6.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#what-is-rich-data"><i class="fa fa-check"></i><b>5.6.2</b> What is Rich Data?</a></li>
<li class="chapter" data-level="5.6.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#what-is-bioconductor"><i class="fa fa-check"></i><b>5.6.3</b> What is Bioconductor?</a></li>
<li class="chapter" data-level="5.6.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#singlecellexperiment-class"><i class="fa fa-check"></i><b>5.6.4</b> <code>SingleCellExperiment</code> class</a></li>
<li class="chapter" data-level="5.6.5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#scater-package"><i class="fa fa-check"></i><b>5.6.5</b> <code>scater</code> package</a></li>
</ul></li>
<li class="chapter" data-level="5.7" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#bioconductor-singlecellexperiment-and-scater"><i class="fa fa-check"></i><b>5.7</b> Bioconductor, <code>SingleCellExperiment</code> and <code>scater</code></a><ul>
<li class="chapter" data-level="5.7.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#bioconductor-1"><i class="fa fa-check"></i><b>5.7.1</b> Bioconductor</a></li>
<li class="chapter" data-level="5.7.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#singlecellexperiment-class-1"><i class="fa fa-check"></i><b>5.7.2</b> <code>SingleCellExperiment</code> class</a></li>
<li class="chapter" data-level="5.7.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#scater-package-1"><i class="fa fa-check"></i><b>5.7.3</b> <code>scater</code> package</a></li>
</ul></li>
<li class="chapter" data-level="5.8" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#an-introduction-to-ggplot2"><i class="fa fa-check"></i><b>5.8</b> An Introduction to ggplot2</a><ul>
<li class="chapter" data-level="5.8.1" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#what-is-ggplot2"><i class="fa fa-check"></i><b>5.8.1</b> What is ggplot2?</a></li>
<li class="chapter" data-level="5.8.2" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#principles-of-ggplot2"><i class="fa fa-check"></i><b>5.8.2</b> Principles of ggplot2</a></li>
<li class="chapter" data-level="5.8.3" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#using-the-aes-mapping-function"><i class="fa fa-check"></i><b>5.8.3</b> Using the <code>aes</code> mapping function</a></li>
<li class="chapter" data-level="5.8.4" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#geoms"><i class="fa fa-check"></i><b>5.8.4</b> Geoms</a></li>
<li class="chapter" data-level="5.8.5" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#plotting-data-from-more-than-2-cells"><i class="fa fa-check"></i><b>5.8.5</b> Plotting data from more than 2 cells</a></li>
<li class="chapter" data-level="5.8.6" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#plotting-heatmaps"><i class="fa fa-check"></i><b>5.8.6</b> Plotting heatmaps</a></li>
<li class="chapter" data-level="5.8.7" data-path="introduction-to-rbioconductor.html"><a href="introduction-to-rbioconductor.html#principle-component-analysis"><i class="fa fa-check"></i><b>5.8.7</b> Principal Component Analysis</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="6" data-path="tabula-muris.html"><a href="tabula-muris.html"><i class="fa fa-check"></i><b>6</b> Tabula Muris</a><ul>
<li class="chapter" data-level="6.1" data-path="tabula-muris.html"><a href="tabula-muris.html#introduction-1"><i class="fa fa-check"></i><b>6.1</b> Introduction</a></li>
<li class="chapter" data-level="6.2" data-path="tabula-muris.html"><a href="tabula-muris.html#downloading-the-data"><i class="fa fa-check"></i><b>6.2</b> Downloading the data</a></li>
<li class="chapter" data-level="6.3" data-path="tabula-muris.html"><a href="tabula-muris.html#reading-the-data-smartseq2"><i class="fa fa-check"></i><b>6.3</b> Reading the data (Smartseq2)</a></li>
<li class="chapter" data-level="6.4" data-path="tabula-muris.html"><a href="tabula-muris.html#building-a-scater-object"><i class="fa fa-check"></i><b>6.4</b> Building a scater object</a></li>
<li class="chapter" data-level="6.5" data-path="tabula-muris.html"><a href="tabula-muris.html#reading-the-data-10x"><i class="fa fa-check"></i><b>6.5</b> Reading the data (10X)</a></li>
<li class="chapter" data-level="6.6" data-path="tabula-muris.html"><a href="tabula-muris.html#building-a-scater-object-1"><i class="fa fa-check"></i><b>6.6</b> Building a scater object</a></li>
<li class="chapter" data-level="6.7" data-path="tabula-muris.html"><a href="tabula-muris.html#advanced-exercise"><i class="fa fa-check"></i><b>6.7</b> Advanced Exercise</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html"><i class="fa fa-check"></i><b>7</b> Cleaning the Expression Matrix</a><ul>
<li class="chapter" data-level="7.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#exprs-qc"><i class="fa fa-check"></i><b>7.1</b> Expression QC (UMI)</a><ul>
<li class="chapter" data-level="7.1.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-2"><i class="fa fa-check"></i><b>7.1.1</b> Introduction</a></li>
<li class="chapter" data-level="7.1.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#tung-dataset"><i class="fa fa-check"></i><b>7.1.2</b> Tung dataset</a></li>
<li class="chapter" data-level="7.1.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#cell-qc"><i class="fa fa-check"></i><b>7.1.3</b> Cell QC</a></li>
<li class="chapter" data-level="7.1.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#cell-filtering"><i class="fa fa-check"></i><b>7.1.4</b> Cell filtering</a></li>
<li class="chapter" data-level="7.1.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#compare-filterings"><i class="fa fa-check"></i><b>7.1.5</b> Compare filterings</a></li>
<li class="chapter" data-level="7.1.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#gene-analysis"><i class="fa fa-check"></i><b>7.1.6</b> Gene analysis</a></li>
<li class="chapter" data-level="7.1.7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#save-the-data"><i class="fa fa-check"></i><b>7.1.7</b> Save the data</a></li>
<li class="chapter" data-level="7.1.8" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#big-exercise"><i class="fa fa-check"></i><b>7.1.8</b> Big Exercise</a></li>
<li class="chapter" data-level="7.1.9" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo"><i class="fa fa-check"></i><b>7.1.9</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#expression-qc-reads"><i class="fa fa-check"></i><b>7.2</b> Expression QC (Reads)</a></li>
<li class="chapter" data-level="7.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#data-visualization"><i class="fa fa-check"></i><b>7.3</b> Data visualization</a><ul>
<li class="chapter" data-level="7.3.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-3"><i class="fa fa-check"></i><b>7.3.1</b> Introduction</a></li>
<li class="chapter" data-level="7.3.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#visual-pca"><i class="fa fa-check"></i><b>7.3.2</b> PCA plot</a></li>
<li class="chapter" data-level="7.3.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#visual-tsne"><i class="fa fa-check"></i><b>7.3.3</b> tSNE map</a></li>
<li class="chapter" data-level="7.3.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#big-exercise-1"><i class="fa fa-check"></i><b>7.3.4</b> Big Exercise</a></li>
<li class="chapter" data-level="7.3.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo-1"><i class="fa fa-check"></i><b>7.3.5</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#data-visualization-reads"><i class="fa fa-check"></i><b>7.4</b> Data visualization (Reads)</a></li>
<li class="chapter" data-level="7.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#identifying-confounding-factors"><i class="fa fa-check"></i><b>7.5</b> Identifying confounding factors</a><ul>
<li class="chapter" data-level="7.5.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-4"><i class="fa fa-check"></i><b>7.5.1</b> Introduction</a></li>
<li class="chapter" data-level="7.5.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#correlations-with-pcs"><i class="fa fa-check"></i><b>7.5.2</b> Correlations with PCs</a></li>
<li class="chapter" data-level="7.5.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#explanatory-variables"><i class="fa fa-check"></i><b>7.5.3</b> Explanatory variables</a></li>
<li class="chapter" data-level="7.5.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#other-confounders"><i class="fa fa-check"></i><b>7.5.4</b> Other confounders</a></li>
<li class="chapter" data-level="7.5.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#exercise"><i class="fa fa-check"></i><b>7.5.5</b> Exercise</a></li>
<li class="chapter" data-level="7.5.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo-2"><i class="fa fa-check"></i><b>7.5.6</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#identifying-confounding-factors-reads"><i class="fa fa-check"></i><b>7.6</b> Identifying confounding factors (Reads)</a></li>
<li class="chapter" data-level="7.7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalization-theory"><i class="fa fa-check"></i><b>7.7</b> Normalization theory</a><ul>
<li class="chapter" data-level="7.7.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-5"><i class="fa fa-check"></i><b>7.7.1</b> Introduction</a></li>
<li class="chapter" data-level="7.7.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#library-size-1"><i class="fa fa-check"></i><b>7.7.2</b> Library size</a></li>
<li class="chapter" data-level="7.7.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalisations"><i class="fa fa-check"></i><b>7.7.3</b> Normalisations</a></li>
<li class="chapter" data-level="7.7.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#effectiveness"><i class="fa fa-check"></i><b>7.7.4</b> Effectiveness</a></li>
</ul></li>
<li class="chapter" data-level="7.8" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalization-practice-umi"><i class="fa fa-check"></i><b>7.8</b> Normalization practice (UMI)</a><ul>
<li class="chapter" data-level="7.8.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#raw"><i class="fa fa-check"></i><b>7.8.1</b> Raw</a></li>
<li class="chapter" data-level="7.8.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#cpm-1"><i class="fa fa-check"></i><b>7.8.2</b> CPM</a></li>
<li class="chapter" data-level="7.8.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#size-factor-rle"><i class="fa fa-check"></i><b>7.8.3</b> Size-factor (RLE)</a></li>
<li class="chapter" data-level="7.8.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#upperquantile"><i class="fa fa-check"></i><b>7.8.4</b> Upperquantile</a></li>
<li class="chapter" data-level="7.8.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#tmm-1"><i class="fa fa-check"></i><b>7.8.5</b> TMM</a></li>
<li class="chapter" data-level="7.8.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#scran-1"><i class="fa fa-check"></i><b>7.8.6</b> scran</a></li>
<li class="chapter" data-level="7.8.7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#downsampling-1"><i class="fa fa-check"></i><b>7.8.7</b> Downsampling</a></li>
<li class="chapter" data-level="7.8.8" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalisation-for-genetranscript-length"><i class="fa fa-check"></i><b>7.8.8</b> Normalisation for gene/transcript length</a></li>
<li class="chapter" data-level="7.8.9" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#exercise-1"><i class="fa fa-check"></i><b>7.8.9</b> Exercise</a></li>
<li class="chapter" data-level="7.8.10" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo-3"><i class="fa fa-check"></i><b>7.8.10</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.9" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#normalization-practice-reads"><i class="fa fa-check"></i><b>7.9</b> Normalization practice (Reads)</a></li>
<li class="chapter" data-level="7.10" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#dealing-with-confounders"><i class="fa fa-check"></i><b>7.10</b> Dealing with confounders</a><ul>
<li class="chapter" data-level="7.10.1" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#introduction-6"><i class="fa fa-check"></i><b>7.10.1</b> Introduction</a></li>
<li class="chapter" data-level="7.10.2" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#remove-unwanted-variation"><i class="fa fa-check"></i><b>7.10.2</b> Remove Unwanted Variation</a></li>
<li class="chapter" data-level="7.10.3" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#combat"><i class="fa fa-check"></i><b>7.10.3</b> Combat</a></li>
<li class="chapter" data-level="7.10.4" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#mnncorrect"><i class="fa fa-check"></i><b>7.10.4</b> mnnCorrect</a></li>
<li class="chapter" data-level="7.10.5" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#glm"><i class="fa fa-check"></i><b>7.10.5</b> GLM</a></li>
<li class="chapter" data-level="7.10.6" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#how-to-evaluate-and-compare-confounder-removal-strategies"><i class="fa fa-check"></i><b>7.10.6</b> How to evaluate and compare confounder removal strategies</a></li>
<li class="chapter" data-level="7.10.7" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#big-exercise-2"><i class="fa fa-check"></i><b>7.10.7</b> Big Exercise</a></li>
<li class="chapter" data-level="7.10.8" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#sessioninfo-4"><i class="fa fa-check"></i><b>7.10.8</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="7.11" data-path="cleaning-the-expression-matrix.html"><a href="cleaning-the-expression-matrix.html#dealing-with-confounders-reads"><i class="fa fa-check"></i><b>7.11</b> Dealing with confounders (Reads)</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="biological-analysis.html"><a href="biological-analysis.html"><i class="fa fa-check"></i><b>8</b> Biological Analysis</a><ul>
<li class="chapter" data-level="8.1" data-path="biological-analysis.html"><a href="biological-analysis.html#clustering-introduction"><i class="fa fa-check"></i><b>8.1</b> Clustering Introduction</a><ul>
<li class="chapter" data-level="8.1.1" data-path="biological-analysis.html"><a href="biological-analysis.html#introduction-7"><i class="fa fa-check"></i><b>8.1.1</b> Introduction</a></li>
<li class="chapter" data-level="8.1.2" data-path="biological-analysis.html"><a href="biological-analysis.html#dimensionality-reductions"><i class="fa fa-check"></i><b>8.1.2</b> Dimensionality reductions</a></li>
<li class="chapter" data-level="8.1.3" data-path="biological-analysis.html"><a href="biological-analysis.html#clustering-methods"><i class="fa fa-check"></i><b>8.1.3</b> Clustering methods</a></li>
<li class="chapter" data-level="8.1.4" data-path="biological-analysis.html"><a href="biological-analysis.html#challenges-in-clustering"><i class="fa fa-check"></i><b>8.1.4</b> Challenges in clustering</a></li>
<li class="chapter" data-level="8.1.5" data-path="biological-analysis.html"><a href="biological-analysis.html#tools-for-scrna-seq-data"><i class="fa fa-check"></i><b>8.1.5</b> Tools for scRNA-seq data</a></li>
<li class="chapter" data-level="8.1.6" data-path="biological-analysis.html"><a href="biological-analysis.html#comparing-clustering"><i class="fa fa-check"></i><b>8.1.6</b> Comparing clustering</a></li>
</ul></li>
<li class="chapter" data-level="8.2" data-path="biological-analysis.html"><a href="biological-analysis.html#clust-methods"><i class="fa fa-check"></i><b>8.2</b> Clustering example</a><ul>
<li class="chapter" data-level="8.2.1" data-path="biological-analysis.html"><a href="biological-analysis.html#deng-dataset"><i class="fa fa-check"></i><b>8.2.1</b> Deng dataset</a></li>
<li class="chapter" data-level="8.2.2" data-path="biological-analysis.html"><a href="biological-analysis.html#sc3-1"><i class="fa fa-check"></i><b>8.2.2</b> SC3</a></li>
<li class="chapter" data-level="8.2.3" data-path="biological-analysis.html"><a href="biological-analysis.html#pcareduce-1"><i class="fa fa-check"></i><b>8.2.3</b> pcaReduce</a></li>
<li class="chapter" data-level="8.2.4" data-path="biological-analysis.html"><a href="biological-analysis.html#tsne-kmeans"><i class="fa fa-check"></i><b>8.2.4</b> tSNE + kmeans</a></li>
<li class="chapter" data-level="8.2.5" data-path="biological-analysis.html"><a href="biological-analysis.html#snn-cliq-1"><i class="fa fa-check"></i><b>8.2.5</b> SNN-Cliq</a></li>
<li class="chapter" data-level="8.2.6" data-path="biological-analysis.html"><a href="biological-analysis.html#sincera-1"><i class="fa fa-check"></i><b>8.2.6</b> SINCERA</a></li>
<li class="chapter" data-level="8.2.7" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-5"><i class="fa fa-check"></i><b>8.2.7</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.3" data-path="biological-analysis.html"><a href="biological-analysis.html#feature-selection"><i class="fa fa-check"></i><b>8.3</b> Feature Selection</a><ul>
<li class="chapter" data-level="8.3.1" data-path="biological-analysis.html"><a href="biological-analysis.html#identifying-genes-vs-a-null-model"><i class="fa fa-check"></i><b>8.3.1</b> Identifying Genes vs a Null Model</a></li>
<li class="chapter" data-level="8.3.2" data-path="biological-analysis.html"><a href="biological-analysis.html#correlated-expression"><i class="fa fa-check"></i><b>8.3.2</b> Correlated Expression</a></li>
<li class="chapter" data-level="8.3.3" data-path="biological-analysis.html"><a href="biological-analysis.html#comparing-methods"><i class="fa fa-check"></i><b>8.3.3</b> Comparing Methods</a></li>
<li class="chapter" data-level="8.3.4" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-6"><i class="fa fa-check"></i><b>8.3.4</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.4" data-path="biological-analysis.html"><a href="biological-analysis.html#pseudotime-analysis"><i class="fa fa-check"></i><b>8.4</b> Pseudotime analysis</a><ul>
<li class="chapter" data-level="8.4.1" data-path="biological-analysis.html"><a href="biological-analysis.html#first-look-at-deng-data"><i class="fa fa-check"></i><b>8.4.1</b> First look at Deng data</a></li>
<li class="chapter" data-level="8.4.2" data-path="biological-analysis.html"><a href="biological-analysis.html#tscan"><i class="fa fa-check"></i><b>8.4.2</b> TSCAN</a></li>
<li class="chapter" data-level="8.4.3" data-path="biological-analysis.html"><a href="biological-analysis.html#monocle"><i class="fa fa-check"></i><b>8.4.3</b> monocle</a></li>
<li class="chapter" data-level="8.4.4" data-path="biological-analysis.html"><a href="biological-analysis.html#diffusion-maps"><i class="fa fa-check"></i><b>8.4.4</b> Diffusion maps</a></li>
<li class="chapter" data-level="8.4.5" data-path="biological-analysis.html"><a href="biological-analysis.html#slicer"><i class="fa fa-check"></i><b>8.4.5</b> SLICER</a></li>
<li class="chapter" data-level="8.4.6" data-path="biological-analysis.html"><a href="biological-analysis.html#ouija"><i class="fa fa-check"></i><b>8.4.6</b> Ouija</a></li>
<li class="chapter" data-level="8.4.7" data-path="biological-analysis.html"><a href="biological-analysis.html#comparison-of-the-methods"><i class="fa fa-check"></i><b>8.4.7</b> Comparison of the methods</a></li>
<li class="chapter" data-level="8.4.8" data-path="biological-analysis.html"><a href="biological-analysis.html#expression-of-genes-through-time"><i class="fa fa-check"></i><b>8.4.8</b> Expression of genes through time</a></li>
<li class="chapter" data-level="8.4.9" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-7"><i class="fa fa-check"></i><b>8.4.9</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.5" data-path="biological-analysis.html"><a href="biological-analysis.html#imputation"><i class="fa fa-check"></i><b>8.5</b> Imputation</a><ul>
<li class="chapter" data-level="8.5.1" data-path="biological-analysis.html"><a href="biological-analysis.html#scimpute"><i class="fa fa-check"></i><b>8.5.1</b> scImpute</a></li>
<li class="chapter" data-level="8.5.2" data-path="biological-analysis.html"><a href="biological-analysis.html#magic"><i class="fa fa-check"></i><b>8.5.2</b> MAGIC</a></li>
<li class="chapter" data-level="8.5.3" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-8"><i class="fa fa-check"></i><b>8.5.3</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.6" data-path="biological-analysis.html"><a href="biological-analysis.html#dechapter"><i class="fa fa-check"></i><b>8.6</b> Differential Expression (DE) analysis</a><ul>
<li class="chapter" data-level="8.6.1" data-path="biological-analysis.html"><a href="biological-analysis.html#bulk-rna-seq-1"><i class="fa fa-check"></i><b>8.6.1</b> Bulk RNA-seq</a></li>
<li class="chapter" data-level="8.6.2" data-path="biological-analysis.html"><a href="biological-analysis.html#single-cell-rna-seq"><i class="fa fa-check"></i><b>8.6.2</b> Single cell RNA-seq</a></li>
<li class="chapter" data-level="8.6.3" data-path="biological-analysis.html"><a href="biological-analysis.html#differences-in-distribution"><i class="fa fa-check"></i><b>8.6.3</b> Differences in Distribution</a></li>
<li class="chapter" data-level="8.6.4" data-path="biological-analysis.html"><a href="biological-analysis.html#models-of-single-cell-rnaseq-data"><i class="fa fa-check"></i><b>8.6.4</b> Models of single-cell RNASeq data</a></li>
</ul></li>
<li class="chapter" data-level="8.7" data-path="biological-analysis.html"><a href="biological-analysis.html#de-in-a-real-dataset"><i class="fa fa-check"></i><b>8.7</b> DE in a real dataset</a><ul>
<li class="chapter" data-level="8.7.1" data-path="biological-analysis.html"><a href="biological-analysis.html#introduction-8"><i class="fa fa-check"></i><b>8.7.1</b> Introduction</a></li>
<li class="chapter" data-level="8.7.2" data-path="biological-analysis.html"><a href="biological-analysis.html#kolmogorov-smirnov-test"><i class="fa fa-check"></i><b>8.7.2</b> Kolmogorov-Smirnov test</a></li>
<li class="chapter" data-level="8.7.3" data-path="biological-analysis.html"><a href="biological-analysis.html#wilcoxmann-whitney-u-test"><i class="fa fa-check"></i><b>8.7.3</b> Wilcox/Mann-Whitney-U Test</a></li>
<li class="chapter" data-level="8.7.4" data-path="biological-analysis.html"><a href="biological-analysis.html#edger"><i class="fa fa-check"></i><b>8.7.4</b> edgeR</a></li>
<li class="chapter" data-level="8.7.5" data-path="biological-analysis.html"><a href="biological-analysis.html#monocle-1"><i class="fa fa-check"></i><b>8.7.5</b> Monocle</a></li>
<li class="chapter" data-level="8.7.6" data-path="biological-analysis.html"><a href="biological-analysis.html#mast"><i class="fa fa-check"></i><b>8.7.6</b> MAST</a></li>
<li class="chapter" data-level="8.7.7" data-path="biological-analysis.html"><a href="biological-analysis.html#slow-methods-1h-to-run"><i class="fa fa-check"></i><b>8.7.7</b> Slow Methods (&gt;1h to run)</a></li>
<li class="chapter" data-level="8.7.8" data-path="biological-analysis.html"><a href="biological-analysis.html#bpsc"><i class="fa fa-check"></i><b>8.7.8</b> BPSC</a></li>
<li class="chapter" data-level="8.7.9" data-path="biological-analysis.html"><a href="biological-analysis.html#scde"><i class="fa fa-check"></i><b>8.7.9</b> SCDE</a></li>
<li class="chapter" data-level="8.7.10" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-9"><i class="fa fa-check"></i><b>8.7.10</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.8" data-path="biological-analysis.html"><a href="biological-analysis.html#comparingcombining-scrnaseq-datasets"><i class="fa fa-check"></i><b>8.8</b> Comparing/Combining scRNASeq datasets</a><ul>
<li class="chapter" data-level="8.8.1" data-path="biological-analysis.html"><a href="biological-analysis.html#introduction-9"><i class="fa fa-check"></i><b>8.8.1</b> Introduction</a></li>
<li class="chapter" data-level="8.8.2" data-path="biological-analysis.html"><a href="biological-analysis.html#datasets"><i class="fa fa-check"></i><b>8.8.2</b> Datasets</a></li>
<li class="chapter" data-level="8.8.3" data-path="biological-analysis.html"><a href="biological-analysis.html#projecting-cells-onto-annotated-cell-types-scmap"><i class="fa fa-check"></i><b>8.8.3</b> Projecting cells onto annotated cell-types (scmap)</a></li>
<li class="chapter" data-level="8.8.4" data-path="biological-analysis.html"><a href="biological-analysis.html#cell-to-cell-mapping"><i class="fa fa-check"></i><b>8.8.4</b> Cell-to-Cell mapping</a></li>
<li class="chapter" data-level="8.8.5" data-path="biological-analysis.html"><a href="biological-analysis.html#metaneighbour"><i class="fa fa-check"></i><b>8.8.5</b> Metaneighbour</a></li>
<li class="chapter" data-level="8.8.6" data-path="biological-analysis.html"><a href="biological-analysis.html#mnncorrect-1"><i class="fa fa-check"></i><b>8.8.6</b> mnnCorrect</a></li>
<li class="chapter" data-level="8.8.7" data-path="biological-analysis.html"><a href="biological-analysis.html#cannonical-correlation-analysis-seurat"><i class="fa fa-check"></i><b>8.8.7</b> Canonical Correlation Analysis (Seurat)</a></li>
<li class="chapter" data-level="8.8.8" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-10"><i class="fa fa-check"></i><b>8.8.8</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="8.9" data-path="biological-analysis.html"><a href="biological-analysis.html#search-scrna-seq-data"><i class="fa fa-check"></i><b>8.9</b> Search scRNA-Seq data</a><ul>
<li class="chapter" data-level="8.9.1" data-path="biological-analysis.html"><a href="biological-analysis.html#about"><i class="fa fa-check"></i><b>8.9.1</b> About</a></li>
<li class="chapter" data-level="8.9.2" data-path="biological-analysis.html"><a href="biological-analysis.html#dataset"><i class="fa fa-check"></i><b>8.9.2</b> Dataset</a></li>
<li class="chapter" data-level="8.9.3" data-path="biological-analysis.html"><a href="biological-analysis.html#gene-index"><i class="fa fa-check"></i><b>8.9.3</b> Gene Index</a></li>
<li class="chapter" data-level="8.9.4" data-path="biological-analysis.html"><a href="biological-analysis.html#marker-genes"><i class="fa fa-check"></i><b>8.9.4</b> Marker genes</a></li>
<li class="chapter" data-level="8.9.5" data-path="biological-analysis.html"><a href="biological-analysis.html#search-cells-by-a-gene-list"><i class="fa fa-check"></i><b>8.9.5</b> Search cells by a gene list</a></li>
<li class="chapter" data-level="8.9.6" data-path="biological-analysis.html"><a href="biological-analysis.html#sessioninfo-11"><i class="fa fa-check"></i><b>8.9.6</b> sessionInfo()</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="9" data-path="seurat-chapter.html"><a href="seurat-chapter.html"><i class="fa fa-check"></i><b>9</b> Seurat</a><ul>
<li class="chapter" data-level="9.1" data-path="seurat-chapter.html"><a href="seurat-chapter.html#seurat-object-class"><i class="fa fa-check"></i><b>9.1</b> <code>Seurat</code> object class</a></li>
<li class="chapter" data-level="9.2" data-path="seurat-chapter.html"><a href="seurat-chapter.html#expression-qc"><i class="fa fa-check"></i><b>9.2</b> Expression QC</a></li>
<li class="chapter" data-level="9.3" data-path="seurat-chapter.html"><a href="seurat-chapter.html#normalization"><i class="fa fa-check"></i><b>9.3</b> Normalization</a></li>
<li class="chapter" data-level="9.4" data-path="seurat-chapter.html"><a href="seurat-chapter.html#highly-variable-genes-1"><i class="fa fa-check"></i><b>9.4</b> Highly variable genes</a></li>
<li class="chapter" data-level="9.5" data-path="seurat-chapter.html"><a href="seurat-chapter.html#dealing-with-confounders-1"><i class="fa fa-check"></i><b>9.5</b> Dealing with confounders</a></li>
<li class="chapter" data-level="9.6" data-path="seurat-chapter.html"><a href="seurat-chapter.html#linear-dimensionality-reduction"><i class="fa fa-check"></i><b>9.6</b> Linear dimensionality reduction</a></li>
<li class="chapter" data-level="9.7" data-path="seurat-chapter.html"><a href="seurat-chapter.html#significant-pcs"><i class="fa fa-check"></i><b>9.7</b> Significant PCs</a></li>
<li class="chapter" data-level="9.8" data-path="seurat-chapter.html"><a href="seurat-chapter.html#clustering-cells"><i class="fa fa-check"></i><b>9.8</b> Clustering cells</a></li>
<li class="chapter" data-level="9.9" data-path="seurat-chapter.html"><a href="seurat-chapter.html#marker-genes-1"><i class="fa fa-check"></i><b>9.9</b> Marker genes</a></li>
<li class="chapter" data-level="9.10" data-path="seurat-chapter.html"><a href="seurat-chapter.html#sessioninfo-12"><i class="fa fa-check"></i><b>9.10</b> sessionInfo()</a></li>
</ul></li>
<li class="chapter" data-level="10" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><i class="fa fa-check"></i><b>10</b> “Ideal” scRNAseq pipeline (as of Oct 2017)</a><ul>
<li class="chapter" data-level="10.1" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html#experimental-design"><i class="fa fa-check"></i><b>10.1</b> Experimental Design</a></li>
<li class="chapter" data-level="10.2" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html#processing-reads"><i class="fa fa-check"></i><b>10.2</b> Processing Reads</a></li>
<li class="chapter" data-level="10.3" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html#preparing-expression-matrix"><i class="fa fa-check"></i><b>10.3</b> Preparing Expression Matrix</a></li>
<li class="chapter" data-level="10.4" data-path="ideal-scrnaseq-pipeline-as-of-oct-2017.html"><a href="ideal-scrnaseq-pipeline-as-of-oct-2017.html#biological-interpretation"><i class="fa fa-check"></i><b>10.4</b> Biological Interpretation</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="advanced-exercises.html"><a href="advanced-exercises.html"><i class="fa fa-check"></i><b>11</b> Advanced exercises</a></li>
<li class="chapter" data-level="12" data-path="resources.html"><a href="resources.html"><i class="fa fa-check"></i><b>12</b> Resources</a><ul>
<li class="chapter" data-level="12.1" data-path="resources.html"><a href="resources.html#scrna-seq-protocols"><i class="fa fa-check"></i><b>12.1</b> scRNA-seq protocols</a></li>
<li class="chapter" data-level="12.2" data-path="resources.html"><a href="resources.html#external-rna-control-consortium-ercc"><i class="fa fa-check"></i><b>12.2</b> External RNA Control Consortium (ERCC)</a></li>
<li class="chapter" data-level="12.3" data-path="resources.html"><a href="resources.html#scrna-seq-analysis-tools"><i class="fa fa-check"></i><b>12.3</b> scRNA-seq analysis tools</a></li>
<li class="chapter" data-level="12.4" data-path="resources.html"><a href="resources.html#scrna-seq-public-datasets"><i class="fa fa-check"></i><b>12.4</b> scRNA-seq public datasets</a></li>
</ul></li>
<li class="chapter" data-level="13" data-path="references.html"><a href="references.html"><i class="fa fa-check"></i><b>13</b> References</a></li>
<li class="divider"></li>
<li><a href="http://www.sanger.ac.uk/science/groups/hemberg-group" target="_blank" rel="noopener">Hemberg Lab</a></li>

</ul>

      </nav>
    </div>

    <div class="book-body">
      <div class="body-inner">
        <div class="book-header" role="navigation">
          <h1>
            <i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Analysis of single cell RNA-seq data</a>
          </h1>
        </div>

        <div class="page-wrapper" tabindex="-1" role="main">
          <div class="page-inner">

            <section class="normal" id="section-">
<div id="processing-raw-scrna-seq-data" class="section level1">
<h1><span class="header-section-number">3</span> Processing Raw scRNA-seq Data</h1>
<div id="fastqc" class="section level2">
<h2><span class="header-section-number">3.1</span> FastQC</h2>
<p>Once you’ve obtained your single-cell RNA-seq data, the first thing you need to do with it is check the quality of the reads you have sequenced. For this task, today we will be using a tool called FastQC. FastQC is a quality control tool for sequencing data, which can be used for both bulk and single-cell RNA-seq data. FastQC takes sequencing data as input and returns a report on read quality. Copy and paste this link into your browser to visit the FastQC website:</p>
<p><a href="https://www.bioinformatics.babraham.ac.uk/projects/fastqc/" class="uri">https://www.bioinformatics.babraham.ac.uk/projects/fastqc/</a></p>
<p>This website contains links to download and install FastQC and documentation on the reports produced. Fortunately we have already installed FastQC for you today, so instead we will take a look at the documentation. Scroll down the webpage to ‘Example Reports’ and click ‘Good Illumina Data’. This gives an example of what an ideal report should look like for high quality Illumina reads data.</p>
<p>Now let’s make a FastQC report ourselves.</p>
<p>Today we will be performing our analysis using a single cell from an mESC dataset produced by <span class="citation">(Kolodziejczyk et al. <a href="#ref-Kolodziejczyk2015-xy">2015</a>)</span>. The cells were sequenced using the SMART-seq2 library preparation protocol and the reads are paired end. The files are located in <code>Share</code>.</p>
<p><strong>Note</strong> The current text of the course is written for an AWS server for people who attend our course in person. You will have to download the files (both <code>ERR522959_1.fastq</code> and <code>ERR522959_2.fastq</code>) and create the <code>Share</code> directory yourself to run the commands. You can find the files here: <a href="https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-2600/samples/" class="uri">https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-2600/samples/</a></p>
<p>Now let’s look at the files:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">less</span> Share/ERR522959_1.fastq
<span class="fu">less</span> Share/ERR522959_2.fastq</code></pre></div>
<p>Task 1: Try to work out what command you should use to produce the FastQC report. Hint: Try executing</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="ex">fastqc</span> -h</code></pre></div>
<p>This command will tell you what options are available to pass to FastQC. Feel free to ask for help if you get stuck! If you are successful, you should generate a .zip and a .html file for both the forwards and the reverse reads files. Once you have been successful, feel free to have a go at the next section.</p>
<div id="solution-and-downloading-the-report" class="section level3">
<h3><span class="header-section-number">3.1.1</span> Solution and Downloading the Report</h3>
<p>If you haven’t done so already, generate the FastQC report using the commands below:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> fastqc_results
<span class="ex">fastqc</span> -o fastqc_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq</code></pre></div>
<p>Once the command has finished executing, you should have a total of four files - one zip file for each of the paired end reads, and one html file for each of the paired end reads. The report is in the html file. To view it, we will need to get it off AWS and onto your computer using either filezilla or scp. Ask an instructor if you are having difficulties.</p>
<p>Once the file is on your computer, click on it. Your FastQC report should open. Have a look through the file. Remember to look at both the forwards and the reverse end read reports! How good is the quality of the reads? Is there anything we should be concerned about? How might we address those concerns?</p>
<p>Feel free to chat to one of the instructors about your ideas.</p>
</div>
</div>
<div id="trimming-reads" class="section level2">
<h2><span class="header-section-number">3.2</span> Trimming Reads</h2>
<p>Fortunately there is software available for read trimming. Today we will be using Trim Galore!. Trim Galore! is a wrapper for the reads trimming software cutadapt.</p>
<p>Read trimming software can be used to trim sequencing adapters and/or low quality bases from the ends of reads. Given that we noticed there was some adaptor contamination in our FastQC report, it is a good idea to trim adaptors from our data.</p>
<p>Task 2: What type of adapters were used in our data? Hint: Look at the FastQC report ‘Adapter Content’ plot.</p>
<p>Now let’s try to use Trim Galore! to remove those problematic adapters. It’s a good idea to check read quality again after trimming, so after you have trimmed your reads you should use FastQC to produce another report.</p>
<p>Task 3: Work out the command you should use to trim the adapters from our data. Hint 1: You can use</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="ex">trim_galore</span> -h</code></pre></div>
<p>to find out what options you can pass to Trim Galore!. Hint 2: Read through the output of the above command carefully. The adaptor used in this experiment is quite common. Do you need to know the actual sequence of the adaptor to remove it?</p>
<p>Task 4: Produce a FastQC report for your trimmed reads files. Is the adapter contamination gone?</p>
<p>Once you think you have successfully trimmed your reads and have confirmed this by checking the FastQC report, feel free to check your results using the next section.</p>
<div id="solution" class="section level3">
<h3><span class="header-section-number">3.2.1</span> Solution</h3>
<p>You can use the command(s) below to trim the Nextera sequencing adapters:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> fastqc_trimmed_results
<span class="ex">trim_galore</span> --nextera -o fastqc_trimmed_results Share/ERR522959_1.fastq Share/ERR522959_2.fastq</code></pre></div>
<p>Remember to generate new FastQC reports for your trimmed reads files! FastQC should now show that your reads pass the ‘Adapter Content’ check. Feel free to ask one of the instructors if you have any questions.</p>
<p>Congratulations! You have now generated reads quality reports and performed adaptor trimming. In the next lab, we will use STAR and Kallisto to align our trimmed and quality-checked reads to a reference transcriptome.</p>

</div>
</div>
<div id="file-formats" class="section level2">
<h2><span class="header-section-number">3.3</span> File formats</h2>
<div id="fastq" class="section level3">
<h3><span class="header-section-number">3.3.1</span> FastQ</h3>
<p>FastQ is the most raw form of scRNASeq data you will encounter. All scRNASeq protocols are sequenced with paired-end sequencing. Barcode sequences may occur in one or both reads depending on the protocol employed. However, protocols using unique molecular identifiers (UMIs) will generally contain one read with the cell and UMI barcodes plus adapters but without any transcript sequence. Thus reads will be mapped as if they are single-end sequenced despite actually being paired end.</p>
<p>FastQ files have the format:</p>
<pre class="eval"><code>@ReadID
READ SEQUENCE
+
SEQUENCING QUALITY SCORES</code></pre>
</div>
<div id="bam" class="section level3">
<h3><span class="header-section-number">3.3.2</span> BAM</h3>
<p>BAM file format stores mapped reads in a standard and efficient manner. The human-readable version is called a SAM file, while the BAM file is the highly compressed version. BAM/SAM files contain a header which typically includes
information on the sample preparation, sequencing and mapping; and a tab-separated row for each individual alignment of each read.</p>
<p>Alignment rows employ a standard format with the following columns:</p>
<ol style="list-style-type: decimal">
<li><p>QNAME : read name (generally will include UMI barcode if applicable)</p></li>
<li><p>FLAG : number tag indicating the “type” of alignment, <a href="https://broadinstitute.github.io/picard/explain-flags.html">link</a> to explanation of all possible “types”</p></li>
<li><p>RNAME : reference sequence name (i.e. chromosome read is mapped to).</p></li>
<li><p>POS : leftmost mapping position</p></li>
<li><p>MAPQ : Mapping quality</p></li>
<li><p>CIGAR : string indicating the matching/mismatching parts of the read (may include soft-clipping).</p></li>
<li><p>RNEXT : reference name of the mate/next read</p></li>
<li><p>PNEXT : POS for mate/next read</p></li>
<li><p>TLEN : Template length (length of reference region the read is mapped to)</p></li>
<li><p>SEQ : read sequence</p></li>
<li><p>QUAL : read quality</p></li>
</ol>
<p>BAM/SAM files can be converted to the other format using ‘samtools’:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="ex">samtools</span> view -S -b file.sam <span class="op">&gt;</span> file.bam
<span class="ex">samtools</span> view -h file.bam <span class="op">&gt;</span> file.sam</code></pre></div>
<p>Some sequencing facilities will automatically map your reads to a standard genome and deliver either BAM or CRAM formatted files. Generally they will not have included ERCC sequences in the genome thus no ERCC reads will be mapped in the BAM/CRAM file. To quantify ERCCs (or any other genetic alterations) or if you just want to use a different alignment algorithm than whatever is in the generic pipeline (often outdated), then you will need to convert the BAM/CRAM files back to FastQs:</p>
<p>BAM files can be converted to FastQ using bedtools. To ensure a single copy for multi-mapping reads first sort by read name and remove secondary alignments using samtools. <a href="https://broadinstitute.github.io/picard/index.html">Picard</a> also contains a method for converting BAM to FastQ files.</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="co"># sort reads by name</span>
<span class="ex">samtools</span> sort -n original.bam -o sorted_by_name.bam
<span class="co"># remove secondary alignments</span>
<span class="ex">samtools</span> view -b -F 256 sorted_by_name.bam -o primary_alignment_only.bam
<span class="co"># convert to fastq</span>
<span class="ex">bedtools</span> bamtofastq -i primary_alignment_only.bam -fq read1.fq -fq2 read2.fq</code></pre></div>
</div>
<div id="cram" class="section level3">
<h3><span class="header-section-number">3.3.3</span> CRAM</h3>
<p><a href="https://www.ebi.ac.uk/ena/software/cram-usage">CRAM</a> files are similar to BAM files, except that their header contains information about the reference genome used in the mapping. This allows the bases in each read that are identical to the reference to be further compressed. CRAM also supports some lossy data compression approaches to further optimize storage compared to BAMs. CRAMs are mainly used by the Sanger/EBI sequencing facility.</p>
<p>CRAM and BAM files can be interchanged using the latest version of samtools (&gt;=v1.0). However, this conversion may require downloading the reference genome into cache. Alternatively, you may pre-download the correct reference either from metadata in the header of the CRAM file, or from talking to whomever generated the CRAM and specify that file using ‘-T’. Thus we recommend setting a specific cache location prior to doing this:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="bu">export</span> <span class="va">REF_CACHE=</span>/path_to/cache_directory_for_reference_genome
<span class="ex">samtools</span> view -b -h -T reference_genome.fasta file.cram -o file.bam
<span class="ex">samtools</span> view -C -h -T reference_genome.fasta file.bam -o file.cram</code></pre></div>
</div>
<div id="mannually-inspecting-files" class="section level3">
<h3><span class="header-section-number">3.3.4</span> Manually Inspecting files</h3>
<p>At times it may be useful to manually inspect files, for example to check the metadata in headers to confirm that the files are from the correct sample. ‘less’ and ‘more’ can be used to inspect any text files from the command line. By “pipe-ing” the output of samtools view into these commands using ‘|’ we can check each of these file types without having to save multiple copies of each file.</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">less</span> file.txt
<span class="fu">more</span> file.txt
<span class="co"># counts the number of lines in file.txt</span>
<span class="fu">wc</span> -l file.txt
<span class="ex">samtools</span> view -h file.[cram/bam] <span class="kw">|</span> <span class="fu">more</span>
<span class="co"># counts the number of lines in the samtools output</span>
<span class="ex">samtools</span> view -h file.[cram/bam] <span class="kw">|</span> <span class="fu">wc</span> -l</code></pre></div>
<p><strong>Exercises</strong></p>
<p>You have been provided with a small cram file: EXAMPLE.cram</p>
<p>Task 1: How was this file aligned? What software was used? What was used as the genome? (Hint: check the header)</p>
<p>Task 2: How many reads are unmapped/mapped? How many total reads are there? How many secondary alignments are present? (Hint: use the FLAG)</p>
<p>Task 3: Convert the CRAM into two Fastq files. Did you get exactly one copy of each read? (name these files “10cells_read1.fastq” “10cells_read2.fastq”)</p>
<p>If you get stuck, help information for each piece of software can be displayed by running the command “naked” - e.g. ‘samtools view’, ‘bedtools’</p>
<p><strong>Answer</strong></p>
</div>
<div id="genome-fasta-gtf" class="section level3">
<h3><span class="header-section-number">3.3.5</span> Genome (FASTA, GTF)</h3>
<p>To map your reads you will also need the reference genome and in many cases the genome annotation file (in either GTF or GFF format). These can be downloaded for model organisms from any of the main genomics databases: <a href="http://www.ensembl.org/info/data/ftp/index.html">Ensembl</a>, <a href="ftp://ftp.ncbi.nih.gov/genomes/">NCBI</a>, or <a href="http://hgdownload.soe.ucsc.edu/downloads.html">UCSC Genome Browser</a>.</p>
<p>GTF files contain annotations of genes, transcripts, and exons. They must contain: (1) seqname : chromosome/scaffold (2) source : where this annotation came from (3) feature : what kind of feature is this? (e.g. gene, transcript, exon) (4) start : start position (bp) (5) end : end position (bp) (6) score : a number (7) strand : + (forward) or - (reverse) (8) frame : if CDS indicates which base is the first base of the first codon (0 = first base, 1 = second base, etc..) (9) attribute : semicolon-separated list of tag-value pairs of extra information (e.g. names/IDs, biotype)</p>
<p>Empty fields are marked with “.”</p>
<p>In our experience Ensembl is the easiest of these to use, and has the largest set of annotations. NCBI tends to be more strict in including only high confidence gene annotations. Whereas UCSC contains multiple geneset annotations that use different criteria.</p>
<p>If your experimental system includes non-standard sequences these must be added to both the genome fasta and gtf to quantify their expression. Most commonly this is done for the ERCC spike-ins, although the same must be done for CRISPR-related sequences or other overexpression/reporter constructs.</p>
<p>For maximum utility/flexibility we recommend creating complete and detailed entries for any non-standard sequences added.</p>
<p>There is no standardized way to do this. So below is our custom perl script for creating a gtf and fasta file for ERCCs which can be appended to the genome. You may also need to alter a gtf file to deal with repetitive elements in introns when/if you want to quantify intronic reads. Any scripting language or even ‘awk’ and/or some text editors can be used to do this relatively efficiently, but they are beyond the scope of this course.</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="co"># Converts the Annotation file from </span>
<span class="co"># https://www.thermofisher.com/order/catalog/product/4456740 to </span>
<span class="co"># gtf and fasta files that can be added to existing genome fasta &amp; gtf files.</span>

<span class="ex">my</span> @FASTAlines = ();
<span class="ex">my</span> @GTFlines = ();
<span class="ex">open</span> (my <span class="va">$ifh</span>, <span class="st">&quot;ERCC_Controls_Annotation.txt&quot;</span>) <span class="ex">or</span> die <span class="va">$!</span><span class="kw">;</span>
<span class="op">&lt;</span><span class="va">$ifh</span><span class="op">&gt;</span>; <span class="co">#header</span>
<span class="kw">while</span> <span class="kw">(</span><span class="op">&lt;</span><span class="va">$ifh</span><span class="op">&gt;</span><span class="kw">)</span> <span class="kw">{</span>
    <span class="co"># Do all the important stuff</span>
    <span class="ex">chomp</span><span class="kw">;</span>
    <span class="ex">my</span> @record = split(/\t/);
    <span class="ex">my</span> <span class="va">$sequence</span> = <span class="va">$record[4]</span><span class="kw">;</span>
    <span class="va">$sequence</span> =<span class="ex">~</span> <span class="ex">s</span>/\<span class="ex">s+//g</span>; <span class="co"># get rid of any preceeding/tailing white space</span>
    <span class="va">$sequence</span> = <span class="va">$sequence</span><span class="ex">.</span><span class="st">&quot;NNNN&quot;</span><span class="kw">;</span>
    <span class="ex">my</span> <span class="va">$name</span> = <span class="va">$record[0]</span><span class="kw">;</span>
    <span class="ex">my</span> <span class="va">$genbank</span> = <span class="va">$record[1]</span><span class="kw">;</span>
    <span class="ex">push</span>(@FASTAlines, <span class="st">&quot;&gt;</span><span class="va">$name</span><span class="st">\n</span><span class="va">$sequence</span><span class="st">\n&quot;</span>);
<span class="co"># is GTF 1 indexed or 0 indexed? -&gt; it is 1 indexed</span>
<span class="co"># + or - strand?</span>
    <span class="ex">push</span>(@GTFlines, <span class="st">&quot;</span><span class="va">$name</span><span class="st">\tERCC\tgene\t1\t&quot;</span>.(length(<span class="va">$sequence</span>)<span class="ex">-2</span>)<span class="ex">.</span><span class="st">&quot;\t.\t+\t.\tgene_id </span><span class="dt">\&quot;</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\&quot;</span><span class="st">; transcript_id </span><span class="dt">\&quot;</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\&quot;</span><span class="st">; exon_number </span><span class="dt">\&quot;</span><span class="st">1</span><span class="dt">\&quot;</span><span class="st">; gene_name </span><span class="dt">\&quot;</span><span class="st">ERCC </span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\&quot;</span><span class="st">\n&quot;</span>);
    <span class="ex">push</span>(@GTFlines, <span class="st">&quot;</span><span class="va">$name</span><span class="st">\tERCC\ttranscript\t1\t&quot;</span>.(length(<span class="va">$sequence</span>)<span class="ex">-2</span>)<span class="ex">.</span><span class="st">&quot;\t.\t+\t.\tgene_id </span><span class="dt">\&quot;</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\&quot;</span><span class="st">; transcript_id </span><span class="dt">\&quot;</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\&quot;</span><span class="st">; exon_number </span><span class="dt">\&quot;</span><span class="st">1</span><span class="dt">\&quot;</span><span class="st">; gene_name </span><span class="dt">\&quot;</span><span class="st">ERCC </span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\&quot;</span><span class="st">\n&quot;</span>);
    <span class="ex">push</span>(@GTFlines, <span class="st">&quot;</span><span class="va">$name</span><span class="st">\tERCC\texon\t1\t&quot;</span>.(length(<span class="va">$sequence</span>)<span class="ex">-2</span>)<span class="ex">.</span><span class="st">&quot;\t.\t+\t.\tgene_id </span><span class="dt">\&quot;</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\&quot;</span><span class="st">; transcript_id </span><span class="dt">\&quot;</span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\&quot;</span><span class="st">; exon_number </span><span class="dt">\&quot;</span><span class="st">1</span><span class="dt">\&quot;</span><span class="st">; gene_name </span><span class="dt">\&quot;</span><span class="st">ERCC </span><span class="va">$name</span><span class="st">-</span><span class="va">$genbank</span><span class="dt">\&quot;</span><span class="st">\n&quot;</span>);
<span class="kw">}</span> <span class="ex">close</span>(<span class="va">$ifh</span>);

<span class="co"># Write output</span>
<span class="ex">open</span>(my <span class="va">$ofh</span>, <span class="st">&quot;&gt;&quot;</span>, <span class="st">&quot;ERCC_Controls.fa&quot;</span>) <span class="ex">or</span> die <span class="va">$!</span><span class="kw">;</span>
<span class="ex">foreach</span> my <span class="va">$line</span> (@FASTAlines) <span class="kw">{</span>
    <span class="ex">print</span> <span class="va">$ofh</span> <span class="va">$line</span><span class="kw">;</span>
<span class="kw">}</span> <span class="ex">close</span> (<span class="va">$ofh</span>);

<span class="ex">open</span>(<span class="va">$ofh</span>, <span class="st">&quot;&gt;&quot;</span>, <span class="st">&quot;ERCC_Controls.gtf&quot;</span>) <span class="ex">or</span> die <span class="va">$!</span><span class="kw">;</span>
<span class="ex">foreach</span> my <span class="va">$line</span> (@GTFlines) <span class="kw">{</span>
    <span class="ex">print</span> <span class="va">$ofh</span> <span class="va">$line</span><span class="kw">;</span>
<span class="kw">}</span> <span class="ex">close</span> (<span class="va">$ofh</span>);</code></pre></div>
</div>
</div>
<div id="demultiplexing" class="section level2">
<h2><span class="header-section-number">3.4</span> Demultiplexing</h2>
<p>Demultiplexing is done differently depending on the protocol used and on the particular pipeline you are using. The most flexible demultiplexing pipeline we are aware of is <a href="https://github.com/sdparekh/zUMIs/wiki/Usage">zUMIs</a> which can be used to demultiplex and map most UMI-based protocols. For Smartseq2 or other paired-end full transcript protocols the data will usually already be demultiplexed. Public repositories such as GEO or ArrayExpress require small-scale/plate-based scRNASeq data to be demultiplexed prior to upload, and many sequencing facilities will automatically demultiplex data before returning it to you. If you aren’t using a published pipeline and the data was not demultiplexed by the sequencing facility you will have to demultiplex it yourself. This usually requires writing a custom script since barcodes may be of different lengths and different locations in the reads depending on the protocols used.</p>
<p>For all data types, “demultiplexing” involves identifying and removing the cell-barcode sequence from one or both reads. If the expected cell-barcodes are known ahead of time, i.e. the data is from a PCR-plate-based protocol, all that is necessary is to compare each cell-barcode to the expected barcodes and assign the associated reads to the closest cell-barcode (with maximum mismatches of 1 or 2 depending on the design of the cell-barcodes). These data are generally demultiplexed prior to mapping as an easy way of parallelizing the mapping step.</p>
<p>We have <a href="https://github.com/tallulandrews/scRNASeqPipeline">publicly available</a> perl scripts capable of demultiplexing any scRNASeq data with a single cell-barcode with or without UMIs for plate-based protocols. These can be used as so:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">perl</span> 1_Flexible_UMI_Demultiplexing.pl 10cells_read1.fq 10cells_read2.fq <span class="st">&quot;C12U8&quot;</span> 10cells_barcodes.txt 2 Ex</code></pre></div>
<pre><code>## 
##  Doesn&#39;t match any cell: 0
##  Ambiguous: 0
##  Exact Matches: 400
##  Contain mismatches: 0
##  Input Reads: 400
##  Output Reads: 400
## Barcode Structure: 12 bp CellID followed by 8 bp UMI</code></pre>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">perl</span> 1_Flexible_FullTranscript_Demultiplexing.pl 10cells_read1.fq 10cells_read2.fq <span class="st">&quot;start&quot;</span> 12 10cells_barcodes.txt 2 Ex</code></pre></div>
<pre><code>## 
## Doesn&#39;t match any cell: 0
## Ambiguous: 0
## Exact Matches: 400
## Contain Mismatches: 0
## Input Reads: 400
## Output Reads: 400</code></pre>
<p>For UMI containing data, demultiplexing includes attaching the UMI code to the read name of the gene-body containing read. If the data are from a droplet-based protocol or SeqWell where the number of expected barcodes is much higher than the expected number of cells, then usually the cell-barcode will also be attached to the read name to avoid generating a very large number of files. In these cases, demultiplexing will happen during the quantification step to facilitate the identification of cell-barcodes which correspond to intact cells rather than background noise.</p>
<div id="identifying-cell-containing-dropletsmicrowells" class="section level3">
<h3><span class="header-section-number">3.4.1</span> Identifying cell-containing droplets/microwells</h3>
<p>For droplet based methods only a fraction of droplets contain both beads and an intact cell. However, biology experiments are messy and some RNA will leak out of dead/damaged cells. So droplets without an intact cell are likely to capture a small amount of the ambient RNA which will end up in the sequencing library and contribute reads to the final sequencing output. The variation in droplet size, amplification efficiency, and sequencing will lead both “background” and real cells to have a wide range of library sizes. Various approaches have been used to try to distinguish those cell barcodes which correspond to real cells.</p>
<p>Most methods use the total molecules (could be applied to total reads) per barcode and try to find a “break point” between bigger libraries which are cells + some background and smaller libraries assumed to be purely background. Let’s load some example simulated data which contain both large and small cells:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">umi_per_barcode &lt;-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">&quot;droplet_id_example_per_barcode.txt.gz&quot;</span>)
truth &lt;-<span class="st"> </span><span class="kw">read.delim</span>(<span class="st">&quot;droplet_id_example_truth.gz&quot;</span>, <span class="dt">sep=</span><span class="st">&quot;,&quot;</span>)</code></pre></div>
<p><strong>Exercise</strong> How many unique barcodes were detected? How many true cells are present in the data? To simplify calculations for this section exclude all barcodes with fewer than 10 total molecules.</p>
<p><strong>Answer</strong></p>
<p>One approach is to look for the inflection point where the total molecules per barcode suddenly drops:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">barcode_rank &lt;-<span class="st"> </span><span class="kw">rank</span>(<span class="op">-</span>umi_per_barcode[,<span class="dv">2</span>])
<span class="kw">plot</span>(barcode_rank, umi_per_barcode[,<span class="dv">2</span>], <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-11-1.png" width="672" /></p>
<p>Here we can see a roughly exponential curve of library sizes, so to make things simpler let’s log-transform them.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">log_lib_size &lt;-<span class="st"> </span><span class="kw">log10</span>(umi_per_barcode[,<span class="dv">2</span>])
<span class="kw">plot</span>(barcode_rank, log_lib_size, <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-12-1.png" width="672" /> That’s better, the “knee” in the distribution is much more pronounced. We could manually estimate where the “knee” is but it is much more reproducible to algorithmically identify this point.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># inflection point</span>
o &lt;-<span class="st"> </span><span class="kw">order</span>(barcode_rank)
log_lib_size &lt;-<span class="st"> </span>log_lib_size[o]
barcode_rank &lt;-<span class="st"> </span>barcode_rank[o]

rawdiff &lt;-<span class="st"> </span><span class="kw">diff</span>(log_lib_size)<span class="op">/</span><span class="kw">diff</span>(barcode_rank)
inflection &lt;-<span class="st"> </span><span class="kw">which</span>(rawdiff <span class="op">==</span><span class="st"> </span><span class="kw">min</span>(rawdiff[<span class="dv">100</span><span class="op">:</span><span class="kw">length</span>(rawdiff)], <span class="dt">na.rm=</span><span class="ot">TRUE</span>))

<span class="kw">plot</span>(barcode_rank, log_lib_size, <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))
<span class="kw">abline</span>(<span class="dt">v=</span>inflection, <span class="dt">col=</span><span class="st">&quot;red&quot;</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-13-1.png" width="672" /></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">threshold &lt;-<span class="st"> </span><span class="dv">10</span><span class="op">^</span>log_lib_size[inflection]

cells &lt;-<span class="st"> </span>umi_per_barcode[umi_per_barcode[,<span class="dv">2</span>] <span class="op">&gt;</span><span class="st"> </span>threshold,<span class="dv">1</span>]
TPR &lt;-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(cells)
Recall &lt;-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(truth[,<span class="dv">1</span>])
<span class="kw">c</span>(TPR, Recall)</code></pre></div>
<pre><code>## [1] 1.0000000 0.7831707</code></pre>
<p>Another is to fit a mixture model and find where the higher and lower distributions intersect. However, data may not fit the assumed distributions very well:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">set.seed</span>(<span class="op">-</span><span class="dv">92497</span>)
<span class="co"># mixture model</span>
<span class="kw">require</span>(<span class="st">&quot;mixtools&quot;</span>)</code></pre></div>
<pre><code>## Loading required package: mixtools</code></pre>
<pre><code>## mixtools package, version 1.1.0, Released 2017-03-10
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">mix &lt;-<span class="st"> </span><span class="kw">normalmixEM</span>(log_lib_size)</code></pre></div>
<pre><code>## number of iterations= 43</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">plot</span>(mix, <span class="dt">which=</span><span class="dv">2</span>, <span class="dt">xlab2=</span><span class="st">&quot;log(mol per cell)&quot;</span>)</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-14-1.png" width="672" /></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">p1 &lt;-<span class="st"> </span><span class="kw">dnorm</span>(log_lib_size, <span class="dt">mean=</span>mix<span class="op">$</span>mu[<span class="dv">1</span>], <span class="dt">sd=</span>mix<span class="op">$</span>sigma[<span class="dv">1</span>])
p2 &lt;-<span class="st"> </span><span class="kw">dnorm</span>(log_lib_size, <span class="dt">mean=</span>mix<span class="op">$</span>mu[<span class="dv">2</span>], <span class="dt">sd=</span>mix<span class="op">$</span>sigma[<span class="dv">2</span>])
<span class="cf">if</span> (mix<span class="op">$</span>mu[<span class="dv">1</span>] <span class="op">&lt;</span><span class="st"> </span>mix<span class="op">$</span>mu[<span class="dv">2</span>]) {
    split &lt;-<span class="st"> </span><span class="kw">min</span>(log_lib_size[p2 <span class="op">&gt;</span><span class="st"> </span>p1])
} <span class="cf">else</span> {
    split &lt;-<span class="st"> </span><span class="kw">min</span>(log_lib_size[p1 <span class="op">&gt;</span><span class="st"> </span>p2])
}</code></pre></div>
<p><strong>Exercise</strong> Identify cells using this split point and calculate the TPR and Recall.</p>
<p><strong>Answer</strong></p>
<p>A third, used by CellRanger, assumes a ~10-fold range of library sizes for real cells and estimates this range using the expected number of cells.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">n_cells &lt;-<span class="st"> </span><span class="kw">length</span>(truth[,<span class="dv">1</span>])
<span class="co"># CellRanger</span>
totals &lt;-<span class="st"> </span>umi_per_barcode[,<span class="dv">2</span>]
totals &lt;-<span class="st"> </span><span class="kw">sort</span>(totals, <span class="dt">decreasing =</span> <span class="ot">TRUE</span>)
<span class="co"># 99th percentile of top n_cells divided by 10</span>
thresh =<span class="st"> </span>totals[<span class="kw">round</span>(<span class="fl">0.01</span><span class="op">*</span>n_cells)]<span class="op">/</span><span class="dv">10</span>
<span class="kw">plot</span>(totals, <span class="dt">xlim=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">8000</span>))
<span class="kw">abline</span>(<span class="dt">h=</span>thresh, <span class="dt">col=</span><span class="st">&quot;red&quot;</span>, <span class="dt">lwd=</span><span class="dv">2</span>)</code></pre></div>
<p><img src="05-L1-process-raw_files/figure-html/unnamed-chunk-16-1.png" width="672" /> <strong>Exercise</strong> Identify cells using this threshold and calculate the TPR and Recall.</p>
<p><strong>Answer</strong></p>
<p>Finally <a href="https://github.com/MarioniLab/DropletUtils">EmptyDrops</a>, which is currently in beta testing, uses the full genes x cells molecule count matrix for all droplets and estimates the profile of “background” RNA from those droplets with extremely low counts, then looks for cells with gene-expression profiles which differ from the background. This is combined with an inflection point method since background RNA often looks very similar to the expression profile of the largest cells in a population. As such EmptyDrops is the only method able to identify barcodes for very small cells in highly diverse samples.</p>
<p>Below we have provided code for how this method is currently run: (We will update this page when the method is officially released)</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">require</span>(<span class="st">&quot;Matrix&quot;</span>)
raw.counts &lt;-<span class="st"> </span><span class="kw">readRDS</span>(<span class="st">&quot;droplet_id_example.rds&quot;</span>)

<span class="kw">require</span>(<span class="st">&quot;DropletUtils&quot;</span>)
<span class="co"># emptyDrops</span>
<span class="kw">set.seed</span>(<span class="dv">100</span>)
e.out &lt;-<span class="st"> </span><span class="kw">emptyDrops</span>(raw.counts)
is.cell &lt;-<span class="st"> </span>e.out<span class="op">$</span>FDR <span class="op">&lt;=</span><span class="st"> </span><span class="fl">0.01</span>
<span class="kw">sum</span>(is.cell, <span class="dt">na.rm=</span><span class="ot">TRUE</span>)
<span class="kw">plot</span>(e.out<span class="op">$</span>Total, <span class="op">-</span>e.out<span class="op">$</span>LogProb, <span class="dt">col=</span><span class="kw">ifelse</span>(is.cell, <span class="st">&quot;red&quot;</span>, <span class="st">&quot;black&quot;</span>),
    <span class="dt">xlab=</span><span class="st">&quot;Total UMI count&quot;</span>, <span class="dt">ylab=</span><span class="st">&quot;-Log Probability&quot;</span>)

cells &lt;-<span class="st"> </span><span class="kw">colnames</span>(raw.counts)[is.cell]

TPR &lt;-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(cells)
Recall &lt;-<span class="st"> </span><span class="kw">sum</span>(cells <span class="op">%in%</span><span class="st"> </span>truth[,<span class="dv">1</span>])<span class="op">/</span><span class="kw">length</span>(truth[,<span class="dv">1</span>])
<span class="kw">c</span>(TPR, Recall)</code></pre></div>

</div>
</div>
<div id="using-star-to-align-reads" class="section level2">
<h2><span class="header-section-number">3.5</span> Using STAR to Align Reads</h2>
<p>Now we have trimmed our reads and established that they are of good quality, we would like to map them to a reference genome. This process is known as alignment. Some form of alignment is generally required if we want to quantify gene expression or find genes which are differentially expressed between samples.</p>
<p>Many tools have been developed for read alignment, but today we will focus on two. The first tool we will consider is STAR <span class="citation">(<span class="citeproc-not-found" data-reference-id="dobin"><strong>???</strong></span>)</span>. For each read in our reads data, STAR tries to find the longest possible sequence which matches one or more sequences in the reference genome. For example, in the figure below, we have a read (blue) which spans two exons and an alternative splicing junction (purple). STAR finds that the first part of the read is the same as the sequence of the first exon, whilst the second part of the read matches the sequence in the second exon. Because STAR is able to recognise splicing events in this way, it is described as a ‘splice aware’ aligner.</p>
<div class="figure">
<img src="L1-images/STAR_explanation.png" alt="Figure 1: Diagram of how STAR performs alignments, taken from Dobin et al." />
<p class="caption">Figure 1: Diagram of how STAR performs alignments, taken from Dobin et al.</p>
</div>
<p>Usually STAR aligns reads to a reference genome, potentially allowing it to detect novel splicing events or chromosomal rearrangements. However, one issue with STAR is that it needs a lot of RAM, especially if your reference genome is large (e.g. mouse and human). To speed up our analysis today, we will use STAR to align reads to a reference transcriptome of 2000 transcripts. Note that this is NOT normal or recommended practice, we only do it here for reasons of time. We recommend that normally you should align to a reference genome.</p>
<p>Two steps are required to perform STAR alignment. In the first step, the user provides STAR with reference genome sequences (FASTA) and annotations (GTF), which STAR uses to create a genome index. In the second step, STAR maps the user’s reads data to the genome index.</p>
<p>Let’s create the index now. Remember, for reasons of time we are aligning to a transcriptome rather than a genome today, meaning we only need to provide STAR with the sequences of the transcripts we will be aligning reads to. You can obtain transcriptomes for many model organisms from Ensembl (<a href="https://www.ensembl.org/info/data/ftp/index.html" class="uri">https://www.ensembl.org/info/data/ftp/index.html</a>).</p>
<p>Task 1: Execute the commands below to create the index:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> indices
<span class="fu">mkdir</span> indices/STAR
<span class="ex">STAR</span> --runThreadN 4 --runMode genomeGenerate --genomeDir indices/STAR --genomeFastaFiles Share/2000_reference.transcripts.fa</code></pre></div>
<p>Task 2: What does each of the options we used do? Hint: Use the STAR manual to help you (<a href="https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf" class="uri">https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf</a>)</p>
<p>Task 3: How would the command we used in Task 1 be different if we were aligning to the genome rather than the transcriptome?</p>
<p>Now that we have created the index, we can perform the mapping step.</p>
<p>Task 4: Try to work out what command you should use to map our trimmed reads (from ERR522959) to the index you created. Use the STAR manual to help you. Once you think you know the answer, check whether it matches the solution in the next section and execute the alignment.</p>
<p>Task 5: Try to understand the output of your alignment. Talk to one of the instructors if you need help!</p>
<div id="solution-for-star-alignment" class="section level3">
<h3><span class="header-section-number">3.5.1</span> Solution for STAR Alignment</h3>
<p>You can use the following commands to perform the mapping step:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> results
<span class="fu">mkdir</span> results/STAR

<span class="ex">STAR</span> --runThreadN 4 --genomeDir indices/STAR --readFilesIn Share/ERR522959_1.fastq Share/ERR522959_2.fastq --outFileNamePrefix results/STAR/</code></pre></div>
</div>
</div>
<div id="kallisto-and-pseudo-alignment" class="section level2">
<h2><span class="header-section-number">3.6</span> Kallisto and Pseudo-Alignment</h2>
<p>STAR is a read aligner, whereas Kallisto is a pseudo-aligner <span class="citation">(Bray et al. <a href="#ref-bray_2016">2016</a>)</span>. The main difference between aligners and pseudo-aligners is that whereas aligners map reads to a reference, pseudo-aligners map k-mers to a reference.</p>
<div id="what-is-a-k-mer" class="section level3">
<h3><span class="header-section-number">3.6.1</span> What is a k-mer?</h3>
<p>A k-mer is a sequence of length k derived from a read. For example, imagine we have a read with the sequence ATCCCGGGTTAT and we want to make 7-mers from it. To do this, we would find the first 7-mer by counting the first seven bases of the read. We would find the second 7-mer by moving one base along, then counting the next seven bases. Figure 2 shows all the 7-mers that could be derived from our read:</p>
<div class="figure">
<img src="L1-images/Kmers.png" alt="Figure 2: The 7-mers derived from an example read" />
<p class="caption">Figure 2: The 7-mers derived from an example read</p>
</div>
</div>
<div id="why-map-k-mers-rather-than-reads" class="section level3">
<h3><span class="header-section-number">3.6.2</span> Why map k-mers rather than reads?</h3>
<p>There are two main reasons:</p>
<ol style="list-style-type: decimal">
<li><p>Pseudo-aligners use k-mers and a computational trick to make pseudo-alignment much faster than traditional aligners. If you are interested in how this is achieved, see (Bray et al., 2016) for details.</p></li>
<li><p>Under some circumstances, pseudo-aligners may be able to cope better with sequencing errors than traditional aligners. For example, imagine there was a sequencing error in the first base of the read above and the A was actually a T. This would affect the pseudo-aligner’s ability to map the first 7-mer but none of the following 7-mers.</p></li>
</ol>
</div>
<div id="kallistos-pseudo-mode" class="section level3">
<h3><span class="header-section-number">3.6.3</span> Kallisto’s pseudo mode</h3>
<p>Kallisto has a specially designed mode for pseudo-aligning reads from single-cell RNA-seq experiments. Unlike STAR, Kallisto pseudo-aligns to a reference transcriptome rather than a reference genome. This means Kallisto maps reads to splice isoforms rather than genes. Mapping reads to isoforms rather than genes is especially challenging for single-cell RNA-seq for the following reasons:</p>
<ul>
<li>Single-cell RNA-seq is lower coverage than bulk RNA-seq, meaning the total amount of information available from reads is reduced.</li>
<li>Many single-cell RNA-seq protocols have 3’ coverage bias, meaning if two isoforms differ only at their 5’ end, it might not be possible to work out which isoform the read came from.</li>
<li>Some single-cell RNA-seq protocols have short read lengths, which can also mean it is not possible to work out which isoform the read came from.</li>
</ul>
<p>Kallisto’s pseudo mode takes a slightly different approach to pseudo-alignment. Instead of aligning to isoforms, Kallisto aligns to equivalence classes. Essentially, this means if a read maps to multiple isoforms, Kallisto records the read as mapping to an equivalence class containing all the isoforms it maps to. Equivalence class counts can then be used in downstream analysis such as clustering, in place of gene or isoform expression estimates. Figure 3 shows a diagram which helps explain this.</p>
<div class="figure">
<img src="L1-images/TCC.jpg" alt="Figure 3: A diagram explaining Kallisto’s Equivalence Classes, taken from https://pachterlab.github.io/kallisto/singlecell.html." />
<p class="caption">Figure 3: A diagram explaining Kallisto’s Equivalence Classes, taken from <a href="https://pachterlab.github.io/kallisto/singlecell.html" class="uri">https://pachterlab.github.io/kallisto/singlecell.html</a>.</p>
</div>
<p>Today we will just perform pseudo-alignment with one cell, but Kallisto is also capable of pseudo-aligning multiple cells simultaneously and using information from UMIs. See <a href="https://pachterlab.github.io/kallisto/manual" class="uri">https://pachterlab.github.io/kallisto/manual</a> for details.</p>
<p>As for STAR, you will need to produce an index for Kallisto before the pseudo-alignment step.</p>
<p>Task 6: Use the below command to produce the Kallisto index. Use the Kallisto manual (<a href="https://pachterlab.github.io/kallisto/manual" class="uri">https://pachterlab.github.io/kallisto/manual</a>) to work out what the options do in this command.</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> indices/Kallisto
<span class="ex">kallisto</span> index -i indices/Kallisto/transcripts.idx Share/2000_reference.transcripts.fa</code></pre></div>
<p>Task 7: Use the Kallisto manual to work out what command to use to perform pseudo-alignment. Once you think you know the answer, check whether it matches the solution in the next section and execute the pseudo-alignment.</p>
</div>
<div id="solution-to-kallisto-pseudo-alignment" class="section level3">
<h3><span class="header-section-number">3.6.4</span> Solution to Kallisto Pseudo-Alignment</h3>
<p>Use the command below to perform pseudo-alignment:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash"><span class="fu">mkdir</span> results/Kallisto
<span class="ex">kallisto</span> pseudo -i indices/Kallisto/transcripts.idx -o results/Kallisto -b batch.txt </code></pre></div>
<p>See <a href="https://pachterlab.github.io/kallisto/manual" class="uri">https://pachterlab.github.io/kallisto/manual</a> for instructions on creating batch.txt, or ask an instructor if you get stuck.</p>
</div>
<div id="understanding-the-output-of-kallisto-pseudo-alignment" class="section level3">
<h3><span class="header-section-number">3.6.5</span> Understanding the Output of Kallisto Pseudo-Alignment</h3>
<p>The command above should produce 4 files - matrix.cells, matrix.ec, matrix.tsv and run_info.json.</p>
<ul>
<li>matrix.cells contains a list of cell IDs. As we only used one cell, this file should just contain “ERR522959”</li>
<li>matrix.ec contains information about the equivalence classes used. The first number in each row is the equivalence class ID. The second number(s) correspond to the transcript ID(s) in that equivalence class. For example “10 1,2,3” would mean that equivalence class 10 contains transcript IDs 1, 2 and 3. The ID numbers correspond to the order that the transcripts appear in 2000_reference.transcripts.fa. Zero indexing is used, meaning transcript IDs 1, 2 and 3 correspond to the second, third and fourth transcripts in 2000_reference.transcripts.fa.</li>
<li>matrix.tsv contains information about how many reads in each cell map to each equivalence class. The first number is the equivalence class ID, as defined in matrix.ec. The second number is the cell ID, where the cell ID corresponds to the order that the cell came in the matrix.cells file. The third number is the number of reads which fall into that equivalence class. For example, “5 1 3” means that 3 reads from cell 1 map to equivalence class 5. Note that zero indexing is used, so cell 1 corresponds to the second line of matrix.cells.</li>
<li>run_info.json contains information about how Kallisto was executed and can be ignored.</li>
</ul>

</div>
</div>
</div>
<h3> References</h3>
<div id="refs" class="references">
<div id="ref-Kolodziejczyk2015-xy">
<p>Kolodziejczyk, Aleksandra A., Jong Kyoung Kim, Valentine Svensson, John C. Marioni, and Sarah A. Teichmann. 2015. “The Technology and Biology of Single-Cell RNA Sequencing.” <em>Molecular Cell</em> 58 (4). Elsevier BV: 610–20. doi:<a href="https://doi.org/10.1016/j.molcel.2015.04.005">10.1016/j.molcel.2015.04.005</a>.</p>
</div>
<div id="ref-bray_2016">
<p>Bray, Nicolas L, Harold Pimentel, Páll Melsted, and Lior Pachter. 2016. “Near-Optimal Probabilistic Rna-Seq Quantification.” <em>Nat Biotechnol</em> 34 (5): 525–27. doi:<a href="https://doi.org/10.1038/nbt.3519">10.1038/nbt.3519</a>.</p>
</div>
</div>
            </section>

          </div>
        </div>
      </div>
<a href="introduction-to-single-cell-rna-seq.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="construction-of-expression-matrix.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
    </div>
  </div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script>
// Start the GitBook front-end for this page with the book's plugin settings.
gitbook.require(["gitbook"], function(gitbook) {
gitbook.start({
// Per-service flags for the sharing plugin, plus the full list of services in "all".
"sharing": {
"github": false,
"facebook": true,
"twitter": true,
"google": false,
"linkedin": false,
"weibo": false,
// Key spelling matches the "instapaper" entry listed in "all" below.
"instapaper": false,
"vk": false,
"all": ["facebook", "google", "twitter", "linkedin", "weibo", "instapaper"]
},
// Initial reader font settings.
"fontsettings": {
"theme": "white",
"family": "sans",
"size": 2
},
// No edit link is configured for this book.
"edit": {
"link": null,
"text": null
},
// Files offered for download.
"download": ["scRNA-seq-course.pdf"],
"toc": {
"collapse": "section"
},
"search": true
});
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // Create a <script> element for MathJax and append it to <head>.
    var script = document.createElement("script");
    script.type = "text/javascript";
    var src = "";
    // An empty or "true" value selects the default CDN build of MathJax.
    if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML";
    // The URL keeps its explicit scheme instead of being made protocol-relative,
    // so the default https: source is never fetched over plain HTTP when the
    // page itself is served over http:.
    // NOTE(review): third-party script loaded without an integrity hash;
    // consider adding SRI or self-hosting MathJax.
    script.src = src;
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>
</body>

</html>