forked from mca91/EconometricsWithR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
2-4-tlsa.html
414 lines (355 loc) · 30 KB
/
2-4-tlsa.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>2.4 The Least Squares Assumptions | Introduction to Econometrics with R</title>
<meta name="description" content="Beginners with little background in statistics and econometrics often have a hard time understanding the benefits of having programming skills for learning and applying Econometrics. ‘Introduction to Econometrics with R’ is an interactive companion to the well-received textbook ‘Introduction to Econometrics’ by James H. Stock and Mark W. Watson (2015). It gives a gentle introduction to the essentials of R programming and guides students in implementing the empirical applications presented throughout the textbook using the newly acquired skills. This is supported by interactive programming exercises generated with DataCamp Light and integration of interactive visualizations of central concepts which are based on the flexible JavaScript library D3.js.">
<meta name="generator" content="bookdown and GitBook 2.6.7">
<meta property="og:title" content="2.4 The Least Squares Assumptions | Introduction to Econometrics with R" />
<meta property="og:type" content="book" />
<meta property="og:url" content="https://www.econometrics-with-r.org/" />
<meta property="og:image" content="https://www.econometrics-with-r.org/images/cover.png" />
<meta property="og:description" content="Beginners with little background in statistics and econometrics often have a hard time understanding the benefits of having programming skills for learning and applying Econometrics. ‘Introduction to Econometrics with R’ is an interactive companion to the well-received textbook ‘Introduction to Econometrics’ by James H. Stock and Mark W. Watson (2015). It gives a gentle introduction to the essentials of R programming and guides students in implementing the empirical applications presented throughout the textbook using the newly acquired skills. This is supported by interactive programming exercises generated with DataCamp Light and integration of interactive visualizations of central concepts which are based on the flexible JavaScript library D3.js." />
<meta name="github-repo" content="mca91/EconometricsWithR" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="2.4 The Least Squares Assumptions | Introduction to Econometrics with R" />
<meta name="twitter:description" content="Beginners with little background in statistics and econometrics often have a hard time understanding the benefits of having programming skills for learning and applying Econometrics. ‘Introduction to Econometrics with R’ is an interactive companion to the well-received textbook ‘Introduction to Econometrics’ by James H. Stock and Mark W. Watson (2015). It gives a gentle introduction to the essentials of R programming and guides students in implementing the empirical applications presented throughout the textbook using the newly acquired skills. This is supported by interactive programming exercises generated with DataCamp Light and integration of interactive visualizations of central concepts which are based on the flexible JavaScript library D3.js." />
<meta name="twitter:image" content="https://www.econometrics-with-r.org/images/cover.png" />
<meta name="author" content="Christoph Hanck, Martin Arnold, Alexander Gerber and Martin Schmelzer">
<meta name="date" content="2019-03-12">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="prev" href="2-3-measures-of-fit.html">
<link rel="next" href="2-5-tsdotoe.html">
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<!-- font families -->
<link href="https://fonts.googleapis.com/css?family=PT+Sans|Pacifico|Source+Sans+Pro" rel="stylesheet">
<script src="js/hideOutput.js"></script>
<!-- Mathjax -->
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/default.js"></script>
<script type="text/x-mathjax-config">
// MathJax v2 configuration: load the tex2jax preprocessor and the AMSmath
// TeX extension, recognise $...$ and \(...\) as inline-math delimiters,
// and use the TeX input jax with CommonHTML output.
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js"],
tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]},
jax: ["input/TeX","output/CommonHTML"]
});
// Typeset sections immediately instead of after MathJax's default delay.
MathJax.Hub.processSectionDelay = 0;
</script>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-110299877-1"></script>
<script>
// Standard Google Analytics (gtag.js) bootstrap: commands are queued on
// window.dataLayer until the async gtag.js library (loaded above) picks
// them up and sends them to property UA-110299877-1.
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'UA-110299877-1');
</script>
<!-- open review block -->
<script async defer src="https://hypothes.is/embed.js"></script>
<style type="text/css">
a.sourceLine { display: inline-block; line-height: 1.25; }
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
a.sourceLine:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
a.sourceLine { text-indent: -1em; padding-left: 1em; }
}
pre.numberSource a.sourceLine
{ position: relative; left: -4em; }
pre.numberSource a.sourceLine::before
{ content: attr(data-line-number);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; pointer-events: all; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ background-color: #f8f8f8; }
@media screen {
a.sourceLine::before { text-decoration: underline; }
}
code span.al { color: #ef2929; } /* Alert */
code span.an { color: #8f5902; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #c4a000; } /* Attribute */
code span.bn { color: #0000cf; } /* BaseN */
code span.cf { color: #204a87; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4e9a06; } /* Char */
code span.cn { color: #000000; } /* Constant */
code span.co { color: #8f5902; font-style: italic; } /* Comment */
code span.cv { color: #8f5902; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #8f5902; font-weight: bold; font-style: italic; } /* Documentation */
code span.dt { color: #204a87; } /* DataType */
code span.dv { color: #0000cf; } /* DecVal */
code span.er { color: #a40000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #0000cf; } /* Float */
code span.fu { color: #000000; } /* Function */
code span.im { } /* Import */
code span.in { color: #8f5902; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #204a87; font-weight: bold; } /* Keyword */
code span.op { color: #ce5c00; font-weight: bold; } /* Operator */
code span.ot { color: #8f5902; } /* Other */
code span.pp { color: #8f5902; font-style: italic; } /* Preprocessor */
code span.sc { color: #000000; } /* SpecialChar */
code span.ss { color: #4e9a06; } /* SpecialString */
code span.st { color: #4e9a06; } /* String */
code span.va { color: #000000; } /* Variable */
code span.vs { color: #4e9a06; } /* VerbatimString */
code span.wa { color: #8f5902; font-weight: bold; font-style: italic; } /* Warning */
</style>
<link rel="stylesheet" href="style.css" type="text/css" />
<link rel="stylesheet" href="toc.css" type="text/css" />
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><center><img src="images/logo.png" alt="logo" width="50%" height="50%" style="margin: 15px 0 0 0"></center></li>
<li class="divider"></li>
<li class="chapter" data-level="" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i>Preface</a></li>
<li class="chapter" data-level="1" data-path="1-introduction.html"><a href="1-introduction.html"><i class="fa fa-check"></i><b>1</b> Introduction</a><ul>
<li class="chapter" data-level="1.1" data-path="1-1-a-very-short-introduction-to-r-and-rstudio.html"><a href="1-1-a-very-short-introduction-to-r-and-rstudio.html"><i class="fa fa-check"></i><b>1.1</b> A Very Short Introduction to <tt>R</tt> and <em>RStudio</em></a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="2-lrwor.html"><a href="2-lrwor.html"><i class="fa fa-check"></i><b>2</b> Linear Regression with One Regressor</a><ul>
<li class="chapter" data-level="2.1" data-path="2-1-simple-linear-regression.html"><a href="2-1-simple-linear-regression.html"><i class="fa fa-check"></i><b>2.1</b> Simple Linear Regression</a></li>
<li class="chapter" data-level="2.2" data-path="2-2-estimating-the-coefficients-of-the-linear-regression-model.html"><a href="2-2-estimating-the-coefficients-of-the-linear-regression-model.html"><i class="fa fa-check"></i><b>2.2</b> Estimating the Coefficients of the Linear Regression Model</a><ul>
<li class="chapter" data-level="" data-path="2-2-estimating-the-coefficients-of-the-linear-regression-model.html"><a href="2-2-estimating-the-coefficients-of-the-linear-regression-model.html#the-ordinary-least-squares-estimator"><i class="fa fa-check"></i>The Ordinary Least Squares Estimator</a></li>
</ul></li>
<li class="chapter" data-level="2.3" data-path="2-3-measures-of-fit.html"><a href="2-3-measures-of-fit.html"><i class="fa fa-check"></i><b>2.3</b> Measures of Fit</a><ul>
<li class="chapter" data-level="" data-path="2-3-measures-of-fit.html"><a href="2-3-measures-of-fit.html#the-coefficient-of-determination"><i class="fa fa-check"></i>The Coefficient of Determination</a></li>
<li class="chapter" data-level="" data-path="2-3-measures-of-fit.html"><a href="2-3-measures-of-fit.html#the-standard-error-of-the-regression"><i class="fa fa-check"></i>The Standard Error of the Regression</a></li>
<li class="chapter" data-level="" data-path="2-3-measures-of-fit.html"><a href="2-3-measures-of-fit.html#application-to-the-test-score-data"><i class="fa fa-check"></i>Application to the Test Score Data</a></li>
</ul></li>
<li class="chapter" data-level="2.4" data-path="2-4-tlsa.html"><a href="2-4-tlsa.html"><i class="fa fa-check"></i><b>2.4</b> The Least Squares Assumptions</a><ul>
<li class="chapter" data-level="" data-path="2-4-tlsa.html"><a href="2-4-tlsa.html#assumption-1-the-error-term-has-conditional-mean-of-zero"><i class="fa fa-check"></i>Assumption 1: The Error Term has Conditional Mean of Zero</a></li>
<li class="chapter" data-level="" data-path="2-4-tlsa.html"><a href="2-4-tlsa.html#assumption-2-independently-and-identically-distributed-data"><i class="fa fa-check"></i>Assumption 2: Independently and Identically Distributed Data</a></li>
<li class="chapter" data-level="" data-path="2-4-tlsa.html"><a href="2-4-tlsa.html#assumption-3-large-outliers-are-unlikely"><i class="fa fa-check"></i>Assumption 3: Large Outliers are Unlikely</a></li>
</ul></li>
<li class="chapter" data-level="2.5" data-path="2-5-tsdotoe.html"><a href="2-5-tsdotoe.html"><i class="fa fa-check"></i><b>2.5</b> The Sampling Distribution of the OLS Estimator</a><ul>
<li class="chapter" data-level="" data-path="2-5-tsdotoe.html"><a href="2-5-tsdotoe.html#simulation-study-1"><i class="fa fa-check"></i>Simulation Study 1</a></li>
<li class="chapter" data-level="" data-path="2-5-tsdotoe.html"><a href="2-5-tsdotoe.html#simulation-study-2"><i class="fa fa-check"></i>Simulation Study 2</a></li>
<li class="chapter" data-level="" data-path="2-5-tsdotoe.html"><a href="2-5-tsdotoe.html#simulation-study-3"><i class="fa fa-check"></i>Simulation Study 3</a></li>
</ul></li>
<li class="chapter" data-level="2.6" data-path="2-6-exercises.html"><a href="2-6-exercises.html"><i class="fa fa-check"></i><b>2.6</b> Exercises</a></li>
</ul></li>
<li class="divider"></li>
<li><a href="https://github.com/rstudio/bookdown" target="blank">Published with bookdown</a></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Introduction to Econometrics with R</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div class="rmdreview">
This book is in <b>Open Review</b>. We want your feedback to make the book better for you and other students. You may annotate some text by <span style="background-color: #3297FD; color: white">selecting it with the cursor</span> and then clicking the <i class="h-icon-annotate"></i> on the pop-up menu. You can also see the annotations of others: click the <i class="h-icon-chevron-left"></i> in the upper right hand corner of the page <i class="fa fa-arrow-circle-right fa-rotate-315" aria-hidden="true"></i>.
</div>
<div id="tlsa" class="section level2">
<h2><span class="header-section-number">2.4</span> The Least Squares Assumptions</h2>
<p>OLS performs well under a quite broad variety of different circumstances. However, there are some assumptions which need to be satisfied in order to ensure that the estimates are normally distributed in large samples (we discuss this in Chapter <a href="2-5-tsdotoe.html#tsdotoe">2.5</a>).</p>
<div id="KC4.3" class="keyconcept">
<h3 class="right">
Key Concept 4.3
</h3>
<h3 class="left">
The Least Squares Assumptions
</h3>
<p><span class="math display">\[Y_i = \beta_0 + \beta_1 X_i + u_i \text{, } i = 1,\dots,n\]</span>
where</p>
<ol style="list-style-type: decimal">
<li>The error term <span class="math inline">\(u_i\)</span> has conditional mean zero given <span class="math inline">\(X_i\)</span>: <span class="math inline">\(E(u_i|X_i) = 0\)</span>.</li>
<li><span class="math inline">\((X_i,Y_i), i = 1,\dots,n\)</span> are independent and identically distributed (i.i.d.) draws from their joint distribution.</li>
<li>Large outliers are unlikely: <span class="math inline">\(X_i\)</span> and <span class="math inline">\(Y_i\)</span> have nonzero finite fourth moments.</li>
</ol>
</div>
<div id="assumption-1-the-error-term-has-conditional-mean-of-zero" class="section level3 unnumbered">
<h3>Assumption 1: The Error Term has Conditional Mean of Zero</h3>
<p>This means that no matter which value we choose for <span class="math inline">\(X\)</span>, the error term <span class="math inline">\(u\)</span> must not show any systematic pattern and must have a mean of <span class="math inline">\(0\)</span>.
Consider the case that, unconditionally, <span class="math inline">\(E(u) = 0\)</span>, but for low and high values of <span class="math inline">\(X\)</span>, the error term tends to be positive and for midrange values of
<span class="math inline">\(X\)</span> the error tends to be negative. We can use R to construct such an example. To do so we generate our own data using <tt>R</tt>’s built-in random number generators.</p>
<p>We will use the following functions:</p>
<ul>
<li><tt>runif()</tt> - generates uniformly distributed random numbers</li>
<li><tt>rnorm()</tt> - generates normally distributed random numbers</li>
<li><tt>predict()</tt> - does predictions based on the results of model fitting functions like <tt>lm()</tt></li>
<li><tt>lines()</tt> - adds line segments to an existing plot</li>
</ul>
<p>We start by creating a vector containing values that are uniformly distributed on the interval <span class="math inline">\([-5,5]\)</span>. This can be done with the function <tt>runif()</tt>. We also need to simulate the error term. For this we generate normally distributed random numbers with a mean equal to <span class="math inline">\(0\)</span> and a variance of <span class="math inline">\(1\)</span> using <tt>rnorm()</tt>. The <span class="math inline">\(Y\)</span> values are obtained as a quadratic function of the <span class="math inline">\(X\)</span> values and the error.</p>
<p>After generating the data we estimate both a simple regression model and a quadratic model that also includes the regressor <span class="math inline">\(X^2\)</span> (this is a multiple regression model, see Chapter <a href="#rmwmr"><strong>??</strong></a>). Finally, we plot the simulated data and add the estimated regression line of a simple regression model as well as the predictions made with a quadratic model to compare the fit graphically.</p>
<div class="unfolded">
<pre class="sourceCode r"><code class="sourceCode r"><span class="co"># set a seed to make the results reproducible</span>
<span class="kw">set.seed</span>(<span class="dv">321</span>)
<span class="co"># simulate the data </span>
X <-<span class="st"> </span><span class="kw">runif</span>(<span class="dv">50</span>, <span class="dt">min =</span> <span class="dv">-5</span>, <span class="dt">max =</span> <span class="dv">5</span>)
u <-<span class="st"> </span><span class="kw">rnorm</span>(<span class="dv">50</span>, <span class="dt">sd =</span> <span class="dv">5</span>)
<span class="co"># the true relation </span>
Y <-<span class="st"> </span>X<span class="op">^</span><span class="dv">2</span> <span class="op">+</span><span class="st"> </span><span class="dv">2</span> <span class="op">*</span><span class="st"> </span>X <span class="op">+</span><span class="st"> </span>u
<span class="co"># estimate a simple regression model </span>
mod_simple <-<span class="st"> </span><span class="kw">lm</span>(Y <span class="op">~</span><span class="st"> </span>X)
<span class="co"># predict using a quadratic model </span>
prediction <-<span class="st"> </span><span class="kw">predict</span>(<span class="kw">lm</span>(Y <span class="op">~</span><span class="st"> </span>X <span class="op">+</span><span class="st"> </span><span class="kw">I</span>(X<span class="op">^</span><span class="dv">2</span>)), <span class="kw">data.frame</span>(<span class="dt">X =</span> <span class="kw">sort</span>(X)))
<span class="co"># plot the results</span>
<span class="kw">plot</span>(Y <span class="op">~</span><span class="st"> </span>X)
<span class="kw">abline</span>(mod_simple, <span class="dt">col =</span> <span class="st">"red"</span>)
<span class="kw">lines</span>(<span class="kw">sort</span>(X), prediction)</code></pre>
<p><img src="ITER_files/figure-html/unnamed-chunk-34-1.png" width="80%" style="display: block; margin: auto;" /></p>
</div>
<p>The plot shows what is meant by <span class="math inline">\(E(u_i|X_i) = 0\)</span> and why it does not hold for the linear model:</p>
<p>Using the quadratic model (represented by the black curve) we see that there are no systematic deviations of the observations from the predicted relation. It is credible that the assumption is not violated when such a model is employed. However, using a simple linear regression model we see that the assumption is probably violated as <span class="math inline">\(E(u_i|X_i)\)</span> varies with the <span class="math inline">\(X_i\)</span>.</p>
</div>
<div id="assumption-2-independently-and-identically-distributed-data" class="section level3 unnumbered">
<h3>Assumption 2: Independently and Identically Distributed Data</h3>
<p>Most sampling schemes used when collecting data from populations produce i.i.d.-samples. For example, we could use <tt>R</tt>’s random number generator to randomly select student IDs from a university’s enrollment list and record age <span class="math inline">\(X\)</span> and earnings <span class="math inline">\(Y\)</span> of the corresponding students. This is a typical example of simple random sampling and ensures that all the <span class="math inline">\((X_i, Y_i)\)</span> are drawn randomly from the same population.</p>
<p>A prominent example where the i.i.d. assumption is not fulfilled is time series data where we have observations on the same unit over time. For example, take <span class="math inline">\(X\)</span> as the number of workers in a production company over time. Due to business transformations, the company cuts jobs periodically by a specific share but there are also some non-deterministic influences that relate to economics, politics etc. Using <tt>R</tt> we can easily simulate such a process and plot it.</p>
<p>We start the series with a total of 5000 workers and simulate the reduction of employment with an autoregressive process that exhibits a downward movement in the long-run and has normally distributed errors:<a href="#fn3" class="footnote-ref" id="fnref3"><sup>3</sup></a></p>
<span class="math display">\[ employment_t = -5 + 0.98 \cdot employment_{t-1} + u_t \]</span>
<div class="unfolded">
<pre class="sourceCode r"><code class="sourceCode r"><span class="co"># set seed</span>
<span class="kw">set.seed</span>(<span class="dv">123</span>)
<span class="co"># generate a date vector</span>
Date <-<span class="st"> </span><span class="kw">seq</span>(<span class="kw">as.Date</span>(<span class="st">"1951/1/1"</span>), <span class="kw">as.Date</span>(<span class="st">"2000/1/1"</span>), <span class="st">"years"</span>)
<span class="co"># initialize the employment vector</span>
X <-<span class="st"> </span><span class="kw">c</span>(<span class="dv">5000</span>, <span class="kw">rep</span>(<span class="ot">NA</span>, <span class="kw">length</span>(Date)<span class="op">-</span><span class="dv">1</span>))
<span class="co"># generate time series observations with random influences</span>
<span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">2</span><span class="op">:</span><span class="kw">length</span>(Date)) {
X[i] <-<span class="st"> </span><span class="dv">-50</span> <span class="op">+</span><span class="st"> </span><span class="fl">0.98</span> <span class="op">*</span><span class="st"> </span>X[i<span class="dv">-1</span>] <span class="op">+</span><span class="st"> </span><span class="kw">rnorm</span>(<span class="dt">n =</span> <span class="dv">1</span>, <span class="dt">sd =</span> <span class="dv">200</span>)
}
<span class="co">#plot the results</span>
<span class="kw">plot</span>(<span class="dt">x =</span> Date,
<span class="dt">y =</span> X,
<span class="dt">type =</span> <span class="st">"l"</span>,
<span class="dt">col =</span> <span class="st">"steelblue"</span>,
<span class="dt">ylab =</span> <span class="st">"Workers"</span>,
<span class="dt">xlab =</span> <span class="st">"Time"</span>)</code></pre>
<p><img src="ITER_files/figure-html/unnamed-chunk-35-1.png" width="80%" style="display: block; margin: auto;" /></p>
</div>
<p>It is evident that the observations on the number of employees cannot be independent in this example: the level of today’s employment is correlated with tomorrow’s employment level. Thus, the i.i.d. assumption is violated.</p>
</div>
<div id="assumption-3-large-outliers-are-unlikely" class="section level3 unnumbered">
<h3>Assumption 3: Large Outliers are Unlikely</h3>
<p>It is easy to come up with situations where extreme observations, i.e., observations that deviate considerably from the usual range of the data, may occur. Such observations are called outliers. Technically speaking, assumption 3 requires that <span class="math inline">\(X\)</span> and <span class="math inline">\(Y\)</span> have a finite kurtosis.<a href="#fn4" class="footnote-ref" id="fnref4"><sup>4</sup></a></p>
<p>Common cases where we want to exclude or (if possible) correct such outliers is when they are apparently typos, conversion errors or measurement errors. Even if it seems like extreme observations have been recorded correctly, it is advisable to exclude them before estimating a model since OLS suffers from <em>sensitivity to outliers</em>.</p>
<p>What does this mean? One can show that extreme observations receive heavy weighting in the estimation of the unknown regression coefficients when using OLS. Therefore, outliers can lead to strongly distorted estimates of regression coefficients. To get a better impression of this issue, consider the following application where we have placed some sample data on <span class="math inline">\(X\)</span> and <span class="math inline">\(Y\)</span> which are highly correlated. The relation between <span class="math inline">\(X\)</span> and <span class="math inline">\(Y\)</span> seems to be explained pretty well by the plotted regression line: all of the white data points lie close to the red regression line and we have <span class="math inline">\(R^2=0.92\)</span>.</p>
<p>Now go ahead and add a further observation at, say, <span class="math inline">\((18,2)\)</span>. This observation clearly is an outlier. The result is quite striking: the estimated regression line differs greatly from the one we adjudged to fit the data well. The slope is heavily downward biased and <span class="math inline">\(R^2\)</span> decreased to a mere <span class="math inline">\(29\%\)</span>! <br>
Double-click inside the coordinate system to reset the app. Feel free to experiment. Choose different coordinates for the outlier or add additional ones.</p>
<iframe title="Interactive example: the effect of an outlier on the estimated regression line" height="410" width="900" frameborder="0" scrolling="no" src="Outlier.html">
</iframe>
<p>The following code roughly reproduces what is shown in figure 4.5 in the book. As done above we use sample data generated using <tt>R</tt>’s random number functions <tt>rnorm()</tt> and <tt>runif()</tt>. We estimate two simple regression models, one based on the original data set and another using a modified set where one observation is changed to be an outlier and then plot the results. In order to understand the complete code you should be familiar with the function <tt>sort()</tt> which sorts the entries of a numeric vector in ascending order.</p>
<div class="unfolded">
<pre class="sourceCode r"><code class="sourceCode r"><span class="co"># set seed</span>
<span class="kw">set.seed</span>(<span class="dv">123</span>)
<span class="co"># generate the data</span>
X <-<span class="st"> </span><span class="kw">sort</span>(<span class="kw">runif</span>(<span class="dv">10</span>, <span class="dt">min =</span> <span class="dv">30</span>, <span class="dt">max =</span> <span class="dv">70</span>))
Y <-<span class="st"> </span><span class="kw">rnorm</span>(<span class="dv">10</span> , <span class="dt">mean =</span> <span class="dv">200</span>, <span class="dt">sd =</span> <span class="dv">50</span>)
Y[<span class="dv">9</span>] <-<span class="st"> </span><span class="dv">2000</span>
<span class="co"># fit model with outlier</span>
fit <-<span class="st"> </span><span class="kw">lm</span>(Y <span class="op">~</span><span class="st"> </span>X)
<span class="co"># fit model without outlier</span>
fitWithoutOutlier <-<span class="st"> </span><span class="kw">lm</span>(Y[<span class="op">-</span><span class="dv">9</span>] <span class="op">~</span><span class="st"> </span>X[<span class="op">-</span><span class="dv">9</span>])
<span class="co"># plot the results</span>
<span class="kw">plot</span>(Y <span class="op">~</span><span class="st"> </span>X)
<span class="kw">abline</span>(fit)
<span class="kw">abline</span>(fitWithoutOutlier, <span class="dt">col =</span> <span class="st">"red"</span>)</code></pre>
<p><img src="ITER_files/figure-html/unnamed-chunk-36-1.png" width="80%" style="display: block; margin: auto;" /></p>
</div>
</div>
</div>
<div class="footnotes">
<hr />
<ol start="3">
<li id="fn3"><p>See Chapter <a href="#ittsraf"><strong>??</strong></a> for more on autoregressive processes and time series analysis in general.<a href="2-4-tlsa.html#fnref3" class="footnote-back">↩</a></p></li>
<li id="fn4"><p>See Chapter 4.4 of the book.<a href="2-4-tlsa.html#fnref4" class="footnote-back">↩</a></p></li>
</ol>
</div>
</section>
</div>
</div>
</div>
<a href="2-3-measures-of-fit.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="2-5-tsdotoe.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script>
// Initialise the GitBook reader UI for this page.
gitbook.require(["gitbook"], function(gitbook) {
gitbook.start({
// Social sharing buttons offered in the toolbar.
"sharing": {
"github": true,
"facebook": true,
"twitter": true,
"google": false,
"linkedin": true,
"weibo": false,
"instapaper": false,
"vk": false,
"all": ["facebook", "google", "twitter", "linkedin", "weibo", "instapaper"]
},
// Default reader font/theme settings.
"fontsettings": {
"theme": "white",
"family": "serif",
"size": 2
},
// "Edit" link pointing at this chapter's R Markdown source on GitHub.
"edit": {
"link": "https://github.com/mca91/EconometricsWithR/edit/master/04-ch4.Rmd",
"text": "Edit"
},
// Page-history link disabled.
"history": {
"link": null,
"text": null
},
// Downloadable version(s) of the book.
"download": ["ITER.pdf"],
// Table of contents: collapse at subsection level, highlight while scrolling.
"toc": {
"collapse": "subsection",
"scroll_highlight": true
}
});
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
  // Resolve the MathJax URL. Bookdown injects "true" (or "") here when the
  // default CDN should be used; any other value is treated as a custom URL.
  var url = "true";
  if (url === "" || url === "true") {
    url = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";
  }
  // Outside file:// contexts, switch http(s) URLs to protocol-relative form
  // so the script is fetched with the page's own protocol.
  if (location.protocol !== "file:" && /^https?:/.test(url)) {
    url = url.replace(/^https?:/, '');
  }
  // Inject the loader <script> into <head>.
  var loader = document.createElement("script");
  loader.type = "text/javascript";
  loader.src = url;
  document.getElementsByTagName("head")[0].appendChild(loader);
})();
</script>
</body>
</html>