forked from mca91/EconometricsWithR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
2-4-tlsa.html
414 lines (355 loc) · 30 KB
/
2-4-tlsa.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>2.4 The Least Squares Assumptions | Introduction to Econometrics with R</title>
<meta name="description" content="Beginners with little background in statistics and econometrics often have a hard time understanding the benefits of having programming skills for learning and applying Econometrics. ‘Introduction to Econometrics with R’ is an interactive companion to the well-received textbook ‘Introduction to Econometrics’ by James H. Stock and Mark W. Watson (2015). It gives a gentle introduction to the essentials of R programming and guides students in implementing the empirical applications presented throughout the textbook using the newly acquired skills. This is supported by interactive programming exercises generated with DataCamp Light and integration of interactive visualizations of central concepts which are based on the flexible JavaScript library D3.js.">
<meta name="generator" content="bookdown and GitBook 2.6.7">
<meta property="og:title" content="2.4 The Least Squares Assumptions | Introduction to Econometrics with R" />
<meta property="og:type" content="book" />
<meta property="og:url" content="https://www.econometrics-with-r.org/" />
<meta property="og:image" content="https://www.econometrics-with-r.org/images/cover.png" />
<meta property="og:description" content="Beginners with little background in statistics and econometrics often have a hard time understanding the benefits of having programming skills for learning and applying Econometrics. ‘Introduction to Econometrics with R’ is an interactive companion to the well-received textbook ‘Introduction to Econometrics’ by James H. Stock and Mark W. Watson (2015). It gives a gentle introduction to the essentials of R programming and guides students in implementing the empirical applications presented throughout the textbook using the newly acquired skills. This is supported by interactive programming exercises generated with DataCamp Light and integration of interactive visualizations of central concepts which are based on the flexible JavaScript library D3.js." />
<meta name="github-repo" content="mca91/EconometricsWithR" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="2.4 The Least Squares Assumptions | Introduction to Econometrics with R" />
<meta name="twitter:description" content="Beginners with little background in statistics and econometrics often have a hard time understanding the benefits of having programming skills for learning and applying Econometrics. ‘Introduction to Econometrics with R’ is an interactive companion to the well-received textbook ‘Introduction to Econometrics’ by James H. Stock and Mark W. Watson (2015). It gives a gentle introduction to the essentials of R programming and guides students in implementing the empirical applications presented throughout the textbook using the newly acquired skills. This is supported by interactive programming exercises generated with DataCamp Light and integration of interactive visualizations of central concepts which are based on the flexible JavaScript library D3.js." />
<meta name="twitter:image" content="https://www.econometrics-with-r.org/images/cover.png" />
<meta name="author" content="Christoph Hanck, Martin Arnold, Alexander Gerber and Martin Schmelzer">
<meta name="date" content="2019-03-12">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="prev" href="2-3-measures-of-fit.html">
<link rel="next" href="2-5-tsdotoe.html">
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<!-- font families -->
<link href="https://fonts.googleapis.com/css?family=PT+Sans|Pacifico|Source+Sans+Pro" rel="stylesheet">
<script src="js/hideOutput.js"></script>
<!-- Mathjax -->
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/default.js"></script>
<script type="text/x-mathjax-config">
// MathJax v2 configuration: load the tex2jax preprocessor and the AMSmath
// TeX extension, recognise $...$ and \(...\) as inline-math delimiters,
// and use the TeX input jax with CommonHTML output.
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js"],
tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]},
jax: ["input/TeX","output/CommonHTML"]
});
// Typeset sections immediately instead of after MathJax's default delay.
MathJax.Hub.processSectionDelay = 0;
</script>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-110299877-1"></script>
<script>
// Standard Google Analytics (gtag.js) bootstrap: commands are queued on
// window.dataLayer until the async gtag.js library (loaded above) picks
// them up and sends them to property UA-110299877-1.
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'UA-110299877-1');
</script>
<!-- open review block -->
<script async defer src="https://hypothes.is/embed.js"></script>
<style type="text/css">
a.sourceLine { display: inline-block; line-height: 1.25; }
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
a.sourceLine:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
a.sourceLine { text-indent: -1em; padding-left: 1em; }
}
pre.numberSource a.sourceLine
{ position: relative; left: -4em; }
pre.numberSource a.sourceLine::before
{ content: attr(data-line-number);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; pointer-events: all; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ background-color: #f8f8f8; }
@media screen {
a.sourceLine::before { text-decoration: underline; }
}
code span.al { color: #ef2929; } /* Alert */
code span.an { color: #8f5902; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #c4a000; } /* Attribute */
code span.bn { color: #0000cf; } /* BaseN */
code span.cf { color: #204a87; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4e9a06; } /* Char */
code span.cn { color: #000000; } /* Constant */
code span.co { color: #8f5902; font-style: italic; } /* Comment */
code span.cv { color: #8f5902; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #8f5902; font-weight: bold; font-style: italic; } /* Documentation */
code span.dt { color: #204a87; } /* DataType */
code span.dv { color: #0000cf; } /* DecVal */
code span.er { color: #a40000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #0000cf; } /* Float */
code span.fu { color: #000000; } /* Function */
code span.im { } /* Import */
code span.in { color: #8f5902; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #204a87; font-weight: bold; } /* Keyword */
code span.op { color: #ce5c00; font-weight: bold; } /* Operator */
code span.ot { color: #8f5902; } /* Other */
code span.pp { color: #8f5902; font-style: italic; } /* Preprocessor */
code span.sc { color: #000000; } /* SpecialChar */
code span.ss { color: #4e9a06; } /* SpecialString */
code span.st { color: #4e9a06; } /* String */
code span.va { color: #000000; } /* Variable */
code span.vs { color: #4e9a06; } /* VerbatimString */
code span.wa { color: #8f5902; font-weight: bold; font-style: italic; } /* Warning */
</style>
<link rel="stylesheet" href="style.css" type="text/css" />
<link rel="stylesheet" href="toc.css" type="text/css" />
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><center><img src="images/logo.png" alt="logo" width="50%" height="50%" style="margin: 15px 0 0 0"></center></li>
<li class="divider"></li>
<li class="chapter" data-level="" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i>Preface</a></li>
<li class="chapter" data-level="1" data-path="1-introduction.html"><a href="1-introduction.html"><i class="fa fa-check"></i><b>1</b> Introduction</a><ul>
<li class="chapter" data-level="1.1" data-path="1-1-a-very-short-introduction-to-r-and-rstudio.html"><a href="1-1-a-very-short-introduction-to-r-and-rstudio.html"><i class="fa fa-check"></i><b>1.1</b> A Very Short Introduction to <tt>R</tt> and <em>RStudio</em></a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="2-lrwor.html"><a href="2-lrwor.html"><i class="fa fa-check"></i><b>2</b> Linear Regression with One Regressor</a><ul>
<li class="chapter" data-level="2.1" data-path="2-1-simple-linear-regression.html"><a href="2-1-simple-linear-regression.html"><i class="fa fa-check"></i><b>2.1</b> Simple Linear Regression</a></li>
<li class="chapter" data-level="2.2" data-path="2-2-estimating-the-coefficients-of-the-linear-regression-model.html"><a href="2-2-estimating-the-coefficients-of-the-linear-regression-model.html"><i class="fa fa-check"></i><b>2.2</b> Estimating the Coefficients of the Linear Regression Model</a><ul>
<li class="chapter" data-level="" data-path="2-2-estimating-the-coefficients-of-the-linear-regression-model.html"><a href="2-2-estimating-the-coefficients-of-the-linear-regression-model.html#the-ordinary-least-squares-estimator"><i class="fa fa-check"></i>The Ordinary Least Squares Estimator</a></li>
</ul></li>
<li class="chapter" data-level="2.3" data-path="2-3-measures-of-fit.html"><a href="2-3-measures-of-fit.html"><i class="fa fa-check"></i><b>2.3</b> Measures of Fit</a><ul>
<li class="chapter" data-level="" data-path="2-3-measures-of-fit.html"><a href="2-3-measures-of-fit.html#the-coefficient-of-determination"><i class="fa fa-check"></i>The Coefficient of Determination</a></li>
<li class="chapter" data-level="" data-path="2-3-measures-of-fit.html"><a href="2-3-measures-of-fit.html#the-standard-error-of-the-regression"><i class="fa fa-check"></i>The Standard Error of the Regression</a></li>
<li class="chapter" data-level="" data-path="2-3-measures-of-fit.html"><a href="2-3-measures-of-fit.html#application-to-the-test-score-data"><i class="fa fa-check"></i>Application to the Test Score Data</a></li>
</ul></li>
<li class="chapter" data-level="2.4" data-path="2-4-tlsa.html"><a href="2-4-tlsa.html"><i class="fa fa-check"></i><b>2.4</b> The Least Squares Assumptions</a><ul>
<li class="chapter" data-level="" data-path="2-4-tlsa.html"><a href="2-4-tlsa.html#assumption-1-the-error-term-has-conditional-mean-of-zero"><i class="fa fa-check"></i>Assumption 1: The Error Term has Conditional Mean of Zero</a></li>
<li class="chapter" data-level="" data-path="2-4-tlsa.html"><a href="2-4-tlsa.html#assumption-2-independently-and-identically-distributed-data"><i class="fa fa-check"></i>Assumption 2: Independently and Identically Distributed Data</a></li>
<li class="chapter" data-level="" data-path="2-4-tlsa.html"><a href="2-4-tlsa.html#assumption-3-large-outliers-are-unlikely"><i class="fa fa-check"></i>Assumption 3: Large Outliers are Unlikely</a></li>
</ul></li>
<li class="chapter" data-level="2.5" data-path="2-5-tsdotoe.html"><a href="2-5-tsdotoe.html"><i class="fa fa-check"></i><b>2.5</b> The Sampling Distribution of the OLS Estimator</a><ul>
<li class="chapter" data-level="" data-path="2-5-tsdotoe.html"><a href="2-5-tsdotoe.html#simulation-study-1"><i class="fa fa-check"></i>Simulation Study 1</a></li>
<li class="chapter" data-level="" data-path="2-5-tsdotoe.html"><a href="2-5-tsdotoe.html#simulation-study-2"><i class="fa fa-check"></i>Simulation Study 2</a></li>
<li class="chapter" data-level="" data-path="2-5-tsdotoe.html"><a href="2-5-tsdotoe.html#simulation-study-3"><i class="fa fa-check"></i>Simulation Study 3</a></li>
</ul></li>
<li class="chapter" data-level="2.6" data-path="2-6-exercises.html"><a href="2-6-exercises.html"><i class="fa fa-check"></i><b>2.6</b> Exercises</a></li>
</ul></li>
<li class="divider"></li>
<li><a href="https://github.com/rstudio/bookdown" target="blank">Published with bookdown</a></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Introduction to Econometrics with R</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div class="rmdreview">
This book is in <b>Open Review</b>. We want your feedback to make the book better for you and other students. You may annotate some text by <span style="background-color: #3297FD; color: white">selecting it with the cursor</span> and then clicking the <i class="h-icon-annotate"></i> on the pop-up menu. You can also see the annotations of others: click the <i class="h-icon-chevron-left"></i> in the upper right hand corner of the page <i class="fa fa-arrow-circle-right fa-rotate-315" aria-hidden="true"></i>.
</div>
<div id="tlsa" class="section level2">
<h2><span class="header-section-number">2.4</span> The Least Squares Assumptions</h2>
<p>OLS performs well under a quite broad variety of different circumstances. However, there are some assumptions which need to be satisfied in order to ensure that the estimates are normally distributed in large samples (we discuss this in Chapter <a href="2-5-tsdotoe.html#tsdotoe">2.5</a>).</p>
<div id="KC4.3" class="keyconcept">
<h3 class="right">
Key Concept 4.3
</h3>
<h3 class="left">
The Least Squares Assumptions
</h3>
<p><span class="math display">\[Y_i = \beta_0 + \beta_1 X_i + u_i \text{, } i = 1,\dots,n\]</span>
where</p>
<ol style="list-style-type: decimal">
<li>The error term <span class="math inline">\(u_i\)</span> has conditional mean zero given <span class="math inline">\(X_i\)</span>: <span class="math inline">\(E(u_i|X_i) = 0\)</span>.</li>
<li><span class="math inline">\((X_i,Y_i), i = 1,\dots,n\)</span> are independent and identically distributed (i.i.d.) draws from their joint distribution.</li>
<li>Large outliers are unlikely: <span class="math inline">\(X_i\)</span> and <span class="math inline">\(Y_i\)</span> have nonzero finite fourth moments.</li>
</ol>
</div>
<div id="assumption-1-the-error-term-has-conditional-mean-of-zero" class="section level3 unnumbered">
<h3>Assumption 1: The Error Term has Conditional Mean of Zero</h3>
<p>This means that no matter which value we choose for <span class="math inline">\(X\)</span>, the error term <span class="math inline">\(u\)</span> must not show any systematic pattern and must have a mean of <span class="math inline">\(0\)</span>.
Consider the case that, unconditionally, <span class="math inline">\(E(u) = 0\)</span>, but for low and high values of <span class="math inline">\(X\)</span>, the error term tends to be positive and for midrange values of
<span class="math inline">\(X\)</span> the error tends to be negative. We can use R to construct such an example. To do so we generate our own data using <tt>R</tt>’s built-in random number generators.</p>
<p>We will use the following functions:</p>
<ul>
<li><tt>runif()</tt> - generates uniformly distributed random numbers</li>
<li><tt>rnorm()</tt> - generates normally distributed random numbers</li>
<li><tt>predict()</tt> - does predictions based on the results of model fitting functions like <tt>lm()</tt></li>
<li><tt>lines()</tt> - adds line segments to an existing plot</li>
</ul>
<p>We start by creating a vector containing values that are uniformly distributed on the interval <span class="math inline">\([-5,5]\)</span>. This can be done with the function <tt>runif()</tt>. We also need to simulate the error term. For this we generate normally distributed random numbers with a mean equal to <span class="math inline">\(0\)</span> and a variance of <span class="math inline">\(1\)</span> using <tt>rnorm()</tt>. The <span class="math inline">\(Y\)</span> values are obtained as a quadratic function of the <span class="math inline">\(X\)</span> values and the error.</p>
<p>After generating the data we estimate both a simple regression model and a quadratic model that also includes the regressor <span class="math inline">\(X^2\)</span> (this is a multiple regression model, see Chapter <a href="#rmwmr"><strong>??</strong></a>). Finally, we plot the simulated data and add the estimated regression line of a simple regression model as well as the predictions made with a quadratic model to compare the fit graphically.</p>
<div class="unfolded">
<pre class="sourceCode r"><code class="sourceCode r"><span class="co"># set a seed to make the results reproducible</span>
<span class="kw">set.seed</span>(<span class="dv">321</span>)
<span class="co"># simulate the data </span>
X <-<span class="st"> </span><span class="kw">runif</span>(<span class="dv">50</span>, <span class="dt">min =</span> <span class="dv">-5</span>, <span class="dt">max =</span> <span class="dv">5</span>)
u <-<span class="st"> </span><span class="kw">rnorm</span>(<span class="dv">50</span>, <span class="dt">sd =</span> <span class="dv">5</span>)
<span class="co"># the true relation </span>
Y <-<span class="st"> </span>X<span class="op">^</span><span class="dv">2</span> <span class="op">+</span><span class="st"> </span><span class="dv">2</span> <span class="op">*</span><span class="st"> </span>X <span class="op">+</span><span class="st"> </span>u
<span class="co"># estimate a simple regression model </span>
mod_simple <-<span class="st"> </span><span class="kw">lm</span>(Y <span class="op">~</span><span class="st"> </span>X)
<span class="co"># predict using a quadratic model </span>
prediction <-<span class="st"> </span><span class="kw">predict</span>(<span class="kw">lm</span>(Y <span class="op">~</span><span class="st"> </span>X <span class="op">+</span><span class="st"> </span><span class="kw">I</span>(X<span class="op">^</span><span class="dv">2</span>)), <span class="kw">data.frame</span>(<span class="dt">X =</span> <span class="kw">sort</span>(X)))
<span class="co"># plot the results</span>
<span class="kw">plot</span>(Y <span class="op">~</span><span class="st"> </span>X)
<span class="kw">abline</span>(mod_simple, <span class="dt">col =</span> <span class="st">"red"</span>)
<span class="kw">lines</span>(<span class="kw">sort</span>(X), prediction)</code></pre>
<p><img src="ITER_files/figure-html/unnamed-chunk-34-1.png" width="80%" style="display: block; margin: auto;" /></p>
</div>
<p>The plot shows what is meant by <span class="math inline">\(E(u_i|X_i) = 0\)</span> and why it does not hold for the linear model:</p>
<p>Using the quadratic model (represented by the black curve) we see that there are no systematic deviations of the observations from the predicted relation. It is credible that the assumption is not violated when such a model is employed. However, using a simple linear regression model we see that the assumption is probably violated as <span class="math inline">\(E(u_i|X_i)\)</span> varies with the <span class="math inline">\(X_i\)</span>.</p>
</div>
<div id="assumption-2-independently-and-identically-distributed-data" class="section level3 unnumbered">
<h3>Assumption 2: Independently and Identically Distributed Data</h3>
<p>Most sampling schemes used when collecting data from populations produce i.i.d.-samples. For example, we could use <tt>R</tt>’s random number generator to randomly select student IDs from a university’s enrollment list and record age <span class="math inline">\(X\)</span> and earnings <span class="math inline">\(Y\)</span> of the corresponding students. This is a typical example of simple random sampling and ensures that all the <span class="math inline">\((X_i, Y_i)\)</span> are drawn randomly from the same population.</p>
<p>A prominent example where the i.i.d. assumption is not fulfilled is time series data where we have observations on the same unit over time. For example, take <span class="math inline">\(X\)</span> as the number of workers in a production company over time. Due to business transformations, the company cuts jobs periodically by a specific share but there are also some non-deterministic influences that relate to economics, politics etc. Using <tt>R</tt> we can easily simulate such a process and plot it.</p>
<p>We start the series with a total of 5000 workers and simulate the reduction of employment with an autoregressive process that exhibits a downward movement in the long-run and has normally distributed errors:<a href="#fn3" class="footnote-ref" id="fnref3"><sup>3</sup></a></p>
<span class="math display">\[ employment_t = -5 + 0.98 \cdot employment_{t-1} + u_t \]</span>
<div class="unfolded">
<pre class="sourceCode r"><code class="sourceCode r"><span class="co"># set seed</span>
<span class="kw">set.seed</span>(<span class="dv">123</span>)
<span class="co"># generate a date vector</span>
Date <-<span class="st"> </span><span class="kw">seq</span>(<span class="kw">as.Date</span>(<span class="st">"1951/1/1"</span>), <span class="kw">as.Date</span>(<span class="st">"2000/1/1"</span>), <span class="st">"years"</span>)
<span class="co"># initialize the employment vector</span>
X <-<span class="st"> </span><span class="kw">c</span>(<span class="dv">5000</span>, <span class="kw">rep</span>(<span class="ot">NA</span>, <span class="kw">length</span>(Date)<span class="op">-</span><span class="dv">1</span>))
<span class="co"># generate time series observations with random influences</span>
<span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">2</span><span class="op">:</span><span class="kw">length</span>(Date)) {
X[i] <-<span class="st"> </span><span class="dv">-50</span> <span class="op">+</span><span class="st"> </span><span class="fl">0.98</span> <span class="op">*</span><span class="st"> </span>X[i<span class="dv">-1</span>] <span class="op">+</span><span class="st"> </span><span class="kw">rnorm</span>(<span class="dt">n =</span> <span class="dv">1</span>, <span class="dt">sd =</span> <span class="dv">200</span>)
}
<span class="co">#plot the results</span>
<span class="kw">plot</span>(<span class="dt">x =</span> Date,
<span class="dt">y =</span> X,
<span class="dt">type =</span> <span class="st">"l"</span>,
<span class="dt">col =</span> <span class="st">"steelblue"</span>,
<span class="dt">ylab =</span> <span class="st">"Workers"</span>,
<span class="dt">xlab =</span> <span class="st">"Time"</span>)</code></pre>
<p><img src="ITER_files/figure-html/unnamed-chunk-35-1.png" width="80%" style="display: block; margin: auto;" /></p>
</div>
<p>It is evident that the observations on the number of employees cannot be independent in this example: the level of today’s employment is correlated with tomorrow’s employment level. Thus, the i.i.d. assumption is violated.</p>
</div>
<div id="assumption-3-large-outliers-are-unlikely" class="section level3 unnumbered">
<h3>Assumption 3: Large Outliers are Unlikely</h3>
<p>It is easy to come up with situations where extreme observations, i.e., observations that deviate considerably from the usual range of the data, may occur. Such observations are called outliers. Technically speaking, assumption 3 requires that <span class="math inline">\(X\)</span> and <span class="math inline">\(Y\)</span> have a finite kurtosis.<a href="#fn4" class="footnote-ref" id="fnref4"><sup>4</sup></a></p>
<p>Common cases where we want to exclude or (if possible) correct such outliers is when they are apparently typos, conversion errors or measurement errors. Even if it seems like extreme observations have been recorded correctly, it is advisable to exclude them before estimating a model since OLS suffers from <em>sensitivity to outliers</em>.</p>
<p>What does this mean? One can show that extreme observations receive heavy weighting in the estimation of the unknown regression coefficients when using OLS. Therefore, outliers can lead to strongly distorted estimates of regression coefficients. To get a better impression of this issue, consider the following application where we have placed some sample data on <span class="math inline">\(X\)</span> and <span class="math inline">\(Y\)</span> which are highly correlated. The relation between <span class="math inline">\(X\)</span> and <span class="math inline">\(Y\)</span> seems to be explained pretty well by the plotted regression line: all of the white data points lie close to the red regression line and we have <span class="math inline">\(R^2=0.92\)</span>.</p>
<p>Now go ahead and add a further observation at, say, <span class="math inline">\((18,2)\)</span>. This observation clearly is an outlier. The result is quite striking: the estimated regression line differs greatly from the one we adjudged to fit the data well. The slope is heavily downward biased and <span class="math inline">\(R^2\)</span> decreased to a mere <span class="math inline">\(29\%\)</span>! <br>
Double-click inside the coordinate system to reset the app. Feel free to experiment. Choose different coordinates for the outlier or add additional ones.</p>
<iframe title="Interactive example: the effect of an outlier on the estimated regression line" height="410" width="900" frameborder="0" scrolling="no" src="Outlier.html">
</iframe>
<p>The following code roughly reproduces what is shown in figure 4.5 in the book. As done above we use sample data generated using <tt>R</tt>’s random number functions <tt>rnorm()</tt> and <tt>runif()</tt>. We estimate two simple regression models, one based on the original data set and another using a modified set where one observation is changed to be an outlier and then plot the results. In order to understand the complete code you should be familiar with the function <tt>sort()</tt> which sorts the entries of a numeric vector in ascending order.</p>
<div class="unfolded">
<pre class="sourceCode r"><code class="sourceCode r"><span class="co"># set seed</span>
<span class="kw">set.seed</span>(<span class="dv">123</span>)
<span class="co"># generate the data</span>
X <-<span class="st"> </span><span class="kw">sort</span>(<span class="kw">runif</span>(<span class="dv">10</span>, <span class="dt">min =</span> <span class="dv">30</span>, <span class="dt">max =</span> <span class="dv">70</span>))
Y <-<span class="st"> </span><span class="kw">rnorm</span>(<span class="dv">10</span> , <span class="dt">mean =</span> <span class="dv">200</span>, <span class="dt">sd =</span> <span class="dv">50</span>)
Y[<span class="dv">9</span>] <-<span class="st"> </span><span class="dv">2000</span>
<span class="co"># fit model with outlier</span>
fit <-<span class="st"> </span><span class="kw">lm</span>(Y <span class="op">~</span><span class="st"> </span>X)
<span class="co"># fit model without outlier</span>
fitWithoutOutlier <-<span class="st"> </span><span class="kw">lm</span>(Y[<span class="op">-</span><span class="dv">9</span>] <span class="op">~</span><span class="st"> </span>X[<span class="op">-</span><span class="dv">9</span>])
<span class="co"># plot the results</span>
<span class="kw">plot</span>(Y <span class="op">~</span><span class="st"> </span>X)
<span class="kw">abline</span>(fit)
<span class="kw">abline</span>(fitWithoutOutlier, <span class="dt">col =</span> <span class="st">"red"</span>)</code></pre>
<p><img src="ITER_files/figure-html/unnamed-chunk-36-1.png" width="80%" style="display: block; margin: auto;" /></p>
</div>
</div>
</div>
<div class="footnotes">
<hr />
<ol start="3">
<li id="fn3"><p>See Chapter <a href="#ittsraf"><strong>??</strong></a> for more on autoregressive processes and time series analysis in general.<a href="2-4-tlsa.html#fnref3" class="footnote-back">↩</a></p></li>
<li id="fn4"><p>See Chapter 4.4 of the book.<a href="2-4-tlsa.html#fnref4" class="footnote-back">↩</a></p></li>
</ol>
</div>
</section>
</div>
</div>
</div>
<a href="2-3-measures-of-fit.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="2-5-tsdotoe.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script>
// Initialise the GitBook reader UI for this page.
gitbook.require(["gitbook"], function(gitbook) {
gitbook.start({
// Social sharing buttons offered in the toolbar.
"sharing": {
"github": true,
"facebook": true,
"twitter": true,
"google": false,
"linkedin": true,
"weibo": false,
"instapaper": false,
"vk": false,
"all": ["facebook", "google", "twitter", "linkedin", "weibo", "instapaper"]
},
// Default reader font/theme settings.
"fontsettings": {
"theme": "white",
"family": "serif",
"size": 2
},
// "Edit" link pointing at this chapter's R Markdown source on GitHub.
"edit": {
"link": "https://github.com/mca91/EconometricsWithR/edit/master/04-ch4.Rmd",
"text": "Edit"
},
// Page-history link disabled.
"history": {
"link": null,
"text": null
},
// Downloadable version(s) of the book.
"download": ["ITER.pdf"],
// Table of contents: collapse at subsection level, highlight while scrolling.
"toc": {
"collapse": "subsection",
"scroll_highlight": true
}
});
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
  // Resolve the MathJax URL. Bookdown injects "true" (or "") here when the
  // default CDN should be used; any other value is treated as a custom URL.
  var url = "true";
  if (url === "" || url === "true") {
    url = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";
  }
  // Outside file:// contexts, switch http(s) URLs to protocol-relative form
  // so the script is fetched with the page's own protocol.
  if (location.protocol !== "file:" && /^https?:/.test(url)) {
    url = url.replace(/^https?:/, '');
  }
  // Inject the loader <script> into <head>.
  var loader = document.createElement("script");
  loader.type = "text/javascript";
  loader.src = url;
  document.getElementsByTagName("head")[0].appendChild(loader);
})();
</script>
</body>
</html>