forked from sassoftware/enlighten-apply
-
Notifications
You must be signed in to change notification settings - Fork 0
/
digit_recognizer.sas
487 lines (426 loc) · 12 KB
/
digit_recognizer.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
******************************************************************************;
* Copyright (c) 2015 by SAS Institute Inc., Cary, NC 27513 USA *;
* *;
* Licensed under the Apache License, Version 2.0 (the "License"); *;
* you may not use this file except in compliance with the License. *;
* You may obtain a copy of the License at *;
* *;
* http://www.apache.org/licenses/LICENSE-2.0 *;
* *;
* Unless required by applicable law or agreed to in writing, software *;
* distributed under the License is distributed on an "AS IS" BASIS, *;
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *;
* See the License for the specific language governing permissions and *;
* limitations under the License. *;
******************************************************************************;
******************************************************************************;
* VARIOUS SAS ROUTINES FOR MNIST DATA: *;
* CALCULATE PIXEL DENSITY *;
* RESIZE GRID ON WHICH PIXELS CAN BE CENTERED (ODD# X ODD#) *;
* VISUALIZE DIGITS *;
* DENOISING AUTOENCODER *;
* EXTRACT FEATURES *;
******************************************************************************;
*** USER CONFIGURATION *******************************************************;
*** DIRECTORY THAT HOLDS THE DATA FROM THE REPO DOWNLOADED FROM GIT (FILL IN);
%let git_repo_data_dir=;
*** NAME OF THE TRAINING DATA SET INSIDE THAT DIRECTORY;
%let train_set= Digits_train_sample;
*** CPU COUNT USED LATER BY THE PROC NEURAL PERFORMANCE STATEMENT (FILL IN);
%let cpu_count=;
*** LIBREF L POINTS AT THE DATA DIRECTORY SET ABOVE;
libname l "&git_repo_data_dir";
*** COPY THE TRAINING DATA INTO WORK AND ADD A PRIMARY KEY ********************;
*** THE ORIGINAL DATA IN LIBRARY L IS ONLY READ AND NEVER ALTERED;
*** pic_ID IS THE ROW NUMBER OF EACH IMAGE IN THE ORIGINAL DATA;
data &train_set;
  length pic_ID 8;          /* declared first so the key is the leading column */
  set l.&train_set;
  pic_ID= _n_;
run;
*** UNPIVOT EACH 28 X 28 IMAGE INTO ONE ROW PER PIXEL *************************;
*** X IS THE COLUMN (1 TO 28 FROM THE LEFT) AND Y IS THE ROW (28 AT THE TOP);
*** Z IS THE PIXEL VALUE AT THAT POSITION;
data _xyz;
  set &train_set;
  array px[784] pixel0-pixel783;
  do k= 1 to dim(px);
    x= mod(k-1, 28) + 1;            /* position within the row    */
    y= 28 - floor((k-1)/28);        /* rows are numbered top-down */
    z= px[k];
    output;
  end;
  drop k pixel0-pixel783;
run;
*** PIXEL DENSITY PER IMAGE ****************************************************;
*** DENSITY IS THE SUM OF Z OVER THE IMAGE DIVIDED BY THE 784 CELLS OF THE GRID;
*** DENSITY ~ INTENSITY;
data _d;
  set _xyz;
  by pic_ID;
  retain _total;
  if first.pic_ID then _total= 0;   /* restart the running sum for each image */
  _total= _total + z;
  if last.pic_ID then do;
    density= _total/784;            /* 784 = 28*28 */
    output;
  end;
  keep pic_ID density;
run;
*** ATTACH THE DENSITY TO THE TRAINING ROWS BY pic_ID;
*** THE LENGTH STATEMENT FIXES THE COLUMN ORDER OF THE RESULT;
data &train_set._dn;
  length pic_ID label density pixel0-pixel783 8;
  merge &train_set
        _d;
  by pic_ID;
run;
*** CENTER *******************************************************************;
*** DIFFICULT TO CENTER DIGITAL IMAGE/CANNOT BE CENTERED ON EVEN BY EVEN GRID;
*** (ORIGIN= 14.5, 14.5);
*** CREATE ODD BY ODD GRID;
*** (ORIGIN= 14, 14);
*** REMOVE THE OUTER PIXELS - THE TOP ROW (pixel0 TO pixel27) AND THE;
*** LEFT COLUMN (EVERY REMAINING MULTIPLE OF 28) - LEAVING 27 X 27 PIXELS;
data &train_set._dn;
  set &train_set._dn(
    drop= pixel0-pixel27
          pixel28  pixel56  pixel84  pixel112 pixel140 pixel168 pixel196
          pixel224 pixel252 pixel280 pixel308 pixel336 pixel364 pixel392
          pixel420 pixel448 pixel476 pixel504 pixel532 pixel560 pixel588
          pixel616 pixel644 pixel672 pixel700 pixel728 pixel756
  );
run;
*** LOOKUP TABLES FOR REMAPPING PIXEL NAMES TO THE 27 BY 27 GRID;
*** _new HOLDS THE TARGET SUFFIXES 0 TO 728 AND MATCH IS THE ROW NUMBER;
data _new;
  do new= 0 to 728;                 /* 728 = 27*27 - 1 */
    match= new + 1;
    output;
  end;
run;
*** _old HOLDS THE SURVIVING ORIGINAL SUFFIXES - 28 TO 783 WITHOUT MULTIPLES;
*** OF 28 - AND MATCH NUMBERS THEM IN THE SAME ORDER AS _new;
data _old;
  do old= 28 to 783;
    if mod(old, 28) ne 0 then do;
      match + 1;                    /* running row number, starts at 1 */
      output;
    end;
  end;
run;
/* GENERATE AND RUN A PROC DATASETS STEP THAT RENAMES THE SURVIVING PIXELS.   */
/* _new AND _old ARE PAIRED ON MATCH; EACH PAIR WRITES ONE                    */
/* "pixel<old> = pixel<new>" CLAUSE INTO A SINGLE RENAME STATEMENT.           */
/* THE GENERATED FILE LIVES IN THE WORK DIRECTORY. A FORWARD SLASH IS USED AS */
/* THE SEPARATOR BECAUSE SAS ACCEPTS IT ON BOTH WINDOWS AND UNIX HOSTS, WHILE */
/* A BACKSLASH IS ONLY A SEPARATOR ON WINDOWS.                                */
filename rnm_stmt "%sysfunc(pathname(WORK))/rnm_stmt.sas";
data _null_;
  merge _new _old;
  by match;
  file rnm_stmt;
  /* opening of the generated step, written once */
  if _n_= 1 then
    put "proc datasets lib=WORK; modify &train_set._dn; rename";
  line= 'pixel'||trim(left(old))||' = pixel'||trim(left(new));
  put line;
  /* after the last of the 729 pairs, close the RENAME statement and the step */
  if _n_= 27*27 then put '; run; quit;';
run;
%include rnm_stmt;
filename rnm_stmt;    /* release the fileref */
*** UNPIVOT THE 27 X 27 IMAGES AGAIN INTO ONE ROW PER PIXEL;
*** X RUNS 1 TO 27 FROM THE LEFT AND Y IS 27 FOR THE TOP ROW DOWN TO 1;
*** THE RESULT FEEDS THE BOUNDING BOX CALCULATION THAT FOLLOWS;
data _xyz;
  set &train_set._dn;
  array px[729] pixel0-pixel728;
  do k= 1 to dim(px);
    x= mod(k-1, 27) + 1;
    y= 27 - floor((k-1)/27);
    z= px[k];
    output;
  end;
  drop k pixel0-pixel728;
run;
*** BOUNDING BOX OF EACH DIGIT - HORIZONTAL EXTENT;
*** ONLY NONZERO PIXELS ARE CONSIDERED. WITHIN EACH IMAGE THE ROWS ARE SORTED;
*** BY DESCENDING X SO THE FIRST ROW HOLDS THE MAXIMUM AND THE LAST THE MINIMUM;
proc sort
  data= _xyz(keep= pic_ID x z where=(z^= 0))
  out= _max_x
  sortsize= MAX
  threads;
  by pic_ID descending x;
run;
data _max_x;
  set _max_x;
  by pic_ID;
  retain max_x;
  if first.pic_ID then max_x= x;
  if last.pic_ID;                   /* keep one row per image */
  min_x= x;
  drop x z;
run;
*** BOUNDING BOX OF EACH DIGIT - VERTICAL EXTENT;
*** SAME APPROACH AS FOR X - SORT NONZERO PIXELS BY DESCENDING Y WITHIN EACH;
*** IMAGE AND TAKE THE FIRST ROW AS MAXIMUM AND THE LAST ROW AS MINIMUM;
proc sort
  data= _xyz(keep= pic_ID y z where=(z^= 0))
  out= _max_y
  sortsize= MAX
  threads;
  by pic_ID descending y;
run;
data _max_y;
  set _max_y;
  by pic_ID;
  retain max_y;
  if first.pic_ID then max_y= y;
  if last.pic_ID;                   /* keep one row per image */
  min_y= y;
  drop y z;
run;
*** CENTER DIGITS;
*** THE MIDPOINT OF EACH BOUNDING BOX IS MOVED TO THE GRID ORIGIN (14, 14);
*** SHIFTED COORDINATES ARE CLAMPED TO THE RANGE 1 TO 27;
*** ONLY NONZERO PIXELS ARE KEPT IN THE RESULT;
data _xyz;
  merge _xyz _max_x _max_y;
  by pic_ID;
  x_mid= round(min_x + (max_x - min_x)/2, 1);
  y_mid= round(min_y + (max_y - min_y)/2, 1);
  if x_mid ne 14 then do;
    x_offset= x_mid - 14;           /* units RIGHT of the origin */
    x= x - x_offset;
    if x > 27 then x= 27;
    else if x < 1 then x= 1;
  end;
  if y_mid ne 14 then do;
    y_offset= y_mid - 14;           /* units ABOVE the origin */
    y= y - y_offset;
    if y > 27 then y= 27;
    else if y < 1 then y= 1;
  end;
  if z = 0 then delete;
run;
/* TRANSFORM FROM XY SPACE BACK TO PIXEL SPACE.                               */
/* A DATA STEP IS GENERATED INTO A FILE IN THE WORK DIRECTORY AND INCLUDED.   */
/* FOR EACH OF THE 729 GRID CELLS IT CONTAINS ONE STATEMENT THAT COPIES Z     */
/* INTO THE MATCHING PIXEL AND ONE THAT TURNS A MISSING PIXEL INTO 0.         */
/* ONE ROW PER pic_ID IS OUTPUT, AFTER WHICH THE RETAINED PIXELS ARE RESET.   */
/* A FORWARD SLASH IS USED AS THE PATH SEPARATOR BECAUSE SAS ACCEPTS IT ON    */
/* BOTH WINDOWS AND UNIX HOSTS, WHILE A BACKSLASH ONLY WORKS ON WINDOWS.      */
filename rnm_stmt "%sysfunc(pathname(WORK))/rnm_stmt2.sas";
data _null_;
  file rnm_stmt;
  put "data &train_set._dn_cn;";
  put 'set _xyz;';
  put 'by pic_ID;';
  put 'array pixels pixel0-pixel728;';
  put 'retain pixels;';
  do y= 1 to 27;                    /* y counts grid rows from the top */
    do x= 1 to 27;
      _y= 28-y;                     /* Y coordinate of that row in XY space */
      i= (y-1)*27 + x;              /* 1-based index into the pixels array */
      put 'if x= ' x ' and y= ' _y ' then pixels[' i ']= z;';
      put 'if pixels[' i ']= . then pixels[' i ']= 0;';
      /* no OUTPUT statement here: this is a DATA _NULL_ step, so there is */
      /* no data set to write to and only the PUT lines matter             */
    end;
  end;
  put 'if last.pic_ID then do;';
  put 'output;';
  put 'do i= 1 to 729;';
  put 'pixels[i]= 0;';
  put 'end;';
  put 'end;';
  put 'drop x y z max_x min_x max_y min_y x_offset y_offset i x_mid y_mid;';
  put 'run;';
run;
%include rnm_stmt;
filename rnm_stmt;    /* release the fileref */
*** THIS SET IS NOW SUITABLE FOR SUPERVISED TRAINING IN ENTERPRISE MINER;
*** TOOLS FOR VIEWING THE RESULTS OF THE DATA MANIPULATION ********************;
*** VIEW RANDOM DIGITS *******************************************************;
*** GRAPH TEMPLATE ***********************************************************;
*** SHOW THE CURRENT TEMPLATE SEARCH PATH AND THEN PUT AN UPDATABLE ITEM;
*** STORE IN WORK AT THE FRONT OF IT SO THE TEMPLATE BELOW CAN BE STORED;
ods path show;
ods path(prepend) work.templat(update);
*** STATGRAPH TEMPLATE CONTOUR - A SQUARE GRADIENT CONTOUR PLOT OF Z OVER X;
*** AND Y WITH AXES FROM 0 TO 26 AND THE TITLE PASSED IN THROUGH _title;
proc template;
  define statgraph contour;
    dynamic _title;
    begingraph;
      entrytitle _title;
      layout overlayequated /
        equatetype= square
        commonaxisopts= (viewmin= 0 viewmax= 26
                         tickvaluelist= (0 5 10 15 20 25))
        xaxisopts= (offsetmin= 0 offsetmax= 0)
        yaxisopts= (offsetmin= 0 offsetmax= 0);
        contourplotparm x= x y= y z= z /
          contourtype= gradient
          nlevels= 255
          colormodel= twocolorramp;
      endlayout;
    endgraph;
  end;
run;
*** MACRO FOR VIEWING DIGITS *************************************************;
/* random_digit_string: HELPER FOR view_digits.                               */
/*   _length  NUMBER OF MACRO VARIABLES rand1, rand2, ... TO JOIN             */
/*   _nobs    ACCEPTED FOR COMPATIBILITY, NOT USED IN THE BODY                */
/* EXPANDS TO THE VALUES OF rand1 ... rand<_length> AS A COMMA SEPARATED      */
/* LIST WITH ALL BLANKS REMOVED. THE rand VARIABLES MUST ALREADY EXIST IN AN  */
/* ENCLOSING SCOPE (view_digits CREATES THEM).                                */
/* DEFINED AT TOP LEVEL SO IT IS COMPILED ONCE RATHER THAN ON EVERY CALL OF   */
/* view_digits.                                                               */
%macro random_digit_string(_length, _nobs);
  %local i;
  %sysfunc(compress(
    %do i= 1 %to %eval(&_length - 1);
      &&rand&i %str(,)
    %end;
    &&rand&i
  ))
%mend random_digit_string;

/* view_digits: RENDER A FEW RANDOMLY CHOSEN DIGITS WITH THE CONTOUR TEMPLATE.*/
/*   DS   DATA SET WITH pic_ID AND THE pixel VARIABLES, ONE IMAGE PER ROW     */
/*   DIM  SIDE LENGTH OF THE SQUARE GRID (DIM*DIM PIXELS PER IMAGE)           */
/* OBSERVATION 1 PLUS 10 RANDOMLY DRAWN OBSERVATIONS ARE UNPIVOTED INTO _xyz  */
/* (X AND Y START AT 0) AND PLOTTED ONE GRAPH PER pic_ID.                     */
/* ROW NUMBERS ARE DRAWN FROM 1 TO THE ACTUAL NUMBER OF OBSERVATIONS IN DS.   */
/* THE EARLIER FORM floor(n*ranuni) PRODUCED 0 TO n-1, WHICH COULD REQUEST    */
/* THE NONEXISTENT OBSERVATION 0 AND COULD NEVER SELECT THE LAST ROW.         */
%macro view_digits(DS, DIM);
  %local _length _nobs _seed;
  ods listing close;
  ods html close;
  ods html;
  %let _length= 10;
  %let _seed= %sysfunc(floor(%sysfunc(time())));

  /* draw the random row numbers */
  data _r(keep= r i);
    length r 8;
    /* never executed: NOBS= is filled in when the step is compiled */
    if 0 then set &DS(keep= pic_ID) nobs= _total;
    call symput('_nobs', trim(left(put(_total, best12.))));
    do i= 1 to &_length;
      r= floor(_total*ranuni(&_seed)) + 1;    /* 1 .. _total */
      output;
    end;
    stop;
  run;
  proc sort data= _r; by r; run;

  /* publish them as macro variables rand1, rand2, ... */
  data _null_;
    set _r;
    call symput(left(compress('rand'||_n_)), r);
  run;

  /* read the chosen rows directly and unpivot them to one row per pixel */
  data _xyz;
    do _pick= 1, %random_digit_string(&_length, &_nobs);
      set &DS point= _pick;
      array pixels pixel: ;
      do _cell= 1 to %eval(&dim*&dim);
        x= (_cell-&dim*floor((_cell-1)/&dim))-1;
        y= (%eval(&dim+1)-ceil(_cell/&dim))-1;
        z= pixels[_cell];
        output;
      end;
    end;
    keep pic_ID x y z;
    stop;                 /* required with POINT= because end of file is never reached */
  run;

  proc sgrender data= _xyz template= contour;
    dynamic _title= "Digit Image";
    by pic_ID;
  run;
%mend view_digits;
%view_digits(&train_set._dn_cn, 27);
/* DATA AND METADATA PREP FOR AUTOENCODER *************************************/
/* BUILDS THREE MACRO VARIABLES FROM THE PER-PIXEL MAXIMUM:                   */
/*   targets  PIXELS WHOSE MAXIMUM IS NOT 0, BLANK SEPARATED                  */
/*   inputs   THE SAME NAMES, EACH PREFIXED WITH corrupted                    */
/*   drops    PIXELS WHOSE MAXIMUM IS 0                                       */
proc means data= &train_set._dn_cn (keep= pixel:) noprint;
  var pixel:;
  output out= o (keep= _STAT_ pixel: where= (_STAT_= 'MAX'));
run;

/* one row per pixel: _NAME_ holds the variable name, col1 its maximum */
proc transpose data= o out= ot;
run;

proc sql noprint;
  select _NAME_ into :targets separated by ' '
    from ot
    where col1 ^= 0;
  /* the separator supplies the prefix for every name except the first */
  select _NAME_ into :inputs separated by ' corrupted'
    from ot
    where col1 ^= 0;
  select _NAME_ into :drops separated by ' '
    from ot
    where col1 = 0;
quit;

%put &targets;
/* add the prefix that the separator could not supply to the first name */
%let inputs= corrupted&inputs;
%put &inputs;
%put &drops;
/* CREATE CORRUPTED COPIES OF TRAINING DATA.                                  */
/* EVERY PIXEL IN THE TARGET LIST GETS A corrupted TWIN. THE TWIN IS SET TO 0 */
/* WHEN A UNIFORM RANDOM DRAW FALLS BELOW THE THRESHOLD AND OTHERWISE EQUALS  */
/* THE ORIGINAL PIXEL.                                                        */
%let THRESHOLD= 0.05; /* SET BETWEEN 0 AND 1 */
data autoencoderTraining;
  set &train_set._dn_cn (drop= &drops);
  array pixels &targets;
  array corruptedPixels &inputs;
  do p= 1 to dim(pixels);
    corruptedPixels[p]= ifn(rand('UNIFORM') < &THRESHOLD, 0, pixels[p]);
  end;
  drop p density;
run;
/* CHECK CORRUPTION.                                                          */
/* EXPECTATION: (CORRUPTED PIXEL MEAN INTENSITY) ~                            */
/*              (PIXEL MEAN INTENSITY * (1 - THRESHOLD))                      */
/* ONE PIXEL WITH A NONZERO MAXIMUM IS PICKED AT RANDOM: THE ROWS ARE ORDERED */
/* RANDOMLY AND INTO STORES THE VALUE FROM THE FIRST ROW. ITS MEAN IS THEN    */
/* PRINTED NEXT TO THE MEAN OF ITS CORRUPTED TWIN.                            */
proc sql noprint;
  select _NAME_ into :checkVar
    from ot
    where col1 ne 0
    order by rand('UNIFORM');
quit;   /* PROC SQL is ended by QUIT; a RUN statement has no effect on it */
%put &checkVar;
proc means data= autoencoderTraining mean;
  var &checkVar corrupted&checkVar;
run;
/* TRAIN AUTOENCODER **********************************************************/
/* CREATE REQUIRED DMDB CATALOG.                                              */
/* THE CATALOG work.autoencoderTrainingCat IS WHAT THE PROC NEURAL STEP       */
/* BELOW NAMES IN ITS DMDBCAT= OPTION.                                        */
proc dmdb
  data=    autoencoderTraining
  out=     autoencoderTrainingDMDB
  dmdbcat= work.autoencoderTrainingCat;
  var    &inputs &targets;
  class  label;
  id     pic_ID;
  target &targets;
run;
*** TRAIN AUTOENCODER;
*** REDIRECT LONG OUTPUT;
/* HTML OUTPUT IS CLOSED AND THE LISTING DESTINATION OPENED, THEN PROCEDURE   */
/* OUTPUT IS ROUTED TO THE FILE BEHIND FILEREF out UNTIL THE CLOSING          */
/* PROC PRINTTO AT THE END OF THIS SECTION.                                   */
ods html close;
ods listing;
filename out 'neural.lst'; /* ENTER FILENAME FOR OUTPUT */
proc printto print= out; run;
/* THE NETWORK READS THE CORRUPTED PIXELS AS INPUTS AND THE ORIGINAL PIXELS   */
/* AS TARGETS. RANDOM= IS A FIXED SEED VALUE.                                 */
proc neural
data= autoencoderTraining
dmdbcat= work.autoencoderTrainingCat
random= 11111;
performance compile details cpucount= &cpu_count threads= yes; /* ENTER VALUE FOR CPU COUNT */
/* DO NOT EXCEED NUMBER OF PHYSICAL CORES */
/* cpu_count IS THE MACRO VARIABLE SET NEAR THE TOP OF THE PROGRAM; IT IS     */
/* EMPTY THERE AND MUST BE FILLED IN BEFORE THIS STEP IS RUN                  */
netopts decay= 0.5;
/* DEFAULTS: ACT= TANH COMBINE= LINEAR */
/* IDS ARE USED AS LAYER INDICATORS - SEE FIGURE 6 */
/* INPUTS AND TARGETS SHOULD BE STANDARDIZED */
/* NOTE(review): THE INPUT AND TARGET STATEMENTS BELOW SPECIFY STD= NONE, SO  */
/* NO STANDARDIZATION IS REQUESTED HERE - CONFIRM THIS IS INTENDED            */
/* FIVE HIDDEN LAYERS OF 300, 100, 2, 100 AND 300 UNITS. THE TWO-UNIT MIDDLE  */
/* LAYER h3 IS THE ONLY HIDDEN LAYER WITH AN EXPLICIT ACTIVATION (LINEAR).    */
/* THE LATER FEATURE EXTRACTION STEP KEEPS VARIABLES h31 AND h32, PRESUMABLY  */
/* THE OUTPUTS OF THIS LAYER - VERIFY AGAINST THE GENERATED SCORE CODE        */
archi MLP hidden= 5;
hidden 300 / id= h1;
hidden 100 / id= h2;
hidden 2 / id= h3 act= linear;
hidden 100 / id= h4;
hidden 300 / id= h5;
input &inputs / id= i level= int std= none;
target &targets / act= identity id= t level= int std= none;
/* BEFORE PRELIMINARY TRAINING WEIGHTS WILL BE RANDOM */
initial infan= 1;
prelim 10 preiter= 10;
/* TRAIN LAYERS SEPARATELY */
/* STAGE 1: ALL HIDDEN-TO-HIDDEN CONNECTIONS ARE FROZEN. i->h1 IS NOT FROZEN  */
/* YET, AND h5->t IS NEVER FROZEN OR THAWED ANYWHERE IN THIS STEP             */
freeze h1->h2;
freeze h2->h3;
freeze h3->h4;
freeze h4->h5;
train technique= congra maxtime= 10000 maxiter= 1000;
/* STAGES 2 TO 5: FREEZE THE CONNECTION JUST TRAINED, THAW THE NEXT ONE,      */
/* THEN TRAIN AGAIN                                                           */
freeze i->h1;
thaw h1->h2;
train technique= congra maxtime= 10000 maxiter= 1000;
freeze h1->h2;
thaw h2->h3;
train technique= congra maxtime= 10000 maxiter= 1000;
freeze h2->h3;
thaw h3->h4;
train technique= congra maxtime= 10000 maxiter= 1000;
freeze h3->h4;
thaw h4->h5;
train technique= congra maxtime= 10000 maxiter= 1000;
/* RETRAIN ALL LAYERS SIMULTANEOUSLY */
/* h4->h5 WAS THAWED FOR THE PREVIOUS STAGE AND HAS NOT BEEN FROZEN SINCE,    */
/* SO ONLY THE OTHER FOUR CONNECTIONS NEED TO BE THAWED HERE                  */
thaw i->h1;
thaw h1->h2;
thaw h2->h3;
thaw h3->h4;
train technique= congra maxtime= 10000 maxiter= 1000;
/* THE SCORE CODE WRITTEN HERE IS INCLUDED BY THE FEATURE EXTRACTION STEP     */
code file= 'neural.sas'; /* ENTER SCORE CODE FILE PATH - SAME AS NEXT COMMENT BELOW */
run;
/* PROC PRINTTO WITHOUT OPTIONS ENDS THE REDIRECTION STARTED ABOVE */
proc printto; run;
/* EXTRACT AND PLOT FEATURES **************************************************/
/* THE SCORE CODE WRITTEN BY PROC NEURAL IS RUN AGAINST EVERY TRAINING ROW.   */
/* ONLY label, h31 AND h32 ARE KEPT.                                          */
/* A PROGRESS LINE GOES TO THE LOG EVERY 10TH ROW. THE TOTAL IN THAT MESSAGE  */
/* COMES FROM NOBS= SO IT MATCHES THE ACTUAL SIZE OF THE INPUT RATHER THAN A  */
/* FIXED 2000.                                                                */
options nosource2;    /* do not echo the included score code to the log */
data extractedFeatures(keep= label h31 h32);
  set autoencoderTraining nobs= _n_total;
  %include 'neural.sas'; /* ENTER SCORE CODE FILE PATH - SAME AS LAST COMMENT ABOVE */
  if mod(_n_, 10) = 0 then do;
    line= 'Processing line '||strip(put(_n_, best.))||' of '||strip(put(_n_total, best.))||'.';
    put line;
  end;
run;
*** PLOT THE TWO EXTRACTED FEATURES AS HTML OUTPUT;
*** EACH POINT IS DRAWN AS ITS DIGIT LABEL AND GROUPED BY LABEL;
ods html;
ods listing close;
proc sort data= extractedFeatures;
  by label;
run;
proc sgplot data= extractedFeatures;
  scatter x= h32 y= h31 /
    markerchar= label
    markercharattrs= (size= 3.75pt)
    group= label
    groupdisplay= cluster
    clusterwidth= 0
    transparency= 0.3;
run;