-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmain.tex
1423 lines (1188 loc) · 53.5 KB
/
main.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% To compile as an article.
\documentclass[oneside,a4paper]{article} % fleqn
\newcommand{\level}{\section}
\newcommand{\sublevel}{\subsection}
\newcommand{\subsublevel}{\subsubsection}
\usepackage[a4paper,left=2.5cm,right=2.5cm,top=2.5cm,bottom=2.5cm]{geometry}
% To compile as a book.
% \documentclass[oneside,a4]{book}
% \newcommand{\level}{\chapter}
% \newcommand{\sublevel}{\section}
% \newcommand{\subsublevel}{\subsection}
% \usepackage[paperwidth=14.5cm, paperheight=21.5cm]{geometry}
% \hoffset = -35pt
% \voffset = -40pt
% \oddsidemargin = 0pt
% \evensidemargin = 0pt
% \topmargin = 0pt
% \textheight = 500pt
% \textwidth = 340pt
\usepackage{algorithm2e}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{color}
\usepackage{dsfont}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{mathtools}
\usepackage{makeidx}
\usepackage{multicol}
\usepackage{soul}
\usepackage{wrapfig}
\hypersetup{
colorlinks=false,
pdfborder={0 0 0},
}
% \usepackage{fancyhdr}
% \pagestyle{fancy}
% \lhead{}
% \chead{\textcolor{red}{This is a rough draft!}}
% \rhead{}
% \renewcommand{\headrulewidth}{0pt}
% Soft colors.
% \definecolor{red}{rgb}{0.8,0.1,0.1}
% \definecolor{blue}{rgb}{0.1,0.2,0.8}
% Some useful definitions.
% \newcommand{\note}[1]{\hl{#1}}
\newcommand{\note}[1]{\textcolor{red}{#1}}
\newcommand{\la}{\leftarrow}
\newcommand{\Lra}{\Leftrightarrow}
\newcommand{\nat}{\mathds{N}}
\newcommand{\new}[1]{\textcolor{black}{#1}} % blue
% \newcommand{\new}[1]{\fcolorbox{black}{white}{#1}}
\newcommand{\ol}{\overline}
\newcommand{\pcunchanged}{c_{t + 1} = c_t}
\newcommand{\Ra}{\Rightarrow}
\newcommand{\ra}{\rightarrow}
\newcommand{\term}[1]{\textbf{#1}\index{#1}}
\newcommand{\altterm}[2]{\textbf{#1}\index{#2}}
% \newcommand{\term}[1]{\textcolor{magenta}{#1}}
\newcommand{\true}{\textbf{true}}
\newcommand{\false}{\textbf{false}}
\newcommand{\ifl}{\en{if}\ru{если}}
% Execution marks
\newcommand{\te}[1]{#1^T} % Taint execution
\newcommand{\se}{\overline} % Symbolic execution
\newcommand{\dse}{\widetilde} % Dynamic symbolic execution
% Assert, declare and update as commands.
\newcommand{\assert}{\textbf{assert}\,}
\newcommand{\declare}{\textbf{declare}\,\,}
\newcommand{\update}{\textbf{update}\,\,}
% Assert, declare and update as statements.
% \newcommand{\assert}{c_{t + 1} = c_t \cup}
% \newcommand{\declare}{}
% \newcommand{\update}{}
\newcommand{\lblock}{ \left \{ \begin{array}{l} }
\newcommand{\rblock}{ \end{array} \right. }
\renewcommand{\arraystretch}{1.5}
% \renewcommand\qedsymbol{$\blacksquare$}
\setcounter{secnumdepth}{0}
\setlength{\skip\footins}{0.7cm}
\setlength{\footnotesep}{0.3cm}
% Lanugage.
\newcommand{\en}[1]{#1}
\newcommand{\ru}[1]{}
\newcommand{\noru}[1]{}
% Images.
\newcommand{\centerimage}[2]{
\begin{figure}[h!]
\begin{center}
\includegraphics[scale=1.25]{pdf/#1.pdf}
\caption{#2}
\end{center}
\label{img:#1}
\end{figure}
}
\newcommand{\rightimage}[3]{
\begin{wrapfigure}{R}{#3\textwidth}
\centering
\includegraphics[scale=1.25]{pdf/#1.pdf}
\caption{\label{img:#1}#2}
\end{wrapfigure}
}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}{Claim}
\lstset{
basicstyle=\linespread{0.9}\small\ttfamily,
keywordstyle=\bfseries\color{black},
stringstyle=\color{black},
identifierstyle=\color{black}
}
\setlength{\parindent}{0em}
\title{
\en{Formal model of program execution, symbolic,
dynamic symbolic execution (concolic execution),
and taint tracking}
\ru{Формальная модель исполнения программы, символьного,
динамического символьного исполнения (конколик-исполнения),
и отслеживания помещенных данных}
}
\author{\en{Sergey Vartanov}\ru{Сергей Вартанов} \\
\href{mailto:[email protected]}{\texttt{[email protected]}},
\href{mailto:[email protected]}{\texttt{[email protected]}}}
\makeindex
\begin{document}
\maketitle
% \setlength{\parskip}{0em}
% \tableofcontents
% \pagebreak
\setlength{\parskip}{0.5em}
\begin{abstract}
\en{The primary objective of this document is to propose a formal definition of
some program analysis terms, such as execution (concrete execution),
(pure) symbolic execution,
dynamic symbolic execution (concolic execution),
feasibility, and taint analysis.}
\ru{Основная цель данной работы --- предложить формальную модель некоторых
терминов из области анализа программ, таких как исполнение
(фактическое исполнение),
(чистое) символьное исполнение,
динамическое символьное исполнение (конколик-исполнение),
исполнимость и отслеживание помеченных данных.}
\end{abstract}
\level{\en{Concrete program execution}\ru{Фактическое исполнение программы}}
\sublevel{\en{Program definition}\ru{Определение программы}}
\en{We will start with a program definition.
In order to separate program execution from
symbolic and dynamic symbolic execution, which will be discussed later,
we will use terms \textit{concrete program}
and \textit{concrete program execution}
since it processes concrete data.}
\ru{Мы начнём с определения программы.
Чтобы подчеркнуть разницу между исполнением программы,
символьным и динамическим символьным исполнением,
которые будут рассмотрены позднее,
мы будем использовать термин фактическое исполнение программы,
поскольку при этом процессе обрабатываются фактические данные.}
\en{We want to start with the classical definition of a
\term{computational method} from
Donald Knuth's \textit{The Art of Computer Programming}~\cite{knuth}.
Computational method is $\langle Q, I, \Omega, f \rangle$,
where $f: Q \ra Q$, $Q$~is states\footnote{
Note, that there is no requrement, that all states should be reachable.
We can have state $q \in Q$ that is not initial and
for all initial states $q$ is not in it's execution.
}, $\Omega$~is terminal states, $I$~is input states.
In this model execution of a computational method $\forall x \in I$ is
$x_0, x_1, \dots, x_k, \dots$, where $x_0 = x, x_{k + 1} = f(x_k)$.
For convenience, we will use slightly different notation
because it will interfere with our further considerations
(superscript will be used to distinguish different program definitions):
\begin{align}\mathcal{P}^\text{K} =
\langle \mathcal{F}, D, D_I, D_T \rangle,\end{align}
where $\mathcal{F}: D \ra D$ is an \term{execution function},
$D$~is a set of states, $D_I$~is a set of initial states, $D_T$~is a set of
terminal states.
This definition seems to be concise and comprehensive.
It has no any assumptions about the nature of states $D$ and its limitations.
% Theoretically any real-world program execution process can be described
% by this definition.
However, since we want to highlight some particular aspects of modern programs
and computer systems, we will add some constraints and expand this definition
with more complex constructions.}
\noru{}
\subsublevel*{\en{Split execution function}\ru{Разделение функции исполнения}}
% \en{Firstly, execution function $\mathcal{F}$ is too complex.
% It is a single function that manages the whole execution process.
% To write a program we should somehow split this function into a set of
% more simple functions.
% % Let assume that set $D$ is finite: $|D| = n$.
% There is a function result for each value $d_i$ from $D$.}
% \ru{Во-первых, функция исполнения $\mathcal{F}$ слишком сложная.
% Это единственная функция, которая управляет всем процессом исполнения.
% Чтобы иметь возможность составлять программы,
% мы должны каким-то образом разделить
% функцию выполнения на множество более простых функций.
% Допустим, что множество $D$ конечно: $|D| = n$.
% Мы можем посчитать значения функции для каждого значения $d_i$ из $D$.}
% \begin{align}
% \mathcal{F}(d) =
% \begin{cases}
% d_1', & \text{\ifl}\,\, d = d_1, \\
% d_2', & \text{\ifl}\,\, d = d_2, \\
% \dots \\
% d_n', & \text{\ifl}\,\, d = d_n, \\
% \dots \\
% \end{cases}
% \end{align}
% \en{We can split set $D$ into disjoint sets:
% $D = s_0 \cup s_1 \cup \dots \cup s_m \cup \dots,
% \forall i, j \,\, i \neq j \Ra s_i \cap s_j = \varnothing$.
% These sets should be disjoint because we have to know current program state
% at each point in time.
% Now for each set $s_i$ we can define function $f_i: s_i \ra D$ such as
% $\forall d \in s_i \,\, f_i(d) = \mathcal{F}(d)$.}
% \noru{}
\sloppy
Suppose that execution function $\mathcal{F}$ is a function piecewise-defined
for sets $s_0, s_1, \dots, s_m, \dots$, where
$D = s_0 \cup s_1 \cup \dots \cup s_m \cup \dots,
\forall i, j \,\, i \neq j \Ra s_i \cap s_j = \varnothing$.
\begin{align}
\mathcal{F}(d) =
\begin{cases}
f_0(d), & \text{\ifl}\,\, d \in s_0, \\
f_1(d), & \text{\ifl}\,\, d \in s_1, \\
\dots \\
f_m(d), & \text{\ifl}\,\, d \in s_m, \\
\dots \\
\end{cases}
\end{align}
It is important that sets are disjoint because we have to know current
program state at each point in time.
\en{We will call these functions \textit{execution function elements} or
for short \altterm{function elements}{function element}.}
\en{Also, we can treat set $s_i$ as a state.
And here we have to clarify what is meant by a term \term{state}.}
\en{We have to turn to the definition of
Turing's machine~\cite{turing_computable_numbers}.
There are two different meanings of a term \term{state}.
\begin{enumerate}
\item First meaning is the one used in Knuth's definition.
It is a whole program execution state in some particular moment of
time.
In~\cite{turing_computable_numbers} it also called state of progress,
state of the system, or state formula.
For that meaning we will use term \textbf{\term{process state}}.
For the process state we will use small letter $d$,
for the set of process states---capital letter $D$.
\item The second one is the one we used.
In~\cite{turing_computable_numbers} it also called
record of \textit{m}-configuration, value of state register,
or designator of instruction.
For that meaning we will use term \textbf{\term{program state}}.
For the program state we will use small letter $s$\footnote{
$D$ and $s$ are both sets of $d$ and formally the same.
But we will use them in different situations.
},
for the set of program states---capital letter $S$.
\end{enumerate}}
\ru{Мы будем называть эти функции \textit{элементами функции исполнения}, или
коротко \altterm{функциональными элементами}{функциональный элемент}.
Они представляют собой инструкции или операции.
Также, мы можем рассматривать множества $s_i$ в качестве состояний.
И здесь важно уточнить, что мы понимаем под термином \term{состояние}.
Мы должны обратиться к определению
машины Тьюринга~\cite{turing_computable_numbers}.
В этой работе приводится два значения термина \term{состояние}.
\begin{enumerate}
\item Первое значение~--- то, которое используется в определении Кнута.
Это состояние всего исполнения программы в определённый момент времени.
В~\cite{turing_computable_numbers} оно также называется состоянием
прогресса, состоянием системы или формулой состояния.
В этом значении мы будем использовать термин
\textbf{\term{состояние процесса}}.
Для состояния процесса мы будем использовать строчную букву $d$,
для множества состояний процесса~--- прописную букву $D$.
\item Второе значение~--- то, которое использовали мы.
В~\cite{turing_computable_numbers} оно также называется записью
\textit{m}-конфигурации, значением регистра, или указателем инструкции.
В этом значении мы будем использовать термин
\textbf{\term{состояние программы}}.
Для состояния программы мы будем использовать строчную букву $s$,
для множества состояний программы~--- прописную букву $S$.
\end{enumerate}}
\centerimage{states}{
\en{Process state and program state specifiers.}
\ru{Состояние процесса и состояние программы.}
}
\en{It is importaint to note that function elements and
states were defined in such a way that two statements
``next function element to be executed is $f_i$'' and
``current state is $s_i$'' mean the same.}
\en{For convenience, we will require that $s_0 = D_I$
so we can treat $s_0$ as an initial program state
and that there is a superset $\{s_j\}$ that form $D_T$
so we can treat them as terminal program states:
$S_T = D_T = \bigcup_{j} s_j$.}
Now we can propose another definition of the program:
\begin{align}\mathcal{P}^1 = \langle F, D, S, s_0, S_T \rangle,\end{align}
where
$F = \{f_i\}, f_i: D \ra D$ is a set of function elements,
$D$ is a set of all possible process states,
$S = \{s_j\}$ is a set of program states,
$s_0$ is a initial program state,
$S_T = \{s_k\}$ is a set of terminal program states.
Note that current program state defines the next function element to be
used to proceed program execution.
Therefore we can
define initial function element instead of initial program state, and
terminal function elements instead of terminal program states:
\begin{align}\mathcal{P}^2 = \langle F, D, S, f_0, F_T \rangle\end{align}
where
$f_0$ is a initial function element,
$F_T = \{f_k\}$ is a set of terminal function elements.
\ru{Важно подчеркнуть, что мы определили функциональные элементы и состояния
таким образом, что два утверждения
<<следующим выполненым функциональным элементом будет $f_i$>> и
<<текущее состояние~--- $s_i$>> эквивалентны.}
\ru{Для удобства потребуем, чтобы $s_0 = D_I$
(в таком случае мы можем рассматривать $s_0$ как начальное состояние программы)
и чтобы существовало подмножество $\{s_j\}$, которое формирует $D_T$
(в таком случае мы можем рассматривать их как терминальные состояния программы:
$D_T = \bigcup_{j} s_j$).}
Here $F$ can be treated as a set of program \altterm{statements}{statement}
but not as \term{instruction set}, because they may share semantics but work
for different states.
\subsublevel*{Program counter}
%\subsublevel*{Split data flow and control flow}
% $S = \{s_j\}$ was described as an arbitrary partition of $D$.
% However, with some assumptions about that partition we can get
% new properties of the program definition.
% Firstly, assume, that the partition $\{s_j\}$ was made in such a way,
% that every process state $d_i$ can be divided into two parts:
% $d_i = (d^D_i, d^F_i)$.
Assume as well that every process state $d_i$ can be divided into two parts:
$d_i = (d^D_i, d^F_i)$.
So now there are two sets $D^D = \{d^D_i\}$ and $D^F = \{d^F_i\}$,
where the second part only defines its state.
I.e. $\exists g: g(d^F_i) = j \Lra d_i \in s_j$.
In that case we can say that $d^F_i$ is a \term{program state designator}.
It can be treated as \term{instruction pointer} or \term{program counter}.
% Secondly, assume, that for every $f_i$ there are
% functions $f^D_i$ and $f^F_i$
% such as if $f_i(d_j) = d_k$,
% $f^D_i(d^D_j) = d^D_k$ and $f^F_i(d^F_j) = d^F_k$
% (where $d_j = (d^D_j, d^F_j)$ and $d_k = (d^D_k, d^F_k)$).
% In that case we divided our process state into a control flow and data parts.
%Assume also, that for every $f_i$
% there is a function $f^D_i$ such as
% $(d^D_k, d^F_k) = f_t\big((d^D_i, d^F_i)\big) \Lra
% d^D_k = \varphi_t(d^D_i)$.
% remove program state designator from the process state, but
We can a bit redefine function elements and make them return
new process state \textbf{and}
next function element: $\mathcal{P} = \langle F, D, S, f_0, F_T \rangle$,
where $F = \{f_j\} = \{(f^F_j, f^D_j)\}$,
$f_j: s_j \ra F \times D$, $f_j = (f^F_j, f^D_j)$:
$f^F_j: s_j \ra F$~is \term{control flow function element},
$f^D_j: s_j \ra D$~is \term{data flow function element}.
Function element could be considered as program instruction, that
computes next program instruction \{$f^F_j$\} and modifies data \{$f^D_j$\}.
Additionally, we will assume the following.
\begin{itemize}
\item Function element set is not empty: $F \neq \varnothing$.
\item $\forall j \,\, f^F_j$ is a total function.
\begin{itemize}
\item If $f_j \notin F_T$,
$\forall d \in D \,\, f^F_j(d) = f_k, f_k \in F$.
\item If $f_j \in F_T$,
$\forall d \in D \,\, f^F_j(d) = f_j$.
\end{itemize}
\item To simplify considering, we will assume that unless otherwise stated
there is one and only one initial function element $f_0$:
$F_I = \{f_0\}$.
\item Program has at least one terminal function element:
$F_T \neq \varnothing$.
\end{itemize}
\subsublevel*{\en{Limitations}\ru{Ограничения}}
\en{We assume \textbf{deterministic} program execution
using \textbf{limited} binary data without \textbf{self-modification}.
Concrete program execution means normal program execution
on given binary data.}
\ru{Будем рассматривать \textbf{детерминированное} исполнение программы без
\textbf{самомодификаций}, которая использует \textbf{ограниченный} набор
бинарной информации.}
\begin{itemize}
\item Firstly, we should emphasize that we distinguish data ($D$)
from function elements ($F$).
It implies that self-modification is not possible.
To allow self-modification, we should merge $F$ and $D$ into one set.
\item Program is deterministic: $f^F_j: D \ra F$.
For nondeterministic program it should be: $f^F_j: D \ra F*$.
\item Data is limited and predefined: $|D| < \infty$.
\end{itemize}
\subsublevel*{\en{Possible data model definition}\
\ru{Возможные определения модели данных}}
\en{Data set can be homogenous or heterogenous.
In Knuth's examples data set is heterogenous (different vector sizes).
In real-world programs data is set homogenous.}
\ru{Множество данных может быть как гомогенным, так и гетерогенным.
В примерах Кнута множество данных гетерогенно (векторы имеют разные размеры).
В реальных программах множество данных гомогенно.}
\en{Data set $D$ is a set of all possible values that can be arguments of
function elements $f_j$: $D = \{d_j\}$.
Data $d \in D$ is an abstract entity and it can be anything.
The simplest data definition is a set of values: $d = \{d^j\}, d \in D$
(we will use superscript for data elements because it is more convenient to use
subscript for data instances), where $d^j$ is a Boolean variable,
Boolean vector, natural, integer, real number, or anything else.}
\noru{}
\en{As we want to model real computer programs we can notice,
that simplest definition of data as a Boolean variables is useful enough
for majority of computer systems, which operates limited binary memory.}
\noru{}
\en{However it is convenient to use more complicated data definitions to
simplify modeling.}
\noru{}
Binary data:
\begin{itemize}
\item $B = (\false, \true), \mathcal{B}_2 = B^8$.
\end{itemize}
Data model definition with registers and memory:
\begin{itemize}
\item $D_1 = \{d_j\} = R \cup M$.
\begin{itemize}
\item $R = \{r_j\}$~--- \term{concrete registers}.
$r_j \in \mathcal{B}_2$.
\item $M = \{m_j\}$~--- \term{concrete memory}. $m_j \in \mathcal{B}_2$.
\end{itemize}
\end{itemize}
For the next data model definition we will
\textbf{avoid limited data condition}.
Here $Y$ is a set of sets $Y_i$.
Each set $Y_i$ may contain an arbitrary number of elements.
Data model definition with set of inputs, registers, memory,
and external knowledge about array allocation:
\begin{itemize}
\item $D_2 = \{d_j\} = Y \cup R \cup M \cup L$. $|L| \le |M|$.
\begin{itemize}
\item $Y = \{Y_i\}$~--- \term{concrete inputs},
$Y_i = \{y_{i, j}\}$~--- \term{concrete input}.
$y_{i, j} \in \mathcal{B}_2$.
% \item $Y = \{ Y_i \}$~--- \term{inputs},
% $Y_i = \langle \{y_{i, j}\}, y_{i, l} \rangle$~--- \term{input}.
% $y_{i, j} \in \mathcal{B}_2, y_{i, l} \in \mathcal{B}_2$.
\item $R = \{r_j\}$~---
\term{concrete registers}. $r_j \in \mathcal{B}_2$.
\item $M = \{m_j\}$~---
\term{concrete memory}. $m_j \in \mathcal{B}_2$.
\item $L = \{l_j\}$~---
\term{concrete array lengths}. $l_j \in \mathcal{B}_2$.
\end{itemize}
\end{itemize}
\subsublevel*{Control flow graph}
Set of all possible next program states for program state $s_i$ is
$\{s_j \in S | \exists d' = f_i(d), d \in s_i, d' \in s_j\}$.
\altterm{Control flow graph}{control flow graph} is a directed graph where
vertices are program states and there is an edge from
vertex $s_i$ to vertex $s_j$ iff $s_j \in f_i[s_i]$.
Function element $f_j = (f_j^F, f_j^D)$ is \term{branch} iff $|f_j[D]| > 1$.
Otherwise (if range of the control flow function
element has exactly one element),
function element is \term{operation},
i.e. iff $\exists f_k: f_j^D(d) = f_k, \forall d \in D$.
The set of all branches is $F_B = \{f_j \in F | |f_j[D]| > 1\}$.
The set of all operations is $F_O = \{f_j \in F | |f_j[D]| = 1\}$.
Since $|f_j[D]|$ by definition is either equals to 1, or greater than 1,
$F = F_B \cup F_O$ and $F_B \cap F_O = \varnothing$.
We will use graphical representation of control flow graph using circles
for vertices with function element captions and arrows for edges.
Vertices that correspond to terminal function elements will be represented
by circles with double stroke.
Vertices that correspond to initial function elements will be represented
as usual.
We assume that either there is a single initial element $f_0$,
or initial elements are represented by vertices that has no incoming edges.
\altterm{Basic blocks}{basic block} are sets of \note{continue...}
\subsublevel*{Semantics}
\begin{table}
\begin{center}
\begin{tabular}{| c | c |}
\hline
$(x, y)$ & $f^D_\land(x, y)$ \\ \hline \hline
(\false, \false) & (\false, \true) \\ \hline
(\false, \true) & (\false, \true) \\ \hline
(\true, \false) & (\false, \true) \\ \hline
(\true, \true) & (\true, \true) \\ \hline
\end{tabular}
\label{tbl:conjunction}
\end{center}
\caption{Conjunction truth table.}
\end{table}
\altterm{Concrete semantics}{concrete semantics} is a set of all data function
elements $f^D_j$ used in program: $F^D$.
In other words, concrete semantics defines
possible ways to transform data in program.
It could be considered as an instruction set, or a system of commands.
Semantics cannot be defined without data $D$ definition.
For example, let's take a look at logical conjunction.
It is a binary logical operator, which means it uses two Boolean operands
and returns Boolean result.
But to define an operation $f^D_\land$, that performs logical conjunction,
we should also define data $D$ and how this operation transform data.
Let's define its semantics as $f^D_\land(x, y) = (x \land y, y)$,
where $x, y \in B$.
It could be represented in the form of assignment $x \la x \land y$.
It could also be represented in the form of truth table
(see Table \ref{tbl:conjunction}).
\sublevel{Execution}
We consider that program execution is an iterative process with steps.
For further considerations we will use variable $t \in \nat$ as
\term{execution step}.
At every step program execution has its \term{program state} $s = (f, d)$:
next function element (next instruction) and current \term{data state}.
$f \in F, d \in D$.
Set of all possible execution states is $S$.
Program state at the iteration $t$ is
$s_t = (f_t, d_t) = \big((f_t^F, f_t^D), d_t\big)$.
$(f_0, d_0), f_0 \in F_I$ is \term{initial state}.
Any state $(f_t, d_t)$ such as $f_t \in F_T$ is \term{terminal state}.
We consider that there is \term{execution function} $\mathcal{F}: S \ra S$
that manages the execution process:
\[\mathcal{F}(s_t) = \mathcal{F}\big((f_t, d_t)\big) =
f_t(d_t) =
\big(f_t^F(d_t), f_t^D(d_t)\big) =
(f_{t + 1}, d_{t + 1}) = s_{t + 1}.\]
Execution function $\mathcal{F}$ defines an \term{execution rule}:
\begin{itemize}
\item $f_{t + 1} = f_t^F(d_t)$,
\item $d_{t + 1} = f_t^D(d_t)$.
\end{itemize}
\altterm{Execution of a program}{program execution}
$\mathcal{P} = \langle F, F_I, F_T, D \rangle$
on \term{input data} $d_0$, where $F_I = \{f_0\}$,
is a chain of program states,
starting with initial state $s_0 = (f_0, d_0)$,
where next state is a function $\mathcal{F}$ of previous state:
$s_{t + 1} = \mathcal{F}(s_t), s_t \in S$.
Therefore execution can be represented as iterative use of execution function:
$\mathcal{F}(\mathcal{F}(\dots\mathcal{F}(d_0)\dots))$.
We consider that if program execution approaches state $s_t = (f_t, d_t)$,
where $f_t \in F_T$, program execution is terminated.
Hence, execution is either infinite chain
\[E(d_0) = s_0 \ra s_1 \ra \cdots \ra s_t \ra \cdots =
(f_0, d_0) \ra (f_1, d_1) \ra \cdots \ra (f_t, d_t) \ra \cdots,\]
where $\forall j \,\, f_j \notin F_T$ (in this case $d_t$ is an
\term{output data} of program execution $E(d_0)$),
or finite chain
\[E(d_0) = s_0 \ra s_1 \ra \cdots \ra s_t =
(f_0, d_0) \ra (f_1, d_1) \ra \cdots \ra (f_t, d_t),\]
where $f_t \in F_T \land \forall j < t \,\, f_j \notin F_T$
(in this case execution has no output data).
For the first case we will use notation $E(d_0) \ra d_t$,
for the second: $E(d_0) \ra \infty$.
Note, that execution is a sequence of states:
$E(d_0) = (s_0, s_1, \dots, s_t, \dots)$.
But we will use notation
$E(d_0) = s_0 \ra s_1 \ra \dots \ra s_t \ra \dots$
to emphasize that it is a process.
\begin{lemma}
\label{exists_one_execution}
$\forall \mathcal{P} = \langle F, F_I, F_T, D \rangle \,\,
\forall d \in D \,\, \exists! E(d)$.
\end{lemma}
\begin{proof}
Let's prove it by constructing the execution chain.
By the program definition, there is a initial function element $f_0$.
Hence, $\forall d \in D$
there is a initial program state $s_0 = (f_0, d)$.
For every program state $s_t = (f_t, d_t)$,
either $f_t \notin F_T \Ra f_t(d_t) = (f_{t + 1}, d_{t + 1})$
and we have next step of the execution,
or $f_t \in F_T \Ra$ execution is terminated.
All steps of this process are deterministic,
hence we will have execution $E(d)$ determined by initial input data $d$.
\end{proof}
To visualize the process of the program execution we will use circles
for program states connected by arrows which depicts function $\mathcal{F}$.
If program execution is finite, the last state will be depicted by
a circle with double stroke.
\centerimage{concrete_execution}
{Infinite and finite concrete program execution.}
We will also define \term{execution path} as either any finite sequence
$P = (f_0, f_1, \dots, f_t)$, where
$\forall j \,\, f_j \in F \land f_{j + 1} \in f_j[D]$
and $f_t \in F_T \land \forall j < t \,\, f_j \notin F_T$
or infinite sequence $P = (f_0, f_1, \dots, f_t, \dots)$, where
$\forall j \,\, f_j \in F \land f_{j + 1} \in f_j[D] \land f_j \notin F_T$.
\begin{lemma}
There is one and only one execution path $P$ for the execution $E(d_0)$.
\[ \begin{array}{c}
\forall E(d_0) = s_0 \ra s_1 \ra \cdots \ra s_t \ra \cdots =
(f_0, d_0) \ra (f_1, d_1) \ra \cdots \ra (f_t, d_t) \ra \cdots \\
\exists! P = (f'_0, f'_1, \dots, f'_t, \dots):
f'_0 = f_0, f'_1 = f_1, \dots, f'_t = f_t, \dots
\end{array} \]
\end{lemma}
\begin{proof}
Let's create a sequence $P = (f_0, f_1, \dots, f_t, \dots)$ from
execution states.
Now we have to proof that $P$ is execution path.
Choose arbitrary $f_j = (f_j^F, f_j^D)$ and $f_{j + 1}$.
By execution definition,
$f_{j + 1} = f_j^F(d_j) \Ra f_{j + 1} \in f_j[D]$.
Hence, $P$ is an execution path.
\end{proof}
We will denote this fact as $E(d_0) \Ra P$.
Let's also define a \term{branch chain} $P_B$ of a execution path $P$
as a subsequence of $P$, which contains only $f_j \in F_B$.
We can denote this fact as $P \Ra P_B$.
% $P_B = \{f_{j_0}, f_{j_1}, \dots, f_{j_k}, \dots\}$, where
% $\forall k \,\, f_{j_k} \in F_B$
\begin{lemma}
$\forall P_B \,\, \exists! P: P \Ra P_B$.
\end{lemma}
\begin{proof}
\note{Add proof.}
\end{proof}
Therefore, we can write $P \Lra P_B$.
Note, that we define execution path just as a path in control flow graph.
It could be infeasible if there is no such input data that implies that path.
Execution path $P$ is \term{feasible in concrete model} iff
$\exists d_0 \in D: (f_1, d_1) = f_0(d_0) \land
\forall j > 0 ~ (f_j, d_j) = f_{j - 1}(d_{j - 1})$.
Some sets of program executions:
\begin{itemize}
\item $\mathds{E}_\mathcal{P}$~--- set of all possible program executions.
\item $\mathds{P}_\mathcal{P}$~--- set of all possible execution paths.
$\mathds{P}'_\mathcal{P}$~---
set of all possible feasible execution paths.
\end{itemize}
\subsublevel*{Number of paths}
\begin{lemma}
$|\mathds{E}_\mathcal{P}| = |D| |\mathcal{B}|$.
\end{lemma}
\begin{proof}
$|D| |\mathcal{B}|$~--- number of all possible data states.
From lemma \ref{exists_one_execution},
$\forall d \in D \,\, \exists! E(d)$.
$\forall d', d'' \in D : d' \neq d'' \,\, E(d') = (f_0, d') \ra \cdots
E(d'') = (f_0, d'') \ra \cdots.
(f_0, d') \neq (f_0, d'') \Ra E(d') \neq E(d'').$
Therefore, number of possible executions is equal to number of
possible initial input data.
\end{proof}
\begin{lemma}
$|\mathds{P}''_\mathcal{P}| \le |D| |\mathcal{B}|$.
\end{lemma}
\level{Symbolic execution model}
Symbolic execution of the program is
an execution of a symbolic model of that program.
Symbolic model includes a set of symbolic function element
(symbolic models of function elements) and
symbolic data model.
Execution is performed using symbolic variables instead of concrete data.
\sublevel{Model}
$\se{\mathcal{P}} = \langle \se{F}, \se{F}_I, \se{F}_T, \se{D} \rangle$
is a \term{symbolic model} of program
$\mathcal{P} = \langle F, F_I, F_T, D \rangle$.
There are a lot of ways to create a symbolic model of a program,
therefore we will use some particular set of rules $R_\text{SE}$.
We can write it as
$\mathcal{P} \xrightarrow[]{R_\text{SE}} \se{\mathcal{P}}$.
It denotes that for all $\mathcal{P}$ exists only one symbolic model
$\se{\mathcal{P}}$ constructed using the set of rules $R_\text{SE}$.
Function elements and data is defines as follows.
\begin{itemize}
\item $\se{F} = \{\se{f}_j\} =
\{(\se{f}_j^F, \se{f}_j^D, \se{f}_j^C)\}$~---
\term{symbolic function element models}
that correspond to function elements of the program $\mathcal{P}$.
\item $\se{F}_I$ is a set of
\term{initial symbolic function element models},
that correspond to initial function elements
of the program $\mathcal{P}$.
\item $\se{F}_T$ is a set of
\term{terminal symbolic function element models},
that correspond to terminal function elements
of the program $\mathcal{P}$.
\item $\se{f}_j: \se{D} \times C \ra
(\se{F} \times \se{D} \times C)*$~---
\term{symbolic function element model}.
% %%% Not sure we have to divide function $\se{f}_j$.
% \begin{itemize}
% \item $\se{f}_j^F: \varnothing \ra \se{F}*$~---
% \term{control flow symbolic function element model},
% \item $\se{f}_j^D: \se{D} \ra \se{D}*$~---
% \term{data symbolic function element model}.
% \item $\se{f}_j^C: \se{D} \times C \ra C*$~---
% \term{data symbolic function element model}.
% \end{itemize}
% \end{itemize}
\end{itemize}
\subsublevel*{Data model}
Symbolic data model $\se{D}$ is a symbolic model of program data $D$:
$D \xrightarrow[]{R_\text{SE}} \se{D}$.
To build a symbolic data model $\se{D}$ of a program, we should choose,
which items of the original data $D$ to be symbolized.
Let's assume out original data $D$ is a set of elements ($D = \{d_j\}$)
and we want to symbolize all of them in our model.
Firstly, we have to define a set of
\altterm{symbolic variables}{symbolic variable}
as a set of variables $X = \{x_j\}$.
And let symbolic data model $\se{D}$ be a
Data model with memory, registers, and array lengths.
\begin{itemize}
\item $\se{D} = \{\se{d}_j\} = \se{R} \cup \new{\se{M}} \cup \se{L}$~---
\term{symbolic data model}.
\begin{itemize}
\item $\se{R} = \{\se{r}_j\}, \se{r}_j = x_r, x_r \in X_R$~---
\term{symbolic registers model}.
\item $\se{M} = \{\se{m}_j\}, \se{m}_j = x_m, x_m \in X_M$~---
\term{symbolic memory model}.
\item $\se{L} = \{\se{l}_j\}, \se{l}_j = x_l, x_l \in X_L$~---
\term{symbolic length model}.
\end{itemize}
\item $X = X_M \cup \new{X_L} \cup X_R$.
\begin{itemize}
\item $X_M = \{x_m\}$~---
\term{memory symbolic variables}. $x_m \in \mathcal{B}$.
\item \new{$X_L = \{x_l\}$~--- \term{length symbolic variables}.
$x_l \in \mathcal{B}$.}
\item $X_R = \{x_r\}$~---
\term{register symbolic variables}. $x_r \in \mathcal{B}$.
\end{itemize}
\end{itemize}
\note{We don't consider data element indices as entities.
That's why they cannot be represented by symbolic variables (or can they?).
There should be such a data model.}
\subsublevel*{Path condition}
Set $C$ is called \term{path condition} or \term{path constraint}.
It is a logical expression over Boolean variables
$X = \{x_k\}, x_k \in \mathcal{B}$ called
\altterm{symbolic variables}{symbolic variable}.
\subsublevel*{Symbolic function element models}
Symbolic function element models are constructed from function elements of the
original program:
$F \xrightarrow[]{R_\text{SE}} \se{F}$, and
$\forall j \,\, f_j \xrightarrow[]{R_\text{SE}} \se{f}_j$.
\begin{align}
\se{f}_t(d_t, c_t) =
\begin{cases}
(\se{f}, \se{d}, c), \\
(\se{f}, \se{d}, c), \\
\dots \\
(\se{f}, \se{d}, c).
\end{cases}
\end{align}
Symbolic function element models are multivalued functions.
Number of values for each $\se{f}_j$ is predefined and doesn't depend on
its arguments.
\sublevel{Execution}
Symbolic execution unlike concrete execution is not an iterative process.
However, it has execution states
$\se{s} = (\se{f}, \se{d}, c)$~--- \term{symbolic state},
$\se{f} \in \se{F}, \se{d} \in \se{D}, c \in C$.
Set of all possible symbolic states is $\se{S}$.
$(\se{f}_0, \se{d}_0, \true)$ is an \term{initial symbolic state}.
Any state $(\se{f}_t, \se{d}_t, c_t)$, such as $\se{f}_t \in \se{F}_T$
is a \term{symbolic terminal state}.
Note, that $t$ is not an execution step here, but a state identifier.
We consider that there is a \term{symbolic execution function}
$\se{\mathcal{F}}: \se{S} \ra \se{S}$
that manages the symbolic execution process:
\begin{align}
\se{\mathcal{F}}(s_{k, \dots, l})
&= \se{\mathcal{F}}
\big((f_{k, \dots, l}, d_{k, \dots, l}, c_{k, \dots, l})\big) = \\
&= \big((f_{k, \dots, l, m}, d_{k, \dots, l, m}, c_{k, \dots, l, m}), \dots,
(f_{k, \dots, l, n}, d_{k, \dots, l, n}, c_{k, \dots, l, n})\big) = \\
&= (s_{k, \dots, l, m}, \dots, s_{k, \dots, l, n})
\end{align}
\altterm{Symbolic execution}{symbolic execution}
$\se{E} = \se{s}_0 \ra \cdots$ visualization is
presented on Figure \ref{img:symbolic_execution}.
% \begin{tikzcd}
% && \se{s}_{0,0,0} \arrow{r} & \cdots \\
% & \se{s}_{0,0} \arrow{ru} \arrow{r} \arrow{rd} & \cdots \\
% \se{E} = \se{s}_0 \arrow{r} \arrow{ru} \arrow{rd} & \cdots &
% \se{s}_{0,0,n_{0,0}}
% \arrow{r} & \cdots \\
% & \se{s}_{0,n_0} \arrow{r} & \cdots
% \end{tikzcd}
\centerimage{symbolic_execution}{
\en{Symbolic execution process.}
\ru{Процесс символьного исполнения.}
}
Execution rule:
\begin{itemize}
\item $\{\se{f}_{0, t_1, \dots, t_n, 0}, \dots,
\se{f}_{0, t_1, \dots, t_n, m}\} =
\se{f}_{0, t_1, \dots, t_n}^F(\se{d}_{0, t_1, \dots, t_n})$,
\item $\{\se{d}_{0, t_1, \dots, t_n, 0}, \dots,
\se{d}_{0, t_1, \dots, t_n, m}\} =
\se{f}_{0, t_1, \dots, t_n}^D(\se{d}_{0, t_1, \dots, t_n})$.
\end{itemize}
Execution path $P = \{\se{f}_0, \se{f}_1, \dots, \se{f}_t\}$ is
\term{feasible in symbolic model} iff $c_{0, 1, \dots, t}$ is true.
\note{Difference between forward and backward symbolic execution?
What is exactly backward symbolic execution?}
\sublevel{Symbolic semantics}
The major part of symbolic program model is a symbolic semantics.
We should define symbolic semantics for each function element from the original
program.
\sublevel{Symbolic execution and concrete execution}
\note{Feasibility in concrete model vs feasibility in symbolic model.}
\sublevel{Static symbolic execution}
\note{Some thoughts goes here.}
\level{Taint tracking}
\altterm{Taint tracking}{taint tracking} is a technique used to track data
flow in program during its execution.
To use taint tracking we should define a set that will be used to mark data
elements, initial taint marks distribution (or taint sources),
and taint policy---rules that describes taint propagation.
\sublevel{Model}
The set of a possible taint marks will be denote as $\mathcal{T}$.
Simplest useful taint set is $B = \{\false, \true\}$,
that divides data elements into two disjoint sets:
what is tainted and what is not.
\altterm{Taint data model}{taint data model} is
$\te{D} = \{\te{d}_j\}, \te{d}_j \in \mathcal{T}$.
Taint semantics is called \term{taint policy}.
\subsublevel*{Possible data model definitions}
If original data model consists of memory and registers $D = (M, R)$,
taint data model could be defined as $\te{D} = (\te{M}, \te{R})$:
\begin{itemize}
\item $\te{M} = \{\te{m}_j\}, \te{m}_j \in \mathcal{T}$~---
\term{taint memory model},
\item $\te{R} = \{\te{r}_j\}, \te{r}_j \in \mathcal{T}$~---
\term{taint registers model}.
\end{itemize}
\note{Definitions of undertainting and overtainting.}
\level{Dynamic symbolic (concolic) execution model}
Dynamic symbolic execution or, in other words, concolic execution,
is an execution of a program along with execution of its symbolic model
(that means construction of a path condition) but for the particular
execution path that is defined by initial input data $d_0$.
\sublevel{Model}
Model for dynamic symbolic execution consists of program model
$\mathcal{P} = \langle F, F_I, F_T, D \rangle$ and its symbolic model
$\se{\mathcal{P}} = \langle \se{F}, \se{F}_I, \se{F}_T, \se{D} \rangle$.
Dynamic symbolic execution program model could we presented as
\[\dse{\mathcal{P}} = \langle F, F_I, F_T, D,
\se{F}, \se{F}_I, \se{F}_T, \se{D} \rangle.\]
\begin{itemize}