From 2eeed711beec49dfad5d3a3f16fdfca4b2f3acf0 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 7 Aug 2013 22:47:34 +0000 Subject: [PATCH] DataFlowSanitizer; Clang changes. DataFlowSanitizer is a generalised dynamic data flow analysis. Unlike other Sanitizer tools, this tool is not designed to detect a specific class of bugs on its own. Instead, it provides a generic dynamic data flow analysis framework to be used by clients to help detect application-specific issues within their own code. Differential Revision: http://llvm-reviews.chandlerc.com/D966 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@187925 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/DataFlowSanitizer.rst | 77 ++++++++++++++++ docs/DataFlowSanitizerDesign.rst | 142 +++++++++++++++++++++++++++++ docs/UsersManual.rst | 2 + include/clang/Basic/Sanitizers.def | 3 + lib/CodeGen/BackendUtil.cpp | 12 +++ lib/Driver/SanitizerArgs.h | 4 +- lib/Driver/Tools.cpp | 8 ++ lib/Lex/PPMacroExpansion.cpp | 1 + runtime/compiler-rt/Makefile | 3 +- 9 files changed, 250 insertions(+), 2 deletions(-) create mode 100644 docs/DataFlowSanitizer.rst create mode 100644 docs/DataFlowSanitizerDesign.rst diff --git a/docs/DataFlowSanitizer.rst b/docs/DataFlowSanitizer.rst new file mode 100644 index 00000000000..426099073cf --- /dev/null +++ b/docs/DataFlowSanitizer.rst @@ -0,0 +1,77 @@ +================= +DataFlowSanitizer +================= + +.. contents:: + :local: + +Introduction +============ + +DataFlowSanitizer is a generalised dynamic data flow analysis. + +Unlike other Sanitizer tools, this tool is not designed to detect a +specific class of bugs on its own. Instead, it provides a generic +dynamic data flow analysis framework to be used by clients to help +detect application-specific issues within their own code. + +Usage +===== + +With no program changes, applying DataFlowSanitizer to a program +will not alter its behavior. 
To use DataFlowSanitizer, the program +uses API functions to apply tags to data to cause it to be tracked, and to +check the tag of a specific data item. DataFlowSanitizer manages +the propagation of tags through the program according to its data flow. + +The APIs are defined in the header file ``sanitizer/dfsan_interface.h``. +For further information about each function, please refer to the header +file. + +Example +======= + +The following program demonstrates label propagation by checking that +the correct labels are propagated. + +.. code-block:: c++ + + #include <sanitizer/dfsan_interface.h> + #include <assert.h> + + int main(void) { + int i = 1; + dfsan_label i_label = dfsan_create_label("i", 0); + dfsan_set_label(i_label, &i, sizeof(i)); + + int j = 2; + dfsan_label j_label = dfsan_create_label("j", 0); + dfsan_set_label(j_label, &j, sizeof(j)); + + int k = 3; + dfsan_label k_label = dfsan_create_label("k", 0); + dfsan_set_label(k_label, &k, sizeof(k)); + + dfsan_label ij_label = dfsan_get_label(i + j); + assert(dfsan_has_label(ij_label, i_label)); + assert(dfsan_has_label(ij_label, j_label)); + assert(!dfsan_has_label(ij_label, k_label)); + + dfsan_label ijk_label = dfsan_get_label(i + j + k); + assert(dfsan_has_label(ijk_label, i_label)); + assert(dfsan_has_label(ijk_label, j_label)); + assert(dfsan_has_label(ijk_label, k_label)); + + return 0; + } + +Current status +============== + +DataFlowSanitizer is a work in progress, currently under development for +x86\_64 Linux. + +Design +====== + +Please refer to the :doc:`design document <DataFlowSanitizerDesign>`. diff --git a/docs/DataFlowSanitizerDesign.rst b/docs/DataFlowSanitizerDesign.rst new file mode 100644 index 00000000000..b704035f2fc --- /dev/null +++ b/docs/DataFlowSanitizerDesign.rst @@ -0,0 +1,142 @@ +DataFlowSanitizer Design Document +================================= + +This document sets out the design for DataFlowSanitizer, a general +dynamic data flow analysis. 
Unlike other Sanitizer tools, this tool is +not designed to detect a specific class of bugs on its own. Instead, +it provides a generic dynamic data flow analysis framework to be used +by clients to help detect application-specific issues within their +own code. + +DataFlowSanitizer is a program instrumentation which can associate +a number of taint labels with any data stored in any memory region +accessible by the program. The analysis is dynamic, which means that +it operates on a running program, and tracks how the labels propagate +through that program. The tool shall support a large (>100) number +of labels, such that programs which operate on large numbers of data +items may be analysed with each data item being tracked separately. + +Use Cases +--------- + +This instrumentation can be used as a tool to help monitor how data +flows from a program's inputs (sources) to its outputs (sinks). +This has applications from a privacy/security perspective in that +one can audit how a sensitive data item is used within a program and +ensure it isn't exiting the program anywhere it shouldn't be. + +Interface +--------- + +A number of functions are provided which will create taint labels, +attach labels to memory regions and extract the set of labels +associated with a specific memory region. These functions are declared +in the header file ``sanitizer/dfsan_interface.h``. + +.. code-block:: c + + /// Creates and returns a base label with the given description and user data. + dfsan_label dfsan_create_label(const char *desc, void *userdata); + + /// Sets the label for each address in [addr,addr+size) to \c label. + void dfsan_set_label(dfsan_label label, void *addr, size_t size); + + /// Sets the label for each address in [addr,addr+size) to the union of the + /// current label for that address and \c label. + void dfsan_add_label(dfsan_label label, void *addr, size_t size); + + /// Retrieves the label associated with the given data. 
+ /// + /// The type of 'data' is arbitrary. The function accepts a value of any type, + /// which can be truncated or extended (implicitly or explicitly) as necessary. + /// The truncation/extension operations will preserve the label of the original + /// value. + dfsan_label dfsan_get_label(long data); + + /// Retrieves a pointer to the dfsan_label_info struct for the given label. + const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label); + + /// Returns whether the given label label contains the label elem. + int dfsan_has_label(dfsan_label label, dfsan_label elem); + + /// If the given label label contains a label with the description desc, returns + /// that label, else returns 0. + dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc); + +Taint label representation +-------------------------- + +As stated above, the tool must track a large number of taint +labels. This poses an implementation challenge, as most multiple-label +tainting systems assign one label per bit to shadow storage, and +union taint labels using a bitwise or operation. This will not scale +to clients which use hundreds or thousands of taint labels, as the +label union operation becomes O(n) in the number of supported labels, +and data associated with it will quickly dominate the live variable +set, causing register spills and hampering performance. + +Instead, a low overhead approach is proposed which is best-case O(log\ +:sub:`2` n) during execution. The underlying assumption is that +the required space of label unions is sparse, which is a reasonable +assumption to make given that we are optimizing for the case where +applications mostly copy data from one place to another, without often +invoking the need for an actual union operation. The representation +of a taint label is a 16-bit integer, and new labels are allocated +sequentially from a pool. The label identifier 0 is special, and means +that the data item is unlabelled. 
+ +When a label union operation is requested at a join point (any +arithmetic or logical operation with two or more operands, such as +addition), the code checks whether a union is required, whether the +same union has been requested before, and whether one union label +subsumes the other. If so, it returns the previously allocated union +label. If not, it allocates a new union label from the same pool used +for new labels. + +Specifically, the instrumentation pass will insert code like this +to decide the union label ``lu`` for a pair of labels ``l1`` +and ``l2``: + +.. code-block:: c + + if (l1 == l2) + lu = l1; + else + lu = __dfsan_union(l1, l2); + +The equality comparison is outlined, to provide an early exit in +the common cases where the program is processing unlabelled data, or +where the two data items have the same label. ``__dfsan_union`` is +a runtime library function which performs all other union computation. + +Further optimizations are possible, for example if ``l1`` is known +at compile time to be zero (e.g. it is derived from a constant), +``l2`` can be used for ``lu``, and vice versa. 
+ +Memory layout and label management +---------------------------------- + +The following is the current memory layout for Linux/x86\_64: + ++---------------+---------------+--------------------+ +| Start | End | Use | ++===============+===============+====================+ +| 0x700000008000|0x800000000000 | application memory | ++---------------+---------------+--------------------+ +| 0x200200000000|0x700000008000 | unused | ++---------------+---------------+--------------------+ +| 0x200000000000|0x200200000000 | union table | ++---------------+---------------+--------------------+ +| 0x000000010000|0x200000000000 | shadow memory | ++---------------+---------------+--------------------+ +| 0x000000000000|0x000000010000 | reserved by kernel | ++---------------+---------------+--------------------+ + +Each byte of application memory corresponds to two bytes of shadow +memory, which are used to store its taint label. As for LLVM SSA +registers, we have not found it necessary to associate a label with +each byte or bit of data, as some other tools do. Instead, labels are +associated directly with registers. Loads will result in a union of +all shadow labels corresponding to bytes loaded (which most of the +time will be short circuited by the initial comparison) and stores will +result in a copy of the label to the shadow of all bytes stored to. diff --git a/docs/UsersManual.rst b/docs/UsersManual.rst index 80956f89212..bdcd471971f 100644 --- a/docs/UsersManual.rst +++ b/docs/UsersManual.rst @@ -895,6 +895,8 @@ are listed below. used in conjunction with the ``-fsanitize-undefined-trap-on-error`` flag. This includes all of the checks listed below other than ``unsigned-integer-overflow`` and ``vptr``. + - ``-fsanitize=dataflow``: :doc:`DataFlowSanitizer`, a general data + flow analysis. 
The following more fine-grained checks are also available: diff --git a/include/clang/Basic/Sanitizers.def b/include/clang/Basic/Sanitizers.def index 187388c6346..eb4e92d8c6e 100644 --- a/include/clang/Basic/Sanitizers.def +++ b/include/clang/Basic/Sanitizers.def @@ -77,6 +77,9 @@ SANITIZER("vptr", Vptr) // IntegerSanitizer SANITIZER("unsigned-integer-overflow", UnsignedIntegerOverflow) +// DataFlowSanitizer +SANITIZER("dataflow", DataFlow) + // -fsanitize=undefined includes all the sanitizers which have low overhead, no // ABI or address space layout implications, and only catch undefined behavior. SANITIZER_GROUP("undefined", Undefined, diff --git a/lib/CodeGen/BackendUtil.cpp b/lib/CodeGen/BackendUtil.cpp index 79cbc380a54..1edd0657e12 100644 --- a/lib/CodeGen/BackendUtil.cpp +++ b/lib/CodeGen/BackendUtil.cpp @@ -206,6 +206,11 @@ static void addThreadSanitizerPass(const PassManagerBuilder &Builder, PM.add(createThreadSanitizerPass(CGOpts.SanitizerBlacklistFile)); } +static void addDataFlowSanitizerPass(const PassManagerBuilder &Builder, + PassManagerBase &PM) { + PM.add(createDataFlowSanitizerPass()); +} + void EmitAssemblyHelper::CreatePasses(TargetMachine *TM) { unsigned OptLevel = CodeGenOpts.OptimizationLevel; CodeGenOptions::InliningMethod Inlining = CodeGenOpts.getInlining(); @@ -265,6 +270,13 @@ void EmitAssemblyHelper::CreatePasses(TargetMachine *TM) { addThreadSanitizerPass); } + if (LangOpts.Sanitize.DataFlow) { + PMBuilder.addExtension(PassManagerBuilder::EP_OptimizerLast, + addDataFlowSanitizerPass); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + addDataFlowSanitizerPass); + } + // Figure out TargetLibraryInfo. 
Triple TargetTriple(TheModule->getTargetTriple()); PMBuilder.LibraryInfo = new TargetLibraryInfo(TargetTriple); diff --git a/lib/Driver/SanitizerArgs.h b/lib/Driver/SanitizerArgs.h index da6f4f6a9fb..58dece9b56f 100644 --- a/lib/Driver/SanitizerArgs.h +++ b/lib/Driver/SanitizerArgs.h @@ -37,10 +37,11 @@ class SanitizerArgs { NeedsAsanRt = Address, NeedsTsanRt = Thread, NeedsMsanRt = Memory, + NeedsDfsanRt = DataFlow, NeedsLeakDetection = Leak, NeedsUbsanRt = Undefined | Integer, NotAllowedWithTrap = Vptr, - HasZeroBaseShadow = Thread | Memory + HasZeroBaseShadow = Thread | Memory | DataFlow }; unsigned Kind; std::string BlacklistFile; @@ -66,6 +67,7 @@ class SanitizerArgs { return false; return Kind & NeedsUbsanRt; } + bool needsDfsanRt() const { return Kind & NeedsDfsanRt; } bool sanitizesVptr() const { return Kind & Vptr; } bool notAllowedWithTrap() const { return Kind & NotAllowedWithTrap; } diff --git a/lib/Driver/Tools.cpp b/lib/Driver/Tools.cpp index 254bf8bd750..4a3ce42f0a1 100644 --- a/lib/Driver/Tools.cpp +++ b/lib/Driver/Tools.cpp @@ -1860,6 +1860,12 @@ static void addUbsanRTLinux(const ToolChain &TC, const ArgList &Args, addSanitizerRTLinkFlagsLinux(TC, Args, CmdArgs, "ubsan_cxx", false); } +static void addDfsanRTLinux(const ToolChain &TC, const ArgList &Args, + ArgStringList &CmdArgs) { + if (!Args.hasArg(options::OPT_shared)) + addSanitizerRTLinkFlagsLinux(TC, Args, CmdArgs, "dfsan", true); +} + static bool shouldUseFramePointer(const ArgList &Args, const llvm::Triple &Triple) { if (Arg *A = Args.getLastArg(options::OPT_fno_omit_frame_pointer, @@ -6275,6 +6281,8 @@ void gnutools::Link::ConstructJob(Compilation &C, const JobAction &JA, addMsanRTLinux(getToolChain(), Args, CmdArgs); if (Sanitize.needsLsanRt()) addLsanRTLinux(getToolChain(), Args, CmdArgs); + if (Sanitize.needsDfsanRt()) + addDfsanRTLinux(getToolChain(), Args, CmdArgs); // The profile runtime also needs access to system libraries. 
addProfileRTLinux(getToolChain(), Args, CmdArgs); diff --git a/lib/Lex/PPMacroExpansion.cpp b/lib/Lex/PPMacroExpansion.cpp index a9d76c357c3..0eb1169c5ac 100644 --- a/lib/Lex/PPMacroExpansion.cpp +++ b/lib/Lex/PPMacroExpansion.cpp @@ -908,6 +908,7 @@ static bool HasFeature(const Preprocessor &PP, const IdentifierInfo *II) { .Case("enumerator_attributes", true) .Case("memory_sanitizer", LangOpts.Sanitize.Memory) .Case("thread_sanitizer", LangOpts.Sanitize.Thread) + .Case("dataflow_sanitizer", LangOpts.Sanitize.DataFlow) // Objective-C features .Case("objc_arr", LangOpts.ObjCAutoRefCount) // FIXME: REMOVE? .Case("objc_arc", LangOpts.ObjCAutoRefCount) diff --git a/runtime/compiler-rt/Makefile b/runtime/compiler-rt/Makefile index 59a62e7918c..0fc93c7e68b 100644 --- a/runtime/compiler-rt/Makefile +++ b/runtime/compiler-rt/Makefile @@ -109,7 +109,8 @@ endif ifeq ($(ARCH),x86_64) RuntimeLibrary.linux.Configs += \ full-x86_64.a profile-x86_64.a san-x86_64.a asan-x86_64.a \ - tsan-x86_64.a msan-x86_64.a ubsan-x86_64.a ubsan_cxx-x86_64.a + tsan-x86_64.a msan-x86_64.a ubsan-x86_64.a ubsan_cxx-x86_64.a \ + dfsan-x86_64.a # We need to build 32-bit ASan/UBsan libraries on 64-bit platform, and add them # to the list of runtime libraries to make # "clang -fsanitize=(address|undefined) -m32" work.