Enabling stack canaries for SerenityOS

Introduction

I've been looking into beefing up static and dynamic analysis available in SerenityOS. One of the most basic things that wasn't enabled yet was GCC's -fstack-protector feature, which automatically inserts a stack canary in the epilog of all functions.

osdev-wiki: Stack Smashing Protector

Which variant to use?

There are three different variants of -fstack-protector, each with different pros and cons. Kees Cook has a nice writeup on the specifics of -fstack-protector-strong and it's design on his blog.

The ARM Clang Compiler Reference Guide has a nice overview of the different variants of -fstack-protector and their impact.

-fno-stack-protector disables stack protection.

-fstack-protector enables stack protection for vulnerable functions that contain:

* A character array larger than 8 bytes.
* An 8-bit integer array larger than 8 bytes.
* A call to alloca() with either a variable size or a constant size bigger
  than 8 bytes.

-fstack-protector-all adds stack protection to all functions regardless of
 their vulnerability

-fstack-protector-strong enables stack protection for vulnerable functions
 that contain:

* An array of any size and type.
* A call to alloca().
* A local variable that has its address taken

It appears to me that -fstack-protector-strong would be great compromise for Serenity OS.

Bootstrapping the Implementation

The actual change to the build system is (of course) very simple:

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7597e97fab..df039d3f82 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,9 +125,7 @@ if (CMAKE_SYSTEM_NAME MATCHES Darwin)
     set(CMAKE_SKIP_RPATH TRUE)
 endif()
 
-#FIXME: -fstack-protector
-
-add_compile_options(-Os -g1 -fno-exceptions -Wno-address-of-packed-member -Wundef -Wcast-qual -Wwrite-strings -Wimplicit-fallthrough -Wno-nonnull-compare -Wno-deprecated-copy -Wno-expansion-to-defined)
+add_compile_options(-Os -g1 -fno-exceptions -fstack-protector-strong -Wno-address-of-packed-member -Wundef -Wcast-qual -Wwrite-strings -Wimplicit-fallthrough -Wno-nonnull-compare -Wno-deprecated-copy -Wno-expansion-to-defined)
 add_compile_options(-ffile-prefix-map=${CMAKE_SOURCE_DIR}=.)
 
 add_compile_definitions(DEBUG SANITIZE_PTRS)

The system compiles, but fails to link after adding the new flag. The link errors look something like:

[1/948] Linking CXX shared library Libraries/LibC/libc.so
FAILED: Libraries/LibC/libc.so
../Libraries/LibC/stdlib.cpp:1062: undefined reference to `__stack_chk_fail_local'

It looks like we need to expose this symbol in the Serenity LibC implementation so that the compile generated function epilogue code can call it. It turns out that there was some previous work here, and I just needed to extended it to get it working again. This is what I came up with:

diff --git a/Libraries/LibC/cxxabi.cpp b/Libraries/LibC/cxxabi.cpp
index cc920dec89..05978bfe9b 100644
--- a/Libraries/LibC/cxxabi.cpp
+++ b/Libraries/LibC/cxxabi.cpp
@@ -24,7 +24,6 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <AK/Types.h>
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -87,11 +86,4 @@ void __cxa_finalize(void* dso_handle)
     ASSERT_NOT_REACHED();
 }
 
-extern u32 __stack_chk_guard;
-u32 __stack_chk_guard = (u32)0xc6c7c8c9;
-
-[[noreturn]] void __stack_chk_fail()
-{
-    ASSERT_NOT_REACHED();
-}
 } // extern "C"

diff --git a/Libraries/LibC/ssp.cpp b/Libraries/LibC/ssp.cpp
new file mode 100644
index 0000000000..c2715d6ba0
--- /dev/null
+++ b/Libraries/LibC/ssp.cpp
@@ -0,0 +1,55 @@
+#include <AK/Types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/internals.h>
+#include <unistd.h>
+
+extern "C" {
+
+extern u32 __stack_chk_guard;
+u32 __stack_chk_guard = (u32)0xc6c7c8c9;
+
+[[noreturn]] void __stack_chk_fail()
+{
+    dbgprintf("Error: Stack protector failure, stack smashing detected!\n");
+    if (__stdio_is_initialized)
+        fprintf(stderr, "Error: Stack protector failure, stack smashing detected!\n");
+    abort();
+}
+
+[[noreturn]] void __stack_chk_fail_local()
+{
+    __stack_chk_fail();
+}
+
+} // extern "C"

diff --git a/Libraries/LibC/sys/internals.h b/Libraries/LibC/sys/internals.h
index 70443404e1..c23785a02e 100644
--- a/Libraries/LibC/sys/internals.h
+++ b/Libraries/LibC/sys/internals.h
@@ -43,5 +43,6 @@ 
 void __cxa_finalize(void* dso_handle);
 [[noreturn]] void __cxa_pure_virtual() __attribute__((weak));
 [[noreturn]] void __stack_chk_fail();
+[[noreturn]] void __stack_chk_fail_local();
 
 __END_DECLS

Getting Sidetracked with Linking

Compiling again with the new __stack_chk_fail_local implementation available in LibC, we surprisingly failed again with the same error. On closer inspection it's actually slightly different, we successfully linked LibC but now we are failing to link a different library LibDiff that depends on LibC:

[334/1280] Linking CXX shared library Libraries/LibDiff/libdiff.so
FAILED: Libraries/LibDiff/libdiff.so
../Libraries/LibDiff/Hunks.cpp:113: undefined reference to `__stack_chk_fail_local'
../Libraries/LibC/stdlib.cpp:1062: undefined reference to `__stack_chk_fail_local'

I started debugging what was going on, so lets see what is ending up in the final library. We can see that the __stack_chk_fail_local symbol is in the library but notice the symbols flagged l instead of g like __stack_chk_fail is flagged:

$ objdump -t Libraries/LibC/libc.so | rg __stack_chk_fail
0001ea58 l     F .text  00000015 __stack_chk_fail_local
0001ea08 g     F .text  00000050 __stack_chk_fail

The symbol table flags can be decoded using the objdump(1) man page:

The flag characters are divided into 7 groups as follows:

"l"
"g"
"u"
"!" The symbol is a local (l), global (g), unique global (u),
   neither global nor local (a space) or both global and
   local (!).  A symbol can be neither local or global for a
   variety of reasons, e.g., because it is used for
   debugging, but it is probably an indication of a bug if
   it is ever both local and global.  Unique global symbols
   are a GNU extension to the standard set of ELF symbol
   bindings.  For such a symbol the dynamic linker will make
   sure that in the entire process there is just one symbol
   with this name and type in use.

So somehow the __stack_chk_fail_local symbol is getting marked local (l) instead of global (g) like most other symbols in the application. After researching around online I ran across the following thread which provides some useful insight into how the stack protector support is supposed to be linked into applications when running on Linux:

Date: Tue, 11 Sep 2018 15:07:50 +0200
From: Szabolcs Nagy <nsz@...t70.net>
To: musl@...ts.openwall.com
Subject: Re: undefined reference to __stack_chk_fail_local (x86)

* Matias Fonzo <selk@...gora.org> [2018-09-11 09:27:45 -0300]:
> Bootstrapping Dragora (distro) reflects an error trying to build the
> kernel headers using musl 1.1.20:
> 
> Running build() ...
>   UPD     include/generated/uapi/linux/version.h
>   HOSTCC  scripts/basic/fixdep
> /tmp/ccONBchp.o: In function `read_file':
> fixdep.c:(.text+0x12a): undefined reference to `__stack_chk_fail_local'
> /tmp/ccONBchp.o: In function `main':
> fixdep.c:(.text.startup+0x6e2): undefined reference to
> `__stack_chk_fail_local' /tools/lib32/gcc/i586-linux-musl/bin/ld:
> scripts/basic/fixdep: hidden symbol `__stack_chk_fail_local' isn't
> defined /tools/lib32/gcc/i586-linux-musl/bin/ld:
> final link failed: Bad value collect2: error: ld returned 1 exit status
> make[1]: *** [scripts/Makefile.host:90: scripts/basic/fixdep] Error 1
> make: *** [Makefile:464: scripts_basic] Error 2 Return status = 2
> 

this happens because on i386 and powerpc gcc emits _local_ calls
to __stack_chk_fail_local which means it has to be defined within
the same module that you are linking (not in libc.so).
(that symbol should either call the extern __stack_chk_fail in
libc.so or just crash)

this should have been done by libgcc.a having a definition for
this symbol (since the compiler is using it), but instead it got
added to glibc libc_nonshared.a which is added to the link command
by using a linker script in place of libc.so.

musl does not want to copy that hack, so the usual workaround is
to make gcc pass -lssp_nonshared to the linker when stack-protector
is in use and then have a libssp_nonshared.a with the appropriate
definition. (this is what alpine linux does)

you can also do this manually or disable ssp with -fno-stack-protector,
it may also work if a preincluded header declared it as weak
so undefined weak reference would just become 0 (and crash at runtime
which is the intended behaviour), but i havent tested that.

After reading this I modified the CMake changes I had made to include my new spp.cpp file into LibC to factor it out into it's own static library that can be automatically inserted as a dependent library whenever LibC is linked:

diff --git a/Libraries/LibC/CMakeLists.txt b/Libraries/LibC/CMakeLists.txt
index 3366957d5d..971880ca00 100644
--- a/Libraries/LibC/CMakeLists.txt
+++ b/Libraries/LibC/CMakeLists.txt
@@ -71,13 +71,21 @@ 
 )
 
+add_library(ssp STATIC ssp.cpp)
 set(SOURCES ${LIBC_SOURCES} ${AK_SOURCES} ${ELF_SOURCES})
 
 serenity_libc_static(LibCStatic c)
-target_link_libraries(LibCStatic crt0)
+target_link_libraries(LibCStatic crt0 ssp)
 add_dependencies(LibCStatic LibM)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
 serenity_libc(LibC c)
-target_link_libraries(LibC crt0)
+target_link_libraries(LibC crt0 ssp)
 add_dependencies(LibC LibM)

With these changes the system now compiles and links cleanly!

Testing

Now that we have the system compiling, lets write test program to make sure we can actually crash the program when a stack smash is detected. The program should overwrite a buffer on the stack:

#include <cstdio>

static void smasher(char* string)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
    for (int i = 0; i < 256; i++) {
        string[i] = 'A';
    }
#pragma GCC diagnostic pop
}

static void stack_to_smash()
{
    char string[8] = {};
    smasher(string);
}

int main()
{
    puts("[+] Starting the stack smash...");
    stack_to_smash();
    puts("[+] Stack smash wasn't detected!");

    return 0;
}

If we compile this, and boot into serenity and run our program what happens?

courage ~ $ ./user/Tests/LibC/stack-smash
[+] Starting the stack smash ...
[+] Stack smash wasn't detected!

Hrm... that's not good, what's going on?

After looking at the disassembly it's obvious, the stack_to_smash() call is being completely in-lined and optimized out.

$ objdump -M intel --disassemble=main Userland/Tests/LibC/stack-smash
Userland/Tests/LibC/stack-smash:     file format elf32-i386
00000510 <main>:
 510:   8d 4c 24 04             lea    ecx,[esp+0x4]
 514:   83 e4 f0                and    esp,0xfffffff0
 517:   ff 71 fc                push   DWORD PTR [ecx-0x4]
 51a:   55                      push   ebp
 51b:   89 e5                   mov    ebp,esp
 51d:   53                      push   ebx
 51e:   e8 82 00 00 00          call   5a5 <__x86.get_pc_thunk.bx>
 523:   81 c3 e5 13 00 00       add    ebx,0x13e5
 529:   51                      push   ecx
 52a:   83 ec 0c                sub    esp,0xc
 52d:   8d 83 a1 ee ff ff       lea    eax,[ebx-0x115f]
 533:   50                      push   eax
 534:   e8 97 ff ff ff          call   4d0 <puts@plt>
 539:   8d 83 c1 ee ff ff       lea    eax,[ebx-0x113f]
 53f:   89 04 24                mov    DWORD PTR [esp],eax
 542:   e8 89 ff ff ff          call   4d0 <puts@plt>
 547:   8d 65 f8                lea    esp,[ebp-0x8]
 54a:   31 c0                   xor    eax,eax
 54c:   59                      pop    ecx
 54d:   5b                      pop    ebx
 54e:   5d                      pop    ebp
 54f:   8d 61 fc                lea    esp,[ecx-0x4]
 552:   c3                      ret

We can add __attribute__((noinline)) to our function definitions to instruct the compiler to not inline calls to our test functions:

#include <cstdio>

// Note: Needs to be 'noline' so stack canary isn't optimized out.
static void __attribute__((noinline)) smasher(char* string)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
    for (int i = 0; i < 256; i++) {
        string[i] = 'A';
    }
#pragma GCC diagnostic pop
}

// Note: Needs to be 'noline' so stack canary isn't optimized out.
static void __attribute__((noinline)) stack_to_smash()
{
    char string[8] = {};
    smasher(string);
}

int main()
{
    puts("[+] Starting the stack smash...");
    stack_to_smash();
    puts("[+] Stack smash wasn't detected!");

    return 0;
}

After re-compiling we can see the compiler instrumentation in the binary now, note the injected calls to __stack_chk_fail_local in the disassembly of _ZL14stack_to_smashv:

$ objdump -M intel --disassemble=main Userland/Tests/LibC/stack-smash
Userland/Tests/LibC/stack-smash:     file format elf32-i386
00000690 <main>:
 690:   8d 4c 24 04             lea    ecx,[esp+0x4]
 694:   83 e4 f0                and    esp,0xfffffff0
 697:   ff 71 fc                push   DWORD PTR [ecx-0x4]
 69a:   55                      push   ebp
 69b:   89 e5                   mov    ebp,esp
 69d:   53                      push   ebx
 69e:   e8 87 00 00 00          call   72a <__x86.get_pc_thunk.bx>
 6a3:   81 c3 29 15 00 00       add    ebx,0x1529
 6a9:   51                      push   ecx
 6aa:   83 ec 0c                sub    esp,0xc
 6ad:   8d 83 1d ee ff ff       lea    eax,[ebx-0x11e3]
 6b3:   50                      push   eax
 6b4:   e8 67 ff ff ff          call   620 <puts@plt>
 6b9:   e8 1c 02 00 00          call   8da <_ZL14stack_to_smashv>
 6be:   8d 83 3d ee ff ff       lea    eax,[ebx-0x11c3]
 6c4:   89 04 24                mov    DWORD PTR [esp],eax
 6c7:   e8 54 ff ff ff          call   620 <puts@plt>
 6cc:   8d 65 f8                lea    esp,[ebp-0x8]
 6cf:   31 c0                   xor    eax,eax
 6d1:   59                      pop    ecx
 6d2:   5b                      pop    ebx
 6d3:   5d                      pop    ebp
 6d4:   8d 61 fc                lea    esp,[ecx-0x4]
 6d7:   c3                      ret

000008c6 <_ZL7smasherPc>:
 8c6:   55                      push   ebp
 8c7:   89 c2                   mov    edx,eax
 8c9:   b9 00 01 00 00          mov    ecx,0x100
 8ce:   b0 41                   mov    al,0x41
 8d0:   89 e5                   mov    ebp,esp
 8d2:   57                      push   edi
 8d3:   89 d7                   mov    edi,edx
 8d5:   f3 aa                   rep stos BYTE PTR es:[edi],al
 8d7:   5f                      pop    edi
 8d8:   5d                      pop    ebp
 8d9:   c3                      ret

000008da <_ZL14stack_to_smashv>:
 8da:   e8 41 00 00 00          call   920 <__x86.get_pc_thunk.ax>
 8df:   05 ed 12 00 00          add    eax,0x12ed
 8e4:   55                      push   ebp
 8e5:   89 e5                   mov    ebp,esp
 8e7:   53                      push   ebx
 8e8:   83 ec 14                sub    esp,0x14
 8eb:   8d 98 28 00 00 00       lea    ebx,[eax+0x28]
 8f1:   c7 45 ec 00 00 00 00    mov    DWORD PTR [ebp-0x14],0x0
 8f8:   c7 45 f0 00 00 00 00    mov    DWORD PTR [ebp-0x10],0x0
 8ff:   8b 03                   mov    eax,DWORD PTR [ebx]
 901:   89 45 f4                mov    DWORD PTR [ebp-0xc],eax
 904:   31 c0                   xor    eax,eax
 906:   8d 45 ec                lea    eax,[ebp-0x14]
 909:   e8 b8 ff ff ff          call   8c6 <_ZL7smasherPc>
 90e:   8b 45 f4                mov    eax,DWORD PTR [ebp-0xc]
 911:   2b 03                   sub    eax,DWORD PTR [ebx]
 913:   74 05                   je     91a <_ZL14stack_to_smashv+0x40>
 915:   e8 5a 00 00 00          call   974 <__stack_chk_fail_local>
 91a:   83 c4 14                add    esp,0x14
 91d:   5b                      pop    ebx
 91e:   5d                      pop    ebp
 91f:   c3                      ret

With these changes we can see the system catches the corruption as expected:

courage ~ $ ./user/Tests/LibC/stack-smash
[+] Starting the stack smash ...
Error: Stack protector failure, stack smashing detected!
Shell: Job 1 (/usr/Tests/LibC/stack-smash) Aborted

Final PR

The resulting pull request is here: Build + LibC: Enable -fstack-protector-strong in user space