diff --git a/src/binfmt_rto/Makefile b/src/binfmt_rto/Makefile index 92b440bc49a2ae09a1ee23aeff2c58993545d8f7..2e61de5bd901d67a2299a6c0d0c1a305326d58dc 100644 --- a/src/binfmt_rto/Makefile +++ b/src/binfmt_rto/Makefile @@ -1,7 +1,7 @@ KDIR ?= /lib/modules/$(shell uname -r)/build obj-m := sysboost_loader.o -sysboost_loader-objs := main.o binfmt_rto.o loader_device.o +sysboost_loader-objs := main.o binfmt_rto.o loader_device.o rto_populate.o PWD := $(shell pwd) modules: diff --git a/src/binfmt_rto/binfmt_rto.c b/src/binfmt_rto/binfmt_rto.c index 1a1900d6998fc4128e60f6776d63f2e29117aad1..e91c0c5ef5b9b51ae09369ac6d50a531195fa8f2 100644 --- a/src/binfmt_rto/binfmt_rto.c +++ b/src/binfmt_rto/binfmt_rto.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -50,14 +49,18 @@ #include #include +#ifdef CONFIG_X86 +#include +#endif #include "main.h" +#include "loader_device.h" +#include "binfmt_rto.h" #ifndef CONFIG_ELF_SYSBOOST #define CONFIG_ELF_SYSBOOST 1 #endif #ifdef CONFIG_ELF_SYSBOOST -#include #include "../elf_ext.h" /* compat 22.03 LTS, 22.03 LTS SP2 */ @@ -65,8 +68,6 @@ #define MM_SAVED_AUXV(mm) mm->saved_auxv #endif -extern int map_vdso(const struct vdso_image *image, unsigned long addr); - #define proc_symbol(SYM) typeof(SYM) *(SYM) static struct global_symbols { #ifdef CONFIG_ARM64 @@ -95,7 +96,7 @@ static struct global_symbols { proc_symbol(arch_align_stack); proc_symbol(task_cputime); proc_symbol(thread_group_cputime); -} g_sym; +} rto_sym; #define proc_symbol_char(x) #x static char *global_symbol_names[] = { @@ -126,57 +127,10 @@ static char *global_symbol_names[] = { proc_symbol_char(thread_group_cputime) }; -typedef unsigned long (*kallsyms_lookup_name_kprobe_t)(const char *name); -kallsyms_lookup_name_kprobe_t klookupf; - -static int init_kallsyms_lookup_name(void) -{ - int ret; - struct kprobe kallsyms_kprobe_var = { - .symbol_name = "kallsyms_lookup_name", - }; - - ret = register_kprobe(&kallsyms_kprobe_var); - if (ret) { - pr_err("register_kprobes returned %d\n", ret); - return ret; - } - - klookupf = (kallsyms_lookup_name_kprobe_t)kallsyms_kprobe_var.addr; - unregister_kprobe(&kallsyms_kprobe_var); - if (!klookupf) { - pr_err("no kallsyms_lookup_name in kernel!\n"); - return -EFAULT; - } - - return 0; -} - -int do_init_symbols(unsigned long *func_base, char *func[], unsigned int num) -{ - unsigned int i; - unsigned long *input_func_base = func_base; - - for (i = 0; i < num; i++) { - *input_func_base = klookupf(func[i]); - if (!*input_func_base) { - pr_warn("get %s failed\n", func[i]); - return -EEXIST; - } - input_func_base++; - } - - return 0; -} - -int init_symbols(void) +static int init_symbols(void) { int ret; - unsigned long *func_base = (unsigned long *)&g_sym; - - ret = init_kallsyms_lookup_name(); - if (ret) - return ret; + unsigned long *func_base = (unsigned long *)&rto_sym; ret = do_init_symbols(func_base, global_symbol_names, ARRAY_SIZE(global_symbol_names)); if (ret < 0) @@ -191,7 +145,7 @@ int init_symbols(void) static inline unsigned long __cpu_get_elf_hwcap(void) { #ifdef CONFIG_ARM64 - return g_sym.cpu_get_elf_hwcap(); + return rto_sym.cpu_get_elf_hwcap(); #else // x86 boot_cpu_data is export return (boot_cpu_data.x86_capability[CPUID_1_EDX]); @@ -205,10 +159,10 @@ static inline unsigned long __cpu_get_elf_hwcap(void) static inline unsigned long __cpu_get_elf_hwcap2(void) { #ifdef CONFIG_ARM64 - return g_sym.cpu_get_elf_hwcap2(); + return rto_sym.cpu_get_elf_hwcap2(); #else // x86 is global val elf_hwcap2, not export - return 
*(u32 *)g_sym.elf_hwcap2; + return *(u32 *)rto_sym.elf_hwcap2; #endif } #endif @@ -228,35 +182,35 @@ do { \ * If we haven't determined a sensible value to give to \ * userspace, omit the entry: \ */ \ - if (likely(*g_sym.signal_minsigstksz)) \ - NEW_AUX_ENT(AT_MINSIGSTKSZ, *g_sym.signal_minsigstksz); \ + if (likely(*rto_sym.signal_minsigstksz)) \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, *rto_sym.signal_minsigstksz); \ else \ NEW_AUX_ENT(AT_IGNORE, 0); \ } while (0) // TODO: vdso layout for ARM64 -#define __arch_setup_additional_pages(bprm, uses_interp, load_bias, is_rto_format) (g_sym.arch_setup_additional_pages(bprm, uses_interp)) +#define __arch_setup_additional_pages(bprm, uses_interp, load_bias, is_rto_format) (rto_sym.arch_setup_additional_pages(bprm, uses_interp)) #ifdef arch_elf_adjust_prot #undef arch_elf_adjust_prot #endif -#define arch_elf_adjust_prot g_sym.arch_elf_adjust_prot +#define arch_elf_adjust_prot rto_sym.arch_elf_adjust_prot #else // x86 #ifdef get_sigframe_size #define ARCH_DLINFO \ do { \ - if (*(unsigned int *)g_sym.vdso64_enabled) \ + if (*(unsigned int *)rto_sym.vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ - NEW_AUX_ENT(AT_MINSIGSTKSZ, g_sym.get_sigframe_size()); \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, rto_sym.get_sigframe_size()); \ } while (0) #else #define ARCH_DLINFO \ do { \ - if (*(unsigned int *)g_sym.vdso64_enabled) \ + if (*(unsigned int *)rto_sym.vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ } while (0) @@ -265,10 +219,10 @@ do { \ int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, unsigned long load_bias, bool is_rto_format) { if (!is_rto_format) { - return g_sym.arch_setup_additional_pages(bprm, uses_interp); + return rto_sym.arch_setup_additional_pages(bprm, uses_interp); } - if (!*(unsigned int *)g_sym.vdso64_enabled) + if (!*(unsigned int *)rto_sym.vdso64_enabled) return 0; // layout for vdso and app and ld.so @@ -277,13 +231,13 @@ int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, un // vvar | vdso | app if (debug) printk("binfmt_rto: base 0x%lx vvar 0x%lx\n", load_bias, load_bias - ELF_VVAR_AND_VDSO_LEN); - return g_sym.map_vdso(g_sym.vdso_image_64, load_bias - ELF_VVAR_AND_VDSO_LEN); + return rto_sym.map_vdso(rto_sym.vdso_image_64, load_bias - ELF_VVAR_AND_VDSO_LEN); } #ifdef SET_PERSONALITY2 #undef SET_PERSONALITY2 #endif -#define SET_PERSONALITY2(ex, state) (g_sym.set_personality_64bit()) +#define SET_PERSONALITY2(ex, state) (rto_sym.set_personality_64bit()) #endif @@ -323,20 +277,10 @@ static int elf_core_dump(struct coredump_params *cprm); #define elf_core_dump NULL #endif -#if ELF_EXEC_PAGESIZE > PAGE_SIZE -#define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE -#else -#define ELF_MIN_ALIGN PAGE_SIZE -#endif - #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif -#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) -#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) -#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) - static struct linux_binfmt elf_format = { .module = THIS_MODULE, .load_binary = load_elf_binary, @@ -436,7 +380,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, * thing we can do is to shuffle the initial stack for them. 
*/ - p = g_sym.arch_align_stack(p); + p = rto_sym.arch_align_stack(p); /* * If this architecture has a platform capability string, copy it @@ -622,11 +566,16 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, */ if (total_size) { total_size = ELF_PAGEALIGN(total_size); + pr_info("vm_mmap, addr: %lx, total_size: %lx, off: %lx", + addr, total_size, off); map_addr = vm_mmap(filep, addr, total_size, prot, type, off); if (!BAD_ADDR(map_addr)) vm_munmap(map_addr+size, total_size-size); - } else + } else { map_addr = vm_mmap(filep, addr, size, prot, type, off); + pr_info("vm_mmap, addr: %lx, size: %lx, off: %lx", + addr, size, off); + } if ((type & MAP_FIXED_NOREPLACE) && PTR_ERR((void *)map_addr) == -EEXIST) @@ -694,8 +643,8 @@ static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) * header pointed to by elf_ex, into a newly allocated array. The caller is * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. */ -static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, - struct file *elf_file) +struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, + struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; int retval, err = -1; @@ -1085,29 +1034,38 @@ static struct file * try_get_rto_file(struct file *file) return rto_file; } -static int prepare_rto(struct linux_binprm *bprm) +void *load_bprm_buf(struct file *file) { + ssize_t ret; + char *buffer; loff_t pos = 0; - void *buffer; - long ret; buffer = kmalloc(BINPRM_BUF_SIZE, GFP_KERNEL); if (!buffer) - return -ENOMEM; - memcpy(buffer, bprm->buf, BINPRM_BUF_SIZE); + return ERR_PTR(-ENOMEM); - memset(bprm->buf, 0, BINPRM_BUF_SIZE); - ret = kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); + ret = kernel_read(file, buffer, BINPRM_BUF_SIZE, &pos); if (ret != BINPRM_BUF_SIZE) { - memcpy(bprm->buf, buffer, BINPRM_BUF_SIZE); - if (ret >= 0) - ret = -ENOENT; - } else { - ret = 0; + kfree(buffer); + if (ret < 0) + return ERR_PTR(ret); + return ERR_PTR(-EIO); } + return buffer; +} + +static int prepare_rto(struct linux_binprm *bprm) +{ + void *buffer; + + buffer = load_bprm_buf(bprm->file); + if (IS_ERR(buffer)) + return PTR_ERR(buffer); + + memcpy(bprm->buf, buffer, BINPRM_BUF_SIZE); kfree(buffer); - return ret; + return 0; } static inline int try_replace_file(struct linux_binprm *bprm) @@ -1144,41 +1102,41 @@ static inline void ___start_thread(struct pt_regs *regs, unsigned long pc, { start_thread_common(regs, pc); regs->pstate = PSR_MODE_EL0t; - g_sym.spectre_v4_enable_task_mitigation(current); + rto_sym.spectre_v4_enable_task_mitigation(current); regs->sp = sp; } #endif /* CONFIG_ARM64 */ -static bool check_elf_xattr(struct linux_binprm *bprm) -{ - char *xattr = NULL; - int xattr_size = 0; - - return false; - - // try to get attr from bprm - xattr_size = vfs_getxattr(bprm->file->f_path.dentry, - "trusted.flags", NULL, 0); - if (xattr_size < 0) { - return false; - } - - xattr = kvmalloc(xattr_size, GFP_KERNEL); - if (xattr == NULL) { - return false; - } - xattr_size = vfs_getxattr(bprm->file->f_path.dentry, - "trusted.flags", xattr, xattr_size); - if (xattr_size <= 0) { - kvfree(xattr); - return false; - } - - if (memcmp(xattr, "true", xattr_size)) { - return false; - } - return true; -} +// static bool check_elf_xattr(struct linux_binprm *bprm) +// { +// char *xattr = NULL; +// int xattr_size = 0; + +// return false; + +// // try to get attr from bprm +// xattr_size = vfs_getxattr(bprm->file->f_path.dentry, +// "trusted.flags", NULL, 0); +// if (xattr_size < 0) { 
+// return false; +// } + +// xattr = kvmalloc(xattr_size, GFP_KERNEL); +// if (xattr == NULL) { +// return false; +// } +// xattr_size = vfs_getxattr(bprm->file->f_path.dentry, +// "trusted.flags", xattr, xattr_size); +// if (xattr_size <= 0) { +// kvfree(xattr); +// return false; +// } + +// if (memcmp(xattr, "true", xattr_size)) { +// return false; +// } +// return true; +// } #endif /* CONFIG_ELF_SYSBOOST */ @@ -1208,14 +1166,15 @@ static int load_elf_binary(struct linux_binprm *bprm) #ifdef CONFIG_ELF_SYSBOOST bool is_rto_format = false; -load_rto: +// load_rto: is_rto_format = elf_ex->e_flags & OS_SPECIFIC_FLAG_RTO; retval = -ENOEXEC; /* close feature to rmmod this ko */ - if (!use_rto) { + if (!use_rto || !IS_SYSBOOST_RTO(bprm->file->f_inode)) { goto out; } + pr_info("lyt enter rto\n"); #endif /* First of all, some simple consistency checks */ @@ -1235,28 +1194,28 @@ load_rto: if (!elf_phdata) goto out; -#ifdef CONFIG_ELF_SYSBOOST - /* replace app.rto file, then use binfmt */ - if (check_elf_xattr(bprm)) { - int ret = try_replace_file(bprm); - if (!ret) { - if (elf_ex->e_flags & OS_SPECIFIC_FLAG_RTO) { - goto load_rto; - } else { - goto out; - } - } else { - /* limit print */ - printk("replace rto file fail, %d\n", ret); - goto out; - } - } - if (!is_rto_format && !(elf_ex->e_flags & OS_SPECIFIC_FLAG_HUGEPAGE)) - goto out; - if (debug) { - printk("exec in rto mode, is_rto_format %d\n", is_rto_format); - } -#endif +// #ifdef CONFIG_ELF_SYSBOOST +// /* replace app.rto file, then use binfmt */ +// if (check_elf_xattr(bprm)) { +// int ret = try_replace_file(bprm); +// if (!ret) { +// if (elf_ex->e_flags & OS_SPECIFIC_FLAG_RTO) { +// goto load_rto; +// } else { +// goto out; +// } +// } else { +// /* limit print */ +// printk("replace rto file fail, %d\n", ret); +// goto out; +// } +// } +// if (!is_rto_format && !(elf_ex->e_flags & OS_SPECIFIC_FLAG_HUGEPAGE)) +// goto out; +// if (debug) { +// printk("exec in rto mode, is_rto_format %d\n", is_rto_format); +// } +// #endif elf_ppnt = elf_phdata; for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { @@ -1405,14 +1364,14 @@ out_free_interp: if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; - if (!(current->personality & ADDR_NO_RANDOMIZE) && *g_sym.randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && *rto_sym.randomize_va_space) current->flags |= PF_RANDOMIZE; setup_new_exec(bprm); /* Do this so that we can load the interpreter, if need be. 
We will change some of these later */ - retval = setup_arg_pages(bprm, g_sym.randomize_stack_top(STACK_TOP), + retval = setup_arg_pages(bprm, rto_sym.randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) goto out_free_dentry; @@ -1430,7 +1389,7 @@ out_free_interp: for(i = 0, elf_ppnt = elf_phdata; i < elf_ex->e_phnum; i++, elf_ppnt++) { int elf_prot, elf_flags; - unsigned long k, vaddr; + unsigned long k, vaddr, size, off; unsigned long total_size = 0; unsigned long alignment; @@ -1510,7 +1469,7 @@ out_free_interp: if (interpreter) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) - load_bias += g_sym.arch_mmap_rnd(); + load_bias += rto_sym.arch_mmap_rnd(); alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); if (alignment) load_bias &= ~(alignment - 1); @@ -1542,6 +1501,11 @@ out_free_interp: PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; } + // size = elf_ppnt->p_filesz + ELF_PAGEOFFSET(elf_ppnt->p_vaddr); + // off = elf_ppnt->p_offset - ELF_PAGEOFFSET(elf_ppnt->p_vaddr); + // pr_info("lyt addr: 0x%lx, off: 0x%lx, size: 0x%lx, \n", + // load_bias + vaddr, off, size); + // rto_populate(bprm->file, error, off, size); if (!load_addr_set) { load_addr_set = 1; @@ -1675,7 +1639,7 @@ out_free_interp: mm->end_data = end_data; mm->start_stack = bprm->p; - if ((current->flags & PF_RANDOMIZE) && (*g_sym.randomize_va_space > 1)) { + if ((current->flags & PF_RANDOMIZE) && (*rto_sym.randomize_va_space > 1)) { /* * For architectures with ELF randomization, when executing * a loader directly (i.e. no interpreter listed in ELF @@ -1688,7 +1652,7 @@ out_free_interp: mm->brk = mm->start_brk = ELF_ET_DYN_BASE; } - mm->brk = mm->start_brk = g_sym.arch_randomize_brk(mm); + mm->brk = mm->start_brk = rto_sym.arch_randomize_brk(mm); #ifdef compat_brk_randomized current->brk_randomized = 1; #endif @@ -1926,13 +1890,13 @@ static void fill_prstatus(struct elf_prstatus *prstatus, * This is the record for the group leader. It shows the * group-wide total, not its individual thread total. */ - g_sym.thread_group_cputime(p, &cputime); + rto_sym.thread_group_cputime(p, &cputime); prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime); prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime); } else { u64 utime, stime; - g_sym.task_cputime(p, &utime, &stime); + rto_sym.task_cputime(p, &utime, &stime); prstatus->pr_utime = ns_to_kernel_old_timeval(utime); prstatus->pr_stime = ns_to_kernel_old_timeval(stime); } @@ -2678,7 +2642,7 @@ static int elf_core_dump(struct coredump_params *cprm) for (i = 0; i < cprm->vma_count; i++) { struct core_vma_metadata *meta = cprm->vma_meta + i; - if (!g_sym.dump_user_range(cprm, meta->start, meta->dump_size)) + if (!rto_sym.dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; } dump_truncate(cprm); diff --git a/src/binfmt_rto/binfmt_rto.h b/src/binfmt_rto/binfmt_rto.h new file mode 100644 index 0000000000000000000000000000000000000000..52456840690d573eb26bdd24181f1432cdd78c1b --- /dev/null +++ b/src/binfmt_rto/binfmt_rto.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. All rights reserved. 
+ */ + +#include + +int rto_populate(struct file *file, unsigned long vaddr, + unsigned long offset, unsigned long size); + +int init_rto_binfmt(void); +void exit_rto_binfmt(void); +void *load_bprm_buf(struct file *file); +struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, + struct file *elf_file); + +int rto_populate_init(void); + +#if ELF_EXEC_PAGESIZE > PAGE_SIZE +#define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE +#else +#define ELF_MIN_ALIGN PAGE_SIZE +#endif + +#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) +#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) diff --git a/src/binfmt_rto/loader_device.c b/src/binfmt_rto/loader_device.c index 3b6918233e4ee0788d9026789afad288d3a7b8f9..3d39fb26fd879d49b947505d9dfdac8627147f3f 100644 --- a/src/binfmt_rto/loader_device.c +++ b/src/binfmt_rto/loader_device.c @@ -10,18 +10,7 @@ #include #include "main.h" #include "loader_device.h" - -struct loaded_seg { - struct list_head list; - struct list_head hpages; -}; - -struct loaded_rto { - struct list_head list; - struct inode *inode; - struct list_head segs; - atomic_t use_count; -}; +#include "binfmt_rto.h" static LIST_HEAD(loaded_rtos); static DEFINE_RWLOCK(rtos_rwlock); @@ -89,7 +78,6 @@ static int load_seg(struct file *file, struct loaded_rto *loaded_rto, list_add(&loaded_seg->list, &loaded_rto->segs); return 0; error: -pr_info("load_seg error: %d\n", ret); loaded_seg_free(loaded_seg); return ret; } @@ -125,35 +113,52 @@ static void loaded_rto_put(struct loaded_rto *loaded_rto) loaded_rto_free(loaded_rto); } -static int do_load_rto(struct file *file) +static int load_rto(struct file *file) { - int ret; + int ret, i; struct loaded_rto *loaded_rto; struct inode *inode = file->f_inode; + unsigned long size, offset; + struct elfhdr *elf_ex; + struct elf_phdr *elf_ppnt, *elf_phdata; loaded_rto = loaded_rto_alloc(inode); if (!loaded_rto) return -ENOMEM; - ret = load_seg(file, loaded_rto, 0, 2*HPAGE_SIZE); - if (ret) - goto error; - - return 0; -error: - loaded_rto_free(loaded_rto); - return ret; -} - -static int load_rto(struct file *file) -{ - struct inode *inode; + elf_ex = load_bprm_buf(file); + if (IS_ERR(elf_ex)) { + ret = PTR_ERR(elf_ex); + goto error_bprm_buf; + } + elf_phdata = load_elf_phdrs(elf_ex, file); + if (!elf_phdata) { + ret = -EIO; + goto error_phdrs; + } - inode = file->f_inode; + for(i = 0, elf_ppnt = elf_phdata; i < elf_ex->e_phnum; i++, elf_ppnt++) { + if (elf_ppnt->p_type != PT_LOAD) + continue; - do_load_rto(file); + size = elf_ppnt->p_filesz + ELF_PAGEOFFSET(elf_ppnt->p_vaddr); + offset = elf_ppnt->p_offset - ELF_PAGEOFFSET(elf_ppnt->p_vaddr); + ret = load_seg(file, loaded_rto, offset, size); + if (ret) + goto error_seg; + } + kfree(elf_phdata); + kfree(elf_ex); return 0; + +error_seg: + kfree(elf_phdata); +error_phdrs: + kfree(elf_ex); +error_bprm_buf: + loaded_rto_free(loaded_rto); + return ret; } struct loaded_rto *find_loaded_rto(struct inode *inode) diff --git a/src/binfmt_rto/loader_device.h b/src/binfmt_rto/loader_device.h index 4872753f559d95e3001352903b827a7ad77d70dc..6adc92d621e7409755fc0e69e810ecce13f3b40f 100644 --- a/src/binfmt_rto/loader_device.h +++ b/src/binfmt_rto/loader_device.h @@ -3,6 +3,19 @@ * Copyright (c) 2023 Huawei Technologies Co.,Ltd. All rights reserved. 
*/ +struct loaded_seg { + struct list_head list; + struct list_head hpages; +}; + +struct loaded_rto { + struct list_head list; + struct inode *inode; + struct list_head segs; + atomic_t use_count; +}; + +struct loaded_rto *find_loaded_rto(struct inode *inode); int __init loader_device_init(void); void __exit loader_device_exit(void); diff --git a/src/binfmt_rto/main.c b/src/binfmt_rto/main.c index 749294e01e997a2da8c5a527cee152f8ef0b9dd4..8ae3ca2273f6afafd8165db24a48369db9ad9cc3 100644 --- a/src/binfmt_rto/main.c +++ b/src/binfmt_rto/main.c @@ -5,6 +5,7 @@ #include "main.h" #include "loader_device.h" +#include "binfmt_rto.h" bool use_rto = false; module_param(use_rto, bool, 0600); @@ -15,17 +16,47 @@ int debug = 0; module_param(debug, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(debug, "debug mode"); -int init_rto_binfmt(void); -void exit_rto_binfmt(void); +kallsyms_lookup_name_kprobe_t klookupf; + +static int init_kallsyms_lookup_name(void) +{ + int ret; + struct kprobe kallsyms_kprobe_var = { + .symbol_name = "kallsyms_lookup_name", + }; + + ret = register_kprobe(&kallsyms_kprobe_var); + if (ret) { + pr_err("register_kprobes returned %d\n", ret); + return ret; + } + + klookupf = (kallsyms_lookup_name_kprobe_t)kallsyms_kprobe_var.addr; + unregister_kprobe(&kallsyms_kprobe_var); + if (!klookupf) { + pr_err("no kallsyms_lookup_name in kernel!\n"); + return -EFAULT; + } + + return 0; +} static int __init sysboost_loader_init(void) { int ret = 0; + ret = init_kallsyms_lookup_name(); + if (ret) + goto error_rto; + + ret = rto_populate_init(); + if (ret) + goto error_rto; + ret = init_rto_binfmt(); if (ret) goto error_rto; - + ret = loader_device_init(); if (ret) goto error_device; diff --git a/src/binfmt_rto/main.h b/src/binfmt_rto/main.h index 80f370bf94bb183b4aa7ea80eacbf9a5b1c702ad..04dc032af6f0d7f56b46aae0504d75d88226799b 100644 --- a/src/binfmt_rto/main.h +++ b/src/binfmt_rto/main.h @@ -4,7 +4,26 @@ */ #include +#include extern bool use_rto; extern int debug; +typedef unsigned long (*kallsyms_lookup_name_kprobe_t)(const char *name); +extern kallsyms_lookup_name_kprobe_t klookupf; +static inline int do_init_symbols(unsigned long *func_base, char *func[], unsigned int num) +{ + unsigned int i; + unsigned long *input_func_base = func_base; + + for (i = 0; i < num; i++) { + *input_func_base = klookupf(func[i]); + if (!*input_func_base) { + pr_warn("get %s failed\n", func[i]); + return -EEXIST; + } + input_func_base++; + } + + return 0; +} diff --git a/src/binfmt_rto/rto_populate.c b/src/binfmt_rto/rto_populate.c new file mode 100644 index 0000000000000000000000000000000000000000..19f45b4aa208e092afd38961130bb5b284d51032 --- /dev/null +++ b/src/binfmt_rto/rto_populate.c @@ -0,0 +1,727 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "main.h" +#include "loader_device.h" + +struct follow_page_context { + struct dev_pagemap *pgmap; + unsigned int page_mask; +}; + +int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags); +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + struct follow_page_context *ctx); + +#define proc_symbol(SYM) typeof(SYM) *(SYM) +static struct global_symbols { + proc_symbol(check_vma_flags); + proc_symbol(follow_page_mask); + proc_symbol(__pud_alloc); + proc_symbol(__anon_vma_prepare); + proc_symbol(__pmd_alloc); + +} ppl_sym; + +#define proc_symbol_char(x) #x +static char *global_symbol_names[] = { + proc_symbol_char(check_vma_flags), + proc_symbol_char(follow_page_mask), + proc_symbol_char(__pud_alloc), + proc_symbol_char(__anon_vma_prepare), + proc_symbol_char(__pmd_alloc), +}; + +static int init_symbols(void) +{ + int ret; + unsigned long *func_base = (unsigned long *)&ppl_sym; + + ret = do_init_symbols(func_base, global_symbol_names, ARRAY_SIZE(global_symbol_names)); + if (ret < 0) + return ret; + + return 0; +} + +static vm_fault_t __rto_do_huge_pmd_anonymous_page(struct vm_fault *vmf, + struct page *page, gfp_t gfp) +{ + struct vm_area_struct *vma = vmf->vma; + // pgtable_t pgtable; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret = 0; + + VM_BUG_ON_PAGE(!PageCompound(page), page); + + // if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { + // put_page(page); + // count_vm_event(THP_FAULT_FALLBACK); + // count_vm_event(THP_FAULT_FALLBACK_CHARGE); + // return VM_FAULT_FALLBACK; + // } + // cgroup_throttle_swaprate(page, gfp); + + // pgtable = pte_alloc_one(vma->vm_mm); + // if (unlikely(!pgtable)) { + // ret = VM_FAULT_OOM; + // goto release; + // } + + // clear_huge_page(page, vmf->address, HPAGE_PMD_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. + */ + // __SetPageUptodate(page); + + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + goto unlock_release; + } else { + pmd_t entry; + + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + + /* Deliver the page fault to userland */ + // if (userfaultfd_missing(vma)) { + // vm_fault_t ret2; + + // spin_unlock(vmf->ptl); + // put_page(page); + // pte_free(vma->vm_mm, pgtable); + // ret2 = handle_userfault(vmf, VM_UFFD_MISSING); + // VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); + // return ret2; + // } + + entry = mk_huge_pmd(page, vma->vm_page_prot); + // we don't need write access for text segment. + // entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + + // we don't need LRU. 
+ // page_add_new_anon_rmap(page, vma, haddr, true); + // lru_cache_add_inactive_or_unevictable(page, vma); + + // we won't split thp, no need to deposit + // pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + // add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + // reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); + mm_inc_nr_ptes(vma->vm_mm); + spin_unlock(vmf->ptl); + + // count_vm_event(THP_FAULT_ALLOC); + // count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + } + + return 0; +unlock_release: + spin_unlock(vmf->ptl); +// release: + // if (pgtable) + // pte_free(vma->vm_mm, pgtable); + // put_page(page); + return ret; + +} + +static inline int rto_anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return ppl_sym.__anon_vma_prepare(vma); +} + +vm_fault_t rto_do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *hpage) +{ + struct vm_area_struct *vma = vmf->vma; + gfp_t gfp; + // struct page *page; + // unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + + // we have checked boader outside, no need to double check + // if (!transhuge_vma_suitable(vma, haddr)) + // return VM_FAULT_FALLBACK; + if (unlikely(rto_anon_vma_prepare(vma))) + return VM_FAULT_OOM; + // if (unlikely(khugepaged_enter(vma, vma->vm_flags))) + // return VM_FAULT_OOM; + // if (!(vmf->flags & FAULT_FLAG_WRITE) && + // !mm_forbids_zeropage(vma->vm_mm) && + // transparent_hugepage_use_zero_page()) { + // pgtable_t pgtable; + // struct page *zero_page; + // vm_fault_t ret; + // pgtable = pte_alloc_one(vma->vm_mm); + // if (unlikely(!pgtable)) + // return VM_FAULT_OOM; + // zero_page = mm_get_huge_zero_page(vma->vm_mm); + // if (unlikely(!zero_page)) { + // pte_free(vma->vm_mm, pgtable); + // count_vm_event(THP_FAULT_FALLBACK); + // return VM_FAULT_FALLBACK; + // } + // vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + // ret = 0; + // if (pmd_none(*vmf->pmd)) { + // ret = check_stable_address_space(vma->vm_mm); + // if (ret) { + // spin_unlock(vmf->ptl); + // pte_free(vma->vm_mm, pgtable); + // // } else if (userfaultfd_missing(vma)) { + // // spin_unlock(vmf->ptl); + // // pte_free(vma->vm_mm, pgtable); + // // ret = handle_userfault(vmf, VM_UFFD_MISSING); + // // VM_BUG_ON(ret & VM_FAULT_FALLBACK); + // } else { + // // set_huge_zero_page(pgtable, vma->vm_mm, vma, + // // haddr, vmf->pmd, zero_page); + // spin_unlock(vmf->ptl); + // } + // } else { + // spin_unlock(vmf->ptl); + // pte_free(vma->vm_mm, pgtable); + // } + // return ret; + // } + // gfp = alloc_hugepage_direct_gfpmask(vma); + + // TODO + // page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + // if (unlikely(!page)) { + // count_vm_event(THP_FAULT_FALLBACK); + // return VM_FAULT_FALLBACK; + // } + // prep_transhuge_page(page); + return __rto_do_huge_pmd_anonymous_page(vmf, hpage, gfp); +} + +static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf, struct page *hpage) +{ + // if (vma_is_anonymous(vmf->vma)) + return rto_do_huge_pmd_anonymous_page(vmf, hpage); + // if (vmf->vma->vm_ops->huge_fault) + // return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + // return VM_FAULT_FALLBACK; +} + +static inline pud_t *rto_pud_alloc(struct mm_struct *mm, p4d_t *p4d, + unsigned long address) +{ + return (unlikely(p4d_none(*p4d)) && ppl_sym.__pud_alloc(mm, p4d, address)) ? 
+ NULL : pud_offset(p4d, address); +} + +static inline pmd_t *rto_pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + return (unlikely(pud_none(*pud)) && ppl_sym.__pmd_alloc(mm, pud, address))? + NULL: pmd_offset(pud, address); +} + +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_lock may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ +static vm_fault_t __rto_handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, struct page *hpage) +{ + struct vm_fault vmf = { + .vma = vma, + .address = address & PAGE_MASK, + .flags = flags, + .pgoff = linear_page_index(vma, address), + // .gfp_mask = __get_fault_gfp_mask(vma), + }; + // unsigned int dirty = flags & FAULT_FLAG_WRITE; + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + p4d_t *p4d; + vm_fault_t ret; + + pgd = pgd_offset(mm, address); + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + return VM_FAULT_OOM; + + vmf.pud = rto_pud_alloc(mm, p4d, address); + if (!vmf.pud) + return VM_FAULT_OOM; +retry_pud: + // if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { + // ret = create_huge_pud(&vmf); + // if (!(ret & VM_FAULT_FALLBACK)) + // return ret; + // } else { + // pud_t orig_pud = *vmf.pud; + + // barrier(); + // if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + + // /* NUMA case for anonymous PUDs would go here */ + + // if (dirty && !pud_write(orig_pud)) { + // ret = wp_huge_pud(&vmf, orig_pud); + // if (!(ret & VM_FAULT_FALLBACK)) + // return ret; + // } else { + // huge_pud_set_accessed(&vmf, orig_pud); + // return 0; + // } + // } + // } + + vmf.pmd = rto_pmd_alloc(mm, vmf.pud, address); + if (!vmf.pmd) + return VM_FAULT_OOM; + + /* Huge pud page fault raced with pmd_alloc? */ + if (pud_trans_unstable(vmf.pud)) + goto retry_pud; + + // if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { + ret = create_huge_pmd(&vmf, hpage); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + // } + + BUG(); + return 0; +} + +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_lock may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ +static vm_fault_t rto_handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs, struct page *hpage) +{ + vm_fault_t ret; + + __set_current_state(TASK_RUNNING); + + // TODO these 2 lines can be uncomment + // count_vm_event(PGFAULT); + // count_memcg_event_mm(vma->vm_mm, PGFAULT); + + /* do counter updates before entering really critical section. */ + // check_sync_rss_stat(current); + + // if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + // flags & FAULT_FLAG_INSTRUCTION, + // flags & FAULT_FLAG_REMOTE)) + // return VM_FAULT_SIGSEGV; + + /* + * Enable the memcg OOM handling for faults triggered in user + * space. Kernel faults are handled more gracefully. + */ + if (flags & FAULT_FLAG_USER) + mem_cgroup_enter_user_fault(); + + // if (unlikely(is_vm_hugetlb_page(vma))) + // ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + // else + ret = __rto_handle_mm_fault(vma, address, flags, hpage); + + if (flags & FAULT_FLAG_USER) { + mem_cgroup_exit_user_fault(); + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. 
+ */ + // TODO don't consider oom now + // if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + // mem_cgroup_oom_synchronize(false); + } + + // mm_account_fault(regs, address, flags, ret); + + return ret; +} + +/* + * mmap_lock must be held on entry. If @locked != NULL and *@flags + * does not include FOLL_NOWAIT, the mmap_lock may be released. If it + * is, *@locked will be set to 0 and -EBUSY returned. + */ +static int rto_faultin_page(struct vm_area_struct *vma, + unsigned long address, unsigned int *flags, int *locked, struct page *hpage) +{ + unsigned int fault_flags = 0; + vm_fault_t ret; + + /* mlock all present pages, but do not fault in new pages */ + if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) + return -ENOENT; + if (*flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (*flags & FOLL_REMOTE) + fault_flags |= FAULT_FLAG_REMOTE; + if (locked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (*flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + if (*flags & FOLL_TRIED) { + /* + * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED + * can co-exist + */ + fault_flags |= FAULT_FLAG_TRIED; + } + + ret = rto_handle_mm_fault(vma, address, fault_flags, NULL, hpage); + if (ret & VM_FAULT_ERROR) { + int err = vm_fault_to_errno(ret, *flags); + + if (err) + return err; + BUG(); + } + + if (ret & VM_FAULT_RETRY) { + if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) + *locked = 0; + return -EBUSY; + } + + /* + * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when + * necessary, even if maybe_mkwrite decided not to set pte_write. We + * can thus safely do subsequent page lookups as if they were reads. + * But only do so when looping for pte_write is futile: in some cases + * userspace may also be wanting to write to the gotten user page, + * which a read fault here might prevent (a readonly page might get + * reCOWed by userspace write). + */ + if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) + *flags |= FOLL_COW; + return 0; +} + +/** + * __get_user_pages() - pin user pages in memory + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @locked: whether we're still with the mmap_lock held + * + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. + * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * -- 0 return value is possible when the fault would need to be retried. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_lock is held. + * + * Must be called with mmap_lock held. It may be released. See below. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. 
That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @locked != NULL, *@locked will be set to 0 when mmap_lock is + * released by an up_read(). That can happen if @gup_flags does not + * have FOLL_NOWAIT. + * + * A caller using such a combination of @locked and @gup_flags + * must therefore hold the mmap_lock for reading only, and recognize + * when it's been released. Otherwise, it must be held for either + * reading or writing and will not be released. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. + */ +static long rto_get_user_pages(struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked, struct list_head *hpages) +{ + long ret = 0, i = 0; + struct vm_area_struct *vma = NULL; + struct follow_page_context ctx = { NULL }; + + if (!nr_pages) + return 0; + + start = untagged_addr(start); + + VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + + /* + * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + + do { + struct page *page, *hpage; + unsigned int foll_flags = gup_flags; + unsigned int page_increm; + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { + vma = find_extend_vma(mm, start); + // if (!vma && in_gate_area(mm, start)) { + // ret = get_gate_page(mm, start & PAGE_MASK, + // gup_flags, &vma, + // pages ? &pages[i] : NULL); + // if (ret) + // goto out; + // ctx.page_mask = 0; + // goto next_page; + // } + + if (!vma || ppl_sym.check_vma_flags(vma, gup_flags)) { + ret = -EFAULT; + goto out; + } + + // if (is_vm_hugetlb_page(vma)) { + // i = follow_hugetlb_page(mm, vma, pages, vmas, + // &start, &nr_pages, i, + // gup_flags, locked); + // if (locked && *locked == 0) { + // /* + // * We've got a VM_FAULT_RETRY + // * and we've lost mmap_lock. + // * We must stop here. + // */ + // BUG_ON(gup_flags & FOLL_NOWAIT); + // BUG_ON(ret != 0); + // goto out; + // } + // continue; + // } + } +retry: + /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. 
+ */ + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + cond_resched(); + + /* TODO try comment here to increase efficiency */ + page = ppl_sym.follow_page_mask(vma, start, foll_flags, &ctx); + if (!page) { + ret = rto_faultin_page(vma, start, &foll_flags, locked, hpage); + switch (ret) { + case 0: + goto retry; + case -EBUSY: + ret = 0; + fallthrough; + case -EFAULT: + case -ENOMEM: + case -EHWPOISON: + goto out; + case -ENOENT: + goto next_page; + } + BUG(); + } else if (PTR_ERR(page) == -EEXIST) { + /* + * Proper page table entry exists, but no corresponding + * struct page. + */ + BUG(); + goto next_page; + } else if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + // if (pages) { + // pages[i] = page; + // flush_anon_page(vma, page, start); + // flush_dcache_page(page); + // ctx.page_mask = 0; + // } +next_page: + // if (vmas) { + // vmas[i] = vma; + // ctx.page_mask = 0; + // } + page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; + } while (nr_pages); +out: + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return i ? i : ret; +} + +/** + * populate_vma_page_range() - populate a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @locked: whether the mmap_lock is still held + * + * This takes care of mlocking the pages too if VM_LOCKED is set. + * + * Return either number of pages pinned in the vma, or a negative error + * code on error. + * + * vma->vm_mm->mmap_lock must be held. + * + * If @locked is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @locked is non-NULL, it must held for read only and may be + * released. If it's released, *@locked will be set to 0. + */ +static long rto_populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *locked, struct list_head *hpages) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + mmap_assert_locked(mm); + + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + if (vma->vm_flags & VM_LOCKONFAULT) + gup_flags &= ~FOLL_POPULATE; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + gup_flags |= FOLL_WRITE; + + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma_is_accessible(vma)) + gup_flags |= FOLL_FORCE; + + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. 
+	 */
+	return rto_get_user_pages(mm, start, nr_pages, gup_flags,
+				NULL, NULL, locked, hpages);
+}
+
+int rto_populate(struct file *file, unsigned long vaddr,
+		 unsigned long offset, unsigned long size)
+{
+	struct mm_struct *mm = current->mm;
+	struct inode *inode = file->f_inode;
+	struct vm_area_struct *vma;
+	struct loaded_rto *loaded_rto;
+	struct loaded_seg *loaded_seg;
+	int ret, locked = 1;
+
+	loaded_rto = find_loaded_rto(inode);
+	if (!loaded_rto || list_empty(&loaded_rto->segs)) {
+		ret = -ENOENT;
+		goto error;
+	}
+	loaded_seg = list_first_entry(&loaded_rto->segs, struct loaded_seg, list);
+
+	ret = -EINVAL;
+	/* find_vma() must run under the mmap_lock */
+	mmap_read_lock(mm);
+	vma = find_vma(mm, vaddr);
+	if (!vma) {
+		mmap_read_unlock(mm);
+		goto error;
+	}
+
+	/* the helper takes page-aligned start/end addresses, not a length */
+	ret = rto_populate_vma_page_range(vma, vaddr, vaddr + PAGE_ALIGN(size),
+					  &locked, &loaded_seg->hpages);
+	if (locked)
+		mmap_read_unlock(mm);
+	if (ret < 0)
+		goto error;
+
+	return 0;
+error:
+	pr_info("rto_populate fail, error: %d\n", ret);
+	return ret;
+}
+
+int rto_populate_init(void)
+{
+	return init_symbols();
+}
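For context, the call site that this patch keeps commented out in load_elf_binary() (right after the elf_map() return value is checked) shows how rto_populate() is meant to be driven once the feature is switched on. A minimal sketch of that wiring, assuming `error` still holds the address elf_map() returned for the current PT_LOAD segment, and assuming a populate failure should simply fall back to ordinary demand paging (a policy choice this sketch makes, not something the patch itself decides):

	/* sketch only: hand the segment elf_map() just placed at "error"
	 * to rto_populate() so it can be backed by the preloaded huge pages
	 */
	size = elf_ppnt->p_filesz + ELF_PAGEOFFSET(elf_ppnt->p_vaddr);
	off = elf_ppnt->p_offset - ELF_PAGEOFFSET(elf_ppnt->p_vaddr);
	retval = rto_populate(bprm->file, error, off, size);
	if (retval)
		retval = 0;	/* assumed fallback: keep demand paging on failure */

Computing size and off the same way load_rto() does on the device side keeps the populated window identical to the segment that load_seg() read into huge pages.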