--- sys/amd64/amd64/cpu_switch.S.orig +++ sys/amd64/amd64/cpu_switch.S @@ -104,11 +104,12 @@ testl $PCB_32BIT,PCB_FLAGS(%r8) jz 1f /* no, skip over */ - /* Save segment selector numbers */ - movl %ds,PCB_DS(%r8) - movl %es,PCB_ES(%r8) - movl %fs,PCB_FS(%r8) + /* Save userland %gs */ movl %gs,PCB_GS(%r8) + movq PCB_GS32P(%r8),%rax + movq (%rax),%rax + movq %rax,PCB_GS32SD(%r8) + 1: /* Test if debug registers should be saved. */ testl $PCB_DBREGS,PCB_FLAGS(%r8) @@ -170,22 +171,6 @@ */ movq TD_PCB(%rsi),%r8 - testl $PCB_32BIT,PCB_FLAGS(%r8) - jz 1f /* no, skip over */ - - /* Restore segment selector numbers */ - movl PCB_DS(%r8),%ds - movl PCB_ES(%r8),%es - movl PCB_FS(%r8),%fs - - /* Restore userland %gs while preserving kernel gsbase */ - movl $MSR_GSBASE,%ecx - rdmsr - movl PCB_GS(%r8),%gs - wrmsr - jmp 2f -1: - /* Restore userland %fs */ movl $MSR_FSBASE,%ecx movl PCB_FSBASE(%r8),%eax @@ -197,7 +182,6 @@ movl PCB_GSBASE(%r8),%eax movl PCB_GSBASE+4(%r8),%edx wrmsr -2: /* Update the TSS_RSP0 pointer for the next interrupt */ movq PCPU(TSSP), %rax @@ -211,6 +195,19 @@ movl %eax, PCPU(CURTID) movq %rsi, PCPU(CURTHREAD) /* into next thread */ + testl $PCB_32BIT,PCB_FLAGS(%r8) + jz 1f /* no, skip over */ + + /* Restore userland %gs while preserving kernel gsbase */ + movq PCB_GS32P(%r8),%rax + movq PCB_GS32SD(%r8),%rbx + movq %rbx,(%rax) + movl $MSR_GSBASE,%ecx + rdmsr + movl PCB_GS(%r8),%gs + wrmsr + +1: /* Restore context. 
*/ movq PCB_RBX(%r8),%rbx movq PCB_RSP(%r8),%rsp --- sys/amd64/amd64/genassym.c.orig +++ sys/amd64/amd64/genassym.c @@ -136,12 +136,14 @@ ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_32BIT, PCB_32BIT); +ASSYM(PCB_FULLCTX, PCB_FULLCTX); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); -ASSYM(PCB_FULLCTX, PCB_FULLCTX); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct savefpu)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); +ASSYM(PCB_GS32P, offsetof(struct pcb, pcb_gs32p)); +ASSYM(PCB_GS32SD, offsetof(struct pcb, pcb_gs32sd)); ASSYM(PCB_SIZE, sizeof(struct pcb)); --- sys/amd64/amd64/machdep.c.orig +++ sys/amd64/amd64/machdep.c @@ -725,6 +725,15 @@ 0, /* long */ 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, +/* GUGS32_SEL 8 32 bit GS Descriptor for user */ +{ 0x0, /* segment base address */ + 0xfffff, /* length - all address space */ + SDT_MEMRWA, /* segment type */ + SEL_UPL, /* segment descriptor priority level */ + 1, /* segment descriptor present */ + 0, /* long */ + 1, /* default 32 vs 16 bit size */ + 1 /* limit granularity (byte/page units)*/ }, }; void --- sys/amd64/amd64/support.S.orig +++ sys/amd64/amd64/support.S @@ -689,3 +689,48 @@ movq %rax,32(%rdi) movq %rdi,bbhead NON_GPROF_RET + +#if defined(SMP) || !defined(_KERNEL) +#define MPLOCKED lock ; +#else +#define MPLOCKED +#endif + + .text + +futex_fault: + movq PCPU(CURPCB), %rdx + movq $0, PCB_ONFAULT(%rdx) + movq $-EFAULT, %rax + ret + +/* int futex_xchgl(int oparg, caddr_t uaddr, int *oldval); */ +ENTRY(futex_xchgl) + movq PCPU(CURPCB), %r11 + movq $futex_fault, PCB_ONFAULT(%r11) + + movq $VM_MAXUSER_ADDRESS-4, %rax + cmpq %rax, %rsi + ja futex_fault + + MPLOCKED xchgl %edi, (%rsi) + movl %edi, (%rdx) + xorl %eax, %eax + movq %rax, PCB_ONFAULT(%r11) + ret + +/* int futex_addl(int oparg, caddr_t uaddr, int *oldval); */ +ENTRY(futex_addl) + movq PCPU(CURPCB), %r11 + movq 
$futex_fault, PCB_ONFAULT(%r11) + + movq $VM_MAXUSER_ADDRESS-4, %rax + cmpq %rax, %rsi + ja futex_fault + + MPLOCKED xaddl %edi, (%rsi) + movl %edi, (%rdx) + xorl %eax, %eax + movq %rax, PCB_ONFAULT(%r11) + ret + --- sys/amd64/conf/NOTES.orig +++ sys/amd64/conf/NOTES @@ -608,9 +608,12 @@ # and PSEUDOFS) options LINPROCFS -#Enable the linux-like sys filesystem support (requires COMPAT_LINUX32 +# Enable the linux-like sys filesystem support (requires COMPAT_LINUX32 # and PSEUDOFS) -options LINSYSFS +options LINSYSFS + +# Enable the linux aio support (requires COMPAT_LINUX32 and VFS_AIO) +options LINUXAIO # # SysVR4 ABI emulation --- sys/amd64/include/pcb.h.orig +++ sys/amd64/include/pcb.h @@ -41,6 +41,7 @@ * AMD64 process control block */ #include +#include struct pcb { register_t pcb_cr3; @@ -73,6 +74,10 @@ #define PCB_FULLCTX 0x80 /* full context restore on sysret */ caddr_t pcb_onfault; /* copyin/out fault recovery */ + + /* 32-bit segment descriptor */ + struct user_segment_descriptor *pcb_gs32p; + struct user_segment_descriptor pcb_gs32sd; }; #ifdef _KERNEL --- sys/amd64/include/segments.h.orig +++ sys/amd64/include/segments.h @@ -200,9 +200,10 @@ #define GUCODE32_SEL 3 /* User 32 bit code Descriptor */ #define GUDATA_SEL 4 /* User 32/64 bit Data Descriptor */ #define GUCODE_SEL 5 /* User 64 bit Code Descriptor */ -#define GPROC0_SEL 6 /* TSS for entering kernel etc */ +#define GPROC0_SEL 6 /* TSS for entering kernel etc */ /* slot 6 is second half of GPROC0_SEL */ -#define NGDT 8 +#define GUGS32_SEL 8 /* User 32 bit GS Descriptor */ +#define NGDT 9 #ifdef _KERNEL extern struct user_segment_descriptor gdt[]; --- sys/amd64/linux32/linux.h.orig +++ sys/amd64/linux32/linux.h @@ -531,6 +531,7 @@ #define LINUX_O_RDONLY 00000000 #define LINUX_O_WRONLY 00000001 #define LINUX_O_RDWR 00000002 +#define LINUX_O_ACCMODE 00000003 #define LINUX_O_CREAT 00000100 #define LINUX_O_EXCL 00000200 #define LINUX_O_NOCTTY 00000400 @@ -565,6 +566,8 @@ #define LINUX_F_WRLCK 1 #define 
LINUX_F_UNLCK 2 +#define LINUX_AT_FDCWD -100 + /* * mount flags */ @@ -866,4 +869,6 @@ (LINUX_CLONE_VM | LINUX_CLONE_FS | LINUX_CLONE_FILES | \ LINUX_CLONE_SIGHAND | LINUX_CLONE_THREAD) +#include + #endif /* !_AMD64_LINUX_H_ */ --- sys/amd64/linux32/linux32_dummy.c.orig +++ sys/amd64/linux32/linux32_dummy.c @@ -56,7 +56,6 @@ DUMMY(rt_sigqueueinfo); DUMMY(capget); DUMMY(capset); -DUMMY(sendfile); DUMMY(truncate64); DUMMY(setfsuid); DUMMY(setfsgid); @@ -97,7 +96,6 @@ DUMMY(inotify_add_watch); DUMMY(inotify_rm_watch); DUMMY(migrate_pages); -DUMMY(openat); DUMMY(mkdirat); DUMMY(mknodat); DUMMY(fchownat); @@ -113,6 +111,11 @@ DUMMY(pselect6); DUMMY(ppoll); DUMMY(unshare); +DUMMY(io_setup); +DUMMY(io_destroy); +DUMMY(io_getevents); +DUMMY(io_submit); +DUMMY(io_cancel); #define DUMMY_XATTR(s) \ int \ --- sys/amd64/linux32/linux32_locore.s.orig +++ sys/amd64/linux32/linux32_locore.s @@ -11,8 +11,6 @@ NON_GPROF_ENTRY(linux_sigcode) call *LINUX_SIGF_HANDLER(%esp) leal LINUX_SIGF_SC(%esp),%ebx /* linux scp */ - movl LINUX_SC_GS(%ebx),%gs - movl LINUX_SC_FS(%ebx),%fs movl LINUX_SC_ES(%ebx),%es movl LINUX_SC_DS(%ebx),%ds movl %esp, %ebx /* pass sigframe */ @@ -25,8 +23,6 @@ linux_rt_sigcode: call *LINUX_RT_SIGF_HANDLER(%esp) leal LINUX_RT_SIGF_UC(%esp),%ebx /* linux ucp */ - movl LINUX_SC_GS(%ebx),%gs - movl LINUX_SC_FS(%ebx),%fs movl LINUX_SC_ES(%ebx),%es movl LINUX_SC_DS(%ebx),%ds push %eax /* fake ret addr */ --- sys/amd64/linux32/linux32_machdep.c.orig +++ sys/amd64/linux32/linux32_machdep.c @@ -53,7 +53,10 @@ #include #include +#include #include +#include +#include #include #include @@ -121,7 +124,7 @@ * Allocate temporary demand zeroed space for argument and * environment strings */ - args->buf = (char *) kmem_alloc_wait(exec_map, + args->buf = (char *)kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); if (args->buf == NULL) return (ENOMEM); @@ -155,14 +158,14 @@ if (error) { if (error == ENAMETOOLONG) error = E2BIG; - + goto err_exit; } args->stringspace -= 
length; args->endp += length; args->argc++; } - + args->begin_envv = args->endp; /* @@ -219,13 +222,13 @@ if (error == 0) error = kern_execve(td, &eargs, NULL); if (error == 0) - /* linux process can exec fbsd one, dont attempt + /* Linux process can execute FreeBSD one, do not attempt * to create emuldata for such process using * linux_proc_init, this leads to a panic on KASSERT - * because such process has p->p_emuldata == NULL + * because such process has p->p_emuldata == NULL. */ if (td->td_proc->p_sysent == &elf_linux_sysvec) - error = linux_proc_init(td, 0, 0); + error = linux_proc_init(td, 0, 0); return (error); } @@ -466,7 +469,7 @@ if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0) return (error); - + if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; @@ -480,7 +483,9 @@ td2 = FIRST_THREAD_IN_PROC(p2); - /* make it run */ + /* + * Make this runnable after we are finished with it. + */ mtx_lock_spin(&sched_lock); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); @@ -501,7 +506,7 @@ printf(ARGS(vfork, "")); #endif - /* exclude RFPPWAIT */ + /* Exclude RFPPWAIT */ if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0) return (error); if (error == 0) { @@ -520,7 +525,7 @@ PROC_UNLOCK(p2); td2 = FIRST_THREAD_IN_PROC(p2); - + /* make it run */ mtx_lock_spin(&sched_lock); TD_SET_CAN_RUN(td2); @@ -532,7 +537,7 @@ while (p2->p_flag & P_PPWAIT) msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); - + return (0); } @@ -547,10 +552,9 @@ #ifdef DEBUG if (ldebug(clone)) { - printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"), - (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack, - (unsigned int)(uintptr_t)args->parent_tidptr, - (unsigned int)(uintptr_t)args->child_tidptr); + printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " + "child tid: %p"), (unsigned)args->flags, + args->stack, args->parent_tidptr, args->child_tidptr); } #endif @@ -565,11 +569,11 @@ ff |= 
RFMEM; if (args->flags & LINUX_CLONE_SIGHAND) ff |= RFSIGSHARE; - /* - * XXX: in linux sharing of fs info (chroot/cwd/umask) - * and open files is independant. in fbsd its in one - * structure but in reality it doesn't cause any problems - * because both of these flags are usually set together. + /* + * XXX: In Linux, sharing of fs info (chroot/cwd/umask) + * and open files is independent. In FreeBSD, it is in one + * structure but in reality it does not cause any problems + * because both of these flags are usually set together. */ if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) ff |= RFFDG; @@ -590,6 +594,10 @@ if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS) ff |= RFTHREAD; + if (args->flags & LINUX_CLONE_PARENT_SETTID) + if (args->parent_tidptr == NULL) + return (EINVAL); + error = fork1(td, ff, 0, &p2); if (error) return (error); @@ -601,35 +609,21 @@ PROC_UNLOCK(p2); sx_xunlock(&proctree_lock); } - + /* create the emuldata */ error = linux_proc_init(td, p2->p_pid, args->flags); /* reference it - no need to check this */ em = em_find(p2, EMUL_DOLOCK); KASSERT(em != NULL, ("clone: emuldata not found.\n")); /* and adjust it */ - if (args->flags & LINUX_CLONE_PARENT_SETTID) { - if (args->parent_tidptr == NULL) { - EMUL_UNLOCK(&emul_lock); - return (EINVAL); - } - error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); - if (error) { - EMUL_UNLOCK(&emul_lock); - return (error); - } - } if (args->flags & LINUX_CLONE_THREAD) { - /* XXX: linux mangles pgrp and pptr somehow - * I think it might be this but I am not sure. 
- */ #ifdef notyet PROC_LOCK(p2); p2->p_pgrp = td->td_proc->p_pgrp; PROC_UNLOCK(p2); #endif - exit_signal = 0; + exit_signal = 0; } if (args->flags & LINUX_CLONE_CHILD_SETTID) @@ -644,25 +638,70 @@ EMUL_UNLOCK(&emul_lock); + if (args->flags & LINUX_CLONE_PARENT_SETTID) { + error = copyout(&p2->p_pid, args->parent_tidptr, + sizeof(p2->p_pid)); + if (error) + printf(LMSG("copyout failed!")); + } + PROC_LOCK(p2); p2->p_sigparent = exit_signal; PROC_UNLOCK(p2); td2 = FIRST_THREAD_IN_PROC(p2); - /* - * in a case of stack = NULL we are supposed to COW calling process stack - * this is what normal fork() does so we just keep the tf_rsp arg intact + /* + * In a case of stack = NULL, we are supposed to COW calling process + * stack. This is what normal fork() does, so we just keep tf_rsp arg + * intact. */ if (args->stack) - td2->td_frame->tf_rsp = PTROUT(args->stack); + td2->td_frame->tf_rsp = PTROUT(args->stack); if (args->flags & LINUX_CLONE_SETTLS) { - /* XXX: todo */ + struct user_segment_descriptor sd; + struct l_user_desc info; + int a[2]; + + error = copyin((void *)td->td_frame->tf_rsi, &info, + sizeof(struct l_user_desc)); + if (error) { + printf(LMSG("copyin failed!")); + } else { + /* We might copy out the entry_number as GUGS32_SEL. 
*/ + info.entry_number = GUGS32_SEL; + error = copyout(&info, (void *)td->td_frame->tf_rsi, + sizeof(struct l_user_desc)); + if (error) + printf(LMSG("copyout failed!")); + + a[0] = LINUX_LDT_entry_a(&info); + a[1] = LINUX_LDT_entry_b(&info); + + memcpy(&sd, &a, sizeof(a)); +#ifdef DEBUG + if (ldebug(clone)) + printf("Segment created in clone with " + "CLONE_SETTLS: lobase: %x, hibase: %x, " + "lolimit: %x, hilimit: %x, type: %i, " + "dpl: %i, p: %i, xx: %i, long: %i, " + "def32: %i, gran: %i\n", sd.sd_lobase, + sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit, + sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx, + sd.sd_long, sd.sd_def32, sd.sd_gran); +#endif + td2->td_pcb->pcb_gsbase = (register_t)info.base_addr; + td2->td_pcb->pcb_gs32sd = sd; + td2->td_pcb->pcb_gs32p = &gdt[GUGS32_SEL]; + td2->td_pcb->pcb_gs = GSEL(GUGS32_SEL, SEL_UPL); + td2->td_pcb->pcb_flags |= PCB_32BIT; + } } #ifdef DEBUG if (ldebug(clone)) - printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"), - (long)p2->p_pid, args->stack, exit_signal); + printf(LMSG("clone: successful rfork to %d, " + "stack %p sig = %d"), (int)p2->p_pid, args->stack, + exit_signal); #endif if (args->flags & LINUX_CLONE_VFORK) { PROC_LOCK(p2); @@ -680,12 +719,12 @@ td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; - + if (args->flags & LINUX_CLONE_VFORK) { - /* wait for the children to exit, ie. emulate vfork */ - PROC_LOCK(p2); + /* wait for the children to exit, ie. 
emulate vfork */ + PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) - msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0); + msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); } @@ -704,8 +743,8 @@ #ifdef DEBUG if (ldebug(mmap2)) - printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"), - (void *)(intptr_t)args->addr, args->len, args->prot, + printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"), + args->addr, args->len, args->prot, args->flags, args->fd, args->pgoff); #endif @@ -731,10 +770,9 @@ #ifdef DEBUG if (ldebug(mmap)) - printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"), - (void *)(intptr_t)linux_args.addr, linux_args.len, - linux_args.prot, linux_args.flags, linux_args.fd, - linux_args.pgoff); + printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"), + linux_args.addr, linux_args.len, linux_args.prot, + linux_args.flags, linux_args.fd, linux_args.pgoff); #endif if ((linux_args.pgoff % PAGE_SIZE) != 0) return (EINVAL); @@ -820,14 +858,14 @@ } if (linux_args->flags & LINUX_MAP_GROWSDOWN) { - /* - * The linux MAP_GROWSDOWN option does not limit auto + /* + * The Linux MAP_GROWSDOWN option does not limit auto * growth of the region. Linux mmap with this option * takes as addr the inital BOS, and as len, the initial * region size. It can then grow down from addr without - * limit. However, linux threads has an implicit internal + * limit. However, Linux threads has an implicit internal * limit to stack size of STACK_SIZE. Its just not - * enforced explicitly in linux. But, here we impose + * enforced explicitly in Linux. But, here we impose * a limit of (STACK_SIZE - GUARD_SIZE) on the stack * region, since we can do this with our mmap. * @@ -844,8 +882,8 @@ if ((caddr_t)PTRIN(linux_args->addr) + linux_args->len > p->p_vmspace->vm_maxsaddr) { - /* - * Some linux apps will attempt to mmap + /* + * Some Linux apps will attempt to mmap * thread stacks near the top of their * address space. 
If their TOS is greater * than vm_maxsaddr, vm_map_growstack() @@ -872,7 +910,7 @@ else bsd_args.len = STACK_SIZE - GUARD_SIZE; - /* + /* * This gives us a new BOS. If we're using VM_STACK, then * mmap will just map the top SGROWSIZ bytes, and let * the stack grow down to the limit at BOS. If we're @@ -905,6 +943,19 @@ } int +linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) +{ + struct mprotect_args bsd_args; + + bsd_args.addr = uap->addr; + bsd_args.len = uap->len; + bsd_args.prot = uap->prot; + if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) + bsd_args.prot |= PROT_READ | PROT_EXEC; + return (mprotect(td, &bsd_args)); +} + +int linux_iopl(struct thread *td, struct linux_iopl_args *args) { int error; @@ -992,7 +1043,7 @@ } /* - * Linux has two extra args, restart and oldmask. We dont use these, + * Linux has two extra args, restart and oldmask. We don't use these, * but it seems that "restart" is actually a context pointer that * enables the signal to happen with a different register set. */ @@ -1177,14 +1228,104 @@ } int -linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) +linux_set_thread_area(struct thread *td, + struct linux_set_thread_area_args *args) { - struct mprotect_args bsd_args; + struct l_user_desc info; + struct user_segment_descriptor sd; + int a[2]; + int error; + + error = copyin(args->desc, &info, sizeof(struct l_user_desc)); + if (error) + return (error); + +#ifdef DEBUG + if (ldebug(set_thread_area)) + printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, " + "%i, %i, %i"), info.entry_number, info.base_addr, + info.limit, info.seg_32bit, info.contents, + info.read_exec_only, info.limit_in_pages, + info.seg_not_present, info.useable); +#endif + + /* + * Semantics of Linux version: every thread in the system has array + * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. 
+ * This syscall loads one of the selected TLS descriptors with a value + * and also loads GDT descriptors 6, 7 and 8 with the content of + * the per-thread descriptors. + * + * Semantics of FreeBSD version: I think we can ignore that Linux has + * three per-thread descriptors and use just the first one. + * The tls_array[] is used only in [gs]et_thread_area() syscalls and + * for loading the GDT descriptors. We use just one GDT descriptor + * for TLS, so we will load just one. + * XXX: This does not work when user-space process tries to use more + * than one TLS segment. Comment in the Linux source says wine might + * do that. + */ + + /* + * GLIBC reads current %gs and calls set_thread_area() with it. + * We should let GUDATA_SEL and GUGS32_SEL proceed as well because + * we use these segments. + */ + switch (info.entry_number) { + case GUGS32_SEL: + case GUDATA_SEL: + case 6: + case -1: + info.entry_number = GUGS32_SEL; + break; + default: + return (EINVAL); + } + + /* + * We have to copy out the GDT entry we use. + * XXX: What if userspace program does not check return value and + * tries to use 6, 7 or 8? 
+ */ + error = copyout(&info, args->desc, sizeof(struct l_user_desc)); + if (error) + return (error); + + if (LINUX_LDT_empty(&info)) { + a[0] = 0; + a[1] = 0; + } else { + a[0] = LINUX_LDT_entry_a(&info); + a[1] = LINUX_LDT_entry_b(&info); + } + + memcpy(&sd, &a, sizeof(a)); +#ifdef DEBUG + if (ldebug(set_thread_area)) + printf("Segment created in set_thread_area: " + "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, " + "type: %i, dpl: %i, p: %i, xx: %i, long: %i, " + "def32: %i, gran: %i\n", + sd.sd_lobase, + sd.sd_hibase, + sd.sd_lolimit, + sd.sd_hilimit, + sd.sd_type, + sd.sd_dpl, + sd.sd_p, + sd.sd_xx, + sd.sd_long, + sd.sd_def32, + sd.sd_gran); +#endif + + critical_enter(); + td->td_pcb->pcb_gsbase = (register_t)info.base_addr; + td->td_pcb->pcb_gs32sd = gdt[GUGS32_SEL] = sd; + td->td_pcb->pcb_gs32p = &gdt[GUGS32_SEL]; + td->td_pcb->pcb_flags |= PCB_32BIT; + wrmsr(MSR_KGSBASE, td->td_pcb->pcb_gsbase); + critical_exit(); - bsd_args.addr = uap->addr; - bsd_args.len = uap->len; - bsd_args.prot = uap->prot; - if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) - bsd_args.prot |= PROT_READ | PROT_EXEC; - return (mprotect(td, &bsd_args)); + return (0); } --- sys/amd64/linux32/linux32_proto.h.orig +++ sys/amd64/linux32/linux32_proto.h @@ -2,7 +2,7 @@ * System call prototypes. * * DO NOT EDIT-- this file is automatically generated. 
- * $FreeBSD: src/sys/amd64/linux32/linux32_proto.h,v 1.29 2007/02/15 01:15:31 jkim Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/amd64/linux32/syscalls.master,v 1.26 2007/02/15 01:13:36 jkim Exp */ @@ -192,6 +192,9 @@ struct linux_olduname_args { register_t dummy; }; +struct linux_chroot_args { + char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)]; +}; struct linux_ustat_args { char dev_l_[PADL_(l_dev_t)]; l_dev_t dev; char dev_r_[PADR_(l_dev_t)]; char ubuf_l_[PADL_(struct l_ustat *)]; struct l_ustat * ubuf; char ubuf_r_[PADR_(struct l_ustat *)]; @@ -594,7 +597,10 @@ char uoss_l_[PADL_(l_stack_t *)]; l_stack_t * uoss; char uoss_r_[PADR_(l_stack_t *)]; }; struct linux_sendfile_args { - register_t dummy; + char out_l_[PADL_(int)]; int out; char out_r_[PADR_(int)]; + char in_l_[PADL_(int)]; int in; char in_r_[PADR_(int)]; + char offset_l_[PADL_(l_long *)]; l_long * offset; char offset_r_[PADR_(l_long *)]; + char count_l_[PADL_(l_size_t)]; l_size_t count; char count_r_[PADR_(l_size_t)]; }; struct linux_vfork_args { register_t dummy; @@ -734,6 +740,38 @@ char uaddr2_l_[PADL_(void *)]; void * uaddr2; char uaddr2_r_[PADR_(void *)]; char val3_l_[PADL_(int)]; int val3; char val3_r_[PADR_(int)]; }; +struct linux_sched_getaffinity_args { + char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)]; + char cpusetsize_l_[PADL_(l_uint)]; l_uint cpusetsize; char cpusetsize_r_[PADR_(l_uint)]; + char mask_l_[PADL_(l_ulong *)]; l_ulong * mask; char mask_r_[PADR_(l_ulong *)]; +}; +struct linux_set_thread_area_args { + char desc_l_[PADL_(struct l_user_desc *)]; struct l_user_desc * desc; char desc_r_[PADR_(struct l_user_desc *)]; +}; +struct linux_io_setup_args { + char nr_reqs_l_[PADL_(l_uint)]; l_uint nr_reqs; char nr_reqs_r_[PADR_(l_uint)]; + char ctxp_l_[PADL_(linux_aio_context_t *)]; linux_aio_context_t * ctxp; char ctxp_r_[PADR_(linux_aio_context_t *)]; +}; +struct linux_io_destroy_args { + char ctx_l_[PADL_(linux_aio_context_t)]; 
linux_aio_context_t ctx; char ctx_r_[PADR_(linux_aio_context_t)]; +}; +struct linux_io_getevents_args { + char ctx_id_l_[PADL_(linux_aio_context_t)]; linux_aio_context_t ctx_id; char ctx_id_r_[PADR_(linux_aio_context_t)]; + char min_nr_l_[PADL_(l_long)]; l_long min_nr; char min_nr_r_[PADR_(l_long)]; + char nr_l_[PADL_(l_long)]; l_long nr; char nr_r_[PADR_(l_long)]; + char events_l_[PADL_(struct linux_io_event *)]; struct linux_io_event * events; char events_r_[PADR_(struct linux_io_event *)]; + char timeout_l_[PADL_(struct l_timespec *)]; struct l_timespec * timeout; char timeout_r_[PADR_(struct l_timespec *)]; +}; +struct linux_io_submit_args { + char ctx_id_l_[PADL_(linux_aio_context_t)]; linux_aio_context_t ctx_id; char ctx_id_r_[PADR_(linux_aio_context_t)]; + char nr_l_[PADL_(l_long)]; l_long nr; char nr_r_[PADR_(l_long)]; + char iocbpp_l_[PADL_(struct linux_iocb **)]; struct linux_iocb ** iocbpp; char iocbpp_r_[PADR_(struct linux_iocb **)]; +}; +struct linux_io_cancel_args { + char ctx_id_l_[PADL_(linux_aio_context_t)]; linux_aio_context_t ctx_id; char ctx_id_r_[PADR_(linux_aio_context_t)]; + char iocb_l_[PADL_(struct linux_iocb *)]; struct linux_iocb * iocb; char iocb_r_[PADR_(struct linux_iocb *)]; + char result_l_[PADL_(struct linux_io_event *)]; struct linux_io_event * result; char result_r_[PADR_(struct linux_io_event *)]; +}; struct linux_fadvise64_args { register_t dummy; }; @@ -871,7 +909,10 @@ register_t dummy; }; struct linux_openat_args { - register_t dummy; + char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)]; + char filename_l_[PADL_(char *)]; char * filename; char filename_r_[PADR_(char *)]; + char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)]; + char mode_l_[PADL_(l_int)]; l_int mode; char mode_r_[PADR_(l_int)]; }; struct linux_mkdirat_args { register_t dummy; @@ -961,6 +1002,7 @@ int linux_ioctl(struct thread *, struct linux_ioctl_args *); int linux_fcntl(struct thread *, struct linux_fcntl_args *); int 
linux_olduname(struct thread *, struct linux_olduname_args *); +int linux_chroot(struct thread *, struct linux_chroot_args *); int linux_ustat(struct thread *, struct linux_ustat_args *); int linux_getppid(struct thread *, struct linux_getppid_args *); int linux_sigaction(struct thread *, struct linux_sigaction_args *); @@ -1093,6 +1135,13 @@ int linux_fremovexattr(struct thread *, struct linux_fremovexattr_args *); int linux_tkill(struct thread *, struct linux_tkill_args *); int linux_sys_futex(struct thread *, struct linux_sys_futex_args *); +int linux_sched_getaffinity(struct thread *, struct linux_sched_getaffinity_args *); +int linux_set_thread_area(struct thread *, struct linux_set_thread_area_args *); +int linux_io_setup(struct thread *, struct linux_io_setup_args *); +int linux_io_destroy(struct thread *, struct linux_io_destroy_args *); +int linux_io_getevents(struct thread *, struct linux_io_getevents_args *); +int linux_io_submit(struct thread *, struct linux_io_submit_args *); +int linux_io_cancel(struct thread *, struct linux_io_cancel_args *); int linux_fadvise64(struct thread *, struct linux_fadvise64_args *); int linux_exit_group(struct thread *, struct linux_exit_group_args *); int linux_lookup_dcookie(struct thread *, struct linux_lookup_dcookie_args *); @@ -1207,6 +1256,7 @@ #define LINUX_SYS_AUE_linux_ioctl AUE_IOCTL #define LINUX_SYS_AUE_linux_fcntl AUE_FCNTL #define LINUX_SYS_AUE_linux_olduname AUE_NULL +#define LINUX_SYS_AUE_linux_chroot AUE_CHROOT #define LINUX_SYS_AUE_linux_ustat AUE_NULL #define LINUX_SYS_AUE_linux_getppid AUE_GETPPID #define LINUX_SYS_AUE_linux_sigaction AUE_NULL @@ -1339,6 +1389,13 @@ #define LINUX_SYS_AUE_linux_fremovexattr AUE_NULL #define LINUX_SYS_AUE_linux_tkill AUE_NULL #define LINUX_SYS_AUE_linux_sys_futex AUE_NULL +#define LINUX_SYS_AUE_linux_sched_getaffinity AUE_NULL +#define LINUX_SYS_AUE_linux_set_thread_area AUE_NULL +#define LINUX_SYS_AUE_linux_io_setup AUE_NULL +#define LINUX_SYS_AUE_linux_io_destroy 
AUE_NULL +#define LINUX_SYS_AUE_linux_io_getevents AUE_NULL +#define LINUX_SYS_AUE_linux_io_submit AUE_NULL +#define LINUX_SYS_AUE_linux_io_cancel AUE_NULL #define LINUX_SYS_AUE_linux_fadvise64 AUE_NULL #define LINUX_SYS_AUE_linux_exit_group AUE_EXIT #define LINUX_SYS_AUE_linux_lookup_dcookie AUE_NULL @@ -1381,7 +1438,7 @@ #define LINUX_SYS_AUE_linux_inotify_add_watch AUE_NULL #define LINUX_SYS_AUE_linux_inotify_rm_watch AUE_NULL #define LINUX_SYS_AUE_linux_migrate_pages AUE_NULL -#define LINUX_SYS_AUE_linux_openat AUE_NULL +#define LINUX_SYS_AUE_linux_openat AUE_OPEN_RWTC #define LINUX_SYS_AUE_linux_mkdirat AUE_NULL #define LINUX_SYS_AUE_linux_mknodat AUE_NULL #define LINUX_SYS_AUE_linux_fchownat AUE_NULL --- sys/amd64/linux32/linux32_syscall.h.orig +++ sys/amd64/linux32/linux32_syscall.h @@ -2,7 +2,7 @@ * System call numbers. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/amd64/linux32/linux32_syscall.h,v 1.29 2007/02/15 01:15:31 jkim Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/amd64/linux32/syscalls.master,v 1.26 2007/02/15 01:13:36 jkim Exp */ @@ -57,7 +57,7 @@ #define LINUX_SYS_setpgid 57 #define LINUX_SYS_linux_olduname 59 #define LINUX_SYS_umask 60 -#define LINUX_SYS_chroot 61 +#define LINUX_SYS_linux_chroot 61 #define LINUX_SYS_linux_ustat 62 #define LINUX_SYS_dup2 63 #define LINUX_SYS_linux_getppid 64 @@ -222,6 +222,13 @@ #define LINUX_SYS_linux_fremovexattr 237 #define LINUX_SYS_linux_tkill 238 #define LINUX_SYS_linux_sys_futex 240 +#define LINUX_SYS_linux_sched_getaffinity 242 +#define LINUX_SYS_linux_set_thread_area 243 +#define LINUX_SYS_linux_io_setup 245 +#define LINUX_SYS_linux_io_destroy 246 +#define LINUX_SYS_linux_io_getevents 247 +#define LINUX_SYS_linux_io_submit 248 +#define LINUX_SYS_linux_io_cancel 249 #define LINUX_SYS_linux_fadvise64 250 #define LINUX_SYS_linux_exit_group 252 #define LINUX_SYS_linux_lookup_dcookie 253 --- sys/amd64/linux32/linux32_sysent.c.orig +++ sys/amd64/linux32/linux32_sysent.c 
@@ -2,7 +2,7 @@ * System call switch table. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/amd64/linux32/linux32_sysent.c,v 1.29 2007/02/15 01:15:31 jkim Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/amd64/linux32/syscalls.master,v 1.26 2007/02/15 01:13:36 jkim Exp */ @@ -81,7 +81,7 @@ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 58 = ulimit */ { 0, (sy_call_t *)linux_olduname, AUE_NULL, NULL, 0, 0 }, /* 59 = linux_olduname */ { AS(umask_args), (sy_call_t *)umask, AUE_UMASK, NULL, 0, 0 }, /* 60 = umask */ - { AS(chroot_args), (sy_call_t *)chroot, AUE_CHROOT, NULL, 0, 0 }, /* 61 = chroot */ + { AS(linux_chroot_args), (sy_call_t *)linux_chroot, AUE_CHROOT, NULL, 0, 0 }, /* 61 = linux_chroot */ { AS(linux_ustat_args), (sy_call_t *)linux_ustat, AUE_NULL, NULL, 0, 0 }, /* 62 = linux_ustat */ { AS(dup2_args), (sy_call_t *)dup2, AUE_DUP2, NULL, 0, 0 }, /* 63 = dup2 */ { 0, (sy_call_t *)linux_getppid, AUE_GETPPID, NULL, 0, 0 }, /* 64 = linux_getppid */ @@ -207,7 +207,7 @@ { 0, (sy_call_t *)linux_capget, AUE_CAPGET, NULL, 0, 0 }, /* 184 = linux_capget */ { 0, (sy_call_t *)linux_capset, AUE_CAPSET, NULL, 0, 0 }, /* 185 = linux_capset */ { AS(linux_sigaltstack_args), (sy_call_t *)linux_sigaltstack, AUE_NULL, NULL, 0, 0 }, /* 186 = linux_sigaltstack */ - { 0, (sy_call_t *)linux_sendfile, AUE_SENDFILE, NULL, 0, 0 }, /* 187 = linux_sendfile */ + { AS(linux_sendfile_args), (sy_call_t *)linux_sendfile, AUE_SENDFILE, NULL, 0, 0 }, /* 187 = linux_sendfile */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 188 = getpmsg */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 189 = putpmsg */ { 0, (sy_call_t *)linux_vfork, AUE_VFORK, NULL, 0, 0 }, /* 190 = linux_vfork */ @@ -262,14 +262,14 @@ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 239 = linux_sendfile64 */ { AS(linux_sys_futex_args), (sy_call_t *)linux_sys_futex, AUE_NULL, NULL, 0, 0 }, /* 240 = linux_sys_futex */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 241 = 
linux_sched_setaffinity */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 242 = linux_sched_getaffinity */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 243 = linux_set_thread_area */ + { AS(linux_sched_getaffinity_args), (sy_call_t *)linux_sched_getaffinity, AUE_NULL, NULL, 0, 0 }, /* 242 = linux_sched_getaffinity */ + { AS(linux_set_thread_area_args), (sy_call_t *)linux_set_thread_area, AUE_NULL, NULL, 0, 0 }, /* 243 = linux_set_thread_area */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 244 = linux_get_thread_area */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 245 = linux_io_setup */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 246 = linux_io_destroy */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 247 = linux_io_getevents */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 248 = inux_io_submit */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 249 = linux_io_cancel */ + { AS(linux_io_setup_args), (sy_call_t *)linux_io_setup, AUE_NULL, NULL, 0, 0 }, /* 245 = linux_io_setup */ + { AS(linux_io_destroy_args), (sy_call_t *)linux_io_destroy, AUE_NULL, NULL, 0, 0 }, /* 246 = linux_io_destroy */ + { AS(linux_io_getevents_args), (sy_call_t *)linux_io_getevents, AUE_NULL, NULL, 0, 0 }, /* 247 = linux_io_getevents */ + { AS(linux_io_submit_args), (sy_call_t *)linux_io_submit, AUE_NULL, NULL, 0, 0 }, /* 248 = linux_io_submit */ + { AS(linux_io_cancel_args), (sy_call_t *)linux_io_cancel, AUE_NULL, NULL, 0, 0 }, /* 249 = linux_io_cancel */ { 0, (sy_call_t *)linux_fadvise64, AUE_NULL, NULL, 0, 0 }, /* 250 = linux_fadvise64 */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 251 = */ { AS(linux_exit_group_args), (sy_call_t *)linux_exit_group, AUE_EXIT, NULL, 0, 0 }, /* 252 = linux_exit_group */ @@ -315,7 +315,7 @@ { 0, (sy_call_t *)linux_inotify_add_watch, AUE_NULL, NULL, 0, 0 }, /* 292 = linux_inotify_add_watch */ { 0, (sy_call_t *)linux_inotify_rm_watch, AUE_NULL, NULL, 0, 0 }, /* 293 = linux_inotify_rm_watch */ 
{ 0, (sy_call_t *)linux_migrate_pages, AUE_NULL, NULL, 0, 0 }, /* 294 = linux_migrate_pages */ - { 0, (sy_call_t *)linux_openat, AUE_NULL, NULL, 0, 0 }, /* 295 = linux_openat */ + { AS(linux_openat_args), (sy_call_t *)linux_openat, AUE_OPEN_RWTC, NULL, 0, 0 }, /* 295 = linux_openat */ { 0, (sy_call_t *)linux_mkdirat, AUE_NULL, NULL, 0, 0 }, /* 296 = linux_mkdirat */ { 0, (sy_call_t *)linux_mknodat, AUE_NULL, NULL, 0, 0 }, /* 297 = linux_mknodat */ { 0, (sy_call_t *)linux_fchownat, AUE_NULL, NULL, 0, 0 }, /* 298 = linux_fchownat */ --- sys/amd64/linux32/linux32_sysvec.c.orig +++ sys/amd64/linux32/linux32_sysvec.c @@ -408,6 +408,7 @@ td->td_pcb->pcb_ds = _udatasel; load_es(_udatasel); td->td_pcb->pcb_es = _udatasel; + /* leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -528,6 +529,7 @@ td->td_pcb->pcb_ds = _udatasel; load_es(_udatasel); td->td_pcb->pcb_es = _udatasel; + /* leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -813,18 +815,20 @@ struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; + critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; + critical_exit(); load_ds(_udatasel); load_es(_udatasel); load_fs(_udatasel); - load_gs(0); + load_gs(_udatasel); pcb->pcb_ds = _udatasel; pcb->pcb_es = _udatasel; pcb->pcb_fs = _udatasel; - pcb->pcb_gs = 0; + pcb->pcb_gs = _udatasel; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = entry; @@ -1077,7 +1081,7 @@ linux_ioctl_register_handler(*lihp); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_register_handler(*ldhp); - sx_init(&emul_lock, "emuldata lock"); + mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF); sx_init(&emul_shared_lock, "emuldata->shared lock"); LIST_INIT(&futex_list); sx_init(&futex_sx, "futex protection lock"); @@ -1108,7 +1112,7 @@ linux_ioctl_unregister_handler(*lihp); SET_FOREACH(ldhp, linux_device_handler_set) 
linux_device_unregister_handler(*ldhp); - sx_destroy(&emul_lock); + mtx_destroy(&emul_lock); sx_destroy(&emul_shared_lock); sx_destroy(&futex_sx); EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag); --- sys/amd64/linux32/syscalls.master.orig +++ sys/amd64/linux32/syscalls.master @@ -117,7 +117,7 @@ 58 AUE_NULL UNIMPL ulimit 59 AUE_NULL STD { int linux_olduname(void); } 60 AUE_UMASK NOPROTO { int umask(int newmask); } -61 AUE_CHROOT NOPROTO { int chroot(char *path); } +61 AUE_CHROOT STD { int linux_chroot(char *path); } 62 AUE_NULL STD { int linux_ustat(l_dev_t dev, \ struct l_ustat *ubuf); } 63 AUE_DUP2 NOPROTO { int dup2(u_int from, u_int to); } @@ -331,7 +331,8 @@ 185 AUE_CAPSET STD { int linux_capset(void); } 186 AUE_NULL STD { int linux_sigaltstack(l_stack_t *uss, \ l_stack_t *uoss); } -187 AUE_SENDFILE STD { int linux_sendfile(void); } +187 AUE_SENDFILE STD { int linux_sendfile(int out, int in, l_long *offset, \ + l_size_t count); } 188 AUE_GETPMSG UNIMPL getpmsg 189 AUE_PUTPMSG UNIMPL putpmsg 190 AUE_VFORK STD { int linux_vfork(void); } @@ -406,16 +407,17 @@ 238 AUE_NULL STD { int linux_tkill(int tid, int sig); } 239 AUE_SENDFILE UNIMPL linux_sendfile64 240 AUE_NULL STD { int linux_sys_futex(void *uaddr, int op, int val, \ - struct l_timespec *timeout, void *uaddr2, int val3); } + struct l_timespec *timeout, void *uaddr2, int val3); } 241 AUE_NULL UNIMPL linux_sched_setaffinity -242 AUE_NULL UNIMPL linux_sched_getaffinity -243 AUE_NULL UNIMPL linux_set_thread_area +242 AUE_NULL STD { int linux_sched_getaffinity(l_pid_t pid, l_uint cpusetsize, \ + l_ulong *mask); } +243 AUE_NULL STD { int linux_set_thread_area(struct l_user_desc *desc); } 244 AUE_NULL UNIMPL linux_get_thread_area -245 AUE_NULL UNIMPL linux_io_setup -246 AUE_NULL UNIMPL linux_io_destroy -247 AUE_NULL UNIMPL linux_io_getevents -248 AUE_NULL UNIMPL inux_io_submit -249 AUE_NULL UNIMPL linux_io_cancel +245 AUE_NULL STD { int linux_io_setup(l_uint nr_reqs, linux_aio_context_t *ctxp); } +246 
AUE_NULL STD { int linux_io_destroy(linux_aio_context_t ctx); } +247 AUE_NULL STD { int linux_io_getevents(linux_aio_context_t ctx_id, l_long min_nr, l_long nr, struct linux_io_event *events, struct l_timespec *timeout); } +248 AUE_NULL STD { int linux_io_submit(linux_aio_context_t ctx_id, l_long nr, struct linux_iocb **iocbpp); } +249 AUE_NULL STD { int linux_io_cancel(linux_aio_context_t ctx_id, struct linux_iocb *iocb, struct linux_io_event *result); } 250 AUE_NULL STD { int linux_fadvise64(void); } 251 AUE_NULL UNIMPL 252 AUE_EXIT STD { int linux_exit_group(int error_code); } @@ -463,7 +465,8 @@ 292 AUE_NULL STD { int linux_inotify_add_watch(void); } 293 AUE_NULL STD { int linux_inotify_rm_watch(void); } 294 AUE_NULL STD { int linux_migrate_pages(void); } -295 AUE_NULL STD { int linux_openat(void); } +295 AUE_OPEN_RWTC STD { int linux_openat(l_int dfd, char *filename, \ + l_int flags, l_int mode); } 296 AUE_NULL STD { int linux_mkdirat(void); } 297 AUE_NULL STD { int linux_mknodat(void); } 298 AUE_NULL STD { int linux_fchownat(void); } --- sys/compat/linprocfs/linprocfs.c.orig +++ sys/compat/linprocfs/linprocfs.c @@ -525,7 +525,7 @@ state = 'R'; if (ratelimit == 0) { - printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zd, mapping to R\n", + printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zu, mapping to R\n", kp.ki_stat, sizeof(linux_state)); ++ratelimit; } --- /dev/null Sat Mar 17 17:03:38 2007 +++ sys/compat/linux/linux_aio.c Sat Mar 17 17:03:39 2007 @@ -0,0 +1,1254 @@ +/*- + * Copyright (c) 2006 Li, Xiao . All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +__FBSDID("$FreeBSD: src/sys/compat/linux/linux_aio.c,v 1.1 2006/10/15 14:22:13 netchild Exp $"); + +#include "opt_compat.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef COMPAT_LINUX32 +#include +#include +#else +#include +#include +#endif + +#define LINUX_AIO_DEBUG + +/* + * Linux Kernel Implementation of Asynchronous I/O + */ + +#ifdef LINUX_AIO_DEBUG + +/* Print arguments of syscall */ +#define DARGPRINTF(fmt, ...) printf("linux(%ld): %s("fmt")\n", \ + (long)td->td_proc->p_pid, __func__, __VA_ARGS__) +/* Print message in syscall function */ +#define DPRINTF(fmt, ...) printf(LMSG("%s(): " fmt), \ + __func__, __VA_ARGS__) +/* Print message in non-syscall function, the one more "P" means "private" */ +#define DPPRINTF(fmt, ...) printf("linux(): %s(): " fmt "\n", \ + __func__, __VA_ARGS__) + +#else + +#define DARGPRINTF(fmt, ...) 
+#define DPRINTF(fmt, ...)
+#define DPPRINTF(fmt, ...)
+
+#endif
+
+/*
+ * DATA STRUCTURE HIERARCHY
+ *
+ *                   +--------------------+      +--------------------+
+ * context_list ---> |      context       | ---> |      context       | ---> ...
+ * SLIST             |(owned by a process)|      |(owned by a process)|
+ *                   |                    |      |                    |
+ *                   |      ctx_req       |      |      ctx_req       |
+ *                   +----|---------------+      +----|---------------+
+ *                        | STAILQ                    | STAILQ
+ *                        v                           v
+ *                   +------------+              +------------+
+ *                   |  request   |              |  request   |
+ *                   |            |              |            |
+ *                   |.req_pbsd   |              |.req_pbsd   |
+ *                   |.req_porig  |              |.req_porig  |
+ *                   |.req_linux  |              |.req_linux  |
+ *                   |            |              |            |
+ *                   +------------+              +------------+
+ *                        |                           |
+ *                        v                           v
+ *                   +------------+              +------------+
+ *                   |  request   |              |  request   |
+ *                   |            |              |            |
+ *                   |.req_pbsd   |              |.req_pbsd   |
+ *                   |.req_porig  |              |.req_porig  |
+ *                   |.req_linux  |              |.req_linux  |
+ *                   |            |              |            |
+ *                   +------------+              +------------+
+ *                        |                           |
+ *                        v                           v
+ *                       ...                         ...
+ */
+
+struct linux_aio_context;
+
+/* One queued request; linked into its context's ctx_req tail queue. */
+struct linux_aio_request {
+	struct aiocb *req_pbsd; /* Userland clone for FreeBSD */
+	struct linux_iocb *req_porig; /* Userland original control block */
+	struct linux_iocb req_linux; /* Copy of original control block */
+	STAILQ_ENTRY(linux_aio_request) req_ctx_entry;
+};
+
+/* One AIO context; linked into the global linux_aio_context_list. */
+struct linux_aio_context {
+	struct sx ctx_sx; /* Taken by LINUX_AIO_CTX_LOCK()/UNLOCK() */
+	pid_t ctx_pid; /* Compared against the caller's p_pid on lookup */
+	struct linux_aio_ring *ctx_pring; /* Compared (as a context id) on lookup */
+	int ctx_nreq_max; /* Maximum request number */
+	int ctx_nreq_cur; /* Current request number */
+	STAILQ_HEAD(,linux_aio_request) ctx_req;
+	SLIST_ENTRY(linux_aio_context) ctx_list_entry;
+};
+static SLIST_HEAD(,linux_aio_context) linux_aio_context_list;
+
+/* Append a request to a context's queue and bump its request counter. */
+#define LINUX_AIO_REQ_HOOK(pctx, preq) { \
+	STAILQ_INSERT_TAIL(&((pctx)->ctx_req), (preq), req_ctx_entry); \
+	(pctx)->ctx_nreq_cur ++; \
+}
+
+/* Remove a request from a context's queue and drop its request counter. */
+#define LINUX_AIO_REQ_UNHOOK(pctx, preq) { \
+	STAILQ_REMOVE(&((pctx)->ctx_req), (preq), linux_aio_request, \
+	    req_ctx_entry); \
+	(pctx)->ctx_nreq_cur --; \
+}
+
+#define LINUX_AIO_REQ_FOREACH(pctx, preq) \
+	STAILQ_FOREACH((preq), &((pctx)->ctx_req), req_ctx_entry)
+
+#define LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) \
+	STAILQ_FOREACH_SAFE((preq), &((pctx)->ctx_req), req_ctx_entry, \
+	    (ptmpreq))
+
+/* Per-context lock: exclusive acquisition of the context's sx lock. */
+#define LINUX_AIO_CTX_LOCK(pctx) sx_xlock(&((pctx)->ctx_sx))
+
+#define LINUX_AIO_CTX_UNLOCK(pctx) sx_unlock(&((pctx)->ctx_sx))
+
+/*
+ * Insertion into, removal from and iteration over the global context list.
+ * None of these macros takes linux_aio_context_list_mtx itself.
+ */
+#define LINUX_AIO_CTX_HOOK(pctx) \
+	SLIST_INSERT_HEAD(&linux_aio_context_list, (pctx), ctx_list_entry)
+
+#define LINUX_AIO_CTX_UNHOOK(pctx) \
+	SLIST_REMOVE(&linux_aio_context_list, (pctx), \
+	    linux_aio_context, ctx_list_entry)
+
+#define LINUX_AIO_CTX_FOREACH(pctx) \
+	SLIST_FOREACH((pctx), &linux_aio_context_list, ctx_list_entry)
+
+#define LINUX_AIO_CTX_FOREACH_SAFE(pctx, ptmpctx) \
+	SLIST_FOREACH_SAFE((pctx), &linux_aio_context_list, \
+	    ctx_list_entry, (ptmpctx))
+
+/* True when the context has the given id (its ring address) and pid. */
+#define LINUX_AIO_CTX_MATCH(pctx, ctxid, pid) \
+	((linux_aio_context_t)(pctx)->ctx_pring == (ctxid) \
+	    && (pctx)->ctx_pid == (pid))
+
+static struct mtx linux_aio_context_list_mtx;
+
+#define LINUX_AIO_CTX_LIST_LOCK() mtx_lock(&linux_aio_context_list_mtx)
+
+#define LINUX_AIO_CTX_LIST_UNLOCK() mtx_unlock(&linux_aio_context_list_mtx)
+
+/*
+ * The following two macros are substantially identical to the two macros
+ * AIO_(UN)LOCK in /sys/kern/vfs_aio.c. Thus, the mutex must be unlocked
+ * before calling functions of the FreeBSD native AIO module.
+ *
+ * XXX
+ * I ASSUME the member "kaio_mtx" is the first element of "struct kaioinfo".
+ *
+ * NOTE(review): both macros call aio_init_aioinfo() when p_aioinfo is NULL,
+ * including the unlock path; whether that can happen after a matching
+ * LINUX_AIO_LOCK() is not visible from this file -- confirm.
+ */
+#define LINUX_AIO_LOCK(p) { \
+	if ((p)->p_aioinfo == NULL) \
+		aio_init_aioinfo(p); \
+	mtx_lock((struct mtx *)((p)->p_aioinfo)); \
+}
+
+#define LINUX_AIO_UNLOCK(p) { \
+	if ((p)->p_aioinfo == NULL) \
+		aio_init_aioinfo(p); \
+	mtx_unlock((struct mtx *)((p)->p_aioinfo)); \
+}
+
+static uma_zone_t linux_aio_context_zone, linux_aio_request_zone;
+
+static eventhandler_tag linux_aio_exit_tag;
+
+/*
+ * To backup pointers to the dummy implementation of these
+ * system calls faked by the macro DUMMY() in linux_dummy.c.
+ */
+/* Declares the file-local pointer p_dummy_linux_<s> used as the backup slot. */
+#define PREPARE_DUMMY_SYSCALL_BACKUP(s) \
+	static sy_call_t *p_dummy_linux_ ## s
+/* Saves the current linux_sysent handler for <s> and installs linux_<s>. */
+#define SHOW_REAL_SYSCALL(s) { \
+	p_dummy_linux_ ## s = linux_sysent[LINUX_SYS_linux_ ## s].sy_call; \
+	linux_sysent[LINUX_SYS_linux_ ## s].sy_call = \
+	    (sy_call_t *)(linux_ ## s); \
+}
+/*
+ * The concept of "scope": the functions linux_io_xxx defined in this file
+ * always mask/screen/override/prevent homonymous functions defined in
+ * any other files.
+ */
+/* Puts the handler saved by SHOW_REAL_SYSCALL(s) back into linux_sysent. */
+#define RESTORE_DUMMY_SYSCALL(s) { \
+	linux_sysent[LINUX_SYS_linux_ ## s].sy_call = p_dummy_linux_ ## s; \
+}
+
+PREPARE_DUMMY_SYSCALL_BACKUP(io_setup);
+PREPARE_DUMMY_SYSCALL_BACKUP(io_destroy);
+PREPARE_DUMMY_SYSCALL_BACKUP(io_getevents);
+PREPARE_DUMMY_SYSCALL_BACKUP(io_submit);
+PREPARE_DUMMY_SYSCALL_BACKUP(io_cancel);
+
+/*
+ * Substantially defined in linux_sysent.c.
+ * Also declared in linux_sysvec.c.
+ */
+extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
+
+/*
+ * Check that the user range [p, p + s) is both readable and writable.
+ *
+ * The range is walked in sizeof(buf)-byte steps; each piece is copied in
+ * and then copied straight back out to the same address.  Returns 0, or
+ * the first non-zero value returned by copyin()/copyout().
+ */
+static int user_mem_rw_verify(void *p, size_t s)
+{
+	char buf[256];
+	size_t i;
+	int nerr = 0;
+
+	for (i = 0; i < s; i += sizeof(buf)) {
+		/* Verify reading */
+		nerr = copyin((char *)p+i, buf, MIN(sizeof(buf), s-i));
+		if (nerr != 0)
+			break;
+
+		/* Verify writing */
+		nerr = copyout(buf, (char *)p+i, MIN(sizeof(buf), s-i));
+		if (nerr != 0)
+			break;
+	}
+
+	return (nerr);
+}
+
+/*
+ * Allocate memory in user space.
+ *
+ * Issues mmap() on behalf of td for s bytes of private anonymous
+ * read/write memory.  On success the mapped address is stored in *pp.
+ * td->td_retval[0] is saved before and restored after the call.
+ * Returns the value returned by mmap().
+ */
+static int user_malloc(struct thread *td, void **pp, size_t s)
+{
+	struct mmap_args mmaparg;
+	int nerr;
+	register_t r;
+
+	r = td->td_retval[0];
+
+	mmaparg.addr = NULL;
+	mmaparg.len = s;
+	mmaparg.prot = PROT_READ | PROT_WRITE;
+	mmaparg.flags = MAP_PRIVATE | MAP_ANON;
+	mmaparg.fd = -1;
+	mmaparg.pad = 0;
+	mmaparg.pos = 0;
+
+	nerr = mmap(td, &mmaparg);
+
+	if (nerr == 0) {
+		/* mmap() hands the new address back through td_retval[0] */
+		*pp = (void *)td->td_retval[0];
+		DPPRINTF("%lu bytes allocated at %p", (unsigned long)s, *pp);
+	}
+
+	td->td_retval[0] = r;
+
+	return (nerr);
+}
+
+/*
+ * Free memory in user space.
+ *
+ * Issues munmap() on behalf of td for the s bytes at p, preserving
+ * td->td_retval[0] across the call.  Returns the value returned by
+ * munmap().
+ */
+static int user_free(struct thread *td, void *p, size_t s)
+{
+	struct munmap_args munmaparg;
+	int nerr;
+	register_t r;
+
+	r = td->td_retval[0];
+
+	munmaparg.addr = p;
+	munmaparg.len = s;
+
+	nerr = munmap(td, &munmaparg);
+
+	td->td_retval[0] = r;
+	DPPRINTF("%lu bytes at %p", (unsigned long)s, p);
+
+	return (nerr);
+}
+
+#ifdef LINUX_AIO_DEBUG
+
+/*
+ * Debug helper: print the fields of a FreeBSD aiocb.
+ *
+ * When isuserland is non-zero piocb is a user address and the block is
+ * first copied in; otherwise piocb is dereferenced directly.  If the
+ * copyin() fails only the "Failure" header line is printed.
+ */
+static void linux_aio_dump_freebsd_aiocb(struct aiocb *piocb, int isuserland)
+{
+	struct aiocb localcb, *pcb;
+	int nerr = 0;
+
+	if (isuserland) {
+		nerr = copyin(piocb, &localcb, sizeof(localcb));
+		pcb = &localcb;
+	}
+	else
+		pcb = piocb;
+
+	DPPRINTF("Dump struct aiocb (%p, %s): %s",
+	    piocb, (isuserland?"userland":"kernel"),
+	    (nerr?"Failure":""));
+	if (!nerr) {
+		DPPRINTF("aio_fildes: %d",
+		    pcb->aio_fildes);
+		DPPRINTF("aio_offset: %lu",
+		    (unsigned long) pcb->aio_offset);
+		DPPRINTF("aio_buf: %p",
+		    pcb->aio_buf);
+		DPPRINTF("aio_nbytes: %lu",
+		    (unsigned long) pcb->aio_nbytes);
+		DPPRINTF("aio_lio_opcode: %d",
+		    pcb->aio_lio_opcode);
+		DPPRINTF("aio_reqprio: %d",
+		    pcb->aio_reqprio);
+		DPPRINTF("aio_sigevent.sigev_notify: %d",
+		    pcb->aio_sigevent.sigev_notify);
+		DPPRINTF("aio_sigevent.sigev_signo: %d",
+		    pcb->aio_sigevent.sigev_signo);
+	}
+}
+
+/* Both macros already end in ';', so a use followed by ';' yields an empty statement. */
+#define DUMP_FREEBSD_AIOCB(p, isu) linux_aio_dump_freebsd_aiocb((p), (isu));
+
+#define DUMP_TIMESPEC(f, t ,a) \
+	DPRINTF("%s%ld second + %ld nanosecond%s", \
+	    (f), (long)(t)->tv_sec, (long)(t)->tv_nsec, (a));
+
+#else /* !
LINUX_AIO_DEBUG */
+
+#define DUMP_FREEBSD_AIOCB(p, isu)
+#define DUMP_TIMESPEC(f, t, a)
+
+#endif /* LINUX_AIO_DEBUG */
+
+/*
+ * Translate a Linux control block (already copied into the kernel) into
+ * a FreeBSD aiocb.
+ *
+ * *pbsd is zeroed first.  Returns 0 on success, or EINVAL when
+ * plnx->aio_lio_opcode is not one of the opcodes handled below; in that
+ * case *pbsd is left partially filled in and must not be used.
+ */
+static int iocb_reformat(struct linux_iocb *plnx, struct aiocb *pbsd)
+{
+	int nerr = 0;
+
+	bzero(pbsd, sizeof(*pbsd));
+
+	pbsd->aio_fildes = plnx->aio_fildes; /* File descriptor */
+	pbsd->aio_offset = plnx->aio_offset; /* File offset for I/O */
+	pbsd->aio_buf = (void *)(unsigned long) plnx->aio_buf; /*
+	    * User space
+	    * I/O buffer
+	    */
+	pbsd->aio_nbytes = plnx->aio_nbytes; /* Number of bytes for I/O */
+	switch (plnx->aio_lio_opcode) { /* LIO opcode */
+	case LINUX_IOCB_CMD_PREAD:
+		pbsd->aio_lio_opcode = LIO_READ;
+		break;
+	case LINUX_IOCB_CMD_PWRITE:
+		pbsd->aio_lio_opcode = LIO_WRITE;
+		break;
+	case LINUX_IOCB_CMD_FSYNC:
+	case LINUX_IOCB_CMD_FDSYNC:
+		pbsd->aio_lio_opcode = LIO_SYNC;
+		break;
+#if 0
+	case LINUX_IOCB_CMD_PREADX:
+		break;
+	case LINUX_IOCB_CMD_POLL:
+		break;
+#endif
+	case LINUX_IOCB_CMD_NOOP:
+		pbsd->aio_lio_opcode = LIO_NOP;
+		break;
+	default:
+		nerr = EINVAL;
+		break;
+	}
+	if (nerr != 0) {
+		DPPRINTF("Unsupported aio_lio_opcode: %u",
+		    (unsigned)plnx->aio_lio_opcode);
+		return (nerr);
+	}
+	pbsd->aio_reqprio = plnx->aio_reqprio; /* Request priority */
+	pbsd->aio_sigevent.sigev_notify = SIGEV_NONE; /* No signal to deliver */
+	pbsd->aio_sigevent.sigev_signo = 0; /* No signal to deliver */
+
+	return (nerr);
+}
+
+/*
+ * Linux system call io_setup(2)
+ *
+ * Creates a context able to hold args->nr_reqs requests and stores its id
+ * (the user address of a freshly mapped "ring") in *args->ctxp, which
+ * must contain 0 on entry.
+ *
+ * Returns 0 on success; EINVAL for a non-positive nr_reqs or a non-zero
+ * *ctxp; ENOMEM when the request would exceed max_aio_queue_per_proc or
+ * max_aio_queue_count; otherwise the error from the user-memory probe,
+ * copyin() or user_malloc().
+ */
+int linux_io_setup(struct thread *td, struct linux_io_setup_args *args)
+{
+	struct proc *p;
+	struct linux_aio_ring *pring, ring;
+	struct linux_aio_context *pctx = NULL, *ptmpctx;
+	linux_aio_context_t ctx_id;
+	int nerr = 0, nr, nrall, nq, arg_nr_reqs;
+
+	DARGPRINTF("%u, %p", args->nr_reqs, args->ctxp);
+
+	/* Signed integer is a little safer than unsigned */
+	arg_nr_reqs = args->nr_reqs;
+	if (arg_nr_reqs <= 0)
+		return (EINVAL);
+
+	if (arg_nr_reqs > max_aio_queue_per_proc
+	    || arg_nr_reqs > max_aio_queue_count) {
+		printf(LMSG("linux_io_setup(): Please increase sysctls "
+		    "vfs.aio.max_aio_queue_per_proc "
+		    "and/or vfs.aio.max_aio_queue. "));
+		return (ENOMEM);
+	}
+
+	nerr = user_mem_rw_verify(args->ctxp, sizeof(*(args->ctxp)));
+	if (nerr != 0)
+		return (nerr);
+
+	/*
+	 * The probe above does not guarantee this copyin() succeeds (the
+	 * mapping may change in between), so check it: otherwise ctx_id
+	 * would be tested while still uninitialized.
+	 */
+	nerr = copyin(args->ctxp, &ctx_id, sizeof(ctx_id));
+	if (nerr != 0)
+		return (nerr);
+	if (ctx_id != 0) /* "Not initialized", described by io_setup(2) */
+		return (EINVAL);
+
+	p = td->td_proc;
+
+	/* Get a new "ring" */
+	nerr = user_malloc(td, (void **)&pring, sizeof(*pring));
+	if (nerr != 0)
+		return (nerr);
+
+	/* Get a new context */
+	pctx = uma_zalloc(linux_aio_context_zone, M_WAITOK);
+
+	LINUX_AIO_CTX_LIST_LOCK();
+
+	/* Count request capacity of all contexts belonging to this process */
+	nr = 0;
+	nrall = 0;
+	nq = 0;
+	LINUX_AIO_CTX_FOREACH(ptmpctx) {
+		if (ptmpctx->ctx_pid == p->p_pid) {
+			nr += ptmpctx->ctx_nreq_max;
+			nq ++;
+		}
+		nrall += ptmpctx->ctx_nreq_max;
+	}
+	DPRINTF("%d queues of %d requests totally allocated for this process, "
+	    "%d requests' total capacity for the whole system",
+	    nq, nr, nrall);
+
+	/* Check whether there are enough resources for requested queue */
+	if (arg_nr_reqs > max_aio_queue_per_proc - nr
+	    || arg_nr_reqs > max_aio_queue_count - nrall) {
+		printf(LMSG("linux_io_setup(): "
+		    "Please increase sysctls "
+		    "vfs.aio.max_aio_queue_per_proc "
+		    "and/or vfs.aio.max_aio_queue. "
+		    "Besides %d queues of %d requests totally "
+		    "for this process, and %d requests' queues "
+		    "totally for the whole system, "
+		    "this Linux application needs one more "
+		    "AIO queue of %d requests' capacity."),
+		    nq, nr, nrall, arg_nr_reqs);
+		LINUX_AIO_CTX_LIST_UNLOCK();
+		DPRINTF("Free context %p", pctx);
+		uma_zfree(linux_aio_context_zone, pctx);
+		user_free(td, pring, sizeof(*pring));
+		return (ENOMEM);
+	}
+
+	/* Initialize the new context */
+	sx_init(&(pctx->ctx_sx), "linux_aio_context");
+	pctx->ctx_pid = p->p_pid;
+	pctx->ctx_pring = pring;
+	pctx->ctx_nreq_max = arg_nr_reqs;
+	pctx->ctx_nreq_cur = 0;
+	STAILQ_INIT(&(pctx->ctx_req));
+
+	/* Hook the new context to global context list */
+	LINUX_AIO_CTX_HOOK(pctx);
+
+	LINUX_AIO_CTX_LIST_UNLOCK();
+
+	/* Initialize the new "ring" */
+	DPRINTF("initialize the \"ring\" %p", pring);
+	bzero(&ring, sizeof(ring));
+	ring.ring_id = 1;
+	ring.ring_nr = arg_nr_reqs;
+	ring.ring_head = 0;
+	ring.ring_tail = 1;
+	ring.ring_magic = LINUX_AIO_RING_MAGIC;
+	ring.ring_compat_features = LINUX_AIO_RING_COMPAT_FEATURES;
+	ring.ring_incompat_features = LINUX_AIO_RING_INCOMPAT_FEATURES;
+	ring.ring_header_length = sizeof(ring);
+	/*
+	 * NOTE(review): the results of the two copyout() calls below are
+	 * ignored.  The context is already on the global list at this
+	 * point, so reporting a failure would also need it to be torn
+	 * down again; left as is pending a decision on that.
+	 */
+	copyout(&ring, pring, sizeof(ring)); /* It has been hooked before */
+
+	/* Substantial return value */
+	ctx_id = (linux_aio_context_t)pctx->ctx_pring;
+	copyout(&ctx_id, args->ctxp, sizeof(ctx_id));
+	DPRINTF("returned context: %lx -> %p", (unsigned long)ctx_id, pctx);
+
+	return (nerr);
+}
+
+/* Linux system call io_destroy(2) */
+int linux_io_destroy(struct thread *td, struct linux_io_destroy_args *args)
+{
+	int nerr = 0;
+	struct proc *p;
+	struct linux_aio_context *pctx;
+	struct linux_aio_request *preq, *ptmpreq;
+	struct aio_cancel_args cancelargs;
+	struct aio_return_args aioretargs;
+
+	DARGPRINTF("%lx", (unsigned long)args->ctx);
+
+	p = td->td_proc;
+
+	/*
+	 * Locking:
+	 *
+	 * LINUX_AIO_LOCK(p); <----------------+
+	 * ... |
+	 * LINUX_AIO_CTX_LIST_LOCK(); <--+ |
+	 * ...
| |
+	 * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ |
+	 * ... |
+	 * LINUX_AIO_CTX_LOCK(pctx); <---------|---+
+	 * LINUX_AIO_UNLOCK(p); <----------------+ |
+	 * ... |
+	 * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+
+	 */
+
+	LINUX_AIO_LOCK(p);
+
+	/*
+	 * Find the context in context list and unhook it in the same
+	 * critical section.  The list is global (other processes walk it
+	 * under linux_aio_context_list_mtx), so removing the entry after
+	 * the mutex has been dropped would race with those walkers.
+	 */
+	LINUX_AIO_CTX_LIST_LOCK();
+	LINUX_AIO_CTX_FOREACH(pctx) {
+		if (LINUX_AIO_CTX_MATCH(pctx, args->ctx, p->p_pid))
+			break;
+	}
+	if (pctx != NULL) {
+		LINUX_AIO_CTX_UNHOOK(pctx);
+	}
+	LINUX_AIO_CTX_LIST_UNLOCK();
+
+	/* Unable to find the context */
+	if (pctx == NULL) {
+		LINUX_AIO_UNLOCK(p);
+		return (EINVAL);
+	}
+
+	DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx, pctx);
+	DPRINTF("Unhook context %p", pctx);
+
+	/*
+	 * NOTE(review): an sx lock is acquired here while the mutex taken
+	 * by LINUX_AIO_LOCK() is still held; confirm this ordering is
+	 * acceptable for the lock classes involved.
+	 */
+	LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */
+	LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */
+
+	/* Real cleanup: cancel, reap and free every request still queued */
+	LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {
+		DPRINTF("Cancel request (Linux: %p, FreeBSD: %p)",
+		    preq->req_porig, preq->req_pbsd);
+
+		/* Cancel FreeBSD native clone */
+		cancelargs.fd = preq->req_linux.aio_fildes;
+		cancelargs.aiocbp = preq->req_pbsd;
+		aio_cancel(td, &cancelargs);
+		DPRINTF("aio_cancel() returned %ld", (long)td->td_retval[0]);
+		if (td->td_retval[0] == AIO_NOTCANCELED)
+			printf(LMSG("linux_io_destroy(): Asynchronous IO "
+			    "request (Linux: %p, FreeBSD: %p) "
+			    "cannot be cancelled. "
+			    "***** Both User Space "
+			    "and Kernel Memory Leaked! *****"),
+			    preq->req_porig, preq->req_pbsd);
+
+		LINUX_AIO_REQ_UNHOOK(pctx, preq);
+
+		if (td->td_retval[0] == AIO_ALLDONE) {
+			/* Already finished: reap its status */
+			aioretargs.aiocbp = preq->req_pbsd;
+			aio_return(td, &aioretargs);
+			DPRINTF("aio_return(%p) returned %ld",
+			    aioretargs.aiocbp,
+			    (long)td->td_retval[0]);
+
+			/* aio_return() overwrote the cancel status */
+			td->td_retval[0] = AIO_ALLDONE;
+		}
+
+		/* Free user space clone of the request */
+		if (td->td_retval[0] != AIO_NOTCANCELED) /*
+		    * XXX How to avoid
+		    * memory leak here?
+		    */
+			user_free(td, preq->req_pbsd,
+			    sizeof(*(preq->req_pbsd)));
+
+		/* Free kernel structure of the request */
+		uma_zfree(linux_aio_request_zone, preq);
+
+		td->td_retval[0] = 0;
+	}
+
+	LINUX_AIO_CTX_UNLOCK(pctx);
+
+	sx_destroy(&(pctx->ctx_sx));
+
+	/* Free the "ring" */
+	DPRINTF("free the \"ring\" %p", pctx->ctx_pring);
+	user_free(td, pctx->ctx_pring, sizeof(*pctx->ctx_pring));
+
+	/* Free destroyed context */
+	uma_zfree(linux_aio_context_zone, pctx);
+
+	return (nerr);
+}
+
+/* Linux system call io_getevents(2) */
+int linux_io_getevents(struct thread *td, struct linux_io_getevents_args *args)
+{
+	int i, j, nerr = 0;
+	struct proc *p;
+	struct l_timespec l_timeout;
+	struct timespec timeout, *u_ptimeout, t1, t2;
+	struct linux_aio_context *pctx;
+	struct linux_aio_request *preq, *ptmpreq;
+	struct linux_io_event evt;
+	struct aio_return_args aioretargs;
+	struct aio_error_args aioerrargs;
+	register_t aio_ret, aio_err;
+	struct aiocb ** u_aiocbp;
+	struct aio_suspend_args aiosusargs;
+
+	DARGPRINTF("%lx, %ld, %ld, %p, %p",
+	    (unsigned long) args->ctx_id,
+	    (long)args->min_nr, (long)args->nr,
+	    args->events, args->timeout);
+
+	if (args->nr <= 0)
+		return (EINVAL);
+
+	if (args->min_nr < 0)
+		return (EINVAL);
+
+	nerr = user_mem_rw_verify(args->events,
+	    sizeof(*(args->events)) * args->nr);
+	if (nerr != 0)
+		return (nerr);
+
+	if (args->timeout != NULL) {
+		nerr = copyin(args->timeout, &l_timeout, sizeof(l_timeout));
+		if (nerr != 0)
+			return (nerr);
+		timeout.tv_sec = l_timeout.tv_sec;
+		timeout.tv_nsec = l_timeout.tv_nsec;
+		DUMP_TIMESPEC("User specified timeout: ", &timeout, "");
+	}
+
+	p = td->td_proc;
+
+	/*
+	 * Locking:
+	 *
+	 * LINUX_AIO_LOCK(p); <----------------+
+	 * ... |
+	 * LINUX_AIO_CTX_LIST_LOCK(); <--+ |
+	 * ... | |
+	 * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ |
+	 * ... |
+	 * LINUX_AIO_CTX_LOCK(pctx); <---------|---+
+	 * LINUX_AIO_UNLOCK(p); <----------------+ |
+	 * ...
|
+	 * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+
+	 */
+
+	LINUX_AIO_LOCK(p);
+
+	/* Find the context in context list */
+	LINUX_AIO_CTX_LIST_LOCK();
+	LINUX_AIO_CTX_FOREACH(pctx) {
+		if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid))
+			break;
+	}
+	LINUX_AIO_CTX_LIST_UNLOCK();
+
+	/* Unable to find the context */
+	if (pctx == NULL) {
+		LINUX_AIO_UNLOCK(p);
+		return (EINVAL);
+	}
+
+	DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx);
+
+	LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */
+	LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */
+
+	if (STAILQ_EMPTY(&(pctx->ctx_req))) {
+		td->td_retval[0] = 0; /* No queued request */
+		DPRINTF("No request in queue (context: %p) at all, "
+		    "return directly", pctx);
+	} else { /* Deal with the request queue */
+		i = 0; /*
+		    * This variable's value will be the return value
+		    * of linux_io_getevents()
+		    */
+
+		/* User-space scratch array handed to aio_suspend() */
+		nerr = user_malloc(td, (void **)&u_aiocbp,
+		    sizeof(*u_aiocbp) * pctx->ctx_nreq_max);
+		if (nerr != 0)
+			goto skip_substantial_0;
+
+		/* User-space copy of the remaining timeout for aio_suspend() */
+		nerr = user_malloc(td, (void **)&u_ptimeout,
+		    sizeof(*u_ptimeout));
+		if (nerr != 0)
+			goto skip_substantial_1;
+
+		for (i = 0;i < args->nr;) {
+
+			/* Collecting finished requests and waiting for queued requests */
+
+			LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {
+
+				/* Collect all finished requests */
+
+				if (i >= args->nr) /* Full */
+					break;
+
+				aioerrargs.aiocbp = preq->req_pbsd;
+				aio_error(td, &aioerrargs);
+				aio_ret = td->td_retval[0];
+				td->td_retval[0] = 0;
+
+				DPRINTF("aio_error(%p) (Linux: %p) "
+				    "returned %ld%s",
+				    aioerrargs.aiocbp,
+				    preq->req_porig,
+				    (long)aio_ret,
+				    aio_ret == EINPROGRESS ?
+				    "(EINPROGRESS)" : "" );
+
+				if (aio_ret == EINPROGRESS)
+					continue;
+
+				/* Done */
+				LINUX_AIO_REQ_UNHOOK(pctx, preq);
+
+				aioretargs.aiocbp = preq->req_pbsd;
+				aio_err = aio_return(td, &aioretargs);
+				aio_ret = td->td_retval[0];
+				td->td_retval[0] = 0;
+
+				DPRINTF("aio_return(%p) (Linux: %p) "
+				    "returned %ld, errno=%ld",
+				    aioretargs.aiocbp,
+				    preq->req_porig,
+				    (long)aio_ret,
+				    (long)aio_err);
+
+				evt.data = preq->req_linux.aio_data;
+				evt.obj = (uint64_t)(unsigned long)
+				    preq->req_porig;
+				if (aio_ret >= 0) {
+					/* Normal return (success) */
+					evt.res = aio_ret;
+				} else { /* Error code (failure) */
+					/*
+					 * Translate FreeBSD error code
+					 * to Linux's
+					 */
+					evt.res =
+					    p->p_sysent->sv_errtbl[aio_err];
+				}
+				DPRINTF("context %p (Linux: %p): "
+				    "io_event.res=%lld",
+				    preq->req_pbsd,
+				    preq->req_porig,
+				    (long long)evt.res);
+				evt.res2 = 0;
+
+				copyout(&evt, &(args->events[i]), sizeof(evt));
+
+				/*
+				 * The request has been reaped by
+				 * aio_return(), so its userland clone
+				 * (mapped by linux_io_submit()) is no
+				 * longer needed; unmap it here as
+				 * linux_io_destroy() does, otherwise one
+				 * mapping leaks per completed request.
+				 */
+				user_free(td, preq->req_pbsd,
+				    sizeof(*(preq->req_pbsd)));
+
+				uma_zfree(linux_aio_request_zone, preq);
+
+				i ++;
+			} /* End of collecting all finished requests */
+
+			if (STAILQ_EMPTY(&(pctx->ctx_req))) {
+				/* No request remained in this context */
+				DPRINTF("returning(context %p): "
+				    "request queue is empty",
+				    pctx);
+				break;
+			}
+
+			if (i >= args->nr) { /* Full */
+				DPRINTF("returning(context %p): user space "
+				    "event array is full",
+				    pctx);
+				break;
+			}
+
+			if (i >= args->min_nr) {
+				/* Met the minimum requirement */
+				DPRINTF("returning(context %p): "
+				    "met the minimum requirement",
+				    pctx);
+				break;
+			}
+
+			if (args->timeout != NULL) {
+				if (! timespecisset(&timeout)) { /* Timed out */
+					DPRINTF("returning(context %p): "
+					    "no time remaining",
+					    pctx);
+					break;
+				}
+			}
+
+			if (args->timeout != NULL) {
+				nanouptime(&t1); /* Time before aio_suspend() */
+				DUMP_TIMESPEC("T1: ", &t1,
+				    " (uptime before calling aio_suspend())");
+			}
+
+			/* Prepare arguments for aio_suspend() */
+			j = 0;
+			LINUX_AIO_REQ_FOREACH(pctx, preq) {
+				copyout(&(preq->req_pbsd), &(u_aiocbp[j]),
+				    sizeof(preq->req_pbsd));
+				j ++;
+			}
+			MPASS(j == pctx->ctx_nreq_cur);
+			aiosusargs.aiocbp = u_aiocbp;
+			aiosusargs.nent = j;
+
+			if (args->timeout != NULL) {
+				copyout(&timeout, u_ptimeout, sizeof(timeout));
+				aiosusargs.timeout = u_ptimeout;
+				DUMP_TIMESPEC("Time remained: ", &timeout, "");
+			} else {
+				aiosusargs.timeout = NULL;
+			}
+
+			aio_err = aio_suspend(td, &aiosusargs);
+			DPRINTF("aio_suspend(%p, %d, %p) returned %ld",
+			    aiosusargs.aiocbp, aiosusargs.nent,
+			    aiosusargs.timeout, (long)aio_err);
+
+			if (args->timeout != NULL) {
+				nanouptime(&t2); /* Time after aio_suspend() */
+				DUMP_TIMESPEC("T2: ", &t2,
+				    " (uptime after calling aio_suspend())");
+				timespecsub(&t2, &t1); /*
+				    * Time spent by
+				    * aio_suspend()
+				    */
+				DUMP_TIMESPEC("T_delta: ", &t2,
+				    " (time spent by calling aio_suspend())");
+				if (timespeccmp(&t2, &timeout, >=)) {
+					timespecclear(&timeout); /* Timed out */
+				} else {
+					timespecsub(&timeout, &t2);
+					/* Time remaining */
+				}
+				DUMP_TIMESPEC("Time remained: ", &timeout, "");
+			}
+
+			if (aio_err == EAGAIN) { /* Timed out */
+				DPRINTF("returning(context %p): "
+				    "timed out after calling aio_suspend()",
+				    pctx);
+				break;
+			}
+		} /*
+		    * End of collecting finished requests
+		    * and waiting for queued requests
+		    */
+
+		/*
+		 * Report the remaining time back to the caller.  Only do
+		 * so when a timeout was supplied: otherwise "timeout" was
+		 * never initialized and there is nowhere to write to.
+		 */
+		if (args->timeout != NULL) {
+			l_timeout.tv_sec = timeout.tv_sec;
+			l_timeout.tv_nsec = timeout.tv_nsec;
+			copyout(&l_timeout, args->timeout, sizeof(l_timeout));
+			/* No matter whether successfully or not */
+		}
+
+		/*
+		 * Release the scratch mappings.  Their results are
+		 * deliberately not stored in nerr: doing so would overwrite
+		 * the user_malloc() error on the skip_substantial_1 path and
+		 * would turn a cleanup failure into a syscall failure after
+		 * events have already been delivered.
+		 */
+		(void)user_free(td, u_ptimeout, sizeof(*u_ptimeout));
+skip_substantial_1:
+		(void)user_free(td, u_aiocbp,
+		    sizeof(*u_aiocbp) * pctx->ctx_nreq_max);
+skip_substantial_0:
+		/* Publish the event count only after the cleanup above */
+		td->td_retval[0] = i;
+		DPRINTF("%d requests are unhooked from the context %p", i, pctx);
+	} /* End of dealing with request queue */
+
+	LINUX_AIO_CTX_UNLOCK(pctx);
+
+	return (nerr);
+}
+
+/* Linux system call io_submit(2) */
+int linux_io_submit(struct thread *td, struct linux_io_submit_args *args)
+{
+	int i, nerr = 0;
+	struct proc *p;
+	struct linux_aio_context *pctx;
+	struct linux_aio_request req, *preq;
+	struct linux_iocb *porig;
+	struct aiocb iocb, *piocb;
+
+	DARGPRINTF("%lx, %ld, %p", (unsigned long)args->ctx_id, (long)args->nr, args->iocbpp);
+
+	if (args->nr <= 0)
+		return (EINVAL);
+
+	p = td->td_proc;
+
+	/*
+	 * Locking:
+	 *
+	 * LINUX_AIO_LOCK(p); <----------------+
+	 * ... |
+	 * LINUX_AIO_CTX_LIST_LOCK(); <--+ |
+	 * ... | |
+	 * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ |
+	 * ... |
+	 * LINUX_AIO_CTX_LOCK(pctx); <---------|---+
+	 * LINUX_AIO_UNLOCK(p); <----------------+ |
+	 * ... |
+	 * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+
+	 */
+
+	LINUX_AIO_LOCK(p);
+
+	/* Find the context in context list */
+	LINUX_AIO_CTX_LIST_LOCK();
+	LINUX_AIO_CTX_FOREACH(pctx) {
+		if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid))
+			break;
+	}
+	LINUX_AIO_CTX_LIST_UNLOCK();
+
+	/* Unable to find the context */
+	if (pctx == NULL) {
+		LINUX_AIO_UNLOCK(p);
+		return (EINVAL);
+	}
+
+	DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx);
+
+	LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */
+	LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */
+
+	for (i = 0; pctx->ctx_nreq_cur < pctx->ctx_nreq_max && i < args->nr;
+	    i ++) {
+		/* Get user space Linux control block */
+		nerr = copyin(&(args->iocbpp[i]), &porig, sizeof(porig));
+		if (nerr != 0)
+			break;
+		nerr = copyin(porig, &(req.req_linux), sizeof(req.req_linux));
+		if (nerr != 0)
+			break;
+
+		/* Create user space FreeBSD control block clone */
+		nerr = iocb_reformat(&(req.req_linux), &iocb);
+		if (nerr
!= 0) + break; + nerr = user_malloc(td, (void **)&piocb, sizeof(*piocb)); + if (nerr != 0) + break; + nerr = copyout(&iocb, piocb, sizeof(iocb)); + if (nerr != 0) + break; + DUMP_FREEBSD_AIOCB(piocb, 1); + + /* Submit user space control block */ + nerr = aio_aqueue(td, piocb, NULL, iocb.aio_lio_opcode, 0); + if (nerr != 0) { + user_free(td, piocb, sizeof(*piocb)); + break; + } + + req.req_porig = porig; + req.req_pbsd = piocb; + + /* Hook request to the context */ + preq = uma_zalloc(linux_aio_request_zone, M_WAITOK); + memcpy(preq, &req, sizeof(req)); + DPRINTF("Linux IOCB %p (aio_lio_opcode=%u, aio_fildes=%u), " + "FreeBSD IOCB %p", + preq->req_porig, + (unsigned)preq->req_linux.aio_lio_opcode, + (unsigned)preq->req_linux.aio_fildes, + preq->req_pbsd); + LINUX_AIO_REQ_HOOK(pctx, preq); + } + + LINUX_AIO_CTX_UNLOCK(pctx); + + if (i > 0) { + td->td_retval[0] = i; + nerr = 0; + } + + if (i == 0 && nerr == 0) + nerr = EAGAIN; /* No request is successfully submitted */ + + return (nerr); +} + +/* Linux system call io_cancel(2) */ +int linux_io_cancel(struct thread *td, struct linux_io_cancel_args *args) +{ + int nerr = 0; + struct proc *p; + struct linux_iocb lcb; + struct linux_aio_context *pctx; + struct linux_aio_request *preq; + struct linux_io_event evt; + struct aio_cancel_args aiocnclargs; + + DARGPRINTF("%lx, %p, %p", (unsigned long)args->ctx_id, args->iocb, args->result); + + nerr = copyin(args->iocb, &lcb, sizeof(lcb)); + if (nerr != 0) + return (nerr); + + nerr = user_mem_rw_verify(args->result, sizeof(*(args->result))); + if (nerr != 0) + return (nerr); + + p = td->td_proc; + + /* + * Locking: + * + * LINUX_AIO_LOCK(p); <----------------+ + * ... | + * LINUX_AIO_CTX_LIST_LOCK(); <--+ | + * ... | | + * LINUX_AIO_CTX_LIST_UNLOCK(); <--+ | + * ... | + * LINUX_AIO_CTX_LOCK(pctx); <---------|---+ + * LINUX_AIO_UNLOCK(p); <----------------+ | + * ... 
| + * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+ + */ + + LINUX_AIO_LOCK(p); + + /* Find the context in context list */ + LINUX_AIO_CTX_LIST_LOCK(); + LINUX_AIO_CTX_FOREACH(pctx) { + if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid)) + break; + } + LINUX_AIO_CTX_LIST_UNLOCK(); + + /* Unable to find the context */ + if (pctx == NULL) { + LINUX_AIO_UNLOCK(p); + return (EINVAL); + } + + DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx); + + LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */ + LINUX_AIO_UNLOCK(p); /* XXX Interlaced, seamless */ + + LINUX_AIO_REQ_FOREACH(pctx, preq) { + if (preq->req_porig == args->iocb + && preq->req_linux.aio_key == lcb.aio_key) + break; + } + + if (preq == NULL) { + DPRINTF("Unable to find IO control block %p", args->iocb); + nerr = EINVAL; + } else { /* Found the request in context */ + DPRINTF("Cancel request (Linux: %p, FreeBSD: %p)", + preq->req_porig, preq->req_pbsd); + + /* Cancel FreeBSD native clone */ + aiocnclargs.fd = preq->req_linux.aio_fildes; + aiocnclargs.aiocbp = preq->req_pbsd; + aio_cancel(td, &aiocnclargs); + DPRINTF("aio_cancel() returned %ld", (long)td->td_retval[0]); + + if (td->td_retval[0] == AIO_CANCELED) { + /* Cancellation succeeded */ + LINUX_AIO_REQ_UNHOOK(pctx, preq); + + evt.data = preq->req_linux.aio_data; + evt.obj = (uint64_t)(unsigned long) preq->req_porig; + evt.res = p->p_sysent->sv_errtbl[ECANCELED]; + evt.res2 = 0; + + /* Fill in user space structure linux_io_event */ + copyout(&evt, args->result, sizeof(evt)); + + /* Free user space clone of the request */ + user_free(td, preq->req_pbsd, + sizeof(*(preq->req_pbsd))); + + /* Free kernel structure of the request */ + uma_zfree(linux_aio_request_zone, preq); + } else if (td->td_retval[0] == AIO_ALLDONE) { + nerr = EINVAL; /* + * This value of Linux 2.6.15 + * is really confusing !!! 
+ */ + } else { /* AIO_NOTCANCELED */ + nerr = EAGAIN; + } + + td->td_retval[0] = 0; + } + + LINUX_AIO_CTX_UNLOCK(pctx); + + return (nerr); +} + +static void linux_aio_proc_rundown(void *arg, struct proc *p) +{ + struct linux_aio_context *pctx, *ptmpctx; + struct linux_aio_request *preq, *ptmpreq; + + /* + * FreeBSD module "aio" can do more essential native cleanup + * (i.e. cancelling all queued requests) itself. + */ + + LINUX_AIO_CTX_LIST_LOCK(); + + LINUX_AIO_CTX_FOREACH_SAFE(pctx, ptmpctx) { + if (pctx->ctx_pid == p->p_pid) { + LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) { + DPPRINTF("Free request %p from context %p " + "(ring: %p)", + preq, pctx, pctx->ctx_pring); + LINUX_AIO_REQ_UNHOOK(pctx, preq); + uma_zfree(linux_aio_request_zone, preq); + } + + DPPRINTF("Free context %p (ring: %p)", + pctx, pctx->ctx_pring); + + /* Unhook it from context list */ + LINUX_AIO_CTX_UNHOOK(pctx); + + /* Free it really */ + sx_destroy(&(pctx->ctx_sx)); + uma_zfree(linux_aio_context_zone, pctx); + + DPPRINTF("The remaining context list is %s", + (SLIST_EMPTY(&linux_aio_context_list) ? + "empty":"not empty")); + } + } + + LINUX_AIO_CTX_LIST_UNLOCK(); +} + +/* + * Module constructor/destructor + */ +static int +linux_aio_modload(struct module *module, int cmd, void *arg) +{ + int nerr = 0; + + switch (cmd) { + case MOD_LOAD: + linux_aio_context_zone = uma_zcreate("LINUXAIOCTX", + sizeof(struct linux_aio_context), + NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + linux_aio_request_zone = uma_zcreate("LINUXAIOREQ", + sizeof(struct linux_aio_request), + NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + mtx_init(&linux_aio_context_list_mtx, + "linux_aio_context_list", NULL, MTX_DEF); + SLIST_INIT(&linux_aio_context_list); + linux_aio_exit_tag = EVENTHANDLER_REGISTER(process_exit, + linux_aio_proc_rundown, + NULL, EVENTHANDLER_PRI_ANY); + + /* + * Backup pointers to the dummy implementation of these + * system calls faked by the macro DUMMY() in linux_dummy.c. 
+ * And then, show real system calls. + */ + SHOW_REAL_SYSCALL(io_setup); + SHOW_REAL_SYSCALL(io_destroy); + SHOW_REAL_SYSCALL(io_getevents); + SHOW_REAL_SYSCALL(io_submit); + SHOW_REAL_SYSCALL(io_cancel); + break; + case MOD_UNLOAD: + LINUX_AIO_CTX_LIST_LOCK(); + if (!SLIST_EMPTY(&linux_aio_context_list)) { + nerr = EBUSY; + LINUX_AIO_CTX_LIST_UNLOCK(); + break; + } + EVENTHANDLER_DEREGISTER(process_exit, linux_aio_exit_tag); + LINUX_AIO_CTX_LIST_UNLOCK(); + mtx_destroy(&linux_aio_context_list_mtx); + uma_zdestroy(linux_aio_request_zone); + uma_zdestroy(linux_aio_context_zone); + + /* + * Restore pointers to the dummy implementation of these + * system calls faked by the macro DUMMY() in linux_dummy.c. + */ + RESTORE_DUMMY_SYSCALL(io_setup); + RESTORE_DUMMY_SYSCALL(io_destroy); + RESTORE_DUMMY_SYSCALL(io_getevents); + RESTORE_DUMMY_SYSCALL(io_submit); + RESTORE_DUMMY_SYSCALL(io_cancel); + break; + case MOD_SHUTDOWN: + break; + default: + nerr = EINVAL; + break; + } + return (nerr); +} + +static moduledata_t linux_aio_mod = { + "linuxaio", + &linux_aio_modload, + NULL +}; + +DECLARE_MODULE(linuxaio, linux_aio_mod, SI_SUB_VFS, SI_ORDER_ANY); +MODULE_DEPEND(linuxaio, aio, 1, 1, 1); +MODULE_DEPEND(linuxaio, linux, 1, 1, 1); --- /dev/null Sat Mar 17 17:03:38 2007 +++ sys/compat/linux/linux_aio.h Sat Mar 17 17:03:39 2007 @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2006 Li, Xiao . All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/compat/linux/linux_aio.h,v 1.1 2006/10/15 14:22:13 netchild Exp $ + */ + +/* + * Linux Kernel Implementation of Asynchronous I/O + */ + +#ifndef _LINUX_AIO_H_ +#define _LINUX_AIO_H_ + +typedef unsigned long linux_aio_context_t; + +enum { + LINUX_IOCB_CMD_PREAD = 0, + LINUX_IOCB_CMD_PWRITE = 1, + LINUX_IOCB_CMD_FSYNC = 2, + LINUX_IOCB_CMD_FDSYNC = 3, +#if 0 + LINUX_IOCB_CMD_PREADX = 4, + LINUX_IOCB_CMD_POLL = 5, +#endif + LINUX_IOCB_CMD_NOOP = 6, +}; + +struct linux_io_event { + uint64_t data; + uint64_t obj; + int64_t res; + int64_t res2; +}; + +#if _BYTE_ORDER == _LITTLE_ENDIAN +#define LINUX_AIO_PADDED(x,y) x,y +#elif _BYTE_ORDER == _BIG_ENDIAN +#define LINUX_AIO_PADDED(x,y) y,x +#else +#error Unidentified byte order !!! 
+#endif + +struct linux_iocb { + uint64_t aio_data; + uint32_t LINUX_AIO_PADDED(aio_key, aio_reserved1); + + uint16_t aio_lio_opcode; + int16_t aio_reqprio; + uint32_t aio_fildes; + + uint64_t aio_buf; + uint64_t aio_nbytes; + int64_t aio_offset; + + uint64_t aio_reserved2; /* TODO: use this for a (struct sigevent *) */ + uint64_t aio_reserved3; + +}; + +/* User space context information structure */ +struct linux_aio_ring { + l_uint ring_id; + l_uint ring_nr; + l_uint ring_head; + l_uint ring_tail; +#define LINUX_AIO_RING_MAGIC 0xa10a10a1 + l_uint ring_magic; +#define LINUX_AIO_RING_COMPAT_FEATURES 1 + l_uint ring_compat_features; +#define LINUX_AIO_RING_INCOMPAT_FEATURES 0 + l_uint ring_incompat_features; + l_uint ring_header_length; /* Size of this structure */ + + struct linux_io_event ring_io_events[0]; +}; + +#endif /* !_LINUX_AIO_H_ */ --- sys/compat/linux/linux_emul.c.orig +++ sys/compat/linux/linux_emul.c @@ -56,7 +56,7 @@ #endif struct sx emul_shared_lock; -struct sx emul_lock; +struct mtx emul_lock; /* this returns locked reference to the emuldata entry (if found) */ struct linux_emuldata * --- sys/compat/linux/linux_emul.h.orig +++ sys/compat/linux/linux_emul.h @@ -57,8 +57,8 @@ struct linux_emuldata *em_find(struct proc *, int locked); -#define EMUL_LOCK(l) sx_xlock(l) -#define EMUL_UNLOCK(l) sx_xunlock(l) +#define EMUL_LOCK(l) mtx_lock(l) +#define EMUL_UNLOCK(l) mtx_unlock(l) #define EMUL_SHARED_RLOCK(l) sx_slock(l) #define EMUL_SHARED_RUNLOCK(l) sx_sunlock(l) @@ -75,6 +75,6 @@ void linux_proc_exec(void *, struct proc *, struct image_params *); extern struct sx emul_shared_lock; -extern struct sx emul_lock; +extern struct mtx emul_lock; #endif /* !_LINUX_EMUL_H_ */ --- sys/compat/linux/linux_file.c.orig +++ sys/compat/linux/linux_file.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -85,57 +86,51 @@ return (error); } -int -linux_open(struct thread *td, struct linux_open_args *args) + +static int 
+linux_common_open(struct thread *td, char *path, int l_flags, int mode, int openat) { struct proc *p = td->td_proc; struct file *fp; int fd; - char *path; int bsd_flags, error; - if (args->flags & LINUX_O_CREAT) - LCONVPATHCREAT(td, args->path, &path); - else - LCONVPATHEXIST(td, args->path, &path); - -#ifdef DEBUG - if (ldebug(open)) - printf(ARGS(open, "%s, 0x%x, 0x%x"), - path, args->flags, args->mode); -#endif bsd_flags = 0; - if (args->flags & LINUX_O_RDONLY) - bsd_flags |= O_RDONLY; - if (args->flags & LINUX_O_WRONLY) + switch (l_flags & LINUX_O_ACCMODE) { + case LINUX_O_WRONLY: bsd_flags |= O_WRONLY; - if (args->flags & LINUX_O_RDWR) + break; + case LINUX_O_RDWR: bsd_flags |= O_RDWR; - if (args->flags & LINUX_O_NDELAY) + break; + default: + bsd_flags |= O_RDONLY; + } + if (l_flags & LINUX_O_NDELAY) bsd_flags |= O_NONBLOCK; - if (args->flags & LINUX_O_APPEND) + if (l_flags & LINUX_O_APPEND) bsd_flags |= O_APPEND; - if (args->flags & LINUX_O_SYNC) + if (l_flags & LINUX_O_SYNC) bsd_flags |= O_FSYNC; - if (args->flags & LINUX_O_NONBLOCK) + if (l_flags & LINUX_O_NONBLOCK) bsd_flags |= O_NONBLOCK; - if (args->flags & LINUX_FASYNC) + if (l_flags & LINUX_FASYNC) bsd_flags |= O_ASYNC; - if (args->flags & LINUX_O_CREAT) + if (l_flags & LINUX_O_CREAT) bsd_flags |= O_CREAT; - if (args->flags & LINUX_O_TRUNC) + if (l_flags & LINUX_O_TRUNC) bsd_flags |= O_TRUNC; - if (args->flags & LINUX_O_EXCL) + if (l_flags & LINUX_O_EXCL) bsd_flags |= O_EXCL; - if (args->flags & LINUX_O_NOCTTY) + if (l_flags & LINUX_O_NOCTTY) bsd_flags |= O_NOCTTY; - if (args->flags & LINUX_O_DIRECT) + if (l_flags & LINUX_O_DIRECT) bsd_flags |= O_DIRECT; - if (args->flags & LINUX_O_NOFOLLOW) + if (l_flags & LINUX_O_NOFOLLOW) bsd_flags |= O_NOFOLLOW; /* XXX LINUX_O_NOATIME: unable to be easily implemented. 
*/ - error = kern_open(td, path, UIO_SYSSPACE, bsd_flags, args->mode); + error = kern_open(td, path, UIO_SYSSPACE, bsd_flags, mode); if (!error) { fd = td->td_retval[0]; /* @@ -158,7 +153,7 @@ PROC_UNLOCK(p); sx_sunlock(&proctree_lock); } - if (args->flags & LINUX_O_DIRECTORY) { + if (l_flags & LINUX_O_DIRECTORY) { if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VDIR) { error = ENOTDIR; @@ -177,10 +172,149 @@ if (ldebug(open)) printf(LMSG("open returns error %d"), error); #endif - LFREEPATH(path); + if (!openat) + LFREEPATH(path); return error; } +/* + * common code for linux *at set of syscalls + * + * works like this: + * if filename is absolute + * ignore dirfd + * else + * if dirfd == AT_FDCWD + * return CWD/filename + * else + * return DIRFD/filename + * + * On success *newpath is filename itself in the absolute case, otherwise + * a malloc'ed M_TEMP string; in the latter case *freebuf receives the + * buffer handed back by vn_fullpath(). + */ +static int +linux_at(struct thread *td, int dirfd, char *filename, char **newpath, char **freebuf) +{ + struct file *fp; + int error = 0; + struct vnode *dvp; + struct filedesc *fdp = td->td_proc->p_fd; + char *fullpath = "unknown"; + char *freepath = NULL; + + /* don't do anything if the pathname is absolute */ + if (*filename == '/') { + *newpath = filename; + return (0); + } + + /* check for AT_FDCWD */ + if (dirfd == LINUX_AT_FDCWD) { + FILEDESC_LOCK(fdp); + dvp = fdp->fd_cdir; + FILEDESC_UNLOCK(fdp); + } else { + error = fget(td, dirfd, &fp); + if (error) + return (error); + dvp = fp->f_vnode; + /* + * Only a directory can be dfd. Check the file type first, as + * linux_common_open() does, so that f_vnode is dereferenced + * only for vnode-backed files. 
+ */ + if (fp->f_type != DTYPE_VNODE || dvp->v_type != VDIR) { + fdrop(fp, td); + return (ENOTDIR); + } + fdrop(fp, td); + } + + /* + * NOTE(review): dvp is used here after FILEDESC_UNLOCK()/fdrop() + * without a reference of its own -- confirm whether it needs a + * vref()/vrele() pair. + */ + error = vn_fullpath(td, dvp, &fullpath, &freepath); + if (!error) { + *newpath = malloc(strlen(fullpath) + strlen(filename) + 2, M_TEMP, M_WAITOK | M_ZERO); + *freebuf = freepath; + sprintf(*newpath, "%s/%s", fullpath, filename); + } + + return (error); +} + +/* + * Linux openat(2): build the path with linux_at(), convert it with + * LCONVPATH_SEG() and open it through linux_common_open(). + */ +int +linux_openat(struct thread *td, struct linux_openat_args *args) +{ + char *newpath, *oldpath, *freebuf = NULL, *path; + int error; + + oldpath = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + error = copyinstr(args->filename, 
oldpath, MAXPATHLEN, NULL); + if (error) { + free(oldpath, M_TEMP); + return (error); + } + +#ifdef DEBUG + if (ldebug(openat)) + printf(ARGS(openat, "%i, %s, 0x%x, 0x%x"), args->dfd, + oldpath, args->flags, args->mode); +#endif + + error = linux_at(td, args->dfd, oldpath, &newpath, &freebuf); + if (error) { + free(oldpath, M_TEMP); + return (error); + } +#ifdef DEBUG + printf(LMSG("newpath: %s"), newpath); +#endif + if (args->flags & LINUX_O_CREAT) + LCONVPATH_SEG(td, newpath, &path, 1, UIO_SYSSPACE); + else + LCONVPATH_SEG(td, newpath, &path, 0, UIO_SYSSPACE); + if (freebuf) + free(freebuf, M_TEMP); + if (*oldpath != '/') + free(newpath, M_TEMP); + + /* + * NOTE(review): linux_common_open() skips LFREEPATH(path) when + * openat is set and path is not freed here either -- confirm + * who releases it. + */ + error = linux_common_open(td, path, args->flags, args->mode, 1); + free(oldpath, M_TEMP); + return (error); +} + +int +linux_open(struct thread *td, struct linux_open_args *args) +{ + char *path; + + if (args->flags & LINUX_O_CREAT) + LCONVPATHCREAT(td, args->path, &path); + else + LCONVPATHEXIST(td, args->path, &path); + +#ifdef DEBUG + if (ldebug(open)) + printf(ARGS(open, "%s, 0x%x, 0x%x"), + path, args->flags, args->mode); +#endif + + return linux_common_open(td, path, args->flags, args->mode, 0); +} + int linux_lseek(struct thread *td, struct linux_lseek_args *args) { --- sys/compat/linux/linux_futex.c.orig +++ sys/compat/linux/linux_futex.c @@ -88,17 +88,15 @@ static struct futex *futex_get(void *, int); static void futex_put(struct futex *); static int futex_sleep(struct futex *, struct thread *, unsigned long); -static int futex_wake(struct futex *, int, struct futex *); -#ifdef __i386__ +static int futex_wake(struct futex *, int, struct futex *, int); static int futex_atomic_op(struct thread *td, int encoded_op, caddr_t uaddr); -#endif +static int futex_orl(int oparg, caddr_t uaddr, int *oldval); +static int futex_andl(int oparg, caddr_t uaddr, int *oldval); +static 
int futex_xorl(int oparg, caddr_t uaddr, int *oldval); /* support.s */ int futex_xchgl(int oparg, caddr_t uaddr, int *oldval); int futex_addl(int oparg, caddr_t uaddr, int *oldval); -int futex_orl(int oparg, caddr_t uaddr, int *oldval); 
-int futex_andnl(int oparg, caddr_t uaddr, int *oldval); -int futex_xorl(int oparg, caddr_t uaddr, int *oldval); int linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) @@ -111,10 +109,8 @@ struct futex *newf; int timeout_hz; struct timeval tv = {0, 0}; -#ifdef __i386__ struct futex *f2; int op_ret; -#endif #ifdef DEBUG if (ldebug(sys_futex)) @@ -212,9 +208,9 @@ FUTEX_SYSTEM_LOCK; /* - * XXX: Linux is able cope with different addresses + * XXX: Linux is able to cope with different addresses * corresponding to the same mapped memory in the sleeping - * and the waker process. + * and waker process(es). */ #ifdef DEBUG if (ldebug(sys_futex)) @@ -222,7 +218,7 @@ td->td_proc->p_pid, args->uaddr, args->val); #endif f = futex_get(args->uaddr, FUTEX_UNLOCKED); - td->td_retval[0] = futex_wake(f, args->val, NULL); + td->td_retval[0] = futex_wake(f, args->val, NULL, 0); futex_put(f); FUTEX_SYSTEM_UNLOCK; @@ -244,7 +240,7 @@ f = futex_get(args->uaddr, FUTEX_UNLOCKED); newf = futex_get(args->uaddr2, FUTEX_UNLOCKED); - td->td_retval[0] = futex_wake(f, args->val, newf); + td->td_retval[0] = futex_wake(f, args->val, newf, (int)(unsigned long)args->timeout); futex_put(f); futex_put(newf); @@ -253,23 +249,23 @@ case LINUX_FUTEX_REQUEUE: FUTEX_SYSTEM_LOCK; - + f = futex_get(args->uaddr, FUTEX_UNLOCKED); newf = futex_get(args->uaddr2, FUTEX_UNLOCKED); - td->td_retval[0] = futex_wake(f, args->val, newf); + td->td_retval[0] = futex_wake(f, args->val, newf, (int)(unsigned long)args->timeout); futex_put(f); futex_put(newf); - + FUTEX_SYSTEM_UNLOCK; break; case LINUX_FUTEX_FD: - printf("linux_sys_futex: unimplemented op %d\n", + /* XXX: linux plans to remove this operation */ + printf("linux_sys_futex: unimplemented op %d\n", args->op); break; case LINUX_FUTEX_WAKE_OP: -#ifdef __i386__ FUTEX_SYSTEM_LOCK; #ifdef DEBUG if (ldebug(sys_futex)) @@ -287,7 +283,7 @@ op_ret = futex_atomic_op(td, args->val3, args->uaddr2); if (op_ret < 0) { - /* XXX: we dont handle the EFAULT yet 
*/ + /* XXX: We don't handle the EFAULT yet. */ if (op_ret != -EFAULT) { futex_put(f); futex_put(f2); @@ -303,29 +299,25 @@ } - ret = futex_wake(f, args->val, NULL); + ret = futex_wake(f, args->val, NULL, 0); futex_put(f); if (op_ret > 0) { - op_ret = 0; - /* - * Linux uses the address of the timespec parameter - * as the number of retries, so any large number will - * be ok. + op_ret = 0; + /* + * Linux abuses the address of the timespec parameter + * as the number of retries. */ - op_ret += futex_wake(f2, 0x7fffffff, NULL); + op_ret += futex_wake(f2, (int) (unsigned long) args->timeout, NULL, 0); ret += op_ret; } futex_put(f2); td->td_retval[0] = ret; FUTEX_SYSTEM_UNLOCK; -#else - printf("linux_sys_futex: wake_op not implemented"); -#endif - break; + break; default: - printf("linux_sys_futex: unknown op %d\n", + printf("linux_sys_futex: unknown op %d\n", args->op); break; } @@ -414,10 +406,17 @@ } static int -futex_wake(struct futex *f, int n, struct futex *newf) +futex_wake(struct futex *f, int n, struct futex *newf, int n2) { struct waiting_proc *wp; - int count = 0; + int count; + + /* + * Linux is very strange it wakes up N threads for + * all operations BUT requeue ones where its N+1 + * mimic this. + */ + count = newf ? 
0 : 1; FUTEX_LOCK; TAILQ_FOREACH(wp, &f->f_waiting_proc, wp_list) { @@ -429,6 +428,8 @@ /* futex_put called after tsleep */ wp->wp_new_futex = futex_get(newf->f_uaddr, FUTEX_LOCKED); wakeup_one(wp); + if (count - n >= n2) + break; } } } @@ -437,72 +438,132 @@ return count; } -#ifdef __i386__ static int futex_atomic_op(struct thread *td, int encoded_op, caddr_t uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; - int oldval = 0, ret; + int op = (encoded_op >> 28) & 7; + int cmp = (encoded_op >> 24) & 15; + int oparg = (encoded_op << 8) >> 20; + int cmparg = (encoded_op << 20) >> 20; + int oldval = 0, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; + oparg = 1 << oparg; -#ifdef DEBUG +#ifdef DEBUG printf("futex_atomic_op: op = %d, cmp = %d, oparg = %d, cmparg = %d, uaddr = %p\n", - op, cmp, oparg, cmparg, uaddr); + op, cmp, oparg, cmparg, uaddr); #endif /* XXX: linux verifies access here and returns EFAULT */ - critical_enter(); + switch (op) { + case FUTEX_OP_SET: + ret = futex_xchgl(oparg, uaddr, &oldval); + break; + case FUTEX_OP_ADD: + ret = futex_addl(oparg, uaddr, &oldval); + break; + case FUTEX_OP_OR: + ret = futex_orl(oparg, uaddr, &oldval); + break; + case FUTEX_OP_ANDN: + ret = futex_andl(~oparg, uaddr, &oldval); + break; + case FUTEX_OP_XOR: + ret = futex_xorl(oparg, uaddr, &oldval); + break; + default: + ret = -ENOSYS; + } - switch (op) { - case FUTEX_OP_SET: - ret = futex_xchgl(oparg, uaddr, &oldval); + if (!ret) + switch (cmp) { + case FUTEX_OP_CMP_EQ: + ret = (oldval == cmparg); + break; + case FUTEX_OP_CMP_NE: + ret = (oldval != cmparg); break; - case FUTEX_OP_ADD: - ret = futex_addl(oparg, uaddr, &oldval); + case FUTEX_OP_CMP_LT: + ret = (oldval < cmparg); break; - case FUTEX_OP_OR: - ret = futex_orl(oparg, uaddr, &oldval); + case FUTEX_OP_CMP_GE: + ret = (oldval >= cmparg); break; - case FUTEX_OP_ANDN: - ret = 
futex_andnl(oparg, uaddr, &oldval); + case FUTEX_OP_CMP_LE: + ret = (oldval <= cmparg); break; - case FUTEX_OP_XOR: - ret = futex_xorl(oparg, uaddr, &oldval); + case FUTEX_OP_CMP_GT: + ret = (oldval > cmparg); break; default: ret = -ENOSYS; + } + + return (ret); +} + +/* + * Atomic OR, AND and XOR on a 32-bit word in user space. Like + * futex_xchgl(), each stores the previous value in *oldval and + * returns 0, or -EFAULT if the word cannot be accessed. + */ +static int +futex_orl(int oparg, caddr_t uaddr, int *oldval) +{ + uint32_t ua, ua_old, ua_cur; + + for (;;) { + if (copyin(uaddr, &ua_old, sizeof(ua_old)) != 0) + return (-EFAULT); + ua = ua_old | oparg; + ua_cur = casuword32((void *)uaddr, ua_old, ua); + if (ua_cur == ua_old) + break; + /* casuword32() returns -1 on a fault; bail out rather than spin. */ + if (ua_cur == (uint32_t)-1) + return (-EFAULT); } + *oldval = ua_old; + return (0); +} - critical_exit(); +/* AND oparg into the user word; see futex_orl() for the contract. */ +static int +futex_andl(int oparg, caddr_t uaddr, int *oldval) +{ + uint32_t ua, ua_old, ua_cur; + + for (;;) { + if (copyin(uaddr, &ua_old, sizeof(ua_old)) != 0) + return (-EFAULT); + ua = ua_old & oparg; + ua_cur = casuword32((void *)uaddr, ua_old, ua); + if (ua_cur == ua_old) + break; + if (ua_cur == (uint32_t)-1) + return (-EFAULT); + } + *oldval = ua_old; + return (0); +} - if (!ret) - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: ret = -ENOSYS; - } +/* XOR oparg into the user word; see futex_orl() for the contract. 
*/ +static int +futex_xorl(int oparg, caddr_t uaddr, int *oldval) +{ + uint32_t ua, ua_old, ua_cur; - return (ret); + for (;;) { + if (copyin(uaddr, &ua_old, sizeof(ua_old)) != 0) + return (-EFAULT); + ua = ua_old ^ oparg; + ua_cur = casuword32((void *)uaddr, ua_old, ua); + if (ua_cur == ua_old) + break; + if (ua_cur == (uint32_t)-1) + return (-EFAULT); + } + *oldval = ua_old; + return (0); } -#endif --- sys/compat/linux/linux_misc.c.orig +++ sys/compat/linux/linux_misc.c @@ -52,8 +52,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -219,7 +221,9 @@ } #if defined(__i386__) -/* XXX: what about amd64/linux32? 
*/ +/* XXX: this syscall is used mainly by a.out binaries (which dont exist + * on amd64/linux32) or libc5 + */ int linux_uselib(struct thread *td, struct linux_uselib_args *args) @@ -588,6 +592,18 @@ (unsigned long)args->new_len, (unsigned long)args->flags); #endif + + if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) { + td->td_retval[0] = 0; + return (EINVAL); + } + + /* Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK */ + if (args->addr & PAGE_MASK) { + td->td_retval[0] = 0; + return (EINVAL); + } + args->new_len = round_page(args->new_len); args->old_len = round_page(args->old_len); @@ -748,7 +764,7 @@ return (copyout(&utsname, args->buf, sizeof(utsname))); } -#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) +#if defined(__i386__) || defined(__amd64__) struct l_utimbuf { l_time_t l_actime; l_time_t l_modtime; @@ -818,7 +834,7 @@ LFREEPATH(fname); return (error); } -#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ +#endif /* __i386__ || __amd64__ */ #define __WCLONE 0x80000000 @@ -1169,6 +1185,13 @@ if (error) return (error); + /* + * the 1024*1024 is a hardcoded constant of max files per proc in + * linux + */ + if (which == RLIMIT_NOFILE && rlim.rlim_max > (1024 * 1024)) + return (EPERM); + bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur; bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max; return (kern_setrlimit(td, which, &bsd_rlim)); @@ -1486,14 +1509,15 @@ /* find the group leader */ p = pfind(em->shared->group_pid); + /* lets pretend we were reparented to init */ if (p == NULL) { #ifdef DEBUG - printf(LMSG("parent process not found.\n")); + printf("getppid: thread group leader not found.\n"); #endif + td->td_retval[0] = 1; return (0); } - - pp = p->p_pptr; /* switch to parent */ + pp = p->p_pptr; /* switch to parent */ PROC_LOCK(pp); PROC_UNLOCK(p); @@ -1711,3 +1735,28 @@ return (error); } + +int +linux_sched_getaffinity(struct thread *td, + struct linux_sched_getaffinity_args *args) +{ + l_ulong mask; + int i; + + if 
(PTRIN(args->mask) == NULL) + return (EFAULT); + if (args->cpusetsize < (l_uint)sizeof(cpumask_t)) + return (EINVAL); + + /* We do not support process affinity yet. Just allow all for now. */ + for (i = mp_ncpus, mask = 0; i > 0; i--) + mask |= (l_ulong)1 << (i - 1); /* shift in the mask's own type, not int */ + + return (copyout(&mask, args->mask, sizeof(l_ulong))); +} + +int +linux_chroot(struct thread *td, struct linux_chroot_args *args) +{ + return (chroot(td, (struct chroot_args *)args)); +} --- sys/compat/linux/linux_misc.h.orig +++ sys/compat/linux/linux_misc.h @@ -42,4 +42,7 @@ #define LINUX_MAX_COMM_LEN 16 /* Maximum length of the process name. */ +#define LINUX_MREMAP_MAYMOVE 1 +#define LINUX_MREMAP_FIXED 2 + #endif /* _LINUX_MISC_H_ */ --- sys/compat/linux/linux_socket.c.orig +++ sys/compat/linux/linux_socket.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -1026,6 +1027,15 @@ if (error) return (error); error = copyin(PTRIN(linux_args.msg), &msg, sizeof(msg)); + + /* + * Ping on linux does pass 0 in controllen which is forbidden + * by FreeBSD but seems to be ok on Linux. This needs some + * checking but now it lets ping work. 
+ */ + if (msg.msg_control && msg.msg_controllen == 0) + msg.msg_controllen = sizeof(struct cmsghdr); + if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); @@ -1194,7 +1204,13 @@ caddr_t val; int *avalsize; } */ bsd_args; - int error, name; + struct linux_ucred { + uint32_t pid; + uint32_t uid; + uint32_t gid; + } linux_ucred; + struct xucred xuc; + int error, name, optlen, rc, xuclen; if ((error = copyin(args, &linux_args, sizeof(linux_args)))) return (error); @@ -1216,18 +1232,47 @@ name = -1; break; } - if (name == -1) + if (name == -1) { + log(LOG_WARNING, "LINUX: 'getsockopt' level=0x%04x " + "optname=0x%04x not implemented\n", + linux_args.level, linux_args.optname); return (EINVAL); + } bsd_args.name = name; - bsd_args.val = PTRIN(linux_args.optval); - bsd_args.avalsize = PTRIN(linux_args.optlen); + if (bsd_args.level == SOL_SOCKET && name == LOCAL_PEERCRED) { + if ((error = copyin(PTRIN(linux_args.optval), + &linux_ucred, sizeof(linux_ucred)))) + return (error); + if ((error = copyin(PTRIN(linux_args.optlen), + &optlen, sizeof(optlen)))) + return (error); + if (optlen < (int)sizeof(linux_ucred)) /* signed compare: reject negative lengths too */ + return (EFAULT); + xuclen = sizeof(xuc); + if ((rc = error = kern_getsockopt(td, bsd_args.s, + 0, bsd_args.name, + (caddr_t) &xuc, UIO_SYSSPACE, &xuclen))) + return (error); + if (xuc.cr_version != XUCRED_VERSION) + return (EINVAL); + /* XXX get PID */ + linux_ucred.pid = 0; + linux_ucred.uid = xuc.cr_uid; + linux_ucred.gid = xuc.cr_gid; + if ((error = copyout(&linux_ucred, + PTRIN(linux_args.optval), sizeof(linux_ucred)))) + return (error); + return (rc); + } else { + bsd_args.val = PTRIN(linux_args.optval); + bsd_args.avalsize = PTRIN(linux_args.optlen); + } + + error = getsockopt(td, &bsd_args); - if (name == IPV6_NEXTHOP) { - error = getsockopt(td, &bsd_args); + if (name == IPV6_NEXTHOP) bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val); - } else - error = 
getsockopt(td, &bsd_args); return (error); } @@ -1277,3 +1322,34 
@@ uprintf("LINUX: 'socket' typ=%d not implemented\n", args->what); return (ENOSYS); } + +int +linux_sendfile(struct thread *td, struct linux_sendfile_args *args) +{ + struct sendfile_args sa; + l_long off; + int error; + + if ((error = copyin(args->offset, &off, sizeof(off)))) + return (error); +#ifdef DEBUG + if (ldebug(sendfile)) + printf(ARGS(sendfile, "%d, %d, %d, %d"), args->in, args->out, (int) off, + args->count); +#endif + + sa.fd = args->in; + sa.s = args->out; + sa.offset = off; + sa.nbytes = args->count; + sa.hdtr = NULL; + sa.sbytes = NULL; + sa.flags = 0; + + error = sendfile(td, &sa); + if (error) + return (error); + + td->td_retval[0] = args->count; + return (0); +} --- sys/compat/linux/linux_stats.c.orig +++ sys/compat/linux/linux_stats.c @@ -461,12 +461,28 @@ int linux_ustat(struct thread *td, struct linux_ustat_args *args) { + struct mount *mp; + struct l_ustat linux_ustat; + #ifdef DEBUG - if (ldebug(ustat)) - printf(ARGS(ustat, "%d, *"), args->dev); + if (ldebug(ustat)) + printf(ARGS(ustat, "%d, *"), args->dev); #endif + bzero(&linux_ustat, sizeof(linux_ustat)); + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) + if (mp->mnt_stat.f_fsid.val[0] == args->dev) { + /* Read the stats before mountlist_mtx is dropped; mp must not be dereferenced afterwards. */ + linux_ustat.f_tfree = mp->mnt_stat.f_bfree; + linux_ustat.f_tinode = mp->mnt_stat.f_ffree; + break; + } + mtx_unlock(&mountlist_mtx); + + if (mp == NULL) + return (EINVAL); - return (EOPNOTSUPP); + return (copyout(&linux_ustat, args->ubuf, sizeof(linux_ustat))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) --- sys/compat/linux/linux_util.h.orig +++ sys/compat/linux/linux_util.h @@ -53,16 +53,19 @@ int linux_emul_convpath(struct thread *, char *, enum uio_seg, char **, int); -#define LCONVPATH(td, upath, pathp, i) \ +#define LCONVPATH_SEG(td, upath, pathp, i, seg) \ do { \ int _error; \ \ - _error = linux_emul_convpath(td, upath, UIO_USERSPACE, \ + _error = linux_emul_convpath(td, upath, seg, \ pathp, i); \ if (*(pathp) == NULL)
\ return (_error); \ } while (0) +#define LCONVPATH(td, upath, pathp, i) \ + LCONVPATH_SEG(td, upath, pathp, i, UIO_USERSPACE) + #define LCONVPATHEXIST(td, upath, pathp) LCONVPATH(td, upath, pathp, 0) #define LCONVPATHCREAT(td, upath, pathp) LCONVPATH(td, upath, pathp, 1) #define LFREEPATH(path) free(path, M_TEMP) --- sys/conf/files.amd64.orig +++ sys/conf/files.amd64 @@ -233,7 +233,8 @@ amd64/linux32/linux32_machdep.c optional compat_linux32 amd64/linux32/linux32_sysent.c optional compat_linux32 amd64/linux32/linux32_sysvec.c optional compat_linux32 -compat/linux/linux_emul.c optional compat_linux32 +compat/linux/linux_aio.c optional linuxaio compat_linux32 vfs_aio +compat/linux/linux_emul.c optional compat_linux32 compat/linux/linux_file.c optional compat_linux32 compat/linux/linux_futex.c optional compat_linux32 compat/linux/linux_getcwd.c optional compat_linux32 --- sys/conf/files.i386.orig +++ sys/conf/files.i386 @@ -85,6 +85,7 @@ # compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs +compat/linux/linux_aio.c optional linuxaio compat_linux vfs_aio compat/linux/linux_emul.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_futex.c optional compat_linux --- sys/conf/files.pc98.orig +++ sys/conf/files.pc98 @@ -55,6 +55,7 @@ # compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs +compat/linux/linux_aio.c optional linuxaio compat_linux vfs_aio compat/linux/linux_emul.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_futex.c optional compat_linux --- sys/conf/options.amd64.orig +++ sys/conf/options.amd64 @@ -19,6 +19,7 @@ #DEBUG_SVR4 opt_svr4.h LINPROCFS opt_dontuse.h LINSYSFS opt_dontuse.h +LINUXAIO opt_dontuse.h NDISAPI opt_dontuse.h CLK_CALIBRATION_LOOP opt_clock.h --- sys/conf/options.i386.orig +++ sys/conf/options.i386 @@ -26,6 +26,7 @@ DEBUG_SVR4 opt_svr4.h LINPROCFS opt_dontuse.h LINSYSFS opt_dontuse.h
+LINUXAIO opt_dontuse.h NDISAPI opt_dontuse.h PECOFF_DEBUG opt_pecoff.h PECOFF_SUPPORT opt_dontuse.h --- sys/conf/options.pc98.orig +++ sys/conf/options.pc98 @@ -25,6 +25,7 @@ DEBUG_SVR4 opt_svr4.h LINPROCFS opt_dontuse.h LINSYSFS opt_dontuse.h +LINUXAIO opt_dontuse.h PECOFF_DEBUG opt_pecoff.h PECOFF_SUPPORT opt_dontuse.h --- sys/i386/conf/NOTES.orig +++ sys/i386/conf/NOTES @@ -1133,9 +1133,12 @@ # and PSEUDOFS) options LINPROCFS -#Enable the linux-like sys filesystem support (requires COMPAT_LINUX +# Enable the linux-like sys filesystem support (requires COMPAT_LINUX # and PSEUDOFS) -options LINSYSFS +options LINSYSFS + +# Enable the linux aio support (requires COMPAT_LINUX and VFS_AIO) +options LINUXAIO # # SysVR4 ABI emulation --- sys/i386/i386/support.s.orig +++ sys/i386/i386/support.s @@ -1549,8 +1549,7 @@ ret /* int futex_xchgl(int oparg, caddr_t uaddr, int *oldval); */ - .globl futex_xchgl -futex_xchgl: +ENTRY(futex_xchgl) movl PCPU(CURPCB), %eax movl $futex_fault, PCB_ONFAULT(%eax) movl 4(%esp), %eax @@ -1568,8 +1567,7 @@ ret /* int futex_addl(int oparg, caddr_t uaddr, int *oldval); */ - .globl futex_addl -futex_addl: +ENTRY(futex_addl) movl PCPU(CURPCB), %eax movl $futex_fault, PCB_ONFAULT(%eax) movl 4(%esp), %eax @@ -1586,60 +1584,3 @@ movl $0, PCB_ONFAULT(%edx) ret -/* int futex_orl(int oparg, caddr_t uaddr, int *oldval); */ - .globl futex_orl -futex_orl: - movl PCPU(CURPCB), %eax - movl $futex_fault, PCB_ONFAULT(%eax) - movl 4(%esp), %eax - movl 8(%esp), %edx - cmpl $VM_MAXUSER_ADDRESS,%edx - ja futex_fault - - MPLOCKED orl %eax, (%edx) - movl 0xc(%esp), %edx - movl %eax, (%edx) - xorl %eax, %eax - - movl PCPU(CURPCB), %edx - movl $0, PCB_ONFAULT(%edx) - ret - -/* int futex_andnl(int oparg, caddr_t uaddr, int *oldval); */ - .globl futex_andnl -futex_andnl: - movl PCPU(CURPCB), %eax - movl $futex_fault, PCB_ONFAULT(%eax) - movl 4(%esp), %eax - movl 8(%esp), %edx - cmpl $VM_MAXUSER_ADDRESS,%edx - ja futex_fault - - notl (%edx) - MPLOCKED andl %eax, (%edx) 
- movl 0xc(%esp), %edx - movl %eax, (%edx) - xorl %eax, %eax - - movl PCPU(CURPCB), %edx - movl $0, PCB_ONFAULT(%edx) - ret - -/* int futex_xorl(int oparg, caddr_t uaddr, int *oldval); */ - .globl futex_xorl -futex_xorl: - movl PCPU(CURPCB), %eax - movl $futex_fault, PCB_ONFAULT(%eax) - movl 4(%esp), %eax - movl 8(%esp), %edx - cmpl $VM_MAXUSER_ADDRESS,%edx - ja futex_fault - - MPLOCKED xorl %eax, (%edx) - movl 0xc(%esp), %edx - movl %eax, (%edx) - xorl %eax, %eax - - movl PCPU(CURPCB), %edx - movl $0, PCB_ONFAULT(%edx) - ret --- sys/i386/linux/linux.h.orig +++ sys/i386/linux/linux.h @@ -502,6 +502,7 @@ #define LINUX_O_RDONLY 00000000 #define LINUX_O_WRONLY 00000001 #define LINUX_O_RDWR 00000002 +#define LINUX_O_ACCMODE 00000003 #define LINUX_O_CREAT 00000100 #define LINUX_O_EXCL 00000200 #define LINUX_O_NOCTTY 00000400 @@ -536,6 +537,8 @@ #define LINUX_F_WRLCK 1 #define LINUX_F_UNLCK 2 +#define LINUX_AT_FDCWD -100 + /* * mount flags */ @@ -831,4 +834,6 @@ (LINUX_CLONE_VM | LINUX_CLONE_FS | LINUX_CLONE_FILES | \ LINUX_CLONE_SIGHAND | LINUX_CLONE_THREAD) +#include + #endif /* !_I386_LINUX_H_ */ --- sys/i386/linux/linux_dummy.c.orig +++ sys/i386/linux/linux_dummy.c @@ -59,7 +59,6 @@ DUMMY(rt_sigqueueinfo); DUMMY(capget); DUMMY(capset); -DUMMY(sendfile); /* different semantics */ DUMMY(truncate64); DUMMY(setfsuid); DUMMY(setfsgid); @@ -87,7 +86,6 @@ DUMMY(inotify_add_watch); DUMMY(inotify_rm_watch); DUMMY(migrate_pages); -DUMMY(openat); DUMMY(mkdirat); DUMMY(mknodat); DUMMY(fchownat); @@ -103,6 +101,11 @@ DUMMY(pselect6); DUMMY(ppoll); DUMMY(unshare); +DUMMY(io_setup); +DUMMY(io_destroy); +DUMMY(io_getevents); +DUMMY(io_submit); +DUMMY(io_cancel); #define DUMMY_XATTR(s) \ int \ --- sys/i386/linux/linux_machdep.c.orig +++ sys/i386/linux/linux_machdep.c @@ -73,18 +73,6 @@ extern struct sysentvec elf32_freebsd_sysvec; /* defined in i386/i386/elf_machdep.c */ -struct l_descriptor { - l_uint entry_number; - l_ulong base_addr; - l_uint limit; - l_uint seg_32bit:1; - l_uint 
contents:2; - l_uint read_exec_only:1; - l_uint limit_in_pages:1; - l_uint seg_not_present:1; - l_uint useable:1; -}; - struct l_old_select_argv { l_int nfds; l_fd_set *readfds; @@ -137,7 +125,7 @@ if (error == 0) error = kern_execve(td, &eargs, NULL); if (error == 0) - /* linux process can exec fbsd one, dont attempt + /* linux process can exec fbsd one, don't attempt * to create emuldata for such process using * linux_proc_init, this leads to a panic on KASSERT * because such process has p->p_emuldata == NULL @@ -459,9 +447,6 @@ /* and adjust it */ if (args->flags & LINUX_CLONE_THREAD) { - /* XXX: linux mangles pgrp and pptr somehow - * I think it might be this but I am not sure. - */ #ifdef notyet PROC_LOCK(p2); p2->p_pgrp = td->td_proc->p_pgrp; @@ -873,7 +858,7 @@ { int error; struct i386_ldt_args ldt; - struct l_descriptor ld; + struct l_user_desc ld; union descriptor desc; if (uap->ptr == NULL) @@ -968,7 +953,7 @@ } /* - * Linux has two extra args, restart and oldmask. We dont use these, + * Linux has two extra args, restart and oldmask. We don't use these, * but it seems that "restart" is actually a context pointer that * enables the signal to happen with a different register set. */ @@ -1110,12 +1095,12 @@ idx = info.entry_number; /* * Semantics of linux version: every thread in the system has array of - * 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This + * 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This * syscall loads one of the selected tls decriptors with a value and * also loads GDT descriptors 6, 7 and 8 with the content of the * per-thread descriptors. * - * Semantics of fbsd version: I think we can ignore that linux has 3 + * Semantics of fbsd version: I think we can ignore that linux has 3 * per-thread descriptors and use just the 1st one. The tls_array[] * is used only in set/get-thread_area() syscalls and for loading the * GDT descriptors. 
In fbsd we use just one GDT descriptor for TLS so --- sys/i386/linux/linux_proto.h.orig +++ sys/i386/linux/linux_proto.h @@ -2,7 +2,7 @@ * System call prototypes. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/i386/linux/linux_proto.h,v 1.91 2007/02/15 00:57:03 jkim Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/i386/linux/syscalls.master,v 1.86 2007/02/15 00:54:40 jkim Exp */ @@ -196,6 +196,9 @@ struct linux_olduname_args { register_t dummy; }; +struct linux_chroot_args { + char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)]; +}; struct linux_ustat_args { char dev_l_[PADL_(l_dev_t)]; l_dev_t dev; char dev_r_[PADR_(l_dev_t)]; char ubuf_l_[PADL_(struct l_ustat *)]; struct l_ustat * ubuf; char ubuf_r_[PADR_(struct l_ustat *)]; @@ -591,7 +594,10 @@ char uoss_l_[PADL_(l_stack_t *)]; l_stack_t * uoss; char uoss_r_[PADR_(l_stack_t *)]; }; struct linux_sendfile_args { - register_t dummy; + char out_l_[PADL_(int)]; int out; char out_r_[PADR_(int)]; + char in_l_[PADL_(int)]; int in; char in_r_[PADR_(int)]; + char offset_l_[PADL_(l_long *)]; l_long * offset; char offset_r_[PADR_(l_long *)]; + char count_l_[PADL_(l_size_t)]; l_size_t count; char count_r_[PADR_(l_size_t)]; }; struct linux_vfork_args { register_t dummy; @@ -731,12 +737,41 @@ char uaddr2_l_[PADL_(void *)]; void * uaddr2; char uaddr2_r_[PADR_(void *)]; char val3_l_[PADL_(int)]; int val3; char val3_r_[PADR_(int)]; }; +struct linux_sched_getaffinity_args { + char pid_l_[PADL_(l_pid_t)]; l_pid_t pid; char pid_r_[PADR_(l_pid_t)]; + char cpusetsize_l_[PADL_(l_uint)]; l_uint cpusetsize; char cpusetsize_r_[PADR_(l_uint)]; + char mask_l_[PADL_(l_ulong *)]; l_ulong * mask; char mask_r_[PADR_(l_ulong *)]; +}; struct linux_set_thread_area_args { char desc_l_[PADL_(struct l_user_desc *)]; struct l_user_desc * desc; char desc_r_[PADR_(struct l_user_desc *)]; }; struct linux_get_thread_area_args { char desc_l_[PADL_(struct l_user_desc *)]; struct l_user_desc * desc; char 
desc_r_[PADR_(struct l_user_desc *)]; }; +struct linux_io_setup_args { + char nr_reqs_l_[PADL_(l_uint)]; l_uint nr_reqs; char nr_reqs_r_[PADR_(l_uint)]; + char ctxp_l_[PADL_(linux_aio_context_t *)]; linux_aio_context_t * ctxp; char ctxp_r_[PADR_(linux_aio_context_t *)]; +}; +struct linux_io_destroy_args { + char ctx_l_[PADL_(linux_aio_context_t)]; linux_aio_context_t ctx; char ctx_r_[PADR_(linux_aio_context_t)]; +}; +struct linux_io_getevents_args { + char ctx_id_l_[PADL_(linux_aio_context_t)]; linux_aio_context_t ctx_id; char ctx_id_r_[PADR_(linux_aio_context_t)]; + char min_nr_l_[PADL_(l_long)]; l_long min_nr; char min_nr_r_[PADR_(l_long)]; + char nr_l_[PADL_(l_long)]; l_long nr; char nr_r_[PADR_(l_long)]; + char events_l_[PADL_(struct linux_io_event *)]; struct linux_io_event * events; char events_r_[PADR_(struct linux_io_event *)]; + char timeout_l_[PADL_(struct l_timespec *)]; struct l_timespec * timeout; char timeout_r_[PADR_(struct l_timespec *)]; +}; +struct linux_io_submit_args { + char ctx_id_l_[PADL_(linux_aio_context_t)]; linux_aio_context_t ctx_id; char ctx_id_r_[PADR_(linux_aio_context_t)]; + char nr_l_[PADL_(l_long)]; l_long nr; char nr_r_[PADR_(l_long)]; + char iocbpp_l_[PADL_(struct linux_iocb **)]; struct linux_iocb ** iocbpp; char iocbpp_r_[PADR_(struct linux_iocb **)]; +}; +struct linux_io_cancel_args { + char ctx_id_l_[PADL_(linux_aio_context_t)]; linux_aio_context_t ctx_id; char ctx_id_r_[PADR_(linux_aio_context_t)]; + char iocb_l_[PADL_(struct linux_iocb *)]; struct linux_iocb * iocb; char iocb_r_[PADR_(struct linux_iocb *)]; + char result_l_[PADL_(struct linux_io_event *)]; struct linux_io_event * result; char result_r_[PADR_(struct linux_io_event *)]; +}; struct linux_fadvise64_args { register_t dummy; }; @@ -893,7 +928,10 @@ register_t dummy; }; struct linux_openat_args { - register_t dummy; + char dfd_l_[PADL_(l_int)]; l_int dfd; char dfd_r_[PADR_(l_int)]; + char filename_l_[PADL_(char *)]; char * filename; char filename_r_[PADR_(char 
*)]; + char flags_l_[PADL_(l_int)]; l_int flags; char flags_r_[PADR_(l_int)]; + char mode_l_[PADL_(l_int)]; l_int mode; char mode_r_[PADR_(l_int)]; }; struct linux_mkdirat_args { register_t dummy; @@ -984,6 +1022,7 @@ int linux_ioctl(struct thread *, struct linux_ioctl_args *); int linux_fcntl(struct thread *, struct linux_fcntl_args *); int linux_olduname(struct thread *, struct linux_olduname_args *); +int linux_chroot(struct thread *, struct linux_chroot_args *); int linux_ustat(struct thread *, struct linux_ustat_args *); int linux_getppid(struct thread *, struct linux_getppid_args *); int linux_sigaction(struct thread *, struct linux_sigaction_args *); @@ -1115,8 +1154,14 @@ int linux_fremovexattr(struct thread *, struct linux_fremovexattr_args *); int linux_tkill(struct thread *, struct linux_tkill_args *); int linux_sys_futex(struct thread *, struct linux_sys_futex_args *); +int linux_sched_getaffinity(struct thread *, struct linux_sched_getaffinity_args *); int linux_set_thread_area(struct thread *, struct linux_set_thread_area_args *); int linux_get_thread_area(struct thread *, struct linux_get_thread_area_args *); +int linux_io_setup(struct thread *, struct linux_io_setup_args *); +int linux_io_destroy(struct thread *, struct linux_io_destroy_args *); +int linux_io_getevents(struct thread *, struct linux_io_getevents_args *); +int linux_io_submit(struct thread *, struct linux_io_submit_args *); +int linux_io_cancel(struct thread *, struct linux_io_cancel_args *); int linux_fadvise64(struct thread *, struct linux_fadvise64_args *); int linux_exit_group(struct thread *, struct linux_exit_group_args *); int linux_lookup_dcookie(struct thread *, struct linux_lookup_dcookie_args *); @@ -1232,6 +1277,7 @@ #define LINUX_SYS_AUE_linux_ioctl AUE_IOCTL #define LINUX_SYS_AUE_linux_fcntl AUE_FCNTL #define LINUX_SYS_AUE_linux_olduname AUE_NULL +#define LINUX_SYS_AUE_linux_chroot AUE_CHROOT #define LINUX_SYS_AUE_linux_ustat AUE_NULL #define LINUX_SYS_AUE_linux_getppid 
AUE_GETPPID #define LINUX_SYS_AUE_linux_sigaction AUE_NULL @@ -1363,8 +1409,14 @@ #define LINUX_SYS_AUE_linux_fremovexattr AUE_NULL #define LINUX_SYS_AUE_linux_tkill AUE_NULL #define LINUX_SYS_AUE_linux_sys_futex AUE_NULL +#define LINUX_SYS_AUE_linux_sched_getaffinity AUE_NULL #define LINUX_SYS_AUE_linux_set_thread_area AUE_NULL #define LINUX_SYS_AUE_linux_get_thread_area AUE_NULL +#define LINUX_SYS_AUE_linux_io_setup AUE_NULL +#define LINUX_SYS_AUE_linux_io_destroy AUE_NULL +#define LINUX_SYS_AUE_linux_io_getevents AUE_NULL +#define LINUX_SYS_AUE_linux_io_submit AUE_NULL +#define LINUX_SYS_AUE_linux_io_cancel AUE_NULL #define LINUX_SYS_AUE_linux_fadvise64 AUE_NULL #define LINUX_SYS_AUE_linux_exit_group AUE_EXIT #define LINUX_SYS_AUE_linux_lookup_dcookie AUE_NULL @@ -1407,7 +1459,7 @@ #define LINUX_SYS_AUE_linux_inotify_add_watch AUE_NULL #define LINUX_SYS_AUE_linux_inotify_rm_watch AUE_NULL #define LINUX_SYS_AUE_linux_migrate_pages AUE_NULL -#define LINUX_SYS_AUE_linux_openat AUE_NULL +#define LINUX_SYS_AUE_linux_openat AUE_OPEN_RWTC #define LINUX_SYS_AUE_linux_mkdirat AUE_NULL #define LINUX_SYS_AUE_linux_mknodat AUE_NULL #define LINUX_SYS_AUE_linux_fchownat AUE_NULL --- sys/i386/linux/linux_syscall.h.orig +++ sys/i386/linux/linux_syscall.h @@ -2,7 +2,7 @@ * System call numbers. * * DO NOT EDIT-- this file is automatically generated. 
- * $FreeBSD: src/sys/i386/linux/linux_syscall.h,v 1.84 2007/02/15 00:57:04 jkim Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/i386/linux/syscalls.master,v 1.86 2007/02/15 00:54:40 jkim Exp */ @@ -58,7 +58,7 @@ #define LINUX_SYS_setpgid 57 #define LINUX_SYS_linux_olduname 59 #define LINUX_SYS_umask 60 -#define LINUX_SYS_chroot 61 +#define LINUX_SYS_linux_chroot 61 #define LINUX_SYS_linux_ustat 62 #define LINUX_SYS_dup2 63 #define LINUX_SYS_linux_getppid 64 @@ -228,8 +228,14 @@ #define LINUX_SYS_linux_fremovexattr 237 #define LINUX_SYS_linux_tkill 238 #define LINUX_SYS_linux_sys_futex 240 +#define LINUX_SYS_linux_sched_getaffinity 242 #define LINUX_SYS_linux_set_thread_area 243 #define LINUX_SYS_linux_get_thread_area 244 +#define LINUX_SYS_linux_io_setup 245 +#define LINUX_SYS_linux_io_destroy 246 +#define LINUX_SYS_linux_io_getevents 247 +#define LINUX_SYS_linux_io_submit 248 +#define LINUX_SYS_linux_io_cancel 249 #define LINUX_SYS_linux_fadvise64 250 #define LINUX_SYS_linux_exit_group 252 #define LINUX_SYS_linux_lookup_dcookie 253 --- sys/i386/linux/linux_sysent.c.orig +++ sys/i386/linux/linux_sysent.c @@ -2,7 +2,7 @@ * System call switch table. * * DO NOT EDIT-- this file is automatically generated. 
- * $FreeBSD: src/sys/i386/linux/linux_sysent.c,v 1.91 2007/02/15 00:57:04 jkim Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/i386/linux/syscalls.master,v 1.86 2007/02/15 00:54:40 jkim Exp */ @@ -80,7 +80,7 @@ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 58 = ulimit */ { 0, (sy_call_t *)linux_olduname, AUE_NULL, NULL, 0, 0 }, /* 59 = linux_olduname */ { AS(umask_args), (sy_call_t *)umask, AUE_UMASK, NULL, 0, 0 }, /* 60 = umask */ - { AS(chroot_args), (sy_call_t *)chroot, AUE_CHROOT, NULL, 0, 0 }, /* 61 = chroot */ + { AS(linux_chroot_args), (sy_call_t *)linux_chroot, AUE_CHROOT, NULL, 0, 0 }, /* 61 = linux_chroot */ { AS(linux_ustat_args), (sy_call_t *)linux_ustat, AUE_NULL, NULL, 0, 0 }, /* 62 = linux_ustat */ { AS(dup2_args), (sy_call_t *)dup2, AUE_DUP2, NULL, 0, 0 }, /* 63 = dup2 */ { 0, (sy_call_t *)linux_getppid, AUE_GETPPID, NULL, 0, 0 }, /* 64 = linux_getppid */ @@ -206,7 +206,7 @@ { 0, (sy_call_t *)linux_capget, AUE_CAPGET, NULL, 0, 0 }, /* 184 = linux_capget */ { 0, (sy_call_t *)linux_capset, AUE_CAPSET, NULL, 0, 0 }, /* 185 = linux_capset */ { AS(linux_sigaltstack_args), (sy_call_t *)linux_sigaltstack, AUE_NULL, NULL, 0, 0 }, /* 186 = linux_sigaltstack */ - { 0, (sy_call_t *)linux_sendfile, AUE_SENDFILE, NULL, 0, 0 }, /* 187 = linux_sendfile */ + { AS(linux_sendfile_args), (sy_call_t *)linux_sendfile, AUE_SENDFILE, NULL, 0, 0 }, /* 187 = linux_sendfile */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 188 = getpmsg */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 189 = putpmsg */ { 0, (sy_call_t *)linux_vfork, AUE_VFORK, NULL, 0, 0 }, /* 190 = linux_vfork */ @@ -261,14 +261,14 @@ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 239 = linux_sendfile64 */ { AS(linux_sys_futex_args), (sy_call_t *)linux_sys_futex, AUE_NULL, NULL, 0, 0 }, /* 240 = linux_sys_futex */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 241 = linux_sched_setaffinity */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 242 = linux_sched_getaffinity 
*/ + { AS(linux_sched_getaffinity_args), (sy_call_t *)linux_sched_getaffinity, AUE_NULL, NULL, 0, 0 }, /* 242 = linux_sched_getaffinity */ { AS(linux_set_thread_area_args), (sy_call_t *)linux_set_thread_area, AUE_NULL, NULL, 0, 0 }, /* 243 = linux_set_thread_area */ { AS(linux_get_thread_area_args), (sy_call_t *)linux_get_thread_area, AUE_NULL, NULL, 0, 0 }, /* 244 = linux_get_thread_area */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 245 = linux_io_setup */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 246 = linux_io_destroy */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 247 = linux_io_getevents */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 248 = linux_io_submit */ - { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 249 = linux_io_cancel */ + { AS(linux_io_setup_args), (sy_call_t *)linux_io_setup, AUE_NULL, NULL, 0, 0 }, /* 245 = linux_io_setup */ + { AS(linux_io_destroy_args), (sy_call_t *)linux_io_destroy, AUE_NULL, NULL, 0, 0 }, /* 246 = linux_io_destroy */ + { AS(linux_io_getevents_args), (sy_call_t *)linux_io_getevents, AUE_NULL, NULL, 0, 0 }, /* 247 = linux_io_getevents */ + { AS(linux_io_submit_args), (sy_call_t *)linux_io_submit, AUE_NULL, NULL, 0, 0 }, /* 248 = linux_io_submit */ + { AS(linux_io_cancel_args), (sy_call_t *)linux_io_cancel, AUE_NULL, NULL, 0, 0 }, /* 249 = linux_io_cancel */ { 0, (sy_call_t *)linux_fadvise64, AUE_NULL, NULL, 0, 0 }, /* 250 = linux_fadvise64 */ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 }, /* 251 = */ { AS(linux_exit_group_args), (sy_call_t *)linux_exit_group, AUE_EXIT, NULL, 0, 0 }, /* 252 = linux_exit_group */ @@ -314,7 +314,7 @@ { 0, (sy_call_t *)linux_inotify_add_watch, AUE_NULL, NULL, 0, 0 }, /* 292 = linux_inotify_add_watch */ { 0, (sy_call_t *)linux_inotify_rm_watch, AUE_NULL, NULL, 0, 0 }, /* 293 = linux_inotify_rm_watch */ { 0, (sy_call_t *)linux_migrate_pages, AUE_NULL, NULL, 0, 0 }, /* 294 = linux_migrate_pages */ - { 0, (sy_call_t *)linux_openat, AUE_NULL, NULL, 0, 0 }, 
/* 295 = linux_openat */ + { AS(linux_openat_args), (sy_call_t *)linux_openat, AUE_OPEN_RWTC, NULL, 0, 0 }, /* 295 = linux_openat */ { 0, (sy_call_t *)linux_mkdirat, AUE_NULL, NULL, 0, 0 }, /* 296 = linux_mkdirat */ { 0, (sy_call_t *)linux_mknodat, AUE_NULL, NULL, 0, 0 }, /* 297 = linux_mknodat */ { 0, (sy_call_t *)linux_fchownat, AUE_NULL, NULL, 0, 0 }, /* 298 = linux_fchownat */ --- sys/i386/linux/linux_sysvec.c.orig +++ sys/i386/linux/linux_sysvec.c @@ -917,7 +917,7 @@ linux_ioctl_register_handler(*lihp); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_register_handler(*ldhp); - sx_init(&emul_lock, "emuldata lock"); + mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF); sx_init(&emul_shared_lock, "emuldata->shared lock"); LIST_INIT(&futex_list); sx_init(&futex_sx, "futex protection lock"); @@ -948,7 +948,7 @@ linux_ioctl_unregister_handler(*lihp); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_unregister_handler(*ldhp); - sx_destroy(&emul_lock); + mtx_destroy(&emul_lock); sx_destroy(&emul_shared_lock); sx_destroy(&futex_sx); EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag); --- sys/i386/linux/syscalls.master.orig +++ sys/i386/linux/syscalls.master @@ -117,7 +117,7 @@ 58 AUE_NULL UNIMPL ulimit 59 AUE_NULL STD { int linux_olduname(void); } 60 AUE_UMASK NOPROTO { int umask(int newmask); } -61 AUE_CHROOT NOPROTO { int chroot(char *path); } +61 AUE_CHROOT STD { int linux_chroot(char *path); } 62 AUE_NULL STD { int linux_ustat(l_dev_t dev, \ struct l_ustat *ubuf); } 63 AUE_DUP2 NOPROTO { int dup2(u_int from, u_int to); } @@ -333,7 +333,8 @@ 185 AUE_CAPSET STD { int linux_capset(void); } 186 AUE_NULL STD { int linux_sigaltstack(l_stack_t *uss, \ l_stack_t *uoss); } -187 AUE_SENDFILE STD { int linux_sendfile(void); } +187 AUE_SENDFILE STD { int linux_sendfile(int out, int in, l_long *offset, \ + l_size_t count); } 188 AUE_GETPMSG UNIMPL getpmsg 189 AUE_PUTPMSG UNIMPL putpmsg 190 AUE_VFORK STD { int linux_vfork(void); } @@ -408,16 +409,17 @@ 238 
AUE_NULL STD { int linux_tkill(int tid, int sig); } 239 AUE_SENDFILE UNIMPL linux_sendfile64 240 AUE_NULL STD { int linux_sys_futex(void *uaddr, int op, int val, \ - struct l_timespec *timeout, void *uaddr2, int val3); } + struct l_timespec *timeout, void *uaddr2, int val3); } 241 AUE_NULL UNIMPL linux_sched_setaffinity -242 AUE_NULL UNIMPL linux_sched_getaffinity +242 AUE_NULL STD { int linux_sched_getaffinity(l_pid_t pid, l_uint cpusetsize, \ + l_ulong *mask); } 243 AUE_NULL STD { int linux_set_thread_area(struct l_user_desc *desc); } 244 AUE_NULL STD { int linux_get_thread_area(struct l_user_desc *desc); } -245 AUE_NULL UNIMPL linux_io_setup -246 AUE_NULL UNIMPL linux_io_destroy -247 AUE_NULL UNIMPL linux_io_getevents -248 AUE_NULL UNIMPL linux_io_submit -249 AUE_NULL UNIMPL linux_io_cancel +245 AUE_NULL STD { int linux_io_setup(l_uint nr_reqs, linux_aio_context_t *ctxp); } +246 AUE_NULL STD { int linux_io_destroy(linux_aio_context_t ctx); } +247 AUE_NULL STD { int linux_io_getevents(linux_aio_context_t ctx_id, l_long min_nr, l_long nr, struct linux_io_event *events, struct l_timespec *timeout); } +248 AUE_NULL STD { int linux_io_submit(linux_aio_context_t ctx_id, l_long nr, struct linux_iocb **iocbpp); } +249 AUE_NULL STD { int linux_io_cancel(linux_aio_context_t ctx_id, struct linux_iocb *iocb, struct linux_io_event *result); } 250 AUE_NULL STD { int linux_fadvise64(void); } 251 AUE_NULL UNIMPL 252 AUE_EXIT STD { int linux_exit_group(int error_code); } @@ -473,7 +475,8 @@ 292 AUE_NULL STD { int linux_inotify_add_watch(void); } 293 AUE_NULL STD { int linux_inotify_rm_watch(void); } 294 AUE_NULL STD { int linux_migrate_pages(void); } -295 AUE_NULL STD { int linux_openat(void); } +295 AUE_OPEN_RWTC STD { int linux_openat(l_int dfd, char *filename, \ + l_int flags, l_int mode); } 296 AUE_NULL STD { int linux_mkdirat(void); } 297 AUE_NULL STD { int linux_mknodat(void); } 298 AUE_NULL STD { int linux_fchownat(void); } --- sys/kern/kern_fork.c.orig +++ 
sys/kern/kern_fork.c @@ -522,10 +522,11 @@ sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } - if (flags & RFLINUXTHPN) - p2->p_sigparent = SIGUSR1; + /* This flag is used by linuxthreads and is passed in via rfork(). */ + if (flags & RFLINUXTHPN) + p2->p_sigparent = SIGUSR1; else - p2->p_sigparent = SIGCHLD; + p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; --- sys/kern/vfs_aio.c.orig +++ sys/kern/vfs_aio.c @@ -139,8 +139,8 @@ SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel threads for async IO"); -static int max_queue_count = MAX_AIO_QUEUE; -SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, +int max_aio_queue_count = MAX_AIO_QUEUE; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_aio_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; @@ -172,7 +172,7 @@ SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process (stored in the process)"); -static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; +int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process (stored in the process)"); @@ -257,19 +257,6 @@ struct thread *aiothread; /* (*) the AIO thread */ }; -/* - * data-structure for lio signal management - */ -struct aioliojob { - int lioj_flags; /* (a) listio flags */ - int lioj_count; /* (a) listio flags */ - int lioj_finished_count; /* (a) listio flags */ - struct sigevent lioj_signal; /* (a) signal on all I/O done */ - TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ - struct knlist klist; /* (a) list of knotes */ - ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ -}; - #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* 
signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ @@ -313,13 +300,10 @@ static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; -void aio_init_aioinfo(struct proc *p); static void aio_onceonly(void); static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(int *); -int aio_aqueue(struct thread *td, struct aiocb *job, - struct aioliojob *lio, int type, int osigev); static void aio_physwakeup(struct buf *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); @@ -1347,7 +1331,7 @@ suword(&job->_aiocb_private.error, 0); suword(&job->_aiocb_private.kernelinfo, -1); - if (num_queue_count >= max_queue_count || + if (num_queue_count >= max_aio_queue_count || ki->kaio_count >= ki->kaio_qallowed_count) { suword(&job->_aiocb_private.error, EAGAIN); return (EAGAIN); --- sys/modules/Makefile.orig +++ sys/modules/Makefile @@ -141,6 +141,7 @@ ${_linprocfs} \ ${_linsysfs} \ ${_linux} \ + ${_linuxaio} \ lmc \ lpt \ mac_biba \ @@ -374,6 +375,7 @@ _linprocfs= linprocfs _linsysfs= linsysfs _linux= linux +_linuxaio= linuxaio _mse= mse .if ${MK_NCP} != "no" _ncp= ncp @@ -485,6 +487,7 @@ _linprocfs= linprocfs _linsysfs= linsysfs _linux= linux +_linuxaio= linuxaio _mly= mly _mxge= mxge _ndis= ndis @@ -504,8 +507,8 @@ .if ${MACHINE_ARCH} == "ia64" # Modules not enabled on ia64 (as compared to i386) include: -# aac acpi aout apm atspeaker drm ibcs2 linprocfs linux ncv -# nsp oltr pecoff s3 sbni stg vesa +# aac acpi aout apm ats