--- src/sys/kern/kern_jail.c Sat Nov 12 22:12:32 2005 +++ src/sys/kern/kern_jail.c Thu Sep 20 17:47:21 2007 @@ -5,6 +5,35 @@ * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- + * + * Portions copyright (c) 2006 Chris Jones + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Chris Jones + * thanks to the support of Google's Summer of Code program and + * mentoring by Kip Macy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
  */
 
 #include
@@ -15,12 +44,19 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -71,6 +107,17 @@
     &jail_chflags_allowed, 0,
     "Processes in jail can alter system file flags");
 
+int jail_limit_memory = 0;
+SYSCTL_INT(_security_jail, OID_AUTO, limit_jail_memory, CTLFLAG_RW,
+    &jail_limit_memory, 0,
+    "Limit jails' memory usage");
+
+int jail_memory_pager_interval = 5;
+SYSCTL_INT(_security_jail, OID_AUTO, jail_pager_interval,
+    CTLFLAG_RW,
+    &jail_memory_pager_interval, 0,
+    "Interval between jail memory limit checks");
+
 /* allprison, lastprid, and prisoncount are protected by allprison_mtx. */
 struct	prisonlist allprison;
 struct	mtx allprison_mtx;
@@ -92,6 +139,141 @@
 
 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
 
+/*
+ * Per-jail pager kernel thread; arg is the struct prison to police.
+ *
+ * While jail_limit_memory is set and the prison has a non-zero
+ * pr_mem_limit, each pass records the prison's usage in pr_mem_usage and,
+ * if it exceeds the limit, calls vm_pageout_map_deactivate_pages() on every
+ * eligible process of the prison with a target of 15/16 of that process's
+ * resident page count.
+ *
+ * The thread publishes the address of its on-stack flags word through
+ * pr_pager_flags_ptr, leaves its loop once J_PAGER_TD_DIE is set there, and
+ * sleeps on pr between passes so that wakeup(pr) cuts the wait short.
+ * NOTE(review): that pointer dangles once this thread has exited -- confirm
+ * nobody writes through it afterwards.
+ */
+static void
+jpager_td(void *arg)
+{
+	struct proc *p;
+	struct prison *pr = arg;
+	struct thread *td;
+	long limit, newsize, usage;
+	int breakout, interval;
+	int flags = J_PAGER_TD_ACTIVE;
+
+	pr->pr_pager_flags_ptr = &flags;
+
+	for (;;) {
+		if (flags & J_PAGER_TD_DIE)
+			break;
+
+		if (jail_limit_memory && pr->pr_mem_limit) {
+			/*
+			 * TODO: consider whether it might be better to start
+			 * pushing back when we approach the limit, rather than
+			 * when we hit it.
+			 */
+			limit = prison_memory_limit(pr);
+
+			sx_slock(&allproc_lock);
+			usage = prison_memory(pr);
+
+			mtx_lock(&pr->pr_mtx);
+			pr->pr_mem_usage = usage;
+			mtx_unlock(&pr->pr_mtx);
+
+			/*
+			 * The logic from vm_daemon() really needs to go here.
+			 * Problem: we want to push things below their rlimits,
+			 * and vm_daemon doesn't do that.  It'd be better to
+			 * refactor vm_daemon to fit, but this'll do for now.
+			 */
+			if (usage > limit) {
+				LIST_FOREACH(p, &allproc, p_list) {
+					/*
+					 * Take the proc lock before looking at
+					 * p_ucred so the credential cannot be
+					 * swapped out from under the check.
+					 */
+					PROC_LOCK(p);
+					if (p->p_ucred == NULL ||
+					    p->p_ucred->cr_prison != pr ||
+					    (p->p_flag & (P_SYSTEM | P_WEXIT))) {
+						PROC_UNLOCK(p);
+						continue;
+					}
+
+					/*
+					 * Skip the process if any thread is
+					 * neither on a run queue, running nor
+					 * sleeping.
+					 */
+					mtx_lock_spin(&sched_lock);
+					breakout = 0;
+					FOREACH_THREAD_IN_PROC(p, td) {
+						if (!TD_ON_RUNQ(td) &&
+						    !TD_IS_RUNNING(td) &&
+						    !TD_IS_SLEEPING(td)) {
+							breakout = 1;
+							break;
+						}
+					}
+					mtx_unlock_spin(&sched_lock);
+					if (breakout) {
+						PROC_UNLOCK(p);
+						continue;
+					}
+
+					/*
+					 * NOTE: we differ here from vm_daemon
+					 * b/c we don't care about the rlimit;
+					 * things that are exceeding that will
+					 * get caught in due course.  We need,
+					 * however, to decrease the pressure on
+					 * our permitted memory allocation.
+					 * Fortunately, we only care about
+					 * eventually hitting the limit, so if
+					 * we don't get there right away, it's
+					 * okay.
+					 *
+					 * TODO: this arbitrarily reduces each
+					 * process's space by 6.25% (until it's
+					 * completely swapped out) while we're
+					 * under memory pressure.  A better way
+					 * would be to either hit large
+					 * processes first, or to hit the
+					 * least-active processes first, or go
+					 * proportionally, or ....
+					 */
+					newsize =
+					    vmspace_resident_count(p->p_vmspace);
+					newsize -= newsize / 16;
+					if (newsize < 0)
+						newsize = 0;
+					PROC_UNLOCK(p);
+					vm_pageout_map_deactivate_pages(
+					    &p->p_vmspace->vm_map, newsize);
+				}
+			}
+			sx_sunlock(&allproc_lock);
+		}
+
+		/*
+		 * tsleep() treats a timeout of 0 as "no timeout", so clamp
+		 * the sysctl-controlled interval to at least one second.
+		 */
+		interval = jail_memory_pager_interval;
+		if (interval < 1)
+			interval = 1;
+		tsleep(pr, 0, "-", interval * hz);
+	}
+
+	kthread_exit(0);
+}
+
 /*
  * MPSAFE
  *
@@ -106,6 +288,7 @@
 	struct prison *pr, *tpr;
 	struct jail j;
 	struct jail_attach_args jaa;
+	struct proc *j_pager_proc = NULL;
 	int vfslocked, error, tryprid;
 
 	error = copyin(uap->jail, &j, sizeof(j));
@@ -135,7 +318,9 @@
 		goto e_dropvnref;
 	pr->pr_ip = j.ip_number;
 	pr->pr_linux = NULL;
+	pr->pr_sched_shares = j.sched_shares;
 	pr->pr_securelevel = securelevel;
+	pr->pr_mem_limit = j.mem_limit;
 
 	/* Determine next pr_id and add prison to allprison list.
*/
 	mtx_lock(&allprison_mtx);
@@ -159,6 +308,11 @@
 	prisoncount++;
 	mtx_unlock(&allprison_mtx);
 
+	if (kthread_create(jpager_td, pr, (void *) j_pager_proc, 0, 0, "jpager %d", pr->pr_id))
+		goto e_dropprref;
+	KASSERT(j_pager_proc != NULL, ("NULL j_pager_proc"));
+	pr->pr_pager = j_pager_proc;
+
 	error = jail_attach(td, &jaa);
 	if (error)
 		goto e_dropprref;
@@ -168,6 +322,10 @@
 	td->td_retval[0] = jaa.jid;
 	return (0);
 e_dropprref:
+	if (j_pager_proc != NULL) {
+		*pr->pr_pager_flags_ptr = J_PAGER_TD_DIE;
+		wakeup(pr);
+	}
 	mtx_lock(&allprison_mtx);
 	LIST_REMOVE(pr, pr_list);
 	prisoncount--;
@@ -282,6 +440,10 @@
 		prisoncount--;
 		mtx_unlock(&allprison_mtx);
 
+		/* Tell scheduler, pager to die. No need to wait. */
+		*pr->pr_pager_flags_ptr = J_PAGER_TD_DIE;
+		wakeup(pr);
+
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 		return;
@@ -393,6 +555,49 @@
 	return (ok);
 }
 
+/*
+ * Return the resident memory, in bytes, of all processes in prison pr.
+ *
+ * Walks every process in the system and sums vmspace_resident_count() (a
+ * page count) for those whose credential belongs to pr.
+ * NOTE(review): the list walk takes no locks here; the caller in this file
+ * holds allproc_lock shared -- confirm any other callers do the same.
+ *
+ * TODO: this is a really bad way of doing the search, as we end up going
+ * across all processes for each jail.  It'd be more efficient to just do
+ * this once in a period and update the relevant jail.
+ */
+long
+prison_memory(struct prison *pr)
+{
+	struct proc *p;
+	long pages = 0;
+
+	FOREACH_PROC_IN_SYSTEM(p) {
+		if (p->p_ucred != NULL && p->p_vmspace != NULL &&
+		    jailed(p->p_ucred) && p->p_ucred->cr_prison == pr)
+			pages += vmspace_resident_count(p->p_vmspace);
+	}
+
+	/* Convert the page count to bytes. */
+	return (pages * PAGE_SIZE);
+}
+
+/*
+ * Return the memory limit configured for prison pr (pr_mem_limit, in
+ * bytes), read under the prison mutex.
+ */
+long
+prison_memory_limit(struct prison *pr)
+{
+	long limit;
+
+	mtx_lock(&pr->pr_mtx);
+	limit = (long)pr->pr_mem_limit;
+	mtx_unlock(&pr->pr_mtx);
+	return (limit);
+}
+
 /*
  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
  */
@@ -523,6 +728,61 @@
 	}
 }
 
+/*
+ * Change resource limit for a prison.
+ *
+ * unsigned int jid:	id of jail to mess with
+ *
+ * int cpushares:	 0 -> remove prison from cpu limits
+ *			-1 -> don't change existing shares
+ *			>0 -> set cpu shares
+ *
+ * int memlimit:	 0 -> remove prison from mem limits
+ *			-1 -> don't change existing limit
+ *			>0 -> set memory limit (bytes)
+ *
+ * Returns 0 on success, the suser() error if the caller is not privileged,
+ * or EINVAL if no jail has the given jid or a value is below -1.
+ *
+ * TODO: might this be better handled via a writable
+ * sysctl than with a new syscall?
+ */
+int
+jail_set_resource_limits(struct thread *td,
+    struct jail_set_resource_limits_args *uap)
+{
+	struct prison *pr;
+	int error;
+
+	error = suser(td);
+	if (error)
+		return (error);
+
+	/* Only -1 ("leave alone"), 0 and positive values are meaningful. */
+	if (uap->cpushares < -1 || uap->memlimit < -1)
+		return (EINVAL);
+
+	mtx_lock(&allprison_mtx);
+	LIST_FOREACH(pr, &allprison, pr_list) {
+		if (pr->pr_id == uap->jid)
+			break;
+	}
+	if (pr == NULL) {
+		mtx_unlock(&allprison_mtx);
+		/* No jail with that id. */
+		return (EINVAL);
+	}
+
+	mtx_lock(&pr->pr_mtx);
+	if (uap->cpushares != -1)
+		pr->pr_sched_shares = uap->cpushares;
+	if (uap->memlimit != -1)
+		pr->pr_mem_limit = uap->memlimit;
+	mtx_unlock(&pr->pr_mtx);
+	mtx_unlock(&allprison_mtx);
+	return (0);
+}
+
 static int
 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
 {
@@ -555,6 +815,10 @@
 		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
 		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
 		xp->pr_ip = pr->pr_ip;
+		xp->pr_sched_shares = pr->pr_sched_shares;
+		xp->pr_estcpu = pr->pr_estcpu;
+		xp->pr_mem_limit = pr->pr_mem_limit;
+		xp->pr_mem_usage = pr->pr_mem_usage;
 		mtx_unlock(&pr->pr_mtx);
 		xp++;
 	}