Re: [PATCH] allow vma merging with mlock et. al.

From: Chris Wright (chrisw_at_osdl.org)
Date: 02/28/05

  • Next message: Willy Tarreau: "Re: ethertap.c compilation problem"
    Date:	Mon, 28 Feb 2005 12:33:07 -0800
    To: Hugh Dickins <hugh@veritas.com>
    
    

    * Hugh Dickins (hugh@veritas.com) wrote:
    > On Fri, 25 Feb 2005, Chris Wright wrote:
    > >
    > > Actually I think it winds up being fine since we don't do merging with
    > > mlock. But why not? Patch below remedies that.
    >
    > > Successive mlock/munlock calls can leave fragmented vmas because they can
    > > be split but not merged. Give mlock et. al. full vma merging support.
    >
    > Phew, you followed mprotect, saving me from having to think too deeply
    > about the correctness of this (I'm assuming mprotect is perfect ;))

    Heh, same assumption here ;-)

    > Some remarks then on the three places where you departed from mprotect.
    >
    > > ===== mm/mlock.c 1.19 vs edited =====
    > > --- 1.19/mm/mlock.c 2005-02-11 11:07:35 -08:00
    > > +++ edited/mm/mlock.c 2005-02-24 23:53:10 -08:00
    > > @@ -7,18 +7,32 @@
    > >
    > > #include <linux/mman.h>
    > > #include <linux/mm.h>
    > > +#include <linux/mempolicy.h>
    > > #include <linux/syscalls.h>
    > >
    > >
    > > -static int mlock_fixup(struct vm_area_struct * vma,
    > > +static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
    > > unsigned long start, unsigned long end, unsigned int newflags)
    > > {
    > > struct mm_struct * mm = vma->vm_mm;
    > > + pgoff_t pgoff;
    > > int pages;
    > > int ret = 0;
    > >
    > > - if (newflags == vma->vm_flags)
    > > + if (newflags == vma->vm_flags) {
    > > + *prev = vma;
    > > goto out;
    > > + }
    > > +
    > > + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
    > > + *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
    > > + vma->vm_file, pgoff, vma_policy(vma));
    > > + if (*prev) {
    > > + vma = *prev;
    > > + goto success;
    > > + }
    > > +
    > > + *prev = vma;
    >
    > You've raised that line higher (so do_mlockall's "Ignore errors" works):
    > that's an improvement because it saves special casing errors, I'd like
    > you to make the same adjustment to mprotect_fixup, even though it's not
    > required there (and delete "Unless it returns an error, " from comment).
    > Let's keep the two in step.

    Sure, makes sense.

    > > if (start != vma->vm_start) {
    > > ret = split_vma(mm, vma, start, 1);
    > > @@ -32,6 +46,7 @@ static int mlock_fixup(struct vm_area_st
    > > goto out;
    > > }
    > >
    > > +success:
    > > /*
    > > * vm_flags is protected by the mmap_sem held in write mode.
    > > * It's okay if try_to_unmap_one unmaps a page just after we
    > > @@ -59,7 +74,7 @@ out:
    > > static int do_mlock(unsigned long start, size_t len, int on)
    > > {
    > > unsigned long nstart, end, tmp;
    > > - struct vm_area_struct * vma, * next;
    > > + struct vm_area_struct * vma, * prev;
    > > int error;
    > >
    > > len = PAGE_ALIGN(len);
    > > @@ -68,7 +83,7 @@ static int do_mlock(unsigned long start,
    > > return -EINVAL;
    > > if (end == start)
    > > return 0;
    > > - vma = find_vma(current->mm, start);
    > > + vma = find_vma_prev(current->mm, start, &prev);
    > > if (!vma || vma->vm_start > start)
    > > return -ENOMEM;
    >
    > But here sys_mprotect also says:
    >
    > if (start > vma->vm_start)
    > prev = vma;
    >
    > Perhaps you've worked your way through vma_merge and convinced yourself
    > this is never necessary, that's quite possible; but I'd still be happier
    > if you were to add it into your do_mlock: it limits the cases vma_merge
    > has to worry about. Or if you feel strongly about it, explain why I'm
    > just being silly, and delete it from mprotect too.

    No, I think you're right, and it measurably improves (~5%) the worst
    case benchmark I was doing, thanks.

    > > @@ -81,18 +96,19 @@ static int do_mlock(unsigned long start,
    > > if (!on)
    > > newflags &= ~VM_LOCKED;
    > >
    > > - if (vma->vm_end >= end) {
    > > - error = mlock_fixup(vma, nstart, end, newflags);
    > > - break;
    > > - }
    > > -
    > > tmp = vma->vm_end;
    > > - next = vma->vm_next;
    > > - error = mlock_fixup(vma, nstart, tmp, newflags);
    > > + if (tmp > end)
    > > + tmp = end;
    > > + error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
    > > if (error)
    > > break;
    > > nstart = tmp;
    > > - vma = next;
    > > + if (nstart < prev->vm_end)
    > > + nstart = prev->vm_end;
    > > + if (nstart >= end)
    > > + break;
    > > +
    > > + vma = prev->vm_next;
    > > if (!vma || vma->vm_start != nstart) {
    > > error = -ENOMEM;
    > > break;
    > > @@ -141,7 +157,7 @@ asmlinkage long sys_munlock(unsigned lon
    > >
    > > static int do_mlockall(int flags)
    > > {
    > > - struct vm_area_struct * vma;
    > > + struct vm_area_struct * vma, * prev;
    > > unsigned int def_flags = 0;
    > >
    > > if (flags & MCL_FUTURE)
    > > @@ -150,7 +166,7 @@ static int do_mlockall(int flags)
    > > if (flags == MCL_FUTURE)
    > > goto out;
    > >
    > > - for (vma = current->mm->mmap; vma ; vma = vma->vm_next) {
    > > + for (prev = vma = current->mm->mmap; vma ; vma = vma->vm_next) {
    >
    > Here prev should be initialized to NULL, rather than the first vma.
    > Again, you've probably worked out that it's safe as you've written it,
    > but vma_merge does expect prev NULL at the beginning.

    No, it was a bug.

    > > unsigned int newflags;
    > >
    > > newflags = vma->vm_flags | VM_LOCKED;
    > > @@ -158,7 +174,8 @@ static int do_mlockall(int flags)
    > > newflags &= ~VM_LOCKED;
    > >
    > > /* Ignore errors */
    > > - mlock_fixup(vma, vma->vm_start, vma->vm_end, newflags);
    > > + mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
    > > + vma = prev;
    >
    > Scrap that "vma = prev;" line, just say "vma = prev->vm_next" in the loop?

    Yup, thanks for reviewing. Updated patch below.

    thanks,
    -chris

    -- 
    Successive mlock/munlock calls can leave fragmented vmas because they can
    be split but not merged.  Give mlock et. al. full vma merging support.
    While we're at it, move *pprev assignment above first split_vma in
    mprotect_fixup to keep it in step with mlock_fixup (which for mlockall
    ignores errors yet still needs a valid prev pointer).
    Signed-off-by: Chris Wright <chrisw@osdl.org>
    ===== mm/mlock.c 1.19 vs edited =====
    --- 1.19/mm/mlock.c	2005-02-11 11:07:35 -08:00
    +++ edited/mm/mlock.c	2005-02-28 11:08:23 -08:00
    @@ -7,18 +7,32 @@
     
     #include <linux/mman.h>
     #include <linux/mm.h>
    +#include <linux/mempolicy.h>
     #include <linux/syscalls.h>
     
     
    -static int mlock_fixup(struct vm_area_struct * vma, 
    +static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
     	unsigned long start, unsigned long end, unsigned int newflags)
     {
     	struct mm_struct * mm = vma->vm_mm;
    +	pgoff_t pgoff;
     	int pages;
     	int ret = 0;
     
    -	if (newflags == vma->vm_flags)
    +	if (newflags == vma->vm_flags) {
    +		*prev = vma;
     		goto out;
    +	}
    +
    +	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
    +	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
    +			  vma->vm_file, pgoff, vma_policy(vma));
    +	if (*prev) {
    +		vma = *prev;
    +		goto success;
    +	}
    +
    +	*prev = vma;
     
     	if (start != vma->vm_start) {
     		ret = split_vma(mm, vma, start, 1);
    @@ -32,6 +46,7 @@ static int mlock_fixup(struct vm_area_st
     			goto out;
     	}
     
    +success:
     	/*
     	 * vm_flags is protected by the mmap_sem held in write mode.
     	 * It's okay if try_to_unmap_one unmaps a page just after we
    @@ -59,7 +74,7 @@ out:
     static int do_mlock(unsigned long start, size_t len, int on)
     {
     	unsigned long nstart, end, tmp;
    -	struct vm_area_struct * vma, * next;
    +	struct vm_area_struct * vma, * prev;
     	int error;
     
     	len = PAGE_ALIGN(len);
    @@ -68,10 +83,13 @@ static int do_mlock(unsigned long start,
     		return -EINVAL;
     	if (end == start)
     		return 0;
    -	vma = find_vma(current->mm, start);
    +	vma = find_vma_prev(current->mm, start, &prev);
     	if (!vma || vma->vm_start > start)
     		return -ENOMEM;
     
    +	if (start > vma->vm_start)
    +		prev = vma;
    +
     	for (nstart = start ; ; ) {
     		unsigned int newflags;
     
    @@ -81,18 +99,19 @@ static int do_mlock(unsigned long start,
     		if (!on)
     			newflags &= ~VM_LOCKED;
     
    -		if (vma->vm_end >= end) {
    -			error = mlock_fixup(vma, nstart, end, newflags);
    -			break;
    -		}
    -
     		tmp = vma->vm_end;
    -		next = vma->vm_next;
    -		error = mlock_fixup(vma, nstart, tmp, newflags);
    +		if (tmp > end)
    +			tmp = end;
    +		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
     		if (error)
     			break;
     		nstart = tmp;
    -		vma = next;
    +		if (nstart < prev->vm_end)
    +			nstart = prev->vm_end;
    +		if (nstart >= end)
    +			break;
    +
    +		vma = prev->vm_next;
     		if (!vma || vma->vm_start != nstart) {
     			error = -ENOMEM;
     			break;
    @@ -141,7 +160,7 @@ asmlinkage long sys_munlock(unsigned lon
     
     static int do_mlockall(int flags)
     {
    -	struct vm_area_struct * vma;
    +	struct vm_area_struct * vma, * prev = NULL;
     	unsigned int def_flags = 0;
     
     	if (flags & MCL_FUTURE)
    @@ -150,7 +169,7 @@ static int do_mlockall(int flags)
     	if (flags == MCL_FUTURE)
     		goto out;
     
    -	for (vma = current->mm->mmap; vma ; vma = vma->vm_next) {
    +	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
     		unsigned int newflags;
     
     		newflags = vma->vm_flags | VM_LOCKED;
    @@ -158,7 +177,7 @@ static int do_mlockall(int flags)
     			newflags &= ~VM_LOCKED;
     
     		/* Ignore errors */
    -		mlock_fixup(vma, vma->vm_start, vma->vm_end, newflags);
    +		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
     	}
     out:
     	return 0;
    ===== mm/mprotect.c 1.40 vs edited =====
    --- 1.40/mm/mprotect.c	2004-12-22 01:31:46 -08:00
    +++ edited/mm/mprotect.c	2005-02-28 11:09:39 -08:00
    @@ -185,16 +185,13 @@ mprotect_fixup(struct vm_area_struct *vm
     		goto success;
     	}
     
    +	*pprev = vma;
    +
     	if (start != vma->vm_start) {
     		error = split_vma(mm, vma, start, 1);
     		if (error)
     			goto fail;
     	}
    -	/*
    -	 * Unless it returns an error, this function always sets *pprev to
    -	 * the first vma for which vma->vm_end >= end.
    -	 */
    -	*pprev = vma;
     
     	if (end != vma->vm_end) {
     		error = split_vma(mm, vma, end, 0);
    -
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at  http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at  http://www.tux.org/lkml/
    

  • Next message: Willy Tarreau: "Re: ethertap.c compilation problem"