[RFC 1/3] non-resident page tracking

From: Rik van Riel (riel_at_redhat.com)
Date: 08/08/05

  • Next message: Sonny Rao: "Re: OOPS in 2.6.13-rc1-mm1 -- EIP is at sysfs_release+0x49/0xb0"
    Date:	Mon, 08 Aug 2005 16:14:17 -0400
    To: linux-mm@kvack.org
    
    

    Track non-resident pages through a simple hashing scheme. This way
    the space overhead is limited to one u32 per page (about 0.1% of memory),
    and a lookup costs one cache miss.

    Aside from seeing whether or not a page was recently evicted, we can
    also take a reasonable guess at how many other pages were evicted since
    this page was evicted.

    Signed-off-by: Rik van Riel <riel@redhat.com>

    Index: linux-2.6.12-vm/include/linux/swap.h
    ===================================================================
    --- linux-2.6.12-vm.orig/include/linux/swap.h
    +++ linux-2.6.12-vm/include/linux/swap.h
    @@ -153,6 +153,11 @@ extern void out_of_memory(unsigned int _
     /* linux/mm/memory.c */
     extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
     
    +/* linux/mm/nonresident.c */
    +extern int remember_page(struct address_space *, unsigned long);
    +extern int recently_evicted(struct address_space *, unsigned long);
    +extern void init_nonresident(void);
    +
     /* linux/mm/page_alloc.c */
     extern unsigned long totalram_pages;
     extern unsigned long totalhigh_pages;
    @@ -288,6 +293,11 @@ static inline swp_entry_t get_swap_page(
     #define grab_swap_token() do { } while(0)
     #define has_swap_token(x) 0
     
    +/* linux/mm/nonresident.c stubs; callers test ">= 0", so a miss must be -1 */
    +#define init_nonresident() do { } while (0)
    +#define remember_page(x,y) 0
    +#define recently_evicted(x,y) (-1)
    +
     #endif /* CONFIG_SWAP */
     #endif /* __KERNEL__*/
     #endif /* _LINUX_SWAP_H */
    Index: linux-2.6.12-vm/init/main.c
    ===================================================================
    --- linux-2.6.12-vm.orig/init/main.c
    +++ linux-2.6.12-vm/init/main.c
    @@ -47,6 +47,7 @@
     #include <linux/rmap.h>
     #include <linux/mempolicy.h>
     #include <linux/key.h>
    +#include <linux/swap.h>
     
     #include <asm/io.h>
     #include <asm/bugs.h>
    @@ -488,6 +489,7 @@ asmlinkage void __init start_kernel(void
             }
     #endif
             vfs_caches_init_early();
    + init_nonresident();
             mem_init();
             kmem_cache_init();
             numa_policy_init();
    Index: linux-2.6.12-vm/mm/Makefile
    ===================================================================
    --- linux-2.6.12-vm.orig/mm/Makefile
    +++ linux-2.6.12-vm/mm/Makefile
    @@ -12,7 +12,8 @@ obj-y := bootmem.o filemap.o mempool.o
                                readahead.o slab.o swap.o truncate.o vmscan.o \
                                prio_tree.o $(mmu-y)
     
    -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
    +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o \
    + nonresident.o
     obj-$(CONFIG_HUGETLBFS) += hugetlb.o
     obj-$(CONFIG_NUMA) += mempolicy.o
     obj-$(CONFIG_SHMEM) += shmem.o
    Index: linux-2.6.12-vm/mm/nonresident.c
    ===================================================================
    --- /dev/null
    +++ linux-2.6.12-vm/mm/nonresident.c
    @@ -0,0 +1,157 @@
    +/*
    + * mm/nonresident.c
    + * (C) 2004,2005 Red Hat, Inc
    + * Written by Rik van Riel <riel@redhat.com>
    + * Released under the GPL, see the file COPYING for details.
    + *
    + * Keeps track of whether a non-resident page was recently evicted
    + * and should be immediately promoted to the active list. This also
    + * helps automatically tune the inactive target.
    + *
    + * The pageout code stores a recently evicted page in this cache
    + * by calling remember_page(mapping, index) and can look it up
    + * again by calling recently_evicted() with the same arguments;
    + * a lookup that hits also removes the entry from the cache.
    + *
    + * Note that there is no way to invalidate pages after eg. truncate
    + * or exit, we let the pages fall out of the non-resident set through
    + * normal replacement.
    + */
    +#include <linux/mm.h>
    +#include <linux/cache.h>
    +#include <linux/spinlock.h>
    +#include <linux/bootmem.h>
    +#include <linux/hash.h>
    +#include <linux/prefetch.h>
    +#include <linux/kernel.h>
    +
    +/* Number of non-resident pages per hash bucket: the u32 slots left in one cache line after the hand */
    +#define NUM_NR ((L1_CACHE_BYTES - sizeof(atomic_t))/sizeof(u32))
    +
    +struct nr_bucket
    +{
    + atomic_t hand; /* clock hand: advanced by remember_page() to pick the slot to overwrite */
    + u32 page[NUM_NR]; /* cookies from nr_cookie(); recently_evicted() resets a hit to 0 */
    +} ____cacheline_aligned;
    +
    +/* The non-resident page hash table, allocated in init_nonresident(). */
    +static struct nr_bucket * nonres_table;
    +static unsigned int nonres_shift; /* handed to alloc_large_system_hash(); 1 << nonres_shift is used as the bucket count */
    +static unsigned int nonres_mask; /* handed to alloc_large_system_hash(); ANDed with the hash in nr_hash() */
    +
    +/* Pick the nonres_table bucket that a (mapping, index) pair hashes to. */
    +struct nr_bucket * nr_hash(void * mapping, unsigned long index)
    +{
    +	unsigned long key = hash_ptr(mapping, BITS_PER_LONG);
    +
    +	/* Fold in the page index, then keep only the bits selected by the table mask. */
    +	key = 37 * key + hash_long(index, BITS_PER_LONG);
    +	key &= nonres_mask;
    +
    +	return &nonres_table[key];
    +}
    +
    +/* Build the 32-bit tag that is stored in a bucket slot for (mapping, index). */
    +static u32 nr_cookie(struct address_space * mapping, unsigned long index)
    +{
    +	unsigned long tag = 37 * hash_ptr(mapping, BITS_PER_LONG);
    +
    +	tag += hash_long(index, BITS_PER_LONG);
    +	if (mapping->host)	/* mix in the inode number when a host inode exists */
    +		tag = 37 * tag + hash_long(mapping->host->i_ino, BITS_PER_LONG);
    +
    +	return (u32)(tag >> (BITS_PER_LONG - 32));	/* most significant 32 bits */
    +}
    +
    +/* Was (mapping, index) recently evicted? A hit clears its slot; a miss returns -1. */
    +int recently_evicted(struct address_space * mapping, unsigned long index)
    +{
    +	struct nr_bucket *bucket;
    +	u32 cookie;
    +	int slot, dist;
    +
    +	prefetch(mapping->host);
    +	bucket = nr_hash(mapping, index);
    +	prefetch(bucket);
    +	cookie = nr_cookie(mapping, index);
    +
    +	for (slot = 0; slot < NUM_NR; slot++) {
    +		if (bucket->page[slot] != cookie)
    +			continue;
    +		bucket->page[slot] = 0;
    +		/* Hit: count the slots between this entry and the clock hand (1..NUM_NR)... */
    +		dist = atomic_read(&bucket->hand) + NUM_NR - slot;
    +		dist = (dist % NUM_NR) + 1;
    +		/* ...and scale that by the number of buckets in the table. */
    +		return dist * (1 << nonres_shift);
    +	}
    +
    +	return -1;
    +}
    +
    +int remember_page(struct address_space * mapping, unsigned long index) /* record (mapping, index) as evicted; returns the slot's previous cookie */
    +{
    + struct nr_bucket * nr_bucket;
    + u32 nrpage;
    + int i;
    +
    + prefetch(mapping->host);
    + nr_bucket = nr_hash(mapping, index);
    +
    + prefetchw(nr_bucket);
    + nrpage = nr_cookie(mapping, index);
    +
    + /* Atomically find the next array index, wrapping the hand once it runs past the last slot. */
    + preempt_disable();
    + retry:
    + i = atomic_inc_return(&nr_bucket->hand);
    + if (unlikely(i >= NUM_NR)) {
    + if (i == NUM_NR) /* only the caller that landed exactly on NUM_NR resets the hand */
    + atomic_set(&nr_bucket->hand, -1); /* the next increment then yields slot 0 */
    + goto retry; /* callers that overshot retry as well */
    + }
    + preempt_enable();
    +
    + /* Store the new cookie. Statistics may want to know whether the entry was in use. */
    + return xchg(&nr_bucket->page[i], nrpage);
    +}
    +
    +/*
    + * For interactive workloads, we remember about as many non-resident pages
    + * as we have actual memory pages. For server workloads with large inter-
    + * reference distances we could benefit from remembering more.
    + */
    +static __initdata unsigned long nonresident_factor = 1;
    +/* Size and allocate the non-resident hash table at boot, then reset every bucket's hand. */
    +void __init init_nonresident(void)
    +{
    +	unsigned long target;	/* unsigned long like the page counts; an int would truncate the product */
    +	unsigned long i;
    +
    +	/*
    +	 * Calculate the non-resident hash bucket target. Use a power of
    +	 * two for the division because alloc_large_system_hash rounds up.
    +	 */
    +	target = nr_all_pages * nonresident_factor;
    +	target /= (sizeof(struct nr_bucket) / sizeof(u32));
    +
    +	nonres_table = alloc_large_system_hash("Non-resident page tracking",
    +					sizeof(struct nr_bucket),
    +					target,
    +					0,
    +					HASH_EARLY | HASH_HIGHMEM,
    +					&nonres_shift, &nonres_mask,
    +					0);
    +
    +	for (i = 0; i < (1UL << nonres_shift); i++)	/* 1UL: keep the shift and compare out of int range */
    +		atomic_set(&nonres_table[i].hand, 0);
    +}
    +
    +static int __init set_nonresident_factor(char * str)
    +{
    +	int parsed = (str != NULL);	/* 0 when no value was supplied, 1 otherwise */
    +	if (parsed)
    +		nonresident_factor = simple_strtoul(str, &str, 0);
    +	return parsed;
    +}
    +__setup("nonresident_factor=", set_nonresident_factor);
    Index: linux-2.6.12-vm/mm/vmscan.c
    ===================================================================
    --- linux-2.6.12-vm.orig/mm/vmscan.c
    +++ linux-2.6.12-vm/mm/vmscan.c
    @@ -509,6 +509,7 @@ static int shrink_list(struct list_head
     #ifdef CONFIG_SWAP
                     if (PageSwapCache(page)) {
                             swp_entry_t swap = { .val = page->private };
    + remember_page(&swapper_space, page->private);
                             __delete_from_swap_cache(page);
                             write_unlock_irq(&mapping->tree_lock);
                             swap_free(swap);
    @@ -517,6 +518,7 @@ static int shrink_list(struct list_head
                     }
     #endif /* CONFIG_SWAP */
     
    + remember_page(page->mapping, page->index);
                     __remove_from_page_cache(page);
                     write_unlock_irq(&mapping->tree_lock);
                     __put_page(page);
    Index: linux-2.6.12-vm/mm/filemap.c
    ===================================================================
    --- linux-2.6.12-vm.orig/mm/filemap.c
    +++ linux-2.6.12-vm/mm/filemap.c
    @@ -400,8 +400,13 @@ int add_to_page_cache_lru(struct page *p
                                     pgoff_t offset, int gfp_mask)
     {
             int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
    - if (ret == 0)
    - lru_cache_add(page);
    + int activate = recently_evicted(mapping, offset);
    + if (ret == 0) {
    + if (activate >= 0)
    + lru_cache_add_active(page);
    + else
    + lru_cache_add(page);
    + }
             return ret;
     }
     
    Index: linux-2.6.12-vm/mm/swap_state.c
    ===================================================================
    --- linux-2.6.12-vm.orig/mm/swap_state.c
    +++ linux-2.6.12-vm/mm/swap_state.c
    @@ -323,6 +323,7 @@ struct page *read_swap_cache_async(swp_e
                             struct vm_area_struct *vma, unsigned long addr)
     {
             struct page *found_page, *new_page = NULL;
    + int activate;
             int err;
     
             do {
    @@ -344,6 +345,8 @@ struct page *read_swap_cache_async(swp_e
                                     break; /* Out of memory */
                     }
     
    + activate = recently_evicted(&swapper_space, entry.val);
    +
                     /*
                      * Associate the page with swap entry in the swap cache.
                      * May fail (-ENOENT) if swap entry has been freed since
    @@ -359,7 +362,10 @@ struct page *read_swap_cache_async(swp_e
                             /*
                              * Initiate read into locked page and return.
                              */
    - lru_cache_add_active(new_page);
    + if (activate >= 0)
    + lru_cache_add_active(new_page);
    + else
    + lru_cache_add(new_page);
                             swap_readpage(NULL, new_page);
                             return new_page;
                     }

    --
    -- 
    All Rights Reversed
    -
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at  http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at  http://www.tux.org/lkml/
    

  • Next message: Sonny Rao: "Re: OOPS in 2.6.13-rc1-mm1 -- EIP is at sysfs_release+0x49/0xb0"

    Relevant Pages

    • [RFC] non resident page management, #4
      ... * Keeps track of whether a non-resident page was recently evicted ... +extern int recently_evicted ... +static unsigned long nr_cookie(struct address_space * mapping, unsigned long offset) ...
      (Linux-Kernel)
    • [PATCH/RFT 1/5] CLOCK-Pro page replacement
      ... Track non-resident pages through a simple hashing scheme. ... +extern int recently_evicted ... +static u32 nr_cookie(struct address_space * mapping, ...
      (Linux-Kernel)
    • Re: Real Microkernel Need C coders
      ... extern int vga_setmode; ... extern int vga_drawscanline(int line, unsigned char *colors); ... extern void vga_setreadpage; ...
      (comp.lang.c)
    • Re: Real Microkernel Need C coders
      ... extern int vga_setmode; ... extern int vga_drawscanline(int line, unsigned char *colors); ... extern void vga_setreadpage; ...
      (comp.lang.c)
    • Re: Real Microkernel Need C coders
      ... extern int vga_setmode; ... extern int vga_drawscanline(int line, unsigned char *colors); ... extern void vga_setreadpage; ...
      (comp.lang.c)