Attached are the 4 patches that implement dynamic unmapping of pages from the kernel linear mapping for aggressive use-after-free detection. patch-umap-core: update the cache flushing logic within change_page_attr: - use page->list for storing the pages to be freed, which avoids the need for OOM handling. - use a spinlock instead of the semaphore. patch-umap-gfp: unmap pages between free_pages and gfp. - use change_page_attr - x86-isms: sending an IPI for the flush_tlb during gfp is not possible, because gfp can be called with local interrupts disabled. This means that after free_pages, another CPU could continue to access the page if it still has a TLB entry for it. x86 CPUs do not cache negative lookup results, thus there won't be oopses due to missing flushes after alloc_pages. - use page->private to log failed unmap calls. Without it, I got an oops during boot. patch-umap-slab: add change_page_attr to slabs that support it. Implementation identical to patch-umap-gfp. patch-umap-task: increase the task_struct_cachep object size to PAGE_SIZE, so that slab can unmap the pages. 
arch/i386/Kconfig | 8 ++ arch/i386/kernel/cpu/common.c | 8 ++ arch/i386/mm/pageattr.c | 65 +++++++--------- kernel/fork.c | 12 ++- mm/page_alloc.c | 56 +++++++++++++- mm/slab.c | 166 +++++++++++++++++++++++++++--------------- 6 files changed, 217 insertions(+), 98 deletions(-) diff -puN arch/i386/mm/pageattr.c~unmap-page-debugging arch/i386/mm/pageattr.c --- 25/arch/i386/mm/pageattr.c~unmap-page-debugging 2003-05-13 20:50:19.000000000 -0700 +++ 25-akpm/arch/i386/mm/pageattr.c 2003-05-13 20:50:19.000000000 -0700 @@ -13,6 +13,10 @@ #include #include +static spinlock_t cpa_lock = SPIN_LOCK_UNLOCKED; +static struct list_head df_list = LIST_HEAD_INIT(df_list); + + static inline pte_t *lookup_address(unsigned long address) { pgd_t *pgd = pgd_offset_k(address); @@ -31,10 +35,15 @@ static struct page *split_large_page(uns { int i; unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); + struct page *base; pte_t *pbase; + + spin_unlock_irq(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + spin_lock_irq(&cpa_lock); if (!base) return NULL; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -90,7 +99,7 @@ static inline void revert_page(struct pa } static int -__change_page_attr(struct page *page, pgprot_t prot, struct page **oldpage) +__change_page_attr(struct page *page, pgprot_t prot) { pte_t *kpte; unsigned long address; @@ -126,7 +135,7 @@ __change_page_attr(struct page *page, pg } if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) { - *oldpage = kpte_page; + list_add(&kpte_page->list, &df_list); revert_page(kpte_page, address); } return 0; @@ -137,12 +146,6 @@ static inline void flush_map(void) on_each_cpu(flush_kernel_map, NULL, 1, 1); } -struct deferred_page { - struct deferred_page *next; - struct page *fpage; -}; -static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ - /* * Change the page attributes of an page in the linear mapping. 
* @@ -159,46 +162,36 @@ static struct deferred_page *df_list; /* int change_page_attr(struct page *page, int numpages, pgprot_t prot) { int err = 0; - struct page *fpage; int i; + unsigned long flags; - down_write(&init_mm.mmap_sem); + spin_lock_irqsave(&cpa_lock, flags); for (i = 0; i < numpages; i++, page++) { - fpage = NULL; - err = __change_page_attr(page, prot, &fpage); + err = __change_page_attr(page, prot); if (err) break; - if (fpage) { - struct deferred_page *df; - df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); - if (!df) { - flush_map(); - __free_page(fpage); - } else { - df->next = df_list; - df->fpage = fpage; - df_list = df; - } - } } - up_write(&init_mm.mmap_sem); + spin_unlock_irqrestore(&cpa_lock, flags); return err; } void global_flush_tlb(void) { - struct deferred_page *df, *next_df; + LIST_HEAD(l); + struct list_head* n; + + BUG_ON(irqs_disabled()); - down_read(&init_mm.mmap_sem); - df = xchg(&df_list, NULL); - up_read(&init_mm.mmap_sem); + spin_lock_irq(&cpa_lock); + list_splice_init(&df_list, &l); + spin_unlock_irq(&cpa_lock); flush_map(); - for (; df; df = next_df) { - next_df = df->next; - if (df->fpage) - __free_page(df->fpage); - kfree(df); - } + n = l.next; + while (n != &l) { + struct page *pg = list_entry(n, struct page, list); + n = n->next; + __free_page(pg); + } } EXPORT_SYMBOL(change_page_attr); diff -puN arch/i386/Kconfig~unmap-page-debugging arch/i386/Kconfig --- 25/arch/i386/Kconfig~unmap-page-debugging 2003-05-13 20:50:19.000000000 -0700 +++ 25-akpm/arch/i386/Kconfig 2003-05-13 20:50:19.000000000 -0700 @@ -1559,6 +1559,14 @@ config SPINLINE itself (as ".text.lock.filename"). This can be helpful for finding the callers of locks. +config DEBUG_PAGEALLOC + bool "Page alloc debugging" + depends on DEBUG_KERNEL + help + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. 
+ config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM diff -puN arch/i386/kernel/cpu/common.c~unmap-page-debugging arch/i386/kernel/cpu/common.c --- 25/arch/i386/kernel/cpu/common.c~unmap-page-debugging 2003-05-13 20:50:19.000000000 -0700 +++ 25-akpm/arch/i386/kernel/cpu/common.c 2003-05-13 20:50:19.000000000 -0700 @@ -430,6 +430,14 @@ void __init early_cpu_init(void) rise_init_cpu(); nexgen_init_cpu(); umc_init_cpu(); + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* pse is not compatible with on-the-fly unmapping, + * disable it even if the cpus claim to support it. + */ + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; +#endif } /* * cpu_init() initializes state that is per-CPU. Some data is already diff -puN mm/page_alloc.c~unmap-page-debugging mm/page_alloc.c --- 25/mm/page_alloc.c~unmap-page-debugging 2003-05-13 20:50:19.000000000 -0700 +++ 25-akpm/mm/page_alloc.c 2003-05-13 20:50:19.000000000 -0700 @@ -30,6 +30,8 @@ #include #include +#include +#include DECLARE_BITMAP(node_online_map, MAX_NUMNODES); DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS); @@ -52,6 +54,47 @@ static int zone_balance_ratio[MAX_NR_ZON static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +#ifdef CONFIG_DEBUG_PAGEALLOC +static int __map_pages(struct page *page, unsigned int num, pgprot_t prot) +{ + int retval; +#ifdef CONFIG_HIGHMEM + if (page >= highmem_start_page) + return -1; +#endif + retval = change_page_attr(page,num,prot); + /* we should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu. 
+ */ + __flush_tlb_all(); + return retval; +} + +static void map_pages(struct page *page, unsigned int num) +{ + if (page->private == 1) + return; + __map_pages(page, num, PAGE_KERNEL); +} + +static void unmap_pages(struct page *page, unsigned int num) +{ + if (__map_pages(page, num, __pgprot(0)) < 0) { + page->private = 1; + return ; + } + page->private = 0; +} +#else +static void unmap_pages(struct page *page, unsigned int num) +{ +} + +static void map_pages(struct page *page, unsigned int num) +{ +} +#endif + /* * Temporary debugging check for pages not lying within a given zone. */ @@ -266,6 +309,7 @@ void __free_pages_ok(struct page *page, mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); list_add(&page->list, &list); + unmap_pages(page, 1<pageset[get_cpu()].pcp[cold]; @@ -557,7 +602,7 @@ __alloc_pages(unsigned int gfp_mask, uns (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, cold); if (page) - return page; + goto got_pg; } min += z->pages_low * sysctl_lower_zone_protection; } @@ -580,7 +625,7 @@ __alloc_pages(unsigned int gfp_mask, uns (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, cold); if (page) - return page; + goto got_pg; } min += local_min * sysctl_lower_zone_protection; } @@ -595,7 +640,7 @@ rebalance: page = buffered_rmqueue(z, order, cold); if (page) - return page; + goto got_pg; } goto nopage; } @@ -623,7 +668,7 @@ rebalance: (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, cold); if (page) - return page; + goto got_pg; } min += z->pages_low * sysctl_lower_zone_protection; } @@ -654,6 +699,9 @@ nopage: current->comm, order, gfp_mask); } return NULL; +got_pg: + map_pages(page, 1 << order); + return page; } /* diff -puN mm/slab.c~unmap-page-debugging mm/slab.c --- 25/mm/slab.c~unmap-page-debugging 2003-05-13 20:50:19.000000000 -0700 +++ 25-akpm/mm/slab.c 2003-05-13 20:50:19.000000000 -0700 @@ -85,6 +85,8 @@ #include #include #include 
+#include +#include /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, @@ -204,8 +206,8 @@ struct arraycache_init { #define SHARED_ARRAY_FACTOR 16 struct kmem_list3 { struct list_head slabs_partial; /* partial list first, better asm code */ - struct list_head slabs_full; struct list_head slabs_free; + unsigned long slab_cnt; unsigned long free_objects; int free_touched; unsigned long next_reap; @@ -214,7 +216,6 @@ struct kmem_list3 { #define LIST3_INIT(parent) \ { \ - .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \ .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \ .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \ } @@ -755,6 +756,49 @@ static inline void kmem_freepages (kmem_ } #if DEBUG + +#ifdef CONFIG_DEBUG_PAGEALLOC +static int __map_pages(struct page *page, unsigned int num, pgprot_t prot) +{ + int retval; + + retval = change_page_attr(page,num,prot); + /* we should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu. + */ + __flush_tlb_all(); + return retval; +} + +static void map_pages(void *objp, unsigned int size) +{ + struct page *pg = virt_to_page(objp); + + if (pg->private == 1) + return; + __map_pages(pg, size/PAGE_SIZE, PAGE_KERNEL); +} + +static void unmap_pages(void *objp, unsigned int size) +{ + struct page *pg = virt_to_page(objp); + + if (__map_pages(pg, size/PAGE_SIZE, __pgprot(0)) < 0) { + pg->private = 1; + return ; + } + pg->private = 0; +} +#else +static void map_pages(void *objp, unsigned int size) +{ +} + +static void unmap_pages(void *objp, unsigned int size) +{ +} +#endif + static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) { int size = cachep->objsize; @@ -848,8 +892,12 @@ static void slab_destroy (kmem_cache_t * void *objp = slabp->s_mem + cachep->objsize * i; int objlen = cachep->objsize; - if (cachep->flags & SLAB_POISON) + if (cachep->flags & SLAB_POISON) { + if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) + map_pages(objp, 
cachep->objsize); + check_poison_obj(cachep, objp); + } if (cachep->flags & SLAB_STORE_USER) objlen -= BYTES_PER_WORD; @@ -1084,7 +1132,6 @@ next: spin_lock_init(&cachep->spinlock); cachep->objsize = size; /* NUMA */ - INIT_LIST_HEAD(&cachep->lists.slabs_full); INIT_LIST_HEAD(&cachep->lists.slabs_partial); INIT_LIST_HEAD(&cachep->lists.slabs_free); @@ -1249,12 +1296,12 @@ static int __cache_shrink(kmem_cache_t * list_del(&slabp->list); cachep->lists.free_objects -= cachep->num; + cachep->lists.slab_cnt--; spin_unlock_irq(&cachep->spinlock); slab_destroy(cachep, slabp); spin_lock_irq(&cachep->spinlock); } - ret = !list_empty(&cachep->lists.slabs_full) || - !list_empty(&cachep->lists.slabs_partial); + ret = cachep->lists.slab_cnt; spin_unlock_irq(&cachep->spinlock); return ret; } @@ -1363,8 +1410,11 @@ static void cache_init_objs (kmem_cache_ #if DEBUG int objlen = cachep->objsize; /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) + if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_BEFORE); + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + unmap_pages(objp, cachep->objsize); + } if (cachep->flags & SLAB_STORE_USER) { objlen -= BYTES_PER_WORD; ((unsigned long*)(objp+objlen))[0] = 0; @@ -1499,6 +1549,7 @@ static int cache_grow (kmem_cache_t * ca list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free)); STATS_INC_GROWN(cachep); list3_data(cachep)->free_objects += cachep->num; + list3_data(cachep)->slab_cnt++; spin_unlock(&cachep->spinlock); return 1; opps1: @@ -1589,8 +1640,11 @@ static inline void *cache_free_debugchec else cachep->dtor(objp, cachep, 0); } - if (cachep->flags & SLAB_POISON) + if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_AFTER); + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + unmap_pages(objp, cachep->objsize); + } #endif return objp; } @@ -1611,35 +1665,6 @@ static inline void check_slabp(kmem_cach #endif } -static inline void * cache_alloc_one_tail (kmem_cache_t 
*cachep, - struct slab *slabp) -{ - void *objp; - - check_spinlock_acquired(cachep); - - STATS_INC_ALLOCED(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - /* get obj pointer */ - slabp->inuse++; - objp = slabp->s_mem + slabp->free*cachep->objsize; - slabp->free=slab_bufctl(slabp)[slabp->free]; - - return objp; -} - -static inline void cache_alloc_listfixup(struct kmem_list3 *l3, struct slab *slabp) -{ - list_del(&slabp->list); - if (slabp->free == BUFCTL_END) { - list_add(&slabp->list, &l3->slabs_full); - } else { - list_add(&slabp->list, &l3->slabs_partial); - } -} - static void* cache_alloc_refill(kmem_cache_t* cachep, int flags) { int batchcount; @@ -1688,11 +1713,29 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); - while (slabp->inuse < cachep->num && batchcount--) - ac_entry(ac)[ac->avail++] = - cache_alloc_one_tail(cachep, slabp); + for (;;) { + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + /* get obj pointer */ + ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize; + slabp->inuse++; + slabp->free=slab_bufctl(slabp)[slabp->free]; + batchcount--; + if (slabp->free == BUFCTL_END) { + list_del_init(&slabp->list); + break; + } + if (!batchcount) + break; + } + if (unlikely(slabp->list.prev == &l3->slabs_free)) { + list_del(&slabp->list); + list_add(&slabp->list, &l3->slabs_partial); + } + check_slabp(cachep, slabp); - cache_alloc_listfixup(l3, slabp); } must_grow: @@ -1735,8 +1778,13 @@ cache_alloc_debugcheck_after(kmem_cache_ if (!objp) return objp; - if (cachep->flags & SLAB_POISON) + if (cachep->flags & SLAB_POISON) { + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { + map_pages(objp, cachep->objsize); + } + check_poison_obj(cachep, objp); + } if (cachep->flags & SLAB_STORE_USER) { objlen -= BYTES_PER_WORD; *((void **)(objp+objlen)) = caller; @@ -1811,7 +1859,6 @@ static void free_block(kmem_cache_t *cac unsigned int objnr; slabp = 
GET_PAGE_SLAB(virt_to_page(objp)); - list_del(&slabp->list); objnr = (objp - slabp->s_mem) / cachep->objsize; check_slabp(cachep, slabp); slab_bufctl(slabp)[objnr] = slabp->free; @@ -1822,20 +1869,24 @@ static void free_block(kmem_cache_t *cac /* fixup slab chains */ if (slabp->inuse == 0) { + list_del(&slabp->list); if (cachep->lists.free_objects > cachep->free_limit) { cachep->lists.free_objects -= cachep->num; + cachep->lists.slab_cnt--; slab_destroy(cachep, slabp); } else { list_add(&slabp->list, &list3_data_ptr(cachep, objp)->slabs_free); } } else { - /* Unconditionally move a slab to the end of the - * partial list on free - maximum time for the - * other objects to be freed, too. + /* list_empty means that slabp->list references itself, + * which means it's an unlinked full slab. + * Link it at the tail of the partial list. */ - list_add_tail(&slabp->list, - &list3_data_ptr(cachep, objp)->slabs_partial); + if (list_empty(&slabp->list)) { + list_add_tail(&slabp->list, + &list3_data_ptr(cachep, objp)->slabs_partial); + } } } } @@ -2339,6 +2390,7 @@ static inline void cache_reap (void) * cache_chain_lock */ searchp->lists.free_objects -= searchp->num; + searchp->lists.slab_cnt--; spin_unlock_irq(&searchp->spinlock); slab_destroy(searchp, slabp); spin_lock_irq(&searchp->spinlock); @@ -2427,13 +2479,6 @@ static int s_show(struct seq_file *m, vo spin_lock_irq(&cachep->spinlock); active_objs = 0; num_slabs = 0; - list_for_each(q,&cachep->lists.slabs_full) { - slabp = list_entry(q, struct slab, list); - if (slabp->inuse != cachep->num && !error) - error = "slabs_full accounting error"; - active_objs += cachep->num; - active_slabs++; - } list_for_each(q,&cachep->lists.slabs_partial) { slabp = list_entry(q, struct slab, list); if (slabp->inuse == cachep->num && !error) @@ -2449,7 +2494,16 @@ static int s_show(struct seq_file *m, vo error = "slabs_free/inuse accounting error"; num_slabs++; } - num_slabs+=active_slabs; + /* full slabs are hiding, figure out how many 
exist and + * update the counters. + */ + { + unsigned long slabs_full; + slabs_full = cachep->lists.slab_cnt-num_slabs-active_slabs; + active_objs += cachep->num*slabs_full; + active_slabs += slabs_full; + } + num_slabs = cachep->lists.slab_cnt; num_objs = num_slabs*cachep->num; if (num_objs - active_objs != cachep->lists.free_objects && !error) error = "free_objects accounting error"; diff -puN kernel/fork.c~unmap-page-debugging kernel/fork.c --- 25/kernel/fork.c~unmap-page-debugging 2003-05-13 20:50:19.000000000 -0700 +++ 25-akpm/kernel/fork.c 2003-05-13 20:50:19.000000000 -0700 @@ -187,10 +187,18 @@ int autoremove_wake_function(wait_queue_ void __init fork_init(unsigned long mempages) { /* create a slab on which task_structs can be allocated */ +#ifdef CONFIG_DEBUG_PAGEALLOC task_struct_cachep = kmem_cache_create("task_struct", - sizeof(struct task_struct),0, - SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); + min((size_t)PAGE_SIZE, sizeof(struct task_struct)), + 0, SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); +#else + task_struct_cachep = + kmem_cache_create("task_struct", + sizeof(struct task_struct), + 0, SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); +#endif + if (!task_struct_cachep) panic("fork_init(): cannot create task_struct SLAB cache"); _