Return-Path: X-Sieve: cmu-sieve 2.0 Return-path: Envelope-to: mbligh@localhost Delivery-date: Wed, 17 Mar 2004 14:23:35 -0800 Received: from w-mbligh.beaverton.ibm.com ([127.0.0.1] helo=mail.aracnet.com ident=mbligh) by w-mbligh.beaverton.ibm.com with esmtp (Exim 3.35 #1 (Debian)) id 1B3jRm-0002x1-00 for ; Wed, 17 Mar 2004 14:23:34 -0800 Received: from psmtp.com (exprod5mx18.postini.com [12.158.34.158]) by obsidian.spiritone.com (8.12.10/8.12.8) with SMTP id i2HMQSkT010825 for ; Wed, 17 Mar 2004 14:26:28 -0800 Delivered-To: Received: from source ([32.97.110.131]) by exprod5mx18.postini.com ([12.158.34.245]) with SMTP; Wed, 17 Mar 2004 17:19:46 EST Received: from westrelay04.boulder.ibm.com (westrelay04.boulder.ibm.com [9.17.193.32]) by e33.co.us.ibm.com (8.12.10/8.12.2) with ESMTP id i2HMJjfS792882 for ; Wed, 17 Mar 2004 17:19:45 -0500 Received: from DYN317989BLD.beaverton.ibm.com (d03av02.boulder.ibm.com [9.17.193.82]) by westrelay04.boulder.ibm.com (8.12.10/NCO/VER6.6) with ESMTP id i2HMJi3D113458 for ; Wed, 17 Mar 2004 15:19:45 -0700 Subject: 2.6.4-mjb1 : 760-implicit_hugetlb From: Adam Litke To: Martin Bligh Content-Type: text/plain Organization: IBM Message-Id: <1079561652.5224.1.camel@agtpad> Mime-Version: 1.0 X-Mailer: Ximian Evolution 1.4.5 Date: Wed, 17 Mar 2004 14:14:13 -0800 Content-Transfer-Encoding: 7bit X-Accept: 2.6 or must-fix diff -upN reference/arch/ppc64/mm/hugetlbpage.c current/arch/ppc64/mm/hugetlbpage.c --- reference/arch/ppc64/mm/hugetlbpage.c 2004-04-29 10:39:11.000000000 -0700 +++ current/arch/ppc64/mm/hugetlbpage.c 2004-04-29 10:39:27.000000000 -0700 @@ -307,6 +307,21 @@ int prepare_hugepage_range(unsigned long return -EINVAL; } +int close_32bit_htlbpage_range(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + BUG_ON(mm->context.low_hpages == 0); + + /* Check if any vmas are in the region */ + vma = find_vma(mm, TASK_HPAGE_BASE_32); + if (vma && vma->vm_start < TASK_HPAGE_END_32) + return -EBUSY; + + mm->context.low_hpages = 0; + 
return 0; +} + int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { @@ -637,8 +652,11 @@ unsigned long hugetlb_get_unmapped_area( for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ - if (addr + len > end) + if (addr + len > end) { + if (test_thread_flag(TIF_32BIT)) + close_32bit_htlbpage_range(current->mm); return -ENOMEM; + } if (!vma || (addr + len) <= vma->vm_start) return addr; addr = ALIGN(vma->vm_end, HPAGE_SIZE); diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c --- reference/fs/hugetlbfs/inode.c 2004-04-29 10:39:24.000000000 -0700 +++ current/fs/hugetlbfs/inode.c 2004-04-29 10:39:27.000000000 -0700 @@ -26,12 +26,17 @@ #include #include #include +#include #include +#include /* some random number */ #define HUGETLBFS_MAGIC 0x958458f6 +extern int mmap_use_hugepages; +extern int mmap_hugepages_map_sz; + static struct super_operations hugetlbfs_ops; static struct address_space_operations hugetlbfs_aops; struct file_operations hugetlbfs_file_operations; @@ -82,7 +87,7 @@ static int hugetlbfs_file_mmap(struct fi unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #else -static unsigned long +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { diff -upN reference/include/asm-i386/mman.h current/include/asm-i386/mman.h --- reference/include/asm-i386/mman.h 2003-10-14 15:50:32.000000000 -0700 +++ current/include/asm-i386/mman.h 2004-04-29 10:39:27.000000000 -0700 @@ -16,6 +16,7 @@ #define MAP_ANONYMOUS 0x20 /* don't use a file */ #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ +#define MAP_HUGETLB 0x0400 /* Backed by hugetlb pages */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ #define 
MAP_LOCKED 0x2000 /* pages are locked */ diff -upN reference/include/asm-ppc64/mman.h current/include/asm-ppc64/mman.h --- reference/include/asm-ppc64/mman.h 2003-10-01 11:48:24.000000000 -0700 +++ current/include/asm-ppc64/mman.h 2004-04-29 10:39:27.000000000 -0700 @@ -26,6 +26,7 @@ #define MAP_LOCKED 0x80 #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ +#define MAP_HUGETLB 0x0400 /* Backed with hugetlb pages */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ diff -upN reference/include/linux/hugetlb.h current/include/linux/hugetlb.h --- reference/include/linux/hugetlb.h 2004-04-07 14:54:36.000000000 -0700 +++ current/include/linux/hugetlb.h 2004-04-29 10:39:27.000000000 -0700 @@ -50,6 +50,9 @@ mark_mm_hugetlb(struct mm_struct *mm, st int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif +unsigned long try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long *flags); + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) @@ -123,12 +126,21 @@ static inline void set_file_hugepages(st { file->f_op = &hugetlbfs_file_operations; } + +unsigned long +hugetlb_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() #define hugetlb_zero_setup(size) ERR_PTR(-ENOSYS) +static inline unsigned long +hugetlb_get_unmapped_area(struct file * a, unsigned long b, unsigned long c, + unsigned long d, unsigned long e) { return -ENOSYS; } #endif /* !CONFIG_HUGETLBFS */ + + #endif /* _LINUX_HUGETLB_H */ diff -upN reference/include/linux/mman.h current/include/linux/mman.h --- reference/include/linux/mman.h 2003-10-14 15:50:34.000000000 -0700 +++ current/include/linux/mman.h 2004-04-29 10:39:27.000000000 -0700 @@ -58,6 +58,9 @@ calc_vm_flag_bits(unsigned 
long flags) return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | +#ifdef CONFIG_HUGETLB_PAGE + _calc_vm_trans(flags, MAP_HUGETLB, VM_HUGETLB ) | +#endif _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); } diff -upN reference/include/linux/sysctl.h current/include/linux/sysctl.h --- reference/include/linux/sysctl.h 2004-04-07 14:54:37.000000000 -0700 +++ current/include/linux/sysctl.h 2004-04-29 10:39:27.000000000 -0700 @@ -131,6 +131,10 @@ enum KERN_PRINTK_RATELIMIT_BURST=61, /* int: tune printk ratelimiting */ KERN_PTY=62, /* dir: pty driver */ KERN_NGROUPS_MAX=63, /* int: NGROUPS_MAX */ + KERN_SHMUSEHUGEPAGES=64, /* int: back shm with huge pages */ + KERN_MMAPUSEHUGEPAGES=65, /* int: back anon mmap with huge pages */ + KERN_HPAGES_PER_FILE=66, /* int: max bigpages per file */ + KERN_HPAGES_MAP_SZ=67, /* int: min size (MB) of mapping */ }; diff -upN reference/ipc/shm.c current/ipc/shm.c --- reference/ipc/shm.c 2004-04-07 14:54:37.000000000 -0700 +++ current/ipc/shm.c 2004-04-29 10:39:27.000000000 -0700 @@ -32,6 +32,9 @@ #define shm_flags shm_perm.mode +extern int shm_use_hugepages; +extern int shm_hugepages_per_file; + static struct file_operations shm_file_operations; static struct vm_operations_struct shm_vm_ops; @@ -165,6 +168,31 @@ static struct vm_operations_struct shm_v .nopage = shmem_nopage, }; +#ifdef CONFIG_HUGETLBFS +int shm_with_hugepages(int shmflag, size_t size) +{ + /* flag specified explicitly */ + if (shmflag & SHM_HUGETLB) + return 1; + /* Are we disabled? */ + if (!shm_use_hugepages) + return 0; + /* Must be HPAGE aligned */ + if (size & ~HPAGE_MASK) + return 0; + /* Are we under the max per file? */ + if ((size >> HPAGE_SHIFT) > shm_hugepages_per_file) + return 0; + /* Do we have enough free huge pages? 
*/ + if (!is_hugepage_mem_enough(size)) + return 0; + + return 1; +} +#else +int shm_with_hugepages(int shmflag, size_t size) { return 0; } +#endif + static int newseg (key_t key, int shmflg, size_t size) { int error; @@ -194,8 +222,10 @@ static int newseg (key_t key, int shmflg return error; } - if (shmflg & SHM_HUGETLB) + if (shm_with_hugepages(shmflg, size)) { + shmflg |= SHM_HUGETLB; file = hugetlb_zero_setup(size); + } else { sprintf (name, "SYSV%08x", key); file = shmem_file_setup(name, size, VM_ACCOUNT); diff -upN reference/kernel/sysctl.c current/kernel/sysctl.c --- reference/kernel/sysctl.c 2004-04-29 10:39:19.000000000 -0700 +++ current/kernel/sysctl.c 2004-04-29 10:39:27.000000000 -0700 @@ -64,6 +64,8 @@ extern int sysctl_lower_zone_protection; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; +extern int shm_use_hugepages, shm_hugepages_per_file; +extern int mmap_use_hugepages, mmap_hugepages_map_sz; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -615,6 +617,40 @@ static ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_HUGETLBFS + { + .ctl_name = KERN_SHMUSEHUGEPAGES, + .procname = "shm-use-hugepages", + .data = &shm_use_hugepages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MMAPUSEHUGEPAGES, + .procname = "mmap-use-hugepages", + .data = &mmap_use_hugepages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_HPAGES_PER_FILE, + .procname = "shm-hugepages-per-file", + .data = &shm_hugepages_per_file, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_HPAGES_MAP_SZ, + .procname = "mmap-hugepages-min-mapping", + .data = &mmap_hugepages_map_sz, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 
0 } }; diff -upN reference/mm/mmap.c current/mm/mmap.c --- reference/mm/mmap.c 2004-04-29 10:39:14.000000000 -0700 +++ current/mm/mmap.c 2004-04-29 10:39:27.000000000 -0700 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,9 @@ EXPORT_SYMBOL(sysctl_overcommit_ratio); EXPORT_SYMBOL(sysctl_max_map_count); EXPORT_SYMBOL(vm_committed_space); +int mmap_use_hugepages = 0; +int mmap_hugepages_map_sz = 256; + /* * Requires inode->i_mapping->i_shared_sem */ @@ -474,6 +478,46 @@ static struct vm_area_struct *vma_merge( return NULL; } +#ifdef CONFIG_HUGETLBFS +int mmap_hugetlb_implicit(unsigned long len) +{ + /* Are we enabled? */ + if (!mmap_use_hugepages) + return 0; + /* Must be HPAGE aligned */ + if (len & ~HPAGE_MASK) + return 0; + /* Are we under the minimum size? */ + if (mmap_hugepages_map_sz + && len < (mmap_hugepages_map_sz << 20)) + return 0; + + return 1; +} +#else +int mmap_hugetlb_implicit(unsigned long len) { return 0; } +#endif + +unsigned long +try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long *flags) +{ + if (!capable(CAP_IPC_LOCK)) + return -EPERM; + + if (*flags & MAP_HUGETLB) { + return hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags); + } + + if (mmap_hugetlb_implicit(len)) { + addr = hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags); + if (!(addr & ~HPAGE_MASK)) + *flags |= MAP_HUGETLB; + return addr; + } + return -ENOMEM; +} + /* * The caller must hold down_write(current->mm->mmap_sem). */ @@ -490,7 +534,8 @@ unsigned long do_mmap_pgoff(struct file int error; struct rb_node ** rb_link, * rb_parent; int accountable = 1; - unsigned long charged = 0; + unsigned long charged = 0, addr_save = addr; + int hugetlb_explicit = (flags & MAP_HUGETLB) != 0; if (file) { if (is_file_hugepages(file)) @@ -521,8 +566,14 @@ unsigned long do_mmap_pgoff(struct file /* Obtain the address to map to. 
we verify (or select) it and ensure * that it represents a valid section of the address space. + * VM_HUGETLB will never appear in vm_flags when CONFIG_HUGETLB is + * unset. */ - addr = get_unmapped_area(file, addr, len, pgoff, flags); + addr = try_hugetlb_get_unmapped_area(NULL, addr, len, pgoff, &flags); + if (!(flags & MAP_HUGETLB)) +hugetlb_fallback: + addr = get_unmapped_area(file, addr_save, len, pgoff, flags); + if (addr & ~PAGE_MASK) return addr; @@ -671,10 +722,44 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - } else if (vm_flags & VM_SHARED) { - error = shmem_zero_setup(vma); - if (error) - goto free_vma; + } else if ((vm_flags & VM_SHARED) || (vm_flags & VM_HUGETLB)) { + if (!is_vm_hugetlb_page(vma)) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } else { + /* + * Presumably hugetlb_zero_setup() acquires a + * reference count for us. The difference + * between this and the shmem_zero_setup() + * case is that we can encounter an error + * _after_ allocating the file. The error + * path was adjusted slightly to fput() for us. + */ + struct file *new_file = hugetlb_zero_setup(len); + if (IS_ERR(new_file)) { + if (hugetlb_explicit) { + error = PTR_ERR(new_file); + goto free_vma; + } else { + /* + * We tried an implicit hugetlb mmap + * but we failed to get the pages. + * We basically have to start over. + */ + flags &= ~MAP_HUGETLB; + kmem_cache_free(vm_area_cachep, vma); + if (charged) + vm_unacct_memory(charged); + goto hugetlb_fallback; + } + } else { + vma->vm_file = new_file; + error = new_file->f_op->mmap(new_file, vma); + if (error) + goto unmap_and_free_vma; + } + } } /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform @@ -722,11 +807,21 @@ out: unmap_and_free_vma: if (correct_wcount) atomic_inc(&inode->i_writecount); - vma->vm_file = NULL; - fput(file); - /* Undo any partial mapping done by a device driver. */ + /* + * Undo any partial mapping done by a device driver. 
+ * hugetlb wants to know the vma's file etc. so nuke + * the file afterward. + */ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); + + /* + * vma->vm_file may be different from file in the hugetlb case. + */ + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = NULL; + free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: diff -upN reference/mm/shmem.c current/mm/shmem.c --- reference/mm/shmem.c 2004-04-07 14:54:38.000000000 -0700 +++ current/mm/shmem.c 2004-04-29 10:39:27.000000000 -0700 @@ -40,6 +40,29 @@ #include #include +int shm_use_hugepages; + +/* + * On 64bit archs the vmalloc area is very large, + * so we allocate the array in vmalloc on 64bit archs. + * + * Assuming 2M pages (x86 and x86-64) these default settings + * will allow up to 128G of bigpages in a single file on + * 64bit archs and 64G on 32bit archs using the max + * kmalloc size of 128k. So tweaking in practice is needed + * only to go past 128G of bigpages per file on 64bit archs. + * + * This sysctl is in page units (each page large BIGPAGE_SIZE). + */ +#ifdef CONFIG_HUGETLBFS +#if BITS_PER_LONG == 64 +int shm_hugepages_per_file = 128UL << (30 - HPAGE_SHIFT); +#else +int shm_hugepages_per_file = 131072 / sizeof(struct page *); +#endif +#endif + + /* This magic number is used in glibc for posix shared memory */ #define TMPFS_MAGIC 0x01021994