From 4fd017708c4a067da51a2b5cf8aedddf4e840b1f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 12 Oct 2011 21:06:51 +0200 Subject: mm: Check if PTE is already allocated during page fault With transparent hugepage support, handle_mm_fault() has to be careful that a normal PMD has been established before handling a PTE fault. To achieve this, it used __pte_alloc() directly instead of pte_alloc_map as pte_alloc_map is unsafe to run against a huge PMD. pte_offset_map() is called once it is known the PMD is safe. pte_alloc_map() is smart enough to check if a PTE is already present before calling __pte_alloc but this check was lost. As a consequence, PTEs may be allocated unnecessarily and the page table lock taken. Thi useless PTE does get cleaned up but it's a performance hit which is visible in page_test from aim9. This patch simply re-adds the check normally done by pte_alloc_map to check if the PTE needs to be allocated before taking the page table lock. The effect is noticable in page_test from aim9. AIM9 2.6.38-vanilla 2.6.38-checkptenone creat-clo 446.10 ( 0.00%) 424.47 (-5.10%) page_test 38.10 ( 0.00%) 42.04 ( 9.37%) brk_test 52.45 ( 0.00%) 51.57 (-1.71%) exec_test 382.00 ( 0.00%) 456.90 (16.39%) fork_test 60.11 ( 0.00%) 67.79 (11.34%) MMTests Statistics: duration Total Elapsed Time (seconds) 611.90 612.22 (While this affects 2.6.38, it is a performance rather than a functional bug and normally outside the rules -stable. While the big performance differences are to a microbench, the difference in fork and exec performance may be significant enough that -stable wants to consider the patch) Reported-by: Raz Ben Yehuda Signed-off-by: Mel Gorman Signed-off-by: Andrea Arcangeli Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Rik van Riel [ Picked this up from the AutoNUMA tree to help it upstream and to allow apples-to-apples performance comparisons. ] Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40f17c34b41..35c66a269bc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -710,7 +710,8 @@ out: * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) -- cgit v1.2.3 From 1ba6e0b50b479cbadb8f05ebde3020da9ac87201 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 4 Oct 2012 01:51:06 +0200 Subject: mm: numa: split_huge_page: transfer the NUMA type from the pmd to the pte When we split a transparent hugepage, transfer the NUMA type from the pmd to the pte if needed. Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/huge_memory.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 35c66a269bc..cd24aa56214 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1364,6 +1364,8 @@ static int __split_huge_page_map(struct page *page, BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); + if (pmd_numa(*pmd)) + entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); -- cgit v1.2.3 From d10e63f29488b0f312a443f9507ea9b6fd3c9090 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:31 +0200 Subject: mm: numa: Create basic numa page hinting infrastructure Note: This patch started as "mm/mpol: Create special PROT_NONE infrastructure" and preserves the basic idea but steals *very* heavily from "autonuma: numa hinting page faults entry points" for the actual fault handlers without the migration parts. The end result is barely recognisable as either patch so all Signed-off and Reviewed-bys are dropped. If Peter, Ingo and Andrea are ok with this version, I will re-add the signed-offs-by to reflect the history. In order to facilitate a lazy -- fault driven -- migration of pages, create a special transient PAGE_NUMA variant, we can then use the 'spurious' protection faults to drive our migrations from. The meaning of PAGE_NUMA depends on the architecture but on x86 it is effectively PROT_NONE. Actual PROT_NONE mappings will not generate these NUMA faults for the reason that the page fault code checks the permission on the VMA (and will throw a segmentation fault on actual PROT_NONE mappings), before it ever calls handle_mm_fault. [dhillf@gmail.com: Fix typo] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/huge_memory.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cd24aa56214..f5f37630c54 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1018,6 +1018,28 @@ out: return page; } +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, + pmd_t pmd, pmd_t *pmdp) +{ + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + page = pmd_page(pmd); + pmd = pmd_mknonnuma(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); + VM_BUG_ON(pmd_numa(*pmdp)); + update_mmu_cache_pmd(vma, addr, pmdp); + +out_unlock: + spin_unlock(&mm->page_table_lock); + return 0; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { -- cgit v1.2.3 From 4daae3b4b9e49b7e0935499a352f1c59d90287d2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 11:33:45 +0000 Subject: mm: mempolicy: Use _PAGE_NUMA to migrate pages Note: Based on "mm/mpol: Use special PROT_NONE to migrate pages" but sufficiently different that the signed-off-bys were dropped Combine our previous _PAGE_NUMA, mpol_misplaced and migrate_misplaced_page() pieces into an effective migrate on fault scheme. Note that (on x86) we rely on PROT_NONE pages being !present and avoid the TLB flush from try_to_unmap(TTU_MIGRATION). This greatly improves the page-migration performance. Based-on-work-by: Peter Zijlstra Signed-off-by: Mel Gorman --- mm/huge_memory.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f5f37630c54..5723b551c02 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1019,17 +1020,39 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, - pmd_t pmd, pmd_t *pmdp) +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) { - struct page *page; + struct page *page = NULL; unsigned long haddr = addr & HPAGE_PMD_MASK; + int target_nid; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); + get_page(page); + spin_unlock(&mm->page_table_lock); + + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) + goto clear_pmdnuma; + + /* + * Due to lacking code to migrate thp pages, we'll split + * (which preserves the special PROT_NONE) and re-take the + * fault on the normal pages. + */ + split_huge_page(page); + put_page(page); + return 0; + +clear_pmdnuma: + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); @@ -1037,6 +1060,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, out_unlock: spin_unlock(&mm->page_table_lock); + if (page) + put_page(page); return 0; } -- cgit v1.2.3 From 4b10e7d562c90d0a72f324832c26653947a07381 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:32 +0200 Subject: mm: mempolicy: Implement change_prot_numa() in terms of change_protection() This patch converts change_prot_numa() to use change_protection(). As pte_numa and friends check the PTE bits directly it is necessary for change_protection() to use pmd_mknuma(). Hence the required modifications to change_protection() are a little clumsy but the end result is that most of the numa page table helpers are just one or two instructions. Signed-off-by: Mel Gorman --- mm/huge_memory.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5723b551c02..d79f7a55bf6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1147,7 +1147,7 @@ out: } int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; int ret = 0; @@ -1155,7 +1155,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - entry = pmd_modify(entry, newprot); + if (!prot_numa) + entry = pmd_modify(entry, newprot); + else { + struct page *page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) == 1 && + !pmd_numa(*pmd)) { + entry = pmd_mknuma(entry); + } + } set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); ret = 1; -- cgit v1.2.3 From cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:43 +0200 Subject: mm: numa: Add fault driven placement and migration NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and migration policy" but as it throws away all the policy to just leave a basic foundation I had to drop the signed-offs-by. This patch creates a bare-bones method for setting PTEs pte_numa in the context of the scheduler that when faulted later will be faulted onto the node the CPU is running on. In itself this does nothing useful but any placement policy will fundamentally depend on receiving hints on placement from fault context and doing something intelligent about it. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- mm/huge_memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d79f7a55bf6..ee8133794a5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, */ split_huge_page(page); put_page(page); + return 0; clear_pmdnuma: @@ -1060,8 +1061,10 @@ clear_pmdnuma: out_unlock: spin_unlock(&mm->page_table_lock); - if (page) + if (page) { put_page(page); + task_numa_fault(numa_node_id(), HPAGE_PMD_NR); + } return 0; } -- cgit v1.2.3 From 03c5a6e16322c997bf8f264851bfa3f532ad515f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 14:52:48 +0000 Subject: mm: numa: Add pte updates, hinting and migration stats It is tricky to quantify the basic cost of automatic NUMA placement in a meaningful manner. This patch adds some vmstats that can be used as part of a basic costing model. u = basic unit = sizeof(void *) Ca = cost of struct page access = sizeof(struct page) / u Cpte = Cost PTE access = Ca Cupdate = Cost PTE update = (2 * Cpte) + (2 * Wlock) where Cpte is incurred twice for a read and a write and Wlock is a constant representing the cost of taking or releasing a lock Cnumahint = Cost of a minor page fault = some high constant e.g. 1000 Cpagerw = Cost to read or write a full page = Ca + PAGE_SIZE/u Ci = Cost of page isolation = Ca + Wi where Wi is a constant that should reflect the approximate cost of the locking operation Cpagecopy = Cpagerw + (Cpagerw * Wnuma) + Ci + (Ci * Wnuma) where Wnuma is the approximate NUMA factor. 1 is local. 1.2 would imply that remote accesses are 20% more expensive Balancing cost = Cpte * numa_pte_updates + Cnumahint * numa_hint_faults + Ci * numa_pages_migrated + Cpagecopy * numa_pages_migrated Note that numa_pages_migrated is used as a measure of how many pages were isolated even though it would miss pages that failed to migrate. A vmstat counter could have been added for it but the isolation cost is pretty marginal in comparison to the overall cost so it seemed overkill. The ideal way to measure automatic placement benefit would be to count the number of remote accesses versus local accesses and do something like benefit = (remote_accesses_before - remove_access_after) * Wnuma but the information is not readily available. As a workload converges, the expection would be that the number of remote numa hints would reduce to 0. convergence = numa_hint_faults_local / numa_hint_faults where this is measured for the last N number of numa hints recorded. When the workload is fully converged the value is 1. This can measure if the placement policy is converging and how fast it is doing it. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- mm/huge_memory.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ee8133794a5..f3a477fffd0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1026,6 +1026,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; + int current_nid = -1; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1034,6 +1035,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); get_page(page); spin_unlock(&mm->page_table_lock); + current_nid = page_to_nid(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, haddr); if (target_nid == -1) -- cgit v1.2.3 From 5aa80374a10567f8e25de0615d3d40f3aa3a4298 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 27 Nov 2012 14:40:32 +0000 Subject: mm: numa: split_huge_page: Transfer last_nid on tail page Pass last_nid from head page to tail page. Signed-off-by: Hillf Danton Signed-off-by: Mel Gorman --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f3a477fffd0..79b96064f8f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1362,6 +1362,7 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; + page_xchg_last_nid(page_tail, page_last_nid(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); -- cgit v1.2.3 From b8593bfda1652755136333cdd362de125b283a9c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Nov 2012 01:18:23 +0000 Subject: mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate The PTE scanning rate and fault rates are two of the biggest sources of system CPU overhead with automatic NUMA placement. Ideally a proper policy would detect if a workload was properly placed, schedule and adjust the PTE scanning rate accordingly. We do not track the necessary information to do that but we at least know if we migrated or not. This patch scans slower if a page was not migrated as the result of a NUMA hinting fault up to sysctl_numa_balancing_scan_period_max which is now higher than the previous default. Once every minute it will reset the scanner in case of phase changes. This is hilariously crude and the numbers are arbitrary. Workloads will converge quite slowly in comparison to what a proper policy should be able to do. On the plus side, we will chew up less CPU for workloads that have no need for automatic balancing. Signed-off-by: Mel Gorman --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 79b96064f8f..199b261a257 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1068,7 +1068,7 @@ out_unlock: spin_unlock(&mm->page_table_lock); if (page) { put_page(page); - task_numa_fault(numa_node_id(), HPAGE_PMD_NR); + task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false); } return 0; } -- cgit v1.2.3 From b32967ff101a7508f70be8de59b278d4df92fa00 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 19 Nov 2012 12:35:47 +0000 Subject: mm: numa: Add THP migration for the NUMA working set scanning fault case. Note: This is very heavily based on a patch from Peter Zijlstra with fixes from Ingo Molnar, Hugh Dickins and Johannes Weiner. That patch put a lot of migration logic into mm/huge_memory.c where it does not belong. This version puts tries to share some of the migration logic with migrate_misplaced_page. However, it should be noted that now migrate.c is doing more with the pagetable manipulation than is preferred. The end result is barely recognisable so as before, the signed-offs had to be removed but will be re-added if the original authors are ok with it. Add THP migration for the NUMA working set scanning fault case. It uses the page lock to serialize. No migration pte dance is necessary because the pte is already unmapped when we decide to migrate. [dhillf@gmail.com: Fix memory leak on isolation failure] [dhillf@gmail.com: Fix transfer of last_nid information] Signed-off-by: Mel Gorman --- mm/huge_memory.c | 59 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 19 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 199b261a257..711baf84b15 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -600,7 +600,7 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); @@ -1023,10 +1023,12 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { - struct page *page = NULL; + struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; int current_nid = -1; + bool migrated; + bool page_locked = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1034,42 +1036,61 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); get_page(page); - spin_unlock(&mm->page_table_lock); current_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (current_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) + if (target_nid == -1) { + put_page(page); goto clear_pmdnuma; + } - /* - * Due to lacking code to migrate thp pages, we'll split - * (which preserves the special PROT_NONE) and re-take the - * fault on the normal pages. - */ - split_huge_page(page); - put_page(page); - - return 0; + /* Acquire the page lock to serialise THP migrations */ + spin_unlock(&mm->page_table_lock); + lock_page(page); + page_locked = true; -clear_pmdnuma: + /* Confirm the PTE did not while locked */ spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); goto out_unlock; + } + spin_unlock(&mm->page_table_lock); + + /* Migrate the THP to the requested node */ + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, + page, target_nid); + if (migrated) + current_nid = target_nid; + else { + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + goto out_unlock; + } + goto clear_pmdnuma; + } + + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + return 0; +clear_pmdnuma: pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); + if (page_locked) + unlock_page(page); out_unlock: spin_unlock(&mm->page_table_lock); - if (page) { - put_page(page); - task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false); - } + if (current_nid != -1) + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); return 0; } -- cgit v1.2.3 From 5a505085f043e8380f83610f79642853c051e2f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Dec 2012 19:56:46 +0000 Subject: mm/rmap: Convert the struct anon_vma::mutex to an rwsem Convert the struct anon_vma::mutex to an rwsem, which will help in solving a page-migration scalability problem. (Addressed in a separate patch.) The conversion is simple and straightforward: in every case where we mutex_lock()ed we'll now down_write(). Suggested-by: Linus Torvalds Reviewed-by: Rik van Riel Cc: Peter Zijlstra Cc: Paul Turner Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 711baf84b15..acd37fe55eb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1292,7 +1292,7 @@ static int __split_huge_page_splitting(struct page *page, * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->mutex to + * and it won't wait on the anon_vma->root->rwsem to * serialize against split_huge_page*. */ pmdp_splitting_flush(vma, address, pmd); @@ -1495,7 +1495,7 @@ static int __split_huge_page_map(struct page *page, return ret; } -/* must be called with anon_vma->root->mutex hold */ +/* must be called with anon_vma->root->rwsem held */ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { -- cgit v1.2.3 From 4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Dec 2012 19:56:50 +0000 Subject: mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable rmap_walk_anon() and try_to_unmap_anon() appears to be too careful about locking the anon vma: while it needs protection against anon vma list modifications, it does not need exclusive access to the list itself. Transforming this exclusive lock to a read-locked rwsem removes a global lock from the hot path of page-migration intense threaded workloads which can cause pathological performance like this: 96.43% process 0 [kernel.kallsyms] [k] perf_trace_sched_switch | --- perf_trace_sched_switch __schedule schedule schedule_preempt_disabled __mutex_lock_common.isra.6 __mutex_lock_slowpath mutex_lock | |--50.61%-- rmap_walk | move_to_new_page | migrate_pages | migrate_misplaced_page | __do_numa_page.isra.69 | handle_pte_fault | handle_mm_fault | __do_page_fault | do_page_fault | page_fault | __memset_sse2 | | | --100.00%-- worker_thread | | | --100.00%-- start_thread | --49.39%-- page_lock_anon_vma try_to_unmap_anon try_to_unmap migrate_pages migrate_misplaced_page __do_numa_page.isra.69 handle_pte_fault handle_mm_fault __do_page_fault do_page_fault page_fault __memset_sse2 | --100.00%-- worker_thread start_thread With this change applied the profile is now nicely flat and there's no anon-vma related scheduling/blocking. Rename anon_vma_[un]lock() => anon_vma_[un]lock_write(), to make it clearer that it's an exclusive write-lock in that case - suggested by Rik van Riel. Suggested-by: Linus Torvalds Cc: Peter Zijlstra Cc: Paul Turner Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Rik van Riel Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index acd37fe55eb..a24c9cb9c83 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1549,7 +1549,7 @@ int split_huge_page(struct page *page) int ret = 1; BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) goto out; ret = 0; @@ -1562,7 +1562,7 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); out: return ret; } @@ -2074,7 +2074,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; - anon_vma_lock(vma->anon_vma); + anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); -- cgit v1.2.3