5 changes: 4 additions & 1 deletion include/linux/hugetlb.h
@@ -255,6 +255,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,

bool is_hugetlb_entry_migration(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);

#else /* !CONFIG_HUGETLB_PAGE */

@@ -464,6 +465,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,

static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}

#endif /* !CONFIG_HUGETLB_PAGE */
/*
* hugepages at page global directory. If arch support
@@ -1229,7 +1232,7 @@ static inline __init void hugetlb_cma_reserve(int order)
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
return page_count(virt_to_page(pte)) > 1;
return atomic_read(&virt_to_page(pte)->pt_share_count);
}
#else
static inline bool hugetlb_pmd_shared(pte_t *pte)
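Note on the hugetlb_pmd_shared() change above: with page_count(), the page-table page always carries its own allocation reference, so "shared" had to mean "count greater than one"; the dedicated pt_share_count starts at zero and counts only additional sharers, so "shared" simply means non-zero, and unrelated transient references to the page no longer look like sharing. A minimal userspace sketch of the two predicates (illustrative only, not kernel code; the struct and function names are hypothetical):

```c
/*
 * Userspace model of the sharing check, before and after this change
 * (illustrative only; struct/function names are hypothetical).
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pt_page_model {
	atomic_int refcount;       /* models page_count(): 1 == allocation */
	atomic_int pt_share_count; /* models the new field: 0 == unshared  */
};

static bool shared_by_refcount(struct pt_page_model *p)
{
	return atomic_load(&p->refcount) > 1;
}

static bool shared_by_share_count(struct pt_page_model *p)
{
	return atomic_load(&p->pt_share_count) != 0;
}

int main(void)
{
	struct pt_page_model p;

	atomic_init(&p.refcount, 1);       /* allocation reference only */
	atomic_init(&p.pt_share_count, 0); /* no sharers yet */

	/* A second mapping starts sharing the PMD page table. */
	atomic_fetch_add(&p.refcount, 1);
	atomic_fetch_add(&p.pt_share_count, 1);

	printf("page_count heuristic says shared: %d\n", shared_by_refcount(&p));
	printf("dedicated counter says shared:    %d\n", shared_by_share_count(&p));
	return 0;
}
```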
3 changes: 3 additions & 0 deletions include/linux/mm.h
@@ -2539,6 +2539,9 @@ static inline bool pgtable_pmd_page_ctor(struct page *page)
if (!pmd_ptlock_init(page))
return false;
__SetPageTable(page);
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
atomic_set(&page->pt_share_count, 0);
#endif
inc_lruvec_page_state(page, NR_PAGETABLE);
return true;
}
3 changes: 3 additions & 0 deletions include/linux/mm_types.h
@@ -173,6 +173,9 @@ struct page {
union {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
RH_KABI_BROKEN_INSERT(atomic_t pt_share_count)
#endif
};
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
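Taken together, the header hunks above give the PMD page-table page a dedicated share counter: pgtable_pmd_page_ctor() starts it at zero, huge_pmd_share() increments it for each additional sharer (and decrements again if it loses the race to install the shared table), and huge_pmd_unshare() decrements it when a sharer detaches. A compact userspace model of that lifecycle (illustrative sketch only; the names pmd_table_model, pmd_table_ctor, etc. are hypothetical):

```c
/*
 * Userspace model of the pt_share_count lifecycle as the patch uses it:
 * zero at page-table construction, +1 per additional sharer, -1 when a
 * sharer detaches.
 */
#include <assert.h>
#include <stdatomic.h>

struct pmd_table_model {
	atomic_int pt_share_count;
};

static void pmd_table_ctor(struct pmd_table_model *t)
{
	atomic_init(&t->pt_share_count, 0);      /* pgtable_pmd_page_ctor() */
}

static void pmd_table_share(struct pmd_table_model *t)
{
	atomic_fetch_add(&t->pt_share_count, 1); /* huge_pmd_share() */
}

/* Returns 1 if a shared mapping was dropped, 0 if the table was not shared. */
static int pmd_table_unshare(struct pmd_table_model *t)
{
	if (!atomic_load(&t->pt_share_count))
		return 0;
	atomic_fetch_sub(&t->pt_share_count, 1); /* huge_pmd_unshare() */
	return 1;
}

int main(void)
{
	struct pmd_table_model t;

	pmd_table_ctor(&t);
	pmd_table_share(&t);                /* second VMA shares the table */
	assert(pmd_table_unshare(&t) == 1); /* detach the sharer */
	assert(pmd_table_unshare(&t) == 0); /* no longer shared: no-op */
	return 0;
}
```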
136 changes: 91 additions & 45 deletions mm/hugetlb.c
@@ -94,6 +94,8 @@ static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end, bool take_locks);

static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -4834,6 +4836,39 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
return 0;
}

void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
* This function is called in the middle of a VMA split operation, with
* MM, VMA and rmap all write-locked to prevent concurrent page table
* walks (except hardware and gup_fast()).
*/
mmap_assert_write_locked(vma->vm_mm);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);

if (addr & ~PUD_MASK) {
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;

if (floor >= vma->vm_start && ceil <= vma->vm_end) {
/*
* Locking:
* Use take_locks=false here.
* The file rmap lock is already held.
* The hugetlb VMA lock can't be taken when we already
* hold the file rmap lock, and we don't need it because
* its purpose is to synchronize against concurrent page
* table walks, which are not possible thanks to the
* locks held by our caller.
*/
hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
}
}
}

static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
return huge_page_size(hstate_vma(vma));
@@ -4978,18 +5013,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}

/*
* If the pagetables are shared don't copy or take references.
*
* dst_pte == src_pte is the common case of src/dest sharing.
* However, src could have 'unshared' and dst shares with
* another vma. So page_count of ptep page is checked instead
* to reliably determine whether pte is shared.
*/
if (page_count(virt_to_page(dst_pte)) > 1) {
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
/* If the pagetables are shared, there is nothing to do */
if (atomic_read(&virt_to_page(dst_pte)->pt_share_count)) {
addr |= last_addr_mask;
continue;
}
#endif

dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -5330,17 +5360,10 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_end_vma(tlb, vma);

/*
* If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
* could defer the flush until now, since by holding i_mmap_rwsem we
* guaranteed that the last reference would not be dropped. But we must
* do the flushing before we return, as otherwise i_mmap_rwsem will be
* dropped and the last reference to the shared PMDs page might be
* dropped as well.
*
* In theory we could defer the freeing of the PMD pages as well, but
* huge_pmd_unshare() relies on the exact page_count for the PMD page to
* detect sharing, so we cannot defer the release of the page either.
* Instead, do flush now.
* There is nothing protecting a previously-shared page table that we
* unshared through huge_pmd_unshare() from getting freed after we
* release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
* succeeded, flush the range corresponding to the pud.
*/
if (force_flush)
tlb_flush_mmu_tlbonly(tlb);
@@ -6726,11 +6749,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
spin_unlock(ptl);
}
/*
* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
* once we release i_mmap_rwsem, another task can do the final put_page
* and that page table be reused and filled with junk. If we actually
* did unshare a page of pmds, flush the range corresponding to the pud.
* There is nothing protecting a previously-shared page table that we
* unshared through huge_pmd_unshare() from getting freed after we
* release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
* succeeded, flush the range corresponding to the pud.
*/
if (shared_pmd)
flush_hugetlb_tlb_range(vma, range.start, range.end);
@@ -7058,7 +7080,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
spte = hugetlb_walk(svma, saddr,
vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
atomic_inc(&virt_to_page(spte)->pt_share_count);
break;
}
}
@@ -7073,7 +7095,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
mm_inc_nr_pmds(mm);
} else {
put_page(virt_to_page(spte));
atomic_dec(&virt_to_page(spte)->pt_share_count);
}
spin_unlock(ptl);
out:
@@ -7085,10 +7107,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* unmap huge page backed by shared pte.
*
* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
* Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
@@ -7097,18 +7115,27 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);

i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
if (sz != PMD_SIZE)
return 0;
if (!atomic_read(&virt_to_page(ptep)->pt_share_count))
return 0;

pud_clear(pud);
put_page(virt_to_page(ptep));
/*
* Once our caller drops the rmap lock, some other process might be
* using this page table as a normal, non-hugetlb page table.
* Wait for pending gup_fast() in other threads to finish before letting
* that happen.
*/
tlb_remove_table_sync_one();
atomic_dec(&virt_to_page(ptep)->pt_share_count);
mm_dec_nr_pmds(mm);
return 1;
}
@@ -7340,25 +7367,27 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}

/*
* This function will unconditionally remove all the shared pmd pgtable entries
* within the specific vma for a hugetlbfs memory range.
* If @take_locks is false, the caller must ensure that no concurrent page table
* access can happen (except for gup_fast() and hardware page walks).
* If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
* concurrent page fault handling) and the file rmap lock.
*/
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
unsigned long address, start, end;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;

if (!(vma->vm_flags & VM_MAYSHARE))
return;

start = ALIGN(vma->vm_start, PUD_SIZE);
end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);

if (start >= end)
return;

@@ -7370,8 +7399,12 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
if (take_locks) {
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
} else {
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
}
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7381,15 +7414,28 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
}
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
}

/*
* This function will unconditionally remove all the shared pmd pgtable entries
* within the specific vma for a hugetlbfs memory range.
*/
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
ALIGN_DOWN(vma->vm_end, PUD_SIZE),
/* take_locks = */ true);
}

#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;

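The new hugetlb_split() above only acts when the split address breaks PUD_SIZE alignment, and then unshares just the PUD-sized interval surrounding that address, provided the interval lies fully inside the VMA. A small userspace sketch of that range selection (assumptions: a 1 GiB PUD_SIZE and made-up helper names; not kernel code):

```c
/*
 * Userspace sketch of the range hugetlb_split() chooses to unshare when
 * a VMA is split at an address that is not PUD_SIZE-aligned.
 */
#include <stdbool.h>
#include <stdio.h>

#define PUD_SHIFT 30
#define PUD_SIZE  (1UL << PUD_SHIFT)
#define PUD_MASK  (~(PUD_SIZE - 1))

static bool split_needs_unshare(unsigned long vm_start, unsigned long vm_end,
				unsigned long addr,
				unsigned long *floor, unsigned long *ceil)
{
	if (!(addr & ~PUD_MASK))  /* PUD-aligned split: sharing can survive */
		return false;

	*floor = addr & PUD_MASK;
	*ceil = *floor + PUD_SIZE;
	/* Only unshare if the surrounding PUD interval lies inside the VMA. */
	return *floor >= vm_start && *ceil <= vm_end;
}

int main(void)
{
	/* Hypothetical VMA [4 GiB, 8 GiB) split at 5 GiB + 2 MiB. */
	unsigned long vm_start = 4UL << 30, vm_end = 8UL << 30;
	unsigned long addr = (5UL << 30) + (2UL << 20);
	unsigned long floor, ceil;

	if (split_needs_unshare(vm_start, vm_end, addr, &floor, &ceil))
		printf("unshare PMDs in [%#lx, %#lx)\n", floor, ceil);
	return 0;
}
```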
8 changes: 8 additions & 0 deletions mm/mmap.c
@@ -815,7 +815,15 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
}
}
again:
/*
* Get rid of huge pages and shared page tables straddling the split
* boundary.
*/
vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
if (is_vm_hugetlb_page(orig_vma)) {
hugetlb_split(orig_vma, start);
hugetlb_split(orig_vma, end);
}

if (file) {
mapping = file->f_mapping;
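In the __vma_adjust() hunk above, hugetlb_split() is called for both 'start' and 'end' because either edge of the adjusted range can fall mid-way through a shared PUD-sized region; each call is a no-op when its address is already PUD-aligned. A tiny sketch of that per-boundary check (illustrative only; the helper name is hypothetical and a 1 GiB PUD_SIZE is assumed):

```c
/*
 * Userspace sketch: a VMA adjustment has two edges, and each one may
 * independently break PUD_SIZE alignment.
 */
#include <stdio.h>

#define PUD_SIZE (1UL << 30)
#define PUD_MASK (~(PUD_SIZE - 1))

static int breaks_pud_alignment(unsigned long addr)
{
	return (addr & ~PUD_MASK) != 0;
}

int main(void)
{
	/* Hypothetical split: start is mid-PUD, end is PUD-aligned. */
	unsigned long start = (6UL << 30) + (4UL << 20);
	unsigned long end = 7UL << 30;

	printf("start %#lx -> %s\n", start,
	       breaks_pud_alignment(start) ? "unshare around start" : "no-op");
	printf("end   %#lx -> %s\n", end,
	       breaks_pud_alignment(end) ? "unshare around end" : "no-op");
	return 0;
}
```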
20 changes: 4 additions & 16 deletions mm/rmap.c
@@ -1556,14 +1556,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
flush_tlb_range(vma,
range.start, range.end);
/*
* The ref count of the PMD page was
* dropped which is part of the way map
* counting is done for shared PMDs.
* Return 'true' here. When there is
* no other sharing, huge_pmd_unshare
* returns false and we will unmap the
* actual page and drop map count
* to zero.
* The PMD table was unmapped,
* consequently unmapping the folio.
*/
page_vma_mapped_walk_done(&pvmw);
break;
@@ -1926,14 +1920,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
range.start, range.end);

/*
* The ref count of the PMD page was
* dropped which is part of the way map
* counting is done for shared PMDs.
* Return 'true' here. When there is
* no other sharing, huge_pmd_unshare
* returns false and we will unmap the
* actual page and drop map count
* to zero.
* The PMD table was unmapped,
* consequently unmapping the folio.
*/
page_vma_mapped_walk_done(&pvmw);
break;