// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_pinner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/sched/clock.h>
#include <linux/math64.h>

#include "internal.h"

/* Maximum number of stack frames captured by save_stack(). */
#define PAGE_PINNER_STACK_DEPTH 16
/* Number of slots in each struct longterm_pinner capture ring. */
#define LONGTERM_PIN_BUCKETS 4096

struct page_pinner {
|
|
depot_stack_handle_t handle;
|
|
s64 ts_usec;
|
|
atomic_t count;
|
|
};
|
|
|
|
struct captured_pinner {
|
|
depot_stack_handle_t handle;
|
|
union {
|
|
s64 ts_usec;
|
|
s64 elapsed;
|
|
};
|
|
|
|
/* struct page fields */
|
|
unsigned long pfn;
|
|
int count;
|
|
int mapcount;
|
|
struct address_space *mapping;
|
|
unsigned long flags;
|
|
};
|
|
|
|
struct longterm_pinner {
|
|
spinlock_t lock;
|
|
unsigned int index;
|
|
struct captured_pinner pinner[LONGTERM_PIN_BUCKETS];
|
|
};
|
|
|
|
static struct longterm_pinner lt_pinner = {
|
|
.lock = __SPIN_LOCK_UNLOCKED(lt_pinner.lock),
|
|
};
|
|
|
|
static s64 threshold_usec = 300000;
|
|
|
|
/* alloc_contig failed pinner */
|
|
static struct longterm_pinner acf_pinner = {
|
|
.lock = __SPIN_LOCK_UNLOCKED(acf_pinner.lock),
|
|
};
|
|
|
|
static bool page_pinner_enabled;
|
|
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);
|
|
|
|
DEFINE_STATIC_KEY_TRUE(failure_tracking);
|
|
EXPORT_SYMBOL_GPL(failure_tracking);
|
|
|
|
static depot_stack_handle_t failure_handle;
|
|
|
|
static int __init early_page_pinner_param(char *buf)
|
|
{
|
|
page_pinner_enabled = true;
|
|
return 0;
|
|
}
|
|
early_param("page_pinner", early_page_pinner_param);
|
|
|
|
static bool need_page_pinner(void)
|
|
{
|
|
return page_pinner_enabled;
|
|
}
|
|
|
|
static noinline void register_failure_stack(void)
|
|
{
|
|
unsigned long entries[4];
|
|
unsigned int nr_entries;
|
|
|
|
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
|
|
failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
|
|
}
|
|
|
|
static void init_page_pinner(void)
|
|
{
|
|
if (!page_pinner_enabled)
|
|
return;
|
|
|
|
register_failure_stack();
|
|
static_branch_enable(&page_pinner_inited);
|
|
}
|
|
|
|
struct page_ext_operations page_pinner_ops = {
|
|
.size = sizeof(struct page_pinner),
|
|
.need = need_page_pinner,
|
|
.init = init_page_pinner,
|
|
};
|
|
|
|
static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
|
|
{
|
|
return (void *)page_ext + page_pinner_ops.offset;
|
|
}
|
|
|
|
/*
 * Capture the current call stack (at most PAGE_PINNER_STACK_DEPTH frames,
 * with a skip count of 2 passed to stack_trace_save()) and store it in the
 * stack depot using @flags for any allocation.
 *
 * Returns the depot handle, or failure_handle when the depot returned 0.
 */
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}

static void capture_page_state(struct page *page,
|
|
struct captured_pinner *record)
|
|
{
|
|
record->flags = page->flags;
|
|
record->mapping = page_mapping(page);
|
|
record->pfn = page_to_pfn(page);
|
|
record->count = page_count(page);
|
|
record->mapcount = page_mapcount(page);
|
|
}
|
|
|
|
static void check_longterm_pin(struct page_pinner *page_pinner,
|
|
struct page *page)
|
|
{
|
|
s64 now, delta = 0;
|
|
unsigned long flags;
|
|
unsigned int idx;
|
|
struct captured_pinner record;
|
|
|
|
now = ktime_to_us(ktime_get_boottime());
|
|
|
|
/* get/put_page can be raced. Ignore that case */
|
|
if (page_pinner->ts_usec < now)
|
|
delta = now - page_pinner->ts_usec;
|
|
|
|
if (delta <= threshold_usec)
|
|
return;
|
|
|
|
record.handle = page_pinner->handle;
|
|
record.elapsed = delta;
|
|
capture_page_state(page, &record);
|
|
|
|
spin_lock_irqsave(<_pinner.lock, flags);
|
|
idx = lt_pinner.index++;
|
|
lt_pinner.index %= LONGTERM_PIN_BUCKETS;
|
|
lt_pinner.pinner[idx] = record;
|
|
spin_unlock_irqrestore(<_pinner.lock, flags);
|
|
}
|
|
|
|
/*
 * Reset the pin tracking state of the 1 << @order pages starting at @page.
 *
 * Pages with neither PAGE_EXT_GET nor PAGE_EXT_PINNER_MIGRATION_FAILED set
 * are skipped. For the others:
 *  - @free == true:  record the free call path (only logged when the
 *    migration-failed bit is set), zero the pin count and clear the
 *    migration-failed bit;
 *  - @free == false: log the page if it exceeded the long-term threshold.
 * PAGE_EXT_GET is cleared in both cases.
 */
void __reset_page_pinner(struct page *page, unsigned int order, bool free)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	page_ext = lookup_page_ext(page);
	if (unlikely(!page_ext))
		return;

	/*
	 * Advance page_ext in the loop header so that the "continue" below
	 * cannot skip the step and leave the loop re-testing the same entry.
	 */
	for (i = 0; i < (1 << order);
	     i++, page_ext = page_ext_next(page_ext)) {
		/* the struct page that corresponds to the current page_ext */
		struct page *subpage = page + i;

		if (!test_bit(PAGE_EXT_GET, &page_ext->flags) &&
		    !test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED,
			      &page_ext->flags))
			continue;

		page_pinner = get_page_pinner(page_ext);
		if (free) {
			/* record page free call path */
			__page_pinner_migration_failed(subpage);
			atomic_set(&page_pinner->count, 0);
			__clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
		} else {
			check_longterm_pin(page_pinner, subpage);
		}
		clear_bit(PAGE_EXT_GET, &page_ext->flags);
	}
}

/*
 * Stamp @handle and the current boot-time timestamp into the page_pinner
 * state of 1 << @order consecutive page_ext entries starting at @page_ext,
 * set PAGE_EXT_GET on each and bump each pin count.
 *
 * @page is not used by this function.
 */
static inline void __set_page_pinner_handle(struct page *page,
	struct page_ext *page_ext, depot_stack_handle_t handle,
	unsigned int order)
{
	struct page_pinner *page_pinner;
	int i;
	s64 usec = ktime_to_us(ktime_get_boottime());

	for (i = 0; i < (1 << order); i++) {
		page_pinner = get_page_pinner(page_ext);
		page_pinner->handle = handle;
		page_pinner->ts_usec = usec;
		set_bit(PAGE_EXT_GET, &page_ext->flags);
		atomic_inc(&page_pinner->count);
		page_ext = page_ext_next(page_ext);
	}
}

noinline void __set_page_pinner(struct page *page, unsigned int order)
|
|
{
|
|
struct page_ext *page_ext = lookup_page_ext(page);
|
|
depot_stack_handle_t handle;
|
|
|
|
if (unlikely(!page_ext))
|
|
return;
|
|
|
|
handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
|
|
__set_page_pinner_handle(page, page_ext, handle, order);
|
|
}
|
|
|
|
static ssize_t
|
|
print_page_pinner(bool longterm, char __user *buf, size_t count, struct captured_pinner *record)
|
|
{
|
|
int ret;
|
|
unsigned long *entries;
|
|
unsigned int nr_entries;
|
|
char *kbuf;
|
|
|
|
count = min_t(size_t, count, PAGE_SIZE);
|
|
kbuf = kmalloc(count, GFP_KERNEL);
|
|
if (!kbuf)
|
|
return -ENOMEM;
|
|
|
|
if (longterm) {
|
|
ret = snprintf(kbuf, count, "Page pinned for %lld us\n",
|
|
record->elapsed);
|
|
} else {
|
|
s64 ts_usec = record->ts_usec;
|
|
unsigned long rem_usec = do_div(ts_usec, 1000000);
|
|
|
|
ret = snprintf(kbuf, count,
|
|
"Page pinned ts [%5lu.%06lu]\n",
|
|
(unsigned long)ts_usec, rem_usec);
|
|
}
|
|
|
|
if (ret >= count)
|
|
goto err;
|
|
|
|
/* Print information relevant to grouping pages by mobility */
|
|
ret += snprintf(kbuf + ret, count - ret,
|
|
"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
|
|
record->pfn,
|
|
record->pfn >> pageblock_order,
|
|
record->count, record->mapcount,
|
|
record->mapping,
|
|
record->flags, &record->flags);
|
|
|
|
if (ret >= count)
|
|
goto err;
|
|
|
|
nr_entries = stack_depot_fetch(record->handle, &entries);
|
|
ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
|
|
nr_entries, 0);
|
|
if (ret >= count)
|
|
goto err;
|
|
|
|
ret += snprintf(kbuf + ret, count - ret, "\n");
|
|
if (ret >= count)
|
|
goto err;
|
|
|
|
if (copy_to_user(buf, kbuf, ret))
|
|
ret = -EFAULT;
|
|
|
|
kfree(kbuf);
|
|
return ret;
|
|
|
|
err:
|
|
kfree(kbuf);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
void __dump_page_pinner(struct page *page)
|
|
{
|
|
struct page_ext *page_ext = lookup_page_ext(page);
|
|
struct page_pinner *page_pinner;
|
|
depot_stack_handle_t handle;
|
|
unsigned long *entries;
|
|
unsigned int nr_entries;
|
|
int pageblock_mt;
|
|
unsigned long pfn;
|
|
int count;
|
|
unsigned long rem_usec;
|
|
s64 ts_usec;
|
|
|
|
if (unlikely(!page_ext)) {
|
|
pr_alert("There is not page extension available.\n");
|
|
return;
|
|
}
|
|
|
|
page_pinner = get_page_pinner(page_ext);
|
|
|
|
count = atomic_read(&page_pinner->count);
|
|
if (!count) {
|
|
pr_alert("page_pinner info is not present (never set?)\n");
|
|
return;
|
|
}
|
|
|
|
pfn = page_to_pfn(page);
|
|
ts_usec = page_pinner->ts_usec;
|
|
rem_usec = do_div(ts_usec, 1000000);
|
|
pr_alert("page last pinned %5lu.%06lu] count %d\n",
|
|
(unsigned long)ts_usec, rem_usec, count);
|
|
|
|
pageblock_mt = get_pageblock_migratetype(page);
|
|
pr_alert("PFN %lu Block %lu type %s Flags %#lx(%pGp)\n",
|
|
pfn,
|
|
pfn >> pageblock_order,
|
|
migratetype_names[pageblock_mt],
|
|
page->flags, &page->flags);
|
|
|
|
handle = READ_ONCE(page_pinner->handle);
|
|
if (!handle) {
|
|
pr_alert("page_pinner allocation stack trace missing\n");
|
|
} else {
|
|
nr_entries = stack_depot_fetch(handle, &entries);
|
|
stack_trace_print(entries, nr_entries, 0);
|
|
}
|
|
}
|
|
|
|
void __page_pinner_migration_failed(struct page *page)
|
|
{
|
|
struct page_ext *page_ext = lookup_page_ext(page);
|
|
struct page_pinner *page_pinner;
|
|
struct captured_pinner record;
|
|
unsigned long flags;
|
|
unsigned int idx;
|
|
|
|
if (unlikely(!page_ext))
|
|
return;
|
|
|
|
page_pinner = get_page_pinner(page_ext);
|
|
if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
|
|
return;
|
|
|
|
record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
|
|
record.ts_usec = ktime_to_us(ktime_get_boottime());
|
|
capture_page_state(page, &record);
|
|
|
|
spin_lock_irqsave(&acf_pinner.lock, flags);
|
|
idx = acf_pinner.index++;
|
|
acf_pinner.index %= LONGTERM_PIN_BUCKETS;
|
|
acf_pinner.pinner[idx] = record;
|
|
spin_unlock_irqrestore(&acf_pinner.lock, flags);
|
|
}
|
|
EXPORT_SYMBOL_GPL(__page_pinner_migration_failed);
|
|
|
|
void __page_pinner_mark_migration_failed_pages(struct list_head *page_list)
|
|
{
|
|
struct page *page;
|
|
struct page_ext *page_ext;
|
|
|
|
list_for_each_entry(page, page_list, lru) {
|
|
/* The page will be freed by putback_movable_pages soon */
|
|
if (page_count(page) == 1)
|
|
continue;
|
|
page_ext = lookup_page_ext(page);
|
|
if (unlikely(!page_ext))
|
|
continue;
|
|
__set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
|
|
__page_pinner_migration_failed(page);
|
|
}
|
|
}
|
|
|
|
static ssize_t
|
|
read_longterm_page_pinner(struct file *file, char __user *buf, size_t count,
|
|
loff_t *ppos)
|
|
{
|
|
loff_t i, idx;
|
|
struct captured_pinner record;
|
|
unsigned long flags;
|
|
|
|
if (!static_branch_unlikely(&page_pinner_inited))
|
|
return -EINVAL;
|
|
|
|
if (*ppos >= LONGTERM_PIN_BUCKETS)
|
|
return 0;
|
|
|
|
i = *ppos;
|
|
*ppos = i + 1;
|
|
|
|
/*
|
|
* reading the records in the reverse order with newest one
|
|
* being read first followed by older ones
|
|
*/
|
|
idx = (lt_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
|
|
LONGTERM_PIN_BUCKETS;
|
|
spin_lock_irqsave(<_pinner.lock, flags);
|
|
record = lt_pinner.pinner[idx];
|
|
spin_unlock_irqrestore(<_pinner.lock, flags);
|
|
if (!record.handle)
|
|
return 0;
|
|
|
|
return print_page_pinner(true, buf, count, &record);
|
|
}
|
|
|
|
static const struct file_operations proc_longterm_pinner_operations = {
|
|
.read = read_longterm_page_pinner,
|
|
};
|
|
|
|
/*
 * debugfs read handler for "alloc_contig_failed".
 *
 * *ppos is used as a record counter, not a byte offset: every call prints
 * one record from acf_pinner and advances *ppos by one.
 *
 * Returns the number of bytes produced by print_page_pinner(), 0 once all
 * slots were visited or an empty slot (handle == 0) is reached, -EINVAL when
 * failure tracking is switched off, or an error from print_page_pinner().
 */
static ssize_t read_alloc_contig_failed(struct file *file, char __user *buf,
					size_t count, loff_t *ppos)
{
	loff_t i, idx;
	struct captured_pinner record;
	unsigned long flags;

	if (!static_branch_unlikely(&failure_tracking))
		return -EINVAL;

	if (*ppos >= LONGTERM_PIN_BUCKETS)
		return 0;

	i = *ppos;
	*ppos = i + 1;

	/*
	 * reading the records in the reverse order with newest one
	 * being read first followed by older ones
	 *
	 * The write index is sampled under the lock so that it is consistent
	 * with the slot that gets copied out.
	 */
	spin_lock_irqsave(&acf_pinner.lock, flags);
	idx = (acf_pinner.index - 1 - i + LONGTERM_PIN_BUCKETS) %
	       LONGTERM_PIN_BUCKETS;
	record = acf_pinner.pinner[idx];
	spin_unlock_irqrestore(&acf_pinner.lock, flags);
	if (!record.handle)
		return 0;

	return print_page_pinner(false, buf, count, &record);
}

static const struct file_operations proc_alloc_contig_failed_operations = {
|
|
.read = read_alloc_contig_failed,
|
|
};
|
|
|
|
static int pp_threshold_set(void *data, unsigned long long val)
|
|
{
|
|
unsigned long flags;
|
|
|
|
threshold_usec = (s64)val;
|
|
|
|
spin_lock_irqsave(<_pinner.lock, flags);
|
|
memset(lt_pinner.pinner, 0,
|
|
sizeof(struct captured_pinner) * LONGTERM_PIN_BUCKETS);
|
|
lt_pinner.index = 0;
|
|
spin_unlock_irqrestore(<_pinner.lock, flags);
|
|
return 0;
|
|
}
|
|
|
|
static int pp_threshold_get(void *data, unsigned long long *val)
|
|
{
|
|
*val = (unsigned long long)threshold_usec;
|
|
|
|
return 0;
|
|
}
|
|
DEFINE_DEBUGFS_ATTRIBUTE(pp_threshold_fops, pp_threshold_get,
|
|
pp_threshold_set, "%lld\n");
|
|
|
|
/*
 * debugfs setter for "failure_tracking": any non-zero @val enables the
 * failure_tracking static key, zero disables it.
 *
 * @data is unused. Always returns 0.
 */
static int failure_tracking_set(void *data, u64 val)
{
	bool on;

	on = (bool)val;
	if (on)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}

static int failure_tracking_get(void *data, u64 *val)
|
|
{
|
|
*val = static_branch_unlikely(&failure_tracking);
|
|
return 0;
|
|
}
|
|
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
|
|
failure_tracking_get,
|
|
failure_tracking_set, "%llu\n");
|
|
|
|
static int __init page_pinner_init(void)
|
|
{
|
|
struct dentry *pp_debugfs_root;
|
|
|
|
if (!static_branch_unlikely(&page_pinner_inited))
|
|
return 0;
|
|
|
|
pr_info("page_pinner enabled\n");
|
|
|
|
pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);
|
|
|
|
debugfs_create_file("longterm_pinner", 0444, pp_debugfs_root, NULL,
|
|
&proc_longterm_pinner_operations);
|
|
|
|
debugfs_create_file("threshold", 0644, pp_debugfs_root, NULL,
|
|
&pp_threshold_fops);
|
|
|
|
debugfs_create_file("alloc_contig_failed", 0444,
|
|
pp_debugfs_root, NULL,
|
|
&proc_alloc_contig_failed_operations);
|
|
|
|
debugfs_create_file("failure_tracking", 0644,
|
|
pp_debugfs_root, NULL,
|
|
&failure_tracking_fops);
|
|
return 0;
|
|
}
|
|
late_initcall(page_pinner_init)
|