patch-2.4.10 linux/mm/vmscan.c
- Lines: 1269
- Date: Sun Sep 23 09:58:51 2001
- Orig file: v2.4.9/linux/mm/vmscan.c
- Orig date: Mon Aug 27 12:41:49 2001
diff -u --recursive --new-file v2.4.9/linux/mm/vmscan.c linux/mm/vmscan.c
@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>
+#include <linux/compiler.h>
#include <asm/pgalloc.h>
@@ -32,21 +33,6 @@
*/
#define DEF_PRIORITY (6)
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-
-static inline void age_page_up(struct page *page)
-{
- unsigned age = page->age + PAGE_AGE_ADV;
- if (age > PAGE_AGE_MAX)
- age = PAGE_AGE_MAX;
- page->age = age;
-}
-
-static inline void age_page_down(struct page * page)
-{
- page->age /= 2;
-}
-
/*
* The swap-out function returns 1 if it successfully
* scanned all the pages it was asked to (`count').
@@ -56,61 +42,32 @@
* doesn't count as having freed a page.
*/
-/*
- * Estimate whether a zone has enough inactive or free pages..
- */
-static unsigned int zone_inactive_plenty(zone_t *zone)
-{
- unsigned int inactive;
-
- if (!zone->size)
- return 0;
-
- inactive = zone->inactive_dirty_pages;
- inactive += zone->inactive_clean_pages;
- inactive += zone->free_pages;
-
- return (inactive > (zone->size / 3));
-}
-
-static unsigned int zone_free_plenty(zone_t *zone)
-{
- unsigned int free;
-
- free = zone->free_pages;
- free += zone->inactive_clean_pages;
-
- return free > zone->pages_high*2;
-}
-
/* mm->page_table_lock is held. mmap_sem is not held */
-static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
+static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
{
pte_t pte;
swp_entry_t entry;
-
- /*
- * If we are doing a zone-specific scan, do not
- * touch pages from zones which don't have a
- * shortage.
- */
- if (zone_inactive_plenty(page->zone))
- return;
+ int right_classzone;
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
- age_page_up(page);
- return;
+ flush_tlb_page(vma, address);
+ return 0;
}
if (TryLockPage(page))
- return;
+ return 0;
+
+ right_classzone = 1;
+ if (!memclass(page->zone, classzone))
+ right_classzone = 0;
/* From this point on, the odds are that we're going to
* nuke this pte, so read and clear the pte. This hook
* is needed on CPUs which update the accessed and dirty
* bits in hardware.
*/
+ flush_cache_page(vma, address);
pte = ptep_get_and_clear(page_table);
flush_tlb_page(vma, address);
@@ -123,22 +80,24 @@
entry.val = page->index;
if (pte_dirty(pte))
set_page_dirty(page);
-set_swap_pte:
swap_duplicate(entry);
+set_swap_pte:
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
mm->rss--;
- if (!page->age)
- deactivate_page(page);
UnlockPage(page);
- page_cache_release(page);
- return;
+ {
+ int freeable = page_count(page) - !!page->buffers <= 2;
+ page_cache_release(page);
+ return freeable & right_classzone;
+ }
}
/*
* Is it a clean page? Then it must be recoverable
* by just paging it in again, and we can just drop
- * it..
+ * it.. or if it's dirty but has backing store,
+ * just mark the page dirty and drop it.
*
* However, this won't actually free any real
* memory, as the page will just be in the page cache
@@ -148,20 +107,17 @@
* Basically, this just makes it possible for us to do
* some real work in the future in "refill_inactive()".
*/
- flush_cache_page(vma, address);
- if (!pte_dirty(pte))
+ if (page->mapping) {
+ if (pte_dirty(pte))
+ set_page_dirty(page);
goto drop_pte;
-
+ }
/*
- * Ok, it's really dirty. That means that
- * we should either create a new swap cache
- * entry for it, or we should write it back
- * to its own backing store.
+ * Check PageDirty as well as pte_dirty: page may
+ * have been brought back from swap by swapoff.
*/
- if (page->mapping) {
- set_page_dirty(page);
+ if (!pte_dirty(pte) && !PageDirty(page))
goto drop_pte;
- }
/*
* This is a dirty, swappable page. First of all,
@@ -169,23 +125,25 @@
* we have the swap cache set up to associate the
* page with that swap entry.
*/
+ swap_list_lock();
entry = get_swap_page();
- if (!entry.val)
- goto out_unlock_restore; /* No swap space left */
-
- /* Add it to the swap cache and mark it dirty */
- add_to_swap_cache(page, entry);
- set_page_dirty(page);
- goto set_swap_pte;
+ if (entry.val) {
+ /* Add it to the swap cache and mark it dirty */
+ add_to_swap_cache(page, entry);
+ swap_list_unlock();
+ set_page_dirty(page);
+ goto set_swap_pte;
+ }
-out_unlock_restore:
+ /* No swap space left */
+ swap_list_unlock();
set_pte(page_table, pte);
UnlockPage(page);
- return;
+ return 0;
}
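
The rewritten try_to_swap_out() above no longer ages pages up and down; it reports to its caller whether unmapping this pte probably turned the page into something reclaimable. The check page_count(page) - !!page->buffers <= 2 roughly allows for the reference the just-cleared pte held (released immediately below by page_cache_release()) plus the page-cache or swap-cache reference, with one extra count tolerated while buffer heads are attached, and the result is masked with right_classzone so only pages in the zone under pressure count toward the caller's quota. A small user-space sketch of that arithmetic (struct page_model and the sample values are illustrative, not kernel structures):

#include <stdio.h>

/* Illustrative stand-in for the struct page fields the test looks at. */
struct page_model {
	int count;		/* page_count(): pte ref + cache ref + other users */
	void *buffers;		/* non-NULL while buffer heads pin the page */
};

/* Mirrors: freeable = page_count(page) - !!page->buffers <= 2 */
static int probably_freeable(const struct page_model *p)
{
	return p->count - !!p->buffers <= 2;
}

int main(void)
{
	struct page_model last_mapper = { 2, NULL };	/* pte + cache only */
	struct page_model shared = { 3, NULL };		/* another pte maps it too */
	struct page_model buffered = { 3, (void *)1 };	/* buffers allow one extra */

	printf("last mapper: %d\n", probably_freeable(&last_mapper));	/* 1 */
	printf("shared:      %d\n", probably_freeable(&shared));	/* 0 */
	printf("buffered:    %d\n", probably_freeable(&buffered));	/* 1 */
	return 0;
}

In the swap_out_pmd() hunk that follows, this return value is what gets subtracted from count, so only pages that actually became reclaimable consume the scan budget.
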
/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
+static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
pte_t * pte;
unsigned long pmd_end;
@@ -209,20 +167,22 @@
struct page *page = pte_page(*pte);
if (VALID_PAGE(page) && !PageReserved(page)) {
- try_to_swap_out(mm, vma, address, pte, page);
- if (!--count)
+ count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
+ if (!count) {
+ address += PAGE_SIZE;
break;
+ }
}
}
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
- mm->swap_address = address + PAGE_SIZE;
+ mm->swap_address = address;
return count;
}
/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
+static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
pmd_t * pmd;
unsigned long pgd_end;
@@ -242,7 +202,7 @@
end = pgd_end;
do {
- count = swap_out_pmd(mm, vma, pmd, address, end, count);
+ count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
if (!count)
break;
address = (address + PMD_SIZE) & PMD_MASK;
@@ -252,7 +212,7 @@
}
/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
+static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
{
pgd_t *pgdir;
unsigned long end;
@@ -267,7 +227,7 @@
if (address >= end)
BUG();
do {
- count = swap_out_pgd(mm, vma, pgdir, address, end, count);
+ count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
if (!count)
break;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -276,607 +236,426 @@
return count;
}
+/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
+struct mm_struct *swap_mm = &init_mm;
+
/*
- * Returns non-zero if we scanned all `count' pages
+ * Returns the remaining count of pages to be swapped out by a follow-up call.
*/
-static int swap_out_mm(struct mm_struct * mm, int count)
+static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
{
unsigned long address;
struct vm_area_struct* vma;
- if (!count)
- return 1;
- /*
- * Go through process' page directory.
- */
-
/*
* Find the proper vm-area after freezing the vma chain
* and ptes.
*/
spin_lock(&mm->page_table_lock);
address = mm->swap_address;
+ if (address == TASK_SIZE || swap_mm != mm) {
+ /* We raced: don't count this mm but try again */
+ ++*mmcounter;
+ goto out_unlock;
+ }
vma = find_vma(mm, address);
if (vma) {
if (address < vma->vm_start)
address = vma->vm_start;
for (;;) {
- count = swap_out_vma(mm, vma, address, count);
- if (!count)
- goto out_unlock;
+ count = swap_out_vma(mm, vma, address, count, classzone);
vma = vma->vm_next;
if (!vma)
break;
+ if (!count)
+ goto out_unlock;
address = vma->vm_start;
}
}
- /* Reset to 0 when we reach the end of address space */
- mm->swap_address = 0;
+ /* Indicate that we reached the end of address space */
+ mm->swap_address = TASK_SIZE;
out_unlock:
spin_unlock(&mm->page_table_lock);
- return !count;
-}
-
-#define SWAP_MM_SHIFT 4
-#define SWAP_SHIFT 5
-#define SWAP_MIN 8
-
-static inline int swap_amount(struct mm_struct *mm)
-{
- int nr = mm->rss >> SWAP_SHIFT;
- if (nr < SWAP_MIN) {
- nr = SWAP_MIN;
- if (nr > mm->rss)
- nr = mm->rss;
- }
- return nr;
+ return count;
}
-static void swap_out(unsigned int priority, int gfp_mask)
+static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
+static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
{
int counter;
- int retval = 0;
- struct mm_struct *mm = current->mm;
-
- /* Always start by trying to penalize the process that is allocating memory */
- if (mm)
- retval = swap_out_mm(mm, swap_amount(mm));
+ struct mm_struct *mm;
/* Then, look at the other mm's */
- counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority;
+ counter = mmlist_nr / priority;
do {
- struct list_head *p;
+ if (unlikely(current->need_resched)) {
+ __set_current_state(TASK_RUNNING);
+ schedule();
+ }
spin_lock(&mmlist_lock);
- p = init_mm.mmlist.next;
- if (p == &init_mm.mmlist)
- goto empty;
-
- /* Move it to the back of the queue.. */
- list_del(p);
- list_add_tail(p, &init_mm.mmlist);
- mm = list_entry(p, struct mm_struct, mmlist);
+ mm = swap_mm;
+ while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
+ mm->swap_address = 0;
+ mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
+ if (mm == swap_mm)
+ goto empty;
+ swap_mm = mm;
+ }
/* Make sure the mm doesn't disappear when we drop the lock.. */
atomic_inc(&mm->mm_users);
spin_unlock(&mmlist_lock);
- /* Walk about 6% of the address space each time */
- retval |= swap_out_mm(mm, swap_amount(mm));
+ nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
+
mmput(mm);
+
+ if (!nr_pages)
+ return 1;
} while (--counter >= 0);
- return;
+
+ return 0;
empty:
spin_unlock(&mmlist_lock);
+ return 0;
}
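
swap_out() no longer rotates each mm to the tail of init_mm's mmlist; it keeps a persistent global cursor, swap_mm, and walks forward past any mm whose swap_address has reached TASK_SIZE (its address space has been fully scanned), resetting swap_address along the way so the next cycle rescans it, and bails out through the empty label only if the walk wraps straight back onto the cursor. A standalone sketch of that cursor walk over a circular list (mm_model, TASK_SIZE_MODEL and the hand-built ring are illustrative stand-ins, not kernel code):

#include <stdio.h>

#define TASK_SIZE_MODEL 0x1000UL	/* illustrative "end of address space" */

/* Illustrative mm: a ring of address spaces anchored at a sentinel ("init_mm"). */
struct mm_model {
	const char *name;
	unsigned long swap_address;	/* TASK_SIZE_MODEL once fully scanned */
	struct mm_model *next;		/* circular, like mm->mmlist */
};

static struct mm_model *swap_mm_cursor;	/* analogue of the new global swap_mm */

/* Pick the next mm that still has unscanned address space, or NULL if none. */
static struct mm_model *pick_next_mm(struct mm_model *sentinel)
{
	struct mm_model *mm = swap_mm_cursor;

	while (mm->swap_address == TASK_SIZE_MODEL || mm == sentinel) {
		mm->swap_address = 0;	/* reset so a later cycle rescans it */
		mm = mm->next;
		if (mm == swap_mm_cursor)
			return NULL;	/* wrapped straight back: list is empty */
		swap_mm_cursor = mm;
	}
	return mm;
}

int main(void)
{
	struct mm_model init_mm = { "init_mm", TASK_SIZE_MODEL, NULL };
	struct mm_model a = { "a", 0, NULL };
	struct mm_model b = { "b", TASK_SIZE_MODEL, NULL };

	init_mm.next = &a;
	a.next = &b;
	b.next = &init_mm;		/* build the ring */
	swap_mm_cursor = &init_mm;

	struct mm_model *mm = pick_next_mm(&init_mm);
	printf("picked: %s\n", mm ? mm->name : "(none)");	/* picks "a" */
	return 0;
}
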
-
-/**
- * reclaim_page - reclaims one page from the inactive_clean list
- * @zone: reclaim a page from this zone
- *
- * The pages on the inactive_clean can be instantly reclaimed.
- * The tests look impressive, but most of the time we'll grab
- * the first page of the list and exit successfully.
- */
-struct page * reclaim_page(zone_t * zone)
+static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask));
+static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)
{
- struct page * page = NULL;
- struct list_head * page_lru;
- int maxscan;
+ struct list_head * entry;
- /*
- * We only need the pagemap_lru_lock if we don't reclaim the page,
- * but we have to grab the pagecache_lock before the pagemap_lru_lock
- * to avoid deadlocks and most of the time we'll succeed anyway.
- */
- spin_lock(&pagecache_lock);
spin_lock(&pagemap_lru_lock);
- maxscan = zone->inactive_clean_pages;
- while ((page_lru = zone->inactive_clean_list.prev) !=
- &zone->inactive_clean_list && maxscan--) {
- page = list_entry(page_lru, struct page, lru);
-
- /* Wrong page on list?! (list corruption, should not happen) */
- if (!PageInactiveClean(page)) {
- printk("VM: reclaim_page, wrong page on list.\n");
- list_del(page_lru);
- page->zone->inactive_clean_pages--;
- continue;
- }
+ while (max_scan && (entry = inactive_list.prev) != &inactive_list) {
+ struct page * page;
+ swp_entry_t swap;
- /* Page is or was in use? Move it to the active list. */
- if (PageReferenced(page) || (!page->buffers && page_count(page) > 1)) {
- del_page_from_inactive_clean_list(page);
- add_page_to_active_list(page);
- page->age = PAGE_AGE_START;
- continue;
- }
-
- /* The page is dirty, or locked, move to inactive_dirty list. */
- if (page->buffers || PageDirty(page) || TryLockPage(page)) {
- del_page_from_inactive_clean_list(page);
- add_page_to_inactive_dirty_list(page);
+ if (unlikely(current->need_resched)) {
+ spin_unlock(&pagemap_lru_lock);
+ __set_current_state(TASK_RUNNING);
+ schedule();
+ spin_lock(&pagemap_lru_lock);
continue;
}
- /* OK, remove the page from the caches. */
- if (PageSwapCache(page)) {
- __delete_from_swap_cache(page);
- goto found_page;
- }
-
- if (page->mapping) {
- __remove_inode_page(page);
- goto found_page;
- }
-
- /* We should never ever get here. */
- printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
- list_del(page_lru);
- zone->inactive_clean_pages--;
- UnlockPage(page);
- }
- /* Reset page pointer, maybe we encountered an unfreeable page. */
- page = NULL;
- goto out;
-
-found_page:
- memory_pressure++;
- del_page_from_inactive_clean_list(page);
- UnlockPage(page);
- page->age = PAGE_AGE_START;
- if (page_count(page) != 1)
- printk("VM: reclaim_page, found page with count %d!\n",
- page_count(page));
-out:
- spin_unlock(&pagemap_lru_lock);
- spin_unlock(&pagecache_lock);
- return page;
-}
+ page = list_entry(entry, struct page, lru);
-/**
- * page_launder - clean dirty inactive pages, move to inactive_clean list
- * @gfp_mask: what operations we are allowed to do
- * @sync: are we allowed to do synchronous IO in emergencies ?
- *
- * When this function is called, we are most likely low on free +
- * inactive_clean pages. Since we want to refill those pages as
- * soon as possible, we'll make two loops over the inactive list,
- * one to move the already cleaned pages to the inactive_clean lists
- * and one to (often asynchronously) clean the dirty inactive pages.
- *
- * In situations where kswapd cannot keep up, user processes will
- * end up calling this function. Since the user process needs to
- * have a page before it can continue with its allocation, we'll
- * do synchronous page flushing in that case.
- *
- * This code used to be heavily inspired by the FreeBSD source code.
- * Thanks go out to Matthew Dillon.
- */
-#define CAN_DO_FS (gfp_mask & __GFP_FS)
-int page_launder(int gfp_mask, int sync)
-{
- int maxscan, cleaned_pages;
- struct list_head * page_lru;
- struct page * page;
-
- cleaned_pages = 0;
-
- /* Will we wait on IO? */
- if (!sync)
- gfp_mask &= ~__GFP_WAIT;
-
- spin_lock(&pagemap_lru_lock);
- maxscan = nr_inactive_dirty_pages >> DEF_PRIORITY;
- while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
- maxscan-- > 0) {
- page = list_entry(page_lru, struct page, lru);
+ if (unlikely(!PageInactive(page) && !PageActive(page)))
+ BUG();
- /* Wrong page on list?! (list corruption, should not happen) */
- if (!PageInactiveDirty(page)) {
- printk("VM: page_launder, wrong page on list.\n");
- list_del(page_lru);
- nr_inactive_dirty_pages--;
- page->zone->inactive_dirty_pages--;
+ list_del(entry);
+ list_add(entry, &inactive_list);
+ if (PageTestandClearReferenced(page))
continue;
- }
- /* Page is or was in use? Move it to the active list. */
- if (PageReferenced(page) || (!page->buffers && page_count(page) > 1) ||
- page_ramdisk(page)) {
- del_page_from_inactive_dirty_list(page);
- add_page_to_active_list(page);
- page->age = PAGE_AGE_START;
+ max_scan--;
+
+ if (unlikely(!memclass(page->zone, classzone)))
continue;
- }
- /*
- * If this zone has plenty of pages free,
- * don't spend time on cleaning it.
- */
- if (zone_free_plenty(page->zone)) {
- list_del(page_lru);
- list_add(page_lru, &inactive_dirty_list);
+ /* Racy check to avoid trylocking when not worthwhile */
+ if (!page->buffers && page_count(page) != 1)
continue;
- }
/*
* The page is locked. IO in progress?
* Move it to the back of the list.
*/
- if (TryLockPage(page)) {
- list_del(page_lru);
- list_add(page_lru, &inactive_dirty_list);
+ if (unlikely(TryLockPage(page)))
continue;
+
+ if (PageDirty(page) && is_page_cache_freeable(page)) {
+ /*
+ * It is not critical here to write it only if
+ * the page is unmapped because any direct writer
+ * like O_DIRECT would set the PG_dirty bitflag
+ * on the physical page after having successfully
+ * pinned it and after the I/O to the page is finished,
+ * so the direct writes to the page cannot get lost.
+ */
+ int (*writepage)(struct page *);
+
+ writepage = page->mapping->a_ops->writepage;
+ if ((gfp_mask & __GFP_FS) && writepage) {
+ ClearPageDirty(page);
+ page_cache_get(page);
+ spin_unlock(&pagemap_lru_lock);
+
+ writepage(page);
+ page_cache_release(page);
+
+ spin_lock(&pagemap_lru_lock);
+ continue;
+ }
}
/*
- * Dirty swap-cache page? Write it out if
- * last copy..
+ * If the page has buffers, try to free the buffer mappings
+ * associated with this page. If we succeed we try to free
+ * the page as well.
*/
- if (PageDirty(page)) {
- int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
+ if (page->buffers) {
+ spin_unlock(&pagemap_lru_lock);
+
+ /* avoid freeing a locked page */
+ page_cache_get(page);
- if (!writepage)
- goto page_active;
+ if (try_to_free_buffers(page, gfp_mask)) {
+ if (!page->mapping) {
+ /*
+ * Account that we successfully freed a page
+ * of buffer cache.
+ */
+ atomic_dec(&buffermem_pages);
+
+ /*
+ * We must not allow an anon page
+ * with no buffers to be visible on
+ * the LRU, so we unlock the page after
+ * taking the lru lock
+ */
+ spin_lock(&pagemap_lru_lock);
+ UnlockPage(page);
+ __lru_cache_del(page);
- /* Can't do it? Move it to the back of the list */
- if (!CAN_DO_FS) {
- list_del(page_lru);
- list_add(page_lru, &inactive_dirty_list);
+ /* effectively free the page here */
+ page_cache_release(page);
+
+ if (--nr_pages)
+ continue;
+ break;
+ } else {
+ /*
+ * The page is still in the pagecache, so undo what we
+ * did before try_to_free_buffers; we have not finished
+ * and can now try the next step.
+ */
+ page_cache_release(page);
+
+ spin_lock(&pagemap_lru_lock);
+ }
+ } else {
+ /* failed to drop the buffers so stop here */
UnlockPage(page);
+ page_cache_release(page);
+
+ spin_lock(&pagemap_lru_lock);
continue;
}
+ }
- /* OK, do a physical asynchronous write to swap. */
- ClearPageDirty(page);
- page_cache_get(page);
- spin_unlock(&pagemap_lru_lock);
+ if (unlikely(!page->mapping))
+ BUG();
- writepage(page);
- page_cache_release(page);
+ if (unlikely(!spin_trylock(&pagecache_lock))) {
+ /* we hold the page lock so the page cannot go away from under us */
+ spin_unlock(&pagemap_lru_lock);
- /* And re-start the thing.. */
+ spin_lock(&pagecache_lock);
spin_lock(&pagemap_lru_lock);
- continue;
}
/*
- * If the page has buffers, try to free the buffer mappings
- * associated with this page. If we succeed we either free
- * the page (in case it was a buffercache only page) or we
- * move the page to the inactive_clean list.
- *
- * On the first round, we should free all previously cleaned
- * buffer pages
+ * This is the non-racy check; it is critical to check
+ * PageDirty _after_ we have made sure the page is freeable
+ * and hence not in use by anybody.
*/
- if (page->buffers) {
- int clearedbuf;
- int freed_page = 0;
+ if (!is_page_cache_freeable(page) || PageDirty(page)) {
+ spin_unlock(&pagecache_lock);
+ UnlockPage(page);
+ continue;
+ }
- /*
- * Since we might be doing disk IO, we have to
- * drop the spinlock and take an extra reference
- * on the page so it doesn't go away from under us.
- */
- del_page_from_inactive_dirty_list(page);
- page_cache_get(page);
- spin_unlock(&pagemap_lru_lock);
+ /* point of no return */
+ if (likely(!PageSwapCache(page))) {
+ swap.val = 0;
+ __remove_inode_page(page);
+ } else {
+ swap.val = page->index;
+ __delete_from_swap_cache(page);
+ }
+ spin_unlock(&pagecache_lock);
- /* Try to free the page buffers. */
- clearedbuf = try_to_free_buffers(page, gfp_mask);
+ __lru_cache_del(page);
- /*
- * Re-take the spinlock. Note that we cannot
- * unlock the page yet since we're still
- * accessing the page_struct here...
- */
+ if (unlikely(swap.val != 0)) {
+ /* must drop lru lock if getting swap_list lock */
+ spin_unlock(&pagemap_lru_lock);
+ swap_free(swap);
spin_lock(&pagemap_lru_lock);
+ }
- /* The buffers were not freed. */
- if (!clearedbuf) {
- add_page_to_inactive_dirty_list(page);
-
- /* The page was only in the buffer cache. */
- } else if (!page->mapping) {
- atomic_dec(&buffermem_pages);
- freed_page = 1;
- cleaned_pages++;
-
- /* The page has more users besides the cache and us. */
- } else if (page_count(page) > 2) {
- add_page_to_active_list(page);
-
- /* OK, we "created" a freeable page. */
- } else /* page->mapping && page_count(page) == 2 */ {
- add_page_to_inactive_clean_list(page);
- cleaned_pages++;
- }
+ UnlockPage(page);
- /*
- * Unlock the page and drop the extra reference.
- * We can only do it here because we are accessing
- * the page struct above.
- */
- UnlockPage(page);
- page_cache_release(page);
+ /* effectively free the page here */
+ page_cache_release(page);
+ if (--nr_pages)
continue;
- } else if (page->mapping && !PageDirty(page)) {
- /*
- * If a page had an extra reference in
- * deactivate_page(), we will find it here.
- * Now the page is really freeable, so we
- * move it to the inactive_clean list.
- */
- del_page_from_inactive_dirty_list(page);
- add_page_to_inactive_clean_list(page);
- UnlockPage(page);
- cleaned_pages++;
- } else {
-page_active:
- /*
- * OK, we don't know what to do with the page.
- * It's no use keeping it here, so we move it to
- * the active list.
- */
- del_page_from_inactive_dirty_list(page);
- add_page_to_active_list(page);
- UnlockPage(page);
- }
+ break;
}
spin_unlock(&pagemap_lru_lock);
- /* Return the number of pages moved to the inactive_clean list. */
- return cleaned_pages;
+ return nr_pages;
}
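
shrink_cache() replaces reclaim_page() and page_launder() with a single walk of the tail of the unified inactive_list, applying a fixed sequence of tests to each page: recently referenced pages stay on the list, pages outside the target classzone or with other users are skipped, dirty freeable pages are pushed out through ->writepage() when __GFP_FS allows it, buffer pages go through try_to_free_buffers(), and only a page that survives a final freeable-and-clean re-check is removed from the page cache (or swap cache) and freed. A compressed user-space restatement of that decision order, with the locking, reference counting and I/O stripped out (struct scan_page and its fields are illustrative, not kernel fields):

#include <stdio.h>

/* Illustrative page state standing in for what shrink_cache() inspects. */
struct scan_page {
	int referenced, dirty, has_buffers, extra_users, in_classzone, can_writepage;
};

enum scan_action { ROTATE, SKIP, START_WRITEPAGE, DROP_BUFFERS, RECLAIM };

/* Decision order of the new shrink_cache(), stripped of locking and I/O. */
static enum scan_action classify(const struct scan_page *p, int gfp_allows_fs)
{
	if (p->referenced)
		return ROTATE;			/* clear the bit, keep the page */
	if (!p->in_classzone)
		return SKIP;			/* not the zone we reclaim for */
	if (p->extra_users && !p->has_buffers)
		return SKIP;			/* racy page_count() shortcut */
	if (p->dirty && !p->extra_users && gfp_allows_fs && p->can_writepage)
		return START_WRITEPAGE;		/* async writepage(), revisit later */
	if (p->has_buffers)
		return DROP_BUFFERS;		/* try_to_free_buffers() first */
	if (p->extra_users || p->dirty)
		return SKIP;			/* non-racy re-check before removal */
	return RECLAIM;				/* drop from page/swap cache, free */
}

int main(void)
{
	struct scan_page dirty_file = { 0, 1, 0, 0, 1, 1 };
	struct scan_page clean_anon = { 0, 0, 0, 0, 1, 0 };

	printf("%d %d\n", classify(&dirty_file, 1), classify(&clean_anon, 1));	/* 2 4 */
	return 0;
}
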
-/**
- * refill_inactive_scan - scan the active list and find pages to deactivate
- * @priority: the priority at which to scan
+/*
+ * This moves pages from the active list to
+ * the inactive list.
*
- * This function will scan a portion of the active list to find
- * unused pages, those pages will then be moved to the inactive list.
+ * We move them the other way when we see the
+ * reference bit on the page.
*/
-static int refill_inactive_scan(unsigned int priority)
+static void refill_inactive(int nr_pages)
{
- struct list_head * page_lru;
- struct page * page;
- int maxscan = nr_active_pages >> priority;
- int page_active = 0;
- int nr_deactivated = 0;
+ struct list_head * entry;
- /* Take the lock while messing with the list... */
spin_lock(&pagemap_lru_lock);
- while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
- page = list_entry(page_lru, struct page, lru);
+ entry = active_list.prev;
+ while (nr_pages-- && entry != &active_list) {
+ struct page * page;
- /* Wrong page on list?! (list corruption, should not happen) */
- if (!PageActive(page)) {
- printk("VM: refill_inactive, wrong page on list.\n");
- list_del(page_lru);
- nr_active_pages--;
+ page = list_entry(entry, struct page, lru);
+ entry = entry->prev;
+ if (PageTestandClearReferenced(page)) {
+ list_del(&page->lru);
+ list_add(&page->lru, &active_list);
continue;
}
- /*
- * Do not deactivate pages from zones which
- * have plenty inactive pages.
- */
-
- if (zone_inactive_plenty(page->zone)) {
- page_active = 1;
- goto skip_page;
- }
-
- /* Do aging on the pages. */
- if (PageTestandClearReferenced(page)) {
- age_page_up(page);
- page_active = 1;
- } else {
- age_page_down(page);
- /*
- * Since we don't hold a reference on the page
- * ourselves, we have to do our test a bit more
- * strict then deactivate_page(). This is needed
- * since otherwise the system could hang shuffling
- * unfreeable pages from the active list to the
- * inactive_dirty list and back again...
- *
- * SUBTLE: we can have buffer pages with count 1.
- */
- if (page->age == 0 && page_count(page) <=
- (page->buffers ? 2 : 1)) {
- deactivate_page_nolock(page);
- page_active = 0;
- } else {
- page_active = 1;
- }
- }
- /*
- * If the page is still on the active list, move it
- * to the other end of the list. Otherwise we exit if
- * we have done enough work.
- */
- if (page_active || PageActive(page)) {
-skip_page:
- list_del(page_lru);
- list_add(page_lru, &active_list);
- } else {
- nr_deactivated++;
- }
+ del_page_from_active_list(page);
+ add_page_to_inactive_list(page);
}
spin_unlock(&pagemap_lru_lock);
-
- return nr_deactivated;
}
-/*
- * Check if there are zones with a severe shortage of free pages,
- * or if all zones have a minor shortage.
- */
-int free_shortage(void)
+static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
+static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
{
- pg_data_t *pgdat;
- unsigned int global_free = 0;
- unsigned int global_target = freepages.high;
-
- /* Are we low on free pages anywhere? */
- pgdat = pgdat_list;
- do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones+ i;
- unsigned int free;
+ int max_scan = nr_inactive_pages / priority;
- if (!zone->size)
- continue;
+ nr_pages -= kmem_cache_reap(gfp_mask);
+ if (nr_pages <= 0)
+ return 0;
- free = zone->free_pages;
- free += zone->inactive_clean_pages;
+ /* Do we want to age the active list? */
+ if (nr_inactive_pages < nr_active_pages*2)
+ refill_inactive(nr_pages);
- /* Local shortage? */
- if (free < zone->pages_low)
- return 1;
+ nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask);
+ if (nr_pages <= 0)
+ return 0;
- global_free += free;
- }
- pgdat = pgdat->node_next;
- } while (pgdat);
+ shrink_dcache_memory(priority, gfp_mask);
+ shrink_icache_memory(priority, gfp_mask);
- /* Global shortage? */
- return global_free < global_target;
+ return nr_pages;
}
-/*
- * Are we low on inactive pages globally or in any zone?
- */
-int inactive_shortage(void)
+int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order)
{
- pg_data_t *pgdat;
- unsigned int global_target = freepages.high + inactive_target;
- unsigned int global_incative = 0;
+ int priority = DEF_PRIORITY;
+ int ret = 0;
- pgdat = pgdat_list;
do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones + i;
- unsigned int inactive;
+ int nr_pages = SWAP_CLUSTER_MAX;
+ nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages);
+ if (nr_pages <= 0)
+ return 1;
- if (!zone->size)
- continue;
+ ret |= swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX << 2);
+ } while (--priority);
- inactive = zone->inactive_dirty_pages;
- inactive += zone->inactive_clean_pages;
- inactive += zone->free_pages;
+ return ret;
+}
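
try_to_free_pages() now drives reclaim with a shrinking priority: each pass asks shrink_caches() for SWAP_CLUSTER_MAX pages, and shrink_cache() may scan up to nr_inactive_pages / priority entries, so each successive pass from DEF_PRIORITY (6) down to 1 is allowed to examine a larger slice of the inactive list before swap_out() is asked to unmap more process pages. A trivial illustration of how that scan budget grows per pass (the inactive-page count is a made-up example):

#include <stdio.h>

/* How much of the inactive list each pass is willing to scan:
 * max_scan = nr_inactive_pages / priority, with priority counting
 * down from DEF_PRIORITY (6) toward 1.  Purely illustrative numbers. */
int main(void)
{
	const int nr_inactive_pages = 12000;	/* assumed example value */

	for (int priority = 6; priority >= 1; priority--)
		printf("priority %d: max_scan = %d\n",
		       priority, nr_inactive_pages / priority);
	return 0;
}
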
- /* Local shortage? */
- if (inactive < zone->pages_high)
- return 1;
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
- global_incative += inactive;
- }
- pgdat = pgdat->node_next;
- } while (pgdat);
+static int check_classzone_need_balance(zone_t * classzone)
+{
+ zone_t * first_classzone;
- /* Global shortage? */
- return global_incative < global_target;
+ first_classzone = classzone->zone_pgdat->node_zones;
+ while (classzone >= first_classzone) {
+ if (classzone->free_pages > classzone->pages_high)
+ return 0;
+ classzone--;
+ }
+ return 1;
}
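
check_classzone_need_balance() walks from the classzone down to the node's lowest zone and declares the node balanced as soon as any zone in that range has free_pages above its pages_high watermark; kswapd keeps working while every zone at or below the classzone is still under its high mark. A standalone model of that downward walk over an indexed zone array (the zone roles and watermark numbers are invented for illustration):

#include <stdio.h>

struct zone_model { unsigned long free_pages, pages_high; };

/* Mirrors check_classzone_need_balance(): scan from the classzone index down
 * to zone 0; balanced as soon as one zone is above its high watermark. */
static int need_balance(const struct zone_model *zones, int classzone_idx)
{
	for (int i = classzone_idx; i >= 0; i--)
		if (zones[i].free_pages > zones[i].pages_high)
			return 0;	/* some zone in range has plenty */
	return 1;			/* everything in range is under pages_high */
}

int main(void)
{
	/* DMA, NORMAL, HIGHMEM -- illustrative watermarks only. */
	struct zone_model zones[] = {
		{ .free_pages = 50,  .pages_high = 64  },
		{ .free_pages = 900, .pages_high = 512 },
		{ .free_pages = 100, .pages_high = 384 },
	};

	printf("HIGHMEM classzone: %d\n", need_balance(zones, 2));	/* 0: NORMAL is fine */
	printf("DMA classzone:     %d\n", need_balance(zones, 0));	/* 1: DMA still short */
	return 0;
}

This is the value kswapd_balance_pgdat() below uses to decide whether another balancing pass over the node is needed.
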
-/*
- * Loop until we are no longer under an inactive or free
- * shortage. Return 1 on success, 0 if we failed to get
- * there even after "maxtry" loops.
- */
-#define INACTIVE_SHORTAGE 1
-#define FREE_SHORTAGE 2
-#define GENERAL_SHORTAGE 4
-static int do_try_to_free_pages(unsigned int gfp_mask, int user)
-{
- /* Always walk at least the active queue when called */
- int shortage = INACTIVE_SHORTAGE;
- int maxtry;
-
- maxtry = 1 << DEF_PRIORITY;
- do {
- /*
- * If needed, we move pages from the active list
- * to the inactive list.
- */
- if (shortage & INACTIVE_SHORTAGE) {
- /* Walk the VM space for a bit.. */
- swap_out(DEF_PRIORITY, gfp_mask);
+static int kswapd_balance_pgdat(pg_data_t * pgdat)
+{
+ int need_more_balance = 0, i;
+ zone_t * zone;
- /* ..and refill the inactive list */
- refill_inactive_scan(DEF_PRIORITY);
+ for (i = pgdat->nr_zones-1; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (unlikely(current->need_resched))
+ schedule();
+ if (!zone->need_balance)
+ continue;
+ if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
+ zone->need_balance = 0;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ*5);
+ continue;
}
+ if (check_classzone_need_balance(zone))
+ need_more_balance = 1;
+ else
+ zone->need_balance = 0;
+ }
- /*
- * If we're low on free pages, move pages from the
- * inactive_dirty list to the inactive_clean list.
- *
- * Usually bdflush will have pre-cleaned the pages
- * before we get around to moving them to the other
- * list, so this is a relatively cheap operation.
- */
- if (shortage & FREE_SHORTAGE)
- page_launder(gfp_mask, user);
-
- /*
- * Reclaim unused slab cache if we were short on memory.
- */
- if (shortage & GENERAL_SHORTAGE) {
- shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
- shrink_icache_memory(DEF_PRIORITY, gfp_mask);
+ return need_more_balance;
+}
- kmem_cache_reap(gfp_mask);
- }
+static void kswapd_balance(void)
+{
+ int need_more_balance;
+ pg_data_t * pgdat;
- if (current->need_resched) {
- __set_current_state(TASK_RUNNING);
- schedule();
- }
+ do {
+ need_more_balance = 0;
+ pgdat = pgdat_list;
+ do
+ need_more_balance |= kswapd_balance_pgdat(pgdat);
+ while ((pgdat = pgdat->node_next));
+ } while (need_more_balance);
+}
- shortage = 0;
- if (inactive_shortage())
- shortage |= INACTIVE_SHORTAGE | GENERAL_SHORTAGE;
- if (free_shortage())
- shortage |= FREE_SHORTAGE | GENERAL_SHORTAGE;
+static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
+{
+ zone_t * zone;
+ int i;
- if (--maxtry <= 0)
- break;
- } while (shortage);
+ for (i = pgdat->nr_zones-1; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (!zone->need_balance)
+ continue;
+ return 0;
+ }
- return !shortage;
+ return 1;
}
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+static int kswapd_can_sleep(void)
+{
+ pg_data_t * pgdat;
+
+ pgdat = pgdat_list;
+ do {
+ if (kswapd_can_sleep_pgdat(pgdat))
+ continue;
+ return 0;
+ } while ((pgdat = pgdat->node_next));
+
+ return 1;
+}
/*
* The background pageout daemon, started as a kernel thread
@@ -894,6 +673,7 @@
int kswapd(void *unused)
{
struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
daemonize();
strcpy(tsk->comm, "kswapd");
@@ -917,107 +697,31 @@
* Kswapd main loop.
*/
for (;;) {
- static long recalc = 0;
-
- /* Once a second ... */
- if (time_after(jiffies, recalc + HZ)) {
- recalc = jiffies;
-
- /* Recalculate VM statistics. */
- recalculate_vm_stats();
- }
-
- if (!do_try_to_free_pages(GFP_KSWAPD, 1)) {
- if (out_of_memory())
- oom_kill();
- continue;
- }
-
- run_task_queue(&tq_disk);
- interruptible_sleep_on_timeout(&kswapd_wait, HZ);
- }
-}
-
-void wakeup_kswapd(void)
-{
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
-}
-
-/*
- * Called by non-kswapd processes when they want more
- * memory but are unable to sleep on kswapd because
- * they might be holding some IO locks ...
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
- int ret = 1;
-
- if (gfp_mask & __GFP_WAIT) {
- current->flags |= PF_MEMALLOC;
- ret = do_try_to_free_pages(gfp_mask, 1);
- current->flags &= ~PF_MEMALLOC;
- }
-
- return ret;
-}
-
-DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
-/*
- * Kreclaimd will move pages from the inactive_clean list to the
- * free list, in order to keep atomic allocations possible under
- * all circumstances.
- */
-int kreclaimd(void *unused)
-{
- struct task_struct *tsk = current;
- pg_data_t *pgdat;
-
- daemonize();
- strcpy(tsk->comm, "kreclaimd");
- sigfillset(&tsk->blocked);
- current->flags |= PF_MEMALLOC;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kswapd_wait, &wait);
- while (1) {
+ mb();
+ if (kswapd_can_sleep())
+ schedule();
- /*
- * We sleep until someone wakes us up from
- * page_alloc.c::__alloc_pages().
- */
- interruptible_sleep_on(&kreclaimd_wait);
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kswapd_wait, &wait);
/*
- * Move some pages from the inactive_clean lists to
- * the free lists, if it is needed.
+ * If we actually get into a low-memory situation,
+ * the processes needing more memory will wake us
+ * up on a more timely basis.
*/
- pgdat = pgdat_list;
- do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones + i;
- if (!zone->size)
- continue;
-
- while (zone->free_pages < zone->pages_low) {
- struct page * page;
- page = reclaim_page(zone);
- if (!page)
- break;
- __free_page(page);
- }
- }
- pgdat = pgdat->node_next;
- } while (pgdat);
+ kswapd_balance();
+ run_task_queue(&tq_disk);
}
}
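
The main loop above replaces interruptible_sleep_on_timeout() with the open-coded sequence add_wait_queue() + __set_current_state(TASK_INTERRUPTIBLE), an mb(), a re-check of the zones' need_balance flags via kswapd_can_sleep(), and only then schedule(). Publishing the sleep state before the check means a waker that sets need_balance after the check still finds kswapd on the wait queue, so the wakeup cannot be lost. A user-space analogue of that ordering, using sequentially consistent atomics in place of mb() (the names and flags below are illustrative, not kernel APIs):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Analogues of zone->need_balance and of kswapd sitting on kswapd_wait. */
static atomic_int need_balance;
static atomic_int kswapd_on_waitqueue;

static void *allocator(void *arg)	/* plays the role of __alloc_pages() */
{
	(void)arg;
	atomic_store(&need_balance, 1);			/* memory got tight */
	if (atomic_load(&kswapd_on_waitqueue))
		puts("allocator: wake kswapd");		/* wake_up(&kswapd_wait) */
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* add_wait_queue() + __set_current_state(TASK_INTERRUPTIBLE) */
	atomic_store(&kswapd_on_waitqueue, 1);
	pthread_create(&t, NULL, allocator, NULL);

	/* mb(); kswapd_can_sleep(): whichever way the race goes, either the
	 * flag or the wakeup is seen, so sleeping here never loses a wakeup. */
	if (!atomic_load(&need_balance))
		puts("kswapd: nothing pending, schedule()");
	else
		puts("kswapd: work already pending, skip the sleep");

	pthread_join(t, NULL);
	atomic_store(&kswapd_on_waitqueue, 0);		/* remove_wait_queue() */
	return 0;
}
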
-
static int __init kswapd_init(void)
{
- printk("Starting kswapd v1.8\n");
+ printk("Starting kswapd\n");
swap_setup();
kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
- kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
return 0;
}