/* Pager for file data, inode, indirect blocks, and cylinder groups
   Copyright (C) 1991, 1992 Free Software Foundation

This file is part of the GNU Hurd.

The GNU Hurd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

The GNU Hurd is distributed in the hope that it will be useful, 
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with the GNU Hurd; see the file COPYING.  If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */

/* Written by Michael I. Bushnell.  */

#include <mach/vm_prot.h>
#include <mach/mig_errors.h>
#include <mach/message.h>
#include "ufs.h"
#include "memory_object.h"

/* TODO

   FILE_RO pagers -- waiting on new kernel support to do this right.

   Handle multiple inits from the race -- Richard Draves says not to
   bother, and the kernel will be fixed not to require this.  It never
   happens anyway.  ;-)

   Not alloc panics in incg_next_write are a security hole.  Kernel
   spoofers can issue writes on locked pages.
*/

/* File-scope variables (all are static to this file) */

/* Number of DEV_BSIZE disk blocks in one filesystem block; pager_init
   sets it to sblock->fs_bsize / DEV_BSIZE.  The pagers use it as the
   stride when converting a disk-block offset into a filesystem-block
   index and an offset within that block (one filesystem block per
   cylinder group for the CG pager, one per inode for the DINDIR
   pager).  */
static int dbspfsb;


/* Local vars */

/* Stride used by the DINODE pager to split a disk-block offset into a
   cylinder group number and an offset within that group's inode area.
   pager_init sets it to sblock->fs_ipg / sblock->fs_inopb.  */
static int indbs_pcg;

/* A region of sblock->fs_bsize bytes full of zeros, allocated and
   cleared by pager_init.  */
static char *zeroblock;

/* These structures record and interlock the state of each pager.
   The three pointers declared along with the structure are filled in
   by pager_init: inodeinfo from cialloc (DINODE), cginfo from
   cialloc (CG) and dininfo from cialloc (DINDIR).  */
struct controlinfo
{
  int porttype;			/* always PT_PAGER */
  enum pager_type
    {
      DINODE,			/* inodes */
      CG,			/* cylinder groups */
      DINDIR,			/* double indirect blocks */
      SINDIR,			/* single indirect blocks */
      FILE,			/* file contents */
      FILE_RO,			/* file contents, read only */
    } pager_type;		/* also indexes the pg_types name table */
  enum 
    {
      NOTINIT,			/* no memory_object_init seen yet */
      NORMAL,			/* set at the end of memory_object_init */
      TERMINATING,
      TERMINATED,
    } pager_state;

  struct mutex interlock;	/* held while examining/changing this state */
  struct condition wakeup;	/* broadcast when waiters should recheck */

  memory_object_t memobj;	/* the memory object port itself */
  memory_object_control_t memobjcntl; /* control port given at init */
  memory_object_name_t memobjname;    /* name port given at init */

  int seqno;			/* sequence number of last request */
  int waitingforseqno;		/* wakeup when done with request */

  int vsize;			/* virtual size -- not valid for FILE pagers */
  struct inode *ip;		/* used by sindir and file pagers */
  struct controlinfo *next;	/* active or freelist */
  struct controlinfo **pprev;	/* back pointer */

  int synccount;		/* set between lock_request and completed */
  int writecount;		/* number of syncing pages */

  struct pagemap *pagemap;	/* one entry per known page */
  int pagemapsize;		/* size of pagemap */

  struct controlinfo *opager;	/* FILE_RO->FILE, FILE->FILE_RO */
} *cginfo, *inodeinfo, *dininfo;

/* Printable names for the pager types, used in debugging output.  The
   entries are listed in the order of the enum pager_type constants so
   that the table can be indexed by a pager's type.  */
static char *pg_types[] =
{
  "DINODE",
  "CG",
  "DINDIR",
  "SINDIR",
  "FILE",
  "FILE_RO",
};

/* This structure exists, one per page, to record info about that page
   of a pager's object; a pager's entries are indexed by
   offset / vm_page_size.  */
/* This is known to be one byte by pagemap_resize, so the bit-fields
   below must continue to add up to exactly eight bits.  */
struct pagemap
{
  u_char
  pagingout:1,			/* being written; set when a data_write
				   arrives, cleared when it finishes */
  pageinwait:1,			/* provide data back when write done; set
				   by a data_request that finds the page
				   being paged out */
  syncing:1;			/* decrement writecount when write done */
  enum 
    {
      PAGE_NOERR,
      PAGE_ENOSPC,
      PAGE_EIO,
      PAGE_EDQUOT,
    } error:2;			/* Doesn't belong here XXX  */
  u_char spare:3;		/* unused; pads the entry to eight bits */
};

/* This structure records I/O in progress.  One is used for each run
   of pages being read, and one for each data_write being written; the
   pager_next_read/pager_next_write routines advance through it a
   piece at a time.  */
struct paginfo
{
  /* Communication with I/O routines -- must be first, because a
     struct paginfo * is handed to dev_read cast to struct ioreq *.  */
  struct ioreq req;

  /* From the initial call to inode_data_request/inode_data_write */
  mach_port_t object, control;
  vm_offset_t offset;		/* byte offset of the run in the object */
  int length;			/* byte length of the run */
  vm_prot_t access;		/* don't use it yet, but who knows... */
  
  /* Our buffer */
  vm_address_t buf;

  /* Where we are in processing this request */
  int dbs_left;			/* DEV_BSIZE blocks still to be transferred */
  int curoffset;		/* in diskblocks */
  vm_address_t bp;		/* next unprocessed byte within buf */

  /* Actual file size relative to buf */
  int realsize;

  /* Request requires a write lock (disk allocation) */
  int needlock;

  /* Next run belonging to the same data request, processed once this
     one has been completed.  */
  struct paginfo *next;
};


/* To avoid mallocs, we never free paginfo structures, and keep them
   on this freelist.  We allocate a number at start up in
   incg_pager_init.  Npgs records the current number allocated.  If it is
   high in operation, perhaps the number created at startup should be
   increased... */
/* NOTE(review): no routine named incg_pager_init is visible in this
   part of the file; pager_init looks like its current name -- confirm
   where the initial paginfo structures are really created.  */
static struct mutex pgfreelistlock;	/* initialized in pager_init */
static struct paginfo *pgfreelist = 0;
static int npgs;

/* The same arrangement for controlinfo structures.  The names suggest
   that silist and filelist hold the SINDIR and FILE pagers -- TODO
   confirm against cialloc/cienqueue.  */
static struct mutex cilistlock;		/* initialized in pager_init */
static struct controlinfo *cifreelist = 0;
static struct controlinfo *silist = 0;
static struct controlinfo *filelist = 0;
static int ncis;

/* Forward declarations of the file-local routines, so that the
   memory object request handlers below can use them before their
   definitions appear.  */

/* Step a read or a write request along; also installed as I/O callbacks */
static void pager_next_read (struct paginfo *);
static void pager_next_write (struct paginfo *);

/* Map a memory object port to its pager; yields 0 for unknown ports */
static struct controlinfo *pager_cvt (mach_port_t);

/* Allocation and release of the bookkeeping structures */
static struct paginfo *pgalloc (void);
static void pgrelse (struct paginfo *);
static void cienqueue (struct controlinfo *);
static struct controlinfo *cialloc (enum pager_type);

static void pager_sync1 (struct controlinfo *, int);
static void pager_term1 (struct controlinfo *);
static daddr_t indir_alloc (struct inode *, int, int);
static void mark_object_error (struct controlinfo *, int, int, error_t);
static void pagemap_resize (struct controlinfo *, int);
static void wait_for_seqno (struct controlinfo *, int);


/* Initialize the pagers.  This checks that the filesystem geometry is
   one the paging code can cope with, sets up the list locks, creates
   the inode, cylinder group and double indirect block pagers, builds
   the block of zeros, computes the virtual size of each of the three
   pagers, and finally maps the inode and cylinder group objects into
   our own address space (at dinodes and cgs).  Any failure panics.  */
void
pager_init (void)
{
  int err;

  /* Firewalls: refuse to run with sizes the paging code cannot handle.  */
  if ((DEV_BSIZE % sizeof (struct dinode)) != 0)
    panic ("inode size wrong");
  if ((vm_page_size % DEV_BSIZE) != 0)
    panic ("page size unusable");
  if ((sblock->fs_bsize % DEV_BSIZE) != 0)
    panic ("nonintegral filesystem block size");
  if ((sblock->fs_ipg % sblock->fs_inopb) != 0)
    panic ("nonintegral number of inodes per cylinder group");
  if (vm_page_size > sblock->fs_bsize)
    panic ("page size larger than block size");

  /* Locks and (empty) freelists */
  mutex_init (&pgfreelistlock);
  mutex_init (&cilistlock);
  pgfreelist = 0;
  cifreelist = 0;

  /* The three pagers that always exist */
  inodeinfo = cialloc (DINODE);
  cginfo = cialloc (CG);
  dininfo = cialloc (DINDIR);

  /* One filesystem block worth of zeros */
  err = vm_allocate (mach_task_self (), (u_int *)&zeroblock,
		     sblock->fs_bsize, 1);
  if (err != 0)
    panic_with_error ("incg zeroblock", err);
  bzero (zeroblock, sblock->fs_bsize);

  /* Conversion strides used when translating pager offsets */
  dbspfsb = sblock->fs_bsize / DEV_BSIZE;
  indbs_pcg = sblock->fs_ipg / sblock->fs_inopb;

  /* Find out how much memory each pager covers */
  inodeinfo->vsize = round_page (sblock->fs_ipg * sblock->fs_ncg
				 * sizeof (struct dinode));
  cginfo->vsize = round_page (sblock->fs_bsize * sblock->fs_ncg);
  dininfo->vsize = round_page (sblock->fs_ipg * sblock->fs_ncg
			      * sblock->fs_bsize);

  /* Do the mapping of the inodes and cylinder groups; the second
     mapping is only attempted if the first one worked.  */

  /* Note that this will fail when vm_map is made synchronous with
     pagers.  At that time, this will have to occur after the request
     threads are started.  */
  err = vm_map (mach_task_self (), (u_int *)&dinodes, inodeinfo->vsize,
		0, 1, inodeinfo->memobj, 0, 0, VM_PROT_READ|VM_PROT_WRITE,
		VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_NONE);
  if (err == 0)
    err = vm_map (mach_task_self (), (u_int *)&cgs, cginfo->vsize, 0, 1,
		  cginfo->memobj, 0, 0, VM_PROT_READ|VM_PROT_WRITE,
		  VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_NONE);
  if (err != 0)
    panic_with_error ("inode_pager_init map", err);

  /* Faults on the two mapped regions are to be caught */
  register_memory_fault_area (dinodes, inodeinfo->vsize);
  register_memory_fault_area (cgs, cginfo->vsize);
}



/* Handle a memory_object_init request for OBJECT.  CONTROL and NAME
   are the control and name ports to remember for the object, SEQNO is
   the sequence number of the request, and PAGESIZE is the page size
   the requestor uses, which must equal vm_page_size.

   Returns EOPNOTSUPP if OBJECT is not one of our pagers and 0
   otherwise.  A duplicate init or a wrong page size is only logged;
   the pager is then left untouched.  The pager's interlock is
   released on every path out of this routine.  */
kern_return_t
seqnos_memory_object_init (mach_port_t object, 
			   mach_port_seqno_t seqno,
			   mach_port_t control,
			   mach_port_t name,
			   vm_size_t pagesize)
{
  struct controlinfo *ci;

  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;
  printf ("MOI type %s\n", pg_types[ci->pager_type]);

  mutex_lock (&ci->interlock);

  if (ci->pager_state != NOTINIT)
    {
      printf ("pager dup init");
      /* Don't return with the interlock held, or every later request
	 on this pager would block forever.  */
      mutex_unlock (&ci->interlock);
      return 0;
    }
  if (pagesize != vm_page_size)
    {
      printf ("incg init: bad page size");
      mutex_unlock (&ci->interlock);
      return 0;
    }

  ci->memobjcntl = control;
  ci->memobjname = name;
  ci->pagemapsize = 0;
  ci->seqno = seqno;

  /* Tell the kernel we're ready */
  memory_object_set_attributes (control, 1, 1, MEMORY_OBJECT_COPY_NONE);

  ci->pager_state = NORMAL;
  mutex_unlock (&ci->interlock);
  return 0;
}


/* Called by the kernel when data is needed upon page fault */
kern_return_t
seqnos_memory_object_data_request (mach_port_t object, 
				   mach_port_seqno_t seqno,
				   mach_port_t control,
				   vm_offset_t offset,
				   vm_size_t length,
				   vm_prot_t access)
{
  struct paginfo *pg, *tmp;
  int counting;
  int vsize;
  vm_offset_t off;
  struct controlinfo *ci;

  printf ("MODR %d %d at %d\n", object, length, offset);
  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;

  if (control != ci->memobjcntl)
    {
      printf ("incg data request: wrong control port");
      return 0;
    }
  if (length % vm_page_size)
    {
      printf ("incg data request: bad length size");
      return 0;
    }
  if (offset % vm_page_size)
    {
      printf ("incg data request: misaligned request");
      return 0;
    }
  
  /* This checks to make sure that we aren't reading past the end of
     the file.  Files sizes are changed externally to the pager. */
  if (ci->pager_type == FILE)
    {
      int newlength = length;
      
      vsize = get_inode_vsize (ci->ip);
      if (offset > vsize)
	{
	  /* Access past the end is always EINVAL, so we don't mark error */
	  memory_object_data_error (control, offset, length, EINVAL);
	  return 0;
	}
      
      /* Prune the request if it goes over the end, and report an
	 error for the overflow.  This needs to be checked against the kernel
	 when multi page requests are put in it, to make sure it works. */
      while (offset + newlength - vsize > vm_page_size)
	newlength -= vm_page_size;
      
      if (newlength != length)
	/* Access past the end is always EINVAL, so we don't mark error */
	memory_object_data_error (control, offset + newlength,
				  length - newlength, EINVAL);
      length = newlength;
    }
  else
    vsize = ci->vsize;
  
  /* This horrid loop discovers which paging requests need to be
     started and which are being paged out and the pagein should be short
     circuited when the pageout is done.  The complexity is to bunch pages
     together that we have to start I/O on.  Since they are likely to be
     consecutive diskblocks, we don't want to put each separate page into
     its own paginfo.  */
  counting = 0;
  pg = 0;
  mutex_lock (&ci->interlock);
  wait_for_seqno (ci, seqno);
  pagemap_resize (ci, offset + length);
  for (off = offset; off < offset + length; off += vm_page_size)
    {
      assert (ci->pagemapsize >= off / vm_page_size);
      if (ci->pagemap[off / vm_page_size].pagingout)
	{
	  ci->pagemap[off / vm_page_size].pageinwait = 1;
	  if (counting)
	    {
	      if (ci->pager_type == FILE)
		{
		  if (vsize >= pg->offset && vsize < pg->offset + pg->length)
		    pg->realsize = vsize - pg->offset;
		  else
		    pg->realsize = pg->length;
		}
	      
	      vm_allocate (mach_task_self (), &pg->buf, pg->length, 1);
	      pg->dbs_left = pg->length / DEV_BSIZE;
	      pg->curoffset = pg->curoffset / DEV_BSIZE;;
	      pg->bp = pg->buf;
	    }
	  counting = 0;
	}
      else
	{
	  if (counting)
	    pg->length += vm_page_size;
	  else
	    {
	      tmp = pgalloc();
	      tmp->object = object;
	      tmp->control = control;
	      tmp->offset = off;
	      tmp->length = vm_page_size;
	      tmp->access = access;
	      tmp->next = pg;
	      tmp->needlock = 0;
	      pg = tmp;
	      counting = 1;
	    }
	}
    }
  if (ci->waitingforseqno)
    {
      ci->waitingforseqno = 0;
      condition_broadcast (&ci->wakeup);
    }
  mutex_unlock (&ci->interlock);
  if (counting)
    {
      if (ci->pager_type == FILE)
	{
	  if (vsize >= pg->offset && vsize < pg->offset + pg->length)
	    pg->realsize = vsize - pg->offset;
	  else
	    pg->realsize = pg->length;
	}
      vm_allocate (mach_task_self (), &pg->buf, pg->length, 1);
      pg->dbs_left = pg->length / DEV_BSIZE;
      pg->curoffset = pg->offset / DEV_BSIZE;
      pg->bp = pg->buf;
    }

  /* Actually kick things off */
  pager_next_read (pg);
  return 0;
}

/* Called by inode_data_request and by I/O */
static void
pager_next_read (struct paginfo *pg)
{
  struct paginfo *tmp;
  struct controlinfo *ci;
  volatile int ndbs;
  int cg, cgoff, curoffset;
  struct inode *volatile ip;
  volatile int blkoff, vblkno;
  int fsblockno;
  int idx;

  printf ("PNR 1 read %d at %d\n", pg->dbs_left, pg->curoffset);
 tailrecurse:
  ci = (struct controlinfo *)(pg->object);
  if (pg->req.error)
    {
      printf ("PNR: returning error 0x%x.\n", pg->req.error);
      mark_object_error(ci, pg->offset, pg->length, EIO);
      memory_object_data_error (pg->control, pg->offset, pg->length, 
				EIO);
      vm_deallocate (mach_task_self (), pg->buf, pg->length);
      tmp = pg->next;
      pgrelse (pg);
      if (tmp)
	{
	  pg = tmp;
	  goto tailrecurse;
	}
      else
	return;
    }

  if (!pg->dbs_left)
    {
      printf ("PNR: returning data\n");
      /* Return the data to the kernel */
      if (ci->pager_type == FILE && pg->length != pg->realsize)
	bzero ((void *)(pg->req.buffer + pg->realsize),
	       pg->length - pg->realsize);
      mark_object_error(ci, pg->offset, pg->length, 0);
      memory_object_data_provided (pg->control, pg->offset, pg->req.buffer,
				   pg->length, (pg->needlock 
						? VM_PROT_WRITE:VM_PROT_NONE));
      vm_deallocate (mach_task_self (), pg->req.buffer, pg->length); /* XXX */
      tmp = pg->next;
      pgrelse (pg);
      if (tmp)
	{
	  pg = tmp;
	  goto tailrecurse;
	}
      else
	return;
    }
  
  ndbs = pg->dbs_left;
  curoffset = pg->curoffset;
    
  switch (ci->pager_type)
    {
    case DINODE:
      cg = curoffset / indbs_pcg;
      cgoff = curoffset % indbs_pcg;
      
      if (cgoff + ndbs > indbs_pcg)
	ndbs = indbs_pcg - cgoff;

      pg->req.diskaddr = fsbtodb (cgimin (cg)) + cgoff;

      printf ("cg: %d\tcgoff: %d\tndbs: %d\tdiskaddr: %d\n",
	      cg, cgoff, ndbs, pg->req.diskaddr);
      break;

    case CG:
      cg = curoffset / dbspfsb;
      cgoff = curoffset % dbspfsb;
      
      if (cgoff + ndbs > dbspfsb)
	ndbs = dbspfsb - cgoff;
      
      pg->req.diskaddr = fsbtodb (cgstart (cg)) + cgoff;
      break;

    case DINDIR:
      ip = ifind (curoffset / dbspfsb);

      blkoff = curoffset % dbspfsb;
      if (blkoff + ndbs > dbspfsb)
	ndbs = dbspfsb - blkoff;

      mutex_lock (&ip->i_dinlock);
      
      if (catch_exception ())
	{
	  mutex_unlock (&ip->i_dinlock);
	  goto error_read;
	}
      if (!ip->di->di_ib[INDIR_DOUBLE])
	{
	  mutex_unlock (&ip->i_dinlock);
	  end_catch_exception ();
	  goto zeroread;
	}
      fsblockno = ip->di->di_ib[INDIR_DOUBLE];
      end_catch_exception ();

      mutex_unlock (&ip->i_dinlock);
      
      pg->req.diskaddr = fsbtodb (fsblockno) + blkoff;
      break;

    case SINDIR:
      blkoff = curoffset % dbspfsb;
      if (blkoff + ndbs > dbspfsb)
	ndbs = dbspfsb - blkoff;

      ip = ci->ip;

      mutex_lock (&ip->i_sinlock);

      if (catch_exception ())
	{
	  mutex_unlock (&ip->i_sinlock);
	  goto error_read;
	}     

      if (curoffset < dbspfsb)
	{
	  if (!ip->di->di_ib[INDIR_SINGLE])
	    {
	      mutex_unlock (&ip->i_sinlock);
	      end_catch_exception ();
	      goto zeroread;
	    }
	  fsblockno = ip->di->di_ib[INDIR_SINGLE];
	}
      else
	{
	  idx = curoffset / dbspfsb;

	  if (!ip->i_dinloc)
	    din_map (ip);

	  if (!ip->i_dinloc[idx - 1])
	    {
	      mutex_unlock (&ip->i_sinlock);
	      end_catch_exception ();
	      goto zeroread;
	    }

	  fsblockno = ip->i_dinloc[idx - 1];
	  end_catch_exception ();
	}

      mutex_unlock (&ip->i_sinlock);

      if (!fsblockno)
	panic ("SINDIR pagein");
      
      pg->req.diskaddr = fsbtodb (fsblockno) + blkoff;
      break;

    case FILE:
      blkoff = curoffset % dbspfsb;
      if (blkoff + ndbs > dbspfsb)
	ndbs = dbspfsb - blkoff;
      vblkno = curoffset / dbspfsb;

      ip = ci->ip;

      mutex_lock (&ip->i_datalock);

      if (catch_exception ())
	{
	  mutex_unlock (&ip->i_datalock);
	  goto error_read;
	}
      
      if (vblkno < NDADDR)
	{
	  if (!ip->di->di_db[vblkno])
	    {
	      mutex_unlock (&ip->i_datalock);
	      end_catch_exception ();
	      goto zeroread;
	    }
	  else
	    fsblockno = ip->di->di_db[vblkno];
	}
      else 
	{
	  /* Data is mapped by an indirect block */
	  vblkno -= NDADDR;

	  if (!ip->i_sinloc)
	    sin_map (ip);
	  
	  if (!ip->i_sinloc[vblkno])
	    {
	      mutex_unlock (&ip->i_datalock);
	      end_catch_exception ();
	      goto zeroread;
	    }
	  else
	    fsblockno = ip->i_sinloc[vblkno];
	}
      end_catch_exception ();
      mutex_unlock (&ip->i_datalock);
      
      if (!fsblockno)
	panic ("FILE pagein");
      
      pg->req.diskaddr = fsbtodb (fsblockno) + blkoff;
      break;
    default:
      panic ("incg next read");
    }

  pg->req.buffer = pg->bp;
  pg->req.amt = ndbs;
  pg->req.callback = pager_next_read;
  
  /* Update fields */
  pg->dbs_left -= ndbs;
  pg->curoffset += ndbs;
  pg->bp += ndbs * DEV_BSIZE;
  
  /* Start I/O */
  dev_read ((struct ioreq *)pg);
  return;
  
  /* We jump here when the read should be zeroed */
 zeroread:
  pg->needlock = 1;
  bzero ((char *)(pg->bp), ndbs * DEV_BSIZE);
  pg->dbs_left -= ndbs;
  pg->curoffset += ndbs;
  pg->bp += ndbs * DEV_BSIZE;
  goto tailrecurse;

  /* We go here when we had an error fault in getting the address of
     the block.  */
 error_read:
  pg->req.error = 1;
  goto tailrecurse;
}

/* Handle a request to unlock, for ACCESS, the LENGTH bytes at OFFSET
   of OBJECT.  The only access we ever deny is writing, and we do so
   on pages that have no disk block behind them; so the job here is to
   allocate the missing blocks and then lift the lock.  SEQNO is the
   sequence number of the request and CONTROL the control port it
   claims to come from.

   Returns EOPNOTSUPP if OBJECT is not one of our pagers and 0
   otherwise.  Malformed requests are logged and dropped.  If the
   allocation fails, the error is recorded for the pages, which are
   then flushed and reported to the kernel as being in error.  The
   pager's interlock is released on every path out of this routine.  */
kern_return_t
seqnos_memory_object_data_unlock (mach_port_t object, 
			   mach_port_seqno_t seqno,
			   mach_port_t control,
			   vm_offset_t offset,
			   vm_size_t length,
			   vm_prot_t access)
{
  struct controlinfo *ci;
  struct inode *volatile ip;
  volatile int err;
  volatile int curoffset, dbs_left, blkoff;
  daddr_t vbn, pbn;
  int ndbs;

  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;

  ip = ci->ip;

  mutex_lock (&ci->interlock);

  wait_for_seqno (ci, seqno);

  if (ci->pager_state == TERMINATING)
    {
      /* Just let 'em hang, and they'll get it when we're destroyed */
      mutex_unlock (&ci->interlock);
      return 0;
    }

  /* A request that fails one of these checks is dropped.  It leaves
     through `out', like a successful one, so that the interlock is
     released and requests waiting for their turn are woken up.  */
  if (control != ci->memobjcntl)
    {
      printf ("incg data unlock: wrong control port");
      goto out;
    }
  /* The only thing we ever block is writes */
  if ((access & VM_PROT_WRITE) == 0)
    {
      printf ("incg data unlock: not unlock writes");
      goto out;
    }
  if (offset % vm_page_size)
    {
      printf ("incg data unlock: misaligned request");
      goto out;
    }
  if (length % vm_page_size)
    {
      printf ("incg data unlock: bad length size");
      goto out;
    }

  /* In each case below the region is walked in pieces of NDBS disk
     blocks, none of which crosses a filesystem block boundary.  ERR
     holds an errno-style code on the way to `error': that is what
     mark_object_error is declared to take, and what pager_next_read
     passes it.  */
  switch (ci->pager_type)
    {
    default:
      panic ("data unlock");
    case DINDIR:
      for (curoffset = offset / DEV_BSIZE, dbs_left = length / DEV_BSIZE;
	   dbs_left; curoffset += ndbs, dbs_left -= ndbs)
	{
	  ip = ifind (curoffset / dbspfsb);
	  blkoff = curoffset % dbspfsb;
	  
	  mutex_lock (&ip->i_dinlock);

	  /* We should dealloc the block here sometimes... */
	  if ((err = catch_exception ()))
	    {
	      mutex_unlock (&ip->i_dinlock);
	      err = EIO;
	      goto error;
	    }

	  /* Allocate only when there is no block yet; an existing
	     block address must be left alone.  */
	  if (!ip->di->di_ib[INDIR_DOUBLE])
	    {
	      pbn = indir_alloc (ip, INDIR_DOUBLE, 0);
	      if (!pbn)
		{
		  mutex_unlock (&ip->i_dinlock);
		  err = ENOSPC;
		  end_catch_exception ();
		  goto error;
		}
	      ip->di->di_ib[INDIR_DOUBLE] = pbn;
	    }
	  end_catch_exception ();
	  mutex_unlock (&ip->i_dinlock);
	  
	  ndbs = dbs_left;
	  if (blkoff + ndbs > dbspfsb)
	    ndbs = dbspfsb - blkoff;
	}
      break;
    case SINDIR:
      mutex_lock (&ip->i_sinlock);

      /* We should dealloc the block here sometimes... */
      if ((err = catch_exception ()))
	{
	  mutex_unlock (&ip->i_sinlock);
	  err = EIO;
	  goto error;
	}
      
      /* Check for unallocated disk and try allocating it */
      for (curoffset = offset / DEV_BSIZE, dbs_left = length / DEV_BSIZE;
	   dbs_left; curoffset += ndbs, dbs_left -= ndbs)
	{
	  if (curoffset < dbspfsb)
	    {
	      if (!ip->di->di_ib[INDIR_SINGLE])
		{
		  pbn = indir_alloc (ip, INDIR_SINGLE, 0);
		  if (!pbn)
		    {
		      mutex_unlock (&ip->i_sinlock);
		      err = ENOSPC;
		      end_catch_exception ();
		      goto error;
		    }
		  ip->di->di_ib[INDIR_SINGLE] = pbn;
		}
	    }
	  else
	    {
	      int idx = curoffset / dbspfsb;
	      if (!ip->i_dinloc)
		din_map (ip);
	      if (!ip->i_dinloc[idx - 1])
		{
		  pbn = indir_alloc (ip, INDIR_SINGLE, idx);
		  if (!pbn)
		    {
		      mutex_unlock (&ip->i_sinlock);
		      err = ENOSPC;
		      end_catch_exception ();
		      goto error;
		    }
		  ip->i_dinloc[idx - 1] = pbn;
		}
	    }
	  ndbs = dbs_left;
	  blkoff = curoffset % dbspfsb;
	  if (blkoff + ndbs > dbspfsb)
	    ndbs = dbspfsb - blkoff;
	}
      end_catch_exception ();
      mutex_unlock (&ip->i_sinlock);
      break;
    case FILE_RO:
      panic ("FILE_RO unlock request");
      break;
    case FILE:
      /* We get here to allocate a hole.  If this fault is on the last
	 block of the file, something wrong has happened */

      /* Check to make sure the fault isn't inside the last block */
      mutex_lock (&ip->i_datalock);

      if (offset + length > 
	  blkroundup (get_inode_vsize (ip)) - sblock->fs_bsize)
	{
	  printf ("incg data unlock: FILE write fault on last block");
	  mutex_unlock (&ip->i_datalock);
	  goto out;
	}
      
      /* We should dealloc the block here sometimes */
      if ((err = catch_exception ()))
	{
	  mutex_unlock (&ip->i_datalock);
	  err = EIO;
	  goto error;
	}
      
      /* Allocate the blocks of the hole */

      for (curoffset = offset / DEV_BSIZE, dbs_left = length / DEV_BSIZE;
	   dbs_left; curoffset += ndbs, dbs_left -= ndbs)
	{
	  ndbs = dbs_left;
	  blkoff = curoffset % dbspfsb;
	  if (blkoff + ndbs > dbspfsb)
	    ndbs = dbspfsb - blkoff;
	  vbn = curoffset / dbspfsb;
	  
	  if (vbn < NDADDR)
	    {
	      if (!ip->di->di_db[vbn])
		{
		  err = alloc (ip, vbn, 
			       blkpref (ip, vbn, vbn, ip->di->di_db),
			       sblock->fs_bsize, &pbn, 0);
		  if (err)
		    {
		      if (err != ENOSPC)
			panic ("data unlock FILE bad error 1");
		      err = ENOSPC;
		      mutex_unlock (&ip->i_datalock);
		      end_catch_exception ();
		      goto error;
		    }
		  ip->di->di_db[vbn] = pbn;
		}
	    }
	  else
	    {
	      vbn -= NDADDR;
	      if (!ip->i_sinloc)
		sin_map (ip);

	      if (!ip->i_sinloc[vbn])
		{
		  /* NOTE(review): alloc is given VBN after NDADDR has
		     been subtracted, while blkpref is given
		     vbn + NDADDR -- confirm which block number alloc
		     expects here.  */
		  err = alloc (ip, vbn,
			       blkpref (ip, vbn + NDADDR, vbn, ip->i_sinloc),
			       sblock->fs_bsize, &pbn, 0);
		  if (err)
		    {
		      if (err != ENOSPC)
			panic ("data unlock FILE bad error 2");
		      err = ENOSPC;
		      mutex_unlock (&ip->i_datalock);
		      end_catch_exception ();
		      goto error;
		    }
		  ip->i_sinloc[vbn] = pbn;
		}
	    }
	}
      end_catch_exception ();
      mutex_unlock (&ip->i_datalock);
      break;
    }
  
  /* And now, assuming we haven't had an error */
  memory_object_lock_request (control, offset, length, 0, 0, VM_PROT_NONE,
			      PORT_NULL);

  /* Common way out for requests that were carried out or dropped */
 out:
  if (ci->waitingforseqno)
    {
      ci->waitingforseqno = 0;
      condition_broadcast (&ci->wakeup);
    }

  mutex_unlock (&ci->interlock);
  return 0;
  
  /* We get here, with the pager's interlock still held, when the
     blocks could not be allocated.  */
 error:
  if (ci->waitingforseqno)
    {
      ci->waitingforseqno = 0;
      condition_broadcast (&ci->wakeup);
    }
  mutex_unlock (&ci->interlock);
  /* And, if we have */
  mark_object_error (ci, offset, length, err);
  memory_object_lock_request (control, offset, length, 0, 1, VM_PROT_NONE,
			      PORT_NULL);
  memory_object_data_error (control, offset, length, err);
  return 0;
}


/* Called by the kernel to write data to the backing store.  DATA
   holds the LENGTH bytes belonging at OFFSET in OBJECT; SEQNO is the
   sequence number of the request and CONTROL the control port it
   claims to come from.

   Returns EOPNOTSUPP if OBJECT is not one of our pagers and 0
   otherwise.  A request that is refused (read-only filesystem, wrong
   control port, bad alignment, or a write wholly past the end of a
   file) is dropped and DATA is released.  Otherwise every page is
   marked as being paged out and the write is started; DATA then
   belongs to the write, and pager_next_write disposes of it when the
   write is over.  */
kern_return_t
seqnos_memory_object_data_write (mach_port_t object, 
			  mach_port_seqno_t seqno,
			  mach_port_t control,
			  vm_offset_t offset,
			  pointer_t data,
			  vm_size_t length)
{
  struct paginfo *pg;
  int off;
  struct controlinfo *ci;

  /* NOTE(review): presumably an error return leaves DATA to be
     disposed of by whoever dispatched the request, so it is not freed
     here -- confirm against the server loop.  */
  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;
  if (readonly)
    {
      printf ("incg data write: read only file system");
      goto reject;
    }
  if (control != ci->memobjcntl)
    {
      printf ("incg data write: wrong control port");
      goto reject;
    }
  if (length % vm_page_size)
    {
      printf ("incg data write: wrong length size");
      goto reject;
    }
  if (offset % vm_page_size)
    {
      printf ("incg data write: misaligned request");
      goto reject;
    }

  if (ci->pager_type == FILE)
    {
      int vsize = get_inode_vsize (ci->ip);
      int newlength = length;
      
      if (offset > vsize)
	/* XXX report error, but how?  U*x doesn't bother usually... */
	goto reject;
      
      /* Prune the request.  The test is written as a comparison of
	 two sums rather than as a difference: OFFSET is unsigned, so a
	 difference would wrap around to a huge value for any write
	 that ends before the end of the file.  */
      while (offset + newlength > vsize + vm_page_size)
	newlength -= vm_page_size;
      
      /* Zero the memory in the last page past the size of the file 
	 to assure that extra data is always zero.  */
      if (vsize >= offset && vsize < offset + newlength)
	bzero ((char *)(data + (vsize - offset)),
	       newlength - (vsize - offset));

      pg = pgalloc ();
      pg->dbs_left = newlength / DEV_BSIZE;
    }
  else
    {
      pg = pgalloc ();
      pg->dbs_left = length / DEV_BSIZE;
    }

  mutex_lock (&ci->interlock);

  wait_for_seqno (ci, seqno);
  
  /* Mark every page of the request (pruned part included) as being
     paged out, and count it if a sync is in progress.  */
  for (off = offset; off < offset + length; off += vm_page_size)
    {
      assert (ci->pagemapsize >= off / vm_page_size);
      if (ci->synccount && !ci->pagemap[off / vm_page_size].syncing)
	{
	  ci->pagemap[off/vm_page_size].syncing = 1;
	  ci->writecount++;
	}
      assert (!ci->pagemap[off/vm_page_size].pageinwait);
      assert (!ci->pagemap[off/vm_page_size].pagingout);
      ci->pagemap[off/vm_page_size].pagingout = 1;
    }
  if (ci->waitingforseqno)
    {
      ci->waitingforseqno = 0;
      condition_broadcast (&ci->wakeup);
    }
  mutex_unlock (&ci->interlock);

  pg->length = length;
  pg->object = object;
  pg->control = control;
  pg->offset = offset;
  
  pg->buf = data;
  
  pg->curoffset = offset / DEV_BSIZE;
  pg->bp = pg->buf;

  pager_next_write (pg);
  return 0;

  /* The request is being dropped.  We still return success, so the
     memory holding DATA is ours to release.  */
 reject:
  vm_deallocate (mach_task_self (), data, length);
  return 0;
}


/* Called by incg_data_write and by I/O routines */
static void
pager_next_write (struct paginfo *pg)
{
  struct controlinfo *ci;
  volatile int ndbs;
  int cg, cgoff, off;
  int curoffset;
  struct inode *volatile ip;
  volatile int blkoff, vblkno;
  int fsblockno;
  error_t error;
  
 tailrecurse:
  ci = (struct controlinfo *)pg->object;

  if (!pg->dbs_left || pg->req.error)
    {
      mutex_lock (&ci->interlock);

      /* At some point this loop needs to be rewritten to combine
	 successive data_provided's and deallocates. */
      pagemap_resize (ci, pg->offset + pg->length);
      for (off = pg->offset; off < pg->offset + pg->length; 
	   off += vm_page_size)
	{
	  assert (ci->pagemapsize >= off / vm_page_size);
	  if (ci->pagemap[off / vm_page_size].syncing)
	    {
	      assert (ci->writecount);
	      ci->writecount--;
	    }
	  ci->pagemap[off / vm_page_size].syncing = 0;
	  ci->pagemap[off / vm_page_size].pagingout = 0;
	  if (ci->pagemap[off / vm_page_size].pageinwait)
	    {
	      mark_object_error(ci, off, vm_page_size, 0);
	      memory_object_data_provided (ci->memobjcntl, off, 
					   pg->buf + off - pg->offset,
					   vm_page_size, VM_PROT_NONE);
	    }
	  else
	    vm_deallocate (mach_task_self (), pg->buf + off - pg->offset,
			   vm_page_size);
	  ci->pagemap[off / vm_page_size].pageinwait = 0;
	  if (ci->writecount == 0 || ci == inodeinfo)
	    condition_broadcast (&ci->wakeup);
	}
      mutex_unlock (&ci->interlock);
      pgrelse (pg);
      return;
    }

  curoffset = pg->curoffset;
  ndbs = pg->dbs_left;
  
  switch (ci->pager_type)
    {
    case DINODE:
      cg = curoffset / indbs_pcg;
      cgoff = curoffset % indbs_pcg;
      
      if (cgoff + ndbs > indbs_pcg)
	ndbs = indbs_pcg - cgoff;
      
      pg->req.diskaddr = fsbtodb (cgimin (cg)) + cgoff;
      break;

    case CG:
      cg = curoffset / dbspfsb;
      cgoff = curoffset % dbspfsb;
      
      if (cgoff + ndbs > dbspfsb)
	ndbs = dbspfsb - cgoff;
      
      pg->req.diskaddr = fsbtodb (cgstart (cg)) + cgoff;
      break;

    case DINDIR:
      ip = ifind (curoffset / dbspfsb);
      blkoff = curoffset % dbspfsb;

      mutex_lock (&ip->i_dinlock);
      if (error = catch_exception ())
	{
	  mutex_unlock (&ip->i_dinlock);
	  goto error_write;
	}
      if (!ip->di->di_ib[INDIR_DOUBLE])
	panic ("DINDIR not alloc");
      else
	fsblockno = ip->di->di_ib[INDIR_DOUBLE];
      end_catch_exception ();
      mutex_unlock (&ip->i_dinlock);
      
      if (blkoff + ndbs > dbspfsb)
	ndbs = dbspfsb - blkoff;
      
      pg->req.diskaddr = fsbtodb (fsblockno) + blkoff;
      break;

    case SINDIR:
      ip = ci->ip;
      mutex_lock (&ip->i_sinlock);

      if (error = catch_exception ())
	{
	  mutex_unlock (&ip->i_sinlock);
	  goto error_write;
	}
      
      if (curoffset < dbspfsb)
	{
	  if (!ip->di->di_ib[INDIR_SINGLE])
	    panic ("SINDIR not alloc main");
	  fsblockno = ip->di->di_ib[INDIR_SINGLE];
	}
      else
	{
	  int idx = curoffset / dbspfsb;
	  if (!ip->i_dinloc)
	    din_map (ip);
	  if (!ip->i_dinloc[idx - 1])
	    panic ("SINDIR not alloc from DINDIR");
	  fsblockno = ip->i_dinloc[idx - 1];
	}
      end_catch_exception ();
      mutex_unlock (&ip->i_sinlock);
      blkoff = curoffset % dbspfsb;

      if (!fsblockno)
	panic ("SINDIR pageout"); 

      if (blkoff + ndbs > dbspfsb)
	ndbs = dbspfsb - blkoff;
      pg->req.diskaddr = fsbtodb (fsblockno) + blkoff;
      break;

    case FILE:
      blkoff = curoffset % sblock->fs_bsize;
      if (blkoff + ndbs > sblock->fs_bsize)
	ndbs = sblock->fs_bsize - blkoff;
      vblkno = curoffset / sblock->fs_bsize;
      
      ip = ci->ip;
      
      mutex_lock (&ip->i_datalock);
      if (error = catch_exception ())
	{
	  mutex_unlock (&ip->i_datalock);
	  goto error_write;
	}
      
      if (vblkno < NDADDR)
	{
	  if (!ip->di->di_db[vblkno])
	    panic ("file direct block not alloc");

	  fsblockno = ip->di->di_db[vblkno];
	}
      else
	{
	  vblkno -= NDADDR;

	  if (!ip->i_sinloc)
	    sin_map (ip);

	  if (!ip->i_sinloc[vblkno])
	    panic ("file indirected data block not alloc");

	  fsblockno = ip->i_sinloc[vblkno];
	}
      end_catch_exception ();
      mutex_unlock (&ip->i_datalock);
      
      pg->req.diskaddr = fsbtodb (fsblockno) + blkoff;
      
    default:
      panic ("incg next write");
    }

  pg->req.buffer = pg->bp;
  pg->req.amt = ndbs;
  pg->req.callback = pager_next_write;
  
  /* Update counters */
  pg->dbs_left -= ndbs;
  pg->curoffset += ndbs;
  pg->bp += ndbs * DEV_BSIZE;
  
  /* Start I/O */
  dev_write ((struct ioreq *)pg);
  
 error_write:
  pg->req.error = 1;
  goto tailrecurse;
}


/* Called by the kernel when a lock request has finished.

   OBJECT must name one of our pagers and CONTROL must be its control
   port; OFFSET and LENGTH must be page aligned.  A request failing one
   of those checks is logged and otherwise ignored.  For a valid
   request, one outstanding sync is retired from the pager's synccount
   and anyone waiting on the pager is woken when either a sequence
   number waiter is pending or no syncs remain.  */
kern_return_t
seqnos_memory_object_lock_completed (mach_port_t object,
				     mach_port_seqno_t seqno,
				     mach_port_t control,
				     vm_offset_t offset,
				     vm_size_t length)
{
  struct controlinfo *ci = pager_cvt (object);
  int seq_waiters;

  if (ci == 0)
    return EOPNOTSUPP;

  if (ci->memobjcntl != control)
    {
      printf ("incg lock completed: wrong control port");
      return 0;
    }
  if (offset % vm_page_size != 0)
    {
      printf ("incg lock completed: misaligned request");
      return 0;
    }
  if (length % vm_page_size != 0)
    {
      printf ("incg lock completed: bad length size");
      return 0;
    }

  mutex_lock (&ci->interlock);

  /* Handle messages strictly in the order the kernel sent them.  */
  wait_for_seqno (ci, seqno);

  assert (ci->synccount);
  ci->synccount -= 1;

  /* A single broadcast serves both kinds of waiter: threads parked in
     wait_for_seqno and threads waiting for the sync to drain.  */
  seq_waiters = ci->waitingforseqno;
  if (seq_waiters)
    ci->waitingforseqno = 0;
  if (seq_waiters || ci->synccount == 0)
    condition_broadcast (&ci->wakeup);

  mutex_unlock (&ci->interlock);
  return 0;
}


/* Called by the kernel when a shutdown has finished.

   OBJECT must name one of our pagers, and CONTROL and NAME must be the
   control and name ports recorded for it; otherwise the request is
   logged and ignored.  The rights passed in for CONTROL and NAME are
   deallocated, the pagemap is freed, and a thread waiting in
   pager_term1 for the pager to reach TERMINATED is woken.  */
kern_return_t
seqnos_memory_object_terminate (mach_port_t object, 
			 mach_port_seqno_t seqno,
			 mach_port_t control,
			 mach_port_t name)
{
  struct controlinfo *ci;
  
  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;
  
  if (control != ci->memobjcntl)
    {
      printf ("incg terminate: wrong control port");
      return 0;
    }
  if (name != ci->memobjname)
    {
      printf ("incg terminate: wrong name port");
      return 0;
    }

  mutex_lock (&ci->interlock);

  wait_for_seqno (ci, seqno);

  /* Deallocate data structures, etc. */
  
  /* The CG, DINODE and DINDIR pagers must only go away because
     pager_term1 asked for it, and pager_term1 leaves the pager in the
     TERMINATING state until this handler runs.  (Comparing against
     TERMINATED here made the expected shutdown path panic.)  */
  if ((ci->pager_type == CG
       || ci->pager_type == DINODE
       || ci->pager_type == DINDIR)
      && ci->pager_state != TERMINATING)
    panic ("unexpected pager terminated");

  mach_port_deallocate (mach_task_self (), control);
  mach_port_deallocate (mach_task_self (), name);

  /* Free the pagemap, and forget it so that nothing can reach the
     deallocated memory through CI afterwards.  */
  if (ci->pagemapsize)
    vm_deallocate (mach_task_self (), (u_int)ci->pagemap, ci->pagemapsize);
  ci->pagemap = 0;
  ci->pagemapsize = 0;
      
  if (ci->pager_state == TERMINATING)
    {
      /* Someone is waiting for this, they'll do the clean up */
      ci->pager_state = TERMINATED;
      condition_broadcast (&ci->wakeup);
    }
  if (ci->waitingforseqno)
    {
      printf ("Waiting messages after terminate!\n");
      ci->waitingforseqno = 0;
      condition_broadcast (&ci->wakeup);
    }
  mutex_unlock (&ci->interlock);
  return 0;
}

/* Stub for memory_object_copy: nothing is done with the request beyond
   logging that it arrived, and success is returned.  */
kern_return_t
seqnos_memory_object_copy (memory_object_t old,
			   mach_port_seqno_t seqno,
			   memory_object_control_t oldctl,
			   vm_offset_t off,
			   vm_size_t len,
			   memory_object_t new)
{
  /* None of the arguments are examined.  */
  (void) old;
  (void) seqno;
  (void) oldctl;
  (void) off;
  (void) len;
  (void) new;

  printf ("memory_object_copy called\n");

  return 0;
}

/* Stub for memory_object_supply_completed: the call is logged and
   reported as successful; no argument is used.  */
error_t
seqnos_memory_object_supply_completed (mach_port_t obj,
				       mach_port_seqno_t seqno,
				       mach_port_t ctl,
				       vm_offset_t offset,
				       vm_size_t len,
				       error_t res,
				       vm_offset_t err_off)
{
  /* None of the arguments are examined.  */
  (void) obj;
  (void) seqno;
  (void) ctl;
  (void) offset;
  (void) len;
  (void) res;
  (void) err_off;

  printf ("memory_object_supply_completed called\n");

  return 0;
}

/* Stub for memory_object_data_return: the call is logged and reported
   as successful.  The DATA handed in is neither written anywhere nor
   deallocated here.  */
error_t
seqnos_memory_object_data_return (mach_port_t obj,
				  mach_port_seqno_t seq,
				  mach_port_t ctl,
				  vm_offset_t off,
				  pointer_t data,
				  u_int datalen,
				  boolean_t dirty,
				  boolean_t kcopy)
{
  /* None of the arguments are examined.  */
  (void) obj;
  (void) seq;
  (void) ctl;
  (void) off;
  (void) data;
  (void) datalen;
  (void) dirty;
  (void) kcopy;

  printf ("memory_object_data_return called\n");

  return 0;
}

/* Stub for memory_object_change_completed: the call is logged and
   reported as successful; no argument is used.  */
error_t
seqnos_memory_object_change_completed (mach_port_t obj,
				       mach_port_seqno_t seq,
				       boolean_t may_cache,
				       memory_object_copy_strategy_t strat)
{
  /* None of the arguments are examined.  */
  (void) obj;
  (void) seq;
  (void) may_cache;
  (void) strat;

  printf ("memory_object_change_completed called\n");

  return 0;
}






/* Exported routines */

/* Sync every pager: all file pagers, then all single indirect block
   pagers (both walked with the pager list locked), then the double
   indirect, inode and cylinder group pagers.  WAIT is handed through
   to pager_sync1 unchanged.  */
void
pager_sync (int wait)
{
  struct controlinfo *pager;

  mutex_lock (&cilistlock);

  pager = filelist;
  while (pager)
    {
      pager_sync1 (pager, wait);
      pager = pager->next;
    }

  pager = silist;
  while (pager)
    {
      pager_sync1 (pager, wait);
      pager = pager->next;
    }

  mutex_unlock (&cilistlock);

  /* These three are not kept on either list.  */
  pager_sync1 (dininfo, wait);
  pager_sync1 (inodeinfo, wait);
  pager_sync1 (cginfo, wait);
}

/* Terminate every pager: all file pagers, then all single indirect
   block pagers, then the double indirect, inode and cylinder group
   pagers.

   pager_term1 takes cilistlock itself to unlink the pager it is given,
   and then blocks until the kernel has confirmed the termination, so
   it must not be called with cilistlock held.  Each list is therefore
   drained from its head: the lock is held only long enough to read
   the head, and the loop relies on pager_term1 removing that pager
   from the list.  */
void
pager_shutdown (void)
{
  struct controlinfo *ci;

  for (;;)
    {
      mutex_lock (&cilistlock);
      ci = filelist;
      mutex_unlock (&cilistlock);
      if (!ci)
	break;
      pager_term1 (ci);
    }

  for (;;)
    {
      mutex_lock (&cilistlock);
      ci = silist;
      mutex_unlock (&cilistlock);
      if (!ci)
	break;
      pager_term1 (ci);
    }

  pager_term1 (dininfo);
  pager_term1 (inodeinfo);
  pager_term1 (cginfo);
}
     
/* Write a single inode to disk  -- must be locked.

   Asks the kernel to clean the page of the inode pager that holds IP's
   disk inode.  If WAIT is nonzero, does not return until no sync is
   outstanding on the inode pager and the page is no longer marked as
   syncing.  */
void
inode_update (struct inode *ip,
	      int wait)
{
  vm_offset_t offset, offsetpg;
  
  /* Round the inode's byte offset down to the start of its page.  */
  offset = ip->i_number * sizeof (struct dinode);
  offsetpg = offset / vm_page_size;
  offset = offsetpg * vm_page_size;

  mutex_lock (&inodeinfo->interlock);

  /* The request below names the pager as its reply port, so the lock
     completed handler will run for it; that handler asserts that
     synccount is nonzero and then decrements it.  Account for the
     request here, the same way pager_sync1 does.  */
  inodeinfo->synccount++;

  memory_object_lock_request (inodeinfo->memobjcntl, offset,
			      vm_page_size, 1, 0, VM_PROT_NONE,
			      inodeinfo->memobj);
  if (wait)
    /* NOTE(review): pagemap[offsetpg] is read without checking
       pagemapsize -- confirm the map always covers this page.  */
    while (inodeinfo->synccount || inodeinfo->pagemap[offsetpg].syncing)
      condition_wait (&inodeinfo->wakeup, &inodeinfo->interlock);

  mutex_unlock (&inodeinfo->interlock);
}

/* Sync the contents of a file.

   In order: the file's data pager (if it has one), its single indirect
   block pager (if it has one), the double indirect block pager, and
   finally the inode itself.  WAIT is passed to each step.  */
void
file_update (struct inode *ip,
	     int wait)
{
  /* File contents.  */
  mutex_lock (&ip->i_datalock);
  if (ip->i_fileinfo != 0)
    {
      pager_sync1 (ip->i_fileinfo, wait);
    }
  mutex_unlock (&ip->i_datalock);

  /* Single indirect blocks.  */
  mutex_lock (&ip->i_sinlock);
  if (ip->i_sininfo != 0)
    {
      pager_sync1 (ip->i_sininfo, wait);
    }
  mutex_unlock (&ip->i_sinlock);

  /* Double indirect blocks.  This syncs the whole pager rather than
     only this file's part, which is more than is strictly needed.  */
  pager_sync1 (dininfo, wait);

  /* And the inode.  */
  inode_update (ip, wait);
}

  
/* Make sure the pagemap can deal with address off */
void
pagemap_resize (struct controlinfo *ci,
		int off)
{
  void *newaddr;
  int newsize;
  
  off /= vm_page_size;
  if (ci->pagemapsize >= off)
    return;
  
  newsize = round_page (off);
  vm_allocate (mach_task_self (), (u_int *)&newaddr, newsize, 1);
  bcopy (ci->pagemap, newaddr, ci->pagemapsize);
  bzero (newaddr + ci->pagemapsize, newsize - ci->pagemapsize);
  vm_deallocate (mach_task_self (), (u_int)ci->pagemap, ci->pagemapsize);
  ci->pagemap = newaddr;
  ci->pagemapsize = newsize;
}


/* This drops the double indirect block */
void
dindir_drop (struct inode *ip)
{
  mutex_lock (&ip->i_dinlock);
  
  memory_object_lock_request (dininfo->memobjcntl, 
			      ip->i_number * sblock->fs_bsize,
			      sblock->fs_bsize, 0, 1, VM_PROT_NONE,
			      MACH_PORT_NULL);

  if (ip->di->di_ib[INDIR_DOUBLE])
    {
      blkfree (ip->di->di_ib[INDIR_DOUBLE], sblock->fs_bsize);
      ip->di->di_ib[INDIR_DOUBLE] = 0;
    }

  mutex_unlock (&ip->i_dinlock);
}
  

/* This hideous routine is used by inode_truncate to clean the info in
   sin_loc.

   LASTIBLOCK is the index of the last single indirect block that is
   still needed (negative if none is) and OLASTIBLOCK the index of the
   last one that was in use before.  The corresponding range of the
   single indirect pager is flushed, and the indirect blocks that are
   no longer needed are freed.  Takes and releases i_sinlock.  */
void
sindir_drop (struct inode *ip,
	     int lastiblock,
	     int olastiblock)
{
  int idx;
  
  mutex_lock (&ip->i_sinlock);
  
  memory_object_lock_request (ip->i_sininfo->memobjcntl,
			      (lastiblock + 1) * sblock->fs_bsize,
			      (olastiblock - lastiblock) * sblock->fs_bsize,
			      0, 1, VM_PROT_NONE, MACH_PORT_NULL);

  /* Drop indirect blocks found in the double indirect block */
  if (olastiblock > 1)
    {
      if (!ip->i_dinloc)
	din_map (ip);
      /* The loop condition used to be the assignment
	 `idx = olastiblock', which is always true here and so never
	 terminated.
	 NOTE(review): pager_next_write looks up single indirect block
	 IDX at i_dinloc[IDX - 1]; confirm which indexing is intended
	 here.  */
      for (idx = lastiblock + 1; idx <= olastiblock; idx++)
	{
	  if (ip->i_dinloc[idx])
	    {
	      blkfree (ip->i_dinloc[idx], sblock->fs_bsize);
	      ip->i_dinloc[idx] = 0;
	    }
	}
      
      /* If we no longer need the double indirect block, drop it. */
      if (lastiblock <= 1)
	{
	  dindir_drop (ip);
	  unregister_memory_fault_area (ip->i_dinloc, sblock->fs_bsize);
	  vm_deallocate (mach_task_self (), (u_int)ip->i_dinloc,
			 sblock->fs_bsize);
	  ip->i_dinloc = 0;
	}
    }
  
  /* Drop the block from the inode if we don't need it any more */
  if (lastiblock < 0 && ip->di->di_ib[INDIR_SINGLE])
    {
      blkfree (ip->di->di_ib[INDIR_SINGLE], sblock->fs_bsize);
      ip->di->di_ib[INDIR_SINGLE] = 0;
    }
  mutex_unlock (&ip->i_sinlock);
}



/* Shrink the file IP to LENGTH bytes.  Does nothing unless LENGTH is
   smaller than the current size.

   The tail of the new last block is zeroed, the inode's size is
   updated, the dropped range is flushed from the file pager, the data
   blocks past the new end are freed (both those reached through
   indirect blocks and those named directly in the inode), the last
   direct block is trimmed to the fragments still needed, and finally
   the file is synced with file_update.  */
void
inode_truncate (struct inode *ip,
		off_t length)
{
  daddr_t lastblock, olastblock, bn;
  off_t osize;
  int bsize, idx;

  osize = ip->di->di_size;
  /* Only a real shrink needs work.  (The test used to be inverted, so
     shrinking returned at once and growing ran the code below.)  */
  if (length >= osize)
    return;

  /* Calculate block number of last block */
  lastblock = lblkno (length + sblock->fs_bsize - 1) - 1;
  olastblock = lblkno (osize + sblock->fs_bsize - 1) - 1;

  /* If the prune is not to a block boundary, zero the bit upto the
     next block boundary. */
  if (blkoff (length))
    fs_rdwr (ip, zeroblock, length, 
	     blksize (ip, lastblock) - blkoff (length), 1, 0);

  mutex_lock (&ip->i_datalock);

  /* Update the size now.  If we crash, fsck can finish freeing the
     blocks. */
  ip->di->di_size = length;

  /* Flush the old data.  This operation is bad if people do
     copy-on-write sharing of files.  As a result, the FILE pager has its
     copy strategy set to MEMORY_OBJECT_COPY_NONE.  At some point this will
     have to be changed to MEMORY_OBJECT_COPY_CALL, but I'm not up for it
     yet.  */
  if (ip->i_fileinfo)
    memory_object_lock_request (ip->i_fileinfo->memobjcntl, 
				((lastblock == -1 ? 0 : lastblock)
				 * sblock->fs_bsize),
				(olastblock - lastblock) * sblock->fs_bsize,
				0, 1, VM_PROT_NONE, MACH_PORT_NULL);

  /* Drop data blocks mapped by indirect blocks.  Logical block NDADDR
     is the first one that is not direct, so it counts too.  */
  if (olastblock >= NDADDR)
    {
      if (!ip->i_sinloc)
	sin_map (ip);

      /* i_sinloc is indexed from the first indirect-mapped block, the
	 way pager_next_write uses it: logical block N lives at
	 i_sinloc[N - NDADDR].  Direct blocks are handled further
	 down.  */
      idx = lastblock + 1;
      if (idx < NDADDR)
	idx = NDADDR;
      for (; idx <= olastblock; idx ++)
	{
	  if (ip->i_sinloc[idx - NDADDR])
	    {
	      blkfree (ip->i_sinloc[idx - NDADDR], sblock->fs_bsize);
	      ip->i_sinloc[idx - NDADDR] = 0;
	    }
	}

      /* Prune the block pointer handled by the sindir pager.  This will
	 free all the indirect blocks and such as necessary.  */
      sindir_drop (ip, lblkno((lastblock - NDADDR) * sizeof (daddr_t)),
		   lblkno ((olastblock - NDADDR) * sizeof (daddr_t)));

      /* Unmap the old sindir mapping */
      unregister_memory_fault_area (ip->i_sinloc, ip->i_sininfo->vsize);
      vm_deallocate (mach_task_self (), (u_int)ip->i_sinloc,
		     ip->i_sininfo->vsize);
      sin_map (ip);
    }

  /* Prune the blocks mapped directly from the inode */
  for (idx = lastblock + 1; idx < NDADDR; idx++)
    {
      bn = ip->di->di_db[idx];
      /* Only allocated blocks are freed.  This used to test IDX, which
	 skipped block 0 and freed disk address 0 for every hole; it
	 also cleared di_ib[IDX] instead of the direct pointer, and
	 compared the disk address with a logical block number.  */
      if (bn)
	{
	  ip->di->di_db[idx] = 0;
	  if (idx > olastblock)
	    panic ("inode_truncate 1");
	  if (idx == olastblock)
	    {
	      /* The old last block may have been only a fragment.
		 NOTE(review): di_size has already been updated above;
		 if blksize depends on it this no longer yields the old
		 size -- confirm.  */
	      bsize = blksize (ip, idx);
	      blkfree (bn, bsize);
	    }
	  else
	    blkfree (bn, sblock->fs_bsize);
	}
    }
  
  if (lastblock >= 0 && lastblock < NDADDR)
    {
      /* Look for a change in the size of the last direct block */
      bn = ip->di->di_db[lastblock];
      if (bn)
	{
	  off_t oldspace, newspace;
	  
	  /* NOTE(review): same caveat as above about blksize being
	     evaluated after di_size changed.  */
	  oldspace = blksize (ip, lastblock);
	  newspace = fragroundup (blkoff (length));
	  if (newspace == 0)
	    panic ("inode_truncate: newspace");
	  if (oldspace - newspace)
	    {
	      /* Free the fragments past the ones still in use.  */
	      bn += numfrags (newspace);
	      blkfree (bn, oldspace - newspace);
	    }
	}
    }

  /* NOTE(review): fragstoblks is applied to a byte count here; confirm
     the units i_allocsize is kept in.  */
  if (lastblock < NDADDR)
    ip->i_allocsize = blkroundup (length);
  else
    ip->i_allocsize = fragstoblks (fragroundup (length));

  mutex_unlock (&ip->i_datalock);
  file_update (ip, 1);
}  

/* This maps the double indirect block of a file.

   The caller must hold IP->i_sinlock, and the block must not already
   be mapped.  On return IP->i_dinloc addresses a block-sized window
   onto the double indirect pager at the slot belonging to IP's inode
   number, registered as a memory fault area.  Panics if the mapping
   cannot be made.  */
void
din_map (struct inode *ip)
{
  vm_offset_t indiroff;
  int err;

  /* mutex_try_lock succeeds only when nobody holds the lock, so
     success means the caller broke the locking rule.  (The test used
     to be negated, which panicked exactly when the lock was properly
     held.)  */
  if (mutex_try_lock (&ip->i_sinlock))
    panic ("din_map not locked");

  if (ip->i_dinloc)
    panic ("din_map");
  
  indiroff = ip->i_number * sblock->fs_bsize;
  
  err = vm_map (mach_task_self (), (u_int *)&ip->i_dinloc, sblock->fs_bsize, 
		0, 1, dininfo->memobj, indiroff, 0, VM_PROT_READ|VM_PROT_WRITE,
		VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_NONE);
  if (err)
    panic_with_error ("din_map", err);

  register_memory_fault_area (ip->i_dinloc, sblock->fs_bsize);
}


/* Called to map the single indirect blocks of a file.

   The caller must hold IP->i_datalock.  If the file has no single
   indirect pager yet, one is created (taking a reference on IP) and
   sized from the file's current virtual size.  On return IP->i_sinloc
   addresses a mapping of that pager, registered as a memory fault
   area.  Panics if the mapping cannot be made.  */
void
sin_map (struct inode *ip)
{
  int err;
  struct controlinfo *ci;
  int size;
  
  /* mutex_try_lock succeeds only when nobody holds the lock, so
     success means the caller broke the locking rule.  (The test used
     to be negated, which panicked exactly when the lock was properly
     held.)  */
  if (mutex_try_lock (&ip->i_datalock))
    panic ("sin_map not locked");
  
  if (!ip->i_sininfo)
    {
      ci = cialloc (SINDIR);
      
      ip->i_refcnt++;
      ci->ip = ip;

      size = get_inode_vsize (ip);
      
      /* NOTE(review): the rounding below treats the number of indirect
	 mapped blocks as a byte count; confirm whether it should be
	 scaled by the size of a block pointer.  */
      ci->vsize = (size + sblock->fs_bsize - 1) / sblock->fs_bsize;	/* size in fsblks */
      ci->vsize -= NDADDR;		/* subtract off direct blocks */
      
      ci->vsize = (ci->vsize + sblock->fs_bsize - 1) / sblock->fs_bsize;
  
      ci->vsize *= sblock->fs_bsize;

      ip->i_sininfo = ci;
    }
  else
    ci = ip->i_sininfo;

  err = vm_map (mach_task_self (), (u_int *)&ip->i_sinloc, ci->vsize, 0, 1,
		ci->memobj, 0, 0, VM_PROT_READ|VM_PROT_WRITE, 
		VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_NONE);
  if (err)
    panic_with_error ("sin_map mapping", err);

  register_memory_fault_area (ip->i_sinloc, ci->vsize);
}


/* Called to remap the single indirect blocks when a file grows.

   The caller must hold IP->i_datalock.  NEWSIZE is the new file size;
   it is converted to the size of the single indirect mapping the same
   way sin_map does.  If that is larger than the current mapping, the
   old mapping is torn down and a new one of the larger size is made.
   Panics if the mapping would shrink or cannot be made.  */
void
sin_remap (struct inode *ip,
	   int newsize)
{
  struct controlinfo *ci;
  int err;
  
  /* mutex_try_lock succeeds only when nobody holds the lock, so
     success means the caller broke the locking rule.  (The test used
     to be negated, which panicked exactly when the lock was properly
     held.)  */
  if (mutex_try_lock (&ip->i_datalock))
    panic ("sin_remap not locked");
  
  ci = ip->i_sininfo;
  
  newsize = (newsize + sblock->fs_bsize - 1) / sblock->fs_bsize;
  newsize -= NDADDR;
  newsize = (newsize + sblock->fs_bsize - 1) / sblock->fs_bsize;
  newsize *= sblock->fs_bsize;
  
  if (newsize < ci->vsize)
    panic ("sin_remap size shrink");

  /* Nothing to do if the mapping is already the right size.  This
     path used to unlock i_sinlock, a lock this function never takes
     and which no other path out of it releases.  */
  if (newsize == ci->vsize)
    return;
  
  unregister_memory_fault_area (ip->i_sinloc, ci->vsize);
  vm_deallocate (mach_task_self (), (u_int)ip->i_sinloc, ci->vsize);
  
  ci->vsize = newsize;
  
  err = vm_map (mach_task_self (), (u_int *)&ip->i_sinloc, ci->vsize, 0, 1, 
		ci->memobj, 0, 0, VM_PROT_READ|VM_PROT_WRITE, 
		VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_NONE);
  if (err)
    panic_with_error ("sin_remap mapping", err);

  register_memory_fault_area (ip->i_sinloc, ci->vsize);
}
    

/* Called to create a FILE memory object for an inode.

   Returns the memory object port of IP's file pager.  If the inode
   already has a pager, another send right is made for it; otherwise a
   new FILE pager is allocated, attached to the inode (taking a
   reference on IP), and its port is returned.  Takes and releases
   i_datalock.  */
mach_port_t
get_filemap (struct inode *ip)
{
  struct controlinfo *ci;
  mach_port_t ret;

  mutex_lock (&ip->i_datalock);
  
  if (ip->i_fileinfo)
    {
      ci = ip->i_fileinfo;
      mutex_lock (&ci->interlock);
      mach_port_insert_right (mach_task_self (), ci->memobj, ci->memobj,
			      MACH_MSG_TYPE_MAKE_SEND);
      mutex_unlock (&ci->interlock);
      ret = ip->i_fileinfo->memobj;
      mutex_unlock (&ip->i_datalock);
      return ret;
    }

  ci = cialloc (FILE);
  ip->i_refcnt++;
  ci->ip = ip;

  /* Remember the pager in the inode.  Without this every call made a
     fresh pager for the same file, and file_update and pager_nosenders,
     which go through i_fileinfo, never saw any of them.  */
  ip->i_fileinfo = ci;

  ret = ci->memobj;
  mutex_unlock (&ip->i_datalock);
  return ret;
}

/* Handle a no-senders notification for the pager CI, which must be a
   FILE or SINDIR pager.

   If send rights have appeared again in the meantime, another
   notification is requested and nothing else happens.  Otherwise the
   pager's port is destroyed, the pager is marked TERMINATED and
   detached from its inode (unmapping whatever was mapped on the
   pager's behalf), the inode reference is dropped, and CI is moved
   from its active list to the free list.  */
void
pager_nosenders(struct controlinfo *ci)
{
  struct inode *ip = ci->ip;
  mach_port_status_t mps;
  mach_port_t foo;

  if (ci->pager_type != FILE 
      && ci->pager_type != SINDIR)
    {
      printf ("pager_nosenders bad type");
      return;
    }

  /* Lock the inode fields */
  mutex_lock (&ip->i_toplock);
  if (ci->pager_type == FILE)
    mutex_lock (&ip->i_datalock);
  else /* ci->pager_type == SINDIR */
    mutex_lock (&ip->i_sinlock);

  /* Are there still no senders? */
  mach_port_get_receive_status (mach_task_self (), ci->memobj, &mps);
  if (mps.mps_srights)
    {
      /* Schedule another notification and quit */
      mach_port_request_notification (mach_task_self (), ci->memobj, 
				      MACH_NOTIFY_NO_SENDERS, 0, 
				      ci->memobj, MACH_MSG_TYPE_MAKE_SEND_ONCE,
				      &foo);
      if (ci->pager_type == FILE)
	mutex_unlock (&ip->i_datalock);
      else /* ci->pager_type == SINDIR */
	mutex_unlock (&ip->i_sinlock);
      mutex_unlock (&ip->i_toplock);
      return;
    }

  /* release all the memory */
  mach_port_destroy (mach_task_self (), ci->memobj);

  mutex_lock (&ci->interlock);
  ci->pager_state = TERMINATED;
  mutex_unlock (&ci->interlock);
  
  if (ci->pager_type == FILE)
    {
      ip->i_fileinfo = 0;
      if (ip->i_sinloc)
	{
	  unregister_memory_fault_area (ip->i_sinloc, ip->i_sininfo->vsize);
	  vm_deallocate (mach_task_self (), 
			 (u_int) ip->i_sinloc, ip->i_sininfo->vsize);
	}
      ip->i_sinloc = 0;
      /* A file whose indirect blocks were never mapped has no single
	 indirect pager to let go of.  */
      if (ip->i_sininfo)
	mach_port_deallocate (mach_task_self (), ip->i_sininfo->memobj);
      mutex_unlock (&ip->i_datalock);
    }
  else if (ci->pager_type == SINDIR)
    {
      ip->i_sininfo = 0;
      if (ip->i_dinloc)
	{
	  unregister_memory_fault_area (ip->i_dinloc, sblock->fs_bsize);
	  vm_deallocate (mach_task_self (), (u_int) ip->i_dinloc,
			 sblock->fs_bsize);
	}
      ip->i_dinloc = 0;
      mutex_unlock (&ip->i_sinlock);
    }
  else
    panic ("pager_nosenders");
  
  ci->ip = 0;
  irele (ip);

  mutex_lock (&cilistlock);

  /* Off current list.  The lists end in a null pointer, so the last
     element has no successor to fix up.  */
  if (ci->next)
    ci->next->pprev = ci->pprev;
  *ci->pprev = ci->next;
  
  /* Onto free list, which may be empty.  */
  ci->next = cifreelist;
  if (ci->next)
    ci->next->pprev = &ci->next;
  ci->pprev = &cifreelist;
  cifreelist = ci;

  mutex_unlock (&cilistlock);

  /* And that's all folks */
}
  
  
  


/* Utility */

/* Block until SEQNO is the message that directly follows the last one
   handled for CI, then record it as handled.  While waiting, the pager
   is flagged as having a sequence number waiter.  The condition wait
   uses CI->interlock as its mutex.  */
void
wait_for_seqno (struct controlinfo *ci,
		int seqno)
{
  for (;;)
    {
      if (ci->seqno + 1 == seqno)
	break;

      /* Not our turn yet; ask whoever handles an earlier message to
	 wake us.  */
      ci->waitingforseqno = 1;
      condition_wait (&ci->wakeup, &ci->interlock);
    }

  ci->seqno = seqno;
}

/* Convert PORT to the pager structure it names.  The port name is
   treated as the address of a structure whose first word is a type
   tag; the result is that address if the tag is PT_PAGER, and null
   otherwise.  */
struct controlinfo *
pager_cvt (mach_port_t port)
{
  int *tag = (int *) port;

  return (*tag == PT_PAGER) ? (struct controlinfo *) port : 0;
}

/* Return a paginfo structure ready for a new request, with its error
   flag cleared.  One is taken from the free list if possible;
   otherwise a new one is allocated and given a receive right named by
   its own address, which is added to ufs_portset.  Panics if memory
   runs out.  */
static struct paginfo *
pgalloc (void)
{
  struct paginfo *pg;
  error_t err;

  mutex_lock (&pgfreelistlock);
  if (pgfreelist)
    {
      pg = pgfreelist;
      pgfreelist = pgfreelist->next;
      mutex_unlock (&pgfreelistlock);
    }
  else
    {
      npgs++;
      mutex_unlock (&pgfreelistlock);
      pg = malloc (sizeof (struct paginfo));
      if (!pg)
	panic ("pgalloc: out of memory");
      err = mach_port_allocate_name (mach_task_self (), 
				     MACH_PORT_RIGHT_RECEIVE,
				     (mach_port_t) pg);
      if (err == KERN_NAME_EXISTS)
	{
	  /* Isn't this ugly? */
	  struct paginfo *newpg;

	  /* The address is already in use as a port name, so get a
	     complete replacement the same way cialloc does.  It is
	     obtained before PG is freed so that malloc cannot hand the
	     same address straight back.  (This path used to unlock a
	     mutex it did not hold and return a bare malloc'd structure
	     with no port, type tag or cleared error flag.)  */
	  newpg = pgalloc ();

	  free (pg); /* XXX */
	  mutex_lock (&pgfreelistlock);
	  npgs--;
	  mutex_unlock (&pgfreelistlock);
	  return newpg;
	}
      pg->req.porttype = PT_DEVIO;
      mach_port_move_member (mach_task_self (), (mach_port_t) pg,
			     ufs_portset);
    }
  pg->req.error = 0;
  return pg;
}

/* Give PG back: push it on the front of the paginfo free list, under
   the free list lock.  */
static void
pgrelse (struct paginfo *pg)
{
  struct paginfo *oldhead;

  mutex_lock (&pgfreelistlock);

  oldhead = pgfreelist;
  pg->next = oldhead;
  pgfreelist = pg;

  mutex_unlock (&pgfreelistlock);
}

/* Put CI at the front of the active list for its pager type: silist
   for SINDIR pagers and filelist for FILE pagers.  DINODE, CG and
   DINDIR pagers are not kept on any list.  Any other type panics.
   This function does no locking of its own.  */
static void
cienqueue (struct controlinfo *ci)
{
  struct controlinfo **head;

  switch (ci->pager_type)
    {
    case SINDIR:
      head = &silist;
      break;
    case FILE:
      head = &filelist;
      break;
    case DINODE:
    case CG:
    case DINDIR: 
      return;
    default:
      panic ("cienqueue");
      return;
    }

  ci->next = *head;
  /* The list may be empty, in which case there is no old first
     element whose back pointer needs redirecting.  */
  if (ci->next)
    ci->next->pprev = &ci->next;
  ci->pprev = head;
  *head = ci;
}

/* Allocate and set up a pager structure of the given TYPE.

   The structure comes from the free list if possible and from malloc
   otherwise.  It is given a receive right named by its own address,
   put on the active list for its type, initialized to the NOTINIT
   state with an empty pagemap, given a send right, registered for a
   no-senders notification and added to ufs_portset.  Panics if memory
   runs out.  */
static struct controlinfo *
cialloc (enum pager_type type)
{
  struct controlinfo *ci;
  kern_return_t err;
  mach_port_t foo;

  mutex_lock (&cilistlock);
  if (cifreelist)
    {
      ci = cifreelist;
      cifreelist = cifreelist->next;
      /* Taking the last entry leaves the free list empty.  */
      if (cifreelist)
	cifreelist->pprev = &cifreelist;
    }
  else
    {
      ncis++;
      ci = malloc (sizeof (struct controlinfo));
      if (!ci)
	panic ("cialloc: out of memory");
    }
  ci->pager_type = type;
  ci->porttype = PT_PAGER;

  err = mach_port_allocate_name (mach_task_self (), MACH_PORT_RIGHT_RECEIVE,
				 (mach_port_t) ci);
  if (err == KERN_NAME_EXISTS)
    {
      /* Isn't this ugly? */
      struct controlinfo *newci;

      /* Unlock and get another structure */
      mutex_unlock (&cilistlock);
      newci = cialloc (type);

      /* Ditch this one */
      free (ci);
      mutex_lock (&cilistlock);
      ncis--;
      mutex_unlock (&cilistlock);
      
      return newci;
    }

  cienqueue (ci);
  mutex_unlock (&cilistlock);
  ci->pager_state = NOTINIT;
  ci->memobj = (memory_object_t) ci;
  mutex_init (&ci->interlock);
  condition_init (&ci->wakeup);
  ci->pagemapsize = ci->synccount = ci->writecount = 0;
  ci->pagemap = 0;
  mach_port_insert_right (mach_task_self (), (mach_port_t)ci, 
			  (mach_port_t) ci, MACH_MSG_TYPE_MAKE_SEND);
  mach_port_request_notification (mach_task_self (), ci->memobj, 
				  MACH_NOTIFY_NO_SENDERS, 1, ci->memobj,
				  MACH_MSG_TYPE_MAKE_SEND_ONCE, &foo);
  mach_port_move_member (mach_task_self (), ci->memobj, ufs_portset);
  return ci;
}


/* Write modified pages to disk.

   Counts one more sync in progress on CI and asks the kernel to clean
   the whole object, with the reply sent to the pager itself.  For a
   FILE pager the object size is first refreshed from the inode.  If
   WAIT is nonzero, blocks until no syncs and no sync-related writes
   remain outstanding on CI.  */
static void
pager_sync1 (struct controlinfo *ci,
	     int wait)
{
  mutex_lock (&ci->interlock);

  ci->synccount += 1;

  /* A file may have changed size since the pager last looked.  */
  if (ci->pager_type == FILE)
    {
      ci->vsize = get_inode_vsize (ci->ip);
    }

  memory_object_lock_request (ci->memobjcntl, 0, ci->vsize, 1, 0,
			      VM_PROT_NONE, ci->memobj);

  while (wait && (ci->synccount || ci->writecount))
    {
      condition_wait (&ci->wakeup, &ci->interlock);
    }

  mutex_unlock (&ci->interlock);
}

static void
pager_term1 (struct controlinfo *ci)
{
  mutex_lock (&ci->interlock);
  mutex_lock (&cilistlock);
  
  ci->next->pprev = ci->pprev;	/* remove from queue */
  *ci->pprev = ci->next;
  mutex_unlock (&cilistlock);

  ci->synccount++;
  ci->pager_state = TERMINATING;
  
  if (ci->pager_type == FILE)
    ci->vsize = get_inode_vsize (ci->ip);
  
  memory_object_lock_request (ci->memobjcntl, 0, ci->vsize, 1, 0,
			      VM_PROT_WRITE, PORT_NULL);

  /* At this point all the write messages are on the queue, so we can
     do a destroy op right away  */
  memory_object_destroy (ci->memobjcntl, 0);

  while (ci->pager_state != TERMINATED)
    condition_wait (&ci->wakeup, &ci->interlock);
}
   
/* Allocate an indirect block */
static daddr_t
indir_alloc (struct inode *ip,
	     int type,		/* INDIR_DOUBLE or INDIR_SINGLE */
	     int ind)		/* which in the series? */
{
  daddr_t bn;
  daddr_t lbn;
  int error;

  switch (type)
    {
    case INDIR_DOUBLE:
      lbn = NDADDR + sblock->fs_bsize / sizeof (daddr_t);
      break;
    case INDIR_SINGLE:
      if (ind == 0)
	lbn = NDADDR;
      else
	lbn = NDADDR + ind * sblock->fs_bsize / sizeof (daddr_t);
      break;
    default:
      panic ("indir_alloc type");
    }
  
  if (error = alloc (ip, NDADDR,
		     blkpref (ip, lbn, 0, (daddr_t *)0),
		     sblock->fs_bsize, &bn, 0))
    return 0;

  /* We do this write synchronously so that the inode never
     points at an indirect block full of garbage */
  if (dev_write_sync (fsbtodb (bn), zeroblock, sblock->fs_bsize))
    {
      blkfree (bn, sblock->fs_bsize);
      return 0;
    }
  else
    return bn;
}



/* These routines are cruft because the kernel is BUGGY and doesn't
   pass errors from memory_object_data_error through to the exception
   handler.

   Record ERROR in the pagemap of CI for every page of the byte range
   that starts at OFFSET and is LENGTH bytes long.  ERROR must be 0,
   ENOSPC, EIO or EDQUOT; anything else panics.  */
void
mark_object_error(struct controlinfo *ci,
		  int offset,
		  int length,
		  error_t error)
{
  int page_error;
  struct pagemap *p;
  struct pagemap *first;

  /* From here on both are counted in pages.  */
  offset /= vm_page_size;
  length /= vm_page_size;
  
  switch (error)
    {
    case 0:
      page_error = PAGE_NOERR;
      break;
    case ENOSPC:
      page_error = PAGE_ENOSPC;
      break;
    case EIO:
      page_error = PAGE_EIO;
      break;
    case EDQUOT:
      page_error = PAGE_EDQUOT;
      break;
    default:
      panic ("mark_object_error");
      break;
    }
  
  /* Start at the page OFFSET names.  (OFFSET used to be computed and
     then ignored, so the first LENGTH pages of the object were marked
     whatever range was asked for.)  */
  first = ci->pagemap + offset;
  for (p = first; p < first + length; p++)
    p->error = page_error;
}
