patch-2.4.22 linux-2.4.22/mm/filemap.c
Next file: linux-2.4.22/mm/mremap.c
Previous file: linux-2.4.22/lib/vsprintf.c
Back to the patch index
Back to the overall index
- Lines: 402
- Date:
2003-08-25 04:44:44.000000000 -0700
- Orig file:
linux-2.4.21/mm/filemap.c
- Orig date:
2003-06-13 07:51:39.000000000 -0700
diff -urN linux-2.4.21/mm/filemap.c linux-2.4.22/mm/filemap.c
@@ -123,8 +123,6 @@
*/
void __remove_inode_page(struct page *page)
{
- if (PageDirty(page) && !PageSwapCache(page))
- BUG();
remove_page_from_inode_queue(page);
remove_page_from_hash_queue(page);
}
@@ -812,6 +810,20 @@
return &wait[hash];
}
+/*
+ * Call this after every submit_bh whose end_io callback may
+ * cause the blkdev layer to wake the page once the request
+ * queue is unplugged.
+ */
+void wakeup_page_waiters(struct page * page)
+{
+	wait_queue_head_t *waiters = page_waitqueue(page);
+
+	if (waitqueue_active(waiters))
+		wake_up(waiters);
+}
+
/*
* Wait for a page to get unlocked.
*
@@ -1545,6 +1557,27 @@
UPDATE_ATIME(inode);
}
+/* True when the mapping provides either direct IO entry point. */
+static inline int have_mapping_directIO(struct address_space * mapping)
+{
+	if (mapping->a_ops->direct_fileIO)
+		return 1;
+	return mapping->a_ops->direct_IO != NULL;
+}
+
+/*
+ * Dispatch a direct IO request: prefer the newer file-based
+ * a_ops->direct_fileIO hook, fall back to the classic
+ * inode-based a_ops->direct_IO hook.
+ */
+static inline int do_call_directIO(int rw, struct file *filp, struct kiobuf *iobuf, unsigned long offset, int blocksize)
+{
+	struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+	struct address_space_operations *aops = mapping->a_ops;
+
+	if (aops->direct_fileIO)
+		return aops->direct_fileIO(rw, filp, iobuf, offset, blocksize);
+	return aops->direct_IO(rw, mapping->host, iobuf, offset, blocksize);
+}
+
+/*
+ * i_sem and i_alloc_sem should be held already. i_sem may be dropped
+ * later once we've mapped the new IO. i_alloc_sem is kept until the IO
+ * completes.
+ */
+
static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
{
ssize_t retval;
@@ -1575,7 +1608,7 @@
retval = -EINVAL;
if ((offset & blocksize_mask) || (count & blocksize_mask) || ((unsigned long) buf & blocksize_mask))
goto out_free;
- if (!mapping->a_ops->direct_IO)
+ if (!have_mapping_directIO(mapping))
goto out_free;
if ((rw == READ) && (offset + count > size))
@@ -1603,7 +1636,7 @@
if (retval)
break;
- retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
+ retval = do_call_directIO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize);
if (rw == READ && retval > 0)
mark_dirty_kiobuf(iobuf, retval);
@@ -1699,12 +1732,16 @@
retval = 0;
if (!count)
goto out; /* skip atime */
+ down_read(&inode->i_alloc_sem);
+ down(&inode->i_sem);
size = inode->i_size;
if (pos < size) {
retval = generic_file_direct_IO(READ, filp, buf, count, pos);
if (retval > 0)
*ppos = pos + retval;
}
+ up(&inode->i_sem);
+ up_read(&inode->i_alloc_sem);
UPDATE_ATIME(filp->f_dentry->d_inode);
goto out;
}
@@ -2483,14 +2520,17 @@
{
long error = -EBADF;
struct file * file;
+ struct inode * inode;
unsigned long size, rlim_rss;
/* Doesn't work if there's no mapped file. */
if (!vma->vm_file)
return error;
file = vma->vm_file;
- size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ inode = file->f_dentry->d_inode;
+ if (!inode->i_mapping->a_ops->readpage)
+ return error;
+ size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (end > vma->vm_end)
@@ -2944,44 +2984,18 @@
}
/*
- * Write to a file through the page cache.
- *
- * We currently put everything into the page cache prior to writing it.
- * This is not a problem when writing full pages. With partial pages,
- * however, we first have to read the data into the cache, then
- * dirty the page, and finally schedule it for writing. Alternatively, we
- * could write-through just the portion of data that would go into that
- * page, but that would kill performance for applications that write data
- * line by line, and it's prone to race conditions.
- *
- * Note that this routine doesn't try to keep track of dirty pages. Each
- * file system has to do this all by itself, unfortunately.
- * okir@monad.swb.de
+ * precheck_file_write():
+ * Check the conditions on a file descriptor prior to beginning a write
+ * on it. Contains the common precheck code for both buffered and direct
+ * IO.
*/
-ssize_t
-generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
+int precheck_file_write(struct file *file, struct inode *inode,
+ size_t *count, loff_t *ppos)
{
- struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
- struct inode *inode = mapping->host;
- unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
- loff_t pos;
- struct page *page, *cached_page;
- ssize_t written;
- long status = 0;
ssize_t err;
- unsigned bytes;
-
- if ((ssize_t) count < 0)
- return -EINVAL;
-
- if (!access_ok(VERIFY_READ, buf, count))
- return -EFAULT;
-
- cached_page = NULL;
-
- down(&inode->i_sem);
-
- pos = *ppos;
+ unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+ loff_t pos = *ppos;
+
err = -EINVAL;
if (pos < 0)
goto out;
@@ -2992,11 +3006,9 @@
goto out;
}
- written = 0;
-
/* FIXME: this is for backwards compatibility with 2.4 */
if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
- pos = inode->i_size;
+ *ppos = pos = inode->i_size;
/*
* Check whether we've reached the file size limit.
@@ -3008,23 +3020,23 @@
send_sig(SIGXFSZ, current, 0);
goto out;
}
- if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) {
+ if (pos > 0xFFFFFFFFULL || *count > limit - (u32)pos) {
/* send_sig(SIGXFSZ, current, 0); */
- count = limit - (u32)pos;
+ *count = limit - (u32)pos;
}
}
/*
* LFS rule
*/
- if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
+ if ( pos + *count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
if (pos >= MAX_NON_LFS) {
send_sig(SIGXFSZ, current, 0);
goto out;
}
- if (count > MAX_NON_LFS - (u32)pos) {
+ if (*count > MAX_NON_LFS - (u32)pos) {
/* send_sig(SIGXFSZ, current, 0); */
- count = MAX_NON_LFS - (u32)pos;
+ *count = MAX_NON_LFS - (u32)pos;
}
}
@@ -3041,7 +3053,7 @@
if (!S_ISBLK(inode->i_mode)) {
if (pos >= inode->i_sb->s_maxbytes)
{
- if (count || pos > inode->i_sb->s_maxbytes) {
+ if (*count || pos > inode->i_sb->s_maxbytes) {
send_sig(SIGXFSZ, current, 0);
err = -EFBIG;
goto out;
@@ -3049,35 +3061,68 @@
/* zero-length writes at ->s_maxbytes are OK */
}
- if (pos + count > inode->i_sb->s_maxbytes)
- count = inode->i_sb->s_maxbytes - pos;
+ if (pos + *count > inode->i_sb->s_maxbytes)
+ *count = inode->i_sb->s_maxbytes - pos;
} else {
if (is_read_only(inode->i_rdev)) {
err = -EPERM;
goto out;
}
if (pos >= inode->i_size) {
- if (count || pos > inode->i_size) {
+ if (*count || pos > inode->i_size) {
err = -ENOSPC;
goto out;
}
}
- if (pos + count > inode->i_size)
- count = inode->i_size - pos;
+ if (pos + *count > inode->i_size)
+ *count = inode->i_size - pos;
}
err = 0;
- if (count == 0)
+out:
+ return err;
+}
+
+/*
+ * Write to a file through the page cache.
+ *
+ * We currently put everything into the page cache prior to writing it.
+ * This is not a problem when writing full pages. With partial pages,
+ * however, we first have to read the data into the cache, then
+ * dirty the page, and finally schedule it for writing. Alternatively, we
+ * could write-through just the portion of data that would go into that
+ * page, but that would kill performance for applications that write data
+ * line by line, and it's prone to race conditions.
+ *
+ * Note that this routine doesn't try to keep track of dirty pages. Each
+ * file system has to do this all by itself, unfortunately.
+ * okir@monad.swb.de
+ */
+ssize_t
+do_generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
+{
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos;
+ struct page *page, *cached_page;
+ ssize_t written;
+ long status = 0;
+ ssize_t err;
+ unsigned bytes;
+
+ cached_page = NULL;
+ pos = *ppos;
+ written = 0;
+
+ err = precheck_file_write(file, inode, &count, &pos);
+ if (err != 0 || count == 0)
goto out;
remove_suid(inode);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
mark_inode_dirty_sync(inode);
- if (file->f_flags & O_DIRECT)
- goto o_direct;
-
do {
unsigned long index, offset;
long page_fault;
@@ -3155,11 +3200,9 @@
status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
}
-out_status:
err = written ? written : status;
out:
- up(&inode->i_sem);
return err;
fail_write:
status = -EFAULT;
@@ -3176,8 +3219,32 @@
if (pos + bytes > inode->i_size)
vmtruncate(inode, inode->i_size);
goto done;
+}
+
+ssize_t
+do_generic_direct_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
+{
+ struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos;
+ ssize_t written;
+ long status = 0;
+ ssize_t err;
+
+ pos = *ppos;
+ written = 0;
+
+ err = precheck_file_write(file, inode, &count, &pos);
+ if (err != 0 || count == 0)
+ goto out;
+
+	if (!(file->f_flags & O_DIRECT))
+		BUG();
+
+ remove_suid(inode);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
-o_direct:
written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
if (written > 0) {
loff_t end = pos + written;
@@ -3194,7 +3261,58 @@
*/
if (written >= 0 && file->f_flags & O_SYNC)
status = generic_osync_inode(inode, OSYNC_METADATA);
- goto out_status;
+
+ err = written ? written : status;
+out:
+ return err;
+}
+
+/*
+ * The O_DIRECT write fell back to buffered IO (-ENOTBLK): redo it
+ * as a normal page-cache write under i_sem, then push the data out
+ * with fdatasync so O_DIRECT semantics are preserved on return.
+ */
+static int do_odirect_fallback(struct file *file, struct inode *inode,
+			       const char *buf, size_t count, loff_t *ppos)
+{
+	ssize_t written;
+	int sync_err;
+
+	down(&inode->i_sem);
+	written = do_generic_file_write(file, buf, count, ppos);
+	if (written > 0) {
+		sync_err = do_fdatasync(file);
+		if (sync_err)
+			written = sync_err;
+	}
+	up(&inode->i_sem);
+	return written;
+}
+
+ssize_t
+generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	ssize_t ret;
+
+	if ((ssize_t) count < 0)
+		return -EINVAL;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	if (!(file->f_flags & O_DIRECT)) {
+		/* Plain buffered write: only i_sem is needed. */
+		down(&inode->i_sem);
+		ret = do_generic_file_write(file, buf, count, ppos);
+		up(&inode->i_sem);
+		return ret;
+	}
+
+	/* do_generic_direct_write may drop i_sem during the
+	   actual IO */
+	down_read(&inode->i_alloc_sem);
+	down(&inode->i_sem);
+	ret = do_generic_direct_write(file, buf, count, ppos);
+	up(&inode->i_sem);
+	up_read(&inode->i_alloc_sem);
+
+	/* -ENOTBLK asks us to retry the write through the page cache */
+	if (unlikely(ret == -ENOTBLK))
+		ret = do_odirect_fallback(file, inode, buf, count, ppos);
+
+	return ret;
}
void __init page_cache_init(unsigned long mempages)
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)