/*
 * ocfsiosup.c
 *
 * Read and write to disk
 *
 * Copyright (C) 2002 Oracle Corporation.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Authors: Neeraj Goyal, Suchit Kaura, Kurt Hackel, Sunil Mushran,
 *          Manish Singh, Wim Coekaerts
 */

#include "ocfs.h"
#include <linux/locks.h>
#include <linux/pagemap.h>

/*
 * Tracing: debug context used for this file.  Presumably consumed by the
 * LOG_* macros used below -- TODO confirm against ocfs.h.
 */
#define OCFS_DEBUG_CONTEXT      OCFS_DEBUG_CONTEXT_IOSUP

/*
 * LinuxWriteForceDisk()
 *
 * Copy Length bytes from Buffer into buffer-cache blocks that start at disk
 * byte Offset, mark them uptodate and dirty, and (unless the cached
 * single-sector shortcut below applies) submit them with ll_rw_block() and
 * wait for each one.
 *
 *   osb     - volume; osb->sb supplies the device and block size
 *   Buffer  - source data, consumed 512 bytes per block
 *   Length  - byte count; must be a multiple of 512
 *   Offset  - disk byte offset of the first block
 *   Cached  - when true the range is first passed to
 *             ocfs_create_log_extent_map(); forced to false under NO_CACHE
 *
 * A write that lands on block 0 is refused unless Buffer starts with
 * OCFS_VOLUME_SIGNATURE.
 *
 * Returns 0 on success (including the "sector unchanged" shortcut),
 * -EFAIL for a bad Length, a NULL osb/osb->sb or a getblk() failure,
 * -ENOMEM if the buffer_head array cannot be allocated, and -EIO when a
 * non-header buffer is aimed at block 0.
 *
 * NOTE(review): the result of the submitted I/O is not inspected after
 * wait_on_buffer(), so write errors are not reflected in the return value.
 * NOTE(review): 512 bytes are copied per block while Buffer advances by
 * sb->s_blocksize; this looks like it assumes a 512-byte block size --
 * confirm against the mount code.
 */
int LinuxWriteForceDisk (ocfs_super * osb,
		     void *Buffer, __u32 Length, __u64 Offset, bool Cached)
{
	int status = 0;
	int nr, i;
	struct super_block *sb;
	__u64 blocknum;
	kdev_t dev;
	struct buffer_head *bh;
	struct buffer_head **bhs = NULL;
	char *kaddr;

	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, %u.%u)\n", osb, Buffer, Length,
			HI (Offset), LO (Offset));

#ifdef NO_CACHE
	Cached = false;
#endif

	if (Length % 512) {
		LOG_ERROR_STATUS (status = -EFAIL);
		goto bail;
	}

	if (osb == NULL || osb->sb == NULL) {
		LOG_ERROR_STATUS (status = -EFAIL);
		goto bail;
	}

	sb = osb->sb;
	dev = sb->s_dev;

	/* !!!! I don't really know where to put this... !!!! */
	if (Cached) {
		/*
		 * Start from known values: the helper's status is not
		 * checked here, so never hand an uninitialised pointer to
		 * ocfs_safefree() or an uninitialised count to the trace.
		 */
		ocfs_io_runs *runs = NULL;
		__u32 num_runs = 0;

		ocfs_create_log_extent_map(osb, &runs, &num_runs, Offset,
				(__u64)Length);
		// on NT they do the io straight off the io_runs
		// but we do the buffer_head stuff below... just free?
		ocfs_safefree(runs);
		LOG_TRACE_ARGS("woo!  %d runs were just added to the trans_mcb!\n", num_runs);
	}

	/* number of 512-byte sectors covered by the request */
	nr = (Length + 511) >> 9;
	if (nr > 256)
		LOG_TRACE_ARGS("getting write for %d blocks\n", nr);
	bhs = kmalloc (nr * sizeof(bh), GFP_KERNEL);
	if (bhs == NULL) {
		LOG_ERROR_STATUS (status = -ENOMEM);
		goto bail;
	}

	blocknum = Offset >> sb->s_blocksize_bits;

	/* guard the volume header: only a buffer carrying the signature may hit block 0 */
	if (blocknum == 0) {
		ocfs_vol_disk_hdr *hdr;
		LOG_TRACE_STR ("Blocknum is zero!!!");
		hdr = (ocfs_vol_disk_hdr *)Buffer;
		if (memcmp(hdr->signature, OCFS_VOLUME_SIGNATURE, strlen(OCFS_VOLUME_SIGNATURE)) != 0) {
			status = -EIO;
			LOG_ERROR_STR("WARNING! attempting to write non volume header to block 0");
			goto bail;
		}
	}

	/* build an array of bh's and prepare them for submit */
	for (i = 0 ; i < nr ; i++) {
		bh = getblk (dev, blocknum++, sb->s_blocksize);
		if (bh == NULL) {
			LOG_ERROR_STATUS (status = -EFAIL);
			/*
			 * Drop the references taken on the blocks gathered
			 * so far; otherwise they are leaked once bhs is
			 * freed at bail.
			 */
			while (--i >= 0)
				brelse(bhs[i]);
			goto bail;
		}
		lock_buffer(bh);

		/* if the new buffer is the same as the old buffer then don't bother doing anything
		 * this is only the case for locks that we rewrite to disk
		 * and are always 1 sector sized writes coming in
		 */
		kaddr = kmap(bh->b_page);
#ifdef NO_CACHE
		if (nr == 1 && Offset >= osb->vol_layout.data_start_off &&
		    (memcmp(kaddr + ((unsigned long)(bh)->b_data & ~PAGE_MASK), Buffer, 512) == 0))
#else
		if (Cached && nr == 1 && Offset >= osb->vol_layout.data_start_off &&
		    (memcmp(kaddr + ((unsigned long)(bh)->b_data & ~PAGE_MASK), Buffer, 512) == 0))
#endif
		{
			/* nr == 1 here, so this is the only bh held */
			kunmap(bh->b_page);
			unlock_buffer(bh);
			brelse(bh);
			goto bail;
		}
		memcpy(kaddr + ((unsigned long)(bh)->b_data & ~PAGE_MASK), Buffer, 512);
		kunmap(bh->b_page);
		mark_buffer_uptodate(bh, true);
		mark_buffer_dirty(bh);
		unlock_buffer(bh);
		bhs[i] = bh;
		Buffer = (__u8 *) Buffer + sb->s_blocksize;
	}

	/* we do not do ll_rw_block if we are in Cached, 1 block, and above datastartoff */
	/* otherwise we always call ll_rw_block */

	if (!Cached || nr != 1 || Offset < osb->vol_layout.data_start_off)
		ll_rw_block (WRITE, nr, bhs);

	/* reap blocks: wait only when I/O was actually submitted above */
	for (i = 0 ; i < nr ; i++) {
		bh = bhs[i];
		if (!Cached || ( nr != 1 || Offset < osb->vol_layout.data_start_off))
			wait_on_buffer(bh);
		brelse(bh);
	}


      bail:
	if (bhs)
		kfree(bhs);

	LOG_EXIT_STATUS (status);
	return status;
}				/* LinuxWriteForceDisk */

/*
 * LinuxReadForceDisk()
 *
 * Read Length bytes starting at disk byte Offset into Buffer through the
 * buffer cache.
 *
 *   osb     - volume; osb->sb supplies the device and block size
 *   Buffer  - destination, filled 512 bytes per block
 *   Length  - byte count; must be a multiple of 512
 *   Offset  - disk byte offset of the first block
 *   Cached  - when false (always, under NO_CACHE), or when Offset lies
 *             below vol_layout.data_start_off, every non-dirty buffer is
 *             marked not-uptodate before ll_rw_block(READ, ...) is issued
 *             (presumably so the data is fetched from disk again)
 *
 * Returns 0 on success, -EFAIL for a bad Length, a NULL osb/osb->sb or a
 * getblk() failure, and -ENOMEM if the buffer_head array cannot be
 * allocated.
 *
 * NOTE(review): buffer_uptodate() is not checked after wait_on_buffer(),
 * so a failed read is not reflected in the return value.
 */
int LinuxReadForceDisk (ocfs_super * osb,
		    void *Buffer, __u32 Length, __u64 Offset, bool Cached)
{
	int status = 0;
	struct super_block *sb;
	int nr, i;
	__u64 blocknum;
	kdev_t dev;
	struct buffer_head *bh;
	struct buffer_head **bhs = NULL;
	char *kaddr;

	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, %u.%u)\n", osb, Buffer, Length,
			HI (Offset), LO (Offset));

#ifdef NO_CACHE
	Cached = false;
#endif

	if (Length % 512) {
		LOG_ERROR_STATUS (status = -EFAIL);
		goto bail;
	}

	if (osb == NULL || osb->sb == NULL) {
		LOG_ERROR_STATUS (status = -EFAIL);
		goto bail;
	}

	sb = osb->sb;
	dev = sb->s_dev;

	/* number of 512-byte sectors covered by the request */
	nr = (Length + 511) >> 9;
	bhs = kmalloc (nr * sizeof(bh), GFP_KERNEL);
	if (bhs == NULL) {
		LOG_ERROR_STATUS (status = -ENOMEM);
		goto bail;
	}

	blocknum = Offset >> sb->s_blocksize_bits;

	if (nr == 0) {
		LOG_TRACE_STR ("No buffers will be read!!!");
		LOG_TRACE_ARGS
		    ("Len=%u Off=%u.%u numbuffers=%u blocknum=%u.%u\n", Length,
		     HI (Offset), LO (Offset), nr, HI (blocknum),
		     LO (blocknum));
	}

	/* gather the buffer heads, invalidating clean ones when caching is bypassed */
	for (i = 0 ; i < nr ; i++) {
		bh = getblk (dev, blocknum++, sb->s_blocksize);
		if (bh == NULL) {
			LOG_ERROR_STATUS (status = -EFAIL);
			/*
			 * Release the references already stored in bhs[];
			 * without this they are leaked when bhs is freed.
			 */
			while (--i >= 0)
				brelse(bhs[i]);
			goto bail;
		}
		bhs[i] = bh;
		if (!Cached || Offset < osb->vol_layout.data_start_off) {
			lock_buffer(bh);
			if (!buffer_dirty(bh))
				mark_buffer_uptodate(bh, false);
			unlock_buffer(bh);
		}
	}

	ll_rw_block(READ, nr, bhs);

	/* wait for each block, copy its sector out, and drop our reference */
	for (i = 0; i < nr ; i++) {
		bh = bhs[i];
		wait_on_buffer(bh);
		lock_buffer(bh);
		kaddr = kmap(bh->b_page);
		memcpy(Buffer, kaddr + ((unsigned long)(bh)->b_data & ~PAGE_MASK), 512);
		kunmap(bh->b_page);
		unlock_buffer(bh);
		Buffer = (__u8 *) Buffer + sb->s_blocksize;
		brelse(bh);
	}

      bail:
	if (bhs)
		kfree(bhs);

	LOG_EXIT_STATUS (status);
	return status;

}				/* LinuxReadForceDisk */

/*
 * ocfs_write_metadata()
 *
 * Record [Offset, Offset + Length) in osb->metadata_map (identity mapping,
 * replacing any entry that prevents the insert) and then write Buffer to
 * disk through LinuxWriteForceDisk() with caching enabled.
 *
 * Before touching the map the function polls in 100ms steps, at most 3000
 * times, for as long as osb->needs_flush is set and no transaction is in
 * progress.
 *
 * Returns the status of LinuxWriteForceDisk().  The outcome of the map
 * update is not propagated to the caller.
 */
int ocfs_write_metadata (ocfs_super * osb, void *Buffer, __u32 Length, __u64 Offset)
{
	int status = 0;
	int tries;
	__s64 vbo = 0;
	__s64 lbo = 0;
	bool added = false;

	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, %u.%u)\n", osb, Buffer, Length,
			HI (Offset), LO (Offset));

	/* metadata is mapped one-to-one: virtual offset == disk offset */
	vbo = Offset;
	lbo = vbo;

	/* bounded wait for a pending flush */
	for (tries = 0;
	     (osb->needs_flush) && (tries < 3000) && (!osb->trans_in_progress);
	     tries++) {
		ocfs_sleep (100);	/* 100ms */
	}

	ocfs_down_sem (&(osb->map_lock), true);
	added = ocfs_add_extent_map_entry (osb, &osb->metadata_map, vbo, lbo,
					   (__u32) Length);
	if (!added) {
		/* drop whatever is in the way and retry once */
		ocfs_remove_extent_map_entry (osb, &osb->metadata_map, vbo,
					      Length);
		added = ocfs_add_extent_map_entry (osb, &osb->metadata_map,
						   vbo, lbo, (__u32) Length);
	}
	ocfs_up_sem (&(osb->map_lock));

	status = LinuxWriteForceDisk (osb, Buffer, Length, Offset, true);
	if (status < 0)
		LOG_ERROR_STATUS (status);

	LOG_EXIT_STATUS (status);
	return status;
}				/* ocfs_write_metadata */


/*
 * ocfs_read_metadata()
 *
 * Read Length bytes at disk offset Offset into Buffer, splitting the range
 * by what osb->metadata_map knows about it:
 *   - sub-ranges covered by a map entry ("metadata runs") are read with
 *     LinuxReadForceDisk(..., Cached = true);
 *   - everything else ("data runs") is read with ocfs_read_force_disk().
 *
 * Two scratch run arrays are needed.  The preallocated ones hanging off the
 * osb (data_prealloc / md_prealloc) are used when their OSB_*_LOCK bit is
 * free; otherwise an array of IORUN_ALLOC_SIZE bytes is allocated and freed
 * here.
 *
 * Returns 0 on success, -ENOMEM if a scratch array cannot be allocated, or
 * the first negative status returned by one of the read helpers.
 *
 * NOTE(review): numDataRuns / numMetaDataRuns are never checked against the
 * capacity of the run arrays -- confirm IORUN_ALLOC_SIZE is large enough
 * for the worst case.
 */
int ocfs_read_metadata (ocfs_super * osb, void *Buffer, __u32 Length, __u64 Offset)
{
	int status = 0;
	__u32 RunsInExtentMap = 0, ExtentMapIndex;
	__s64 diskOffsetToFind = 0, foundFileOffset = 0;
	__s64 foundDiskOffset = 0;
	__u32 tempLen = 0, numMetaDataRuns = 0, numDataRuns = 0;
	__u32 remainingLength, length, i = 0;
	bool free_data=false, free_md=false;
	ocfs_io_runs *IoDataRuns = NULL, *IoMetaDataRuns = NULL;

	LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, %u.%u)\n", osb, Buffer, Length,
			HI (Offset), LO (Offset));

	/* try to use prealloc ioruns if available */
	ocfs_down_sem (&osb->osb_res, true);
	if (! OSB_PREALLOC_LOCK_TEST(osb, OSB_DATA_LOCK))
	{
		OSB_PREALLOC_LOCK_SET(osb, OSB_DATA_LOCK);
		IoDataRuns = osb->data_prealloc;
	}
	if (! OSB_PREALLOC_LOCK_TEST(osb, OSB_MD_LOCK))
	{
		OSB_PREALLOC_LOCK_SET(osb, OSB_MD_LOCK);
		IoMetaDataRuns = osb->md_prealloc;
	}
	ocfs_up_sem(&osb->osb_res);

	/*
	 * Decide ownership of BOTH arrays before any allocation can fail.
	 * The cleanup at "finally" clears a prealloc lock whenever the
	 * matching free_* flag is false, so bailing out of the first
	 * allocation with free_md still unset would clear an OSB_MD_LOCK
	 * that another thread holds.
	 */
	if (IoDataRuns == NULL)
		free_data = true;
	if (IoMetaDataRuns == NULL)
		free_md = true;

	if (free_data)
	{
		IoDataRuns = ocfs_malloc (IORUN_ALLOC_SIZE);
		if (IoDataRuns == NULL) {
			LOG_ERROR_STATUS (status = -ENOMEM);
			goto finally;
		}
	}
	if (free_md)
	{
		IoMetaDataRuns = ocfs_malloc (IORUN_ALLOC_SIZE);
		if (IoMetaDataRuns == NULL) {
			LOG_ERROR_STATUS (status = -ENOMEM);
			goto finally;
		}
	}

	remainingLength = Length;
	length = 0;
	diskOffsetToFind = Offset;

	/* bounded wait (3000 x 100ms) while a flush is pending and no transaction runs */
	{
		int i = 0;

		while (((osb->needs_flush)) && (i < 3000)
		       && (!osb->trans_in_progress)) {
			ocfs_sleep (100);	/* 100ms */
			i++;
		}
	}

	ocfs_down_sem (&(osb->map_lock), true);

	RunsInExtentMap = ocfs_extent_map_get_count (&osb->metadata_map);
	#warning smp race wrt map count most likely
	/*
	 * Walk the map entries, carving the still-unclassified tail
	 * [diskOffsetToFind, diskOffsetToFind + remainingLength) into data
	 * runs (not in the map) and metadata runs (in the map).
	 */
	for (ExtentMapIndex = 0; ExtentMapIndex < RunsInExtentMap;
	     ExtentMapIndex++) {
		if (!ocfs_get_next_extent_map_entry
		    (osb, &osb->metadata_map, ExtentMapIndex, &foundFileOffset,
		     &foundDiskOffset, &tempLen)) {
			/* It means this is a hole */
			continue;
		}

		length = tempLen;

		/*      |***TO*FIND***|                           */
		/*                      |---FOUND---|             */
		if (foundDiskOffset >= (diskOffsetToFind + remainingLength)) {
			break;
		}

		/*                    |***TO*FIND***|             */
		/*      |---FOUND---|                             */
		if (diskOffsetToFind >= (foundDiskOffset + length)) {
			continue;
		} else {
			/*                |***TO*FIND***|             */
			/*      |---FOUND-------------------|         */
			if ((diskOffsetToFind >= foundDiskOffset) &&
			    ((diskOffsetToFind + remainingLength) <=
			     (foundDiskOffset + length))) {

				/* the whole remainder is inside this entry */
				IoMetaDataRuns[numMetaDataRuns].offset =
				    diskOffsetToFind;
				IoMetaDataRuns[numMetaDataRuns].disk_off =
				    diskOffsetToFind;
				IoMetaDataRuns[numMetaDataRuns].byte_cnt =
				    remainingLength;
				remainingLength -=
				    IoMetaDataRuns[numMetaDataRuns].byte_cnt;
				diskOffsetToFind +=
				    IoMetaDataRuns[numMetaDataRuns].byte_cnt;
				numMetaDataRuns++;
				break;

			}
			/*      |***TO*FIND***|***or****|             */
			/*             |---FOUND---|                  */
			else if ((diskOffsetToFind < foundDiskOffset)
				 && ((diskOffsetToFind + remainingLength) >
				     foundDiskOffset)) {
				/* leading gap before the entry is a data run */
				IoDataRuns[numDataRuns].offset =
				    diskOffsetToFind;
				IoDataRuns[numDataRuns].disk_off =
				    diskOffsetToFind;
				IoDataRuns[numDataRuns].byte_cnt =
				    foundDiskOffset - diskOffsetToFind;
				remainingLength -=
				    IoDataRuns[numDataRuns].byte_cnt;
				diskOffsetToFind +=
				    IoDataRuns[numDataRuns].byte_cnt;
				numDataRuns++;

				/* then the overlap with the entry is a metadata run */
				IoMetaDataRuns[numMetaDataRuns].offset =
				    foundDiskOffset;
				IoMetaDataRuns[numMetaDataRuns].disk_off =
				    foundDiskOffset;
				IoMetaDataRuns[numMetaDataRuns].byte_cnt =
				    (remainingLength >
				     length) ? length : remainingLength;

				remainingLength -=
				    IoMetaDataRuns[numMetaDataRuns].byte_cnt;
				diskOffsetToFind +=
				    IoMetaDataRuns[numMetaDataRuns].byte_cnt;
				numMetaDataRuns++;

				if (remainingLength > 0) {
					continue;
				} else {
					break;
				}

			}
			/*             |***TO*FIND***|                */
			/*      |---FOUND---|                         */
			else if ((diskOffsetToFind >= foundDiskOffset) &&
				 ((diskOffsetToFind + remainingLength) >
				  (foundDiskOffset + length))) {
				/* only the head of the remainder is inside this entry */
				IoMetaDataRuns[numMetaDataRuns].offset =
				    diskOffsetToFind;
				IoMetaDataRuns[numMetaDataRuns].disk_off =
				    diskOffsetToFind;
				IoMetaDataRuns[numMetaDataRuns].byte_cnt =
				    length - (diskOffsetToFind -
					      foundDiskOffset);
				remainingLength -=
				    IoMetaDataRuns[numMetaDataRuns].byte_cnt;
				diskOffsetToFind +=
				    IoMetaDataRuns[numMetaDataRuns].byte_cnt;
				numMetaDataRuns++;
				continue;

			}
		}
	}

	ocfs_up_sem (&(osb->map_lock));

	/* whatever no map entry claimed is a trailing data run */
	if (remainingLength > 0) {
		IoDataRuns[numDataRuns].offset = diskOffsetToFind;
		IoDataRuns[numDataRuns].disk_off = diskOffsetToFind;
		IoDataRuns[numDataRuns].byte_cnt = remainingLength;
		numDataRuns++;
	}

	/* look for the specified offset in the map .if it exists then */
	/* do the read from cache, else go to disk. */
	for (i = 0; i < numDataRuns; i++) {
		__u64 newOffset = 0;
		__u32 newLength = IoDataRuns[i].byte_cnt;
		__u32 diff;

		newOffset = IoDataRuns[i].disk_off;
		/* position of this run inside the caller's buffer */
		diff = (__u32) (newOffset - Offset);

		status =
		    ocfs_read_force_disk (osb, (void *) ((__u8 *) Buffer + diff),
				       newLength, newOffset);
		if (status < 0) {
			LOG_ERROR_STATUS (status);
			goto finally;
		}
	}

	for (i = 0; i < numMetaDataRuns; i++) {
		__u64 newOffset = 0;
		__u32 diff;
		__u32 newLength = IoMetaDataRuns[i].byte_cnt;

		newOffset = IoMetaDataRuns[i].disk_off;

		/* position of this run inside the caller's buffer */
		diff = (__u32) (newOffset - Offset);

		status = LinuxReadForceDisk (osb, (void *) ((__u8 *) Buffer + diff),
				    newLength, newOffset, true);
		if (status < 0) {
			LOG_ERROR_STATUS (status);
			goto finally;
		}
	}

      finally:
	/* free only the arrays allocated here */
	if (IoDataRuns && free_data) {
		ocfs_free (IoDataRuns);
		IoDataRuns = NULL;
	}

	if (IoMetaDataRuns && free_md) {
		ocfs_free (IoMetaDataRuns);
		IoMetaDataRuns = NULL;
	}

	/* hand back only the prealloc locks taken at the top */
	ocfs_down_sem (&osb->osb_res, true);
	if (!free_data && OSB_PREALLOC_LOCK_TEST(osb, OSB_DATA_LOCK))
	{
		OSB_PREALLOC_LOCK_CLEAR(osb, OSB_DATA_LOCK);
	}
	if (!free_md && OSB_PREALLOC_LOCK_TEST(osb, OSB_MD_LOCK))
	{
		OSB_PREALLOC_LOCK_CLEAR(osb, OSB_MD_LOCK);
	}
	ocfs_up_sem(&osb->osb_res);


	LOG_EXIT_STATUS (status);
	return (status);
}				/* ocfs_read_metadata */

/*
 * ocfs_write_force_disk()
 *
 * Write Length bytes from Buffer at disk offset Offset by calling
 * LinuxWriteForceDisk() with Cached = false.  Returns that call's status.
 */
int ocfs_write_force_disk (ocfs_super * osb, void *Buffer, __u32 Length, __u64 Offset)
{
	int status;

	status = LinuxWriteForceDisk (osb, Buffer, Length, Offset, false);
	return status;
}				/* ocfs_write_force_disk */

/*
 * ocfs_write_disk()
 *
 * Write Length bytes from Buffer at disk offset Offset by calling
 * LinuxWriteForceDisk() with Cached = false.  Returns that call's status.
 */
int ocfs_write_disk (ocfs_super * osb, void *Buffer, __u32 Length, __u64 Offset)
{
	int status;

	status = LinuxWriteForceDisk (osb, Buffer, Length, Offset, false);
	return status;
}				/* ocfs_write_disk */

/*
 * ocfs_read_force_disk()
 *
 * Read Length bytes at disk offset Offset into Buffer by calling
 * LinuxReadForceDisk() with Cached = false.  Returns that call's status.
 */
int ocfs_read_force_disk (ocfs_super * osb, void *Buffer, __u32 Length, __u64 Offset)
{
	int status;

	status = LinuxReadForceDisk (osb, Buffer, Length, Offset, false);
	return status;
}				/* ocfs_read_force_disk */

/*
 * ocfs_read_force_disk_ex()
 *
 * Like ocfs_read_force_disk(), but allocates the destination when the
 * caller passes *Buffer == NULL: AllocLen bytes are obtained with
 * ocfs_malloc() and stored in *Buffer, then ReadLen bytes are read at
 * Offset.
 *
 * Returns -ENOMEM if the allocation fails, otherwise the status of the
 * read.  *Buffer is not freed here, even when the read fails.
 */
int ocfs_read_force_disk_ex (ocfs_super * osb, void **Buffer, __u32 AllocLen,
			     __u32 ReadLen, __u64 Offset)
{
	int status = 0;

	LOG_ENTRY ();

	/* allocate on behalf of the caller if no buffer was supplied */
	if (*Buffer == NULL) {
		*Buffer = ocfs_malloc (AllocLen);
		if (*Buffer == NULL) {
			LOG_ERROR_STATUS (status = -ENOMEM);
		}
	}

	/* only attempt the read when a buffer is available */
	if (status == 0) {
		status = ocfs_read_force_disk (osb, *Buffer, ReadLen, Offset);
		if (status < 0) {
			LOG_ERROR_STATUS (status);
		}
	}

	LOG_EXIT_STATUS (status);
	return status;
}				/* ocfs_read_force_disk_ex */

/*
 * ocfs_read_disk()
 *
 * Read Length bytes at disk offset Offset into Buffer.  Offsets below
 * vol_layout.bitmap_off go through LinuxReadForceDisk() with Cached =
 * false; all other offsets go through ocfs_read_metadata().  Returns the
 * status of whichever path was taken.
 */
int ocfs_read_disk (ocfs_super * osb, void *Buffer, __u32 Length, __u64 Offset)
{
	int status;

	if (Offset >= osb->vol_layout.bitmap_off)
		status = ocfs_read_metadata (osb, Buffer, Length, Offset);
	else
		status = LinuxReadForceDisk (osb, Buffer, Length, Offset, false);

	return status;
}				/* ocfs_read_disk */

/*
 * ocfs_read_disk_ex()
 *
 * Like ocfs_read_disk(), but allocates the destination when the caller
 * passes *Buffer == NULL: AllocLen bytes are obtained with ocfs_malloc()
 * and stored in *Buffer, then ReadLen bytes are read at Offset.
 *
 * Returns -ENOMEM if the allocation fails, otherwise the status of the
 * read.  *Buffer is not freed here, even when the read fails.
 */
int ocfs_read_disk_ex (ocfs_super * osb, void **Buffer, __u32 AllocLen,
		       __u32 ReadLen, __u64 Offset)
{
	int status = 0;

	LOG_ENTRY ();

	/* allocate on behalf of the caller if no buffer was supplied */
	if (*Buffer == NULL) {
		*Buffer = ocfs_malloc (AllocLen);
		if (*Buffer == NULL) {
			LOG_ERROR_STATUS (status = -ENOMEM);
		}
	}

	/* only attempt the read when a buffer is available */
	if (status == 0) {
		status = ocfs_read_disk (osb, *Buffer, ReadLen, Offset);
		if (status < 0) {
			LOG_ERROR_STATUS (status);
		}
	}

	LOG_EXIT_STATUS (status);
	return status;
}				/* ocfs_read_disk_ex */
