patch-2.0.35 linux/drivers/block/md.c
Next file: linux/drivers/block/paride/Config.in
Previous file: linux/drivers/block/ll_rw_blk.c
Back to the patch index
Back to the overall index
- Lines: 592
- Date:
Mon Jul 13 13:47:28 1998
- Orig file:
v2.0.34/linux/drivers/block/md.c
- Orig date:
Mon Jul 13 13:46:26 1998
diff -u --recursive --new-file v2.0.34/linux/drivers/block/md.c linux/drivers/block/md.c
@@ -9,6 +9,9 @@
kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ RAID-1/RAID-5 extensions by:
+ Ingo Molnar, Miguel de Icaza, Gadi Oxman
+
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
@@ -31,18 +34,27 @@
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
+#include <linux/smp_lock.h>
#ifdef CONFIG_KERNELD
#include <linux/kerneld.h>
#endif
#include <linux/errno.h>
+/*
+ * For kernel_thread()
+ */
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#include <linux/blk.h>
+#include <asm/bitops.h>
+#include <asm/atomic.h>
static struct hd_struct md_hd_struct[MAX_MD_DEV];
static int md_blocksizes[MAX_MD_DEV];
+static struct md_thread md_threads[MAX_MD_THREADS];
int md_size[MAX_MD_DEV]={0, };
@@ -91,8 +103,7 @@
if (!hd)
{
- printk ("No gendisk entry for dev %s\n", kdevname(dev));
- sprintf (name, "dev %s", kdevname(dev));
+ sprintf (name, "[dev %s]", kdevname(dev));
return (name);
}
@@ -117,23 +128,196 @@
read_ahead[MD_MAJOR]=minra;
}
+/*
+ * Size and offset setup for personalities that keep no raid superblock.
+ *
+ * Every member's size is rounded down to a multiple of the device's
+ * factor, the rounded sizes are added to md_size[minor], and each member's
+ * offset is set to the end of the member before it (0 for the first one).
+ * pnum is not used.  Always returns 0.
+ */
+static int legacy_raid_sb (int minor, int pnum)
+{
+	struct md_dev *mddev = md_dev + minor;
+	int factor = 1 << FACTOR_SHIFT(FACTOR(mddev));
+	int idx;
+
+	for (idx = 0; idx < mddev->nb_dev; idx++) {
+		mddev->devices[idx].size &= ~(factor - 1);
+		md_size[minor] += mddev->devices[idx].size;
+
+		if (idx == 0)
+			mddev->devices[idx].offset = 0;
+		else
+			mddev->devices[idx].offset = mddev->devices[idx - 1].offset +
+						     mddev->devices[idx - 1].size;
+	}
+	return 0;
+}
+
+/*
+ * Release the superblock pages held by an md device: the array-wide copy
+ * in mddev->sb and the per-member copy of each of its nb_dev devices.
+ * Each pointer is reset to NULL after its page is freed, so a second call
+ * on the same device finds nothing left to free.
+ */
+static void free_sb (struct md_dev *mddev)
+{
+	struct real_dev *member;
+	int n;
+
+	if (mddev->sb != NULL) {
+		free_page((unsigned long) mddev->sb);
+		mddev->sb = NULL;
+	}
+
+	for (n = 0; n < mddev->nb_dev; n++) {
+		member = &mddev->devices[n];
+		if (member->sb == NULL)
+			continue;
+		free_page((unsigned long) member->sb);
+		member->sb = NULL;
+	}
+}
+
+/*
+ * Read and validate the raid superblock of every member device of md
+ * device `minor`, then derive md_size[minor] from it.
+ *
+ * RAID-0 and linear arrays do not use a raid superblock and are handed to
+ * legacy_raid_sb() instead.
+ *
+ * For each member whose superblock block can be read, the magic is
+ * checked, a private copy is kept in realdev->sb, the constant words are
+ * compared against the array-wide copy in mddev->sb (seeded from the
+ * first readable member), and mddev->sb is replaced whenever a member
+ * carries a newer utime.  Members that cannot be read are reported and
+ * skipped.
+ *
+ * The buffer obtained from bread() is released with brelse() on every
+ * path once its contents are no longer needed; otherwise each call would
+ * leave one buffer reference held per member device.
+ *
+ * Returns 0 on success.  Returns 1, after freeing all superblock copies
+ * with free_sb(), if a superblock is invalid or inconsistent, a page
+ * cannot be allocated, no member was readable, or the array's version,
+ * state or raid level is not supported.
+ */
+static int analyze_sb (int minor, int pnum)
+{
+	int i;
+	struct md_dev *mddev = md_dev + minor;
+	struct buffer_head *bh = NULL;	/* non-NULL only while a buffer is held */
+	kdev_t dev;
+	struct real_dev *realdev;
+	u32 sb_offset, device_size;
+	md_superblock_t *sb = NULL;
+
+	/*
+	 * raid-0 and linear don't use a raid superblock
+	 */
+	if (pnum == RAID0 >> PERSONALITY_SHIFT || pnum == LINEAR >> PERSONALITY_SHIFT)
+		return legacy_raid_sb(minor, pnum);
+
+	/*
+	 * Verify the raid superblock on each real device
+	 */
+	for (i = 0; i < mddev->nb_dev; i++) {
+		realdev = mddev->devices + i;
+		dev = realdev->dev;
+		device_size = blk_size[MAJOR(dev)][MINOR(dev)];
+		realdev->sb_offset = sb_offset = MD_NEW_SIZE_BLOCKS(device_size);
+		set_blocksize(dev, MD_SB_BYTES);
+		bh = bread(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+		if (!bh) {
+			printk(KERN_ERR "md: disabled device %s\n", kdevname(dev));
+			continue;
+		}
+		sb = (md_superblock_t *) bh->b_data;
+		if (sb->md_magic != MD_SB_MAGIC) {
+			printk("md: %s: invalid raid superblock magic (%x) on block %u\n", kdevname(dev), sb->md_magic, sb_offset);
+			goto abort;
+		}
+		if (!mddev->sb) {
+			mddev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
+			if (!mddev->sb)
+				goto abort;
+			memcpy(mddev->sb, sb, MD_SB_BYTES);
+		}
+		realdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
+		if (!realdev->sb)
+			goto abort;
+		memcpy(realdev->sb, bh->b_data, MD_SB_BYTES);
+
+		if (memcmp(mddev->sb, sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
+			printk(KERN_ERR "md: superblock inconsistenty -- run ckraid\n");
+			goto abort;
+		}
+		/*
+		 * Find the newest superblock version
+		 */
+		if (sb->utime != mddev->sb->utime) {
+			printk(KERN_ERR "md: superblock update time inconsistenty -- using the most recent one\n");
+			if (sb->utime > mddev->sb->utime)
+				memcpy(mddev->sb, sb, MD_SB_BYTES);
+		}
+		realdev->size = sb->size;
+
+		/*
+		 * Everything needed has been copied out of the buffer;
+		 * drop the reference taken by bread().
+		 */
+		brelse(bh);
+		bh = NULL;
+	}
+	if (!mddev->sb) {
+		printk(KERN_ERR "md: couldn't access raid array %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
+		goto abort;
+	}
+	sb = mddev->sb;
+
+	/*
+	 * Check if we can support this raid array
+	 */
+	if (sb->major_version != MD_MAJOR_VERSION || sb->minor_version > MD_MINOR_VERSION) {
+		printk("md: %s: unsupported raid array version %d.%d.%d\n", kdevname(MKDEV(MD_MAJOR, minor)),
+		       sb->major_version, sb->minor_version, sb->patch_version);
+		goto abort;
+	}
+	if (sb->state != (1 << MD_SB_CLEAN)) {
+		printk(KERN_ERR "md: %s: raid array is not clean -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
+		goto abort;
+	}
+	switch (sb->level) {
+		case 1:
+			md_size[minor] = sb->size;
+			break;
+		case 4:
+		case 5:
+			md_size[minor] = sb->size * (sb->raid_disks - 1);
+			break;
+		default:
+			printk(KERN_ERR "md: %s: unsupported raid level %d\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
+			goto abort;
+	}
+	return 0;
+abort:
+	/* Release the buffer still held if we bailed out inside the loop. */
+	if (bh)
+		brelse(bh);
+	free_sb(mddev);
+	return 1;
+}
+
+/*
+ * Stamp the array superblock of md device `minor` with the current time
+ * and write it to every member device that holds a per-device superblock
+ * copy (members without one are skipped).
+ *
+ * For each such member the block at its recorded sb_offset is filled with
+ * the array superblock, the member's own descriptor is copied into the
+ * `descriptor` field from the disks[] table, and the buffer is written
+ * out synchronously, forgotten, and the device is synced and its buffers
+ * invalidated.  A getblk() failure is logged and the member is skipped.
+ *
+ * The caller must ensure mddev->sb is valid.  Always returns 0.
+ */
+int md_update_sb(int minor)
+{
+	struct md_dev *mddev = md_dev + minor;
+	int n;
+
+	mddev->sb->utime = CURRENT_TIME;
+
+	for (n = 0; n < mddev->nb_dev; n++) {
+		struct real_dev *member = mddev->devices + n;
+		struct buffer_head *bh;
+		md_superblock_t *disk_sb;
+		kdev_t dev;
+		u32 sb_offset;
+
+		if (!member->sb)
+			continue;
+
+		dev = member->dev;
+		sb_offset = member->sb_offset;
+		set_blocksize(dev, MD_SB_BYTES);
+		printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
+
+		bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
+		if (!bh) {
+			printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
+			continue;
+		}
+
+		/* Build this member's on-disk image directly in the buffer. */
+		disk_sb = (md_superblock_t *) bh->b_data;
+		memcpy(disk_sb, mddev->sb, MD_SB_BYTES);
+		memcpy(&disk_sb->descriptor, disk_sb->disks + member->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
+
+		mark_buffer_uptodate(bh, 1);
+		mark_buffer_dirty(bh, 1);
+		ll_rw_block(WRITE, 1, &bh);
+		wait_on_buffer(bh);
+		bforget(bh);
+		fsync_dev(dev);
+		invalidate_buffers(dev);
+	}
+	return 0;
+}
static int do_md_run (int minor, int repart)
{
- int pnum, i, min, current_ra, err;
-
+ int pnum, i, min, factor, current_ra, err;
+
if (!md_dev[minor].nb_dev)
return -EINVAL;
if (md_dev[minor].pers)
return -EBUSY;
-
+
md_dev[minor].repartition=repart;
- if ((pnum=PERSONALITY(md_dev+minor) >> (PERSONALITY_SHIFT))
+ if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
>= MAX_PERSONALITY)
return -EINVAL;
-
+
+ /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
+ if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
+ for (i = 0; i < md_dev [minor].nb_dev; i++)
+ if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
+ return -EINVAL;
+ }
if (!pers[pnum])
{
#ifdef CONFIG_KERNELD
@@ -145,7 +329,7 @@
return -EINVAL;
}
- min=1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
+ factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));
for (i=0; i<md_dev[minor].nb_dev; i++)
if (md_dev[minor].devices[i].size<min)
@@ -154,26 +338,37 @@
partition_name (md_dev[minor].devices[i].dev), min);
return -EINVAL;
}
+
+ for (i=0; i<md_dev[minor].nb_dev; i++) {
+ fsync_dev(md_dev[minor].devices[i].dev);
+ invalidate_buffers(md_dev[minor].devices[i].dev);
+ }
/* Resize devices according to the factor. It is used to align
partitions size on a given chunk size. */
md_size[minor]=0;
-
- for (i=0; i<md_dev[minor].nb_dev; i++)
- {
- md_dev[minor].devices[i].size &= ~(min - 1);
- md_size[minor] += md_dev[minor].devices[i].size;
- md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset + md_dev[minor].devices[i-1].size) : 0;
- }
+
+ /*
+ * Analyze the raid superblock
+ */
+ if (analyze_sb(minor, pnum))
+ return -EINVAL;
md_dev[minor].pers=pers[pnum];
if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
{
md_dev[minor].pers=NULL;
+ free_sb(md_dev + minor);
return (err);
}
-
+
+ if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
+ {
+ md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
+ md_update_sb(minor);
+ }
+
/* FIXME : We assume here we have blocks
that are twice as large as sectors.
THIS MAY NOT BE TRUE !!! */
@@ -191,7 +386,6 @@
read_ahead[MD_MAJOR]=current_ra;
- printk ("START_DEV md%x %s\n", minor, md_dev[minor].pers->name);
return (0);
}
@@ -211,38 +405,40 @@
/* The device won't exist anymore -> flush it now */
fsync_dev (inode->i_rdev);
invalidate_buffers (inode->i_rdev);
+ if (md_dev[minor].sb)
+ {
+ md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
+ md_update_sb(minor);
+ }
md_dev[minor].pers->stop (minor, md_dev+minor);
}
/* Remove locks. */
+ if (md_dev[minor].sb)
+ free_sb(md_dev + minor);
for (i=0; i<md_dev[minor].nb_dev; i++)
clear_inode (md_dev[minor].devices[i].inode);
-
+
md_dev[minor].nb_dev=md_size[minor]=0;
md_hd_struct[minor].nr_sects=0;
md_dev[minor].pers=NULL;
set_ra (); /* calculate new read_ahead */
- printk ("STOP_DEV md%x\n", minor);
return (0);
}
static int do_md_add (int minor, kdev_t dev)
{
- struct gendisk *gen_real;
int i;
-
- if (MAJOR(dev)==MD_MAJOR || md_dev[minor].nb_dev==MAX_REAL)
+
+ if (md_dev[minor].nb_dev==MAX_REAL)
return -EINVAL;
if (!fs_may_mount (dev) || md_dev[minor].pers)
return -EBUSY;
-
- if (!(gen_real=find_gendisk (dev)))
- return -ENOENT;
-
+
i=md_dev[minor].nb_dev++;
md_dev[minor].devices[i].dev=dev;
@@ -258,7 +454,13 @@
/* Sizes are now rounded at run time */
- md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)];
+/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
+
+ if (blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
+ printk("md_add(): zero device size, huh, bailing out.\n");
+ }
+
+ md_dev[minor].devices[i].size=blk_size[MAJOR(dev)][MINOR(dev)];
printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
return (0);
@@ -420,6 +622,27 @@
return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
}
+/*
+ * Submit a buffer to md device `minor`.
+ *
+ * If the personality supplies no make_request hook, the buffer is passed
+ * to the generic make_request() for the major of bh->b_rdev and 0 is
+ * returned.
+ *
+ * Otherwise the request is dropped (0 returned) when the buffer is already
+ * locked, when a write finds it not dirty, or when a read finds it already
+ * up to date.  In all other cases the buffer is marked locked and the
+ * personality's make_request result is returned.
+ */
+int md_make_request (int minor, int rw, struct buffer_head * bh)
+{
+	struct md_dev *mddev = md_dev + minor;
+	int is_write = (rw == WRITE || rw == WRITEA);
+	int is_read = (rw == READ || rw == READA);
+
+	if (!mddev->pers->make_request) {
+		make_request (MAJOR(bh->b_rdev), rw, bh);
+		return 0;
+	}
+
+	if (buffer_locked(bh))
+		return 0;
+
+	if (is_write) {
+		if (!buffer_dirty(bh))
+			return 0;
+		set_bit(BH_Lock, &bh->b_state);
+	}
+	if (is_read) {
+		if (buffer_uptodate(bh))
+			return 0;
+		set_bit (BH_Lock, &bh->b_state);
+	}
+
+	return (mddev->pers->make_request(mddev, rw, bh));
+}
static void do_md_request (void)
{
@@ -427,6 +650,40 @@
return;
}
+/*
+ * MAX_MD_THREADS kernel threads are started from md_init() and handed out
+ * at run time.  This is not so elegant, but how can we use kernel_thread()
+ * from within loadable modules?
+ *
+ * md_register_thread() claims the first slot whose run handler is NULL,
+ * stores `run` and `data` in it and returns the slot.  Returns NULL when
+ * every slot is already taken.
+ */
+struct md_thread *md_register_thread (void (*run) (void *), void *data)
+{
+	struct md_thread *slot;
+
+	for (slot = md_threads; slot < md_threads + MAX_MD_THREADS; slot++) {
+		if (slot->run != NULL)
+			continue;
+		slot->run = run;
+		slot->data = data;
+		return slot;
+	}
+	return NULL;
+}
+
+
+/*
+ * Give a thread slot back: clears the run handler, its data pointer and
+ * the flags word.  With run == NULL the slot can be claimed again by
+ * md_register_thread(), and md_thread() no longer calls a handler for it.
+ * The underlying kernel thread itself is not stopped here.
+ */
+void md_unregister_thread (struct md_thread *thread)
+{
+ thread->run = NULL;
+ thread->data = NULL;
+ thread->flags = 0;
+}
+
+/*
+ * Ask an md thread to run its handler: set THREAD_WAKEUP in the thread's
+ * flags, then wake anything sleeping on its wait queue.  md_thread()
+ * tests this flag before going to sleep, so the flag is set first.
+ */
+void md_wakeup_thread(struct md_thread *thread)
+{
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+}
+
+struct buffer_head *efind_buffer(kdev_t dev, int block, int size);
+
static struct symbol_table md_symbol_table=
{
#include <linux/symtab_begin.h>
@@ -435,11 +692,18 @@
X(register_md_personality),
X(unregister_md_personality),
X(partition_name),
+ X(md_dev),
+ X(md_error),
+ X(md_register_thread),
+ X(md_unregister_thread),
+ X(md_update_sb),
+ X(md_map),
+ X(md_wakeup_thread),
+ X(efind_buffer),
#include <linux/symtab_end.h>
};
-
static void md_geninit (struct gendisk *gdisk)
{
int i;
@@ -463,6 +727,17 @@
});
}
+/*
+ * Report an error on real device `rdev` belonging to md device `mddev`.
+ *
+ * Panics if `mddev` is not an md device, if its minor is outside the
+ * md_dev[] table (valid minors are 0 .. MAX_MD_DEV - 1), or if no
+ * personality is running on it.  Otherwise returns the result of the
+ * personality's error_handler, or 0 when the personality has none.
+ */
+int md_error (kdev_t mddev, kdev_t rdev)
+{
+	unsigned int minor = MINOR (mddev);
+
+	/* ">=" here: minor == MAX_MD_DEV is already one past the table. */
+	if (MAJOR(mddev) != MD_MAJOR || minor >= MAX_MD_DEV)
+		panic ("md_error gets unknown device\n");
+	if (!md_dev [minor].pers)
+		panic ("md_error gets an error for an unknown device\n");
+	if (md_dev [minor].pers->error_handler)
+		return (md_dev [minor].pers->error_handler (md_dev+minor, rdev));
+	return 0;
+}
int get_md_status (char *page)
{
@@ -495,9 +770,13 @@
partition_name(md_dev[i].devices[j].dev));
size+=md_dev[i].devices[j].size;
}
-
- if (md_dev[i].nb_dev)
- sz+=sprintf (page+sz, " %d blocks", size);
+
+ if (md_dev[i].nb_dev) {
+ if (md_dev[i].pers)
+ sz+=sprintf (page+sz, " %d blocks", md_size[i]);
+ else
+ sz+=sprintf (page+sz, " %d blocks", size);
+ }
if (!md_dev[i].pers)
{
@@ -508,11 +787,8 @@
if (md_dev[i].pers->max_invalid_dev)
sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));
- sz+=sprintf (page+sz, " %dk %s\n", 1<<FACTOR_SHIFT(FACTOR(md_dev+i)),
- md_dev[i].pers == pers[LINEAR>>PERSONALITY_SHIFT] ?
- "rounding" : "chunks");
-
sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
+ sz+=sprintf (page+sz, "\n");
}
return (sz);
@@ -545,6 +821,32 @@
return 0;
}
+/*
+ * Body of each md kernel thread; `arg` is the struct md_thread slot this
+ * thread serves.  Never returns.
+ *
+ * Each pass clears THREAD_WAKEUP, calls the slot's run handler with its
+ * data pointer if one is registered, runs the tq_disk task queue, clears
+ * pending signals, and then sleeps on the slot's wait queue unless
+ * THREAD_WAKEUP was set again in the meantime.
+ */
+int md_thread(void * arg)
+{
+ struct md_thread *thread = arg;
+
+ current->session = 1;
+ current->pgrp = 1;
+ sprintf(current->comm, "md_thread");
+
+#ifdef __SMP__
+ lock_kernel();
+ syscall_count++;
+#endif
+ for (;;) {
+ sti();
+ /* Clear the flag before running so a wakeup raised during run() is seen below. */
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+ if (thread->run) {
+ thread->run(thread->data);
+ run_task_queue(&tq_disk);
+ }
+ current->signal = 0;
+ /*
+ * NOTE(review): interrupts are disabled before the flag test,
+ * presumably so a wakeup cannot slip in between the test and
+ * the sleep; they are re-enabled by sti() at the top of the loop.
+ */
+ cli();
+ if (!test_bit(THREAD_WAKEUP, &thread->flags))
+ interruptible_sleep_on(&thread->wqueue);
+ }
+}
+
void linear_init (void);
void raid0_init (void);
void raid1_init (void);
@@ -552,7 +854,11 @@
int md_init (void)
{
- printk ("md driver %s MAX_MD_DEV=%d, MAX_REAL=%d\n", MD_VERSION, MAX_MD_DEV, MAX_REAL);
+ int i;
+
+ printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
+ MAX_MD_DEV, MAX_REAL);
if (register_blkdev (MD_MAJOR, "md", &md_fops))
{
@@ -560,9 +866,17 @@
return (-1);
}
+ for (i = 0; i < MAX_MD_THREADS; i++) {
+ md_threads[i].run = NULL;
+ init_waitqueue(&md_threads[i].wqueue);
+ md_threads[i].flags = 0;
+ kernel_thread (md_thread, md_threads + i, 0);
+ }
+
blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
blk_dev[MD_MAJOR].current_request=NULL;
read_ahead[MD_MAJOR]=INT_MAX;
+ memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
md_gendisk.next=gendisk_head;
gendisk_head=&md_gendisk;
@@ -572,6 +886,12 @@
#endif
#ifdef CONFIG_MD_STRIPED
raid0_init ();
+#endif
+#ifdef CONFIG_MD_MIRRORING
+ raid1_init ();
+#endif
+#ifdef CONFIG_MD_RAID5
+ raid5_init ();
#endif
return (0);
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov