patch-2.4.20 linux-2.4.20/arch/x86_64/kernel/process.c


diff -urN linux-2.4.19/arch/x86_64/kernel/process.c linux-2.4.20/arch/x86_64/kernel/process.c
@@ -0,0 +1,729 @@
+/*
+ *  linux/arch/x86-64/kernel/process.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * 
+ *  X86-64 port
+ *	Andi Kleen.
+ * 
+ *  $Id: process.c,v 1.57 2002/09/12 12:56:36 ak Exp $
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#define __KERNEL_SYSCALLS__
+#include <stdarg.h>
+
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/interrupt.h>
+#include <linux/config.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/ctype.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+#include <asm/pda.h>
+#include <asm/prctl.h>
+#include <asm/kdebug.h>
+#include <asm/proto.h>
+
+#include <linux/irq.h>
+
+asmlinkage extern void ret_from_fork(void);
+
+int hlt_counter;
+
+/*
+ * Power management idle function, if any..
+ */
+void (*pm_idle)(void);
+
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+
+void disable_hlt(void)
+{
+	hlt_counter++;
+}
+
+void enable_hlt(void)
+{
+	hlt_counter--;
+}
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+static void default_idle(void)
+{
+	if (!hlt_counter) {
+		__cli();
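+		/* Re-check need_resched with interrupts off, then halt:
+		 * safe_halt() is "sti; hlt", and the one-instruction STI
+		 * shadow leaves no window for a wakeup interrupt to slip
+		 * in between the check and the hlt. */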
+		if (!current->need_resched)
+			safe_halt();
+		else
+			__sti();
+	}
+}
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle (void)
+{
+	int oldval;
+
+	__sti();
+
+	/*
+	 * Deal with another CPU just having chosen a thread to
+	 * run here:
+	 */
+	oldval = xchg(&current->need_resched, -1);
+
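+	/* -1 is a sentinel only the idle loop writes: a CPU waking a
+	 * task stores a different value to need_resched, so the
+	 * "cmpl $-1" spin below falls through as soon as it does. */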
+	if (!oldval)
+		asm volatile(
+			"2:"
+			"cmpl $-1, %0;"
+			"rep; nop;"
+			"je 2b;"
+				: :"m" (current->need_resched));
+}
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle (void)
+{
+	/* endless idle loop with no priority at all */
+	init_idle();
+	current->nice = 20;
+	current->counter = -100;
+
+	while (1) {
+		void (*idle)(void) = pm_idle;
+		if (!idle)
+			idle = default_idle;
+		while (!current->need_resched)
+			idle();
+		schedule();
+		check_pgt_cache();
+	}
+}
+
+static int __init idle_setup (char *str)
+{
+	if (!strncmp(str, "poll", 4)) {
+		printk("using polling idle threads.\n");
+		pm_idle = poll_idle;
+	}
+
+	return 1;
+}
+
+__setup("idle=", idle_setup);
+
+static long no_idt[3];
+static int reboot_mode;
+
+#ifdef CONFIG_SMP
+int reboot_smp = 0;
+static int reboot_cpu = -1;
+#endif
+static int __init reboot_setup(char *str)
+{
+	while(1) {
+		switch (*str) {
+		case 'w': /* "warm" reboot (no memory testing etc) */
+			reboot_mode = 0x1234;
+			break;
+		case 'c': /* "cold" reboot (with memory testing etc) */
+			reboot_mode = 0x0;
+			break;
+#ifdef CONFIG_SMP
+		case 's': /* "smp" reboot by executing reset on BSP or other CPU */
+			reboot_smp = 1;
+			if (isdigit(str[1]))
+				sscanf(str+1, "%d", &reboot_cpu);		
+			else if (!strncmp(str,"smp",3))
+				sscanf(str+3, "%d", &reboot_cpu); 
+			/* we will leave sorting out the final value
+			   when we are ready to reboot, since we might not
+			   have set up boot_cpu_id or smp_num_cpu */
+			break;
+#endif
+		}
+		if((str = strchr(str,',')) != NULL)
+			str++;
+		else
+			break;
+	}
+	return 1;
+}
+
+__setup("reboot=", reboot_setup);
+
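+/*
+ * Wait for the 8042 keyboard controller to drain its input buffer:
+ * bit 1 of status port 0x64 stays set while the controller is busy.
+ */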
+static inline void kb_wait(void)
+{
+	int i;
+
+	for (i=0; i<0x10000; i++)
+		if ((inb_p(0x64) & 0x02) == 0)
+			break;
+}
+
+void machine_restart(char * __unused)
+{
+#ifdef CONFIG_SMP
+	int cpuid;
+	
+	cpuid = GET_APIC_ID(apic_read(APIC_ID));
+
+	if (reboot_smp) {
+
+		/* check to see if reboot_cpu is valid;
+		   if it's not, default to the BSP */
+		if ((reboot_cpu == -1) ||  
+		      (reboot_cpu > (NR_CPUS -1))  || 
+		      !(phys_cpu_present_map & (1<<cpuid))) 
+			reboot_cpu = boot_cpu_id;
+
+		reboot_smp = 0;  /* use this as a flag to only go through this once */
+		/* re-run this function on the other CPUs;
+		   it will fall through this section since we have
+		   cleared reboot_smp, and do the reboot if it is the
+		   correct CPU, otherwise it halts. */
+		if (reboot_cpu != cpuid)
+			smp_call_function((void *)machine_restart, NULL, 1, 0);
+	}
+
+	/* if reboot_cpu is still -1, then we want a traditional reboot,
+	   and if we are not running on the reboot_cpu, halt */
+	if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
+		for (;;)
+			__asm__ __volatile__ ("hlt");
+	}
+	/*
+	 * Stop all CPUs and turn off local APICs and the IO-APIC, so
+	 * other OSs see a clean IRQ state.
+	 */
+	if (notify_die(DIE_STOP,"cpustop",0,0) != NOTIFY_BAD)
+		smp_send_stop();
+	disable_IO_APIC();
+#endif
+	/* Could do reset through the northbridge of the Hammer here. */
+
+	/* rebooting needs to touch the page at absolute addr 0 */
+	*((unsigned short *)__va(0x472)) = reboot_mode;
+	for (;;) {
+		int i;
+		/* First fiddle with the keyboard controller. */
+		for (i=0; i<100; i++) {
+			kb_wait();
+			udelay(50);
+			outb(0xfe,0x64);         /* pulse reset low */
+			udelay(50);
+		}
+		/* That didn't work - force a triple fault.. */
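+		/* With a zero-limit IDT loaded, the int3 below cannot be
+		   delivered; it escalates to a double and then a triple
+		   fault, which resets the processor. */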
+		__asm__ __volatile__("lidt %0": :"m" (no_idt));
+		__asm__ __volatile__("int3");
+	}
+}
+
+void machine_halt(void)
+{
+}
+
+void machine_power_off(void)
+{
+	if (pm_power_off)
+		pm_power_off();
+}
+
+/* Prints also some state that isn't saved in the pt_regs */ 
+void show_regs(struct pt_regs * regs)
+{
+	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+	unsigned int fsindex,gsindex;
+	unsigned int ds,cs,es; 
+
+	printk("\n");
+	printk("Pid: %d, comm: %.20s %s\n", current->pid, current->comm, print_tainted());
+	printk("RIP: %04lx:[<%016lx>]\n", regs->cs & 0xffff, regs->rip);
+	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+	       regs->rax, regs->rbx, regs->rcx);
+	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+	       regs->rdx, regs->rsi, regs->rdi); 
+	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+	       regs->rbp, regs->r8, regs->r9); 
+	printk("R10: %016lx R11: %016lx R12: %016lx\n",
+	       regs->r10, regs->r11, regs->r12); 
+	printk("R13: %016lx R14: %016lx R15: %016lx\n",
+	       regs->r13, regs->r14, regs->r15); 
+
+	asm("movl %%ds,%0" : "=r" (ds)); 
+	asm("movl %%cs,%0" : "=r" (cs)); 
+	asm("movl %%es,%0" : "=r" (es)); 
+	asm("movl %%fs,%0" : "=r" (fsindex));
+	asm("movl %%gs,%0" : "=r" (gsindex));
+
+	rdmsrl(MSR_FS_BASE, fs);
+	rdmsrl(MSR_GS_BASE, gs); 
+	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 
+
+	asm("movq %%cr0, %0": "=r" (cr0));
+	asm("movq %%cr2, %0": "=r" (cr2));
+	asm("movq %%cr3, %0": "=r" (cr3));
+	asm("movq %%cr4, %0": "=r" (cr4));
+
+	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
+	       fs,fsindex,gs,gsindex,shadowgs); 
+	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 
+	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ */
+void release_segments(struct mm_struct *mm)
+{
+	void * ldt = mm->context.segments;
+
+	/*
+	 * free the LDT
+	 */
+	if (ldt) {
+		mm->context.segments = NULL;
+		clear_LDT();
+		vfree(ldt);
+	}
+}
+
+/* 
+ * Reloading %gs is a bit complicated because the kernel relies on it;
+ * this includes the exception handlers, so we cannot take any exceptions
+ * while doing this. Check the new gs value manually for validity and only
+ * then load it. This needs locking against parallel CPUs that share the same
+ * LDT. This has to be done in the context switch iff %gs changes.
+ */
+void load_gs_index(unsigned gs)
+{
+	struct mm_struct *mm = current->mm;
+	int access; 
+	/* paranoia: */
+	if ((gs & 3) != 2) gs = 0;
+	if (mm) 
+		read_lock(&mm->context.ldtlock); 
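+	/* swapgs first, so the mov to %gs below reloads the (future)
+	 * user GS base instead of clobbering the kernel PDA base; the
+	 * second swapgs restores the kernel view. */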
+	asm volatile("pushf\n\t" 
+		     "cli\n\t"
+		     "swapgs\n\t"
+		     /* cannot take any exception until the next swapgs */
+		     "lar %1,%0\n\t"
+		     "jnz 1f\n\t"
+		     "movl %1,%%eax\n\t"
+		     "movl %%eax,%%gs\n\t"
+		     "jmp 2f\n\t"
+		     "1: movl %2,%%gs\n\t"
+		     "2: swapgs\n\t"
+		     "popf" : "=g" (access) : "g" (gs), "r" (0) : "rax"); 
+	if (mm)
+		read_unlock(&mm->context.ldtlock);
+}
+	
+/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+	struct task_struct *me = current;
+	if (me->thread.io_bitmap_ptr) { 
+		(init_tss + smp_processor_id())->io_map_base = 
+			INVALID_IO_BITMAP_OFFSET;  
+		kfree(me->thread.io_bitmap_ptr); 
+		me->thread.io_bitmap_ptr = NULL; 		
+	} 
+}
+
+void flush_thread(void)
+{
+	struct task_struct *tsk = current;
+
+	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
+	/*
+	 * Forget coprocessor state..
+	 */
+	clear_fpu(tsk);
+	tsk->used_math = 0;
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+	if (dead_task->mm) {
+		void * ldt = dead_task->mm->context.segments;
+
+		/* temporary debugging check */
+		if (ldt) {
+			printk("WARNING: dead process %8s still has LDT? <%p>\n",
+					dead_task->comm, ldt);
+			BUG();
+		}
+	}
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+void copy_segments(struct task_struct *p, struct mm_struct *new_mm)
+{
+	struct mm_struct * old_mm;
+	void *old_ldt, *ldt;
+ 
+	ldt = NULL;
+	old_mm = current->mm;
+	if (old_mm && (old_ldt = old_mm->context.segments) != NULL) {
+		/*
+		 * Completely new LDT, we initialize it from the parent:
+		 */
+		ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE);
+		if (!ldt)
+			printk(KERN_WARNING "ldt allocation failed\n");
+		else
+			memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
+	}
+	new_mm->context.segments = ldt;
+	new_mm->context.cpuvalid = 0UL;
+	return;
+}
+
+int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
+		unsigned long unused,
+		struct task_struct * p, struct pt_regs * regs)
+{
+	struct pt_regs * childregs;
+	struct task_struct *me = current;
+
+	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
+
+	*childregs = *regs;
+
+	childregs->rax = 0;
+	childregs->rsp = rsp;
+	if (rsp == ~0) {
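+		/* An rsp of ~0 is the kernel-thread case: point the
+		 * child's stack at its own pt_regs. */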
+		childregs->rsp = (unsigned long)childregs;
+	}
+
+	p->thread.rsp = (unsigned long) childregs;
+	p->thread.rsp0 = (unsigned long) (childregs+1);
+	p->thread.userrsp = current->thread.userrsp; 
+
+	p->thread.rip = (unsigned long) ret_from_fork;
+
+	p->thread.fs = me->thread.fs;
+	p->thread.gs = me->thread.gs;
+
+	asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
+	asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
+	asm("movl %%es,%0" : "=m" (p->thread.es));
+	asm("movl %%ds,%0" : "=m" (p->thread.ds));
+
+	unlazy_fpu(current);	
+	p->thread.i387 = current->thread.i387;
+
+	if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 
+		p->thread.io_bitmap_ptr = kmalloc((IO_BITMAP_SIZE+1)*4, GFP_KERNEL);
+		if (!p->thread.io_bitmap_ptr) 
+			return -ENOMEM;
+		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, 
+		       (IO_BITMAP_SIZE+1)*4);
+	} 
+
+	return 0;
+}
+
+/*
+ * This special macro can be used to load a debugging register
+ */
+#define loaddebug(thread,register) \
+		set_debug(thread->debugreg[register], register)
+
+/*
+ *	switch_to(x,y) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. 
+ * 
+ * This could still be optimized: 
+ * - fold all the options into a flag word and test it with a single test.
+ * - could test fs/gs bitsliced
+ */
+struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+	struct thread_struct *prev = &prev_p->thread,
+				 *next = &next_p->thread;
+	struct tss_struct *tss = init_tss + smp_processor_id();
+
+	unlazy_fpu(prev_p);
+
+	/*
+	 * Reload rsp0, LDT and the page table pointer:
+	 */
+	tss->rsp0 = next->rsp0;
+
+	/* 
+	 * Switch DS and ES.	 
+	 */
+	asm volatile("movl %%es,%0" : "=m" (prev->es)); 
+	if (unlikely(next->es | prev->es))
+		loadsegment(es, next->es); 
+	
+	asm volatile ("movl %%ds,%0" : "=m" (prev->ds)); 
+	if (unlikely(next->ds | prev->ds))
+		loadsegment(ds, next->ds);
+
+	/* 
+	 * Switch FS and GS.
+	 * XXX Check if this is safe on SMP (!= -> |)
+	 */
+	{ 
+		unsigned int fsindex;
+
+		asm volatile("movl %%fs,%0" : "=g" (fsindex)); 
+		if (unlikely(fsindex != next->fsindex)) /* or likely? */
+			loadsegment(fs, next->fsindex);
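+		/* If prev reloaded %fs itself, the selector load already
+		 * replaced the FS base, so the cached base in prev->fs
+		 * is stale; record it as 0. */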
+		if (unlikely(fsindex != prev->fsindex))
+			prev->fs = 0; 
+		if ((fsindex != prev->fsindex) || (prev->fs != next->fs))
+			wrmsrl(MSR_FS_BASE, next->fs); 
+		prev->fsindex = fsindex;
+	}
+	{
+		unsigned int gsindex;
+
+		asm volatile("movl %%gs,%0" : "=g" (gsindex)); 
+		if (unlikely(gsindex != next->gsindex))
+			load_gs_index(next->gs); 
+		if (unlikely(gsindex != prev->gsindex)) 
+			prev->gs = 0;				
+		if (gsindex != prev->gsindex || prev->gs != next->gs)
+			wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
+		prev->gsindex = gsindex;
+	}
+
+	/* 
+	 * Switch the PDA context.
+	 */
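+	/* (The PDA, addressed through the kernel GS base, caches the
+	 *  current task, its syscall kernel stack and the saved user
+	 *  %rsp used by the syscall entry/exit paths.) */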
+	prev->userrsp = read_pda(oldrsp); 
+	write_pda(oldrsp, next->userrsp); 
+	write_pda(pcurrent, next_p); 
+	write_pda(kernelstack, (unsigned long)next_p + THREAD_SIZE - PDA_STACKOFFSET);
+
+	/*
+	 * Now maybe reload the debug registers
+	 */
+	if (unlikely(next->debugreg[7])) {
+		loaddebug(next, 0);
+		loaddebug(next, 1);
+		loaddebug(next, 2);
+		loaddebug(next, 3);
+		/* no 4 and 5 */
+		loaddebug(next, 6);
+		loaddebug(next, 7);
+	}
+
+
+	/* 
+	 * Handle the IO bitmap 
+	 */ 
+	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+		if (next->io_bitmap_ptr) {
+			/*
+			 * 4 cachelines copy ... not good, but not that
+			 * bad either. Anyone got something better?
+			 * This only affects processes which use ioperm().
+			 * [Putting the TSSs into 4k-tlb mapped regions
+			 * and playing VM tricks to switch the IO bitmap
+			 * is not really acceptable.]
+			 */
+			memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+				 IO_BITMAP_SIZE*sizeof(u32));
+			tss->io_map_base = IO_BITMAP_OFFSET;
+		} else {
+			/*
+			 * a bitmap offset pointing outside of the TSS limit
+			 * causes a nicely controllable SIGSEGV if a process
+			 * tries to use a port IO instruction. The first
+			 * sys_ioperm() call sets up the bitmap properly.
+			 */
+			tss->io_map_base = INVALID_IO_BITMAP_OFFSET;
+		}
+	}
+
+
+	return prev_p;
+}
+
+/*
+ * sys_execve() executes a new program.
+ */
+asmlinkage 
+long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
+{
+	long error;
+	char * filename;
+
+	filename = getname(name);
+	error = PTR_ERR(filename);
+	if (IS_ERR(filename)) 
+		return error;
+	error = do_execve(filename, argv, envp, &regs); 
+	if (error == 0)
+		current->ptrace &= ~PT_DTRACE;
+	putname(filename);
+	return error;
+}
+
+void set_personality_64bit(void)
+{
+	/* inherit personality from parent */
+
+	/* Make sure to be in 64bit mode */
+	current->thread.flags = 0;
+}
+
+asmlinkage long sys_fork(struct pt_regs regs)
+{
+	return do_fork(SIGCHLD, regs.rsp, &regs, 0);
+}
+
+asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, struct pt_regs regs)
+{
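+	/* A newsp of 0 means "use the caller's current stack"; compare
+	   sys_fork() above, which always passes regs.rsp. */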
+	if (!newsp)
+		newsp = regs.rsp;
+	return do_fork(clone_flags, newsp, &regs, 0);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+asmlinkage long sys_vfork(struct pt_regs regs)
+{
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, &regs, 0);
+}
+
+/*
+ * These bracket the sleeping functions..
+ */
+extern void scheduling_functions_start_here(void);
+extern void scheduling_functions_end_here(void);
+#define first_sched	((unsigned long) scheduling_functions_start_here)
+#define last_sched	((unsigned long) scheduling_functions_end_here)
+
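+/*
+ * Estimate where a sleeping task is blocked by walking its frame
+ * pointer chain: the first word of the switched-out stack is the
+ * saved %rbp, and each return address sits 8 bytes above a frame
+ * pointer. The first return address outside the scheduler text
+ * section is reported as the wait channel.
+ */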
+unsigned long get_wchan(struct task_struct *p)
+{
+	u64 fp,rip;
+	int count = 0;
+
+	if (!p || p == current || p->state==TASK_RUNNING)
+		return 0; 
+	if (p->thread.rsp < (u64)p || p->thread.rsp > (u64)p + THREAD_SIZE)
+		return 0;
+	fp = *(u64 *)(p->thread.rsp);
+	do { 
+		if (fp < (unsigned long)p || fp > (unsigned long)p+THREAD_SIZE)
+			return 0; 
+		rip = *(u64 *)(fp+8); 
+		if (rip < first_sched || rip >= last_sched)
+			return rip; 
+		fp = *(u64 *)fp; 
+	} while (count++ < 16); 
+	return 0;
+}
+#undef last_sched
+#undef first_sched
+
+asmlinkage long sys_arch_prctl(int code, unsigned long addr)
+{ 
+	int ret = 0; 
+	unsigned long tmp; 
+
+	switch (code) { 
+	case ARCH_SET_GS:
+		if (addr >= TASK_SIZE) 
+			return -EPERM; 
+		asm volatile("movw %%gs,%0" : "=g" (current->thread.gsindex)); 
+		current->thread.gs = addr;
+		ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 
+		break;
+	case ARCH_SET_FS:
+		/* Not strictly needed for fs, but do it for symmetry
+		   with gs */
+		if (addr >= TASK_SIZE)
+			return -EPERM; 
+		asm volatile("movw %%fs,%0" : "=g" (current->thread.fsindex)); 
+		current->thread.fs = addr;
+		ret = checking_wrmsrl(MSR_FS_BASE, addr); 
+		break;
+
+		/* The returned value may not be correct when the user changed fs/gs */
+	case ARCH_GET_FS:
+		rdmsrl(MSR_FS_BASE, tmp);
+		ret = put_user(tmp, (unsigned long *)addr); 
+		break; 
+
+	case ARCH_GET_GS: 
+		rdmsrl(MSR_KERNEL_GS_BASE, tmp); 
+		ret = put_user(tmp, (unsigned long *)addr); 
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	} 
+	return ret;	
+} 
+
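+/*
+ * Rough user-space sketch (assuming the usual arch_prctl(2) wrapper
+ * and a hypothetical tls_area buffer; not part of this file):
+ *
+ *	unsigned long base;
+ *	arch_prctl(ARCH_SET_GS, (unsigned long)tls_area);
+ *	arch_prctl(ARCH_GET_GS, (unsigned long)&base);
+ *
+ * The SET calls install the base directly via the FS/GS base MSRs;
+ * the GET calls store the current base through the addr argument.
+ */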
