--- linux/fs/proc/array.c.orig +++ linux/fs/proc/array.c @@ -123,9 +123,9 @@ static const char *task_state_array[] = "R (running)", /* 0 */ "S (sleeping)", /* 1 */ "D (disk sleep)", /* 2 */ - "Z (zombie)", /* 4 */ - "T (stopped)", /* 8 */ - "W (paging)" /* 16 */ + "T (stopped)", /* 4 */ + "Z (zombie)", /* 8 */ + "X (dead)" /* 16 */ }; static inline const char * get_task_state(struct task_struct *tsk) @@ -158,7 +158,7 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), p->tgid, - p->pid, p->pid ? p->p_opptr->pid : 0, 0, + p->pid, p->pid ? p->real_parent->pid : 0, 0, p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); read_unlock(&tasklist_lock); @@ -226,18 +226,19 @@ static void collect_sigign_sigcatch(stru sigemptyset(ign); sigemptyset(catch); - spin_lock_irq(&p->sigmask_lock); - - if (p->sig) { - k = p->sig->action; + read_lock(&tasklist_lock); + if (p->sighand) { + spin_lock_irq(&p->sighand->siglock); + k = p->sighand->action; for (i = 1; i <= _NSIG; ++i, ++k) { if (k->sa.sa_handler == SIG_IGN) sigaddset(ign, i); else if (k->sa.sa_handler != SIG_DFL) sigaddset(catch, i); } + spin_unlock_irq(&p->sighand->siglock); } - spin_unlock_irq(&p->sigmask_lock); + read_unlock(&tasklist_lock); } static inline char * task_sig(struct task_struct *p, char *buffer) @@ -338,16 +339,16 @@ int proc_pid_stat(struct task_struct *ta /* scale priority and nice values from timeslices to -20..20 */ /* to make it look like a "normal" Unix priority/nice value */ - priority = task->counter; - priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER; - nice = task->nice; + priority = task_prio(task); + nice = task_nice(task); read_lock(&tasklist_lock); - ppid = task->pid ? task->p_opptr->pid : 0; + ppid = task->pid ? 
task->real_parent->pid : 0; read_unlock(&tasklist_lock); res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %ld %ld %ld %ld\n" + , task->pid, task->comm, state, @@ -390,7 +391,12 @@ int proc_pid_stat(struct task_struct *ta task->nswap, task->cnswap, task->exit_signal, - task->processor); + task->cpu, + task->group_times.tms_utime, + task->group_times.tms_stime, + task->group_times.tms_cutime, + task->group_times.tms_cstime + ); if(mm) mmput(mm); return res; --- linux/fs/proc/base.c.orig +++ linux/fs/proc/base.c @@ -329,7 +329,7 @@ static struct file_operations proc_info_ }; #define MAY_PTRACE(p) \ -(p==current||(p->p_pptr==current&&(p->ptrace & PT_PTRACED)&&p->state==TASK_STOPPED)) +(p==current||(p->parent==current&&(p->ptrace & PT_PTRACED)&&p->state==TASK_STOPPED)) static int mem_open(struct inode* inode, struct file* file) @@ -998,6 +998,13 @@ struct dentry *proc_pid_lookup(struct in d_add(dentry, inode); return NULL; } + + /* + * Deal with dot-aliases for threads. + */ + if (name[0] == '.') + name++, len--; + while (len-- > 0) { c = *name - '0'; name++; @@ -1021,7 +1028,7 @@ struct dentry *proc_pid_lookup(struct in inode = proc_pid_make_inode(dir->i_sb, task, PROC_PID_INO); - free_task_struct(task); + put_task_struct(task); if (!inode) goto out; @@ -1043,7 +1050,7 @@ void proc_pid_delete_inode(struct inode if (inode->u.proc_i.file) fput(inode->u.proc_i.file); if (inode->u.proc_i.task) - free_task_struct(inode->u.proc_i.task); + put_task_struct(inode->u.proc_i.task); } #define PROC_NUMBUF 10 @@ -1054,31 +1061,42 @@ void proc_pid_delete_inode(struct inode * tasklist lock while doing this, and we must release it before * we actually do the filldir itself, so we use a temp buffer.. 
*/ -static int get_pid_list(int index, unsigned int *pids) +static int get_pid_list(int index, int *pids, struct file *filp) { - struct task_struct *p; + struct task_struct *p = NULL; int nr_pids = 0; + int pid = 0, pid_cursor = (int)filp->private_data; - index--; read_lock(&tasklist_lock); - for_each_task(p) { - int pid = p->pid; - if (!pid) - continue; + if (pid_cursor) + p = find_task_by_pid(pid_cursor); + if (!p) { + p = &init_task; + index--; + } else + index = 0; + __for_each_process(p) { if (--index >= 0) continue; - pids[nr_pids] = pid; + pid = p->pid; + if (!pid) + BUG(); + if (p->tgid != p->pid) + pids[nr_pids] = -pid; + else + pids[nr_pids] = pid; nr_pids++; if (nr_pids >= PROC_MAXPIDS) break; } + filp->private_data = (void *)pid; read_unlock(&tasklist_lock); return nr_pids; } int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int pid_array[PROC_MAXPIDS]; + int pid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; unsigned int nr_pids, i; @@ -1091,14 +1109,16 @@ int proc_pid_readdir(struct file * filp, nr++; } - nr_pids = get_pid_list(nr, pid_array); + nr_pids = get_pid_list(nr, pid_array, filp); for (i = 0; i < nr_pids; i++) { - int pid = pid_array[i]; + int pid = abs(pid_array[i]); ino_t ino = fake_ino(pid,PROC_PID_INO); unsigned long j = PROC_NUMBUF; do buf[--j] = '0' + (pid % 10); while (pid/=10); + if (pid_array[i] < 0) + buf[--j] = '.'; if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) break; --- linux/fs/proc/inode.c.orig +++ linux/fs/proc/inode.c @@ -194,7 +194,9 @@ struct super_block *proc_read_super(stru * Fixup the root inode's nlink value */ read_lock(&tasklist_lock); - for_each_task(p) if (p->pid) root_inode->i_nlink++; + for_each_process(p) + if (p->pid) + root_inode->i_nlink++; read_unlock(&tasklist_lock); s->s_root = d_alloc_root(root_inode); if (!s->s_root) --- linux/fs/proc/proc_misc.c.orig +++ linux/fs/proc/proc_misc.c @@ -108,11 
+108,11 @@ static int loadavg_read_proc(char *page, a = avenrun[0] + (FIXED_1/200); b = avenrun[1] + (FIXED_1/200); c = avenrun[2] + (FIXED_1/200); - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n", + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running, nr_threads, last_pid); + nr_running(), nr_threads, last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -124,7 +124,7 @@ static int uptime_read_proc(char *page, int len; uptime = jiffies; - idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime; + idle = init_task.times.tms_utime + init_task.times.tms_stime; /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but that would overflow about every five days at HZ == 100. @@ -383,10 +383,10 @@ static int kstat_read_proc(char *page, c } proc_sprintf(page, &off, &len, - "\nctxt %u\n" + "\nctxt %lu\n" "btime %lu\n" "processes %lu\n", - kstat.context_swtch, + nr_context_switches(), xtime.tv_sec - jif / HZ, total_forks); --- linux/fs/smbfs/sock.c.orig +++ linux/fs/smbfs/sock.c @@ -688,12 +688,12 @@ smb_request(struct smb_sb_info *server) len = smb_len(buffer) + 4; DEBUG1("len = %d cmd = 0x%X\n", len, buffer[8]); - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); sigpipe = sigismember(&current->pending.signal, SIGPIPE); old_set = current->blocked; siginitsetinv(&current->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); fs = get_fs(); set_fs(get_ds()); @@ -705,12 +705,12 @@ smb_request(struct smb_sb_info *server) } /* read/write errors are handled by errno */ - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); if (result == -EPIPE && !sigpipe) sigdelset(&current->pending.signal, SIGPIPE);
current->blocked = old_set; - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); set_fs(fs); @@ -879,12 +879,12 @@ smb_trans2_request(struct smb_sb_info *s if ((result = smb_dont_catch_keepalive(server)) != 0) goto bad_conn; - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); sigpipe = sigismember(&current->pending.signal, SIGPIPE); old_set = current->blocked; siginitsetinv(&current->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); fs = get_fs(); set_fs(get_ds()); @@ -898,12 +898,12 @@ smb_trans2_request(struct smb_sb_info *s } /* read/write errors are handled by errno */ - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); if (result == -EPIPE && !sigpipe) sigdelset(&current->pending.signal, SIGPIPE); current->blocked = old_set; - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); set_fs(fs); --- linux/fs/ncpfs/sock.c.orig 2001-08-21 14:26:04.000000000 +0200 +++ linux/fs/ncpfs/sock.c @@ -454,7 +454,7 @@ static int ncp_do_request(struct ncp_ser sigset_t old_set; unsigned long mask, flags; - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); old_set = current->blocked; if (current->flags & PF_EXITING) mask = 0; @@ -466,14 +466,14 @@ static int ncp_do_request(struct ncp_ser What if we've blocked it ourselves? What about alarms? Why, in fact, are we mucking with the sigmask at all?
-- r~ */ - if (current->sig->action[SIGINT - 1].sa.sa_handler == SIG_DFL) + if (current->sighand->action[SIGINT - 1].sa.sa_handler == SIG_DFL) mask |= sigmask(SIGINT); - if (current->sig->action[SIGQUIT - 1].sa.sa_handler == SIG_DFL) + if (current->sighand->action[SIGQUIT - 1].sa.sa_handler == SIG_DFL) mask |= sigmask(SIGQUIT); } siginitsetinv(&current->blocked, mask); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); fs = get_fs(); set_fs(get_ds()); @@ -485,10 +485,10 @@ static int ncp_do_request(struct ncp_ser set_fs(fs); - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); current->blocked = old_set; - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); } DDPRINTK("do_ncp_rpc_call returned %d\n", result); --- linux/fs/autofs/waitq.c.orig 2001-08-21 14:26:07.000000000 +0200 +++ linux/fs/autofs/waitq.c @@ -70,10 +70,10 @@ static int autofs_write(struct file *fil /* Keep the currently executing process from receiving a SIGPIPE unless it was already supposed to get one */ if (wr == -EPIPE && !sigpipe) { - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); sigdelset(&current->pending.signal, SIGPIPE); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); } return (bytes > 0); @@ -161,18 +161,18 @@ int autofs_wait(struct autofs_sb_info *s sigset_t oldset; unsigned long irqflags; - spin_lock_irqsave(&current->sigmask_lock, irqflags); + spin_lock_irqsave(&current->sighand->siglock, irqflags); oldset = current->blocked; siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, irqflags); + recalc_sigpending(); +
spin_unlock_irqrestore(&current->sighand->siglock, irqflags); interruptible_sleep_on(&wq->queue); - spin_lock_irqsave(&current->sigmask_lock, irqflags); + spin_lock_irqsave(&current->sighand->siglock, irqflags); current->blocked = oldset; - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, irqflags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, irqflags); } else { DPRINTK(("autofs_wait: skipped sleeping\n")); } --- linux/fs/lockd/clntlock.c.orig +++ linux/fs/lockd/clntlock.c @@ -188,7 +188,7 @@ nlmclnt_recovery(struct nlm_host *host, nlmclnt_prepare_reclaim(host, newstate); nlm_get_host(host); MOD_INC_USE_COUNT; - kernel_thread(reclaimer, host, CLONE_SIGNAL); + kernel_thread(reclaimer, host, CLONE_KERNEL); } } --- linux/fs/lockd/clntproc.c.orig +++ linux/fs/lockd/clntproc.c @@ -139,7 +139,7 @@ nlmclnt_proc(struct inode *inode, int cm } /* Keep the old signal mask */ - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); oldset = current->blocked; /* If we're cleaning up locks because the process is exiting, @@ -148,8 +148,8 @@ nlmclnt_proc(struct inode *inode, int cm && fl->fl_type == F_UNLCK && (current->flags & PF_EXITING)) { sigfillset(&current->blocked); /* Mask all signals */ - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); call = nlmclnt_alloc_call(); if (!call) { @@ -158,7 +158,7 @@ nlmclnt_proc(struct inode *inode, int cm } call->a_flags = RPC_TASK_ASYNC; } else { - spin_unlock_irqrestore(&current->sigmask_lock, flags); + spin_unlock_irqrestore(&current->sighand->siglock, flags); memset(call, 0, sizeof(*call)); locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); @@ -183,10 +183,10 @@ nlmclnt_proc(struct inode *inode, int cm kfree(call); out_restore: - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); current->blocked = oldset; -
recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); done: dprintk("lockd: clnt proc returns %d\n", status); @@ -592,11 +592,11 @@ nlmclnt_cancel(struct nlm_host *host, st int status; /* Block all signals while setting up call */ - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); oldset = current->blocked; sigfillset(&current->blocked); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); req = nlmclnt_alloc_call(); if (!req) @@ -611,10 +611,10 @@ nlmclnt_cancel(struct nlm_host *host, st if (status < 0) kfree(req); - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); current->blocked = oldset; - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); return status; } --- linux/fs/lockd/svc.c.orig +++ linux/fs/lockd/svc.c @@ -97,10 +97,10 @@ lockd(struct svc_rqst *rqstp) sprintf(current->comm, "lockd"); /* Process request with signals blocked.
*/ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked, sigmask(SIGKILL)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); /* kick rpciod */ rpciod_up(); @@ -122,9 +122,9 @@ lockd(struct svc_rqst *rqstp) { long timeout = MAX_SCHEDULE_TIMEOUT; if (signalled()) { - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); flush_signals(current); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); if (nlmsvc_ops) { nlmsvc_ops->detach(); grace_period_expire = set_grace_period(); @@ -307,9 +307,9 @@ lockd_down(void) "lockd_down: lockd failed to exit, clearing pid\n"); nlmsvc_pid = 0; } - spin_lock_irq(&current->sigmask_lock); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); out: up(&nlmsvc_sema); } --- linux/fs/nfsd/export.c.orig +++ linux/fs/nfsd/export.c @@ -496,9 +496,9 @@ exp_writelock(void) want_lock--; /* restore the task's signals */ - spin_lock_irq(&current->sigmask_lock); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); if (!hash_count && !hash_lock) goto lock_it; --- linux/fs/nfsd/nfssvc.c.orig +++ linux/fs/nfsd/nfssvc.c @@ -179,10 +179,10 @@ nfsd(struct svc_rqst *rqstp) */ for (;;) { /* Block all but the shutdown signals */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked, SHUTDOWN_SIGS); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); /* * Find a socket with data available and call its @@ -204,10 +204,10 @@ nfsd(struct svc_rqst *rqstp) */ rqstp->rq_client = exp_getclient(&rqstp->rq_addr); /* Process request with signals blocked.
*/ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked, ALLOWED_SIGS); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); svc_process(serv, rqstp); --- linux/fs/jffs/intrep.c.orig +++ linux/fs/jffs/intrep.c @@ -3347,13 +3347,12 @@ jffs_garbage_collect_thread(void *ptr) lock_kernel(); exit_mm(c->gc_task); - current->session = 1; - current->pgrp = 1; + set_special_pids(1, 1); init_completion(&c->gc_thread_comp); /* barrier */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); strcpy(current->comm, "jffs_gcd"); D1(printk (KERN_NOTICE "jffs_garbage_collect_thread(): Starting infinite loop.\n")); @@ -3381,9 +3380,9 @@ jffs_garbage_collect_thread(void *ptr) siginfo_t info; unsigned long signr; - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); signr = dequeue_signal(&current->blocked, &info); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); switch(signr) { case SIGSTOP: --- linux/fs/autofs4/waitq.c.orig 2001-08-21 14:26:07.000000000 +0200 +++ linux/fs/autofs4/waitq.c @@ -74,10 +74,10 @@ static int autofs4_write(struct file *fi /* Keep the currently executing process from receiving a SIGPIPE unless it was already supposed to get one */ if (wr == -EPIPE && !sigpipe) { - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); sigdelset(&current->pending.signal, SIGPIPE); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); } return (bytes > 0); @@ -198,18 +198,18 @@ int autofs4_wait(struct autofs_sb_info * sigset_t
oldset; unsigned long irqflags; - spin_lock_irqsave(&current->sigmask_lock, irqflags); + spin_lock_irqsave(&current->sighand->siglock, irqflags); oldset = current->blocked; siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, irqflags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, irqflags); interruptible_sleep_on(&wq->queue); - spin_lock_irqsave(&current->sigmask_lock, irqflags); + spin_lock_irqsave(&current->sighand->siglock, irqflags); current->blocked = oldset; - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, irqflags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, irqflags); } else { DPRINTK(("autofs_wait: skipped sleeping\n")); } --- linux/fs/reiserfs/buffer2.c.orig +++ linux/fs/reiserfs/buffer2.c @@ -51,11 +51,11 @@ void wait_buffer_until_released (const s struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size) { struct buffer_head *result; - PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); + PROC_EXP( unsigned int ctx_switches = nr_context_switches(); ); result = bread (super -> s_dev, n_block, n_size); PROC_INFO_INC( super, breads ); - PROC_EXP( if( kstat.context_swtch != ctx_switches ) + PROC_EXP( if( nr_context_switches() != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); return result; } --- linux/fs/reiserfs/journal.c.orig +++ linux/fs/reiserfs/journal.c @@ -1868,10 +1868,10 @@ static int reiserfs_journal_commit_threa daemonize() ; - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); sigfillset(&current->blocked); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); sprintf(current->comm, "kreiserfsd") ; lock_kernel() ; --- linux/fs/jffs2/background.c.orig +++ linux/fs/jffs2/background.c @@ -106,14 +106,11 @@ static int jffs2_garbage_collect_thread( sprintf(current->comm,
"jffs2_gcd_mtd%d", c->mtd->index); - /* FIXME in the 2.2 backport */ - current->nice = 10; - for (;;) { - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); if (!thread_should_wake(c)) { set_current_state (TASK_INTERRUPTIBLE); @@ -134,9 +131,9 @@ static int jffs2_garbage_collect_thread( siginfo_t info; unsigned long signr; - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); signr = dequeue_signal(&current->blocked, &info); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); switch(signr) { case SIGSTOP: @@ -161,10 +158,10 @@ static int jffs2_garbage_collect_thread( } } /* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv (&current->blocked, sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): pass\n")); jffs2_garbage_collect_pass(c); --- linux/fs/jfs/jfs_logmgr.c.orig +++ linux/fs/jfs/jfs_logmgr.c @@ -2150,10 +2150,10 @@ int jfsIOWait(void *arg) unlock_kernel(); - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); sigfillset(&current->blocked); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); complete(&jfsIOwait); --- linux/fs/jfs/jfs_txnmgr.c.orig +++ linux/fs/jfs/jfs_txnmgr.c @@ -2801,10 +2801,10 @@ int jfs_lazycommit(void *arg) jfsCommitTask = current; - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); sigfillset(&current->blocked); - recalc_sigpending(current); -
spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); LAZY_LOCK_INIT(); TxAnchor.unlock_queue = TxAnchor.unlock_tail = 0; @@ -3007,10 +3007,10 @@ int jfs_sync(void *arg) unlock_kernel(); - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); sigfillset(&current->blocked); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); complete(&jfsIOwait); --- linux/fs/jbd/journal.c.orig +++ linux/fs/jbd/journal.c @@ -206,10 +206,10 @@ int kjournald(void *arg) lock_kernel(); daemonize(); reparent_to_init(); - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); sigfillset(&current->blocked); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); sprintf(current->comm, "kjournald"); --- linux/fs/afs/cmservice.c.orig +++ linux/fs/afs/cmservice.c @@ -127,14 +127,10 @@ static int kafscmd(void *arg) complete(&kafscmd_alive); /* only certain signals are of interest */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked,0); -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,3) recalc_sigpending(); -#else - recalc_sigpending(current); -#endif - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); /* loop around looking for things to attend to */ do { --- linux/fs/afs/kafsasyncd.c.orig +++ linux/fs/afs/kafsasyncd.c @@ -101,14 +101,10 @@ static int kafsasyncd(void *arg) complete(&kafsasyncd_alive); /* only certain signals are of interest */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked,0); -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,3) recalc_sigpending(); -#else - recalc_sigpending(current); -#endif - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); /* loop around looking for things to attend to */ do { @@ -132,9 +128,9
@@ static int kafsasyncd(void *arg) while (signal_pending(current)) { siginfo_t sinfo; - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); dequeue_signal(&current->blocked,&sinfo); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); } die = kafsasyncd_die; --- linux/fs/afs/kafstimod.c.orig +++ linux/fs/afs/kafstimod.c @@ -78,14 +78,10 @@ static int kafstimod(void *arg) complete(&kafstimod_alive); /* only certain signals are of interest */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked,0); -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,3) recalc_sigpending(); -#else - recalc_sigpending(current); -#endif - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); /* loop around looking for things to attend to */ loop: @@ -107,9 +103,9 @@ static int kafstimod(void *arg) while (signal_pending(current)) { siginfo_t sinfo; - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); dequeue_signal(&current->blocked,&sinfo); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); } /* work out the time to elapse before the next event */ --- linux/fs/binfmt_elf.c.orig +++ linux/fs/binfmt_elf.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -149,12 +150,12 @@ create_elf_tables(char *p, int argc, int * removed for 2.5 */ - u_platform = u_platform - (((current->pid+jiffies) % 64) << 7); + sp = (void *) (u_platform - (((current->pid+jiffies) % 64) << 7)); /* * Force 16 byte _final_ alignment here for generality. */ - sp = (elf_addr_t *)(~15UL & (unsigned long)(u_platform)); + sp = (elf_addr_t *)(~15UL & (unsigned long)(sp)); csp = sp; csp -= (1+DLINFO_ITEMS)*2 + (k_platform ? 2 : 0); #ifdef DLINFO_ARCH_ITEMS @@ -601,12 +602,8 @@ static int load_elf_binary(struct linux_ /* Do this so that we can load the interpreter, if need be.
We will change some of these later */ current->mm->rss = 0; - retval = setup_arg_pages(bprm); - if (retval < 0) { - send_sig(SIGKILL, current, 0); - return retval; - } - + current->mm->free_area_cache = TASK_UNMAPPED_BASE; + setup_arg_pages(bprm); /* XXX: check error */ current->mm->start_stack = bprm->p; /* Now we do a little grungy work by mmaping the ELF image into @@ -957,7 +954,7 @@ static int notesize(struct memelfnote *e /* #define DEBUG */ #ifdef DEBUG -static void dump_regs(const char *str, elf_greg_t *r) +static void dump_regs(const char *str, elf_gregset_t *r) { int i; static const char *regs[] = { "ebx", "ecx", "edx", "esi", "edi", "ebp", @@ -983,7 +980,7 @@ static int writenote(struct memelfnote * { struct elf_note en; - en.n_namesz = strlen(men->name); + en.n_namesz = strlen(men->name)+1; en.n_descsz = men->datasz; en.n_type = men->type; @@ -1005,6 +1002,155 @@ static int writenote(struct memelfnote * #define DUMP_SEEK(off) \ if (!dump_seek(file, (off))) \ goto end_coredump; + +static inline void fill_elf_header(struct elfhdr *elf, int segs) +{ + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = 0; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = segs; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + return; +} + +static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, off_t offset) +{ + phdr->p_type = PT_NOTE; + phdr->p_offset = offset; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = sz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + return; +} + +static inline void fill_note(struct memelfnote *note, 
const char *name, int type, + unsigned int sz, void *data) +{ + note->name = name; + note->type = type; + note->datasz = sz; + note->data = data; + return; +} + +/* + * fill up all the fields in prstatus from the given task struct, except registers + * which need to be filled up seperately. + */ +static inline void fill_prstatus(struct elf_prstatus *prstatus, struct task_struct *p, long signr) +{ + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; + prstatus->pr_pid = p->pid; + prstatus->pr_ppid = p->parent->pid; + prstatus->pr_pgrp = p->pgrp; + prstatus->pr_sid = p->session; + prstatus->pr_utime.tv_sec = CT_TO_SECS(p->times.tms_utime); + prstatus->pr_utime.tv_usec = CT_TO_USECS(p->times.tms_utime); + prstatus->pr_stime.tv_sec = CT_TO_SECS(p->times.tms_stime); + prstatus->pr_stime.tv_usec = CT_TO_USECS(p->times.tms_stime); + prstatus->pr_cutime.tv_sec = CT_TO_SECS(p->times.tms_cutime); + prstatus->pr_cutime.tv_usec = CT_TO_USECS(p->times.tms_cutime); + prstatus->pr_cstime.tv_sec = CT_TO_SECS(p->times.tms_cstime); + prstatus->pr_cstime.tv_usec = CT_TO_USECS(p->times.tms_cstime); +} + +static inline void fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p) +{ + int i; + + psinfo->pr_pid = p->pid; + psinfo->pr_ppid = p->parent->pid; + psinfo->pr_pgrp = p->pgrp; + psinfo->pr_sid = p->session; + + i = p->state ? ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; + psinfo->pr_sname = (i < 0 || i > 5) ? '.' : "RSDTZD"[i]; + psinfo->pr_zomb = psinfo->pr_sname == 'Z'; + psinfo->pr_nice = task_nice(p); + psinfo->pr_flag = p->flags; + psinfo->pr_uid = NEW_TO_OLD_UID(p->uid); + psinfo->pr_gid = NEW_TO_OLD_GID(p->gid); + strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + return; +} + +/* Here is the structure in which status of each thread is captured. 
*/ +struct elf_thread_status +{ + struct list_head list; + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + elf_fpregset_t fpu; /* NT_PRFPREG */ +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; /* NT_PRXFPREG */ +#endif + struct memelfnote notes[3]; + int num_notes; +}; + +/* + * In order to add the specific thread information for the elf file format, + * we need to keep a linked list of every threads pr_status and then + * create a single section for them in the final core file. + */ +static int elf_dump_thread_status(long signr, struct task_struct * p, struct list_head * thread_list) +{ + + struct elf_thread_status *t; + int sz = 0; + + t = kmalloc(sizeof(*t), GFP_ATOMIC); + if (!t) + return 0; + + INIT_LIST_HEAD(&t->list); + t->num_notes = 0; + + fill_prstatus(&t->prstatus, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); + t->num_notes++; + sz += notesize(&t->notes[0]); + +#ifndef __x86_64__ + if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, &t->fpu))) { + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), &(t->fpu)); + t->num_notes++; + sz += notesize(&t->notes[1]); + } +#endif + +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(p, &t->xfpu)) { + fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu), &(t->xfpu)); + t->num_notes++; + sz += notesize(&t->notes[2]); + } +#endif + list_add(&t->list, thread_list); + return sz; +} + /* * Actual dumper * @@ -1023,12 +1169,19 @@ static int elf_core_dump(long signr, str struct elfhdr elf; off_t offset = 0, dataoff; unsigned long limit = current->rlim[RLIMIT_CORE].rlim_cur; - int numnote = 4; - struct memelfnote notes[4]; + int numnote = 5; + struct memelfnote notes[5]; struct elf_prstatus prstatus; /* NT_PRSTATUS */ - elf_fpregset_t fpu; /* NT_PRFPREG */ struct elf_prpsinfo psinfo; /* NT_PRPSINFO */ - + struct task_struct *g, *p; + LIST_HEAD(thread_list); + struct list_head 
*t; + elf_fpregset_t fpu; +#ifdef ELF_CORE_COPY_XFPREGS + elf_fpxregset_t xfpu; +#endif + int thread_status_size = 0; + /* first copy the parameters from user space */ memset(&psinfo, 0, sizeof(psinfo)); { @@ -1063,108 +1216,99 @@ static int elf_core_dump(long signr, str *(struct pt_regs *)&prstatus.pr_reg = *regs; #endif - /* now stop all vm operations */ - down_write(&current->mm->mmap_sem); - segs = current->mm->map_count; + /* capture the status of all other threads */ + if (signr) { + read_lock(&tasklist_lock); + do_each_thread(g, p) + if (current->mm == p->mm && current != p) { + int sz = elf_dump_thread_status(signr, p, &thread_list); + if (!sz) { + read_unlock(&tasklist_lock); + goto cleanup; + } else + thread_status_size += sz; + } + while_each_thread(g, p); + read_unlock(&tasklist_lock); + } + + memset(&prstatus, 0, sizeof(prstatus)); + fill_prstatus(&prstatus, current, signr); + elf_core_copy_regs(&prstatus.pr_reg, regs); + + /* We no longer stop all vm operations */ + + /* This because those proceses that could possibly + * change map_count or the mmap / vma pages are now suspended. + * + * Only ptrace can touch these memory address, but it cannot change + * the map_count or the pages. So no possibility of crashing exists while dumping + * the mm->vm_next areas to the core file. + * + * Grabbing mmap_sem in this function is risky WRT the use of suspend_threads. + * Although no locks ups have been induced, if one of the suspended threads was + * in line for the current->mmap_sem and if gets it while on the Phantom runque, + * then we would dead lock in this function if we continue to attempt to down_write + * in this function. 
+ */ + segs = current->mm->map_count; #ifdef DEBUG printk("elf_core_dump: %d segs %lu limit\n", segs, limit); #endif /* Set up header */ - memcpy(elf.e_ident, ELFMAG, SELFMAG); - elf.e_ident[EI_CLASS] = ELF_CLASS; - elf.e_ident[EI_DATA] = ELF_DATA; - elf.e_ident[EI_VERSION] = EV_CURRENT; - memset(elf.e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); - - elf.e_type = ET_CORE; - elf.e_machine = ELF_ARCH; - elf.e_version = EV_CURRENT; - elf.e_entry = 0; - elf.e_phoff = sizeof(elf); - elf.e_shoff = 0; - elf.e_flags = 0; - elf.e_ehsize = sizeof(elf); - elf.e_phentsize = sizeof(struct elf_phdr); - elf.e_phnum = segs+1; /* Include notes */ - elf.e_shentsize = 0; - elf.e_shnum = 0; - elf.e_shstrndx = 0; - - fs = get_fs(); - set_fs(KERNEL_DS); + fill_elf_header(&elf, segs+1); /* including notes section*/ has_dumped = 1; current->flags |= PF_DUMPCORE; - DUMP_WRITE(&elf, sizeof(elf)); - offset += sizeof(elf); /* Elf header */ - offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ - /* * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. 
*/ - notes[0].name = "CORE"; - notes[0].type = NT_PRSTATUS; - notes[0].datasz = sizeof(prstatus); - notes[0].data = &prstatus; - prstatus.pr_info.si_signo = prstatus.pr_cursig = signr; - prstatus.pr_sigpend = current->pending.signal.sig[0]; - prstatus.pr_sighold = current->blocked.sig[0]; - psinfo.pr_pid = prstatus.pr_pid = current->pid; - psinfo.pr_ppid = prstatus.pr_ppid = current->p_pptr->pid; - psinfo.pr_pgrp = prstatus.pr_pgrp = current->pgrp; - psinfo.pr_sid = prstatus.pr_sid = current->session; - prstatus.pr_utime.tv_sec = CT_TO_SECS(current->times.tms_utime); - prstatus.pr_utime.tv_usec = CT_TO_USECS(current->times.tms_utime); - prstatus.pr_stime.tv_sec = CT_TO_SECS(current->times.tms_stime); - prstatus.pr_stime.tv_usec = CT_TO_USECS(current->times.tms_stime); - prstatus.pr_cutime.tv_sec = CT_TO_SECS(current->times.tms_cutime); - prstatus.pr_cutime.tv_usec = CT_TO_USECS(current->times.tms_cutime); - prstatus.pr_cstime.tv_sec = CT_TO_SECS(current->times.tms_cstime); - prstatus.pr_cstime.tv_usec = CT_TO_USECS(current->times.tms_cstime); - #ifdef DEBUG dump_regs("Passed in regs", (elf_greg_t *)regs); dump_regs("prstatus regs", (elf_greg_t *)&prstatus.pr_reg); #endif - notes[1].name = "CORE"; - notes[1].type = NT_PRPSINFO; - notes[1].datasz = sizeof(psinfo); - notes[1].data = &psinfo; - i = current->state ? ffz(~current->state) + 1 : 0; - psinfo.pr_state = i; - psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i]; - psinfo.pr_zomb = psinfo.pr_sname == 'Z'; - psinfo.pr_nice = current->nice; - psinfo.pr_flag = current->flags; - psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); - psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); - strncpy(psinfo.pr_fname, current->comm, sizeof(psinfo.pr_fname)); - - notes[2].name = "CORE"; - notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = sizeof(*current); - notes[2].data = current; - - /* Try to dump the FPU. 
*/ - prstatus.pr_fpvalid = dump_fpu (regs, &fpu); - if (!prstatus.pr_fpvalid) - { - numnote--; - } - else - { - notes[3].name = "CORE"; - notes[3].type = NT_PRFPREG; - notes[3].datasz = sizeof(fpu); - notes[3].data = &fpu; - } + fill_note(&notes[0], "CORE", NT_PRSTATUS, sizeof(prstatus), &prstatus); + /* + * NT_PRPSINFO describes the process as a whole, + * ie. the group leader: + */ + fill_psinfo(&psinfo, current->group_leader); + fill_note(&notes[1], "CORE", NT_PRPSINFO, sizeof(psinfo), &psinfo); + + fill_note(&notes[2], "CORE", NT_TASKSTRUCT, sizeof(*current), current); + +#ifndef __x86_64__ + /* Try to dump the FPU. */ + if ((prstatus.pr_fpvalid = elf_core_copy_task_fpregs(current, &fpu))) { + fill_note(&notes[3], "CORE", NT_PRFPREG, sizeof(fpu), &fpu); + } else { + --numnote; + } +#endif +#ifdef ELF_CORE_COPY_XFPREGS + if (elf_core_copy_task_xfpregs(current, &xfpu)) { + fill_note(&notes[4], "LINUX", NT_PRXFPREG, sizeof(xfpu), &xfpu); + } else { + --numnote; + } +#else + numnote --; +#endif + + fs = get_fs(); + set_fs(KERNEL_DS); + + DUMP_WRITE(&elf, sizeof(elf)); + offset += sizeof(elf); /* Elf header */ + offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ + /* Write notes phdr entry */ { struct elf_phdr phdr; @@ -1172,17 +1316,11 @@ static int elf_core_dump(long signr, str for(i = 0; i < numnote; i++) sz += notesize(&notes[i]); + + sz += thread_status_size; - phdr.p_type = PT_NOTE; - phdr.p_offset = offset; - phdr.p_vaddr = 0; - phdr.p_paddr = 0; - phdr.p_filesz = sz; - phdr.p_memsz = 0; - phdr.p_flags = 0; - phdr.p_align = 0; - - offset += phdr.p_filesz; + fill_elf_note_phdr(&phdr, sz, offset); + offset += sz; DUMP_WRITE(&phdr, sizeof(phdr)); } @@ -1211,10 +1349,19 @@ static int elf_core_dump(long signr, str DUMP_WRITE(&phdr, sizeof(phdr)); } + /* write out the notes section */ for(i = 0; i < numnote; i++) if (!writenote(&notes[i], file)) goto end_coredump; + /* write out the thread status notes section */ + list_for_each(t, &thread_list) { + struct 
elf_thread_status *tmp = list_entry(t, struct elf_thread_status, list); + for (i = 0; i < tmp->num_notes; i++) + if (!writenote(&tmp->notes[i], file)) + goto end_coredump; + } + DUMP_SEEK(dataoff); for(vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { @@ -1258,11 +1405,19 @@ static int elf_core_dump(long signr, str (off_t) file->f_pos, offset); } - end_coredump: +end_coredump: set_fs(fs); - up_write(&current->mm->mmap_sem); + +cleanup: + while(!list_empty(&thread_list)) { + struct list_head *tmp = thread_list.next; + list_del(tmp); + kfree(list_entry(tmp, struct elf_thread_status, list)); + } + return has_dumped; } + #endif /* USE_ELF_CORE_DUMP */ static int __init init_elf_binfmt(void) @@ -1278,4 +1433,3 @@ static void __exit exit_elf_binfmt(void) module_init(init_elf_binfmt) module_exit(exit_elf_binfmt) -MODULE_LICENSE("GPL"); --- linux/fs/buffer.c.orig +++ linux/fs/buffer.c @@ -2926,16 +2926,15 @@ int bdflush(void *startup) * display semi-sane things. Not real crucial though... */ - tsk->session = 1; - tsk->pgrp = 1; + set_special_pids(1, 1); strcpy(tsk->comm, "bdflush"); /* avoid getting signals */ - spin_lock_irq(&tsk->sigmask_lock); + spin_lock_irq(&tsk->sighand->siglock); flush_signals(tsk); sigfillset(&tsk->blocked); - recalc_sigpending(tsk); - spin_unlock_irq(&tsk->sigmask_lock); + recalc_sigpending_tsk(tsk); + spin_unlock_irq(&tsk->sighand->siglock); complete((struct completion *)startup); @@ -3002,16 +3001,15 @@ int kupdate(void *startup) struct task_struct * tsk = current; int interval; - tsk->session = 1; - tsk->pgrp = 1; + set_special_pids(1, 1); strcpy(tsk->comm, "kupdated"); /* sigstop and sigcont will stop and wakeup kupdate */ - spin_lock_irq(&tsk->sigmask_lock); + spin_lock_irq(&tsk->sighand->siglock); sigfillset(&tsk->blocked); siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP)); - recalc_sigpending(tsk); - spin_unlock_irq(&tsk->sigmask_lock); + recalc_sigpending_tsk(tsk); + spin_unlock_irq(&tsk->sighand->siglock); 
complete((struct completion *)startup); @@ -3029,13 +3027,13 @@ int kupdate(void *startup) /* check for sigstop */ if (signal_pending(tsk)) { int stopped = 0; - spin_lock_irq(&tsk->sigmask_lock); + spin_lock_irq(&tsk->sighand->siglock); if (sigismember(&tsk->pending.signal, SIGSTOP)) { sigdelset(&tsk->pending.signal, SIGSTOP); stopped = 1; } - recalc_sigpending(tsk); - spin_unlock_irq(&tsk->sigmask_lock); + recalc_sigpending_tsk(tsk); + spin_unlock_irq(&tsk->sighand->siglock); if (stopped) goto stop_kupdate; } @@ -3052,9 +3050,9 @@ static int __init bdflush_init(void) { static struct completion startup __initdata = COMPLETION_INITIALIZER(startup); - kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + kernel_thread(bdflush, &startup, CLONE_KERNEL); wait_for_completion(&startup); - kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + kernel_thread(kupdate, &startup, CLONE_KERNEL); wait_for_completion(&startup); return 0; } --- linux/fs/exec.c.orig +++ linux/fs/exec.c @@ -35,10 +35,13 @@ #include #include #include +#include #include #include #define __NO_VERSION__ #include +#include +#include #include #include @@ -474,43 +477,186 @@ static int exec_mmap(void) * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal - * table via the CLONE_SIGNAL option to clone().) + * table via the CLONE_SIGHAND option to clone().) */ - -static inline int make_private_signals(void) +static inline int de_thread(struct task_struct *tsk) { - struct signal_struct * newsig; + struct signal_struct *newsig, *oldsig = tsk->signal; + struct sighand_struct *newsighand, *oldsighand = tsk->sighand; + spinlock_t *lock = &oldsighand->siglock; + int count; - if (atomic_read(&current->sig->count) <= 1) + /* + * If we don't share sighandlers, then we aren't sharing anything + * and we can just re-use it all. 
+ */ + if (atomic_read(&oldsighand->count) <= 1) return 0; - newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL); - if (newsig == NULL) + + newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); + if (!newsighand) return -ENOMEM; - spin_lock_init(&newsig->siglock); - atomic_set(&newsig->count, 1); - memcpy(newsig->action, current->sig->action, sizeof(newsig->action)); - spin_lock_irq(&current->sigmask_lock); - current->sig = newsig; - spin_unlock_irq(&current->sigmask_lock); + + spin_lock_init(&newsighand->siglock); + atomic_set(&newsighand->count, 1); + memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); + + /* + * See if we need to allocate a new signal structure + */ + newsig = NULL; + if (atomic_read(&oldsig->count) > 1) { + newsig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + if (!newsig) { + kmem_cache_free(sighand_cachep, newsighand); + return -ENOMEM; + } + atomic_set(&newsig->count, 1); + newsig->group_exit = 0; + newsig->group_exit_code = 0; + newsig->group_exit_task = NULL; + newsig->group_stop_count = 0; + newsig->curr_target = NULL; + init_sigpending(&newsig->shared_pending); + } + + if (thread_group_empty(current)) + goto no_thread_group; + /* + * Kill all other threads in the thread group: + */ + read_lock(&tasklist_lock); + spin_lock_irq(lock); + if (oldsig->group_exit) { + /* + * Another group action in progress, just + * return so that the signal is processed. 
+ */ + spin_unlock_irq(lock); + read_unlock(&tasklist_lock); + kmem_cache_free(sighand_cachep, newsighand); + if (newsig) + kmem_cache_free(signal_cachep, newsig); + return -EAGAIN; + } + oldsig->group_exit = 1; + zap_other_threads(current); + + /* + * Account for the thread group leader hanging around: + */ + count = 2; + if (current->pid == current->tgid) + count = 1; + while (atomic_read(&oldsig->count) > count) { + oldsig->group_exit_task = current; + current->state = TASK_UNINTERRUPTIBLE; + spin_unlock_irq(lock); + read_unlock(&tasklist_lock); + schedule(); + read_lock(&tasklist_lock); + spin_lock_irq(lock); + if (oldsig->group_exit_task) + BUG(); + } + spin_unlock_irq(lock); + read_unlock(&tasklist_lock); + + /* + * At this point all other threads have exited, all we have to + * do is to wait for the thread group leader to become inactive, + * and to assume its PID: + */ + if (current->pid != current->tgid) { + struct task_struct *leader = current->group_leader, *parent; + unsigned long state, ptrace; + + /* + * Wait for the thread group leader to be a zombie. + * It should already be zombie at this point, most + * of the time. + */ + while (leader->state != TASK_ZOMBIE) + yield(); + + write_lock_irq(&tasklist_lock); + + if (leader->tgid != current->tgid) + BUG(); + if (current->pid == current->tgid) + BUG(); + /* + * An exec() starts a new thread group with the + * TGID of the previous thread group. 
Rehash the + * two threads with a switched PID, and release + * the former thread group leader: + */ + ptrace = leader->ptrace; + parent = leader->parent; + + ptrace_unlink(current); + ptrace_unlink(leader); + remove_parent(current); + remove_parent(leader); + + switch_exec_pids(leader, current); + + current->parent = current->real_parent = leader->real_parent; + leader->parent = leader->real_parent = child_reaper; + current->group_leader = current; + leader->group_leader = leader; + + add_parent(current, current->parent); + add_parent(leader, leader->parent); + if (ptrace) { + current->ptrace = ptrace; + __ptrace_link(current, parent); + } + + list_del(&current->tasks); + list_add_tail(&current->tasks, &init_task.tasks); + current->exit_signal = SIGCHLD; + state = leader->state; + + write_unlock_irq(&tasklist_lock); + + if (state != TASK_ZOMBIE) + BUG(); + release_task(leader); + } + +no_thread_group: + + write_lock_irq(&tasklist_lock); + spin_lock(&oldsighand->siglock); + spin_lock(&newsighand->siglock); + + if (current == oldsig->curr_target) + oldsig->curr_target = next_thread(current); + if (newsig) + current->signal = newsig; + current->sighand = newsighand; + init_sigpending(&current->pending); + recalc_sigpending(); + + spin_unlock(&newsighand->siglock); + spin_unlock(&oldsighand->siglock); + write_unlock_irq(&tasklist_lock); + + if (newsig && atomic_dec_and_test(&oldsig->count)) + kmem_cache_free(signal_cachep, oldsig); + + if (atomic_dec_and_test(&oldsighand->count)) + kmem_cache_free(sighand_cachep, oldsighand); + + if (!thread_group_empty(current)) + BUG(); + if (current->tgid != current->pid) + BUG(); return 0; } /* - * If make_private_signals() made a copy of the signal table, decrement the - * refcount of the original table, and free it if necessary. - * We don't do that in make_private_signals() so that we can back off - * in flush_old_exec() if an error occurs after calling make_private_signals(). 
- */ - -static inline void release_old_signals(struct signal_struct * oldsig) -{ - if (current->sig == oldsig) - return; - if (atomic_dec_and_test(&oldsig->count)) - kmem_cache_free(sigact_cachep, oldsig); -} - -/* * These functions flushes out all traces of the currently running executable * so that a new one can be started */ @@ -543,48 +689,26 @@ static inline void flush_old_files(struc write_unlock(&files->file_lock); } -/* - * An execve() will automatically "de-thread" the process. - * Note: we don't have to hold the tasklist_lock to test - * whether we migth need to do this. If we're not part of - * a thread group, there is no way we can become one - * dynamically. And if we are, we only need to protect the - * unlink - even if we race with the last other thread exit, - * at worst the list_del_init() might end up being a no-op. - */ -static inline void de_thread(struct task_struct *tsk) -{ - if (!list_empty(&tsk->thread_group)) { - write_lock_irq(&tasklist_lock); - list_del_init(&tsk->thread_group); - write_unlock_irq(&tasklist_lock); - } - - /* Minor oddity: this might stay the same. */ - tsk->tgid = tsk->pid; -} - int flush_old_exec(struct linux_binprm * bprm) { char * name; int i, ch, retval; - struct signal_struct * oldsig; - - /* - * Make sure we have a private signal table - */ - oldsig = current->sig; - retval = make_private_signals(); - if (retval) goto flush_failed; /* * Release all of the old mmap stuff */ retval = exec_mmap(); - if (retval) goto mmap_failed; + if (retval) + goto out; + /* + * Make sure we have a private signal table and that + * we are unassociated from the previous thread group. 
+ */ + retval = de_thread(current); + if (retval) + goto out; /* This is the point of no return */ - release_old_signals(oldsig); current->sas_ss_sp = current->sas_ss_size = 0; @@ -602,8 +726,6 @@ int flush_old_exec(struct linux_binprm * flush_thread(); - de_thread(current); - if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || permission(bprm->file->f_dentry->d_inode,MAY_READ)) current->mm->dumpable = 0; @@ -618,14 +740,7 @@ int flush_old_exec(struct linux_binprm * return 0; -mmap_failed: -flush_failed: - spin_lock_irq(&current->sigmask_lock); - if (current->sig != oldsig) { - kmem_cache_free(sigact_cachep, current->sig); - current->sig = oldsig; - } - spin_unlock_irq(&current->sigmask_lock); +out: return retval; } @@ -734,7 +849,7 @@ void compute_creds(struct linux_binprm * if (must_not_trace_exec(current) || atomic_read(&current->fs->count) > 1 || atomic_read(&current->files->count) > 1 - || atomic_read(&current->sig->count) > 1) { + || atomic_read(&current->sighand->count) > 1) { if(!capable(CAP_SETUID)) { bprm->e_uid = current->uid; bprm->e_gid = current->gid; @@ -1085,28 +1200,73 @@ void format_corename(char *corename, con *out_ptr = 0; } -int do_coredump(long signr, struct pt_regs * regs) +static void zap_threads (struct mm_struct *mm) +{ + struct task_struct *g, *p; + + read_lock(&tasklist_lock); + do_each_thread(g,p) + if (mm == p->mm && p != current) { + force_sig_specific(SIGKILL, p); + mm->core_waiters++; + } + while_each_thread(g,p); + + read_unlock(&tasklist_lock); +} + +static void coredump_wait(struct mm_struct *mm) +{ + DECLARE_COMPLETION(startup_done); + + if (mm->core_waiters) + BUG(); + mm->core_waiters++; /* let other threads block */ + mm->core_startup_done = &startup_done; + + /* give other threads a chance to run: */ + yield(); + + zap_threads(mm); + if (--mm->core_waiters) { + up_write(&mm->mmap_sem); + wait_for_completion(&startup_done); + } else + up_write(&mm->mmap_sem); + BUG_ON(mm->core_waiters); +} + +int do_coredump(long signr, int exit_code, struct 
pt_regs * regs) { - struct linux_binfmt * binfmt; char corename[CORENAME_MAX_SIZE + 1]; - struct file * file; + struct mm_struct *mm = current->mm; + struct linux_binfmt * binfmt; struct inode * inode; + struct file * file; int retval = 0; lock_kernel(); binfmt = current->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; - if (!current->mm->dumpable) + down_write(&mm->mmap_sem); + if (!mm->dumpable) { + up_write(&mm->mmap_sem); goto fail; - current->mm->dumpable = 0; + } + mm->dumpable = 0; + init_completion(&mm->core_done); + current->signal->group_exit = 1; + current->signal->group_exit_code = exit_code; + coredump_wait(mm); + if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) - goto fail; + goto fail_unlock; format_corename(corename, core_pattern, signr); file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600); if (IS_ERR(file)) - goto fail; + goto fail_unlock; inode = file->f_dentry->d_inode; if (inode->i_nlink > 1) goto close_fail; /* multiple links - don't dump */ @@ -1124,8 +1284,11 @@ int do_coredump(long signr, struct pt_re retval = binfmt->core_dump(signr, regs, file); + current->signal->group_exit_code |= 0x80; close_fail: filp_close(file, NULL); +fail_unlock: + complete_all(&mm->core_done); fail: unlock_kernel(); return retval; --- linux/fs/fcntl.c.orig +++ linux/fs/fcntl.c @@ -415,24 +415,29 @@ static void send_sigio_to_task(struct ta void send_sigio(struct fown_struct *fown, int fd, int band) { - struct task_struct * p; - int pid = fown->pid; + struct task_struct *p; + int pid; + + pid = fown->pid; + if (!pid) + goto out_unlock_fown; read_lock(&tasklist_lock); - if ( (pid > 0) && (p = find_task_by_pid(pid)) ) { - send_sigio_to_task(p, fown, fd, band); - goto out; - } - for_each_task(p) { - int match = p->pid; - if (pid < 0) - match = -p->pgrp; - if (pid != match) - continue; - send_sigio_to_task(p, fown, fd, band); + if (pid > 0) { + p = find_task_by_pid(pid); + if (p) { + send_sigio_to_task(p, fown, fd, band); + } + } else { + 
struct list_head *l; + struct pid *pidptr; + for_each_task_pid(-pid, PIDTYPE_PGID, p, l, pidptr) { + send_sigio_to_task(p, fown, fd, band); + } } -out: read_unlock(&tasklist_lock); + out_unlock_fown: + ; } static rwlock_t fasync_lock = RW_LOCK_UNLOCKED; @@ -495,7 +500,7 @@ void __kill_fasync(struct fasync_struct /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. */ - if (fown->pid && !(sig == SIGURG && fown->signum == 0)) + if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); fa = fa->fa_next; } --- linux/fs/namespace.c.orig +++ linux/fs/namespace.c @@ -863,11 +863,11 @@ out1: static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) { - struct task_struct *p; + struct task_struct *g, *p; struct fs_struct *fs; read_lock(&tasklist_lock); - for_each_task(p) { + do_each_thread(g, p) { task_lock(p); fs = p->fs; if (fs) { @@ -880,7 +880,7 @@ static void chroot_fs_refs(struct nameid put_fs_struct(fs); } else task_unlock(p); - } + } while_each_thread(g, p); read_unlock(&tasklist_lock); } @@ -985,7 +985,7 @@ static void __init init_mount_tree(void) { struct vfsmount *mnt; struct namespace *namespace; - struct task_struct *p; + struct task_struct *g, *p; mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); if (IS_ERR(mnt)) @@ -1001,10 +1001,10 @@ static void __init init_mount_tree(void) init_task.namespace = namespace; read_lock(&tasklist_lock); - for_each_task(p) { + do_each_thread(g, p) { get_namespace(namespace); p->namespace = namespace; - } + } while_each_thread(g, p); read_unlock(&tasklist_lock); set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); --- linux/fs/stat.c.orig +++ linux/fs/stat.c @@ -25,6 +25,13 @@ do_revalidate(struct dentry *dentry) return 0; } +static inline nlink_t user_nlink(struct inode *inode) +{ + if (inode->i_nlink >= 32*1024) + return 32*1024-1; + return inode->i_nlink; +} + static int do_getattr(struct vfsmount *mnt, 
struct dentry *dentry, struct kstat *stat) { int res = 0; @@ -38,7 +45,7 @@ static int do_getattr(struct vfsmount *m stat->dev = kdev_t_to_nr(inode->i_dev); stat->ino = inode->i_ino; stat->mode = inode->i_mode; - stat->nlink = inode->i_nlink; + stat->nlink = user_nlink(inode); stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->rdev = kdev_t_to_nr(inode->i_rdev); --- linux/fs/binfmt_aout.c.orig +++ linux/fs/binfmt_aout.c @@ -302,6 +302,7 @@ static int load_aout_binary(struct linux (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); + current->mm->free_area_cache = TASK_UNMAPPED_BASE; current->mm->rss = 0; current->mm->mmap = NULL; --- linux/init/main.c.orig +++ linux/init/main.c @@ -98,6 +98,9 @@ extern void pte_chain_init(void); extern void free_initmem(void); +extern void pidhash_init(void); + + #ifdef CONFIG_TC extern void tc_init(void); #endif @@ -303,8 +306,6 @@ static void __init parse_options(char *l extern void setup_arch(char **); extern void cpu_idle(void); -unsigned long wait_init_idle; - #ifndef CONFIG_SMP #ifdef CONFIG_X86_LOCAL_APIC @@ -313,34 +314,24 @@ static void __init smp_init(void) APIC_init_uniprocessor(); } #else -#define smp_init() do { } while (0) +#define smp_init() do { } while (0) #endif #else - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { /* Get other processors into their bootup holding patterns. */ smp_boot_cpus(); - wait_init_idle = cpu_online_map; - clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! 
*/ smp_threads_ready=1; smp_commence(); - - /* Wait for the other cpus to set up their idle processes */ - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle); - while (wait_init_idle) { - cpu_relax(); - barrier(); - } - printk("All processors have done init_idle\n"); } #endif + /* * We need to finalize in a non-__init function or else race conditions * between the root thread and the init thread may cause start_kernel to @@ -350,11 +341,11 @@ static void __init smp_init(void) static void rest_init(void) { - kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + init_idle(current, smp_processor_id()); + kernel_thread(init, NULL, CLONE_KERNEL); unlock_kernel(); - current->need_resched = 1; - cpu_idle(); -} + cpu_idle(); +} /* * Activate the first processor. @@ -402,6 +393,7 @@ asmlinkage void __init start_kernel(void #endif mem_init(); kmem_cache_sizes_init(); + pidhash_init(); pgtable_cache_init(); pte_chain_init(); @@ -431,14 +423,10 @@ asmlinkage void __init start_kernel(void ipc_init(); #endif check_bugs(); + printk("POSIX conformance testing by UNIFIX\n"); - /* - * We count on the initial thread going ok - * Like idlers init is an unlocked kernel thread, which will - * make syscalls (and thus be locked). - */ - smp_init(); + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -467,7 +455,6 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { - /* * Tell the world that we're going to be the grim * reaper of innocent orphaned children. 
@@ -553,6 +540,11 @@ extern void prepare_namespace(void); static int init(void * unused) { lock_kernel(); + + smp_init(); +#if CONFIG_SMP + migration_init(); +#endif do_basic_setup(); prepare_namespace(); --- linux/kernel/Makefile.orig +++ linux/kernel/Makefile @@ -14,7 +14,8 @@ export-objs = signal.o sys.o kmod.o cont obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o lowlat.o profile.o \ module.o exit.o itimer.o info.o time.o softirq.o resource.o \ sysctl.o acct.o capability.o ptrace.o timer.o user.o \ - signal.o sys.o kmod.o context.o kksymoops.o + signal.o sys.o kmod.o context.o kksymoops.o \ + futex.o pid.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o --- linux/kernel/capability.c.orig +++ linux/kernel/capability.c @@ -8,6 +8,8 @@ #include #include +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ + kernel_cap_t cap_bset = CAP_INIT_EFF_SET; /* Note: never hold tasklist_lock while spinning for this one */ @@ -81,17 +83,17 @@ static void cap_set_pg(int pgrp, kernel_cap_t *inheritable, kernel_cap_t *permitted) { - struct task_struct *target; + struct task_struct *target, *g; /* FIXME: do we need to have a write lock here..? */ read_lock(&tasklist_lock); - for_each_task(target) { + do_each_thread(g, target) { if (target->pgrp != pgrp) continue; target->cap_effective = *effective; target->cap_inheritable = *inheritable; target->cap_permitted = *permitted; - } + } while_each_thread(g, target); read_unlock(&tasklist_lock); } @@ -101,18 +103,18 @@ static void cap_set_all(kernel_cap_t *ef kernel_cap_t *inheritable, kernel_cap_t *permitted) { - struct task_struct *target; + struct task_struct *target, *g; /* FIXME: do we need to have a write lock here..? 
*/ read_lock(&tasklist_lock); /* ALL means everyone other than self or 'init' */ - for_each_task(target) { + do_each_thread(g, target) { if (target == current || target->pid == 1) continue; target->cap_effective = *effective; target->cap_inheritable = *inheritable; target->cap_permitted = *permitted; - } + } while_each_thread(g, target); read_unlock(&tasklist_lock); } --- linux/kernel/context.c.orig +++ linux/kernel/context.c @@ -75,10 +75,10 @@ static int context_thread(void *startup) keventd_running = 1; keventd_task = curtask; - spin_lock_irq(&curtask->sigmask_lock); + spin_lock_irq(&curtask->sighand->siglock); siginitsetinv(&curtask->blocked, sigmask(SIGCHLD)); - recalc_sigpending(curtask); - spin_unlock_irq(&curtask->sigmask_lock); + recalc_sigpending_tsk(curtask); + spin_unlock_irq(&curtask->sighand->siglock); complete((struct completion *)startup); @@ -104,10 +104,10 @@ static int context_thread(void *startup) if (signal_pending(curtask)) { while (waitpid(-1, (unsigned int *)0, __WALL|WNOHANG) > 0) ; - spin_lock_irq(&curtask->sigmask_lock); + spin_lock_irq(&curtask->sighand->siglock); flush_signals(curtask); - recalc_sigpending(curtask); - spin_unlock_irq(&curtask->sigmask_lock); + recalc_sigpending_tsk(curtask); + spin_unlock_irq(&curtask->sighand->siglock); } } } --- linux/kernel/exit.c.orig +++ linux/kernel/exit.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -13,13 +14,15 @@ #include #include #include -#ifdef CONFIG_BSD_PROCESS_ACCT #include -#endif +#include +#include +#include +#include +#include #include #include -#include #include extern void sem_exit (void); @@ -27,51 +30,70 @@ extern struct task_struct *child_reaper; int getrusage(struct task_struct *, int, struct rusage *); -static void release_task(struct task_struct * p) +static void __unhash_process(struct task_struct *p) { - if (p != current) { -#ifdef CONFIG_SMP - /* - * Wait to make sure the process isn't on the - * runqueue (active on some other CPU still) - */ - for 
(;;) { - task_lock(p); - if (!task_has_cpu(p)) - break; - task_unlock(p); - do { - cpu_relax(); - barrier(); - } while (task_has_cpu(p)); - } - task_unlock(p); -#endif - atomic_dec(&p->user->processes); - free_uid(p->user); - unhash_process(p); - - release_thread(p); - current->cmin_flt += p->min_flt + p->cmin_flt; - current->cmaj_flt += p->maj_flt + p->cmaj_flt; - current->cnswap += p->nswap + p->cnswap; - /* - * Potentially available timeslices are retrieved - * here - this way the parent does not get penalized - * for creating too many processes. - * - * (this cannot be used to artificially 'generate' - * timeslices, because any timeslice recovered here - * was given away by the parent in the first place.) - */ - current->counter += p->counter; - if (current->counter >= MAX_COUNTER) - current->counter = MAX_COUNTER; - p->pid = 0; - free_task_struct(p); - } else { - printk("task releasing itself\n"); + nr_threads--; + detach_pid(p, PIDTYPE_PID); + detach_pid(p, PIDTYPE_TGID); + if (thread_group_leader(p)) { + detach_pid(p, PIDTYPE_PGID); + detach_pid(p, PIDTYPE_SID); } + + REMOVE_LINKS(p); +} + +void release_task(struct task_struct * p) +{ + task_t *leader; + + BUG_ON(p->state < TASK_ZOMBIE); + + if (p != current) + wait_task_inactive(p); + + atomic_dec(&p->user->processes); + free_uid(p->user); + write_lock_irq(&tasklist_lock); + if (unlikely(p->ptrace)) + __ptrace_unlink(p); + BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); + __exit_signal(p); + __exit_sighand(p); + __unhash_process(p); + + /* + * If we are the last non-leader member of the thread + * group, and the leader is zombie, then notify the + * group leader's parent process. (if it wants notification.) 
+ */ + leader = p->group_leader; + if (leader != p && thread_group_empty(leader) && + leader->state == TASK_ZOMBIE && leader->exit_signal != -1) + do_notify_parent(leader, leader->exit_signal); + + p->parent->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime; + p->parent->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime; + p->parent->group_leader->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime; + p->parent->group_leader->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime; + + p->parent->cmin_flt += p->min_flt + p->cmin_flt; + p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt; + p->parent->cnswap += p->nswap + p->cnswap; + sched_exit(p); + write_unlock_irq(&tasklist_lock); + + release_thread(p); + put_task_struct(p); +} + +/* we are using it only for SMP init */ + +void unhash_process(struct task_struct *p) +{ + write_lock_irq(&tasklist_lock); + __unhash_process(p); + write_unlock_irq(&tasklist_lock); } /* @@ -82,22 +104,23 @@ static void release_task(struct task_str int session_of_pgrp(int pgrp) { struct task_struct *p; - int fallback; + struct list_head *l; + struct pid *pid; + int sid = -1; - fallback = -1; read_lock(&tasklist_lock); - for_each_task(p) { - if (p->session <= 0) - continue; - if (p->pgrp == pgrp) { - fallback = p->session; - break; + for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) + if (p->session > 0) { + sid = p->session; + goto out; } - if (p->pid == pgrp) - fallback = p->session; - } + p = find_task_by_pid(pgrp); + if (p) + sid = p->session; +out: read_unlock(&tasklist_lock); - return fallback; + + return sid; } /* @@ -108,74 +131,156 @@ int session_of_pgrp(int pgrp) * * "I ask you, have you ever known what it is to be an orphan?" 
*/ -static int will_become_orphaned_pgrp(int pgrp, struct task_struct * ignored_task) +static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) { struct task_struct *p; - - read_lock(&tasklist_lock); - for_each_task(p) { - if ((p == ignored_task) || (p->pgrp != pgrp) || - (p->state == TASK_ZOMBIE) || - (p->p_pptr->pid == 1)) + struct list_head *l; + struct pid *pid; + int ret = 1; + + for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + if (p == ignored_task + || p->state >= TASK_ZOMBIE + || p->real_parent->pid == 1) continue; - if ((p->p_pptr->pgrp != pgrp) && - (p->p_pptr->session == p->session)) { - read_unlock(&tasklist_lock); - return 0; + if (p->real_parent->pgrp != pgrp + && p->real_parent->session == p->session) { + ret = 0; + break; } } - read_unlock(&tasklist_lock); - return 1; /* (sighing) "Often!" */ + return ret; /* (sighing) "Often!" */ } int is_orphaned_pgrp(int pgrp) { - return will_become_orphaned_pgrp(pgrp, 0); + int retval; + + read_lock(&tasklist_lock); + retval = will_become_orphaned_pgrp(pgrp, NULL); + read_unlock(&tasklist_lock); + + return retval; } static inline int has_stopped_jobs(int pgrp) { int retval = 0; - struct task_struct * p; + struct task_struct *p; + struct list_head *l; + struct pid *pid; - read_lock(&tasklist_lock); - for_each_task(p) { - if (p->pgrp != pgrp) - continue; + for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { if (p->state != TASK_STOPPED) continue; + + /* If p is stopped by a debugger on a signal that won't + stop it, then don't count p as stopped. This isn't + perfect but it's a good approximation. */ + if (unlikely (p->ptrace) + && p->exit_code != SIGSTOP + && p->exit_code != SIGTSTP + && p->exit_code != SIGTTOU + && p->exit_code != SIGTTIN) + continue; + retval = 1; break; } - read_unlock(&tasklist_lock); return retval; } +/** + * reparent_to_init() - Reparent the calling kernel thread to the init task. 
+ * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to init so that + * it is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_init() gives the caller full capabilities. + */ +void reparent_to_init(void) +{ + write_lock_irq(&tasklist_lock); + + ptrace_unlink(current); + /* Reparent to init */ + REMOVE_LINKS(current); + current->parent = child_reaper; + current->real_parent = child_reaper; + SET_LINKS(current); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + + if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) + set_user_nice(current, 0); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + current->user = INIT_USER; + + write_unlock_irq(&tasklist_lock); +} + /* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the global child reaper process (ie "init") + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. */ -static inline void forget_original_parent(struct task_struct * father) + +void daemonize(void) { - struct task_struct * p; + struct fs_struct *fs; - read_lock(&tasklist_lock); - for_each_task(p) { - if (p->p_opptr == father) { - /* We dont want people slaying init */ - p->exit_signal = SIGCHLD; - p->self_exec_id++; + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. 
+ */ + exit_mm(current); - /* Make sure we're not reparenting to ourselves */ - p->p_opptr = child_reaper; + set_special_pids(1, 1); + current->tty = NULL; - if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0); - } + /* Become as one with the init task */ + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(&current->files->count); + + reparent_to_init(); +} + +void __set_special_pids(pid_t session, pid_t pgrp) +{ + struct task_struct *curr = current; + + if (curr->session != session) { + detach_pid(curr, PIDTYPE_SID); + curr->session = session; + attach_pid(curr, PIDTYPE_SID, session); + } + if (curr->pgrp != pgrp) { + detach_pid(curr, PIDTYPE_PGID); + curr->pgrp = pgrp; + attach_pid(curr, PIDTYPE_PGID, pgrp); } - read_unlock(&tasklist_lock); +} + +void set_special_pids(pid_t session, pid_t pgrp) +{ + write_lock_irq(&tasklist_lock); + __set_special_pids(session, pgrp); + write_unlock_irq(&tasklist_lock); } static inline void close_files(struct files_struct * files) @@ -197,7 +302,6 @@ static inline void close_files(struct fi } i++; set >>= 1; - conditional_schedule(); /* sys_exit, many files open */ } } } @@ -306,19 +410,30 @@ void end_lazy_tlb(struct mm_struct *mm) */ static inline void __exit_mm(struct task_struct * tsk) { - struct mm_struct * mm = tsk->mm; + struct mm_struct *mm = tsk->mm; mm_release(); - if (mm) { - atomic_inc(&mm->mm_count); - BUG_ON(mm != tsk->active_mm); - /* more a memory barrier than a real lock */ - task_lock(tsk); - tsk->mm = NULL; - task_unlock(tsk); - enter_lazy_tlb(mm, current, smp_processor_id()); - mmput(mm); + if (!mm) + return; + /* + * Serialize with any possible pending coredump: + */ + if (mm->core_waiters) { + down_write(&mm->mmap_sem); + if (!--mm->core_waiters) + complete(mm->core_startup_done); + up_write(&mm->mmap_sem); + + wait_for_completion(&mm->core_done); } + atomic_inc(&mm->mm_count);
+ if (mm != tsk->active_mm) BUG(); + /* more a memory barrier than a real lock */ + task_lock(tsk); + tsk->mm = NULL; + enter_lazy_tlb(mm, current, smp_processor_id()); + task_unlock(tsk); + mmput(mm); } void exit_mm(struct task_struct *tsk) @@ -326,15 +441,155 @@ void exit_mm(struct task_struct *tsk) __exit_mm(tsk); } +static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) +{ + /* + * Make sure we're not reparenting to ourselves and that + * the parent is not a zombie. + */ + if (p == reaper || reaper->state >= TASK_ZOMBIE) + p->real_parent = child_reaper; + else + p->real_parent = reaper; + if (p->parent == p->real_parent) + BUG(); +} + +static inline void reparent_thread(task_t *p, task_t *father, int traced) +{ + /* We dont want people slaying init. */ + if (p->exit_signal != -1) + p->exit_signal = SIGCHLD; + p->self_exec_id++; + + if (p->pdeath_signal) + send_sig(p->pdeath_signal, p, 0); + + /* Move the child from its dying parent to the new one. */ + if (unlikely(traced)) { + /* Preserve ptrace links if someone else is tracing this child. */ + list_del_init(&p->ptrace_list); + if (p->parent != p->real_parent) + list_add(&p->ptrace_list, &p->real_parent->ptrace_children); + } else { + /* If this child is being traced, then we're the one tracing it + * anyway, so let go of it. + */ + p->ptrace = 0; + list_del_init(&p->sibling); + p->parent = p->real_parent; + list_add_tail(&p->sibling, &p->parent->children); + + /* If we'd notified the old parent about this child's death, + * also notify the new parent. + */ + if (p->state == TASK_ZOMBIE && p->exit_signal != -1) + do_notify_parent(p, p->exit_signal); + } + + /* + * process group orphan check + * Case ii: Our child is in a different pgrp + * than we are, and it was the only connection + * outside, so the child pgrp is now orphaned. 
+ */ + if ((p->pgrp != father->pgrp) && + (p->session == father->session)) { + int pgrp = p->pgrp; + + if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { + __kill_pg_info(SIGHUP, (void *)1, pgrp); + __kill_pg_info(SIGCONT, (void *)1, pgrp); + } + } +} + +/* + * When we die, we re-parent all our children. + * Try to give them to another thread in our thread + * group, and if no such member exists, give it to + * the global child reaper process (ie "init") + */ +static inline void forget_original_parent(struct task_struct * father) +{ + struct task_struct *p, *reaper = father; + struct list_head *_p, *_n; + + reaper = father->group_leader; + if (reaper == father) + reaper = child_reaper; + + /* + * There are only two places where our children can be: + * + * - in our child list + * - in our ptraced child list + * + * Search them and reparent children. + */ + list_for_each_safe(_p, _n, &father->children) { + p = list_entry(_p,struct task_struct,sibling); + if (father == p->real_parent) { + choose_new_parent(p, reaper, child_reaper); + reparent_thread(p, father, 0); + } else { + ptrace_unlink (p); + if (p->state == TASK_ZOMBIE && p->exit_signal != -1) + do_notify_parent(p, p->exit_signal); + } + } + list_for_each_safe(_p, _n, &father->ptrace_children) { + p = list_entry(_p,struct task_struct,ptrace_list); + choose_new_parent(p, reaper, child_reaper); + reparent_thread(p, father, 1); + } +} + /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ -static void exit_notify(void) +static void exit_notify(struct task_struct *tsk) { - struct task_struct * p, *t; + struct task_struct *t; + + if (signal_pending(tsk) && !tsk->signal->group_exit + && !thread_group_empty(tsk)) { + /* + * This occurs when there was a race between our exit + * syscall and a group signal choosing us as the one to + * wake up. 
It could be that we are the only thread + * alerted to check for pending signals, but another thread + * should be woken now to take the signal since we will not. + * Now we'll wake all the threads in the group just to make + * sure someone gets all the pending signals. + */ + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + for (t = next_thread(tsk); t != tsk; t = next_thread(t)) + if (!signal_pending(t) && !(t->flags & PF_EXITING)) { + recalc_sigpending_tsk(t); + if (signal_pending(t)) + signal_wake_up(t, 0); + } + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + } + + write_lock_irq(&tasklist_lock); + + /* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ + + forget_original_parent(tsk); + BUG_ON(!list_empty(&tsk->children)); - forget_original_parent(current); /* * Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped @@ -345,14 +600,14 @@ static void exit_notify(void) * is about to become orphaned. 
*/ - t = current->p_pptr; + t = tsk->real_parent; - if ((t->pgrp != current->pgrp) && - (t->session == current->session) && - will_become_orphaned_pgrp(current->pgrp, current) && - has_stopped_jobs(current->pgrp)) { - kill_pg(current->pgrp,SIGHUP,1); - kill_pg(current->pgrp,SIGCONT,1); + if ((t->pgrp != tsk->pgrp) && + (t->session == tsk->session) && + will_become_orphaned_pgrp(tsk->pgrp, tsk) && + has_stopped_jobs(tsk->pgrp)) { + __kill_pg_info(SIGHUP, (void *)1, tsk->pgrp); + __kill_pg_info(SIGCONT, (void *)1, tsk->pgrp); } /* Let father know we died @@ -371,96 +626,67 @@ static void exit_notify(void) * */ - if(current->exit_signal != SIGCHLD && - ( current->parent_exec_id != t->self_exec_id || - current->self_exec_id != current->parent_exec_id) + if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && + ( tsk->parent_exec_id != t->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) && !capable(CAP_KILL)) - current->exit_signal = SIGCHLD; + tsk->exit_signal = SIGCHLD; - /* - * This loop does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + /* If something other than our normal parent is ptracing us, then + * send it a SIGCHLD instead of honoring exit_signal. exit_signal + * only has special meaning to our real parent. 
*/ - - write_lock_irq(&tasklist_lock); - current->state = TASK_ZOMBIE; - do_notify_parent(current, current->exit_signal); - while (current->p_cptr != NULL) { - p = current->p_cptr; - current->p_cptr = p->p_osptr; - p->p_ysptr = NULL; - p->ptrace = 0; - - p->p_pptr = p->p_opptr; - p->p_osptr = p->p_pptr->p_cptr; - if (p->p_osptr) - p->p_osptr->p_ysptr = p; - p->p_pptr->p_cptr = p; - if (p->state == TASK_ZOMBIE) - do_notify_parent(p, p->exit_signal); - /* - * process group orphan check - * Case ii: Our child is in a different pgrp - * than we are, and it was the only connection - * outside, so the child pgrp is now orphaned. - */ - if ((p->pgrp != current->pgrp) && - (p->session == current->session)) { - int pgrp = p->pgrp; - - write_unlock_irq(&tasklist_lock); - if (is_orphaned_pgrp(pgrp) && has_stopped_jobs(pgrp)) { - kill_pg(pgrp,SIGHUP,1); - kill_pg(pgrp,SIGCONT,1); - } - write_lock_irq(&tasklist_lock); - } + if (tsk->exit_signal != -1) { + if (tsk->parent == tsk->real_parent) + do_notify_parent(tsk, tsk->exit_signal); + else + do_notify_parent(tsk, SIGCHLD); } - write_unlock_irq(&tasklist_lock); + + tsk->state = TASK_ZOMBIE; + /* + * No need to unlock IRQs, we'll schedule() immediately + * anyway. 
+ */ + write_unlock(&tasklist_lock); } NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; - if (in_interrupt()) + if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); - if (!tsk->pid) + if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (tsk->pid == 1) + if (unlikely(tsk->pid == 1)) panic("Attempted to kill init!"); tsk->flags |= PF_EXITING; del_timer_sync(&tsk->real_timer); profile_exit_task(tsk); -fake_volatile: -#ifdef CONFIG_BSD_PROCESS_ACCT + if (unlikely(current->ptrace & PT_TRACE_EXIT)) + ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); + acct_process(code); -#endif if (current->tux_info) { #ifdef CONFIG_TUX_DEBUG - printk("Possibly unexpected TUX-thread exit(%ld) at %p?\n", - code, __builtin_return_address(0)); + printk("Possibly unexpected TUX-thread exit(%ld) at %p?\n", code, __builtin_return_address(0)); #endif current->tux_exit(); } + __exit_mm(tsk); - lock_kernel(); sem_exit(); __exit_files(tsk); __exit_fs(tsk); exit_namespace(tsk); - exit_sighand(tsk); exit_thread(); - if (current->leader) + if (tsk->leader) disassociate_ctty(1); put_exec_domain(tsk->exec_domain); @@ -468,23 +694,15 @@ fake_volatile: __MOD_DEC_USE_COUNT(tsk->binfmt->module); tsk->exit_code = code; - exit_notify(); + exit_notify(tsk); + + if (tsk->exit_signal == -1) + release_task(tsk); + schedule(); BUG(); -/* - * In order to get rid of the "volatile function does return" message - * I did this little loop that confuses gcc to think do_exit really - * is volatile. In fact it's schedule() that is volatile in some - * circumstances: when current->state = ZOMBIE, schedule() never - * returns. - * - * In fact the natural way to do all this is to have the label and the - * goto right after each other, but I put the fake_volatile label at - * the start of the function just in case something /really/ bad - * happens, and the schedule returns. This way we can try again. 
I'm - * not paranoid: it's just that everybody is out to get me. - */ - goto fake_volatile; + /* Avoid "noreturn function does return". */ + for (;;) ; } NORET_TYPE void complete_and_exit(struct completion *comp, long code) @@ -495,20 +713,276 @@ NORET_TYPE void complete_and_exit(struct do_exit(code); } -asmlinkage long sys_exit(int error_code) +asmlinkage void sys_exit(int error_code) { do_exit((error_code&0xff)<<8); } +void check_tasklist_locked(void) +{ +#if CONFIG_SMP + if (!rwlock_is_locked(&tasklist_lock)) + BUG(); +#endif +} + +task_t *next_thread(task_t *p) +{ + struct pid_link *link = p->pids + PIDTYPE_TGID; + struct list_head *tmp, *head = &link->pidptr->task_list; + +#if CONFIG_SMP + if (!p->sighand) + BUG(); + check_tasklist_locked(); +#endif + tmp = link->pid_chain.next; + if (tmp == head) + tmp = head->next; + + return pid_task(tmp, PIDTYPE_TGID); +} + +/* + * Take down every thread in the group. This is called by fatal signals + * as well as by sys_exit_group (below). + */ +NORET_TYPE void +do_group_exit(int exit_code) +{ + BUG_ON(exit_code & 0x80); /* core dumps don't get here */ + + if (current->signal->group_exit) + exit_code = current->signal->group_exit_code; + else if (!thread_group_empty(current)) { + struct signal_struct *const sig = current->signal; + struct sighand_struct *const sighand = current->sighand; + read_lock(&tasklist_lock); + spin_lock_irq(&sighand->siglock); + if (sig->group_exit) + /* Another thread got here before we took the lock. */ + exit_code = sig->group_exit_code; + else { + sig->group_exit = 1; + sig->group_exit_code = exit_code; + zap_other_threads(current); + } + spin_unlock_irq(&sighand->siglock); + read_unlock(&tasklist_lock); + } + + do_exit(exit_code); + /* NOTREACHED */ +} + +/* + * this kills every thread in the thread group. Note that any externally + * wait4()-ing process will get the correct exit code - even if this + * thread is not the thread group leader. 
+ */ +asmlinkage void sys_exit_group(int error_code) +{ + do_group_exit((error_code & 0xff) << 8); +} + +static int eligible_child(pid_t pid, int options, task_t *p) +{ + if (pid > 0) { + if (p->pid != pid) + return 0; + } else if (!pid) { + if (p->pgrp != current->pgrp) + return 0; + } else if (pid != -1) { + if (p->pgrp != -pid) + return 0; + } + + /* + * Do not consider detached threads that are + * not ptraced: + */ + if (p->exit_signal == -1 && !p->ptrace) + return 0; + + /* Wait for all children (clone and not) if __WALL is set; + * otherwise, wait for clone children *only* if __WCLONE is + * set; otherwise, wait for non-clone children *only*. (Note: + * A "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD.) */ + if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) + && !(options & __WALL)) + return 0; + /* + * Do not consider thread group leaders that are + * in a non-empty thread group: + */ + if (current->tgid != p->tgid && delay_group_leader(p)) + return 2; + + return 1; +} + +/* + * Handle sys_wait4 work for one task in state TASK_ZOMBIE. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_zombie(task_t *p, unsigned int *stat_addr, struct rusage *ru) +{ + unsigned long state; + int retval; + + /* + * Try to move the task's state to DEAD + * only one thread is allowed to do this: + */ + state = xchg(&p->state, TASK_DEAD); + if (state != TASK_ZOMBIE) { + BUG_ON(state != TASK_DEAD); + return 0; + } + if (unlikely(p->exit_signal == -1)) + /* + * This can only happen in a race with a ptraced thread + * dying on another processor. + */ + return 0; + + /* + * Now we are sure this task is interesting, and no other + * thread can reap it because we set its state to TASK_DEAD. + */ + read_unlock(&tasklist_lock); + + retval = ru ? 
getrusage(p, RUSAGE_BOTH, ru) : 0; + if (!retval && stat_addr) { + if (p->signal->group_exit) + retval = put_user(p->signal->group_exit_code, stat_addr); + else + retval = put_user(p->exit_code, stat_addr); + } + if (retval) { + p->state = TASK_ZOMBIE; + return retval; + } + retval = p->pid; + if (p->real_parent != p->parent) { + write_lock_irq(&tasklist_lock); + /* Double-check with lock held. */ + if (p->real_parent != p->parent) { + __ptrace_unlink(p); + do_notify_parent(p, p->exit_signal); + p->state = TASK_ZOMBIE; + p = NULL; + } + write_unlock_irq(&tasklist_lock); + } + if (p != NULL) + release_task(p); + BUG_ON(!retval); + return retval; +} + +/* + * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_stopped(task_t *p, int delayed_group_leader, + unsigned int *stat_addr, struct rusage *ru) +{ + int retval, exit_code; + + if (!p->exit_code) + return 0; + if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && + p->signal && p->signal->group_stop_count > 0) + /* + * A group stop is in progress and this is the group leader. + * We won't report until all threads have stopped. + */ + return 0; + + /* + * Now we are pretty sure this task is interesting. + * Make sure it doesn't get reaped out from under us while we + * give up the lock and then examine it below. We don't want to + * keep holding onto the tasklist_lock while we call getrusage and + * possibly take page faults for user memory. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + write_lock_irq(&tasklist_lock); + + /* + * This uses xchg to be atomic with the thread resuming and setting + * it. It must also be done with the write lock held to prevent a + * race with the TASK_ZOMBIE case. 
+ */ + exit_code = xchg(&p->exit_code, 0); + if (unlikely(p->state > TASK_STOPPED)) { + /* + * The task resumed and then died. Let the next iteration + * catch it in TASK_ZOMBIE. Note that exit_code might + * already be zero here if it resumed and did _exit(0). + * The task itself is dead and won't touch exit_code again; + * other processors in this function are locked out. + */ + p->exit_code = exit_code; + exit_code = 0; + } + if (unlikely(exit_code == 0)) { + /* + * Another thread in this function got to it first, or it + * resumed, or it resumed and then died. + */ + write_unlock_irq(&tasklist_lock); + put_task_struct(p); + read_lock(&tasklist_lock); + return 0; + } + + /* move to end of parent's list to avoid starvation */ + remove_parent(p); + add_parent(p, p->parent); + + write_unlock_irq(&tasklist_lock); + + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + if (!retval && stat_addr) + retval = put_user((exit_code << 8) | 0x7f, stat_addr); + if (!retval) + retval = p->pid; + put_task_struct(p); + + BUG_ON(!retval); + return retval; +} + asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru) { - int flag, retval; DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; + int flag, retval, workaround = 0; if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; + if (current->sighand->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) { + static unsigned long last_timestamp; + + // rate-limit it to 1 per minute: + if (jiffies - last_timestamp > 60*HZ) { + last_timestamp = jiffies; + printk(KERN_INFO "application bug: %s(%d) has SIGCHLD set to SIG_IGN but calls wait().\n", current->comm, current->pid); + printk(KERN_INFO "(see the NOTES section of 'man 2 wait'). 
Workaround activated.\n"); + } + current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; + workaround = 1; + } add_wait_queue(&current->wait_chldexit,&wait); repeat: flag = 0; @@ -517,68 +991,53 @@ repeat: tsk = current; do { struct task_struct *p; - for (p = tsk->p_cptr ; p ; p = p->p_osptr) { - if (pid>0) { - if (p->pid != pid) - continue; - } else if (!pid) { - if (p->pgrp != current->pgrp) - continue; - } else if (pid != -1) { - if (p->pgrp != -pid) - continue; - } - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) - && !(options & __WALL)) + struct list_head *_p; + int ret; + + list_for_each(_p,&tsk->children) { + p = list_entry(_p,struct task_struct,sibling); + + ret = eligible_child(pid, options, p); + if (!ret) continue; flag = 1; + switch (p->state) { case TASK_STOPPED: - if (!p->exit_code) - continue; - if (!(options & WUNTRACED) && !(p->ptrace & PT_PTRACED)) + if (!(options & WUNTRACED) && + !(p->ptrace & PT_PTRACED)) continue; - read_unlock(&tasklist_lock); - retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; - if (!retval && stat_addr) - retval = put_user((p->exit_code << 8) | 0x7f, stat_addr); - if (!retval) { - p->exit_code = 0; - retval = p->pid; - } - goto end_wait4; + retval = wait_task_stopped(p, ret == 2, + stat_addr, ru); + if (retval != 0) /* He released the lock. */ + goto end_wait4; + break; case TASK_ZOMBIE: - current->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime; - current->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime; - read_unlock(&tasklist_lock); - retval = ru ?
getrusage(p, RUSAGE_BOTH, ru) : 0; - if (!retval && stat_addr) - retval = put_user(p->exit_code, stat_addr); - if (retval) - goto end_wait4; - retval = p->pid; - if (p->p_opptr != p->p_pptr) { - write_lock_irq(&tasklist_lock); - REMOVE_LINKS(p); - p->p_pptr = p->p_opptr; - SET_LINKS(p); - do_notify_parent(p, SIGCHLD); - write_unlock_irq(&tasklist_lock); - } else - release_task(p); - goto end_wait4; - default: - continue; + /* + * Eligible but we cannot release it yet: + */ + if (ret == 2) + continue; + retval = wait_task_zombie(p, stat_addr, ru); + if (retval != 0) /* He released the lock. */ + goto end_wait4; + break; + } + } + if (!flag) { + list_for_each (_p,&tsk->ptrace_children) { + p = list_entry(_p,struct task_struct,ptrace_list); + if (!eligible_child(pid, options, p)) + continue; + flag = 1; + break; } } if (options & __WNOTHREAD) break; tsk = next_thread(tsk); + if (tsk->signal != current->signal) + BUG(); } while (tsk != current); read_unlock(&tasklist_lock); if (flag) { @@ -595,10 +1054,12 @@ repeat: end_wait4: current->state = TASK_RUNNING; remove_wait_queue(&current->wait_chldexit,&wait); + if (workaround) + current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_IGN; return retval; } -#if !defined(__alpha__) && !defined(__ia64__) +#if !defined(__alpha__) && !defined(__ia64__) && !defined(__arm__) /* * sys_waitpid() remains for compatibility. waitpid() should be --- linux/kernel/fork.c.orig +++ linux/kernel/fork.c @@ -22,30 +22,61 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include #include +static kmem_cache_t *task_struct_cachep; + +extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern void exit_semundo(struct task_struct *tsk); + /* The idle threads do not count.. */ int nr_threads; -int nr_running; int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes.
*/ -int last_pid; -struct task_struct *pidhash[PIDHASH_SZ]; +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ + +/* + * A per-CPU task cache - this relies on the fact that + * the very last portion of sys_exit() is executed with + * preemption turned off. + */ +static task_t *task_cache[NR_CPUS] __cacheline_aligned; + +void __put_task_struct(struct task_struct *tsk) +{ + int cpu = smp_processor_id(); + + if (tsk != current) { + __free_task_struct(tsk); + return; + } + + tsk = task_cache[cpu]; + if (tsk) + __free_task_struct(tsk); + + task_cache[cpu] = current; +} void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; wait->flags &= ~WQ_FLAG_EXCLUSIVE; - wq_write_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&q->lock, flags); __add_wait_queue(q, wait); - wq_write_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&q->lock, flags); } void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) @@ -53,90 +84,100 @@ void add_wait_queue_exclusive(wait_queue unsigned long flags; wait->flags |= WQ_FLAG_EXCLUSIVE; - wq_write_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&q->lock, flags); __add_wait_queue_tail(q, wait); - wq_write_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&q->lock, flags); } void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; - wq_write_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&q->lock, flags); __remove_wait_queue(q, wait); - wq_write_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&q->lock, flags); +} + +void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + __set_current_state(state); + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, 
int state) +{ + unsigned long flags; + + __set_current_state(state); + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} + +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } } void __init fork_init(unsigned long mempages) { + /* create a slab on which task_structs can be allocated */ + task_struct_cachep = + kmem_cache_create("task_struct", + sizeof(struct task_struct),0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!task_struct_cachep) + panic("fork_init(): cannot create task_struct SLAB cache"); + /* * The default maximum number of threads is set to a safe * value: the thread structures can take up at most half * of memory. */ max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + /* + * we need to allow at least 20 threads to boot a system + */ + if(max_threads < 20) + max_threads = 20; init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; } -/* Protects next_safe and last_pid. */ -spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED; - -static int get_pid(unsigned long flags) +static struct task_struct *dup_task_struct(struct task_struct *orig) { - static int next_safe = PID_MAX; - struct task_struct *p; - int pid, beginpid; + struct task_struct *tsk; + int cpu = smp_processor_id(); - if (flags & CLONE_PID) - return current->pid; + tsk = task_cache[cpu]; + task_cache[cpu] = NULL; - spin_lock(&lastpid_lock); - beginpid = last_pid; - if((++last_pid) & 0xffff8000) { - last_pid = 300; /* Skip daemons etc. 
*/ - goto inside; - } - if(last_pid >= next_safe) { -inside: - next_safe = PID_MAX; - read_lock(&tasklist_lock); - repeat: - for_each_task(p) { - if(p->pid == last_pid || - p->pgrp == last_pid || - p->tgid == last_pid || - p->session == last_pid) { - if(++last_pid >= next_safe) { - if(last_pid & 0xffff8000) - last_pid = 300; - next_safe = PID_MAX; - } - if(unlikely(last_pid == beginpid)) - goto nomorepids; - goto repeat; - } - if(p->pid > last_pid && next_safe > p->pid) - next_safe = p->pid; - if(p->pgrp > last_pid && next_safe > p->pgrp) - next_safe = p->pgrp; - if(p->tgid > last_pid && next_safe > p->tgid) - next_safe = p->tgid; - if(p->session > last_pid && next_safe > p->session) - next_safe = p->session; - } - read_unlock(&tasklist_lock); + if (!tsk) { + tsk = __alloc_task_struct(); + if (!tsk) + return NULL; } - pid = last_pid; - spin_unlock(&lastpid_lock); - - return pid; -nomorepids: - read_unlock(&tasklist_lock); - spin_unlock(&lastpid_lock); - return 0; + memcpy(tsk, orig, sizeof(*tsk)); + atomic_set(&tsk->usage,1); + return tsk; } static inline int dup_mmap(struct mm_struct * mm) @@ -149,6 +190,7 @@ static inline int dup_mmap(struct mm_str mm->mmap = NULL; mm->mmap_cache = NULL; mm->map_count = 0; + mm->free_area_cache = TASK_UNMAPPED_BASE; mm->rss = 0; mm->cpu_vm_mask = 0; pprev = &mm->mmap; @@ -219,7 +261,7 @@ fail_nomem: return retval; } -spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; +spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; int mmlist_nr; #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) @@ -231,7 +273,9 @@ static struct mm_struct * mm_init(struct atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); memset(&mm->mm_stat, 0, sizeof(mm->mm_stat)); + mm->core_waiters = 0; mm->page_table_lock = SPIN_LOCK_UNLOCKED; + mm->free_area_cache = TASK_UNMAPPED_BASE; mm->pgd = pgd_alloc(mm); mm->def_flags = 0; if (mm->pgd) @@ -263,7 +307,7 @@ struct mm_struct * mm_alloc(void) */ inline void 
__mmdrop(struct mm_struct *mm) { - BUG_ON(mm == &init_mm); + if (mm == &init_mm) BUG(); pgd_free(mm->pgd); destroy_context(mm); free_mm(mm); @@ -306,6 +350,16 @@ void mm_release(void) tsk->vfork_done = NULL; complete(vfork_done); } + if (tsk->clear_child_tid) { + int * tidptr = tsk->clear_child_tid; + tsk->clear_child_tid = NULL; + /* + * We dont check the error code - if userspace has + * not set up a proper pointer then tough luck. + */ + put_user(0, tidptr); + sys_futex((unsigned long)tidptr, FUTEX_WAKE, 1, NULL); + } } static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) @@ -332,6 +386,14 @@ static int copy_mm(unsigned long clone_f if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; + + /* + * There are cases where the PTL is held to ensure no + * new threads start up in user mode using an mm, which + * allows optimizing out ipis; the tlb_gather_mmu code + * is an example. + */ + spin_unlock_wait(&oldmm->page_table_lock); goto good_mm; } @@ -355,11 +417,6 @@ static int copy_mm(unsigned long clone_f if (retval) goto free_pt; - /* - * child gets a private LDT (if there was an LDT in the parent) - */ - copy_segments(tsk, mm); - good_mm: tsk->mm = mm; tsk->active_mm = mm; @@ -535,19 +592,42 @@ out_release: static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { - struct signal_struct *sig; + struct sighand_struct *sig; - if (clone_flags & CLONE_SIGHAND) { - atomic_inc(&current->sig->count); + if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { + atomic_inc(&current->sighand->count); return 0; } - sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL); - tsk->sig = sig; + sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); + tsk->sighand = sig; if (!sig) return -1; spin_lock_init(&sig->siglock); atomic_set(&sig->count, 1); - memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action)); + memcpy(sig->action, current->sighand->action, sizeof(sig->action)); + return 0; +} + +static inline int 
copy_signal(unsigned long clone_flags, struct task_struct * tsk) +{ + struct signal_struct *sig; + + if (clone_flags & CLONE_THREAD) { + atomic_inc(&current->signal->count); + return 0; + } + sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + tsk->signal = sig; + if (!sig) + return -1; + atomic_set(&sig->count, 1); + sig->group_exit = 0; + sig->group_exit_code = 0; + sig->group_exit_task = NULL; + sig->group_stop_count = 0; + sig->curr_target = NULL; + init_sigpending(&sig->shared_pending); + return 0; } @@ -562,54 +642,56 @@ static inline void copy_flags(unsigned l p->flags = new_flags; } +asmlinkage int sys_set_tid_address(int *tidptr) +{ + current->clear_child_tid = tidptr; + + return current->pid; +} + /* - * Ok, this is the main fork-routine. It copies the system process - * information (task[nr]) and sets up the necessary registers. It also - * copies the data segment in its entirety. The "stack_start" and - * "stack_top" arguments are simply passed along to the platform - * specific copy_thread() routine. Most platforms ignore stack_top. - * For an example that's using stack_top, see - * arch/ia64/kernel/process.c. + * This creates a new process as a copy of the old one, + * but does not actually start it yet. + * + * It copies the registers, and all the appropriate + * parts of the process environment (as per the clone + * flags). The actual kick-off is left to the caller. 
*/ -int do_fork(unsigned long clone_flags, unsigned long stack_start, - struct pt_regs *regs, unsigned long stack_size) +static struct task_struct *copy_process(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int *parent_tidptr, + int *child_tidptr) { int retval; - struct task_struct *p; - struct completion vfork; + struct task_struct *p = NULL; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) - return -EINVAL; - - retval = -EPERM; + return ERR_PTR(-EINVAL); - /* - * CLONE_PID is only allowed for the initial SMP swapper - * calls + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. */ - if (clone_flags & CLONE_PID) { - if (current->pid) - goto fork_out; - } + if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) + return ERR_PTR(-EINVAL); + if ((clone_flags & CLONE_DETACHED) && !(clone_flags & CLONE_THREAD)) + return ERR_PTR(-EINVAL); + if (!(clone_flags & CLONE_DETACHED) && (clone_flags & CLONE_THREAD)) + return ERR_PTR(-EINVAL); retval = -ENOMEM; - p = alloc_task_struct(); + p = dup_task_struct(current); if (!p) goto fork_out; - *p = *current; p->tux_info = NULL; - retval = -EAGAIN; - /* - * Check if we are over our maximum process limit, but be sure to - * exclude root. This is needed to make it possible for login and - * friends to set the per-user process limit to something lower - * than the amount of processes root is running. 
-- Rik - */ - if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur - && !capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE)) - goto bad_fork_free; + if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur) { + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE)) + goto bad_fork_free; + } atomic_inc(&p->user->__count); atomic_inc(&p->user->processes); @@ -632,21 +714,27 @@ int do_fork(unsigned long clone_flags, u p->state = TASK_UNINTERRUPTIBLE; copy_flags(clone_flags, p); - p->pid = get_pid(clone_flags); - if (p->pid == 0 && current->pid != 0) - goto bad_fork_cleanup; + if (clone_flags & CLONE_IDLETASK) + p->pid = 0; + else { + p->pid = alloc_pidmap(); + if (p->pid == -1) + goto bad_fork_cleanup; + } + + retval = -EFAULT; + if (clone_flags & CLONE_PARENT_SETTID) + if (put_user(p->pid, parent_tidptr)) + goto bad_fork_cleanup; - p->run_list.next = NULL; - p->run_list.prev = NULL; + INIT_LIST_HEAD(&p->run_list); - p->p_cptr = NULL; + INIT_LIST_HEAD(&p->children); + INIT_LIST_HEAD(&p->sibling); init_waitqueue_head(&p->wait_chldexit); p->vfork_done = NULL; - if (clone_flags & CLONE_VFORK) { - p->vfork_done = &vfork; - init_completion(&vfork); - } spin_lock_init(&p->alloc_lock); + spin_lock_init(&p->switch_lock); p->sigpending = 0; init_sigpending(&p->pending); @@ -660,17 +748,18 @@ int do_fork(unsigned long clone_flags, u p->tty_old_pgrp = 0; p->times.tms_utime = p->times.tms_stime = 0; p->times.tms_cutime = p->times.tms_cstime = 0; + p->group_times.tms_utime = p->group_times.tms_stime = 0; + p->group_times.tms_cutime = p->group_times.tms_cstime = 0; #ifdef CONFIG_SMP { int i; - p->cpus_runnable = ~0UL; - p->processor = current->processor; + /* ?? should we just memset this ?? 
*/ - for(i = 0; i < smp_num_cpus; i++) + for(i = 0; i < NR_CPUS; i++) p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; - spin_lock_init(&p->sigmask_lock); } #endif + p->array = NULL; p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; @@ -682,15 +771,25 @@ int do_fork(unsigned long clone_flags, u goto bad_fork_cleanup_files; if (copy_sighand(clone_flags, p)) goto bad_fork_cleanup_fs; - if (copy_mm(clone_flags, p)) + if (copy_signal(clone_flags, p)) goto bad_fork_cleanup_sighand; + if (copy_mm(clone_flags, p)) + goto bad_fork_cleanup_signal; if (copy_namespace(clone_flags, p)) goto bad_fork_cleanup_mm; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_namespace; p->semundo = NULL; - + + if (clone_flags & CLONE_CHILD_SETTID) + p->set_child_tid = child_tidptr; + /* + * Clear TID on mm_release()? + */ + if (clone_flags & CLONE_CHILD_CLEARTID) + p->clear_child_tid = child_tidptr; + /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ @@ -698,19 +797,41 @@ int do_fork(unsigned long clone_flags, u /* ok, now we should be set up.. */ p->swappable = 1; - p->exit_signal = clone_flags & CSIGNAL; + if (clone_flags & CLONE_DETACHED) + p->exit_signal = -1; + else + p->exit_signal = clone_flags & CSIGNAL; p->pdeath_signal = 0; /* - * "share" dynamic priority between parent and child, thus the - * total amount of dynamic priorities in the system doesn't change, - * more scheduling fairness. This is only important in the first - * timeslice, on the long run the scheduling behaviour is unchanged. - */ - p->counter = (current->counter + 1) >> 1; - current->counter >>= 1; - if (!current->counter) - current->need_resched = 1; + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesnt change, + * resulting in more scheduling fairness. 
+ */ + local_irq_disable(); + p->time_slice = (current->time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->first_time_slice = 1; + current->time_slice >>= 1; + p->last_run = jiffies; + if (!current->time_slice) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + scheduler_tick(0, 0); + } + local_irq_enable(); + + if ((int)current->time_slice <= 0) + BUG(); + if ((int)p->time_slice <= 0) + BUG(); /* * Ok, add it to the run-queues and make it @@ -718,47 +839,85 @@ int do_fork(unsigned long clone_flags, u * * Let it rip! */ - retval = p->pid; - p->tgid = retval; - INIT_LIST_HEAD(&p->thread_group); + p->tgid = p->pid; + p->group_leader = p; + INIT_LIST_HEAD(&p->ptrace_children); + INIT_LIST_HEAD(&p->ptrace_list); /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); + /* + * Check for pending SIGKILL! The new thread should not be allowed + * to slip out of an OOM kill. (or normal SIGKILL.) + */ + if (sigismember(&current->pending.signal, SIGKILL)) { + write_unlock_irq(&tasklist_lock); + retval = -EINTR; + goto bad_fork_cleanup_namespace; + } /* CLONE_PARENT re-uses the old parent */ - p->p_opptr = current->p_opptr; - p->p_pptr = current->p_pptr; - if (!(clone_flags & CLONE_PARENT)) { - p->p_opptr = current; - if (!(p->ptrace & PT_PTRACED)) - p->p_pptr = current; - } + if (clone_flags & CLONE_PARENT) + p->real_parent = current->real_parent; + else + p->real_parent = current; + p->parent = p->real_parent; if (clone_flags & CLONE_THREAD) { + spin_lock(&current->sighand->siglock); + /* + * Important: if an exit-all has been started then + * do not create this new thread - the whole thread + * group is supposed to exit anyway. 
+ */ + if (current->signal->group_exit) { + spin_unlock(&current->sighand->siglock); + write_unlock_irq(&tasklist_lock); + goto bad_fork_cleanup_namespace; + } p->tgid = current->tgid; - list_add(&p->thread_group, &current->thread_group); + p->group_leader = current->group_leader; + + if (current->signal->group_stop_count > 0) { + /* + * There is an all-stop in progress for the group. + * We ourselves will stop as soon as we check signals. + * Make the new thread part of that group stop too. + */ + current->signal->group_stop_count++; + p->sigpending = 1; + } + + spin_unlock(&current->sighand->siglock); } SET_LINKS(p); - hash_pid(p); - nr_threads++; - write_unlock_irq(&tasklist_lock); - if (p->ptrace & PT_PTRACED) - send_sig(SIGSTOP, p, 1); + __ptrace_link(p, current->parent); + + attach_pid(p, PIDTYPE_PID, p->pid); + if (thread_group_leader(p)) { + attach_pid(p, PIDTYPE_TGID, p->tgid); + attach_pid(p, PIDTYPE_PGID, p->pgrp); + attach_pid(p, PIDTYPE_SID, p->session); + } else + link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid); - wake_up_process(p); /* do this last */ - ++total_forks; - if (clone_flags & CLONE_VFORK) - wait_for_completion(&vfork); + nr_threads++; + write_unlock_irq(&tasklist_lock); + retval = 0; fork_out: - return retval; + if (retval) + return ERR_PTR(retval); + return p; bad_fork_cleanup_namespace: exit_namespace(p); bad_fork_cleanup_mm: exit_mm(p); +bad_fork_cleanup_signal: + exit_signal(p); bad_fork_cleanup_sighand: exit_sighand(p); bad_fork_cleanup_fs: @@ -766,6 +925,8 @@ bad_fork_cleanup_fs: bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup: + if (p->pid > 0) + free_pidmap(p->pid); put_exec_domain(p->exec_domain); if (p->binfmt && p->binfmt->module) __MOD_DEC_USE_COUNT(p->binfmt->module); @@ -773,12 +934,91 @@ bad_fork_cleanup_count: atomic_dec(&p->user->processes); free_uid(p->user); bad_fork_free: - free_task_struct(p); + p->state = TASK_ZOMBIE; /* debug */ + put_task_struct(p); goto fork_out; } -/* SLAB cache 
for signal_struct structures (tsk->sig) */ -kmem_cache_t *sigact_cachep; +static inline int fork_traceflag (unsigned clone_flags) +{ + if (clone_flags & (CLONE_UNTRACED | CLONE_IDLETASK)) + return 0; + else if (clone_flags & CLONE_VFORK) { + if (current->ptrace & PT_TRACE_VFORK) + return PTRACE_EVENT_VFORK; + } else if ((clone_flags & CSIGNAL) != SIGCHLD) { + if (current->ptrace & PT_TRACE_CLONE) + return PTRACE_EVENT_CLONE; + } else if (current->ptrace & PT_TRACE_FORK) + return PTRACE_EVENT_FORK; + + return 0; +} + +/* + * Ok, this is the main fork-routine. + * + * It copies the process, and if successful kick-starts + * it and waits for it to finish using the VM if required. + */ +struct task_struct *do_fork(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int *parent_tidptr, + int *child_tidptr) +{ + struct task_struct *p; + int trace = 0; + + if (unlikely(current->ptrace)) { + trace = fork_traceflag (clone_flags); + if (trace) + clone_flags |= CLONE_PTRACE; + } + + p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr); + if (!IS_ERR(p)) { + struct completion vfork; + + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); + } + + if (p->ptrace & PT_PTRACED) { + /* + * We'll start up with an immediate SIGSTOP. + */ + sigaddset(&p->pending.signal, SIGSTOP); + p->sigpending = 1; + } + + wake_up_forked_process(p); /* do this last */ + ++total_forks; + + if (unlikely (trace)) { + current->ptrace_message = (unsigned long) p->pid; + ptrace_notify ((trace << 8) | SIGTRAP); + } + + if (clone_flags & CLONE_VFORK) + wait_for_completion(&vfork); + else + /* + * Let the child process run first, to avoid most of the + * COW overhead when the child exec()s afterwards. 
+ */ + set_need_resched(); + } + return p; +} + +/* SLAB cache for signal_struct structures (tsk->signal) */ +kmem_cache_t *signal_cachep; + +/* SLAB cache for sighand_struct structures (tsk->sighand) */ +kmem_cache_t *sighand_cachep; /* SLAB cache for files_struct structures (tsk->files) */ kmem_cache_t *files_cachep; @@ -794,11 +1034,17 @@ kmem_cache_t *mm_cachep; void __init proc_caches_init(void) { - sigact_cachep = kmem_cache_create("signal_act", + sighand_cachep = kmem_cache_create("sighand_cache", + sizeof(struct sighand_struct), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!sighand_cachep) + panic("Cannot create sighand SLAB cache"); + + signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!sigact_cachep) - panic("Cannot create signal action SLAB cache"); + if (!signal_cachep) + panic("Cannot create signal SLAB cache"); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, --- linux/kernel/futex.c.orig +++ linux/kernel/futex.c @@ -0,0 +1,365 @@ +/* + * Fast Userspace Mutexes (which I call "Futexes!"). + * (C) Rusty Russell, IBM 2002 + * + * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly + * enough at me, Linus for the original (flawed) idea, Matthew + * Kirkwood for proof-of-concept implementation. + * + * "The futexes are also cursed." + * "But they come in a choice of three flavours!" + * + * Generalized futexes for every mapping type, Ingo Molnar, 2002 + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FUTEX_HASHBITS 8 + +/* + * We use this hashed waitqueue instead of a normal wait_queue_t, so + * we can wake only the relevant ones (hashed queues may be shared): + */ +struct futex_q { + struct list_head list; + wait_queue_head_t waiters; + + /* Page struct and offset within it. */ + struct page *page; + int offset; + + /* the virtual => physical COW-safe cache */ + vcache_t vcache; +}; + +/* The key for the hash is the address + index + offset within page */ +static struct list_head futex_queues[1<mm->page_table_lock); + spin_lock(&vcache_lock); + spin_lock(&futex_lock); +} + +static inline void unlock_futex_mm(void) +{ + spin_unlock(&futex_lock); + spin_unlock(&vcache_lock); + spin_unlock(&current->mm->page_table_lock); +} + +/* + * The physical page is shared, so we can hash on its address: + */ +static inline struct list_head *hash_futex(struct page *page, int offset) +{ + return &futex_queues[hash_long((unsigned long)page + offset, + FUTEX_HASHBITS)]; +} + +/* Waiter either waiting in FUTEX_WAIT or poll(), or expecting signal */ +static inline void tell_waiter(struct futex_q *q) +{ + wake_up_all(&q->waiters); +} + +/* + * Get kernel address of the user page and pin it. + * + * Must be called with (and returns with) all futex-MM locks held. + */ +static struct page *__pin_page(unsigned long addr) +{ + struct mm_struct *mm = current->mm; + struct page *page, *tmp; + int err; + + /* + * Do a quick atomic lookup first - this is the fastpath. 
+ */ + page = follow_page(mm, addr, 0); + if (likely(page != NULL)) { + if (!PageReserved(page)) + get_page(page); + return page; + } + + /* + * No luck - need to fault in the page: + */ +repeat_lookup: + + unlock_futex_mm(); + + down_read(&mm->mmap_sem); + err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL); + up_read(&mm->mmap_sem); + + lock_futex_mm(); + + if (err < 0) + return NULL; + /* + * Since the faulting happened with locks released, we have to + * check for races: + */ + tmp = follow_page(mm, addr, 0); + if (tmp != page) { + put_page(page); + goto repeat_lookup; + } + + return page; +} + +static inline void unpin_page(struct page *page) +{ + put_page(page); +} + +/* + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: + */ +static int futex_wake(unsigned long uaddr, int offset, int num) +{ + struct list_head *i, *next, *head; + struct page *page; + int ret = 0; + + lock_futex_mm(); + + page = __pin_page(uaddr - offset); + if (!page) { + unlock_futex_mm(); + return -EFAULT; + } + + head = hash_futex(page, offset); + + list_for_each_safe(i, next, head) { + struct futex_q *this = list_entry(i, struct futex_q, list); + + if (this->page == page && this->offset == offset) { + list_del_init(i); + __detach_vcache(&this->vcache); + tell_waiter(this); + ret++; + if (ret >= num) + break; + } + } + + unlock_futex_mm(); + unpin_page(page); + + return ret; +} + +/* + * This gets called by the COW code, we have to rehash any + * futexes that were pending on the old physical page, and + * rehash it to the new physical page. 
The pagetable_lock + * and vcache_lock is already held: + */ +static void futex_vcache_callback(vcache_t *vcache, struct page *new_page) +{ + struct futex_q *q = container_of(vcache, struct futex_q, vcache); + struct list_head *head = hash_futex(new_page, q->offset); + + spin_lock(&futex_lock); + + if (!list_empty(&q->list)) { + q->page = new_page; + list_del(&q->list); + list_add_tail(&q->list, head); + } + + spin_unlock(&futex_lock); +} + +static inline void __queue_me(struct futex_q *q, struct page *page, + unsigned long uaddr, int offset) +{ + struct list_head *head = hash_futex(page, offset); + + q->offset = offset; + q->page = page; + + list_add_tail(&q->list, head); + /* + * We register a futex callback to this virtual address, + * to make sure a COW properly rehashes the futex-queue. + */ + __attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback); +} + +/* Return 1 if we were still queued (ie. 0 means we were woken) */ +static inline int unqueue_me(struct futex_q *q) +{ + int ret = 0; + + spin_lock(&vcache_lock); + spin_lock(&futex_lock); + if (!list_empty(&q->list)) { + list_del(&q->list); + __detach_vcache(&q->vcache); + ret = 1; + } + spin_unlock(&futex_lock); + spin_unlock(&vcache_lock); + return ret; +} + +static int futex_wait(unsigned long uaddr, + int offset, + int val, + unsigned long time) +{ + DECLARE_WAITQUEUE(wait, current); + int ret = 0, curval; + struct page *page; + struct futex_q q; + + init_waitqueue_head(&q.waiters); + + lock_futex_mm(); + + page = __pin_page(uaddr - offset); + if (!page) { + unlock_futex_mm(); + return -EFAULT; + } + __queue_me(&q, page, uaddr, offset); + + unlock_futex_mm(); + + /* Page is pinned, but may no longer be in this address space. 
*/ + if (get_user(curval, (int *)uaddr) != 0) { + ret = -EFAULT; + goto out; + } + if (curval != val) { + ret = -EWOULDBLOCK; + goto out; + } + /* + * The get_user() above might fault and schedule so we + * cannot just set TASK_INTERRUPTIBLE state when queueing + * ourselves into the futex hash. This code thus has to + * rely on the FUTEX_WAKE code doing a wakeup after removing + * the waiter from the list. + */ + add_wait_queue(&q.waiters, &wait); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&q.list)) + time = schedule_timeout(time); + set_current_state(TASK_RUNNING); + /* + * NOTE: we dont remove ourselves from the waitqueue because + * we are the only user of it. + */ + if (time == 0) { + ret = -ETIMEDOUT; + goto out; + } + if (signal_pending(current)) + ret = -EINTR; +out: + /* Were we woken up anyway? */ + if (!unqueue_me(&q)) + ret = 0; + unpin_page(page); + + return ret; +} + +static inline int futex_wait_utime(unsigned long uaddr, + int offset, + int val, + struct timespec* utime) +{ + unsigned long time = MAX_SCHEDULE_TIMEOUT; + + if (utime) { + struct timespec t; + if (copy_from_user(&t, utime, sizeof(t)) != 0) + return -EFAULT; + time = timespec_to_jiffies(&t) + 1; + } + + return futex_wait(uaddr, offset, val, time); +} + +asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime) +{ + unsigned long pos_in_page; + int ret; + + pos_in_page = uaddr % PAGE_SIZE; + + /* Must be "naturally" aligned */ + if (pos_in_page % sizeof(int)) + return -EINVAL; + + switch (op) { + case FUTEX_WAIT: + ret = futex_wait_utime(uaddr, pos_in_page, val, utime); + break; + case FUTEX_WAKE: + ret = futex_wake(uaddr, pos_in_page, val); + break; + /* + * We disable FUTEX_FD support due to risks: it is the least tested + * aspect of futexes (they were broken for many kernel versions and + * no-one noticed) and always were one of the biggest source of bugs. 
+ */ +#if 0 + case FUTEX_FD: + /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ + ret = futex_fd(uaddr, pos_in_page, val); + break; +#endif + default: + ret = -EINVAL; + } + return ret; +} + +static int __init init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) + INIT_LIST_HEAD(&futex_queues[i]); + return 0; +} +__initcall(init); --- linux/kernel/kmod.c.orig +++ linux/kernel/kmod.c @@ -97,8 +97,7 @@ int exec_usermodehelper(char *program_pa int i; struct task_struct *curtask = current; - curtask->session = 1; - curtask->pgrp = 1; + set_special_pids(1, 1); use_init_fs_context(); @@ -108,12 +107,13 @@ int exec_usermodehelper(char *program_pa as the super user right after the execve fails if you time the signal just right. */ - spin_lock_irq(&curtask->sigmask_lock); + spin_lock_irq(&curtask->sighand->siglock); sigemptyset(&curtask->blocked); flush_signals(curtask); flush_signal_handlers(curtask); - recalc_sigpending(curtask); - spin_unlock_irq(&curtask->sigmask_lock); + current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; + recalc_sigpending_tsk(curtask); + spin_unlock_irq(&curtask->sighand->siglock); for (i = 0; i < curtask->files->max_fds; i++ ) { if (curtask->files->fd[i]) close(i); @@ -217,20 +217,20 @@ int request_module(const char * module_n } /* Block everything but SIGKILL/SIGSTOP */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); tmpsig = current->blocked; siginitsetinv(&current->blocked, sigmask(SIGKILL) | sigmask(SIGSTOP)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); waitpid_result = waitpid(pid, NULL, __WCLONE); atomic_dec(&kmod_concurrent); /* Allow signals again.. 
*/ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); current->blocked = tmpsig; - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); if (waitpid_result != pid) { printk(KERN_ERR "request_module[%s]: waitpid(%d,...) failed, errno %d\n", --- linux/kernel/ksyms.c.orig +++ linux/kernel/ksyms.c @@ -478,7 +478,6 @@ EXPORT_SYMBOL(iomem_resource); /* process management */ EXPORT_SYMBOL(complete_and_exit); EXPORT_SYMBOL(__wake_up); -EXPORT_SYMBOL(__wake_up_sync); EXPORT_SYMBOL(wake_up_process); EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); @@ -488,6 +487,11 @@ EXPORT_SYMBOL(schedule); EXPORT_SYMBOL(schedule_timeout); EXPORT_SYMBOL(yield); EXPORT_SYMBOL(__cond_resched); +EXPORT_SYMBOL(set_user_nice); +#ifdef CONFIG_SMP +EXPORT_SYMBOL_GPL(set_cpus_allowed); +#endif +EXPORT_SYMBOL(nr_context_switches); EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); @@ -505,7 +509,6 @@ EXPORT_SYMBOL(loops_per_jiffy); #endif EXPORT_SYMBOL(kstat); -EXPORT_SYMBOL(nr_running); /* misc */ EXPORT_SYMBOL(panic); @@ -617,7 +620,11 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); EXPORT_SYMBOL(init_task_union); EXPORT_SYMBOL(tasklist_lock); -EXPORT_SYMBOL(pidhash); - +EXPORT_SYMBOL_GPL(next_thread); +EXPORT_SYMBOL_GPL(find_task_by_pid); +EXPORT_SYMBOL(sys_wait4); +EXPORT_SYMBOL_GPL(set_special_pids); /* debug */ +extern void check_tasklist_locked(void); +EXPORT_SYMBOL_GPL(check_tasklist_locked); EXPORT_SYMBOL(dump_stack); --- linux/kernel/pid.c.orig +++ linux/kernel/pid.c @@ -0,0 +1,282 @@ +/* + * Generic pidhash and scalable, time-bounded PID allocator + * + * (C) 2002 William Irwin, IBM + * (C) 2002 Ingo Molnar, Red Hat + * + * pid-structures are backing objects for tasks sharing a given ID to chain + * against. There is very little to them aside from hashing them and + * parking tasks using given ID's on a list. 
+ * + * The hash is always changed with the tasklist_lock write-acquired, + * and the hash is only accessed with the tasklist_lock at least + * read-acquired, so there's no additional SMP locking needed here. + * + * We have a list of bitmap pages, which bitmaps represent the PID space. + * Allocating and freeing PIDs is completely lockless. The worst-case + * allocation scenario when all but one out of 1 million PIDs possible are + * allocated already: the scanning of 32 list entries and at most PAGE_SIZE + * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). + */ + +#include +#include +#include +#include + +#define PIDHASH_SIZE 4096 +#define pid_hashfn(nr) ((nr >> 8) ^ nr) & (PIDHASH_SIZE - 1) +static struct list_head pid_hash[PIDTYPE_MAX][PIDHASH_SIZE]; + +int pid_max = PID_MAX_DEFAULT; +int last_pid; + +#define RESERVED_PIDS 300 + +#define PIDMAP_ENTRIES (PID_MAX_LIMIT/PAGE_SIZE/8) +#define BITS_PER_PAGE (PAGE_SIZE*8) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) + +/* + * PID-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way a low pid_max + * value does not cause lots of bitmaps to be allocated, but + * the scheme scales to up to 4 million PIDs, runtime. + */ +typedef struct pidmap { + atomic_t nr_free; + void *page; +} pidmap_t; + +static pidmap_t pidmap_array[PIDMAP_ENTRIES] = + { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; + +static pidmap_t *map_limit = pidmap_array + PIDMAP_ENTRIES; + +static spinlock_t pidmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; + +inline void free_pidmap(int pid) +{ + pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; + int offset = pid & BITS_PER_PAGE_MASK; + + clear_bit(offset, map->page); + atomic_inc(&map->nr_free); +} + +/* + * Here we search for the next map that has free bits left. + * Normally the next map has free PIDs. 
+ */ +static inline pidmap_t *next_free_map(pidmap_t *map, int *max_steps) +{ + while (--*max_steps) { + if (++map == map_limit) + map = pidmap_array; + if (unlikely(!map->page)) { + unsigned long page = get_zeroed_page(GFP_KERNEL); + /* + * Free the page if someone raced with us + * installing it: + */ + spin_lock(&pidmap_lock); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock(&pidmap_lock); + + if (!map->page) + break; + } + if (atomic_read(&map->nr_free)) + return map; + } + return NULL; +} + +int alloc_pidmap(void) +{ + int pid, offset, max_steps = PIDMAP_ENTRIES + 1; + pidmap_t *map; + + pid = last_pid + 1; + if (pid >= pid_max) + pid = RESERVED_PIDS; + + offset = pid & BITS_PER_PAGE_MASK; + map = pidmap_array + pid / BITS_PER_PAGE; + + if (likely(map->page && !test_and_set_bit(offset, map->page))) { + /* + * There is a small window for last_pid updates to race, + * but in that case the next allocation will go into the + * slowpath and that fixes things up. 
+ */ +return_pid: + atomic_dec(&map->nr_free); + last_pid = pid; + return pid; + } + + if (!offset || !atomic_read(&map->nr_free)) { +next_map: + map = next_free_map(map, &max_steps); + if (!map) + goto failure; + offset = 0; + } + /* + * Find the next zero bit: + */ +scan_more: + offset = find_next_zero_bit(map->page, BITS_PER_PAGE, offset); + if (offset >= BITS_PER_PAGE) + goto next_map; + if (test_and_set_bit(offset, map->page)) + goto scan_more; + + /* we got the PID: */ + pid = (map - pidmap_array) * BITS_PER_PAGE + offset; + goto return_pid; + +failure: + return -1; +} + +inline struct pid *find_pid(enum pid_type type, int nr) +{ + struct list_head *elem, *bucket = &pid_hash[type][pid_hashfn(nr)]; + struct pid *pid; + +#if CONFIG_SMP + BUG_ON(!rwlock_is_locked(&tasklist_lock)); +#endif + __list_for_each(elem, bucket) { + pid = list_entry(elem, struct pid, hash_chain); + if (pid->nr == nr) + return pid; + } + return NULL; +} + +void link_pid(task_t *task, struct pid_link *link, struct pid *pid) +{ + atomic_inc(&pid->count); + list_add_tail(&link->pid_chain, &pid->task_list); + link->pidptr = pid; +} + +int attach_pid(task_t *task, enum pid_type type, int nr) +{ + struct pid *pid = find_pid(type, nr); + + if (pid) + atomic_inc(&pid->count); + else { + pid = &task->pids[type].pid; + pid->nr = nr; + atomic_set(&pid->count, 1); + INIT_LIST_HEAD(&pid->task_list); + pid->task = task; + get_task_struct(task); + list_add(&pid->hash_chain, &pid_hash[type][pid_hashfn(nr)]); + } + list_add_tail(&task->pids[type].pid_chain, &pid->task_list); + task->pids[type].pidptr = pid; + + return 0; +} + +static inline int __detach_pid(task_t *task, enum pid_type type) +{ + struct pid_link *link = task->pids + type; + struct pid *pid = link->pidptr; + int nr; + + list_del(&link->pid_chain); + if (!atomic_dec_and_test(&pid->count)) + return 0; + + nr = pid->nr; + list_del(&pid->hash_chain); + put_task_struct(pid->task); + + return nr; +} + +static void _detach_pid(task_t *task, enum 
pid_type type) +{ + __detach_pid(task, type); +} + +void detach_pid(task_t *task, enum pid_type type) +{ + int nr = __detach_pid(task, type); + + if (!nr) + return; + + for (type = 0; type < PIDTYPE_MAX; ++type) + if (find_pid(type, nr)) + return; + free_pidmap(nr); +} + +task_t *find_task_by_pid(int nr) +{ + struct pid *pid = find_pid(PIDTYPE_PID, nr); + + if (!pid) + return NULL; + return pid_task(pid->task_list.next, PIDTYPE_PID); +} + +/* + * This function switches the PIDs if a non-leader thread calls + * sys_execve() - this must be done without releasing the PID. + * (which a detach_pid() would eventually do.) + */ +void switch_exec_pids(task_t *leader, task_t *thread) +{ + _detach_pid(leader, PIDTYPE_PID); + _detach_pid(leader, PIDTYPE_TGID); + _detach_pid(leader, PIDTYPE_PGID); + _detach_pid(leader, PIDTYPE_SID); + + _detach_pid(thread, PIDTYPE_PID); + _detach_pid(thread, PIDTYPE_TGID); + + leader->pid = leader->tgid = thread->pid; + thread->pid = thread->tgid; + + attach_pid(thread, PIDTYPE_PID, thread->pid); + attach_pid(thread, PIDTYPE_TGID, thread->tgid); + attach_pid(thread, PIDTYPE_PGID, thread->pgrp); + attach_pid(thread, PIDTYPE_SID, thread->session); + + attach_pid(leader, PIDTYPE_PID, leader->pid); + attach_pid(leader, PIDTYPE_TGID, leader->tgid); + attach_pid(leader, PIDTYPE_PGID, leader->pgrp); + attach_pid(leader, PIDTYPE_SID, leader->session); +} + +void __init pidhash_init(void) +{ + int i, j; + + /* + * Allocate PID 0, and hash it via all PID types: + */ + pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); + set_bit(0, pidmap_array->page); + atomic_dec(&pidmap_array->nr_free); + + write_lock_irq(&tasklist_lock); + for (i = 0; i < PIDTYPE_MAX; i++) { + for (j = 0; j < PIDHASH_SIZE; j++) + INIT_LIST_HEAD(&pid_hash[i][j]); + attach_pid(current, i, 0); + } + write_unlock_irq(&tasklist_lock); +} --- linux/kernel/printk.c.orig +++ linux/kernel/printk.c @@ -26,6 +26,7 @@ #include #include /* For in_interrupt() */ #include +#include #include 
--- linux/kernel/profile.c.orig +++ linux/kernel/profile.c @@ -27,7 +27,7 @@ int __init profile_setup(char * str) void __init profile_init(void) { unsigned int size; - + if (!prof_shift) return; --- linux/kernel/ptrace.c.orig +++ linux/kernel/ptrace.c @@ -11,12 +11,51 @@ #include #include #include +#include #include +#include #include #include /* + * ptrace a task: make the debugger its new parent and + * move it to the ptrace list. + * + * Must be called with the tasklist lock write-held. + */ +void __ptrace_link(task_t *child, task_t *new_parent) +{ + if (!list_empty(&child->ptrace_list)) + BUG(); + if (child->parent == new_parent) + return; + list_add(&child->ptrace_list, &child->parent->ptrace_children); + REMOVE_LINKS(child); + child->parent = new_parent; + SET_LINKS(child); +} + +/* + * unptrace a task: move it back to its original parent and + * remove it from the ptrace list. + * + * Must be called with the tasklist lock write-held. + */ +void __ptrace_unlink(task_t *child) +{ + if (!child->ptrace) + BUG(); + child->ptrace = 0; + if (list_empty(&child->ptrace_list)) + return; + list_del_init(&child->ptrace_list); + REMOVE_LINKS(child); + child->parent = child->real_parent; + SET_LINKS(child); +} + +/* * Check that we have indeed attached to the thing.. */ int ptrace_check_attach(struct task_struct *child, int kill) @@ -24,28 +63,13 @@ int ptrace_check_attach(struct task_stru if (!(child->ptrace & PT_PTRACED)) return -ESRCH; - if (child->p_pptr != current) + if (child->parent != current) return -ESRCH; if (!kill) { if (child->state != TASK_STOPPED) return -ESRCH; -#ifdef CONFIG_SMP - /* Make sure the child gets off its CPU.. */ - for (;;) { - task_lock(child); - if (!task_has_cpu(child)) - break; - task_unlock(child); - do { - if (child->state != TASK_STOPPED) - return -ESRCH; - barrier(); - cpu_relax(); - } while (task_has_cpu(child)); - } - task_unlock(child); -#endif + wait_task_inactive(child); } /* All systems go.. 
*/ @@ -54,7 +78,9 @@ int ptrace_check_attach(struct task_stru int ptrace_attach(struct task_struct *task) { + int retval; task_lock(task); + retval = -EPERM; if (task->pid <= 1) goto bad; if (task == current) @@ -66,7 +92,6 @@ int ptrace_attach(struct task_struct *ta (current->uid != task->uid) || (current->gid != task->egid) || (current->gid != task->sgid) || - (!cap_issubset(task->cap_permitted, current->cap_permitted)) || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) goto bad; rmb(); @@ -75,6 +100,7 @@ int ptrace_attach(struct task_struct *ta /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; + retval = 0; /* Go */ task->ptrace |= PT_PTRACED; @@ -83,19 +109,15 @@ int ptrace_attach(struct task_struct *ta task_unlock(task); write_lock_irq(&tasklist_lock); - if (task->p_pptr != current) { - REMOVE_LINKS(task); - task->p_pptr = current; - SET_LINKS(task); - } + __ptrace_link(task, current); write_unlock_irq(&tasklist_lock); - send_sig(SIGSTOP, task, 1); + force_sig_specific(SIGSTOP, task); return 0; bad: task_unlock(task); - return -EPERM; + return retval; } int ptrace_detach(struct task_struct *child, unsigned int data) @@ -107,16 +129,15 @@ int ptrace_detach(struct task_struct *ch ptrace_disable(child); /* .. re-parent .. */ - child->ptrace = 0; child->exit_code = data; + write_lock_irq(&tasklist_lock); - REMOVE_LINKS(child); - child->p_pptr = child->p_opptr; - SET_LINKS(child); + __ptrace_unlink(child); + /* .. and wake it up. */ + if (child->state != TASK_ZOMBIE) + wake_up_process(child); write_unlock_irq(&tasklist_lock); - /* .. and wake it up. 
*/ - wake_up_process(child); return 0; } @@ -133,12 +154,7 @@ int access_process_vm(struct task_struct struct page *page; void *old_buf = buf; - /* Worry about races with exit() */ - task_lock(tsk); - mm = tsk->mm; - if (mm) - atomic_inc(&mm->mm_users); - task_unlock(tsk); + mm = get_task_mm(tsk); if (!mm) return 0; @@ -164,13 +180,13 @@ int access_process_vm(struct task_struct if (write) { memcpy(maddr + offset, buf, bytes); flush_page_to_ram(page); - flush_icache_user_range(vma, page, addr, len); + flush_icache_user_range(vma, page, addr, bytes); } else { memcpy(buf, maddr + offset, bytes); flush_page_to_ram(page); } kunmap(page); - put_page(page); + page_cache_release(page); len -= bytes; buf += bytes; addr += bytes; @@ -230,3 +246,108 @@ int ptrace_writedata(struct task_struct } return copied; } + +static int ptrace_setoptions(struct task_struct *child, long data) +{ + if (data & PTRACE_O_TRACESYSGOOD) + child->ptrace |= PT_TRACESYSGOOD; + else + child->ptrace &= ~PT_TRACESYSGOOD; + + if (data & PTRACE_O_TRACEFORK) + child->ptrace |= PT_TRACE_FORK; + else + child->ptrace &= ~PT_TRACE_FORK; + + if (data & PTRACE_O_TRACEVFORK) + child->ptrace |= PT_TRACE_VFORK; + else + child->ptrace &= ~PT_TRACE_VFORK; + + if (data & PTRACE_O_TRACECLONE) + child->ptrace |= PT_TRACE_CLONE; + else + child->ptrace &= ~PT_TRACE_CLONE; + + if (data & PTRACE_O_TRACEEXEC) + child->ptrace |= PT_TRACE_EXEC; + else + child->ptrace &= ~PT_TRACE_EXEC; + + if (data & PTRACE_O_TRACEVFORKDONE) + child->ptrace |= PT_TRACE_VFORK_DONE; + else + child->ptrace &= ~PT_TRACE_VFORK_DONE; + + if (data & PTRACE_O_TRACEEXIT) + child->ptrace |= PT_TRACE_EXIT; + else + child->ptrace &= ~PT_TRACE_EXIT; + + if ((data & (PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK + | PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE + | PTRACE_O_TRACEEXEC | PTRACE_O_TRACEEXIT + | PTRACE_O_TRACEVFORKDONE)) + != data) + return -EINVAL; + + return 0; +} + +static int ptrace_getsiginfo(struct task_struct *child, long data) +{ + if 
(child->last_siginfo == NULL) + return -EINVAL; + return copy_siginfo_to_user ((siginfo_t *) data, child->last_siginfo); +} + +static int ptrace_setsiginfo(struct task_struct *child, long data) +{ + if (child->last_siginfo == NULL) + return -EINVAL; + if (copy_from_user (child->last_siginfo, (siginfo_t *) data, + sizeof (siginfo_t)) != 0) + return -EFAULT; + return 0; +} + +int ptrace_request(struct task_struct *child, long request, + long addr, long data) +{ + int ret = -EIO; + + switch (request) { +#ifdef PTRACE_OLDSETOPTIONS + case PTRACE_OLDSETOPTIONS: +#endif + case PTRACE_SETOPTIONS: + ret = ptrace_setoptions(child, data); + break; + case PTRACE_GETEVENTMSG: + ret = put_user(child->ptrace_message, (unsigned long *) data); + break; + case PTRACE_GETSIGINFO: + ret = ptrace_getsiginfo(child, data); + break; + case PTRACE_SETSIGINFO: + ret = ptrace_setsiginfo(child, data); + break; + default: + break; + } + + return ret; +} + +void ptrace_notify(int exit_code) +{ + BUG_ON (!(current->ptrace & PT_PTRACED)); + + /* Let the debugger run. */ + current->exit_code = exit_code; + set_current_state(TASK_STOPPED); + notify_parent(current, SIGCHLD); + schedule(); + /* Signals sent while we're stopped might not set sigpending. */ + recalc_sigpending(); +} --- linux/kernel/sched.c.orig +++ linux/kernel/sched.c @@ -1,760 +1,1158 @@ /* - * linux/kernel/sched.c + * kernel/sched.c * * Kernel scheduler and related syscalls * - * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1991-2002 Linus Torvalds * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe + * make semaphores SMP safe * 1998-11-19 Implemented schedule_timeout() and related stuff * by Andrea Arcangeli - * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. 
Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. */ -/* - * 'sched.c' is the main kernel file. It contains scheduling primitives - * (sleep_on, wakeup, schedule etc) as well as a number of simple system - * call functions (type getpid()), which just extract a field from - * current-task - */ - -#include #include +#include #include +#include +#include #include -#include +#include #include -#include #include -#include -#include +#include +#include +#include +#include +#include -#include -#include +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) -extern void timer_bh(void); -extern void tqueue_bh(void); -extern void immediate_bh(void); +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, + * maximum timeslice is 200 msecs. Timeslices get refilled after + * they expire. + */ +#define MIN_TIMESLICE ( 10 * HZ / 1000) +#define MAX_TIMESLICE (200 * HZ / 1000) +#define CHILD_PENALTY 50 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (10*HZ) +#define STARVATION_LIMIT (30*HZ) +#define SYNC_WAKEUPS 1 +#define SMART_WAKE_CHILD 1 /* - * scheduler variables + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. 
(it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. */ -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ + INTERACTIVE_DELTA) -extern void mem_use(void); +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) /* - * Scheduling quanta. + * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] + * to time slice values. * - * NOTE! The unix "nice" value influences how long a process - * gets. The nice value ranges from -20 to +19, where a -20 - * is a "high-priority" task, and a "+10" is a low-priority - * task. + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. * - * We want the time-slice to be around 50ms or so, so this - * calculation depends on the value of HZ. 
+ * task_timeslice() is the interface that is used by the scheduler. */ -#if HZ < 200 -#define TICK_SCALE(x) ((x) >> 2) -#elif HZ < 400 -#define TICK_SCALE(x) ((x) >> 1) -#elif HZ < 800 -#define TICK_SCALE(x) (x) -#elif HZ < 1600 -#define TICK_SCALE(x) ((x) << 1) -#else -#define TICK_SCALE(x) ((x) << 2) -#endif -#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1) +#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1))) +static inline unsigned int task_timeslice(task_t *p) +{ + return BASE_TIMESLICE(p); +} /* - * Init task must be ok at boot for the ix86 as we will check its signals - * via the SMP irq return path. + * These are the runqueue data structures: */ - -struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; + +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +typedef struct runqueue runqueue_t; + +struct prio_array { + int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; /* - * The tasklist_lock protects the linked list of processes. - * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. + * This is the main, per-CPU runqueue data structure. * - * If both locks are to be concurrently held, the runqueue_lock - * nests inside the tasklist_lock. - * - * task->alloc_lock nests inside tasklist_lock. + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. 
*/ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ +struct runqueue { + spinlock_t lock; + unsigned long nr_running, nr_switches, expired_timestamp, + nr_uninterruptible; + task_t *curr, *idle; + struct mm_struct *prev_mm; + prio_array_t *active, *expired, arrays[2]; + int prev_nr_running[NR_CPUS]; + + task_t *migration_thread; + struct list_head migration_queue; + + atomic_t nr_iowait; +} ____cacheline_aligned; + +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; + +#define cpu_rq(cpu) (runqueues + (cpu)) +#define this_rq() cpu_rq(smp_processor_id()) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define rt_task(p) ((p)->prio < MAX_RT_PRIO) -static LIST_HEAD(runqueue_head); +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while(0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) +#endif /* - * We align per-CPU scheduling data on cacheline boundaries, - * to prevent cacheline ping-pong. + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. 
*/ -static union { - struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; - } schedule_data; - char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +{ + struct runqueue *rq; -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} -struct kernel_stat kstat; -extern struct task_struct *child_reaper; +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} -#ifdef CONFIG_SMP +/* + * rq_lock - lock a given runqueue and disable interrupts. + */ +static inline runqueue_t *this_rq_lock(void) +{ + runqueue_t *rq; -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) -#define can_schedule(p,cpu) \ - ((p)->cpus_runnable & (p)->cpus_allowed & (1UL << cpu)) + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); -#else + return rq; +} -#define idle_task(cpu) (&init_task) -#define can_schedule(p,cpu) (1) +static inline void rq_unlock(runqueue_t *rq) +{ + spin_unlock_irq(&rq->lock); +} -#endif +/* + * Adding/removing a task to/from a priority array: + */ +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} -void scheduling_functions_start_here(void) { } +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} /* - * This is the function that decides how 
desirable a process is.. - * You can weigh different processes against each other depending - * on what CPU they've run on lately etc to try to handle cache - * and TLB miss penalties. + * effective_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. * - * Return values: - * -1000: never select this - * 0: out of time, recalculate counters (but it might still be - * selected) - * +ve: "goodness" value (the larger, the better) - * +1000: realtime process, select this. + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. */ +static inline int effective_prio(task_t *p) +{ + int bonus, prio; + + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - + MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; +} + +/* + * activate_task - move a task to the runqueue. + + * Also update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) { - int weight; + enqueue_task(p, rq->active); + rq->nr_running++; +} - /* - * select the current process after every other - * runnable process, but before the idle thread. - * Also, dont trigger a counter recalculation. - */ - weight = -1; - if (p->policy & SCHED_YIELD) - goto out; +static inline void activate_task(task_t *p, runqueue_t *rq) +{ + unsigned long sleep_time = jiffies - p->last_run; - /* - * Non-RT process - normal case first. 
- */ - if (p->policy == SCHED_OTHER) { + if (!rt_task(p) && sleep_time) { /* - * Give the process a first-approximation goodness value - * according to the number of clock-ticks it has left. - * - * Don't do any other calculations if the time slice is - * over.. + * This code gives a bonus to interactive tasks. We update + * an 'average sleep time' value here, based on + * ->last_run. The more time a task spends sleeping, + * the higher the average gets - and the higher the priority + * boost gets as well. */ - weight = p->counter; - if (!weight) - goto out; - -#ifdef CONFIG_SMP - /* Give a largish advantage to the same processor... */ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; -#endif - - /* .. and a slight advantage to the current MM */ - if (p->mm == this_mm || !p->mm) - weight += 1; - weight += 20 - p->nice; - goto out; + p->sleep_avg += sleep_time; + if (p->sleep_avg > MAX_SLEEP_AVG) + p->sleep_avg = MAX_SLEEP_AVG; + p->prio = effective_prio(p); } - - /* - * Realtime process, select the first one on the - * runqueue (taking priorities within processes - * into account). - */ - weight = 1000 + p->rt_priority; -out: - return weight; + __activate_task(p, rq); } /* - * the 'goodness value' of replacing a process on a given CPU. - * positive value means 'replace', zero or negative means 'dont'. + * deactivate_task - remove a task from the runqueue. */ -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); + rq->nr_running--; + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + dequeue_task(p, p->array); + p->array = NULL; } /* - * This is ugly, but reschedule_idle() is very timing-critical. 
- * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. */ -static FASTCALL(void reschedule_idle(struct task_struct * p)); - -static void reschedule_idle(struct task_struct * p) +static inline void resched_task(task_t *p) { #ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; + int need_resched; - /* - * shortcut if the woken up task's last CPU is - * idle now. - */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: - /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. - */ - need_resched = tsk->need_resched; - tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); - return; - } - } + need_resched = p->need_resched; + wmb(); + set_tsk_need_resched(p); + if (!need_resched && (task_cpu(p) != smp_processor_id())) + smp_send_reschedule(task_cpu(p)); +#else + set_tsk_need_resched(p); +#endif +} +#ifdef CONFIG_SMP - /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. (that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. - */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 0; - - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); +/* + * wait_task_inactive - wait for a thread to unschedule. 
+ * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. + */ +void wait_task_inactive(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + +repeat: + rq = task_rq(p); + if (unlikely(task_running(rq, p))) { + cpu_relax(); /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. + * enable/disable preemption just to make this + * a preemption point - we are busy-waiting + * anyway. */ - if (tsk == idle_task(cpu)) { -#if defined(__i386__) && defined(CONFIG_SMP) - /* - * Check if two siblings are idle in the same - * physical package. Use them if found. - */ - if (smp_num_siblings == 2) { - if (cpu_curr(cpu_sibling_map[cpu]) == - idle_task(cpu_sibling_map[cpu])) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - break; - } - - } -#endif - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - } - } else { - if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); - - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; - } - } - } + goto repeat; } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != -1ULL) { - best_cpu = tsk->processor; - goto send_now_idle; - } - tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); - } - return; - - -#else /* UP */ - int this_cpu = smp_processor_id(); - struct task_struct *tsk; - - tsk = cpu_curr(this_cpu); - if (preemption_goodness(tsk, p, this_cpu) > 0) - tsk->need_resched = 1; -#endif + rq = task_rq_lock(p, &flags); + if (unlikely(task_running(rq, p))) { + task_rq_unlock(rq, &flags); + goto repeat; + } + task_rq_unlock(rq, &flags); } /* - * Careful! + * kick_if_running - kick the remote CPU if the task is running currently. * - * This has to add the process to the _end_ of the - * run-queue, not the beginning. 
The goodness value will - * determine whether this process will run next. This is - * important to get SCHED_FIFO and SCHED_RR right, where - * a process that is either pre-empted or its time slice - * has expired, should be moved to the tail of the run - * queue for its priority - Bhavesh Davda + * This code is used by the signal code to signal tasks + * which are in user-mode, as quickly as possible. + * + * (Note that we do this lockless - if the task does anything + * while the message is in flight then it will notice the + * sigpending condition anyway.) */ -static inline void add_to_runqueue(struct task_struct * p) +void kick_if_running(task_t * p) { - list_add_tail(&p->run_list, &runqueue_head); - nr_running++; + if ((task_running(task_rq(p), p)) && (task_cpu(p) != smp_processor_id())) + resched_task(p); } -static inline void move_last_runqueue(struct task_struct * p) -{ - list_del(&p->run_list); - list_add_tail(&p->run_list, &runqueue_head); -} +#endif -/* - * Wake up a process. Put it on the run-queue if it's not - * already there. The "current" process is always on the - * run-queue (except when the actual re-schedule is in - * progress), and as such you're allowed to do the simpler - * "current->state = TASK_RUNNING" to mark yourself runnable - * without the overhead of this. +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. 
*/ -static inline int try_to_wake_up(struct task_struct * p, int synchronous) +static int try_to_wake_up(task_t * p, unsigned int state, int sync) { unsigned long flags; int success = 0; + long old_state; + runqueue_t *rq; + + sync &= SYNC_WAKEUPS; +repeat_lock_task: + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (old_state & state) { + if (!p->array) { + /* + * Fast-migrate the task if it's not running or runnable + * currently. Do not violate hard affinity. + */ + if (unlikely(sync && !task_running(rq, p) && + (task_cpu(p) != smp_processor_id()) && + (p->cpus_allowed & (1UL << smp_processor_id())))) { + + set_task_cpu(p, smp_processor_id()); + task_rq_unlock(rq, &flags); + goto repeat_lock_task; + } + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + if (sync) + __activate_task(p, rq); + else { + activate_task(p, rq); + if (p->prio < rq->curr->prio) + resched_task(rq->curr); + } + success = 1; + } + if (p->state >= TASK_ZOMBIE) + BUG(); + p->state = TASK_RUNNING; + } + task_rq_unlock(rq, &flags); - /* - * We want the common case fall through straight, thus the goto. 
- */ - spin_lock_irqsave(&runqueue_lock, flags); - p->state = TASK_RUNNING; - if (task_on_runqueue(p)) - goto out; - add_to_runqueue(p); - if (!synchronous || !(p->cpus_allowed & (1UL << smp_processor_id()))) - reschedule_idle(p); - success = 1; -out: - spin_unlock_irqrestore(&runqueue_lock, flags); return success; } -inline int wake_up_process(struct task_struct * p) +int wake_up_process(task_t * p) { - return try_to_wake_up(p, 0); + return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); } -static void process_timeout(unsigned long __data) +int wake_up_state(task_t *p, unsigned int state) { - struct task_struct * p = (struct task_struct *) __data; - - wake_up_process(p); + return try_to_wake_up(p, state, 0); } -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. +/* + * wake_up_forked_process - wake up a freshly forked process. * - * In all cases the return value is guaranteed to be non-negative. + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created process. 
*/ -signed long schedule_timeout(signed long timeout) +void wake_up_forked_process(task_t * p) { - struct timer_list timer; - unsigned long expire; + runqueue_t *rq = this_rq_lock(); - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: + p->state = TASK_RUNNING; + if (!rt_task(p)) { /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. */ - if (timeout < 0) - { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); - current->state = TASK_RUNNING; - goto out; + current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; + p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; + p->prio = effective_prio(p); + } + set_task_cpu(p, smp_processor_id()); + + if (SMART_WAKE_CHILD) { + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, &current->run_list); + p->array = current->array; + p->array->nr_active++; + rq->nr_running++; } - } + } else + activate_task(p, rq); + rq_unlock(rq); +} - expire = timeout + jiffies; +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads.
+ * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +void sched_exit(task_t * p) +{ + unsigned long flags; - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; + local_irq_save(flags); + if (p->first_time_slice) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) + p->parent->time_slice = MAX_TIMESLICE; + } + local_irq_restore(flags); + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + if (p->sleep_avg < p->parent->sleep_avg) + p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT + + p->sleep_avg) / (EXIT_WEIGHT + 1); +} - add_timer(&timer); - schedule(); - del_timer_sync(&timer); +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(task_t *prev) +{ + finish_arch_switch(this_rq(), prev); + if (current->set_child_tid) + put_user(current->pid, current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; - timeout = expire - jiffies; + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, smp_processor_id()); + } else + switch_mm(oldmm, mm, next, smp_processor_id()); - out: - return timeout < 0 ? 0 : timeout; + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + + return prev; } /* - * schedule_tail() is getting called from the fork return path. 
This - * cleans up all remaining scheduler things, without impacting the - * common case. + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. */ -static inline void __schedule_tail(struct task_struct *prev) +unsigned long nr_running(void) { -#ifdef CONFIG_SMP - int policy; + unsigned long i, sum = 0; - /* - * prev->policy can be written from here only before `prev' - * can be scheduled (before setting prev->cpus_runnable to ~0UL). - * Of course it must also be read before allowing prev - * to be rescheduled, but since the write depends on the read - * to complete, wmb() is enough. (the spin_lock() acquired - * before setting cpus_runnable is not enough because the spin_lock() - * common code semantics allows code outside the critical section - * to enter inside the critical section) - */ - policy = prev->policy; - prev->policy = policy & ~SCHED_YIELD; - wmb(); + for (i = 0; i < NR_CPUS; i++) + sum += cpu_rq(i)->nr_running; - /* - * fast path falls through. We have to clear cpus_runnable before - * checking prev->state to avoid a wakeup race. Protect against - * the task exiting early. - */ - task_lock(prev); - task_release_cpu(prev); - mb(); - if (prev->state == TASK_RUNNING) - goto needs_resched; + return sum; +} -out_unlock: - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ - return; +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum = 0; - /* - * Slow path - we 'push' the previous process and - * reschedule_idle() will attempt to find a new - * processor for it. (but it might preempt the - * current process as well.) We must take the runqueue - * lock and re-check prev->state to be correct. 
It might - * still happen that this process has a preemption - * 'in progress' already - but this is not a problem and - * might happen in other circumstances as well. - */ -needs_resched: - { - unsigned long flags; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + sum += cpu_rq(i)->nr_uninterruptible; + } + return sum; +} - /* - * Avoid taking the runqueue lock in cases where - * no preemption-check is necessery: - */ - if ((prev == idle_task(smp_processor_id())) || - (policy & SCHED_YIELD)) - goto out_unlock; +unsigned long nr_context_switches(void) +{ + unsigned long i, sum = 0; - spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) - reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); - goto out_unlock; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + sum += cpu_rq(i)->nr_switches; } -#else - prev->policy &= ~SCHED_YIELD; -#endif /* CONFIG_SMP */ + return sum; } -asmlinkage void schedule_tail(struct task_struct *prev) +unsigned long nr_iowait(void) { - __schedule_tail(prev); + unsigned long i, sum = 0; + + for (i = 0; i < NR_CPUS; ++i) { + if (!cpu_online(i)) + continue; + sum += atomic_read(&cpu_rq(i)->nr_iowait); + } + return sum; } /* - * 'schedule()' is the scheduler function. It's a very simple and nice - * scheduler: it's not perfect, but certainly works for most things. - * - * The goto is "interesting". + * double_rq_lock - safely lock two runqueues * - * NOTE!! Task 0 is the 'idle' task, which gets called when no other - * tasks can run. It can not be killed, and it cannot sleep. The 'state' - * information in task[0] is never used. + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. 
*/ -asmlinkage void schedule(void) +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { - struct schedule_data * sched_data; - struct task_struct *prev, *next, *p; - struct list_head *tmp; - int this_cpu, c; - + if (rq1 == rq2) + spin_lock(&rq1->lock); + else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} - spin_lock_prefetch(&runqueue_lock); +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); +} - BUG_ON(!current->active_mm); -need_resched_back: - prev = current; - this_cpu = prev->processor; +#if CONFIG_SMP - if (unlikely(in_interrupt())) { - printk("Scheduling in interrupt\n"); - BUG(); +/* + * double_lock_balance - lock the busiest runqueue + * + * this_rq is locked already. Recalculate nr_running if we have to + * drop the runqueue lock. + */ +static inline unsigned int double_lock_balance(runqueue_t *this_rq, + runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running) +{ + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + /* Need to recalculate nr_running */ + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + nr_running = this_rq->nr_running; + else + nr_running = this_rq->prev_nr_running[this_cpu]; + } else + spin_lock(&busiest->lock); } + return nr_running; +} - release_kernel_lock(prev, this_cpu); +/* + * find_busiest_queue - find the busiest runqueue. 
+ */ +static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance) +{ + int nr_running, load, max_load, i; + runqueue_t *busiest, *rq_src; /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. + * We search all runqueues to find the most busy one. + * We do this lockless to reduce cache-bouncing overhead, + * we re-check the 'best' source CPU later on again, with + * the lock held. + * + * We fend off statistical fluctuations in runqueue lengths by + * saving the runqueue length during the previous load-balancing + * operation and using the smaller one the current and saved lengths. + * If a runqueue is long enough for a longer amount of time then + * we recognize it and pull tasks from it. + * + * The 'current runqueue length' is a statistical maximum variable, + * for that one we take the longer one - to avoid fluctuations in + * the other direction. So for a load-balance to happen it needs + * stable long runqueue on the target CPU and stable short runqueue + * on the local runqueue. + * + * We make an exception if this CPU is about to become idle - in + * that case we are less picky about moving a task across CPUs and + * take what can be taken. */ - sched_data = & aligned_data[this_cpu].schedule_data; + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + nr_running = this_rq->nr_running; + else + nr_running = this_rq->prev_nr_running[this_cpu]; - spin_lock_irq(&runqueue_lock); + busiest = NULL; + max_load = 1; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; - /* move an exhausted RR process to be last.. 
*/ - if (unlikely(prev->policy == SCHED_RR)) - if (!prev->counter) { - prev->counter = NICE_TO_TICKS(prev->nice); - move_last_runqueue(prev); + rq_src = cpu_rq(i); + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) + load = rq_src->nr_running; + else + load = this_rq->prev_nr_running[i]; + this_rq->prev_nr_running[i] = rq_src->nr_running; + + if ((load > max_load) && (rq_src != this_rq)) { + busiest = rq_src; + max_load = load; } + } - switch (prev->state) { - case TASK_INTERRUPTIBLE: - if (signal_pending(prev)) { - prev->state = TASK_RUNNING; - break; - } - default: - del_from_runqueue(prev); - case TASK_RUNNING:; + if (likely(!busiest)) + goto out; + + *imbalance = (max_load - nr_running) / 2; + + /* It needs an at least ~25% imbalance to trigger balancing. */ + if (!idle && (*imbalance < (max_load + 3)/4)) { + busiest = NULL; + goto out; } - prev->need_resched = 0; + nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); /* - * this is the scheduler proper: + * Make sure nothing changed since we checked the + * runqueue length. */ + if (busiest->nr_running <= nr_running + 1) { + spin_unlock(&busiest->lock); + busiest = NULL; + } +out: + return busiest; +} -repeat_schedule: +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) +{ + dequeue_task(p, src_array); + src_rq->nr_running--; + set_task_cpu(p, this_cpu); + this_rq->nr_running++; + enqueue_task(p, this_rq->active); /* - * Default process to select.. + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. 
*/ - next = idle_task(this_cpu); - c = -1000; - list_for_each(tmp, &runqueue_head) { - p = list_entry(tmp, struct task_struct, run_list); - if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); - if (weight > c) - c = weight, next = p; - } + if (p->prio < this_rq->curr->prio) + set_need_resched(); + else { + if (p->prio == this_rq->curr->prio && + p->time_slice > this_rq->curr->time_slice) + set_need_resched(); } +} - /* Do we need to re-calculate counters? */ - if (unlikely(!c)) { - struct task_struct *p; - - spin_unlock_irq(&runqueue_lock); - read_lock(&tasklist_lock); - for_each_task(p) - p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); - read_unlock(&tasklist_lock); - spin_lock_irq(&runqueue_lock); - goto repeat_schedule; - } +/* + * Current runqueue is empty, or rebalance tick: if there is an + * imbalance (current runqueue is too short) then pull from + * busiest runqueue(s). + * + * We call this with the current runqueue locked, + * irqs disabled. + */ +static void load_balance(runqueue_t *this_rq, int idle) +{ + int imbalance, idx, this_cpu = smp_processor_id(); + runqueue_t *busiest; + prio_array_t *array; + struct list_head *head, *curr; + task_t *tmp; + + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance); + if (!busiest) + goto out; /* - * from this point on nothing can prevent us from - * switching to the next task, save this fact in - * sched_data. + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them.
*/ - sched_data->curr = next; - task_set_cpu(next, this_cpu); - spin_unlock_irq(&runqueue_lock); + if (busiest->expired->nr_active) + array = busiest->expired; + else + array = busiest->active; - if (unlikely(prev == next)) { - /* We won't go through the normal tail, so do this by hand */ - prev->policy &= ~SCHED_YIELD; - goto same_process; +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; + } + goto out_unlock; } -#ifdef CONFIG_SMP - /* - * maintain the per-process 'last schedule' value. - * (this has to be recalculated even if we reschedule to - * the same process) Currently this is only used on SMP, - * and it's approximate, so we do not have to maintain - * it while holding the runqueue spinlock. - */ - sched_data->last_schedule = get_cycles(); + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ + ((jiffies - (p)->last_run > cache_decay_ticks) && \ + !task_running(rq, p) && \ + ((p)->cpus_allowed & (1UL << (this_cpu)))) + + curr = curr->prev; + + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + pull_task(busiest, array, tmp, this_rq, this_cpu); + if (!idle && --imbalance) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out_unlock: + spin_unlock(&busiest->lock); +out: + ; +} +/* + * One of the idle_cpu_tick() and busy_cpu_tick() functions will + * get called every timer tick, on every CPU. 
Our balancing action + * frequency and balancing aggressiveness depend on whether the CPU is + * idle or not. + * + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on + * systems with HZ=100, every 10 msecs.) + */ +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) + +static inline void idle_tick(runqueue_t *rq) +{ + if (jiffies % IDLE_REBALANCE_TICK) + return; + spin_lock(&rq->lock); + load_balance(rq, 1); + spin_unlock(&rq->lock); +} + +#endif + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switches decreases with + * increasing number of running tasks: + */ +#define EXPIRED_STARVING(rq) \ + (STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices.
+ */ +void scheduler_tick(int user_ticks, int sys_ticks) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + task_t *p = current; + + if (p == rq->idle) { + if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += sys_ticks; +#if CONFIG_SMP + idle_tick(rq); +#endif + return; + } + if (TASK_NICE(p) > 0) + kstat.per_cpu_nice[cpu] += user_ticks; + else + kstat.per_cpu_user[cpu] += user_ticks; + kstat.per_cpu_system[cpu] += sys_ticks; + + /* Task might have expired already, but not scheduled off yet */ + if (p->array != rq->active) { + set_tsk_need_resched(p); + return; + } + spin_lock(&rq->lock); + if (unlikely(rt_task(p))) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); + } + goto out; + } /* - * We drop the scheduler lock early (it's a global spinlock), - * thus we have to lock the previous process from getting - * rescheduled during switch_to(). - */ + * The task was running during this tick - update the + * time slice counter and the sleep average. Note: we + * do not update a thread's priority until it either + * goes to sleep or uses up its timeslice. This makes + * it possible for interactive tasks to use up their + * timeslices at their highest priority levels. 
+ */ + if (p->sleep_avg) + p->sleep_avg--; + if (!--p->time_slice) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + enqueue_task(p, rq->expired); + } else + enqueue_task(p, rq->active); + } +out: +#if CONFIG_SMP + if (!(jiffies % BUSY_REBALANCE_TICK)) + load_balance(rq, 0); +#endif + spin_unlock(&rq->lock); +} + +void scheduling_functions_start_here(void) { } + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void schedule(void) +{ + task_t *prev, *next; + runqueue_t *rq; + prio_array_t *array; + struct list_head *queue; + int idx; -#endif /* CONFIG_SMP */ + BUG_ON(in_interrupt()); - kstat.context_swtch++; +need_resched: + prev = current; + rq = this_rq(); + + release_kernel_lock(prev, smp_processor_id()); /* - * there are 3 processes which are affected by a context switch: - * - * prev == .... ==> (last => next) - * - * It's the 'much more previous' 'prev' that is on next's stack, - * but prev is set to (the just run) 'last' process by switch_to(). - * This might sound slightly confusing but makes tons of sense. 
+ * Ok, we are leaving the CPU now, lets update the 'last run' + * timestamp: */ - prepare_to_switch(); - { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; - if (!mm) { - BUG_ON(next->active_mm); - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next, this_cpu); - } else { - BUG_ON(next->active_mm != mm); - switch_mm(oldmm, mm, next, this_cpu); - } + prev->last_run = jiffies; + spin_lock_irq(&rq->lock); - if (!prev->mm) { - prev->active_mm = NULL; - mmdrop(oldmm); + switch (prev->state) { + case TASK_INTERRUPTIBLE: + if (unlikely(signal_pending(prev))) { + prev->state = TASK_RUNNING; + break; } + default: + deactivate_task(prev, rq); + case TASK_RUNNING: + ; + } +#if CONFIG_SMP +pick_next_task: +#endif + if (unlikely(!rq->nr_running)) { +#if CONFIG_SMP + load_balance(rq, 1); + if (rq->nr_running) + goto pick_next_task; +#endif + next = rq->idle; + rq->expired_timestamp = 0; + goto switch_tasks; } - /* - * This just switches the register state and the - * stack. - */ - switch_to(prev, next, prev); - __schedule_tail(prev); + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + +switch_tasks: + prefetch(next); + clear_tsk_need_resched(prev); + + if (likely(prev != next)) { + struct mm_struct *prev_mm; + rq->nr_switches++; + rq->curr = next; + + prepare_arch_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + rq = this_rq(); + prev_mm = rq->prev_mm; + rq->prev_mm = NULL; + finish_arch_switch(rq, prev); + if (prev_mm) + mmdrop(prev_mm); + } else + spin_unlock_irq(&rq->lock); -same_process: reacquire_kernel_lock(current); - if (current->need_resched) - goto need_resched_back; - return; + if (need_resched()) + goto need_resched; } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. 
*/ -static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, const int sync) +static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync) { struct list_head *tmp; - struct task_struct *p; + unsigned int state; + wait_queue_t *curr; + task_t *p; + + list_for_each(tmp, &q->task_list) { + curr = list_entry(tmp, wait_queue_t, task_list); + p = curr->task; + state = p->state; + if ((state & mode) && try_to_wake_up(p, mode, sync) && + ((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; - CHECK_MAGIC_WQHEAD(q); - WQ_CHECK_LIST_HEAD(&q->task_list); - - list_for_each(tmp,&q->task_list) { - unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + if (unlikely(!q)) + return; - CHECK_MAGIC(curr->__magic); - p = curr->task; - state = p->state; - if (state & mode) { - WQ_NOTE_WAKER(curr); - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } - } + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0); + spin_unlock_irqrestore(&q->lock, flags); } -void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr) +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 
+ */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { - if (q) { - unsigned long flags; - wq_read_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr, 0); - wq_read_unlock_irqrestore(&q->lock, flags); - } + __wake_up_common(q, mode, 1, 0); } -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr) +#if CONFIG_SMP + +/** + * __wake_up_sync - sync-wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { - if (q) { - unsigned long flags; - wq_read_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr, 1); - wq_read_unlock_irqrestore(&q->lock, flags); - } + unsigned long flags; + + if (unlikely(!q)) + return; + + spin_lock_irqsave(&q->lock, flags); + if (likely(nr_exclusive)) + __wake_up_common(q, mode, nr_exclusive, 1); + else + __wake_up_common(q, mode, nr_exclusive, 0); + spin_unlock_irqrestore(&q->lock, flags); } +#endif + void complete(struct completion *x) { unsigned long flags; @@ -765,6 +1163,16 @@ void complete(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, 0); + spin_unlock_irqrestore(&x->wait.lock, flags); +} + void wait_for_completion(struct completion *x) { spin_lock_irq(&x->wait.lock); @@ -790,15 +1198,15 @@ void wait_for_completion(struct completi wait_queue_t wait; \ init_waitqueue_entry(&wait, current); -#define
SLEEP_ON_HEAD \ - wq_write_lock_irqsave(&q->lock,flags); \ +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ __add_wait_queue(q, &wait); \ - wq_write_unlock(&q->lock); + spin_unlock(&q->lock); #define SLEEP_ON_TAIL \ - wq_write_lock_irq(&q->lock); \ + spin_lock_irq(&q->lock); \ __remove_wait_queue(q, &wait); \ - wq_write_unlock_irqrestore(&q->lock,flags); + spin_unlock_irqrestore(&q->lock, flags); void interruptible_sleep_on(wait_queue_head_t *q) { @@ -850,17 +1258,54 @@ long sleep_on_timeout(wait_queue_head_t void scheduling_functions_end_here(void) { } +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + array = p->array; + if (array) + dequeue_task(p, array); + p->static_prio = NICE_TO_PRIO(nice); + p->prio = NICE_TO_PRIO(nice); + if (array) { + enqueue_task(p, array); + /* + * If the task is running and lowered its priority, + * or increased its priority then reschedule its CPU: + */ + if ((NICE_TO_PRIO(nice) < p->static_prio) || + task_running(rq, p)) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, &flags); +} + #ifndef __alpha__ /* - * This has been replaced by sys_setpriority. Maybe it should be - * moved into the arch dependent tree for those ports that require - * it for backward compatibility? + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. */ - asmlinkage long sys_nice(int increment) { - long newprio; + long nice; /* * Setpriority might change our priority at the same moment. 
@@ -876,34 +1321,78 @@ asmlinkage long sys_nice(int increment) if (increment > 40) increment = 40; - newprio = current->nice + increment; - if (newprio < -20) - newprio = -20; - if (newprio > 19) - newprio = 19; - current->nice = newprio; + nice = PRIO_TO_NICE(current->static_prio) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + set_user_nice(current, nice); return 0; } #endif -static inline struct task_struct *find_process_by_pid(pid_t pid) +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +int task_prio(task_t *p) +{ + return p->prio - MAX_USER_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +int task_nice(task_t *p) +{ + return TASK_NICE(p); +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +int task_curr(task_t *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + */ +int idle_cpu(int cpu) { - struct task_struct *tsk = current; + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} - if (pid) - tsk = find_task_by_pid(pid); - return tsk; +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +static inline task_t *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_pid(pid) : current; } -static int setscheduler(pid_t pid, int policy, - struct sched_param *param) +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. 
+ */ +static int setscheduler(pid_t pid, int policy, struct sched_param *param) { struct sched_param lp; - struct task_struct *p; - int retval; + int retval = -EINVAL; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + task_t *p; - retval = -EINVAL; if (!param || pid < 0) goto out_nounlock; @@ -915,72 +1404,99 @@ static int setscheduler(pid_t pid, int p * We play safe to avoid deadlocks. */ read_lock_irq(&tasklist_lock); - spin_lock(&runqueue_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) - goto out_unlock; - + goto out_unlock_tasklist; + + /* + * To be able to change p->policy safely, the appropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &flags); + if (policy < 0) policy = p->policy; else { retval = -EINVAL; if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_OTHER) + policy != SCHED_NORMAL) goto out_unlock; } - + /* - * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid - * priority for SCHED_OTHER is 0. + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
*/ retval = -EINVAL; - if (lp.sched_priority < 0 || lp.sched_priority > 99) + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) goto out_unlock; - if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) + if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0)) goto out_unlock; retval = -EPERM; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && + if ((policy == SCHED_FIFO || policy == SCHED_RR) && !capable(CAP_SYS_NICE)) goto out_unlock; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) goto out_unlock; + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - - current->need_resched = 1; + if (policy != SCHED_NORMAL) + p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + else + p->prio = p->static_prio; + if (array) + __activate_task(p, task_rq(p)); out_unlock: - spin_unlock(&runqueue_lock); + task_rq_unlock(rq, &flags); +out_unlock_tasklist: read_unlock_irq(&tasklist_lock); out_nounlock: return retval; } -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param *param) { return setscheduler(pid, policy, param); } +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param) { return setscheduler(pid, -1, param); } +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. 
+ */ asmlinkage long sys_sched_getscheduler(pid_t pid) { - struct task_struct *p; - int retval; + int retval = -EINVAL; + task_t *p; - retval = -EINVAL; if (pid < 0) goto out_nounlock; @@ -988,20 +1504,24 @@ asmlinkage long sys_sched_getscheduler(p read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - retval = p->policy & ~SCHED_YIELD; + retval = p->policy; read_unlock(&tasklist_lock); out_nounlock: return retval; } +/** + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + */ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) { - struct task_struct *p; struct sched_param lp; - int retval; + int retval = -EINVAL; + task_t *p; - retval = -EINVAL; if (!param || pid < 0) goto out_nounlock; @@ -1026,47 +1546,134 @@ out_unlock: return retval; } -asmlinkage long sys_sched_yield(void) +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +asmlinkage int sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long *user_mask_ptr) { + unsigned long new_mask; + int retval; + task_t *p; + + if (len < sizeof(new_mask)) + return -EINVAL; + + if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) + return -EFAULT; + + new_mask &= cpu_online_map; + if (!new_mask) + return -EINVAL; + + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + /* - * Trick. sched_yield() first counts the number of truly - * 'pending' runnable processes, then returns if it's - * only the current processes. (This test does not have - * to be atomic.) In threaded applications this optimization - * gets triggered quite often. + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held.
We will bump the task_struct's + * usage count and then drop tasklist_lock. */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; - int nr_pending = nr_running; + retval = 0; + set_cpus_allowed(p, new_mask); -#if CONFIG_SMP - int i; +out_unlock: + put_task_struct(p); + return retval; +} - // Subtract non-idle processes running on other CPUs. - for (i = 0; i < smp_num_cpus; i++) { - int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; - } -#else - // on UP this process is on the runqueue as well - nr_pending--; -#endif - if (nr_pending) { - /* - * This process can only be rescheduled by us, - * so this is safe without any locking. - */ - if (current->policy == SCHED_OTHER) - current->policy |= SCHED_YIELD; - current->need_resched = 1; - - spin_lock_irq(&runqueue_lock); - move_last_runqueue(current); - spin_unlock_irq(&runqueue_lock); +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long *user_mask_ptr) +{ + unsigned int real_len; + unsigned long mask; + int retval; + task_t *p; + + real_len = sizeof(mask); + if (len < real_len) + return -EINVAL; + + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + mask = p->cpus_allowed & cpu_online_map; + +out_unlock: + read_unlock(&tasklist_lock); + if (retval) + return retval; + if (copy_to_user(user_mask_ptr, &mask, real_len)) + return -EFAULT; + return real_len; +} + +/** + * sys_sched_yield - yield the current processor to other threads. 
+ * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. + */ +asmlinkage long sys_sched_yield(void) +{ + runqueue_t *rq = this_rq_lock(); + prio_array_t *array = current->array; + + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) + */ + if (likely(!rt_task(current))) { + dequeue_task(current, array); + enqueue_task(current, rq->expired); + } else { + list_del(&current->run_list); + list_add_tail(&current->run_list, array->queue + current->prio); } + spin_unlock(&rq->lock); + + schedule(); + return 0; } +void __cond_resched(void) +{ + set_current_state(TASK_RUNNING); + schedule(); +} + /** * yield - yield the current processor to other threads. * @@ -1077,15 +1684,42 @@ void yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); - schedule(); } -void __cond_resched(void) +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +void io_schedule(void) { - set_current_state(TASK_RUNNING); + struct runqueue *rq = this_rq(); + + atomic_inc(&rq->nr_iowait); schedule(); + atomic_dec(&rq->nr_iowait); +} + +long io_schedule_timeout(long timeout) +{ + struct runqueue *rq = this_rq(); + long ret; + + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + return ret; } +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class.
+ */ asmlinkage long sys_sched_get_priority_max(int policy) { int ret = -EINVAL; @@ -1093,15 +1727,22 @@ asmlinkage long sys_sched_get_priority_m switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = 99; + ret = MAX_USER_RT_PRIO-1; break; - case SCHED_OTHER: + case SCHED_NORMAL: ret = 0; break; } return ret; } +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ asmlinkage long sys_sched_get_priority_min(int policy) { int ret = -EINVAL; @@ -1111,17 +1752,25 @@ asmlinkage long sys_sched_get_priority_m case SCHED_RR: ret = 1; break; - case SCHED_OTHER: + case SCHED_NORMAL: ret = 0; } return ret; } +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) { - struct timespec t; - struct task_struct *p; int retval = -EINVAL; + struct timespec t; + task_t *p; if (pid < 0) goto out_nounlock; @@ -1130,8 +1779,8 @@ asmlinkage long sys_sched_rr_get_interva read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice), - &t); + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); if (p) retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; @@ -1139,14 +1788,33 @@ out_nounlock: return retval; } -static void show_task(struct task_struct * p) +static inline struct task_struct *eldest_child(struct task_struct *p) +{ + if (list_empty(&p->children)) return NULL; + return list_entry(p->children.next,struct task_struct,sibling); +} + +static inline struct task_struct *older_sibling(struct task_struct *p) +{ + if (p->sibling.prev==&p->parent->children) return NULL; + return list_entry(p->sibling.prev,struct task_struct,sibling); +} + +static inline struct task_struct *younger_sibling(struct task_struct *p) +{ + if (p->sibling.next==&p->parent->children) return NULL; + return list_entry(p->sibling.next,struct task_struct,sibling); +} + +void show_task(task_t * p) { unsigned long free = 0; + task_t *relative; int state; - static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; + static const char * stat_nam[] = { "R", "S", "D", "T", "Z", "X" }; printk("%-13.13s ", p->comm); - state = p->state ? ffz(~p->state) + 1 : 0; + state = p->state ? 
__ffs(p->state) + 1 : 0; if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) printk(stat_nam[state]); else @@ -1168,26 +1836,26 @@ static void show_task(struct task_struct n++; free = (unsigned long) n - (unsigned long)(p+1); } - printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid); - if (p->p_cptr) - printk("%5d ", p->p_cptr->pid); + printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); + if ((relative = eldest_child(p))) + printk("%5d ", relative->pid); else printk(" "); - if (p->p_ysptr) - printk("%7d", p->p_ysptr->pid); + if ((relative = younger_sibling(p))) + printk("%7d", relative->pid); else printk(" "); - if (p->p_osptr) - printk(" %5d", p->p_osptr->pid); + if ((relative = older_sibling(p))) + printk(" %5d", relative->pid); else printk(" "); if (!p->mm) printk(" (L-TLB)\n"); else printk(" (NOTLB)\n"); - +// print_signals(p); { - extern void show_trace_task(struct task_struct *tsk); + extern void show_trace_task(task_t *tsk); show_trace_task(p); } } @@ -1209,7 +1877,7 @@ char * render_sigset_t(sigset_t *set, ch void show_state(void) { - struct task_struct *p; + task_t *g, *p; #if (BITS_PER_LONG == 32) printk("\n" @@ -1221,132 +1889,258 @@ void show_state(void) printk(" task PC stack pid father child younger older\n"); #endif read_lock(&tasklist_lock); - for_each_task(p) { + do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take alot of time: */ touch_nmi_watchdog(); show_task(p); - } + } while_each_thread(g, p); + read_unlock(&tasklist_lock); } -/** - * reparent_to_init() - Reparent the calling kernel thread to the init task. - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited fro a user process, so we reset them to sane values here. 
+void __init init_idle(task_t *idle, int cpu) +{ + runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); + unsigned long flags; + + local_irq_save(flags); + double_rq_lock(idle_rq, rq); + + idle_rq->curr = idle_rq->idle = idle; + deactivate_task(idle, rq); + idle->array = NULL; + idle->prio = MAX_PRIO; + idle->state = TASK_RUNNING; + set_task_cpu(idle, cpu); + double_rq_unlock(idle_rq, rq); + set_tsk_need_resched(idle); + local_irq_restore(flags); +} + +#if CONFIG_SMP +/* + * This is how migration works: * - * NOTE that reparent_to_init() gives the caller full capabilities. + * 1) we queue a migration_req_t structure in the source CPU's + * runqueue and wake up that CPU's migration thread. + * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. */ -void reparent_to_init(void) -{ - struct task_struct *this_task = current; - write_lock_irq(&tasklist_lock); +typedef struct { + struct list_head list; + task_t *task; + struct completion done; +} migration_req_t; - /* Reparent to init */ - REMOVE_LINKS(this_task); - this_task->p_pptr = child_reaper; - this_task->p_opptr = child_reaper; - SET_LINKS(this_task); +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
+ */ +void set_cpus_allowed(task_t *p, unsigned long new_mask) +{ + unsigned long flags; + migration_req_t req; + runqueue_t *rq; - /* Set the exit signal to SIGCHLD so we signal init on exit */ - this_task->exit_signal = SIGCHLD; +#if 0 /* FIXME: Grab cpu_lock, return error on this case. --RR */ + new_mask &= cpu_online_map; + if (!new_mask) + BUG(); +#endif - /* We also take the runqueue_lock while altering task fields - * which affect scheduling decisions */ - spin_lock(&runqueue_lock); + rq = task_rq_lock(p, &flags); + p->cpus_allowed = new_mask; + /* + * Can the task run on the task's current CPU? If not then + * migrate the thread off to a proper CPU. + */ + if (new_mask & (1UL << task_cpu(p))) { + task_rq_unlock(rq, &flags); + return; + } + /* + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ + if (!p->array && !task_running(rq, p)) { + set_task_cpu(p, __ffs(p->cpus_allowed)); + task_rq_unlock(rq, &flags); + return; + } + init_completion(&req.done); + req.task = p; + list_add(&req.list, &rq->migration_queue); + task_rq_unlock(rq, &flags); - this_task->ptrace = 0; - this_task->nice = DEF_NICE; - this_task->policy = SCHED_OTHER; - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? */ - this_task->cap_effective = CAP_INIT_EFF_SET; - this_task->cap_inheritable = CAP_INIT_INH_SET; - this_task->cap_permitted = CAP_FULL_SET; - this_task->keep_capabilities = 0; - memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); - this_task->user = INIT_USER; + wake_up_process(rq->migration_thread); - spin_unlock(&runqueue_lock); - write_unlock_irq(&tasklist_lock); + wait_for_completion(&req.done); } /* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. + * migration_thread - this is a highprio system thread that performs + * thread migration by 'pulling' threads into the target runqueue. 
+ */ - -void daemonize(void) +static int migration_thread(void * data) { - struct fs_struct *fs; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + int cpu = (long) data; + runqueue_t *rq; + int ret; + daemonize(); + sigfillset(&current->blocked); + set_fs(KERNEL_DS); /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. + * Either we are running on the right CPU, or there's a + * migration thread on the target CPU, guaranteed. */ - exit_mm(current); + set_cpus_allowed(current, 1UL << cpu); + + ret = setscheduler(0, SCHED_FIFO, &param); + + rq = this_rq(); + rq->migration_thread = current; - current->session = 1; - current->pgrp = 1; - current->tty = NULL; + sprintf(current->comm, "migration/%d", smp_processor_id()); - /* Become as one with the init task */ + for (;;) { + runqueue_t *rq_src, *rq_dest; + struct list_head *head; + int cpu_src, cpu_dest; + migration_req_t *req; + unsigned long flags; + task_t *p; + + spin_lock_irqsave(&rq->lock, flags); + head = &rq->migration_queue; + current->state = TASK_INTERRUPTIBLE; + if (list_empty(head)) { + spin_unlock_irqrestore(&rq->lock, flags); + schedule(); + continue; + } + req = list_entry(head->next, migration_req_t, list); + list_del_init(head->next); + spin_unlock_irqrestore(&rq->lock, flags); + + p = req->task; + cpu_dest = __ffs(p->cpus_allowed); + rq_dest = cpu_rq(cpu_dest); +repeat: + cpu_src = task_cpu(p); + rq_src = cpu_rq(cpu_src); + + local_irq_save(flags); + double_rq_lock(rq_src, rq_dest); + if (task_cpu(p) != cpu_src) { + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + goto repeat; + } + if (rq_src == rq) { + set_task_cpu(p, cpu_dest); + if (p->array) { + deactivate_task(p, rq_src); + __activate_task(p, rq_dest); + if (p->prio < rq_dest->curr->prio) + resched_task(rq_dest->curr); + } + } + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); -
exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - exit_files(current); - current->files = init_task.files; - atomic_inc(&current->files->count); + complete(&req->done); + } } -extern unsigned long wait_init_idle; +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +void migration_call(void *hcpu) +{ + printk("Starting migration thread for cpu %li\n", (long)hcpu); + kernel_thread(migration_thread, hcpu, CLONE_KERNEL); + while (!cpu_rq((long)hcpu)->migration_thread) + yield(); +} -void __init init_idle(void) +__init int migration_init(void) { - struct schedule_data * sched_data; - sched_data = &aligned_data[smp_processor_id()].schedule_data; + int cpu; - if (current != &init_task && task_on_runqueue(current)) { - printk("UGH! (%d:%d) was on the runqueue, removing.\n", - smp_processor_id(), current->pid); - del_from_runqueue(current); - } - sched_data->curr = current; - sched_data->last_schedule = get_cycles(); - clear_bit(current->processor, &wait_init_idle); + /* Start one for boot CPU.
*/ + migration_call((void *)(long)smp_processor_id()); + + printk("smp_num_cpus: %d.\n", smp_num_cpus); + for (cpu = 0; cpu < smp_num_cpus; cpu++) + if (cpu != smp_processor_id()) + migration_call((void *)(long)cpu); + return 0; } -extern void init_timervecs (void); +#endif + + +extern void init_timervecs(void); +extern void timer_bh(void); +extern void tqueue_bh(void); +extern void immediate_bh(void); void __init sched_init(void) { + runqueue_t *rq; + int i, j, k; + + for (i = 0; i < NR_CPUS; i++) { + prio_array_t *array; + + rq = cpu_rq(i); + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + spin_lock_init(&rq->lock); + INIT_LIST_HEAD(&rq->migration_queue); + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } /* * We have to do a little magic to get the first - * process right in SMP mode. + * thread right in SMP mode. 
*/ - int cpu = smp_processor_id(); - int nr; - - init_task.processor = cpu; - - for(nr = 0; nr < PIDHASH_SZ; nr++) - pidhash[nr] = NULL; + rq = this_rq(); + rq->curr = current; + rq->idle = current; + set_task_cpu(current, smp_processor_id()); + wake_up_forked_process(current); init_timervecs(); - init_bh(TIMER_BH, timer_bh); init_bh(TQUEUE_BH, tqueue_bh); init_bh(IMMEDIATE_BH, immediate_bh); @@ -1355,5 +2149,6 @@ void __init sched_init(void) * The boot idle thread does lazy MMU switching as well: */ atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current, cpu); + enter_lazy_tlb(&init_mm, current, smp_processor_id()); } + --- linux/kernel/signal.c.orig +++ linux/kernel/signal.c @@ -6,6 +6,8 @@ * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson */ +#define __KERNEL_SYSCALLS__ + #include #include #include @@ -13,47 +15,207 @@ #include #include #include - +#include +#include +#include +#include #include +#include /* * SLAB caches for signal bits. */ -#define DEBUG_SIG 0 +static kmem_cache_t *sigqueue_cachep; + +atomic_t nr_queued_signals; +int max_queued_signals = 1024; + +/********************************************************* + + POSIX thread group signal behavior: + +---------------------------------------------------------- +| | userspace | kernel | +---------------------------------------------------------- +| SIGHUP | load-balance | kill-all | +| SIGINT | load-balance | kill-all | +| SIGQUIT | load-balance | kill-all+core | +| SIGILL | specific | kill-all+core | +| SIGTRAP | specific | kill-all+core | +| SIGABRT/SIGIOT | specific | kill-all+core | +| SIGBUS | specific | kill-all+core | +| SIGFPE | specific | kill-all+core | +| SIGKILL | n/a | kill-all | +| SIGUSR1 | load-balance | kill-all | +| SIGSEGV | specific | kill-all+core | +| SIGUSR2 | load-balance | kill-all | +| SIGPIPE | specific | kill-all | +| SIGALRM | load-balance | kill-all | +| SIGTERM | load-balance | kill-all | +| SIGCHLD | load-balance | ignore | +| SIGCONT | 
load-balance | ignore | +| SIGSTOP | n/a | stop-all | +| SIGTSTP | load-balance | stop-all | +| SIGTTIN | load-balance | stop-all | +| SIGTTOU | load-balance | stop-all | +| SIGURG | load-balance | ignore | +| SIGXCPU | specific | kill-all+core | +| SIGXFSZ | specific | kill-all+core | +| SIGVTALRM | load-balance | kill-all | +| SIGPROF | specific | kill-all | +| SIGPOLL/SIGIO | load-balance | kill-all | +| SIGSYS/SIGUNUSED | specific | kill-all+core | +| SIGSTKFLT | specific | kill-all | +| SIGWINCH | load-balance | ignore | +| SIGPWR | load-balance | kill-all | +| SIGRTMIN-SIGRTMAX | load-balance | kill-all | +---------------------------------------------------------- + + non-POSIX signal thread group behavior: + +---------------------------------------------------------- +| | userspace | kernel | +---------------------------------------------------------- +| SIGEMT | specific | kill-all+core | +---------------------------------------------------------- +*/ -#if DEBUG_SIG -#define SIG_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */) +/* Some systems do not have a SIGSTKFLT and the kernel never + * generates such signals anyways. 
+ */ +#ifdef SIGSTKFLT +#define M_SIGSTKFLT M(SIGSTKFLT) #else -#define SIG_SLAB_DEBUG 0 +#define M_SIGSTKFLT 0 #endif -static kmem_cache_t *sigqueue_cachep; +#ifdef SIGEMT +#define M_SIGEMT M(SIGEMT) +#else +#define M_SIGEMT 0 +#endif -atomic_t nr_queued_signals; -int max_queued_signals = 1024; +#if SIGRTMIN > BITS_PER_LONG +#define M(sig) (1ULL << ((sig)-1)) +#else +#define M(sig) (1UL << ((sig)-1)) +#endif +#define T(sig, mask) (M(sig) & (mask)) -void __init signals_init(void) +#define SIG_KERNEL_BROADCAST_MASK (\ + M(SIGHUP) | M(SIGINT) | M(SIGQUIT) | M(SIGILL) | \ + M(SIGTRAP) | M(SIGABRT) | M(SIGBUS) | M(SIGFPE) | \ + M(SIGKILL) | M(SIGUSR1) | M(SIGSEGV) | M(SIGUSR2) | \ + M(SIGPIPE) | M(SIGALRM) | M(SIGTERM) | M(SIGXCPU) | \ + M(SIGXFSZ) | M(SIGVTALRM) | M(SIGPROF) | M(SIGPOLL) | \ + M(SIGSYS) | M_SIGSTKFLT | M(SIGPWR) | M(SIGCONT) | \ + M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) | \ + M_SIGEMT ) + +#define SIG_KERNEL_ONLY_MASK (\ + M(SIGKILL) | M(SIGSTOP) ) + +#define SIG_KERNEL_STOP_MASK (\ + M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) ) + +#define SIG_KERNEL_COREDUMP_MASK (\ + M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \ + M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \ + M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT ) + +#define SIG_KERNEL_IGNORE_MASK (\ + M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) ) + +#define sig_kernel_only(sig) \ + (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK)) +#define sig_kernel_coredump(sig) \ + (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK)) +#define sig_kernel_ignore(sig) \ + (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK)) +#define sig_kernel_stop(sig) \ + (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) + +#define sig_user_defined(t, signr) \ + (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ + ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) + +#define sig_ignored(t, signr) \ + (!((t)->ptrace & PT_PTRACED) && \ + 
(t)->sighand->action[(signr)-1].sa.sa_handler == SIG_IGN) + +#define sig_fatal(t, signr) \ + (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ + (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) + +/* + * Re-calculate pending state from the set of locally pending + * signals, globally pending signals, and blocked signals. + */ +static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) { - sigqueue_cachep = - kmem_cache_create("sigqueue", - sizeof(struct sigqueue), - __alignof__(struct sigqueue), - SIG_SLAB_DEBUG, NULL, NULL); - if (!sigqueue_cachep) - panic("signals_init(): cannot create sigqueue SLAB cache"); + unsigned long ready; + long i; + + switch (_NSIG_WORDS) { + default: + for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) + ready |= signal->sig[i] &~ blocked->sig[i]; + break; + + case 4: ready = signal->sig[3] &~ blocked->sig[3]; + ready |= signal->sig[2] &~ blocked->sig[2]; + ready |= signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 2: ready = signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 1: ready = signal->sig[0] &~ blocked->sig[0]; + } + return ready != 0; } +#define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) + +inline void recalc_sigpending_tsk(struct task_struct *t) +{ + if (t->signal->group_stop_count > 0 || + PENDING(&t->pending, &t->blocked) || + PENDING(&t->signal->shared_pending, &t->blocked)) + t->sigpending = 1; + else + t->sigpending = 0; +} + +void recalc_sigpending(void) +{ + recalc_sigpending_tsk(current); +} +void print_signals(struct task_struct *t) +{ + printk("pend: %d [p: %d [%p-%p], sp: %d [%p-%p]], bl: %08lx.\n", + signal_pending(t), PENDING(&t->pending, &t->blocked), + t->pending.head, + t->pending.tail, + PENDING(&t->signal->shared_pending, &t->blocked), + t->signal->shared_pending.head, + t->signal->shared_pending.tail, + t->blocked.sig[0] + ); +} /* Given the mask, find the first available 
signal that should be serviced. */ static int -next_signal(struct task_struct *tsk, sigset_t *mask) +next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; - s = tsk->pending.signal.sig; + s = pending->signal.sig; m = mask->sig; switch (_NSIG_WORDS) { default: @@ -107,21 +269,72 @@ flush_signals(struct task_struct *t) { t->sigpending = 0; flush_sigqueue(&t->pending); + flush_sigqueue(&t->signal->shared_pending); +} + +/* + * This function expects the tasklist_lock write-locked. + */ +void __exit_sighand(struct task_struct *tsk) +{ + struct sighand_struct * sighand = tsk->sighand; + + /* Ok, we're done with the signal handlers */ + tsk->sighand = NULL; + if (atomic_dec_and_test(&sighand->count)) + kmem_cache_free(sighand_cachep, sighand); } void exit_sighand(struct task_struct *tsk) { - struct signal_struct * sig = tsk->sig; + write_lock_irq(&tasklist_lock); + __exit_sighand(tsk); + write_unlock_irq(&tasklist_lock); +} - spin_lock_irq(&tsk->sigmask_lock); - if (sig) { - tsk->sig = NULL; - if (atomic_dec_and_test(&sig->count)) - kmem_cache_free(sigact_cachep, sig); +/* + * This function expects the tasklist_lock write-locked. 
+ */ +void __exit_signal(struct task_struct *tsk) +{ + struct signal_struct * sig = tsk->signal; + struct sighand_struct * sighand = tsk->sighand; + + if (!sig) + BUG(); + if (!atomic_read(&sig->count)) + BUG(); + spin_lock(&sighand->siglock); + if (atomic_dec_and_test(&sig->count)) { + if (tsk == sig->curr_target) + sig->curr_target = next_thread(tsk); + tsk->signal = NULL; + spin_unlock(&sighand->siglock); + flush_sigqueue(&sig->shared_pending); + kmem_cache_free(signal_cachep, sig); + } else { + /* + * If there is any task waiting for the group exit + * then notify it: + */ + if (sig->group_exit_task && atomic_read(&sig->count) <= 2) { + wake_up_process(sig->group_exit_task); + sig->group_exit_task = NULL; + } + if (tsk == sig->curr_target) + sig->curr_target = next_thread(tsk); + tsk->signal = NULL; + spin_unlock(&sighand->siglock); } tsk->sigpending = 0; flush_sigqueue(&tsk->pending); - spin_unlock_irq(&tsk->sigmask_lock); +} + +void exit_signal(struct task_struct *tsk) +{ + write_lock_irq(&tasklist_lock); + __exit_signal(tsk); + write_unlock_irq(&tasklist_lock); } /* @@ -132,7 +345,7 @@ void flush_signal_handlers(struct task_struct *t) { int i; - struct k_sigaction *ka = &t->sig->action[0]; + struct k_sigaction *ka = &t->sighand->action[0]; for (i = _NSIG ; i != 0 ; i--) { if (ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; @@ -142,34 +355,6 @@ flush_signal_handlers(struct task_struct } } -/* - * sig_exit - cause the current task to exit due to a signal. 
- */ - -void -sig_exit(int sig, int exit_code, struct siginfo *info) -{ - struct task_struct *t; - - sigaddset(&current->pending.signal, sig); - recalc_sigpending(current); - current->flags |= PF_SIGNALED; - - /* Propagate the signal to all the tasks in - * our thread group - */ - if (info && (unsigned long)info != 1 - && info->si_code != SI_TKILL) { - read_lock(&tasklist_lock); - for_each_thread(t) { - force_sig_info(sig, info, t); - } - read_unlock(&tasklist_lock); - } - - do_exit(exit_code); - /* NOTREACHED */ -} /* Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be @@ -184,11 +369,11 @@ block_all_signals(int (*notifier)(void * { unsigned long flags; - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); current->notifier_mask = mask; current->notifier_data = priv; current->notifier = notifier; - spin_unlock_irqrestore(&current->sigmask_lock, flags); + spin_unlock_irqrestore(&current->sighand->siglock, flags); } /* Notify the system that blocking has ended. */ @@ -198,14 +383,14 @@ unblock_all_signals(void) { unsigned long flags; - spin_lock_irqsave(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); current->notifier = NULL; current->notifier_data = NULL; - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); } -static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) +static inline int collect_signal(int sig, struct sigpending *list, siginfo_t *info) { if (sigismember(&list->signal, sig)) { /* Collect the siginfo appropriate to this signal. */ @@ -217,9 +402,10 @@ static int collect_signal(int sig, struc pp = &q->next; } - /* Ok, it wasn't in the queue. We must have - been out of queue space. So zero out the - info. */ + /* Ok, it wasn't in the queue.
This must be + a fast-pathed signal or we must have been + out of queue space. So zero out the info. + */ sigdelset(&list->signal, sig); info->si_signo = sig; info->si_errno = 0; @@ -253,24 +439,12 @@ found_another: return 0; } -/* - * Dequeue a signal and return the element to the caller, which is - * expected to free it. - * - * All callers must be holding current->sigmask_lock. - */ - -int -dequeue_signal(sigset_t *mask, siginfo_t *info) +static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, + siginfo_t *info) { int sig = 0; -#if DEBUG_SIG -printk("SIG dequeue (%s:%d): %d ", current->comm, current->pid, - signal_pending(current)); -#endif - - sig = next_signal(current, mask); + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { @@ -281,34 +455,97 @@ printk("SIG dequeue (%s:%d): %d ", curre } } - if (!collect_signal(sig, &current->pending, info)) + if (!collect_signal(sig, pending, info)) sig = 0; /* XXX: Once POSIX.1b timers are in, if si_code == SI_TIMER, we need to xchg out the timer overrun values. */ } - recalc_sigpending(current); - -#if DEBUG_SIG -printk(" %d -> %d\n", signal_pending(current), sig); -#endif + recalc_sigpending(); return sig; } -static int rm_from_queue(int sig, struct sigpending *s) +/* + * Dequeue a signal and return the element to the caller, which is + * expected to free it. + * + * All callers have to hold the siglock. + */ +int dequeue_signal(sigset_t *mask, siginfo_t *info) +{ + int signr = __dequeue_signal(&current->pending, mask, info); + if (!signr) + signr = __dequeue_signal(&current->signal->shared_pending, + mask, info); + return signr; +} + +/* + * Tell a process that it has a new active signal.. + * + * NOTE! we rely on the previous spin_lock to + * lock interrupts for us! We can only be called with + * "siglock" held, and the local interrupt must + * have been disabled when that got acquired!
+ * + * No need to set need_resched since signal event passing + * goes through ->blocked + */ +inline void signal_wake_up(struct task_struct *t, int resume) +{ + unsigned int mask; + + t->sigpending = 1; + + /* + * If the task is running on a different CPU + * force a reschedule on the other CPU to make + * it notice the new signal quickly. + * + * The code below is a tad loose and might occasionally + * kick the wrong CPU if we catch the process in the + * process of changing - but no harm is done by that + * other than doing an extra (lightweight) IPI interrupt. + */ + if (t->state == TASK_RUNNING) + kick_if_running(t); + /* + * If resume is set, we want to wake it up in the TASK_STOPPED case. + * We don't check for TASK_STOPPED because there is a race with it + * executing another processor and just now entering stopped state. + * By calling wake_up_process any time resume is set, we ensure + * the process will wake up and handle its stop or death signal. + */ + mask = TASK_INTERRUPTIBLE; + if (resume) + mask |= TASK_STOPPED; + if (t->state & mask) { + wake_up_process(t); + return; + } +} + +/* + * Remove signals in mask from the pending set and queue. + * Returns 1 if any signals were found. + * + * All callers must be holding the siglock. + */ +static int rm_from_queue(unsigned long mask, struct sigpending *s) { struct sigqueue *q, **pp; - if (!sigismember(&s->signal, sig)) + if (!sigtestsetmask(&s->signal, mask)) return 0; - sigdelset(&s->signal, sig); + sigdelsetmask(&s->signal, mask); pp = &s->head; while ((q = *pp) != NULL) { - if (q->info.si_signo == sig) { + if (q->info.si_signo < SIGRTMIN && + (mask & sigmask (q->info.si_signo))) { if ((*pp = q->next) == NULL) s->tail = pp; kmem_cache_free(sigqueue_cachep,q); @@ -321,110 +558,107 @@ static int rm_from_queue(int sig, struct } /* - * Remove signal sig from t->pending. - * Returns 1 if sig was found. - * - * All callers must be holding t->sigmask_lock. 
- */ -static int rm_sig_from_queue(int sig, struct task_struct *t) -{ - return rm_from_queue(sig, &t->pending); -} - -/* * Bad permissions for sending the signal */ -int bad_signal(int sig, struct siginfo *info, struct task_struct *t) +static inline int check_kill_permission(int sig, struct siginfo *info, + struct task_struct *t) { - return (!info || ((unsigned long)info != 1 && SI_FROMUSER(info))) + int error = -EINVAL; + if (sig < 0 || sig > _NSIG) + return error; + error = -EPERM; + if ((!info || ((unsigned long)info != 1 && + (unsigned long)info != 2 && SI_FROMUSER(info))) && ((sig != SIGCONT) || (current->session != t->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) - && !capable(CAP_KILL); + && !capable(CAP_KILL)) + return error; + return 0; } -/* - * Signal type: - * < 0 : global action (kill - spread to all non-blocked threads) - * = 0 : ignored - * > 0 : wake up. - */ -static int signal_type(int sig, struct signal_struct *signals) -{ - unsigned long handler; - - if (!signals) - return 0; - - handler = (unsigned long) signals->action[sig-1].sa.sa_handler; - if (handler > 1) - return 1; - - /* "Ignore" handler.. Illogical, but that has an implicit handler for SIGCHLD */ - if (handler == 1) - return sig == SIGCHLD; - - /* Default handler. Normally lethal, but.. */ - switch (sig) { - - /* Ignored */ - case SIGCONT: case SIGWINCH: - case SIGCHLD: case SIGURG: - return 0; - - /* Implicit behaviour */ - case SIGTSTP: case SIGTTIN: case SIGTTOU: - return 1; - - /* Implicit actions (kill or do special stuff) */ - default: - return -1; - } -} - +/* forward decl */ +static void do_notify_parent_cldstop(struct task_struct *tsk, + struct task_struct *parent); /* - * Determine whether a signal should be posted or not. - * - * Signals with SIG_IGN can be ignored, except for the - * special case of a SIGCHLD. - * - * Some signals with SIG_DFL default to a non-action. 
+ * Handle magic process-wide effects of stop/continue signals, and SIGKILL. + * Unlike the signal actions, these happen immediately at signal-generation + * time regardless of blocking, ignoring, or handling. This does the + * actual continuing for SIGCONT, but not the actual stopping for stop + * signals. The process stop is done as a signal action for SIG_DFL. */ -static int ignored_signal(int sig, struct task_struct *t) +static void handle_stop_signal(int sig, struct task_struct *p) { - /* Don't ignore traced or blocked signals */ - if ((t->ptrace & PT_PTRACED) || sigismember(&t->blocked, sig)) - return 0; - - return signal_type(sig, t->sig) == 0; -} + struct task_struct *t; -/* - * Handle TASK_STOPPED cases etc implicit behaviour - * of certain magical signals. - * - * SIGKILL gets spread out to every thread. - */ -static void handle_stop_signal(int sig, struct task_struct *t) -{ - switch (sig) { - case SIGKILL: case SIGCONT: - /* Wake up the process if stopped. */ - if (t->state == TASK_STOPPED) - wake_up_process(t); - t->exit_code = 0; - rm_sig_from_queue(SIGSTOP, t); - rm_sig_from_queue(SIGTSTP, t); - rm_sig_from_queue(SIGTTOU, t); - rm_sig_from_queue(SIGTTIN, t); - break; + if (sig_kernel_stop(sig)) { + /* + * This is a stop signal. Remove SIGCONT from all queues. + */ + rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); + t = p; + do { + rm_from_queue(sigmask(SIGCONT), &t->pending); + t = next_thread(t); + } while (t != p); + } else if (sig == SIGCONT) { + /* + * Remove all stop signals from all queues, + * and wake all threads. + */ + if (unlikely(p->signal->group_stop_count > 0)) { + /* + * There was a group stop in progress. We'll + * pretend it finished before we got here. We are + * obliged to report it to the parent: if the + * SIGSTOP happened "after" this SIGCONT, then it + * would have cleared this pending SIGCONT. 
If it + * happened "before" this SIGCONT, then the parent + * got the SIGCHLD about the stop finishing before + * the continue happened. We do the notification + * now, and it's as if the stop had finished and + * the SIGCHLD was pending on entry to this kill. + */ + p->signal->group_stop_count = 0; + if (p->ptrace & PT_PTRACED) + do_notify_parent_cldstop(p, p->parent); + else + do_notify_parent_cldstop( + p->group_leader, + p->group_leader->real_parent); + } + rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); + t = p; + do { + unsigned int mask; + + rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); + + /* + * If there is a handler for SIGCONT, we must make + * sure that no thread returns to user mode before + * we post the signal, in case it was the only + * thread eligible to run the signal handler--then + * it must not do anything between resuming and + * running the handler. With the TIF_SIGPENDING + * flag set, the thread will pause and acquire the + * siglock that we hold now and until we've queued + * the pending signal. + * + * Wake up the stopped thread _after_ setting + * TIF_SIGPENDING + */ + mask = TASK_STOPPED; + if (sig_user_defined(t, SIGCONT) && + !sigismember(&t->blocked, SIGCONT)) { + t->sigpending = 1; + mask |= TASK_INTERRUPTIBLE; + } + wake_up_state(t, mask); - case SIGSTOP: case SIGTSTP: - case SIGTTIN: case SIGTTOU: - /* If we're stopping again, cancel SIGCONT */ - rm_sig_from_queue(SIGCONT, t); - break; + t = next_thread(t); + } while (t != p); } } @@ -432,6 +666,13 @@ static int send_signal(int sig, struct s { struct sigqueue * q = NULL; + /* + * fast-pathed signals for kernel-internal things like SIGSTOP + * or SIGKILL. + */ + if ((unsigned long)info == 2) + goto out_set; + /* Real-time signals must be queued if sent by sigqueue, or some other real-time mechanism. It is implementation defined whether kill() does so. 
We attempt to do so, on @@ -440,9 +681,8 @@ static int send_signal(int sig, struct s make sure at least one signal gets delivered and don't pass on the info struct. */ - if (atomic_read(&nr_queued_signals) < max_queued_signals) { + if (atomic_read(&nr_queued_signals) < max_queued_signals) q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); - } if (q) { atomic_inc(&nr_queued_signals); @@ -450,185 +690,334 @@ static int send_signal(int sig, struct s *signals->tail = q; signals->tail = &q->next; switch ((unsigned long) info) { - case 0: - q->info.si_signo = sig; - q->info.si_errno = 0; - q->info.si_code = SI_USER; - q->info.si_pid = current->pid; - q->info.si_uid = current->uid; - break; - case 1: - q->info.si_signo = sig; - q->info.si_errno = 0; - q->info.si_code = SI_KERNEL; - q->info.si_pid = 0; - q->info.si_uid = 0; - break; - default: - copy_siginfo(&q->info, info); - break; + case 0: + q->info.si_signo = sig; + q->info.si_errno = 0; + q->info.si_code = SI_USER; + q->info.si_pid = current->pid; + q->info.si_uid = current->uid; + break; + case 1: + q->info.si_signo = sig; + q->info.si_errno = 0; + q->info.si_code = SI_KERNEL; + q->info.si_pid = 0; + q->info.si_uid = 0; + break; + default: + copy_siginfo(&q->info, info); + break; } } else if (sig >= SIGRTMIN && info && (unsigned long)info != 1 - && info->si_code != SI_USER) { + && info->si_code != SI_USER) /* * Queue overflow, abort. We may abort if the signal was rt * and sent by user using something other than kill(). */ return -EAGAIN; - } +out_set: sigaddset(&signals->signal, sig); return 0; } +#define LEGACY_QUEUE(sigptr, sig) \ + (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) + + +static int +specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) +{ + int ret; + + if (!irqs_disabled()) + BUG(); +#if CONFIG_SMP + if (!spin_is_locked(&t->sighand->siglock)) + BUG(); +#endif + + /* Short-circuit ignored signals. 
*/ + if (sig_ignored(t, sig)) + return 0; + + /* Support queueing exactly one non-rt signal, so that we + can get more detailed information about the cause of + the signal. */ + if (LEGACY_QUEUE(&t->pending, sig)) + return 0; + + if (sig_kernel_ignore(sig) && + t->sighand->action[sig-1].sa.sa_handler == SIG_DFL && + !sigismember(&t->blocked, sig)) + return 0; + + ret = send_signal(sig, info, &t->pending); + if (!ret && !sigismember(&t->blocked, sig)) + signal_wake_up(t, sig == SIGKILL); + + return ret; +} + /* - * Tell a process that it has a new active signal.. - * - * NOTE! we rely on the previous spin_lock to - * lock interrupts for us! We can only be called with - * "sigmask_lock" held, and the local interrupt must - * have been disabled when that got acquired! - * - * No need to set need_resched since signal event passing - * goes through ->blocked + * Force a signal that the process can't ignore: if necessary + * we unblock the signal and change any SIG_IGN to SIG_DFL. */ -static inline void signal_wake_up(struct task_struct *t) + +int +force_sig_info(int sig, struct siginfo *info, struct task_struct *t) { - t->sigpending = 1; + unsigned long int flags; + int ret; + + spin_lock_irqsave(&t->sighand->siglock, flags); + if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) + t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; + sigdelset(&t->blocked, sig); + recalc_sigpending_tsk(t); + ret = specific_send_sig_info(sig, info, t); + spin_unlock_irqrestore(&t->sighand->siglock, flags); + + return ret; +} + +void +force_sig_specific(int sig, struct task_struct *t) +{ + unsigned long int flags; + + spin_lock_irqsave(&t->sighand->siglock, flags); + if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) + t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; + sigdelset(&t->blocked, sig); + recalc_sigpending_tsk(t); + specific_send_sig_info(sig, (void *)2, t); + spin_unlock_irqrestore(&t->sighand->siglock, flags); +} + +/* + * Test if P wants to take SIG. 
After we've checked all threads with this, + * it's equivalent to finding no threads not blocking SIG. Any threads not + * blocking SIG were ruled out because they are not running and already + * have pending signals. Such threads will dequeue from the shared queue + * as soon as they're available, so putting the signal on the shared queue + * will be equivalent to sending it to one such thread. + */ +#define wants_signal(sig, p, mask) \ + (!sigismember(&(p)->blocked, sig) \ + && !((p)->state & mask) \ + && !((p)->flags & PF_EXITING) \ + && (task_curr(p) || !signal_pending(p))) + +static inline int +__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +{ + struct task_struct *t; + unsigned int mask; + int ret; + +#if CONFIG_SMP + if (!spin_is_locked(&p->sighand->siglock)) + BUG(); +#endif + handle_stop_signal(sig, p); + + /* Short-circuit ignored signals. */ + if (sig_ignored(p, sig)) + return 0; + + if (LEGACY_QUEUE(&p->signal->shared_pending, sig)) + /* This is a non-RT signal and we already have one queued. */ + return 0; -#ifdef CONFIG_SMP /* - * If the task is running on a different CPU - * force a reschedule on the other CPU to make - * it notice the new signal quickly. + * Don't bother zombies and stopped tasks (but + * SIGKILL will punch through stopped state) + */ + mask = TASK_DEAD | TASK_ZOMBIE; + if (sig != SIGKILL) + mask |= TASK_STOPPED; + + /* + * Put this signal on the shared-pending queue, or fail with EAGAIN. + * We always use the shared queue for process-wide signals, + * to avoid several races. + */ + ret = send_signal(sig, info, &p->signal->shared_pending); + if (unlikely(ret)) + return ret; + + /* + * Now find a thread we can wake up to take the signal off the queue. * - * The code below is a tad loose and might occasionally - * kick the wrong CPU if we catch the process in the - * process of changing - but no harm is done by that - * other than doing an extra (lightweight) IPI interrupt. 
+ * If the main thread wants the signal, it gets first crack. + * Probably the least surprising to the average bear. + */ + if (wants_signal(sig, p, mask)) + t = p; + else if (thread_group_empty(p)) + /* + * There is just one thread and it does not need to be woken. + * It will dequeue unblocked signals before it runs again. + */ + return 0; + else { + /* + * Otherwise try to find a suitable thread. + */ + t = p->signal->curr_target; + if (t == NULL) + /* restart balancing at this thread */ + t = p->signal->curr_target = p; + BUG_ON(t->tgid != p->tgid); + + while (!wants_signal(sig, t, mask)) { + t = next_thread(t); + if (t == p->signal->curr_target) + /* + * No thread needs to be woken. + * Any eligible threads will see + * the signal in the queue soon. + */ + return 0; + } + p->signal->curr_target = t; + } + + if (sig_kernel_ignore(sig) && + p->sighand->action[sig-1].sa.sa_handler == SIG_DFL) { + rm_from_queue(sigmask(sig), &p->signal->shared_pending); + return 0; + } + + /* + * Found a killable thread. If the signal will be fatal, + * then start taking the whole group down immediately. */ - spin_lock(&runqueue_lock); - if (task_has_cpu(t) && t->processor != smp_processor_id()) - smp_send_reschedule(t->processor); - spin_unlock(&runqueue_lock); -#endif /* CONFIG_SMP */ + if (sig_fatal(p, sig) && !p->signal->group_exit && + !sigismember(&t->real_blocked, sig) && + (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { + /* + * This signal will be fatal to the whole group. + */ + if (!sig_kernel_coredump(sig)) { + /* + * Start a group exit and wake everybody up. + * This way we don't have other threads + * running and doing things after a slower + * thread has the fatal signal pending. 
+ */ + p->signal->group_exit = 1; + p->signal->group_exit_code = sig; + p->signal->group_stop_count = 0; + t = p; + do { + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); + t = next_thread(t); + } while (t != p); + return 0; + } - if (t->state & TASK_INTERRUPTIBLE) { - wake_up_process(t); - return; + /* + * There will be a core dump. We make all threads other + * than the chosen one go into a group stop so that nothing + * happens until it gets scheduled, takes the signal off + * the shared queue, and does the core dump. This is a + * little more complicated than strictly necessary, but it + * keeps the signal state that winds up in the core dump + * unchanged from the death state, e.g. which thread had + * the core-dump signal unblocked. + */ + rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); + rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); + p->signal->group_stop_count = 0; + p->signal->group_exit_task = t; + t = p; + do { + p->signal->group_stop_count++; + signal_wake_up(t, 0); + t = next_thread(t); + } while (t != p); + wake_up_process(p->signal->group_exit_task); + return 0; } + + /* + * The signal is already in the shared-pending queue. + * Tell the chosen thread to wake up and dequeue it. + */ + signal_wake_up(t, sig == SIGKILL); + return 0; } -static int deliver_signal(int sig, struct siginfo *info, struct task_struct *t) +/* + * Nuke all other threads in the group. 
+ */ +void zap_other_threads(struct task_struct *p) { - int retval = send_signal(sig, info, &t->pending); + struct task_struct *t; - if (!retval && !sigismember(&t->blocked, sig)) - signal_wake_up(t); + p->signal->group_stop_count = 0; - return retval; + if (thread_group_empty(p)) + return; + + for (t = next_thread(p); t != p; t = next_thread(t)) { + sigaddset(&t->pending.signal, SIGKILL); + rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); + signal_wake_up(t, 1); + } } int -send_sig_info(int sig, struct siginfo *info, struct task_struct *t) +group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; int ret; - -#if DEBUG_SIG -printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig); -#endif - - ret = -EINVAL; - if (sig < 0 || sig > _NSIG) - goto out_nolock; - /* The somewhat baroque permissions check... */ - ret = -EPERM; - if (bad_signal(sig, info, t)) - goto out_nolock; - - /* The null signal is a permissions and process existence probe. - No signal is actually delivered. Same goes for zombies. */ - ret = 0; - if (!sig || !t->sig) - goto out_nolock; - - spin_lock_irqsave(&t->sigmask_lock, flags); - handle_stop_signal(sig, t); - - /* Optimize away the signal, if it's a signal that can be - handled immediately (ie non-blocked and untraced) and - that is ignored (either explicitly or by default). */ - - if (ignored_signal(sig, t)) - goto out; - - /* Support queueing exactly one non-rt signal, so that we - can get more detailed information about the cause of - the signal. 
*/ - if (sig < SIGRTMIN && sigismember(&t->pending.signal, sig)) - goto out; - - ret = deliver_signal(sig, info, t); -out: - spin_unlock_irqrestore(&t->sigmask_lock, flags); -out_nolock: -#if DEBUG_SIG -printk(" %d -> %d\n", signal_pending(t), ret); -#endif + ret = check_kill_permission(sig, info, p); + if (!ret && sig && p->sighand) { + spin_lock_irqsave(&p->sighand->siglock, flags); + ret = __group_send_sig_info(sig, info, p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } return ret; } /* - * Force a signal that the process can't ignore: if necessary - * we unblock the signal and change any SIG_IGN to SIG_DFL. + * kill_pg_info() sends a signal to a process group: this is what the tty + * control characters do (^C, ^Z etc) */ -int -force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { - unsigned long int flags; - - spin_lock_irqsave(&t->sigmask_lock, flags); - if (t->sig == NULL) { - spin_unlock_irqrestore(&t->sigmask_lock, flags); - return -ESRCH; - } + struct task_struct *p; + struct list_head *l; + struct pid *pid; + int err, retval = -ESRCH; - if (t->sig->action[sig-1].sa.sa_handler == SIG_IGN) - t->sig->action[sig-1].sa.sa_handler = SIG_DFL; - sigdelset(&t->blocked, sig); - recalc_sigpending(t); - spin_unlock_irqrestore(&t->sigmask_lock, flags); + if (pgrp <= 0) + return -EINVAL; - return send_sig_info(sig, info, t); + for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + err = group_send_sig_info(sig, info, p); + if (retval) + retval = err; + } + return retval; } -/* - * kill_pg_info() sends a signal to a process group: this is what the tty - * control characters do (^C, ^Z etc) - */ - int kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { - int retval = -EINVAL; - if (pgrp > 0) { - struct task_struct *p; + int retval; + + read_lock(&tasklist_lock); + retval = __kill_pg_info(sig, info, pgrp); + read_unlock(&tasklist_lock); - retval = -ESRCH; - 
read_lock(&tasklist_lock); - for_each_task(p) { - if (p->pgrp == pgrp && thread_group_leader(p)) { - int err = send_sig_info(sig, info, p); - if (retval) - retval = err; - } - } - read_unlock(&tasklist_lock); - } return retval; } @@ -638,28 +1027,33 @@ kill_pg_info(int sig, struct siginfo *in * the connection is lost. */ + int -kill_sl_info(int sig, struct siginfo *info, pid_t sess) +kill_sl_info(int sig, struct siginfo *info, pid_t sid) { - int retval = -EINVAL; - if (sess > 0) { - struct task_struct *p; + int err, retval = -EINVAL; + struct pid *pid; + struct list_head *l; + struct task_struct *p; - retval = -ESRCH; - read_lock(&tasklist_lock); - for_each_task(p) { - if (p->leader && p->session == sess) { - int err = send_sig_info(sig, info, p); - if (retval) - retval = err; - } - } - read_unlock(&tasklist_lock); + if (sid <= 0) + goto out; + + retval = -ESRCH; + read_lock(&tasklist_lock); + for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) { + if (!p->leader) + continue; + err = group_send_sig_info(sig, info, p); + if (retval) + retval = err; } + read_unlock(&tasklist_lock); +out: return retval; } -inline int +int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; @@ -668,15 +1062,8 @@ kill_proc_info(int sig, struct siginfo * read_lock(&tasklist_lock); p = find_task_by_pid(pid); error = -ESRCH; - if (p) { - if (!thread_group_leader(p)) { - struct task_struct *tg; - tg = find_task_by_pid(p->tgid); - if (tg) - p = tg; - } - error = send_sig_info(sig, info, p); - } + if (p) + error = group_send_sig_info(sig, info, p); read_unlock(&tasklist_lock); return error; } @@ -698,9 +1085,9 @@ static int kill_something_info(int sig, struct task_struct * p; read_lock(&tasklist_lock); - for_each_task(p) { - if (p->pid > 1 && p != current && thread_group_leader(p)) { - int err = send_sig_info(sig, info, p); + for_each_process(p) { + if (p->pid > 1 && p->tgid != current->tgid) { + int err = group_send_sig_info(sig, info, p); ++count; if (err != -EPERM) retval = 
err; @@ -720,6 +1107,24 @@ static int kill_something_info(int sig, */ int +send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +{ + int ret; + + /* XXX should nix these interfaces and update the kernel */ + if (T(sig, SIG_KERNEL_BROADCAST_MASK)) { + read_lock(&tasklist_lock); + ret = group_send_sig_info(sig, info, p); + read_unlock(&tasklist_lock); + } else { + spin_lock_irq(&p->sighand->siglock); + ret = specific_send_sig_info(sig, info, p); + spin_unlock_irq(&p->sighand->siglock); + } + return ret; +} + +int send_sig(int sig, struct task_struct *p, int priv) { return send_sig_info(sig, (void*)(long)(priv != 0), p); @@ -753,13 +1158,24 @@ kill_proc(pid_t pid, int sig, int priv) * Joy. Or not. Pthread wants us to wake up every thread * in our parent group. */ -static void wake_up_parent(struct task_struct *parent) +static inline void __wake_up_parent(struct task_struct *p, + struct task_struct *parent) { struct task_struct *tsk = parent; + /* + * Fortunately this is not necessary for thread groups: + */ + if (p->tgid == tsk->tgid) { + wake_up_interruptible(&tsk->wait_chldexit); + return; + } + do { wake_up_interruptible(&tsk->wait_chldexit); tsk = next_thread(tsk); + if (tsk->signal != parent->signal) + BUG(); } while (tsk != parent); } @@ -770,7 +1186,12 @@ static void wake_up_parent(struct task_s void do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; + unsigned long flags; int why, status; + struct sighand_struct *psig; + + if (sig == -1) + BUG(); info.si_signo = sig; info.si_errno = 0; @@ -806,8 +1227,34 @@ void do_notify_parent(struct task_struct info.si_code = why; info.si_status = status; - send_sig_info(sig, &info, tsk->p_pptr); - wake_up_parent(tsk->p_pptr); + psig = tsk->parent->sighand; + spin_lock_irqsave(&psig->siglock, flags); + if (sig == SIGCHLD && tsk->state != TASK_STOPPED && + (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || + (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { + /* + * We are 
exiting and our parent doesn't care. POSIX.1 + * defines special semantics for setting SIGCHLD to SIG_IGN + * or setting the SA_NOCLDWAIT flag: we should be reaped + * automatically and not left for our parent's wait4 call. + * Rather than having the parent do it as a magic kind of + * signal handler, we just set this to tell do_exit that we + * can be cleaned up without becoming a zombie. Note that + * we still call __wake_up_parent in this case, because a + * blocked sys_wait4 might now return -ECHILD. + * + * Whether we send SIGCHLD or not for SA_NOCLDWAIT + * is implementation-defined: we do (if you don't want + * it, just use SIG_IGN instead). + */ + tsk->exit_signal = -1; + if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) + sig = 0; + } + if (sig > 0 && sig <= _NSIG) + __group_send_sig_info(sig, &info, tsk->parent); + __wake_up_parent(tsk, tsk->parent); + spin_unlock_irqrestore(&psig->siglock, flags); } @@ -821,12 +1268,308 @@ void do_notify_parent(struct task_struct void notify_parent(struct task_struct *tsk, int sig) { - read_lock(&tasklist_lock); - do_notify_parent(tsk, sig); - read_unlock(&tasklist_lock); + if (sig != -1) { + read_lock(&tasklist_lock); + do_notify_parent(tsk, sig); + read_unlock(&tasklist_lock); + } +} + +static void +do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent) +{ + struct siginfo info; + unsigned long flags; + struct sighand_struct *sighand; + + info.si_signo = SIGCHLD; + info.si_errno = 0; + info.si_pid = tsk->pid; + info.si_uid = tsk->uid; + + /* FIXME: find out whether or not this is supposed to be c*time. 
*/ + info.si_utime = tsk->times.tms_utime; + info.si_stime = tsk->times.tms_stime; + + info.si_status = tsk->exit_code & 0x7f; + info.si_code = CLD_STOPPED; + + sighand = parent->sighand; + spin_lock_irqsave(&sighand->siglock, flags); + if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && + !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) + __group_send_sig_info(SIGCHLD, &info, parent); + /* + * Even if SIGCHLD is not generated, we must wake up wait4 calls. + */ + __wake_up_parent(tsk, parent); + spin_unlock_irqrestore(&sighand->siglock, flags); +} + +static void +finish_stop(int stop_count) +{ + /* + * If there are no other threads in the group, or if there is + * a group stop in progress and we are the last to stop, + * report to the parent. When ptraced, every thread reports itself. + */ + if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, current->parent); + read_unlock(&tasklist_lock); + } + else if (stop_count == 0) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, + current->group_leader->real_parent); + read_unlock(&tasklist_lock); + } + + schedule(); + /* + * Now we don't run again until continued. + */ + current->exit_code = 0; +} + +/* + * This performs the stopping for SIGSTOP and other stop signals. + * We have to stop all threads in the thread group. + */ +static void +do_signal_stop(int signr) +{ + struct signal_struct *sig = current->signal; + struct sighand_struct *sighand = current->sighand; + int stop_count = -1; + + if (sig->group_stop_count > 0) { + /* + * There is a group stop in progress. We don't need to + * start another one. 
+ */ + spin_lock_irq(&sighand->siglock); + if (unlikely(sig->group_stop_count == 0)) { + spin_unlock_irq(&sighand->siglock); + return; + } + signr = sig->group_exit_code; + stop_count = --sig->group_stop_count; + current->exit_code = signr; + set_current_state(TASK_STOPPED); + spin_unlock_irq(&sighand->siglock); + } + else if (thread_group_empty(current)) { + /* + * No locks needed in this case. + */ + current->exit_code = signr; + set_current_state(TASK_STOPPED); + } + else { + /* + * There is no group stop already in progress. + * We must initiate one now. + */ + struct task_struct *t; + read_lock(&tasklist_lock); + spin_lock_irq(&sighand->siglock); + + if (unlikely(sig->group_exit)) { + /* + * There is a group exit in progress now. + * We'll just ignore the stop and process the + * associated fatal signal. + */ + spin_unlock_irq(&sighand->siglock); + read_unlock(&tasklist_lock); + return; + } + + if (sig->group_stop_count == 0) { + sig->group_exit_code = signr; + stop_count = 0; + for (t = next_thread(current); t != current; + t = next_thread(t)) + /* + * Setting state to TASK_STOPPED for a group + * stop is always done with the siglock held, + * so this check has no races. + */ + if (t->state < TASK_STOPPED) { + stop_count++; + signal_wake_up(t, 0); + } + sig->group_stop_count = stop_count; + } + else { + /* A race with another thread while unlocked. 
*/ + signr = sig->group_exit_code; + stop_count = --sig->group_stop_count; + } + + current->exit_code = signr; + set_current_state(TASK_STOPPED); + + spin_unlock_irq(&sighand->siglock); + read_unlock(&tasklist_lock); + } + + finish_stop(stop_count); +} + + +#ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER + +int get_signal_to_deliver(siginfo_t *info, struct pt_regs *regs) +{ + sigset_t *mask = &current->blocked; + + for (;;) { + unsigned long signr = 0; + struct k_sigaction *ka; + + spin_lock_irq(&current->sighand->siglock); + if (unlikely(current->signal->group_stop_count > 0)) { + int stop_count; + if (current->signal->group_exit_task == current) { + /* + * Group stop is so we can do a core dump. + */ + current->signal->group_exit_task = NULL; + goto dequeue; + } + /* + * There is a group stop in progress. We stop + * without any associated signal being in our queue. + */ + stop_count = --current->signal->group_stop_count; + signr = current->signal->group_exit_code; + current->exit_code = signr; + set_current_state(TASK_STOPPED); + spin_unlock_irq(&current->sighand->siglock); + finish_stop(stop_count); + continue; + } + dequeue: + signr = dequeue_signal(mask, info); + spin_unlock_irq(&current->sighand->siglock); + + if (!signr) + break; + + if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { + /* + * If there is a group stop in progress, + * we must participate in the bookkeeping. + */ + if (current->signal->group_stop_count > 0) { + spin_lock_irq(&current->sighand->siglock); + --current->signal->group_stop_count; + spin_unlock_irq(&current->sighand->siglock); + } + + /* Let the debugger run. */ + current->exit_code = signr; + current->last_siginfo = info; + set_current_state(TASK_STOPPED); + notify_parent(current, SIGCHLD); + schedule(); + + current->last_siginfo = NULL; + + /* We're back. Did the debugger cancel the sig? */ + signr = current->exit_code; + if (signr == 0) + continue; + current->exit_code = 0; + + /* Update the siginfo structure if the signal has + changed.
If the debugger wanted something + specific in the siginfo structure then it should + have updated *info via PTRACE_SETSIGINFO. */ + if (signr != info->si_signo) { + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = current->parent->pid; + info->si_uid = current->parent->uid; + } + + /* If the (new) signal is now blocked, requeue it. */ + if (sigismember(&current->blocked, signr)) { + spin_lock_irq(&current->sighand->siglock); + specific_send_sig_info(signr, info, current); + spin_unlock_irq(&current->sighand->siglock); + continue; + } + } + + ka = &current->sighand->action[signr-1]; + if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ + continue; + if (ka->sa.sa_handler != SIG_DFL) /* Run the handler. */ + return signr; + + /* + * Now we are doing the default action for this signal. + */ + if (sig_kernel_ignore(signr)) /* Default is nothing. */ + continue; + + /* Init gets no signals it doesn't want. */ + if (current->pid == 1) + continue; + + if (sig_kernel_stop(signr)) { + /* + * The default action is to stop all threads in + * the thread group. The job control signals + * do nothing in an orphaned pgrp, but SIGSTOP + * always works. + */ + if (signr == SIGSTOP || + !is_orphaned_pgrp(current->pgrp)) + do_signal_stop(signr); + continue; + } + + /* + * Anything else is fatal, maybe with a core dump. + */ + current->flags |= PF_SIGNALED; + if (sig_kernel_coredump(signr) && + do_coredump(signr, signr, regs)) { + /* + * That killed all other threads in the group and + * synchronized with their demise, so there can't + * be any more left to kill now. The group_exit + * flags are set by do_coredump. Note that + * thread_group_empty won't always be true yet, + * because those threads were blocked in __exit_mm + * and we just let them go to finish dying.
+ */ + const int code = signr | 0x80; + BUG_ON(!current->signal->group_exit); + BUG_ON(current->signal->group_exit_code != code); + do_exit(code); + /* NOTREACHED */ + } + + /* + * Death signals, no core dump. + */ + do_group_exit(signr); + /* NOTREACHED */ + } + return 0; } -EXPORT_SYMBOL(dequeue_signal); +#endif + +EXPORT_SYMBOL(recalc_sigpending); +EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(force_sig_info); @@ -837,7 +1580,6 @@ EXPORT_SYMBOL(kill_proc_info); EXPORT_SYMBOL(kill_sl); EXPORT_SYMBOL(kill_sl_info); EXPORT_SYMBOL(notify_parent); -EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(block_all_signals); @@ -870,7 +1612,7 @@ sys_rt_sigprocmask(int how, sigset_t *se goto out; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); old_set = current->blocked; error = 0; @@ -889,16 +1631,16 @@ sys_rt_sigprocmask(int how, sigset_t *se } current->blocked = new_set; - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); if (error) goto out; if (oset) goto set_old; } else if (oset) { - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); old_set = current->blocked; - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); set_old: error = -EFAULT; @@ -918,13 +1660,18 @@ long do_sigpending(void *set, unsigned l if (sigsetsize > sizeof(sigset_t)) goto out; - spin_lock_irq(&current->sigmask_lock); - sigandsets(&pending, &current->blocked, &current->pending.signal); - spin_unlock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); + sigorsets(&pending, &current->pending.signal, + &current->signal->shared_pending.signal); + spin_unlock_irq(&current->sighand->siglock); + + /* Outside the lock because only this thread touches it.
*/ + sigandsets(&pending, &current->blocked, &pending); error = -EFAULT; if (!copy_to_user(set, &pending, sigsetsize)) error = 0; + out: return error; } @@ -967,7 +1714,7 @@ sys_rt_sigtimedwait(const sigset_t *uthe return -EINVAL; } - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); sig = dequeue_signal(&these, &info); if (!sig) { timeout = MAX_SCHEDULE_TIMEOUT; @@ -979,21 +1726,22 @@ sys_rt_sigtimedwait(const sigset_t *uthe /* None ready -- temporarily unblock those we're * interested while we are sleeping in so that we'll * be awakened when they arrive. */ - sigset_t oldblocked = current->blocked; + current->real_blocked = current->blocked; sigandsets(&current->blocked, &current->blocked, &these); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); current->state = TASK_INTERRUPTIBLE; timeout = schedule_timeout(timeout); - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); sig = dequeue_signal(&these, &info); - current->blocked = oldblocked; - recalc_sigpending(current); + current->blocked = current->real_blocked; + siginitset(&current->real_blocked, 0); + recalc_sigpending(); } } - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); if (sig) { ret = sig; @@ -1025,33 +1773,43 @@ sys_kill(int pid, int sig) } /* - * Kill only one task, even if it's a CLONE_THREAD task. + * Send a signal to only one task, even if it's a CLONE_THREAD task.
*/ asmlinkage long sys_tkill(int pid, int sig) { - struct siginfo info; - int error; - struct task_struct *p; - - /* This is only valid for single tasks */ - if (pid <= 0) - return -EINVAL; - - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = current->pid; - info.si_uid = current->uid; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - error = -ESRCH; - if (p) { - error = send_sig_info(sig, &info, p); - } - read_unlock(&tasklist_lock); - return error; + struct siginfo info; + int error; + struct task_struct *p; + + /* This is only valid for single tasks */ + if (pid <= 0) + return -EINVAL; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; + info.si_pid = current->pid; + info.si_uid = current->uid; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + error = -ESRCH; + if (p) { + error = check_kill_permission(sig, &info, p); + /* + * The null signal is a permissions and process existence + * probe. No signal is actually delivered. + */ + if (!error && sig && p->sighand) { + spin_lock_irq(&p->sighand->siglock); + handle_stop_signal(sig, p); + error = specific_send_sig_info(sig, &info, p); + spin_unlock_irq(&p->sighand->siglock); + } + } + read_unlock(&tasklist_lock); + return error; } asmlinkage long @@ -1077,21 +1835,25 @@ do_sigaction(int sig, const struct k_sig { struct k_sigaction *k; - if (sig < 1 || sig > _NSIG || - (act && (sig == SIGKILL || sig == SIGSTOP))) + if (sig < 1 || sig > _NSIG || (act && sig_kernel_only(sig))) return -EINVAL; - k = &current->sig->action[sig-1]; + k = &current->sighand->action[sig-1]; - spin_lock(&current->sig->siglock); + spin_lock_irq(&current->sighand->siglock); + if (signal_pending(current)) { + /* + * If there might be a fatal signal pending on multiple + * threads, make sure we take it before changing the action.
+ */ + spin_unlock_irq(&current->sighand->siglock); + return -ERESTARTNOINTR; + } if (oact) *oact = *k; if (act) { - *k = *act; - sigdelsetmask(&k->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is @@ -1102,27 +1864,40 @@ do_sigaction(int sig, const struct k_sig * pending and whose default action is to ignore the signal * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" - * - * Note the silly behaviour of SIGCHLD: SIG_IGN means that the - * signal isn't actually ignored, but does automatic child - * reaping, while SIG_DFL is explicitly said by POSIX to force - * the signal to be ignored. */ - - if (k->sa.sa_handler == SIG_IGN - || (k->sa.sa_handler == SIG_DFL - && (sig == SIGCONT || - sig == SIGCHLD || - sig == SIGURG || - sig == SIGWINCH))) { - spin_lock_irq(&current->sigmask_lock); - if (rm_sig_from_queue(sig, current)) - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + if (act->sa.sa_handler == SIG_IGN || + (act->sa.sa_handler == SIG_DFL && + sig_kernel_ignore(sig))) { + /* + * This is a fairly rare case, so we only take the + * tasklist_lock once we're sure we'll need it. + * Now we must do this little unlock and relock + * dance to maintain the lock hierarchy.
+ */ + struct task_struct *t = current; + spin_unlock_irq(&t->sighand->siglock); + read_lock(&tasklist_lock); + spin_lock_irq(&t->sighand->siglock); + *k = *act; + sigdelsetmask(&k->sa.sa_mask, + sigmask(SIGKILL) | sigmask(SIGSTOP)); + rm_from_queue(sigmask(sig), &t->signal->shared_pending); + do { + rm_from_queue(sigmask(sig), &t->pending); + recalc_sigpending_tsk(t); + t = next_thread(t); + } while (t != current); + spin_unlock_irq(&current->sighand->siglock); + read_unlock(&tasklist_lock); + return 0; } + + *k = *act; + sigdelsetmask(&k->sa.sa_mask, + sigmask(SIGKILL) | sigmask(SIGSTOP)); } - spin_unlock(&current->sig->siglock); + spin_unlock_irq(&current->sighand->siglock); return 0; } @@ -1151,7 +1926,7 @@ do_sigaltstack (const stack_t *uss, stac goto out; error = -EPERM; - if (on_sig_stack (sp)) + if (on_sig_stack(sp)) goto out; error = -EINVAL; @@ -1209,9 +1984,9 @@ sys_sigprocmask(int how, old_sigset_t *s error = -EFAULT; if (copy_from_user(&new_set, set, sizeof(*set))) goto out; - new_set &= ~(sigmask(SIGKILL)|sigmask(SIGSTOP)); + new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); old_set = current->blocked.sig[0]; error = 0; @@ -1230,8 +2005,8 @@ sys_sigprocmask(int how, old_sigset_t *s break; } - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); if (error) goto out; if (oset) @@ -1277,7 +2052,7 @@ out: #endif /* __sparc__ */ #endif -#if !defined(__alpha__) && !defined(__ia64__) +#if !defined(__alpha__) && !defined(__ia64__) && !defined(__arm__) /* * For backwards compatibility. Functionality superseded by sigprocmask.
*/ @@ -1293,19 +2068,20 @@ sys_ssetmask(int newmask) { int old; - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); old = current->blocked.sig[0]; siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)| sigmask(SIGSTOP))); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); return old; } #endif /* !defined(__alpha__) */ -#if !defined(__alpha__) && !defined(__ia64__) && !defined(__mips__) +#if !defined(__alpha__) && !defined(__ia64__) && !defined(__mips__) && \ + !defined(__arm__) /* * For backwards compatibility. Functionality superseded by sigaction. */ @@ -1322,4 +2098,28 @@ sys_signal(int sig, __sighandler_t handl return ret ? ret : (unsigned long)old_sa.sa.sa_handler; } -#endif /* !alpha && !__ia64__ && !defined(__mips__) */ +#endif /* !alpha && !__ia64__ && !defined(__mips__) && !defined(__arm__) */ + +#ifndef HAVE_ARCH_SYS_PAUSE + +asmlinkage int +sys_pause(void) +{ + current->state = TASK_INTERRUPTIBLE; + schedule(); + return -ERESTARTNOHAND; +} + +#endif /* HAVE_ARCH_SYS_PAUSE */ + +void __init signals_init(void) +{ + sigqueue_cachep = + kmem_cache_create("sigqueue", + sizeof(struct sigqueue), + __alignof__(struct sigqueue), + 0, NULL, NULL); + if (!sigqueue_cachep) + panic("signals_init(): cannot create sigqueue SLAB cache"); +} + --- linux/kernel/softirq.c.orig +++ linux/kernel/softirq.c @@ -364,13 +364,13 @@ static int ksoftirqd(void * __bind_cpu) int cpu = cpu_logical_map(bind_cpu); daemonize(); - current->nice = 19; + set_user_nice(current, 19); sigfillset(&current->blocked); /* Migrate to the right CPU */ - current->cpus_allowed = 1UL << cpu; - while (smp_processor_id() != cpu) - schedule(); + set_cpus_allowed(current, 1UL << cpu); + if (cpu() != cpu) + BUG(); sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu); @@ -395,13 +395,13 @@ static int ksoftirqd(void * __bind_cpu) } } -static __init int spawn_ksoftirqd(void) +__init int spawn_ksoftirqd(void) { int
cpu; for (cpu = 0; cpu < smp_num_cpus; cpu++) { if (kernel_thread(ksoftirqd, (void *) (long) cpu, - CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0) + CLONE_KERNEL) < 0) printk("spawn_ksoftirqd() failed for cpu %d\n", cpu); else { while (!ksoftirqd_task(cpu_logical_map(cpu))) --- linux/kernel/sys.c.orig +++ linux/kernel/sys.c @@ -16,9 +16,31 @@ #include #include #include +#include +#include #include #include +#include + +#ifndef SET_UNALIGN_CTL +# define SET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_UNALIGN_CTL +# define GET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEMU_CTL +# define SET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEMU_CTL +# define GET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEXC_CTL +# define SET_FPEXC_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEXC_CTL +# define GET_FPEXC_CTL(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -43,6 +65,7 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOW int C_A_D = 1; int cad_pid = 1; +extern int system_running; /* * Notifier list for kernel code which wants to be called @@ -181,35 +204,34 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } -static int proc_sel(struct task_struct *p, int which, int who) +static int set_one_prio(struct task_struct *p, int niceval, int error) { - if(p->pid) - { - switch (which) { - case PRIO_PROCESS: - if (!who && p == current) - return 1; - return(p->pid == who); - case PRIO_PGRP: - if (!who) - who = current->pgrp; - return(p->pgrp == who); - case PRIO_USER: - if (!who) - who = current->uid; - return(p->uid == who); - } + if (p->uid != current->euid && + p->uid != current->uid && !capable(CAP_SYS_NICE)) { + error = -EPERM; + goto out; } - return 0; + + if (error == -ESRCH) + error = 0; + if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) + error = -EACCES; + else + set_user_nice(p, niceval); +out: + return error; } asmlinkage long sys_setpriority(int which, int who, int niceval) { - struct task_struct *p; - int 
error; + struct task_struct *g, *p; + struct user_struct *user; + struct pid *pid; + struct list_head *l; + int error = -EINVAL; if (which > 2 || which < 0) - return -EINVAL; + goto out; /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; @@ -219,23 +241,38 @@ asmlinkage long sys_setpriority(int whic niceval = 19; read_lock(&tasklist_lock); - for_each_task(p) { - if (!proc_sel(p, which, who)) - continue; - if (p->uid != current->euid && - p->uid != current->uid && !capable(CAP_SYS_NICE)) { - error = -EPERM; - continue; - } - if (error == -ESRCH) - error = 0; - if (niceval < p->nice && !capable(CAP_SYS_NICE)) - error = -EACCES; - else - p->nice = niceval; + switch (which) { + case PRIO_PROCESS: + if (!who) + who = current->pid; + p = find_task_by_pid(who); + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (!who) + who = current->pgrp; + for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) + error = set_one_prio(p, niceval, error); + break; + case PRIO_USER: + if (!who) + user = current->user; + else + user = find_user(who); + + if (!user) + goto out_unlock; + + do_each_thread(g, p) + if (p->uid == who) + error = set_one_prio(p, niceval, error); + while_each_thread(g, p); + break; } +out_unlock: read_unlock(&tasklist_lock); - +out: return error; } @@ -247,21 +284,55 @@ asmlinkage long sys_setpriority(int whic */ asmlinkage long sys_getpriority(int which, int who) { - struct task_struct *p; - long retval = -ESRCH; + struct task_struct *g, *p; + struct list_head *l; + struct pid *pid; + struct user_struct *user; + long niceval, retval = -ESRCH; if (which > 2 || which < 0) return -EINVAL; read_lock(&tasklist_lock); - for_each_task (p) { - long niceval; - if (!proc_sel(p, which, who)) - continue; - niceval = 20 - p->nice; - if (niceval > retval) - retval = niceval; + switch (which) { + case PRIO_PROCESS: + if (!who) + who = current->pid; + p = find_task_by_pid(who); + if (p) { + niceval = 20 - task_nice(p); + if 
(niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (!who) + who = current->pgrp; + for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_USER: + if (!who) + user = current->user; + else + user = find_user(who); + + if (!user) + goto out_unlock; + + do_each_thread(g, p) + if (p->uid == who) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + while_each_thread(g, p); + break; } +out_unlock: read_unlock(&tasklist_lock); return retval; @@ -840,41 +911,54 @@ asmlinkage long sys_setpgid(pid_t pid, p /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ - read_lock(&tasklist_lock); + write_lock_irq(&tasklist_lock); err = -ESRCH; p = find_task_by_pid(pid); if (!p) goto out; - if (p->p_pptr == current || p->p_opptr == current) { + err = -EINVAL; + if (!thread_group_leader(p)) + goto out; + + if (p->parent == current || p->real_parent == current) { err = -EPERM; if (p->session != current->session) goto out; err = -EACCES; if (p->did_exec) goto out; - } else if (p != current) - goto out; + } else { + err = -ESRCH; + if (p != current) + goto out; + } + err = -EPERM; if (p->leader) goto out; if (pgid != pid) { - struct task_struct * tmp; - for_each_task (tmp) { - if (tmp->pgrp == pgid && - tmp->session == current->session) + struct task_struct *p; + struct pid *pid; + struct list_head *l; + + for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid) + if (p->session == current->session) goto ok_pgid; - } goto out; } ok_pgid: - p->pgrp = pgid; + if (p->pgrp != pgid) { + detach_pid(p, PIDTYPE_PGID); + p->pgrp = pgid; + attach_pid(p, PIDTYPE_PGID, pgid); + } err = 0; out: /* All paths lead to here, thus we are safe. 
-DaveM */ - read_unlock(&tasklist_lock); + write_unlock_irq(&tasklist_lock); return err; } @@ -915,7 +999,7 @@ asmlinkage long sys_getsid(pid_t pid) p = find_task_by_pid(pid); retval = -ESRCH; - if(p) + if (p) retval = p->session; read_unlock(&tasklist_lock); return retval; @@ -924,22 +1008,25 @@ asmlinkage long sys_getsid(pid_t pid) asmlinkage long sys_setsid(void) { - struct task_struct * p; + struct pid *pid; int err = -EPERM; - read_lock(&tasklist_lock); - for_each_task(p) { - if (p->pgrp == current->pid) - goto out; - } + if (!thread_group_leader(current)) + return -EINVAL; + + write_lock_irq(&tasklist_lock); + + pid = find_pid(PIDTYPE_PGID, current->pid); + if (pid) + goto out; current->leader = 1; - current->session = current->pgrp = current->pid; + __set_special_pids(current->pid, current->pid); current->tty = NULL; current->tty_old_pgrp = 0; err = current->pgrp; out: - read_unlock(&tasklist_lock); + write_unlock_irq(&tasklist_lock); return err; } @@ -974,12 +1061,15 @@ asmlinkage long sys_getgroups(int gidset asmlinkage long sys_setgroups(int gidsetsize, gid_t *grouplist) { + gid_t groups[NGROUPS]; + if (!capable(CAP_SETGID)) return -EPERM; if ((unsigned) gidsetsize > NGROUPS) return -EINVAL; - if(copy_from_user(current->groups, grouplist, gidsetsize * sizeof(gid_t))) + if(copy_from_user(groups, grouplist, gidsetsize * sizeof(gid_t))) return -EFAULT; + memcpy(current->groups, groups, gidsetsize * sizeof(gid_t)); current->ngroups = gidsetsize; return 0; } @@ -1137,13 +1227,8 @@ asmlinkage long sys_setrlimit(unsigned i if (resource == RLIMIT_NOFILE) { if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN) return -EPERM; - } else if (resource == RLIMIT_RSS && current->mm) { - /* rlimit is specified in bytes, convert to pages */ - unsigned long pages = RLIM_INFINITY; - if (new_rlim.rlim_cur != RLIM_INFINITY) - pages = new_rlim.rlim_cur >> PAGE_SHIFT; - current->mm->rlimit_rss = pages; } + *old_rlim = new_rlim; return 0; } @@ -1244,37 +1329,26 @@ 
asmlinkage long sys_prctl(int option, un } current->mm->dumpable = arg2; break; - case PR_SET_UNALIGN: -#ifdef SET_UNALIGN_CTL + + case PR_SET_UNALIGN: error = SET_UNALIGN_CTL(current, arg2); -#else - error = -EINVAL; -#endif break; - - case PR_GET_UNALIGN: -#ifdef GET_UNALIGN_CTL + case PR_GET_UNALIGN: error = GET_UNALIGN_CTL(current, arg2); -#else - error = -EINVAL; -#endif break; - - case PR_SET_FPEMU: -#ifdef SET_FPEMU_CTL + case PR_SET_FPEMU: error = SET_FPEMU_CTL(current, arg2); -#else - error = -EINVAL; -#endif break; - - case PR_GET_FPEMU: -#ifdef GET_FPEMU_CTL + case PR_GET_FPEMU: error = GET_FPEMU_CTL(current, arg2); -#else - error = -EINVAL; -#endif break; + case PR_SET_FPEXC: + error = SET_FPEXC_CTL(current, arg2); + break; + case PR_GET_FPEXC: + error = GET_FPEXC_CTL(current, arg2); + break; + case PR_GET_KEEPCAPS: if (current->keep_capabilities) --- linux/kernel/sysctl.c.orig +++ linux/kernel/sysctl.c @@ -41,6 +41,7 @@ /* External variables not in a header file. */ extern int panic_timeout; +extern int print_fatal_signals; extern int C_A_D; extern int bdf_prm[], bdflush_min[], bdflush_max[]; extern int sysctl_overcommit_memory; @@ -51,6 +52,7 @@ extern int sysrq_enabled; extern int core_uses_pid; extern char core_pattern[]; extern int cad_pid; +extern int pid_max; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -173,6 +175,8 @@ static ctl_table kern_table[] = { 0644, NULL, &proc_doutsstring, &sysctl_string}, {KERN_PANIC, "panic", &panic_timeout, sizeof(int), 0644, NULL, &proc_dointvec}, + {KERN_PANIC, "print_fatal_signals", &print_fatal_signals, sizeof(int), + 0644, NULL, &proc_dointvec}, {KERN_CORE_USES_PID, "core_uses_pid", &core_uses_pid, sizeof(int), 0644, NULL, &proc_dointvec}, {KERN_CORE_PATTERN, "core_pattern", core_pattern, 64, @@ -247,6 +251,8 @@ static ctl_table kern_table[] = { 0600, NULL, &proc_dointvec}, {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int), 0644, 
NULL, &proc_dointvec}, + {KERN_PID_MAX, "pid_max", &pid_max, sizeof (int), + 0600, NULL, &proc_dointvec}, {KERN_RANDOM, "random", NULL, 0, 0555, random_table}, {KERN_OVERFLOWUID, "overflowuid", &overflowuid, sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, --- linux/kernel/timer.c.orig +++ linux/kernel/timer.c @@ -25,6 +25,8 @@ #include +struct kernel_stat kstat; + /* * Timekeeping variables */ @@ -536,7 +538,9 @@ static inline void do_process_times(stru unsigned long psecs; psecs = (p->times.tms_utime += user); + p->group_leader->group_times.tms_utime += user; psecs += (p->times.tms_stime += system); + p->group_leader->group_times.tms_stime += system; if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) { /* Send SIGXCPU every second.. */ if (!(psecs % HZ)) @@ -594,25 +598,7 @@ void update_process_times(int user_tick) int cpu = smp_processor_id(), system = user_tick ^ 1; update_one_process(p, user_tick, system, cpu); - if (p->pid) { - if (--p->counter <= 0) { - p->counter = 0; - /* - * SCHED_FIFO is priority preemption, so this is - * not the place to decide whether to reschedule a - * SCHED_FIFO task or not - Bhavesh Davda - */ - if (p->policy != SCHED_FIFO) { - p->need_resched = 1; - } - } - if (p->nice > 0) - kstat.per_cpu_nice[cpu] += user_tick; - else - kstat.per_cpu_user[cpu] += user_tick; - kstat.per_cpu_system[cpu] += system; - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) - kstat.per_cpu_system[cpu] += system; + scheduler_tick(user_tick, system); } /* @@ -620,17 +606,7 @@ void update_process_times(int user_tick) */ static unsigned long count_active_tasks(void) { - struct task_struct *p; - unsigned long nr = 0; - - read_lock(&tasklist_lock); - for_each_task(p) { - if ((p->state == TASK_RUNNING || - (p->state & TASK_UNINTERRUPTIBLE))) - nr += FIXED_1; - } - read_unlock(&tasklist_lock); - return nr; + return (nr_running() + nr_uninterruptible()) * FIXED_1; } /* @@ -752,8 +728,8 @@ asmlinkage long sys_getpid(void) } /* - * This is 
not strictly SMP safe: p_opptr could change - * from under us. However, rather than getting any lock + * Accessing ->group_leader->real_parent is not SMP-safe, it could + * change from under us. However, rather than getting any lock * we can use an optimistic algorithm: get the parent * pid, and go back and check that the parent is still * the same. If it has changed (which is extremely unlikely @@ -761,33 +737,31 @@ asmlinkage long sys_getpid(void) * * NOTE! This depends on the fact that even if we _do_ * get an old value of "parent", we can happily dereference - * the pointer: we just can't necessarily trust the result + * the pointer (it was and remains a dereferencable kernel pointer + * no matter what): we just can't necessarily trust the result * until we know that the parent pointer is valid. * - * The "mb()" macro is a memory barrier - a synchronizing - * event. It also makes sure that gcc doesn't optimize - * away the necessary memory references.. The barrier doesn't - * have to have all that strong semantics: on x86 we don't - * really require a synchronizing instruction, for example. - * The barrier is more important for code generation than - * for any real memory ordering semantics (even if there is - * a small window for a race, using the old pointer is - * harmless for a while). + * NOTE2: ->group_leader never changes from under us. 
*/ asmlinkage long sys_getppid(void) { int pid; - struct task_struct * me = current; - struct task_struct * parent; + struct task_struct *me = current; + struct task_struct *parent; - parent = me->p_opptr; + parent = me->group_leader->real_parent; for (;;) { - pid = parent->pid; + pid = parent->tgid; #if CONFIG_SMP { struct task_struct *old = parent; - mb(); - parent = me->p_opptr; + + /* + * Make sure we read the pid before re-reading the + * parent pointer: + */ + rmb(); + parent = me->group_leader->real_parent; if (old != parent) continue; } @@ -823,6 +797,89 @@ asmlinkage long sys_getegid(void) #endif +static void process_timeout(unsigned long __data) +{ + wake_up_process((task_t *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * In all cases the return value is guaranteed to be non-negative. + */ +signed long schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. 
We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) + { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx from %p\n", timeout, + __builtin_return_address(0)); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + init_timer(&timer); + timer.expires = expire; + timer.data = (unsigned long) current; + timer.function = process_timeout; + + add_timer(&timer); + schedule(); + del_timer_sync(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 
0 : timeout; +} + /* Thread ID - the internal kernel "pid" */ asmlinkage long sys_gettid(void) { @@ -842,7 +899,7 @@ asmlinkage long sys_nanosleep(struct tim if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && - current->policy != SCHED_OTHER) + current->policy != SCHED_NORMAL) { /* * Short delay requests up to 2 ms will be handled with @@ -869,4 +926,3 @@ asmlinkage long sys_nanosleep(struct tim } return 0; } - --- linux/kernel/user.c.orig 2000-11-29 07:43:39.000000000 +0100 +++ linux/kernel/user.c @@ -11,6 +11,7 @@ #include #include #include +#include /* * UID task count cache, to get fast user lookup in "alloc_uid" @@ -19,58 +20,53 @@ #define UIDHASH_BITS 8 #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) -#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) ^ uid) & UIDHASH_MASK) -#define uidhashentry(uid) (uidhash_table + __uidhashfn(uid)) +#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) +#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) static kmem_cache_t *uid_cachep; -static struct user_struct *uidhash_table[UIDHASH_SZ]; +static struct list_head uidhash_table[UIDHASH_SZ]; static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED; struct user_struct root_user = { - __count: ATOMIC_INIT(1), - processes: ATOMIC_INIT(1), - files: ATOMIC_INIT(0) + .__count = ATOMIC_INIT(1), + .processes = ATOMIC_INIT(1), + .files = ATOMIC_INIT(0) }; /* * These routines must be called with the uidhash spinlock held! 
*/ -static inline void uid_hash_insert(struct user_struct *up, struct user_struct **hashent) +static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) { - struct user_struct *next = *hashent; - - up->next = next; - if (next) - next->pprev = &up->next; - up->pprev = hashent; - *hashent = up; + list_add(&up->uidhash_list, hashent); } static inline void uid_hash_remove(struct user_struct *up) { - struct user_struct *next = up->next; - struct user_struct **pprev = up->pprev; + list_del(&up->uidhash_list); +} + +static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) +{ + struct list_head *up; + + list_for_each(up, hashent) { + struct user_struct *user; + + user = list_entry(up, struct user_struct, uidhash_list); - if (next) - next->pprev = pprev; - *pprev = next; -} - -static inline struct user_struct *uid_hash_find(uid_t uid, struct user_struct **hashent) -{ - struct user_struct *next; - - next = *hashent; - for (;;) { - struct user_struct *up = next; - if (next) { - next = up->next; - if (up->uid != uid) - continue; - atomic_inc(&up->__count); + if(user->uid == uid) { + atomic_inc(&user->__count); + return user; } - return up; } + + return NULL; +} + +struct user_struct *find_user(uid_t uid) +{ + return uid_hash_find(uid, uidhashentry(uid)); } void free_uid(struct user_struct *up) @@ -84,7 +80,7 @@ void free_uid(struct user_struct *up) struct user_struct * alloc_uid(uid_t uid) { - struct user_struct **hashent = uidhashentry(uid); + struct list_head *hashent = uidhashentry(uid); struct user_struct *up; spin_lock(&uidhash_lock); @@ -123,12 +119,17 @@ struct user_struct * alloc_uid(uid_t uid static int __init uid_cache_init(void) { + int n; + uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if(!uid_cachep) panic("Cannot create uid taskcount SLAB cache\n"); + for(n = 0; n < UIDHASH_SZ; ++n) + INIT_LIST_HEAD(uidhash_table + n); + /* Insert the root user 
immediately - init already runs with this */ uid_hash_insert(&root_user, uidhashentry(0)); return 0; --- linux/mm/Makefile.orig +++ linux/mm/Makefile @@ -14,7 +14,7 @@ export-objs := shmem.o filemap.o memory. obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o rmap.o mempool.o + shmem.o rmap.o mempool.o vcache.o obj-$(CONFIG_HIGHMEM) += highmem.o --- linux/mm/memory.c.orig +++ linux/mm/memory.c @@ -46,6 +46,8 @@ #include #include #include +#include + #include #include @@ -586,7 +588,7 @@ void zap_page_range(struct vm_area_struc /* * Do a quick page-table lookup for a single page. */ -static struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) +struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) { pgd_t *pgd; pmd_t *pmd; @@ -1119,6 +1121,7 @@ static inline void establish_pte(struct static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, pte_t *page_table) { + invalidate_vcache(address, vma->vm_mm, new_page); flush_page_to_ram(new_page); flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); --- linux/mm/oom_kill.c.orig +++ linux/mm/oom_kill.c @@ -82,7 +82,7 @@ static int badness(struct task_struct *p * Niced processes are most likely less important, so double * their badness points. 
*/ - if (p->nice > 0) + if (task_nice(p) > 0) points *= 2; /* @@ -117,10 +117,10 @@ static int badness(struct task_struct *p static struct task_struct * select_bad_process(void) { int maxpoints = 0; - struct task_struct *p = NULL; + struct task_struct *g, *p; struct task_struct *chosen = NULL; - for_each_task(p) { + do_each_thread(g, p) if (p->pid) { int points = badness(p); if (points > maxpoints) { @@ -128,7 +128,7 @@ static struct task_struct * select_bad_p maxpoints = points; } } - } + while_each_thread(g, p); return chosen; } @@ -146,7 +146,7 @@ void oom_kill_task(struct task_struct *p * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->counter = 5 * HZ; + p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */ @@ -167,9 +167,9 @@ void oom_kill_task(struct task_struct *p */ static void oom_kill(void) { - struct task_struct *p, *q; + struct task_struct *g, *p, *q; extern wait_queue_head_t kswapd_done; - + read_lock(&tasklist_lock); p = select_bad_process(); @@ -177,11 +177,16 @@ static void oom_kill(void) if (p == NULL) panic("Out of memory and no killable processes...\n"); - /* kill all processes that share the ->mm (i.e. all threads) */ - for_each_task(q) { - if (q->mm == p->mm) + oom_kill_task(p); + /* + * kill all processes that share the ->mm (i.e. all threads), + * but are in a different thread group + */ + do_each_thread(g, q) + if (q->mm == p->mm && q->tgid != p->tgid) oom_kill_task(q); - } + while_each_thread(g, q); + read_unlock(&tasklist_lock); /* Chances are by this time our victim is sleeping on kswapd. */ --- linux/mm/vcache.c.orig +++ linux/mm/vcache.c @@ -0,0 +1,90 @@ +/* + * linux/mm/vcache.c + * + * virtual => physical page mapping cache. Users of this mechanism + * register callbacks for a given (virt,mm,phys) page mapping, and + * the kernel guarantees to call back when this mapping is invalidated. + * (ie. 
upon COW or unmap.) + * + * Started by Ingo Molnar, Copyright (C) 2002 + */ + +#include +#include +#include +#include + +#define VCACHE_HASHBITS 8 +#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS) + +spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED; + +static struct list_head hash[VCACHE_HASHSIZE]; + +static struct list_head *hash_vcache(unsigned long address, + struct mm_struct *mm) +{ + return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)]; +} + +void __attach_vcache(vcache_t *vcache, + unsigned long address, + struct mm_struct *mm, + void (*callback)(struct vcache_s *data, struct page *new)) +{ + struct list_head *hash_head; + + address &= PAGE_MASK; + vcache->address = address; + vcache->mm = mm; + vcache->callback = callback; + + hash_head = hash_vcache(address, mm); + + list_add_tail(&vcache->hash_entry, hash_head); +} + +void __detach_vcache(vcache_t *vcache) +{ + list_del_init(&vcache->hash_entry); +} + +void invalidate_vcache(unsigned long address, struct mm_struct *mm, + struct page *new_page) +{ + struct list_head *l, *hash_head; + vcache_t *vcache; + + address &= PAGE_MASK; + + hash_head = hash_vcache(address, mm); + /* + * This is safe, because this path is called with the pagetable + * lock held. So while other mm's might add new entries in + * parallel, *this* mm is locked out, so if the list is empty + * now then we do not have to take the vcache lock to see it's + * really empty. 
+ */ + if (likely(list_empty(hash_head))) + return; + + spin_lock(&vcache_lock); + list_for_each(l, hash_head) { + vcache = list_entry(l, vcache_t, hash_entry); + if (vcache->address != address || vcache->mm != mm) + continue; + vcache->callback(vcache, new_page); + } + spin_unlock(&vcache_lock); +} + +static int __init vcache_init(void) +{ + unsigned int i; + + for (i = 0; i < VCACHE_HASHSIZE; i++) + INIT_LIST_HEAD(hash + i); + return 0; +} +__initcall(vcache_init); + --- linux/mm/vmscan.c.orig +++ linux/mm/vmscan.c @@ -1247,8 +1247,8 @@ static int __init kswapd_init(void) { printk("Starting kswapd\n"); swap_setup(); - kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); - kernel_thread(kscand, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + kernel_thread(kswapd, NULL, CLONE_KERNEL); + kernel_thread(kscand, NULL, CLONE_KERNEL); return 0; } --- linux/mm/mmap.c.orig +++ linux/mm/mmap.c @@ -611,26 +611,37 @@ free_vma: * This function "knows" that -ENOMEM has the bits set. */ #ifndef HAVE_ARCH_UNMAPPED_AREA -static inline unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +static inline unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) { + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + int found_hole = 0; if (len > TASK_SIZE) return -ENOMEM; if (addr) { addr = PAGE_ALIGN(addr); - vma = find_vma(current->mm, addr); + vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) return addr; } - addr = PAGE_ALIGN(TASK_UNMAPPED_BASE); + addr = mm->free_area_cache; - for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) { + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (TASK_SIZE - len < addr) return -ENOMEM; + /* + * Record the first available hole. 
+ */ + if (!found_hole && (!vma || addr < vma->vm_start)) { + mm->free_area_cache = addr; + found_hole = 1; + } if (!vma || addr + len <= vma->vm_start) return addr; addr = vma->vm_end; @@ -793,6 +804,12 @@ static struct vm_area_struct * unmap_fix area->vm_mm->total_vm -= len >> PAGE_SHIFT; if (area->vm_flags & VM_LOCKED) area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + /* + * Is this a new hole at the lowest possible address? + */ + if (area->vm_start >= TASK_UNMAPPED_BASE && + area->vm_start < area->vm_mm->free_area_cache) + area->vm_mm->free_area_cache = area->vm_start; /* Unmapping the whole area. */ if (addr == area->vm_start && end == area->vm_end) { --- linux/include/linux/raid/md_compatible.h.orig +++ linux/include/linux/raid/md_compatible.h @@ -64,9 +64,9 @@ static inline int md_capable_admin(void) /* 009 */ static inline void md_flush_signals (void) { - spin_lock(&current->sigmask_lock); + spin_lock(&current->sighand->siglock); flush_signals(current); - spin_unlock(&current->sigmask_lock); + spin_unlock(&current->sighand->siglock); } /* 010 */ --- linux/include/linux/apm_bios.h.orig +++ linux/include/linux/apm_bios.h @@ -21,8 +21,7 @@ typedef unsigned short apm_eventinfo_t; #ifdef __KERNEL__ -#define APM_40 0x40 -#define APM_CS (APM_40 + 8) +#define APM_CS (GDT_ENTRY_APMBIOS_BASE * 8) #define APM_CS_16 (APM_CS + 8) #define APM_DS (APM_CS_16 + 8) --- linux/include/linux/binfmts.h.orig +++ linux/include/linux/binfmts.h @@ -1,7 +1,6 @@ #ifndef _LINUX_BINFMTS_H #define _LINUX_BINFMTS_H -#include #include /* @@ -60,7 +59,7 @@ extern int setup_arg_pages(struct linux_ extern int copy_strings(int argc,char ** argv,struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); extern void compute_creds(struct linux_binprm *binprm); -extern int do_coredump(long signr, struct pt_regs * regs); +extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); extern void set_binfmt(struct linux_binfmt *new); ---
linux/include/linux/completion.h.orig 2001-08-21 14:26:21.000000000 +0200 +++ linux/include/linux/completion.h @@ -29,6 +29,7 @@ static inline void init_completion(struc extern void FASTCALL(wait_for_completion(struct completion *)); extern void FASTCALL(complete(struct completion *)); +extern void FASTCALL(complete_all(struct completion *)); #define INIT_COMPLETION(x) ((x).done = 0) --- linux/include/linux/elf.h.orig +++ linux/include/linux/elf.h @@ -1,6 +1,7 @@ #ifndef _LINUX_ELF_H #define _LINUX_ELF_H +#include #include #include @@ -575,7 +576,8 @@ typedef struct elf64_shdr { #define NT_PRFPREG 2 #define NT_PRPSINFO 3 #define NT_TASKSTRUCT 4 -#define NT_PRFPXREG 20 +#define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ + /* Note header in a PT_NOTE section */ typedef struct elf32_note { @@ -607,5 +609,4 @@ extern Elf64_Dyn _DYNAMIC []; #endif - #endif /* _LINUX_ELF_H */ --- linux/include/linux/elfcore.h.orig 2000-12-31 20:11:13.000000000 +0100 +++ linux/include/linux/elfcore.h @@ -4,7 +4,6 @@ #include #include #include -#include #include struct elf_siginfo @@ -86,4 +85,45 @@ typedef struct elf_prpsinfo prpsinfo_t; #define PRARGSZ ELF_PRARGSZ #endif +#ifdef __KERNEL__ +static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs) +{ +#ifdef ELF_CORE_COPY_REGS + ELF_CORE_COPY_REGS((*elfregs), regs) +#else + BUG_ON(sizeof(*elfregs) != sizeof(*regs)); + *(struct pt_regs *)elfregs = *regs; +#endif +} + +static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) +{ +#ifdef ELF_CORE_COPY_TASK_REGS + + return ELF_CORE_COPY_TASK_REGS(t, elfregs); +#endif + return 0; +} + +extern int dump_fpu (struct pt_regs *, elf_fpregset_t *); + +static inline int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu) +{ +#ifdef ELF_CORE_COPY_FPREGS + return ELF_CORE_COPY_FPREGS(t, fpu); +#else + return dump_fpu(NULL, fpu); +#endif +} + +#ifdef ELF_CORE_COPY_XFPREGS +static inline int 
elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) +{ + return ELF_CORE_COPY_XFPREGS(t, xfpu); +} +#endif + +#endif /* __KERNEL__ */ + + #endif /* _LINUX_ELFCORE_H */ --- linux/include/linux/fs.h.orig +++ linux/include/linux/fs.h @@ -446,7 +446,7 @@ struct inode { atomic_t i_count; kdev_t i_dev; umode_t i_mode; - nlink_t i_nlink; + unsigned int i_nlink; uid_t i_uid; gid_t i_gid; kdev_t i_rdev; --- linux/include/linux/futex.h.orig +++ linux/include/linux/futex.h @@ -0,0 +1,11 @@ +#ifndef _LINUX_FUTEX_H +#define _LINUX_FUTEX_H + +/* Second argument to futex syscall */ +#define FUTEX_WAIT (0) +#define FUTEX_WAKE (1) +#define FUTEX_FD (2) + +extern asmlinkage int sys_futex(unsigned long uaddr, int op, int val, struct timespec *utime); + +#endif --- linux/include/linux/hash.h.orig +++ linux/include/linux/hash.h @@ -0,0 +1,58 @@ +#ifndef _LINUX_HASH_H +#define _LINUX_HASH_H +/* Fast hashing routine for a long. + (C) 2002 William Lee Irwin III, IBM */ + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ +#if BITS_PER_LONG == 32 +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e370001UL +#elif BITS_PER_LONG == 64 +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL +#else +#error Define GOLDEN_RATIO_PRIME for your wordsize. +#endif + +static inline unsigned long hash_long(unsigned long val, unsigned int bits) +{ + unsigned long hash = val; + +#if BITS_PER_LONG == 64 + /* Sigh, gcc can't optimise this alone like it does for 32 bits. 
*/ + unsigned long n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; +#else + /* On some cpus multiply is faster, on others gcc will do shifts */ + hash *= GOLDEN_RATIO_PRIME; +#endif + + /* High bits are more random, so use them. */ + return hash >> (BITS_PER_LONG - bits); +} + +static inline unsigned long hash_ptr(void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr, bits); +} +#endif /* _LINUX_HASH_H */ --- linux/include/linux/intermezzo_psdev.h.orig +++ linux/include/linux/intermezzo_psdev.h @@ -31,8 +31,8 @@ struct upc_channel { }; #define ISLENTO(minor) (current->pid == izo_channels[minor].uc_pid \ - || current->p_pptr->pid == izo_channels[minor].uc_pid \ - || current->p_pptr->p_pptr->pid == izo_channels[minor].uc_pid) + || current->parent->pid == izo_channels[minor].uc_pid \ + || current->parent->parent->pid == izo_channels[minor].uc_pid) extern struct upc_channel izo_channels[MAX_CHANNEL]; --- linux/include/linux/kernel.h.orig +++ linux/include/linux/kernel.h @@ -176,6 +176,18 @@ extern int lookup_symbol(unsigned long a #define max_t(type,x,y) \ ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; }) +/** + * container_of - cast a member of a structure out to the containing structure + * + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. 
+ * + */ +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + extern void __out_of_line_bug(int line) ATTRIB_NORET; #define out_of_line_bug() __out_of_line_bug(__LINE__) --- linux/include/linux/kernel_stat.h.orig +++ linux/include/linux/kernel_stat.h @@ -31,7 +31,6 @@ struct kernel_stat { #elif !defined(CONFIG_ARCH_S390) unsigned int irqs[NR_CPUS][NR_IRQS]; #endif - unsigned int context_swtch; }; extern struct kernel_stat kstat; --- linux/include/linux/list.h.orig +++ linux/include/linux/list.h @@ -195,6 +195,20 @@ static inline void list_splice_init(stru #define list_for_each(pos, head) \ for (pos = (head)->next, prefetch(pos->next); pos != (head); \ pos = pos->next, prefetch(pos->next)) + +/** + * __list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + * + * This variant differs from list_for_each() in that it's the + * simplest possible list iteration code, no prefetching is done. + * Use this for code that knows the list to be very short (empty + * or 1 entry) most of the time. + */ +#define __list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop counter. 
--- linux/include/linux/mm.h.orig +++ linux/include/linux/mm.h @@ -579,13 +579,8 @@ extern pte_t *FASTCALL(pte_alloc_map(str extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len); -extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len); -extern int ptrace_attach(struct task_struct *tsk); -extern int ptrace_detach(struct task_struct *, unsigned int); -extern void ptrace_disable(struct task_struct *); -extern int ptrace_check_attach(struct task_struct *task, int kill); +extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); --- linux/include/linux/pid.h.orig +++ linux/include/linux/pid.h @@ -0,0 +1,64 @@ +#ifndef _LINUX_PID_H +#define _LINUX_PID_H + +enum pid_type +{ + PIDTYPE_PID, + PIDTYPE_TGID, + PIDTYPE_PGID, + PIDTYPE_SID, + PIDTYPE_MAX +}; + +struct pid +{ + int nr; + atomic_t count; + struct task_struct *task; + struct list_head task_list; + struct list_head hash_chain; +}; + +struct pid_link +{ + struct list_head pid_chain; + struct pid *pidptr; + struct pid pid; +}; + +#define pid_task(elem, type) \ + list_entry(elem, struct task_struct, pids[type].pid_chain) + +/* + * attach_pid() and link_pid() must be called with the tasklist_lock + * write-held. 
+ */ +extern int FASTCALL(attach_pid(struct task_struct *task, enum pid_type type, int nr)); + +extern void FASTCALL(link_pid(struct task_struct *task, struct pid_link *link, struct pid *pid)); + +/* + * detach_pid() must be called with the tasklist_lock write-held. + */ +extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type)); + +/* + * look up a PID in the hash table. Must be called with the tasklist_lock + * held. + */ +extern struct pid *FASTCALL(find_pid(enum pid_type, int)); + +extern int alloc_pidmap(void); +extern void FASTCALL(free_pidmap(int)); +extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread); + +#define for_each_task_pid(who, type, task, elem, pid) \ + if ((pid = find_pid(type, who))) \ + for (elem = pid->task_list.next, \ + prefetch(elem->next), \ + task = pid_task(elem, type); \ + elem != &pid->task_list; \ + elem = elem->next, prefetch(elem->next), \ + task = pid_task(elem, type)) + +#endif /* _LINUX_PID_H */ --- linux/include/linux/prctl.h.orig +++ linux/include/linux/prctl.h @@ -26,4 +26,12 @@ # define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */ # define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */ +/* Get/set floating-point exception mode (if meaningful) */ +#define PR_GET_FPEXC 11 +#define PR_SET_FPEXC 12 +# define PR_FP_EXC_DISABLED 0 /* FP exceptions disabled */ +# define PR_FP_EXC_NONRECOV 1 /* async non-recoverable exc. mode */ +# define PR_FP_EXC_ASYNC 2 /* async recoverable exception mode */ +# define PR_FP_EXC_PRECISE 3 /* precise exception mode */ + #endif /* _LINUX_PRCTL_H */ --- linux/include/linux/ptrace.h.orig 2000-10-31 00:29:23.000000000 +0100 +++ linux/include/linux/ptrace.h @@ -3,6 +3,8 @@ /* ptrace.h */ /* structs and defines to help the user use the ptrace system call. */ +#include + /* has the defines to get at the registers. 
*/ #define PTRACE_TRACEME 0 @@ -21,6 +23,54 @@ #define PTRACE_SYSCALL 24 +/* 0x4200-0x4300 are reserved for architecture-independent additions. */ +#define PTRACE_SETOPTIONS 0x4200 +#define PTRACE_GETEVENTMSG 0x4201 +#define PTRACE_GETSIGINFO 0x4202 +#define PTRACE_SETSIGINFO 0x4203 + +/* options set using PTRACE_SETOPTIONS */ +#define PTRACE_O_TRACESYSGOOD 0x00000001 +#define PTRACE_O_TRACEFORK 0x00000002 +#define PTRACE_O_TRACEVFORK 0x00000004 +#define PTRACE_O_TRACECLONE 0x00000008 +#define PTRACE_O_TRACEEXEC 0x00000010 +#define PTRACE_O_TRACEVFORKDONE 0x00000020 +#define PTRACE_O_TRACEEXIT 0x00000040 + +/* Wait extended result codes for the above trace options. */ +#define PTRACE_EVENT_FORK 1 +#define PTRACE_EVENT_VFORK 2 +#define PTRACE_EVENT_CLONE 3 +#define PTRACE_EVENT_EXEC 4 +#define PTRACE_EVENT_VFORK_DONE 5 +#define PTRACE_EVENT_EXIT 6 + #include +#include + +extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len); +extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len); +extern int ptrace_attach(struct task_struct *tsk); +extern int ptrace_detach(struct task_struct *, unsigned int); +extern void ptrace_disable(struct task_struct *); +extern int ptrace_check_attach(struct task_struct *task, int kill); +extern int ptrace_request(struct task_struct *child, long request, long addr, long data); +extern void ptrace_notify(int exit_code); +extern void __ptrace_link(struct task_struct *child, + struct task_struct *new_parent); +extern void __ptrace_unlink(struct task_struct *child); + +static inline void ptrace_link(struct task_struct *child, + struct task_struct *new_parent) +{ + if (unlikely(child->ptrace)) + __ptrace_link(child, new_parent); +} +static inline void ptrace_unlink(struct task_struct *child) +{ + if (unlikely(child->ptrace)) + __ptrace_unlink(child); +} #endif --- linux/include/linux/sched.h.orig +++ linux/include/linux/sched.h @@ -6,6 +6,7 @@ extern unsigned long event; 
#include +#include #include #include #include @@ -27,8 +28,10 @@ extern unsigned long event; #include #include #include +#include struct exec_domain; +extern int panic_timeout; /* * cloning flags: @@ -38,14 +41,25 @@ struct exec_domain; #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ -#define CLONE_PID 0x00001000 /* set if pid shared */ +#define CLONE_IDLETASK 0x00001000 /* set if new pid should be 0 (kernel only)*/ #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ #define CLONE_THREAD 0x00010000 /* Same thread group? */ #define CLONE_NEWNS 0x00020000 /* New namespace group? */ +#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ +#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ +#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ +#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ +#define CLONE_DETACHED 0x00400000 /* parent wants no child-exit signal */ +#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ +#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) +/* + * List of flags we want to share for kernel threads, + * if only because they are not used by them anyway. 
+ */ +#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) /* * These are the constant used to fake the fixed-point load-average @@ -74,10 +88,12 @@ extern unsigned long avenrun[]; /* Load #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_threads; +extern int nr_threads; extern int last_pid; +extern unsigned long nr_running(void); +extern unsigned long nr_uninterruptible(void); -#include +//#include #include #include #include @@ -90,8 +106,9 @@ extern int last_pid; #define TASK_RUNNING 0 #define TASK_INTERRUPTIBLE 1 #define TASK_UNINTERRUPTIBLE 2 -#define TASK_ZOMBIE 4 -#define TASK_STOPPED 8 +#define TASK_STOPPED 4 +#define TASK_ZOMBIE 8 +#define TASK_DEAD 16 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -116,16 +133,10 @@ extern int last_pid; /* * Scheduling policies */ -#define SCHED_OTHER 0 +#define SCHED_NORMAL 0 #define SCHED_FIFO 1 #define SCHED_RR 2 -/* - * This is an additional bit set when we want to - * yield the CPU for one re-schedule.. - */ -#define SCHED_YIELD 0x10 - struct sched_param { int sched_priority; }; @@ -143,18 +154,27 @@ struct completion; * a separate lock). 
*/ extern rwlock_t tasklist_lock; -extern spinlock_t runqueue_lock; extern spinlock_t mmlist_lock; +typedef struct task_struct task_t; + +extern void __put_task_struct(struct task_struct *tsk); +#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#define put_task_struct(tsk) \ +do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) + extern void sched_init(void); -extern void init_idle(void); +extern void init_idle(task_t *idle, int cpu); extern void show_state(void); extern void show_stack(unsigned long * esp); extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); -extern void update_one_process(struct task_struct *p, unsigned long user, +extern void update_one_process(task_t *p, unsigned long user, unsigned long system, int cpu); +extern void scheduler_tick(int user_tick, int system); +extern int migration_init(void); +extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); @@ -166,6 +186,24 @@ extern int start_context_thread(void); extern int current_is_keventd(void); /* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are + * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values + * are inverted: lower p->prio value means higher priority. + * + * The MAX_USER_RT_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 40) + +/* * The default fd array needs to be at least BITS_PER_LONG, * as this is the granularity returned by copy_fdset().
*/ @@ -213,6 +251,7 @@ struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ rb_root_t mm_rb; struct vm_area_struct * mmap_cache; /* last find_vma result */ + unsigned long free_area_cache; /* first hole */ pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ @@ -248,6 +287,10 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; + + /* coredumping support */ + int core_waiters; + struct completion *core_startup_done, core_done; }; extern int mmlist_nr; @@ -264,17 +307,51 @@ extern int mmlist_nr; rlimit_rss: RLIM_INFINITY, \ } -struct signal_struct { +extern void show_stack(unsigned long *esp); + +extern int __broadcast_thread_group(struct task_struct *p, int sig); + +struct sighand_struct { atomic_t count; struct k_sigaction action[_NSIG]; spinlock_t siglock; }; +/* + * NOTE! "signal_struct" does not have its own + * locking, because a shared signal_struct always + * implies a shared sighand_struct, so locking + * sighand_struct is always a proper superset of + * the locking of signal_struct.
+ */ +struct signal_struct { + atomic_t count; + + /* current thread group signal load-balancing target: */ + task_t *curr_target; + + /* shared signal handling: */ + struct sigpending shared_pending; + + /* thread group exit support */ + int group_exit; + int group_exit_code; + struct task_struct *group_exit_task; + + /* thread group stop support, overloads group_exit_code too */ + int group_stop_count; +}; + -#define INIT_SIGNALS { \ - count: ATOMIC_INIT(1), \ - action: { {{0,}}, }, \ - siglock: SPIN_LOCK_UNLOCKED \ +#define INIT_SIGNALS(sig) { \ + .count = ATOMIC_INIT(1), \ + .shared_pending = { NULL, &sig.shared_pending.head, {{0}}}, \ +} + +#define INIT_SIGHAND(sighand) { \ + .count = ATOMIC_INIT(1), \ + .action = { {{0,}}, }, \ + .siglock = SPIN_LOCK_UNLOCKED, \ } /* @@ -286,7 +363,7 @@ struct user_struct { atomic_t files; /* How many open files does this user have? */ /* Hash table maintenance information */ - struct user_struct *next, **pprev; + struct list_head uidhash_list; uid_t uid; }; @@ -295,9 +372,13 @@ struct user_struct { atomic_inc(&__user->__count); \ __user; }) +extern struct user_struct *find_user(uid_t); + extern struct user_struct root_user; #define INIT_USER (&root_user) +typedef struct prio_array prio_array_t; + struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care @@ -315,34 +396,28 @@ struct task_struct { int lock_depth; /* Lock depth */ -/* - * offset 32 begins here on 32-bit platforms. We keep - * all fields in a single cacheline that are needed for - * the goodness() loop in schedule(). - */ - long counter; - long nice; - unsigned long policy; - struct mm_struct *mm; - int processor; /* - * cpus_runnable is ~0 if the process is not running on any - * CPU. It's (1 << cpu) if it's running on a CPU. This mask - * is updated under the runqueue lock. - * - * To determine whether a process might run on a CPU, this - * mask is AND-ed with cpus_allowed. 
- */ - unsigned long cpus_runnable, cpus_allowed; - /* - * (only the 'next' pointer fits into the cacheline, but - * that's just fine.) + * offset 32 begins here on 32-bit platforms. */ + unsigned int cpu; + int prio, static_prio; struct list_head run_list; - unsigned long sleep_time; + prio_array_t *array; + + unsigned long sleep_avg; + unsigned long last_run; + + unsigned long policy; + unsigned long cpus_allowed; + unsigned int time_slice, first_time_slice; - struct task_struct *next_task, *prev_task; - struct mm_struct *active_mm; + atomic_t usage; + + struct list_head tasks; + struct list_head ptrace_children; + struct list_head ptrace_list; + + struct mm_struct *mm, *active_mm; /* task state */ struct linux_binfmt *binfmt; @@ -361,22 +436,28 @@ struct task_struct { /* * pointers to (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with - * p->p_pptr->pid) + * p->parent->pid) */ - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; - struct list_head thread_group; + struct task_struct *real_parent; /* real parent process (when being debugged) */ + struct task_struct *parent; /* parent process */ + struct list_head children; /* list of my children */ + struct list_head sibling; /* linkage in my parent's children list */ + struct task_struct *group_leader; - /* PID hash table linkage. */ - struct task_struct *pidhash_next; - struct task_struct **pidhash_pprev; + /* PID/PID hash table linkage. 
*/ + struct pid_link pids[PIDTYPE_MAX]; wait_queue_head_t wait_chldexit; /* for wait4() */ struct completion *vfork_done; /* for vfork() */ + int *set_child_tid; /* CLONE_CHILD_SETTID */ + int *clear_child_tid; /* CLONE_CHILD_CLEARTID */ + unsigned long rt_priority; unsigned long it_real_value, it_prof_value, it_virt_value; unsigned long it_real_incr, it_prof_incr, it_virt_incr; struct timer_list real_timer; struct tms times; + struct tms group_times; unsigned long start_time; long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ @@ -410,10 +491,10 @@ struct task_struct { /* namespace */ struct namespace *namespace; /* signal handlers */ - spinlock_t sigmask_lock; /* Protects signal and blocked */ - struct signal_struct *sig; + struct signal_struct *signal; + struct sighand_struct *sighand; - sigset_t blocked; + sigset_t blocked, real_blocked; struct sigpending pending; unsigned long sas_ss_sp; @@ -432,9 +513,14 @@ struct task_struct { u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; +/* context-switch lock */ + spinlock_t switch_lock; /* journalling filesystem info */ void *journal_info; + + unsigned long ptrace_message; + siginfo_t *last_siginfo; /* For ptrace use. 
*/ }; /* @@ -461,9 +547,15 @@ struct task_struct { #define PT_PTRACED 0x00000001 #define PT_TRACESYS 0x00000002 -#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */ -#define PT_TRACESYSGOOD 0x00000008 -#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */ +#define PT_TRACESYSGOOD 0x00000004 +#define PT_PTRACE_CAP 0x00000008 /* ptracer can follow suid-exec */ +#define PT_TRACE_FORK 0x00000010 +#define PT_TRACE_VFORK 0x00000020 +#define PT_TRACE_CLONE 0x00000040 +#define PT_TRACE_EXEC 0x00000080 +#define PT_TRACE_VFORK_DONE 0x00000100 +#define PT_TRACE_EXIT 0x00000200 +#define PT_DTRACE 0x00000400 /* delayed trace (used on m68k, i386) */ /* * Limit the stack by to some sane default: root can always @@ -471,9 +563,57 @@ struct task_struct { */ #define _STK_LIM (8*1024*1024) -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ -#define MAX_COUNTER (20*HZ/100) -#define DEF_NICE (0) +#if CONFIG_SMP +extern void set_cpus_allowed(task_t *p, unsigned long new_mask); +#else +#define set_cpus_allowed(p, new_mask) do { } while (0) +#endif + +extern void set_user_nice(task_t *p, long nice); +extern int task_prio(task_t *p); +extern int task_nice(task_t *p); +extern int task_curr(task_t *p); + +/* Reevaluate whether the task has signals pending delivery. + * This is required every time the blocked sigset_t changes. + * callers must hold sig->siglock. */ + +extern FASTCALL(void recalc_sigpending_tsk(struct task_struct *t)); +extern void recalc_sigpending(void); +extern void signal_wake_up(struct task_struct *t, int resume_stopped); +extern void print_signals(struct task_struct *t); + + +/* + * * Wrappers for p->cpu access. No-op on UP. 
+ * */ +#ifdef CONFIG_SMP + +#define cpu_online(cpu) ((cpu) < smp_num_cpus) + +static inline unsigned int task_cpu(struct task_struct *p) +{ + return p->cpu; +} + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + p->cpu = cpu; +} + +#else + +static inline unsigned int task_cpu(struct task_struct *p) +{ + return 0; +} + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} + +#endif /* CONFIG_SMP */ + extern void yield(void); @@ -494,19 +634,22 @@ extern struct exec_domain default_exec_d addr_limit: KERNEL_DS, \ exec_domain: &default_exec_domain, \ lock_depth: -1, \ - counter: DEF_COUNTER, \ - nice: DEF_NICE, \ - policy: SCHED_OTHER, \ + prio: MAX_PRIO-20, \ + static_prio: MAX_PRIO-20, \ + policy: SCHED_NORMAL, \ + cpus_allowed: -1, \ mm: NULL, \ active_mm: &init_mm, \ - cpus_runnable: ~0UL, \ - cpus_allowed: ~0UL, \ run_list: LIST_HEAD_INIT(tsk.run_list), \ - next_task: &tsk, \ - prev_task: &tsk, \ - p_opptr: &tsk, \ - p_pptr: &tsk, \ - thread_group: LIST_HEAD_INIT(tsk.thread_group), \ + time_slice: HZ, \ + tasks: LIST_HEAD_INIT(tsk.tasks), \ + ptrace_children: LIST_HEAD_INIT(tsk.ptrace_children), \ + ptrace_list: LIST_HEAD_INIT(tsk.ptrace_list), \ + real_parent: &tsk, \ + parent: &tsk, \ + children: LIST_HEAD_INIT(tsk.children), \ + sibling: LIST_HEAD_INIT(tsk.sibling), \ + group_leader: &tsk, \ wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ real_timer: { \ function: it_real_fn \ @@ -521,11 +664,12 @@ extern struct exec_domain default_exec_d thread: INIT_THREAD, \ fs: &init_fs, \ files: &init_files, \ - sigmask_lock: SPIN_LOCK_UNLOCKED, \ - sig: &init_signals, \ + signal: &init_signals, \ + sighand: &init_sighand, \ pending: { NULL, &tsk.pending.head, {{0}}}, \ blocked: {{0}}, \ alloc_lock: SPIN_LOCK_UNLOCKED, \ + switch_lock: SPIN_LOCK_UNLOCKED, \ journal_info: NULL, \ } @@ -535,60 +679,15 @@ extern struct exec_domain default_exec_d #endif union task_union { - struct task_struct task; + 
task_t task; unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; }; extern union task_union init_task_union; extern struct mm_struct init_mm; -extern struct task_struct *init_tasks[NR_CPUS]; - -/* PID hashing. (shouldnt this be dynamic?) */ -#define PIDHASH_SZ (4096 >> 2) -extern struct task_struct *pidhash[PIDHASH_SZ]; - -#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) - -static inline void hash_pid(struct task_struct *p) -{ - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; - - if((p->pidhash_next = *htable) != NULL) - (*htable)->pidhash_pprev = &p->pidhash_next; - *htable = p; - p->pidhash_pprev = htable; -} -static inline void unhash_pid(struct task_struct *p) -{ - if(p->pidhash_next) - p->pidhash_next->pidhash_pprev = p->pidhash_pprev; - *p->pidhash_pprev = p->pidhash_next; -} - -static inline struct task_struct *find_task_by_pid(int pid) -{ - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; - - for(p = *htable; p && p->pid != pid; p = p->pidhash_next) - ; - - return p; -} - -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) - -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) -{ - tsk->processor = cpu; - tsk->cpus_runnable = 1UL << cpu; -} - -static inline void task_release_cpu(struct task_struct *tsk) -{ - tsk->cpus_runnable = ~0UL; -} +extern struct task_struct *find_task_by_pid(int pid); /* per-UID process charging. 
*/ extern struct user_struct * alloc_uid(uid_t); @@ -604,6 +703,7 @@ extern void do_timer(struct pt_regs *); #define CURRENT_TIME (xtime.tv_sec) +extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr)); extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); extern void FASTCALL(sleep_on(wait_queue_head_t *q)); @@ -612,90 +712,57 @@ extern long FASTCALL(sleep_on_timeout(wa extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout)); -extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process(task_t * p)); +extern void FASTCALL(wake_up_forked_process(task_t * p)); +extern void FASTCALL(sched_exit(task_t * p)); #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) #define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) -#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) -#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) #define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) #define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) #define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) -#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) -#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) +#ifdef CONFIG_SMP +#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) +#else +#define wake_up_interruptible_sync(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) +#endif + asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, 
struct rusage * ru); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); extern void proc_caches_init(void); -extern void flush_signals(struct task_struct *); -extern void flush_signal_handlers(struct task_struct *); -extern void sig_exit(int, int, struct siginfo *); +extern void flush_signals(task_t *); +extern void flush_signal_handlers(task_t *); extern int dequeue_signal(sigset_t *, siginfo_t *); extern void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask); extern void unblock_all_signals(void); -extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int send_sig_info(int, struct siginfo *, task_t *); +extern int force_sig_info(int, struct siginfo *, task_t *); +extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp); extern int kill_pg_info(int, struct siginfo *, pid_t); extern int kill_sl_info(int, struct siginfo *, pid_t); extern int kill_proc_info(int, struct siginfo *, pid_t); -extern void notify_parent(struct task_struct *, int); -extern void do_notify_parent(struct task_struct *, int); -extern void force_sig(int, struct task_struct *); -extern int send_sig(int, struct task_struct *, int); +extern void notify_parent(task_t *, int); +extern void do_notify_parent(task_t *, int); +extern void force_sig(int, task_t *); +extern void force_sig_specific(int, struct task_struct *); +extern int send_sig(int, task_t *, int); +extern void zap_other_threads(struct task_struct *p); extern int kill_pg(pid_t, int, int); extern int kill_sl(pid_t, int, int); extern int kill_proc(pid_t, int, int); extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); -static inline int signal_pending(struct task_struct *p) +static inline int signal_pending(task_t *p) { return (p->sigpending != 0); } -/* - * Re-calculate pending state from the set of 
locally pending - * signals, globally pending signals, and blocked signals. - */ -static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) -{ - unsigned long ready; - long i; - - switch (_NSIG_WORDS) { - default: - for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) - ready |= signal->sig[i] &~ blocked->sig[i]; - break; - - case 4: ready = signal->sig[3] &~ blocked->sig[3]; - ready |= signal->sig[2] &~ blocked->sig[2]; - ready |= signal->sig[1] &~ blocked->sig[1]; - ready |= signal->sig[0] &~ blocked->sig[0]; - break; - - case 2: ready = signal->sig[1] &~ blocked->sig[1]; - ready |= signal->sig[0] &~ blocked->sig[0]; - break; - - case 1: ready = signal->sig[0] &~ blocked->sig[0]; - } - return ready != 0; -} - -/* Reevaluate whether the task has signals pending delivery. - This is required every time the blocked sigset_t changes. - All callers should have t->sigmask_lock. */ - -static inline void recalc_sigpending(struct task_struct *t) -{ - t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); -} - /* True if we are on the alternate signal stack. 
*/ static inline int on_sig_stack(unsigned long sp) @@ -798,24 +865,43 @@ extern fd_set *alloc_fdset(int); extern int expand_fdset(struct files_struct *, int nr); extern void free_fdset(fd_set *, int); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); -extern void exit_mm(struct task_struct *); -extern void exit_files(struct task_struct *); +extern void exit_mm(task_t *); +extern void exit_files(task_t *); +extern void exit_signal(struct task_struct *); +extern void __exit_signal(struct task_struct *); extern void exit_sighand(struct task_struct *); +extern void __exit_sighand(struct task_struct *); + +extern NORET_TYPE void do_group_exit(int); extern void reparent_to_init(void); extern void daemonize(void); +extern void __set_special_pids(pid_t session, pid_t pgrp); +extern void set_special_pids(pid_t session, pid_t pgrp); +extern task_t *child_reaper; extern int do_execve(char *, char **, char **, struct pt_regs *); -extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); +extern struct task_struct *do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int *, int *); +extern void reap_thread(task_t *p); +extern void release_task(struct task_struct * p); + extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); +#ifdef CONFIG_SMP +extern void wait_task_inactive(task_t * p); +extern void kick_if_running(task_t * p); +#else +#define wait_task_inactive(p) do {} while (0) +#define kick_if_running(p) do {} while (0) +#endif + #define __wait_event(wq, condition) \ do { \ wait_queue_t __wait; \ @@ -868,75 
+954,120 @@ do { \ __ret; \ }) -#define REMOVE_LINKS(p) do { \ - (p)->next_task->prev_task = (p)->prev_task; \ - (p)->prev_task->next_task = (p)->next_task; \ - if ((p)->p_osptr) \ - (p)->p_osptr->p_ysptr = (p)->p_ysptr; \ - if ((p)->p_ysptr) \ - (p)->p_ysptr->p_osptr = (p)->p_osptr; \ - else \ - (p)->p_pptr->p_cptr = (p)->p_osptr; \ +#define remove_parent(p) list_del_init(&(p)->sibling) +#define add_parent(p, parent) list_add_tail(&(p)->sibling,&(parent)->children) + +#if 0 + +#define REMOVE_LINKS(p) do { \ + if (thread_group_leader(p)) \ + list_del_init(&(p)->tasks); \ + remove_parent(p); \ } while (0) -#define SET_LINKS(p) do { \ - (p)->next_task = &init_task; \ - (p)->prev_task = init_task.prev_task; \ - init_task.prev_task->next_task = (p); \ - init_task.prev_task = (p); \ - (p)->p_ysptr = NULL; \ - if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \ - (p)->p_osptr->p_ysptr = p; \ - (p)->p_pptr->p_cptr = p; \ +#define SET_LINKS(p) do { \ + if (thread_group_leader(p)) \ + list_add_tail(&(p)->tasks,&init_task.tasks); \ + add_parent(p, (p)->parent); \ } while (0) -#define for_each_task(p) \ - for (p = &init_task ; (p = p->next_task) != &init_task ; ) +#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks) +#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks) + +#define __for_each_process(p) \ + for ( ; (p = next_task(p)) != &init_task ; ) -#define for_each_thread(task) \ - for (task = next_thread(current) ; task != current ; task = next_thread(task)) +#define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) -#define next_thread(p) \ - list_entry((p)->thread_group.next, struct task_struct, thread_group) +/* + * Careful: do_each_thread/while_each_thread is a double loop so + * 'break' will not work as expected - use goto instead. 
+ */ +#define do_each_thread(g, t) \ + for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do + +#define while_each_thread(g, t) \ + while ((t = next_thread(t)) != g) +#else + +extern void check_tasklist_locked(void); + +#define REMOVE_LINKS(p) do { \ + check_tasklist_locked(); \ + BUG_ON(list_empty(&(p)->tasks)); \ + list_del_init(&(p)->tasks); \ + remove_parent(p); \ + } while (0) + +#define SET_LINKS(p) do { \ + check_tasklist_locked(); \ + list_add_tail(&(p)->tasks,&init_task.tasks); \ + add_parent(p, (p)->parent); \ + } while (0) + +#define next_task(p) ({ check_tasklist_locked(); list_entry((p)->tasks.next, struct task_struct, tasks); }) +#define prev_task(p) ({ check_tasklist_locked(); list_entry((p)->tasks.prev, struct task_struct, tasks); }) + +#define __for_each_process(p) \ + for ( ; (p = next_task(p)) != &init_task ; ) + +#define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +#define do_each_thread(g, t) \ + for (t = &init_task ; (t = next_task(t)) != &init_task ; ) + +#define while_each_thread(g, t) + +#endif + +extern task_t * FASTCALL(next_thread(task_t *p)); #define thread_group_leader(p) (p->pid == p->tgid) -static inline void del_from_runqueue(struct task_struct * p) +static inline int thread_group_empty(task_t *p) { - nr_running--; - p->sleep_time = jiffies; - list_del(&p->run_list); - p->run_list.next = NULL; -} + struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; -static inline int task_on_runqueue(struct task_struct *p) -{ - return (p->run_list.next != NULL); + return pid->task_list.next->next == &pid->task_list; } -static inline void unhash_process(struct task_struct *p) -{ - if (task_on_runqueue(p)) - out_of_line_bug(); - write_lock_irq(&tasklist_lock); - nr_threads--; - unhash_pid(p); - REMOVE_LINKS(p); - list_del(&p->thread_group); - write_unlock_irq(&tasklist_lock); -} +#define delay_group_leader(p) \ + (thread_group_leader(p) && !thread_group_empty(p)) + +extern void 
unhash_process(struct task_struct *p); /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ -static inline void task_lock(struct task_struct *p) +static inline void task_lock(task_t *p) { spin_lock(&p->alloc_lock); } -static inline void task_unlock(struct task_struct *p) +static inline void task_unlock(task_t *p) { spin_unlock(&p->alloc_lock); } +/** + * get_task_mm - acquire a reference to the task's mm + * + * Returns %NULL if the task has no mm. User must release + * the mm via mmput() after use. + */ +static inline struct mm_struct * get_task_mm(struct task_struct * task) +{ + struct mm_struct * mm; + + task_lock(task); + mm = task->mm; + if (mm) + atomic_inc(&mm->mm_users); + task_unlock(task); + + return mm; +} + /* write full pathname into buffer and return start of pathname */ static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, char *buf, int buflen) @@ -956,6 +1087,26 @@ static inline char * d_path(struct dentr return res; } +static inline void set_need_resched(void) +{ + current->need_resched = 1; +} + +static inline void clear_need_resched(void) +{ + current->need_resched = 0; +} + +static inline void set_tsk_need_resched(task_t *tsk) +{ + tsk->need_resched = 1; +} + +static inline void clear_tsk_need_resched(task_t *tsk) +{ + tsk->need_resched = 0; +} + static inline int need_resched(void) { return (unlikely(current->need_resched)); @@ -969,4 +1120,5 @@ static inline void cond_resched(void) } #endif /* __KERNEL__ */ + #endif --- linux/include/linux/signal.h.orig 2000-12-31 20:10:17.000000000 +0100 +++ linux/include/linux/signal.h @@ -220,6 +220,11 @@ static inline void init_sigpending(struc extern long do_sigpending(void *, unsigned long); +#ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER +struct pt_regs; +extern int get_signal_to_deliver(siginfo_t *info, struct pt_regs *regs); +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNAL_H */ --- linux/include/linux/slab.h.orig +++ 
linux/include/linux/slab.h @@ -73,7 +73,8 @@ extern kmem_cache_t *filp_cachep; extern kmem_cache_t *dquot_cachep; extern kmem_cache_t *bh_cachep; extern kmem_cache_t *fs_cachep; -extern kmem_cache_t *sigact_cachep; +extern kmem_cache_t *signal_cachep; +extern kmem_cache_t *sighand_cachep; #endif /* __KERNEL__ */ --- linux/include/linux/sys.h.orig 1995-12-11 05:56:37.000000000 +0100 +++ linux/include/linux/sys.h @@ -4,7 +4,7 @@ /* * system call entry points ... but not all are defined */ -#define NR_syscalls 256 +#define NR_syscalls 260 /* * These are system calls that will be removed at some time --- linux/include/linux/sysctl.h.orig +++ linux/include/linux/sysctl.h @@ -125,6 +125,7 @@ enum KERN_TAINTED=53, /* int: various kernel tainted flags */ KERN_CADPID=54, /* int: PID of the process to notify on CAD */ KERN_CORE_PATTERN=56, /* string: pattern for core-files */ + KERN_PID_MAX=55, /* int: max PID value of processes */ }; --- linux/include/linux/threads.h.orig +++ linux/include/linux/threads.h @@ -17,8 +17,13 @@ #define MIN_THREADS_LEFT_FOR_ROOT 4 /* - * This controls the maximum pid allocated to a process + * This controls the default maximum pid allocated to a process */ -#define PID_MAX 0x8000 +#define PID_MAX_DEFAULT 0x8000 + +/* + * A maximum of 4 million PIDs should be enough for a while: + */ +#define PID_MAX_LIMIT (4*1024*1024) #endif --- linux/include/linux/vcache.h.orig +++ linux/include/linux/vcache.h @@ -0,0 +1,26 @@ +/* + * virtual => physical mapping cache support. 
+ */ +#ifndef _LINUX_VCACHE_H +#define _LINUX_VCACHE_H + +typedef struct vcache_s { + unsigned long address; + struct mm_struct *mm; + struct list_head hash_entry; + void (*callback)(struct vcache_s *data, struct page *new_page); +} vcache_t; + +extern spinlock_t vcache_lock; + +extern void __attach_vcache(vcache_t *vcache, + unsigned long address, + struct mm_struct *mm, + void (*callback)(struct vcache_s *data, struct page *new_page)); + +extern void __detach_vcache(vcache_t *vcache); + +extern void invalidate_vcache(unsigned long address, struct mm_struct *mm, + struct page *new_page); + +#endif --- linux/include/linux/wait.h.orig +++ linux/include/linux/wait.h @@ -60,6 +60,7 @@ typedef struct __wait_queue wait_queue_t # define wq_write_lock_irqsave write_lock_irqsave # define wq_write_unlock_irq write_unlock_irq # define wq_write_unlock_irqrestore write_unlock_irqrestore +# define wq_write_unlock_irq write_unlock_irq # define wq_write_unlock write_unlock #else # define wq_lock_t spinlock_t @@ -73,6 +74,7 @@ typedef struct __wait_queue wait_queue_t # define wq_write_lock_irqsave spin_lock_irqsave # define wq_write_unlock_irq spin_unlock_irq # define wq_write_unlock_irqrestore spin_unlock_irqrestore +# define wq_write_unlock_irq spin_unlock_irq # define wq_write_unlock spin_unlock #endif --- linux/include/asm-generic/bitops.h.orig +++ linux/include/asm-generic/bitops.h @@ -51,6 +51,12 @@ extern __inline__ int test_bit(int nr, l return ((mask & *addr) != 0); } +/* + * fls: find last bit set. 
+ */ + +#define fls(x) generic_fls(x) + #ifdef __KERNEL__ /* --- linux/include/asm-i386/bitops.h.orig +++ linux/include/asm-i386/bitops.h @@ -6,6 +6,7 @@ */ #include +#include /* * These have to be done with inline assembly: that way the bit-setting @@ -75,6 +76,14 @@ static __inline__ void clear_bit(int nr, :"=m" (ADDR) :"Ir" (nr)); } + +static __inline__ void __clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( + "btrl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} #define smp_mb__before_clear_bit() barrier() #define smp_mb__after_clear_bit() barrier() @@ -284,6 +293,34 @@ static __inline__ int find_first_zero_bi } /** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +static __inline__ int find_first_bit(void * addr, unsigned size) +{ + int d0, d1; + int res; + + /* This looks at memory. Mark it volatile to tell gcc not to move it around */ + __asm__ __volatile__( + "xorl %%eax,%%eax\n\t" + "repe; scasl\n\t" + "jz 1f\n\t" + "leal -4(%%edi),%%edi\n\t" + "bsfl (%%edi),%%eax\n" + "1:\tsubl %%ebx,%%edi\n\t" + "shll $3,%%edi\n\t" + "addl %%edi,%%eax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr)); + return res; +} + +/** * find_next_zero_bit - find the first zero bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at @@ -296,7 +333,7 @@ static __inline__ int find_next_zero_bit if (bit) { /* - * Look for zero in first byte + * Look for zero in the first 32 bits. 
*/ __asm__("bsfl %1,%0\n\t" "jne 1f\n\t" @@ -317,6 +354,39 @@ static __inline__ int find_next_zero_bit } /** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +static __inline__ int find_next_bit (void * addr, int size, int offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for nonzero in the first 32 bits: + */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (*p >> bit)); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr)); + return (offset + set + res); +} + +/** * ffz - find first zero in word. * @word: The word to search * @@ -330,8 +400,41 @@ static __inline__ unsigned long ffz(unsi return word; } +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __inline__ unsigned long __ffs(unsigned long word) +{ + __asm__("bsfl %1,%0" + :"=r" (word) + :"rm" (word)); + return word; +} + #ifdef __KERNEL__ +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is set. 
+ */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + /** * ffs - find first bit set * @x: the word to search --- linux/include/asm-i386/desc.h.orig +++ linux/include/asm-i386/desc.h @@ -2,63 +2,23 @@ #define __ARCH_DESC_H #include - -/* - * The layout of the GDT under Linux: - * - * 0 - null - * 1 - not used - * 2 - kernel code segment - * 3 - kernel data segment - * 4 - user code segment <-- new cacheline - * 5 - user data segment - * 6 - not used - * 7 - not used - * 8 - APM BIOS support <-- new cacheline - * 9 - APM BIOS support - * 10 - APM BIOS support - * 11 - APM BIOS support - * - * The TSS+LDT descriptors are spread out a bit so that every CPU - * has an exclusive cacheline for the per-CPU TSS and LDT: - * - * 12 - CPU#0 TSS <-- new cacheline - * 13 - CPU#0 LDT - * 14 - not used - * 15 - not used - * 16 - CPU#1 TSS <-- new cacheline - * 17 - CPU#1 LDT - * 18 - not used - * 19 - not used - * ... NR_CPUS per-CPU TSS+LDT's if on SMP - * - * Entry into gdt where to find first TSS. 
- */ -#define __FIRST_TSS_ENTRY 12 -#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY+1) - -#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY) -#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY) +#include #ifndef __ASSEMBLY__ -struct desc_struct { - unsigned long a,b; -}; -extern struct desc_struct gdt_table[]; -extern struct desc_struct *idt, *gdt; +#include + +extern struct desc_struct cpu_gdt_table[NR_CPUS][GDT_ENTRIES]; struct Xgt_desc_struct { unsigned short size; unsigned long address __attribute__((packed)); -}; +} __attribute__ ((packed)); -#define idt_descr (*(struct Xgt_desc_struct *)((char *)&idt - 2)) -#define gdt_descr (*(struct Xgt_desc_struct *)((char *)&gdt - 2)) +extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; -#define load_TR(n) __asm__ __volatile__("ltr %%ax"::"a" (__TSS(n)<<3)) - -#define __load_LDT(n) __asm__ __volatile__("lldt %%ax"::"a" (__LDT(n)<<3)) +#define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) +#define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) /* * This is the ldt that every process will get unless we need @@ -66,14 +26,68 @@ struct Xgt_desc_struct { */ extern struct desc_struct default_ldt[]; extern void set_intr_gate(unsigned int irq, void * addr); -extern void set_ldt_desc(unsigned int n, void *addr, unsigned int size); -extern void set_tss_desc(unsigned int n, void *addr); + +#define _set_tssldt_desc(n,addr,limit,type) \ +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ + "movw %%ax,2(%2)\n\t" \ + "rorl $16,%%eax\n\t" \ + "movb %%al,4(%2)\n\t" \ + "movb %4,5(%2)\n\t" \ + "movb $0,6(%2)\n\t" \ + "movb %%ah,7(%2)\n\t" \ + "rorl $16,%%eax" \ + : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type)) + +static inline void set_tss_desc(unsigned int cpu, void *addr) +{ + _set_tssldt_desc(&cpu_gdt_table[cpu][GDT_ENTRY_TSS], (int)addr, 235, 0x89); +} + +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) +{ + 
_set_tssldt_desc(&cpu_gdt_table[cpu][GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82); +} + +#define LDT_entry_a(info) \ + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) + +#define LDT_entry_b(info) \ + (((info)->base_addr & 0xff000000) | \ + (((info)->base_addr & 0x00ff0000) >> 16) | \ + ((info)->limit & 0xf0000) | \ + (((info)->read_exec_only ^ 1) << 9) | \ + ((info)->contents << 10) | \ + (((info)->seg_not_present ^ 1) << 15) | \ + ((info)->seg_32bit << 22) | \ + ((info)->limit_in_pages << 23) | \ + ((info)->useable << 20) | \ + 0x7000) + +#define LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 ) + +#if TLS_SIZE != 24 +# error update this code. +#endif + +static inline void load_TLS(struct thread_struct *t, unsigned int cpu) +{ +#define C(i) cpu_gdt_table[cpu][GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] + C(0); C(1); C(2); +#undef C +} static inline void clear_LDT(void) { - int cpu = smp_processor_id(); - set_ldt_desc(cpu, &default_ldt[0], 5); - __load_LDT(cpu); + set_ldt_desc(smp_processor_id(), &default_ldt[0], 5); + load_LDT_desc(); } /* @@ -81,7 +95,6 @@ static inline void clear_LDT(void) */ static inline void load_LDT (mm_context_t *pc) { - int cpu = smp_processor_id(); void *segments = pc->ldt; int count = pc->size; @@ -90,8 +103,8 @@ static inline void load_LDT (mm_context_ count = 5; } - set_ldt_desc(cpu, segments, count); - __load_LDT(cpu); + set_ldt_desc(smp_processor_id(), segments, count); + load_LDT_desc(); } #endif /* !__ASSEMBLY__ */ --- linux/include/asm-i386/elf.h.orig 2001-08-21 14:26:19.000000000 +0200 +++ linux/include/asm-i386/elf.h @@ -7,6 +7,7 @@ #include #include +#include #include @@ -59,6 +60,9 @@ typedef struct user_fxsr_struct elf_fpxr /* Wow, the "main" arch needs arch dependent functions too.. 
:) */ +#define savesegment(seg,value) \ + asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value))) + /* regs is struct pt_regs, pr_reg is elf_gregset_t (which is now struct_user_regs, they are different) */ @@ -72,9 +76,8 @@ typedef struct user_fxsr_struct elf_fpxr pr_reg[6] = regs->eax; \ pr_reg[7] = regs->xds; \ pr_reg[8] = regs->xes; \ - /* fake once used fs and gs selectors? */ \ - pr_reg[9] = regs->xds; /* was fs and __fs */ \ - pr_reg[10] = regs->xds; /* was gs and __gs */ \ + savesegment(fs,pr_reg[9]); \ + savesegment(gs,pr_reg[10]); \ pr_reg[11] = regs->orig_eax; \ pr_reg[12] = regs->eip; \ pr_reg[13] = regs->xcs; \ @@ -97,8 +100,40 @@ typedef struct user_fxsr_struct elf_fpxr #define ELF_PLATFORM (system_utsname.machine) +/* + * Architecture-neutral AT_ values in 0-17, leave some room + * for more of them, start the x86-specific ones at 32. + */ +#define AT_SYSINFO 32 + #ifdef __KERNEL__ #define SET_PERSONALITY(ex, ibcs2) set_personality((ibcs2)?PER_SVR4:PER_LINUX) + +extern int dump_task_regs (struct task_struct *, elf_gregset_t *); +extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *); +extern int dump_task_extended_fpu (struct task_struct *, struct user_fxsr_struct *); + +#define ELF_CORE_COPY_TASK_REGS(tsk, elf_regs) dump_task_regs(tsk, elf_regs) +#define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs) +#define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs) + +#ifdef CONFIG_SMP +extern void dump_smp_unlazy_fpu(void); +#define ELF_CORE_SYNC dump_smp_unlazy_fpu +#endif + + + +extern int allowsysinfo; /* setup.c */ +#define DLINFO_ARCH_ITEMS (allowsysinfo ? 1 : 0) +#define ARCH_DLINFO \ +do { \ + if (allowsysinfo) { \ + sp -= 2; \ + NEW_AUX_ENT(0, AT_SYSINFO, 0xffffe000); \ + } \ +} while (0) + #endif #endif --- linux/include/asm-i386/fixmap.h.orig +++ linux/include/asm-i386/fixmap.h @@ -48,6 +48,8 @@ * fix-mapped? 
*/ enum fixed_addresses { + FIX_HOLE, + FIX_VSYSCALL, #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -93,10 +95,9 @@ extern void __set_fixmap (enum fixed_add * used by vmalloc.c. * * Leave one empty page between vmalloc'ed areas and - * the start of the fixmap, and leave one page empty - * at the top of mem.. + * the start of the fixmap. */ -#define FIXADDR_TOP (0xffffe000UL) +#define FIXADDR_TOP (0xfffff000UL) #define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) --- linux/include/asm-i386/ldt.h.orig 1999-07-11 18:11:46.000000000 +0200 +++ linux/include/asm-i386/ldt.h @@ -12,7 +12,7 @@ #define LDT_ENTRY_SIZE 8 #ifndef __ASSEMBLY__ -struct modify_ldt_ldt_s { +struct user_desc { unsigned int entry_number; unsigned long base_addr; unsigned int limit; --- linux/include/asm-i386/mmu_context.h.orig +++ linux/include/asm-i386/mmu_context.h @@ -18,7 +18,7 @@ void destroy_context(struct mm_struct *m static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu) { - if(cpu_tlbstate[cpu].state == TLBSTATE_OK) + if (cpu_tlbstate[cpu].state == TLBSTATE_OK) cpu_tlbstate[cpu].state = TLBSTATE_LAZY; } #else --- linux/include/asm-i386/processor.h.orig +++ linux/include/asm-i386/processor.h @@ -18,6 +18,15 @@ #include #include +struct desc_struct { + unsigned long a,b; +}; + +#define desc_empty(desc) \ + (!((desc)->a + (desc)->b)) + +#define desc_equal(desc1, desc2) \ + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) /* * Default implementation of macro that returns current * instruction pointer ("program counter"). @@ -274,7 +283,7 @@ extern unsigned int mca_pentium_flag; /* This decides where the kernel will search for a free chunk of vm * space during mmap's. 
*/ -#define TASK_UNMAPPED_BASE (TASK_SIZE / 3) +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) /* * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. @@ -367,6 +376,8 @@ struct tss_struct { }; struct thread_struct { +/* cached TLS descriptors. */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; unsigned long esp0; unsigned long eip; unsigned long esp; @@ -382,18 +393,21 @@ struct thread_struct { struct vm86_struct * vm86_info; unsigned long screen_bitmap; unsigned long v86flags, v86mask, saved_esp0; + unsigned int saved_fs, saved_gs; /* IO permissions */ int ioperm; unsigned long io_bitmap[IO_BITMAP_SIZE+1]; }; #define INIT_THREAD { \ + { { 0, 0 } , }, \ 0, \ 0, 0, 0, 0, \ { [0 ... 7] = 0 }, /* debugging registers */ \ 0, 0, 0, \ { { 0, }, }, /* 387 state */ \ 0,0,0,0,0, \ + 0, 0, /* fs/gs */ \ 0,{~0,} /* io permissions */ \ } @@ -408,7 +422,7 @@ struct thread_struct { 0,0,0,0, /* esp,ebp,esi,edi */ \ 0,0,0,0,0,0, /* es,cs,ss */ \ 0,0,0,0,0,0, /* ds,fs,gs */ \ - __LDT(0),0, /* ldt */ \ + GDT_ENTRY_LDT,0, /* ldt */ \ 0, INVALID_IO_BITMAP_OFFSET, /* tace, bitmap */ \ {~0, } /* ioperm */ \ } @@ -455,9 +469,8 @@ unsigned long get_wchan(struct task_stru #define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022]) #define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1)) -#define free_task_struct(p) free_pages((unsigned long) (p), 1) -#define get_task_struct(tsk) atomic_inc(&virt_to_page(tsk)->count) +#define __alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1)) +#define __free_task_struct(p) do { BUG_ON((p)->state < TASK_ZOMBIE); free_pages((unsigned long) (p), 1); } while (0) #define init_task (init_task_union.task) #define init_stack (init_task_union.stack) --- linux/include/asm-i386/ptrace.h.orig +++ linux/include/asm-i386/ptrace.h @@ -49,10 +49,10 @@ struct pt_regs { #define PTRACE_GETFPXREGS 18 #define PTRACE_SETFPXREGS 19 -#define PTRACE_SETOPTIONS 
21 +#define PTRACE_OLDSETOPTIONS 21 -/* options set using PTRACE_SETOPTIONS */ -#define PTRACE_O_TRACESYSGOOD 0x00000001 +#define PTRACE_GET_THREAD_AREA 25 +#define PTRACE_SET_THREAD_AREA 26 #ifdef __KERNEL__ #define user_mode(regs) ((VM_MASK & (regs)->eflags) || (3 & (regs)->xcs)) --- linux/include/asm-i386/segment.h.orig 1997-12-01 19:34:12.000000000 +0100 +++ linux/include/asm-i386/segment.h @@ -1,10 +1,79 @@ #ifndef _ASM_SEGMENT_H #define _ASM_SEGMENT_H -#define __KERNEL_CS 0x10 -#define __KERNEL_DS 0x18 +/* + * The layout of the per-CPU GDT under Linux: + * + * 0 - null + * 1 - reserved + * 2 - reserved + * 3 - reserved + * + * 4 - default user CS <==== new cacheline + * 5 - default user DS + * + * ------- start of TLS (Thread-Local Storage) segments: + * + * 6 - TLS segment #1 [ glibc's TLS segment ] + * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] + * 8 - TLS segment #3 + * 9 - reserved + * 10 - reserved + * 11 - reserved + * + * ------- start of kernel segments: + * + * 12 - kernel code segment <==== new cacheline + * 13 - kernel data segment + * 14 - TSS + * 15 - LDT + * 16 - PNPBIOS support (16->32 gate) + * 17 - PNPBIOS support + * 18 - PNPBIOS support + * 19 - PNPBIOS support + * 20 - PNPBIOS support + * 21 - APM BIOS support + * 22 - APM BIOS support + * 23 - APM BIOS support + */ +#define GDT_ENTRY_TLS_ENTRIES 3 +#define GDT_ENTRY_TLS_MIN 6 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) -#define __USER_CS 0x23 -#define __USER_DS 0x2B +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) + +#define GDT_ENTRY_DEFAULT_USER_CS 4 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3) + +#define GDT_ENTRY_DEFAULT_USER_DS 5 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3) + +#define GDT_ENTRY_KERNEL_BASE 12 + +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) + +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) + +#define 
GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 2) +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 3) + +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 4) +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 9) + +/* + * The GDT has 21 entries but we pad it to cacheline boundary: + */ +#define GDT_ENTRIES 24 + +#define GDT_SIZE (GDT_ENTRIES * 8) + +/* + * The interrupt descriptor table has room for 256 idt's, + * the global descriptor table is dependent on the number + * of tasks we can have.. + */ +#define IDT_ENTRIES 256 #endif --- linux/include/asm-i386/smp.h.orig +++ linux/include/asm-i386/smp.h @@ -7,7 +7,6 @@ #ifndef __ASSEMBLY__ #include #include -#include #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -40,6 +39,7 @@ extern int cpu_sibling_map[]; extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void smp_send_reschedule(int cpu); +extern void smp_send_reschedule_all(void); extern void smp_invalidate_rcv(void); /* Process an NMI */ extern void (*mtrr_hook) (void); extern void zap_low_mappings (void); @@ -89,7 +89,7 @@ extern void smp_store_cpu_info(int id); * so this is correct in the x86 case. */ -#define smp_processor_id() (current->processor) +#define smp_processor_id() (current->cpu) static __inline int hard_smp_processor_id(void) { @@ -107,17 +107,5 @@ static __inline int logical_smp_processo #define NO_PROC_ID 0xFF /* No processor magic marker */ -/* - * This magic constant controls our willingness to transfer - * a process across CPUs. Such a transfer incurs misses on the L1 - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My - * gut feeling is this will vary by board in value. For a board - * with separate L2 cache it probably depends also on the RSS, and - * for a board with shared L2 cache it ought to decay fast as other - * processes are run. 
- */ - -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */ - #endif #endif --- linux/include/asm-i386/spinlock.h.orig +++ linux/include/asm-i386/spinlock.h @@ -168,6 +168,8 @@ typedef struct { #define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) +#define rwlock_is_locked(x) ((x)->lock != RW_LOCK_BIAS) + /* * On x86, we implement read-write locks as a 32-bit counter * with the high bit (sign) being the "contended" bit. --- linux/include/asm-i386/system.h.orig +++ linux/include/asm-i386/system.h @@ -12,25 +12,22 @@ struct task_struct; /* one of the stranger aspects of C forward declarations.. */ extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); -#define prepare_to_switch() do { } while(0) #define switch_to(prev,next,last) do { \ asm volatile("pushl %%esi\n\t" \ "pushl %%edi\n\t" \ "pushl %%ebp\n\t" \ "movl %%esp,%0\n\t" /* save ESP */ \ - "movl %3,%%esp\n\t" /* restore ESP */ \ + "movl %2,%%esp\n\t" /* restore ESP */ \ "movl $1f,%1\n\t" /* save EIP */ \ - "pushl %4\n\t" /* restore EIP */ \ + "pushl %3\n\t" /* restore EIP */ \ "jmp __switch_to\n" \ "1:\t" \ "popl %%ebp\n\t" \ "popl %%edi\n\t" \ "popl %%esi\n\t" \ - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ - "=b" (last) \ + :"=m" (prev->thread.esp),"=m" (prev->thread.eip) \ :"m" (next->thread.esp),"m" (next->thread.eip), \ - "a" (prev), "d" (next), \ - "b" (prev)); \ + "a" (prev), "d" (next)); \ } while (0) #define _set_base(addr,base) do { unsigned long __pr; \ @@ -338,6 +335,13 @@ static inline unsigned long __cmpxchg(vo #define local_irq_disable() __cli() #define local_irq_enable() __sti() +#define irqs_disabled() \ +({ \ + unsigned long flags; \ + __save_flags(flags); \ + !(flags & (1<<9)); \ +}) + #ifdef CONFIG_SMP extern void __global_cli(void); --- linux/include/asm-i386/unistd.h.orig +++ linux/include/asm-i386/unistd.h @@ -257,6 +257,8 @@ #define __NR_alloc_hugepages 250 #define __NR_free_hugepages 251 #define __NR_exit_group 252 +#define 
__NR_lookup_dcookie 253 +#define __NR_set_tid_address 258 /* user-visible error numbers are in the range -1 - -124: see */ --- linux/include/asm-mips/bitops.h.orig +++ linux/include/asm-mips/bitops.h @@ -43,6 +43,8 @@ #ifdef CONFIG_CPU_HAS_LLSC +#include + /* * These functions for MIPS ISA > 1 are interrupt and SMP proof and * interrupt friendly @@ -628,7 +630,8 @@ extern __inline__ int find_first_zero_bi "2:" : "=r" (res), "=r" (dummy), "=r" (addr) : "0" ((signed int) 0), "1" ((unsigned int) 0xffffffff), - "2" (addr), "r" (size)); + "2" (addr), "r" (size) + : "$1"); return res; } @@ -663,7 +666,8 @@ extern __inline__ int find_next_zero_bit ".set\treorder\n" "1:" : "=r" (set), "=r" (dummy) - : "0" (0), "1" (1 << bit), "r" (*p)); + : "0" (0), "1" (1 << bit), "r" (*p) + : "$1"); if (set < (32 - bit)) return set + offset; set = 32 - bit; @@ -684,21 +688,30 @@ extern __inline__ int find_next_zero_bit * * Undefined if no zero exists, so code should check against ~0UL first. */ -static __inline__ unsigned long ffz(unsigned long word) +extern __inline__ unsigned long ffz(unsigned long word) { - int b = 0, s; + unsigned int __res; + unsigned int mask = 1; - word = ~word; - s = 16; if (word << 16 != 0) s = 0; b += s; word >>= s; - s = 8; if (word << 24 != 0) s = 0; b += s; word >>= s; - s = 4; if (word << 28 != 0) s = 0; b += s; word >>= s; - s = 2; if (word << 30 != 0) s = 0; b += s; word >>= s; - s = 1; if (word << 31 != 0) s = 0; b += s; + __asm__ ( + ".set\tnoreorder\n\t" + ".set\tnoat\n\t" + "move\t%0,$0\n" + "1:\tand\t$1,%2,%1\n\t" + "beqz\t$1,2f\n\t" + "sll\t%1,1\n\t" + "bnez\t%1,1b\n\t" + "addiu\t%0,1\n\t" + ".set\tat\n\t" + ".set\treorder\n" + "2:\n\t" + : "=&r" (__res), "=r" (mask) + : "r" (word), "1" (mask) + : "$1"); - return b; + return __res; } - #ifdef __KERNEL__ /** --- linux/include/asm-alpha/bitops.h.orig +++ linux/include/asm-alpha/bitops.h @@ -3,6 +3,7 @@ #include #include +#include /* * Copyright 1994, Linus Torvalds. 
@@ -60,25 +61,25 @@ clear_bit(unsigned long nr, volatile voi __asm__ __volatile__( "1: ldl_l %0,%3\n" - " and %0,%2,%0\n" + " bic %0,%2,%0\n" " stl_c %0,%1\n" " beq %0,2f\n" ".subsection 2\n" "2: br 1b\n" ".previous" :"=&r" (temp), "=m" (*m) - :"Ir" (~(1UL << (nr & 31))), "m" (*m)); + :"Ir" (1UL << (nr & 31)), "m" (*m)); } /* * WARNING: non atomic version. */ static __inline__ void -__change_bit(unsigned long nr, volatile void * addr) +__clear_bit(unsigned long nr, volatile void * addr) { int *m = ((int *) addr) + (nr >> 5); - *m ^= 1 << (nr & 31); + *m &= ~(1 << (nr & 31)); } static inline void @@ -99,6 +100,17 @@ change_bit(unsigned long nr, volatile vo :"Ir" (1UL << (nr & 31)), "m" (*m)); } +/* + * WARNING: non atomic version. + */ +static __inline__ void +__change_bit(unsigned long nr, volatile void * addr) +{ + int *m = ((int *) addr) + (nr >> 5); + + *m ^= 1 << (nr & 31); +} + static inline int test_and_set_bit(unsigned long nr, volatile void *addr) { @@ -181,20 +193,6 @@ __test_and_clear_bit(unsigned long nr, v return (old & mask) != 0; } -/* - * WARNING: non atomic version. - */ -static __inline__ int -__test_and_change_bit(unsigned long nr, volatile void * addr) -{ - unsigned long mask = 1 << (nr & 0x1f); - int *m = ((int *) addr) + (nr >> 5); - int old = *m; - - *m = old ^ mask; - return (old & mask) != 0; -} - static inline int test_and_change_bit(unsigned long nr, volatile void * addr) { @@ -220,6 +218,20 @@ test_and_change_bit(unsigned long nr, vo return oldbit != 0; } +/* + * WARNING: non atomic version. 
+ */ +static __inline__ int +__test_and_change_bit(unsigned long nr, volatile void * addr) +{ + unsigned long mask = 1 << (nr & 0x1f); + int *m = ((int *) addr) + (nr >> 5); + int old = *m; + + *m = old ^ mask; + return (old & mask) != 0; +} + static inline int test_bit(int nr, volatile void * addr) { @@ -235,12 +247,15 @@ test_bit(int nr, volatile void * addr) */ static inline unsigned long ffz_b(unsigned long x) { - unsigned long sum = 0; + unsigned long sum, x1, x2, x4; x = ~x & -~x; /* set first 0 bit, clear others */ - if (x & 0xF0) sum += 4; - if (x & 0xCC) sum += 2; - if (x & 0xAA) sum += 1; + x1 = x & 0xAA; + x2 = x & 0xCC; + x4 = x & 0xF0; + sum = x2 ? 2 : 0; + sum += (x4 != 0) * 4; + sum += (x1 != 0); return sum; } @@ -257,24 +272,46 @@ static inline unsigned long ffz(unsigned __asm__("cmpbge %1,%2,%0" : "=r"(bits) : "r"(word), "r"(~0UL)); qofs = ffz_b(bits); - __asm__("extbl %1,%2,%0" : "=r"(bits) : "r"(word), "r"(qofs)); + bits = __kernel_extbl(word, qofs); bofs = ffz_b(bits); return qofs*8 + bofs; #endif } +/* + * __ffs = Find First set bit in word. Undefined if no set bit exists. + */ +static inline unsigned long __ffs(unsigned long word) +{ +#if defined(__alpha_cix__) && defined(__alpha_fix__) + /* Whee. EV67 can calculate it directly. */ + unsigned long result; + __asm__("cttz %1,%0" : "=r"(result) : "r"(word)); + return result; +#else + unsigned long bits, qofs, bofs; + + __asm__("cmpbge $31,%1,%0" : "=r"(bits) : "r"(word)); + qofs = ffz_b(bits); + bits = __kernel_extbl(word, qofs); + bofs = ffz_b(~bits); + + return qofs*8 + bofs; +#endif +} + #ifdef __KERNEL__ /* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore - * differs in spirit from the above ffz (man ffs). + * differs in spirit from the above __ffs. */ static inline int ffs(int word) { - int result = ffz(~word); + int result = __ffs(word); return word ? 
result+1 : 0; } @@ -316,6 +353,14 @@ static inline unsigned long hweight64(un #define hweight16(x) hweight64((x) & 0xfffful) #define hweight8(x) hweight64((x) & 0xfful) #else +static inline unsigned long hweight64(unsigned long w) +{ + unsigned long result; + for (result = 0; w ; w >>= 1) + result += (w & 1); + return result; +} + #define hweight32(x) generic_hweight32(x) #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) @@ -365,13 +410,77 @@ found_middle: } /* - * The optimizer actually does good code for this case.. + * Find next one bit in a bitmap reasonably efficiently. + */ +static inline unsigned long +find_next_bit(void * addr, unsigned long size, unsigned long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp &= ~0UL >> (64 - size); + if (!tmp) + return result + size; +found_middle: + return result + __ffs(tmp); +} + +/* + * The optimizer actually does good code for this case. */ #define find_first_zero_bit(addr, size) \ find_next_zero_bit((addr), (size), 0) +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) #ifdef __KERNEL__ +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is set. + */ +static inline unsigned long +sched_find_first_bit(unsigned long b[3]) +{ + unsigned long b0 = b[0], b1 = b[1], b2 = b[2]; + unsigned long ofs; + + ofs = (b1 ? 
64 : 128); + b1 = (b1 ? b1 : b2); + ofs = (b0 ? 0 : ofs); + b0 = (b0 ? b0 : b1); + + return __ffs(b0) + ofs; +} + + #define ext2_set_bit __test_and_set_bit #define ext2_clear_bit __test_and_clear_bit #define ext2_test_bit test_bit --- linux/include/asm-alpha/smp.h.orig +++ linux/include/asm-alpha/smp.h @@ -55,7 +55,7 @@ extern int __cpu_logical_map[NR_CPUS]; #define cpu_logical_map(cpu) __cpu_logical_map[cpu] #define hard_smp_processor_id() __hard_smp_processor_id() -#define smp_processor_id() (current->processor) +#define smp_processor_id() (current->cpu) extern unsigned long cpu_present_mask; #define cpu_online_map cpu_present_mask --- linux/include/asm-alpha/system.h.orig +++ linux/include/asm-alpha/system.h @@ -130,7 +130,6 @@ struct el_common_EV6_mcheck { extern void halt(void) __attribute__((noreturn)); #define __halt() __asm__ __volatile__ ("call_pal %0 #halt" : : "i" (PAL_halt)) -#define prepare_to_switch() do { } while(0) #define switch_to(prev,next,last) \ do { \ unsigned long pcbb; \ --- linux/include/asm-m68k/bitops.h.orig +++ linux/include/asm-m68k/bitops.h @@ -97,6 +97,7 @@ extern __inline__ int __generic_test_and (__builtin_constant_p(nr) ? \ __constant_clear_bit(nr, vaddr) : \ __generic_clear_bit(nr, vaddr)) +#define __clear_bit(nr,vaddr) clear_bit(nr,vaddr) extern __inline__ void __constant_clear_bit(int nr, volatile void * vaddr) { @@ -239,6 +240,28 @@ extern __inline__ int ffs(int x) return 32 - cnt; } +#define __ffs(x) (ffs(x) - 1) + + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. 
+ */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + /* * hweightN: returns the hamming weight (i.e. the number --- linux/include/asm-ppc/bitops.h.orig +++ linux/include/asm-ppc/bitops.h @@ -1,5 +1,5 @@ /* - * BK Id: SCCS/s.bitops.h 1.9 05/26/01 14:48:14 paulus + * BK Id: %F% %I% %G% %U% %#% */ /* * bitops.h: Bit string operations on the ppc @@ -10,7 +10,9 @@ #define _PPC_BITOPS_H #include +#include #include +#include /* * The test_and_*_bit operations are taken to imply a memory barrier @@ -28,7 +30,7 @@ * These used to be if'd out here because using : "cc" as a constraint * resulted in errors from egcs. Things appear to be OK with gcc-2.95. */ -static __inline__ void set_bit(int nr, volatile void * addr) +static __inline__ void set_bit(int nr, volatile unsigned long * addr) { unsigned long old; unsigned long mask = 1 << (nr & 0x1f); @@ -36,8 +38,9 @@ static __inline__ void set_bit(int nr, v __asm__ __volatile__("\n\ 1: lwarx %0,0,%3 \n\ - or %0,%0,%2 \n\ - stwcx. %0,0,%3 \n\ + or %0,%0,%2 \n" + PPC405_ERR77(0,%3) +" stwcx. 
%0,0,%3 \n\ bne- 1b" : "=&r" (old), "=m" (*p) : "r" (mask), "r" (p), "m" (*p) @@ -47,7 +50,7 @@ static __inline__ void set_bit(int nr, v /* * non-atomic version */ -static __inline__ void __set_bit(int nr, volatile void *addr) +static __inline__ void __set_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -61,7 +64,7 @@ static __inline__ void __set_bit(int nr, #define smp_mb__before_clear_bit() smp_mb() #define smp_mb__after_clear_bit() smp_mb() -static __inline__ void clear_bit(int nr, volatile void *addr) +static __inline__ void clear_bit(int nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1 << (nr & 0x1f); @@ -69,8 +72,9 @@ static __inline__ void clear_bit(int nr, __asm__ __volatile__("\n\ 1: lwarx %0,0,%3 \n\ - andc %0,%0,%2 \n\ - stwcx. %0,0,%3 \n\ + andc %0,%0,%2 \n" + PPC405_ERR77(0,%3) +" stwcx. %0,0,%3 \n\ bne- 1b" : "=&r" (old), "=m" (*p) : "r" (mask), "r" (p), "m" (*p) @@ -80,7 +84,7 @@ static __inline__ void clear_bit(int nr, /* * non-atomic version */ -static __inline__ void __clear_bit(int nr, volatile void *addr) +static __inline__ void __clear_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -88,7 +92,7 @@ static __inline__ void __clear_bit(int n *p &= ~mask; } -static __inline__ void change_bit(int nr, volatile void *addr) +static __inline__ void change_bit(int nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1 << (nr & 0x1f); @@ -96,8 +100,9 @@ static __inline__ void change_bit(int nr __asm__ __volatile__("\n\ 1: lwarx %0,0,%3 \n\ - xor %0,%0,%2 \n\ - stwcx. %0,0,%3 \n\ + xor %0,%0,%2 \n" + PPC405_ERR77(0,%3) +" stwcx. 
%0,0,%3 \n\ bne- 1b" : "=&r" (old), "=m" (*p) : "r" (mask), "r" (p), "m" (*p) @@ -107,7 +112,7 @@ static __inline__ void change_bit(int nr /* * non-atomic version */ -static __inline__ void __change_bit(int nr, volatile void *addr) +static __inline__ void __change_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -118,7 +123,7 @@ static __inline__ void __change_bit(int /* * test_and_*_bit do imply a memory barrier (?) */ -static __inline__ int test_and_set_bit(int nr, volatile void *addr) +static __inline__ int test_and_set_bit(int nr, volatile unsigned long *addr) { unsigned int old, t; unsigned int mask = 1 << (nr & 0x1f); @@ -126,8 +131,9 @@ static __inline__ int test_and_set_bit(i __asm__ __volatile__(SMP_WMB "\n\ 1: lwarx %0,0,%4 \n\ - or %1,%0,%3 \n\ - stwcx. %1,0,%4 \n\ + or %1,%0,%3 \n" + PPC405_ERR77(0,%4) +" stwcx. %1,0,%4 \n\ bne 1b" SMP_MB : "=&r" (old), "=&r" (t), "=m" (*p) @@ -140,7 +146,7 @@ static __inline__ int test_and_set_bit(i /* * non-atomic version */ -static __inline__ int __test_and_set_bit(int nr, volatile void *addr) +static __inline__ int __test_and_set_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -150,7 +156,7 @@ static __inline__ int __test_and_set_bit return (old & mask) != 0; } -static __inline__ int test_and_clear_bit(int nr, volatile void *addr) +static __inline__ int test_and_clear_bit(int nr, volatile unsigned long *addr) { unsigned int old, t; unsigned int mask = 1 << (nr & 0x1f); @@ -158,8 +164,9 @@ static __inline__ int test_and_clear_bit __asm__ __volatile__(SMP_WMB "\n\ 1: lwarx %0,0,%4 \n\ - andc %1,%0,%3 \n\ - stwcx. %1,0,%4 \n\ + andc %1,%0,%3 \n" + PPC405_ERR77(0,%4) +" stwcx. 
%1,0,%4 \n\ bne 1b" SMP_MB : "=&r" (old), "=&r" (t), "=m" (*p) @@ -172,7 +179,7 @@ static __inline__ int test_and_clear_bit /* * non-atomic version */ -static __inline__ int __test_and_clear_bit(int nr, volatile void *addr) +static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -182,7 +189,7 @@ static __inline__ int __test_and_clear_b return (old & mask) != 0; } -static __inline__ int test_and_change_bit(int nr, volatile void *addr) +static __inline__ int test_and_change_bit(int nr, volatile unsigned long *addr) { unsigned int old, t; unsigned int mask = 1 << (nr & 0x1f); @@ -190,8 +197,9 @@ static __inline__ int test_and_change_bi __asm__ __volatile__(SMP_WMB "\n\ 1: lwarx %0,0,%4 \n\ - xor %1,%0,%3 \n\ - stwcx. %1,0,%4 \n\ + xor %1,%0,%3 \n" + PPC405_ERR77(0,%4) +" stwcx. %1,0,%4 \n\ bne 1b" SMP_MB : "=&r" (old), "=&r" (t), "=m" (*p) @@ -204,7 +212,7 @@ static __inline__ int test_and_change_bi /* * non-atomic version */ -static __inline__ int __test_and_change_bit(int nr, volatile void *addr) +static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -214,7 +222,7 @@ static __inline__ int __test_and_change_ return (old & mask) != 0; } -static __inline__ int test_bit(int nr, __const__ volatile void *addr) +static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr) { __const__ unsigned int *p = (__const__ unsigned int *) addr; @@ -222,7 +230,7 @@ static __inline__ int test_bit(int nr, _ } /* Return the bit position of the most significant 1 bit in a word */ -static __inline__ int __ilog2(unsigned int x) +static __inline__ int __ilog2(unsigned long x) { int lz; @@ -230,13 +238,18 @@ static __inline__ int __ilog2(unsigned i return 31 - lz; } -static __inline__ int ffz(unsigned int x) +static 
__inline__ int ffz(unsigned long x) { if ((x = ~x) == 0) return 32; return __ilog2(x & -x); } +static inline int __ffs(unsigned long x) +{ + return __ilog2(x & -x); +} + /* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore @@ -248,6 +261,18 @@ static __inline__ int ffs(int x) } /* + * fls: find last (most-significant) bit set. + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. + */ +static __inline__ int fls(unsigned int x) +{ + int lz; + + asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); + return 32 - lz; +} + +/* * hweightN: returns the hamming weight (i.e. the number * of bits set) of a N-bit word */ @@ -257,13 +282,86 @@ static __inline__ int ffs(int x) #define hweight8(x) generic_hweight8(x) /* + * Find the first bit set in a 140-bit bitmap. + * The first 100 bits are unlikely to be set. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + +/** + * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +static __inline__ unsigned long find_next_bit(unsigned long *addr, + unsigned long size, unsigned long offset) +{ + unsigned int *p = ((unsigned int *) addr) + (offset >> 5); + unsigned int result = offset & ~31UL; + unsigned int tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 31UL; + if (offset) { + tmp = *p++; + tmp &= ~0UL << offset; + if (size < 32) + goto found_first; + if (tmp) + goto found_middle; + size -= 32; + result += 32; + } + while (size >= 32) { + if ((tmp = *p++) != 0) + goto found_middle; + result += 32; + size -= 32; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= ~0UL >> 
(32 - size); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) + +/* * This implementation of find_{first,next}_zero_bit was stolen from * Linus' asm-alpha/bitops.h. */ #define find_first_zero_bit(addr, size) \ find_next_zero_bit((addr), (size), 0) -static __inline__ unsigned long find_next_zero_bit(void * addr, +static __inline__ unsigned long find_next_zero_bit(unsigned long * addr, unsigned long size, unsigned long offset) { unsigned int * p = ((unsigned int *) addr) + (offset >> 5); @@ -302,8 +400,8 @@ found_middle: } -#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, addr) -#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, addr) +#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr)) +#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr)) static __inline__ int ext2_test_bit(int nr, __const__ void * addr) { --- linux/include/asm-ppc/unistd.h.orig +++ linux/include/asm-ppc/unistd.h @@ -228,7 +228,6 @@ #define __NR_removexattr 218 #define __NR_lremovexattr 219 #define __NR_fremovexattr 220 -#if 0 #define __NR_futex 221 #define __NR_sched_setaffinity 222 #define __NR_sched_getaffinity 223 @@ -240,7 +239,6 @@ #define __NR_io_getevents 229 #define __NR_io_submit 230 #define __NR_io_cancel 231 -#endif #define __NR(n) #n --- linux/include/asm-sparc64/bitops.h.orig +++ linux/include/asm-sparc64/bitops.h @@ -1,4 +1,4 @@ -/* $Id: bitops.h,v 1.38 2001/11/19 18:36:34 davem Exp $ +/* $Id: bitops.h,v 1.39 2002/01/30 01:40:00 davem Exp $ * bitops.h: Bit string operations on 
the V9. * * Copyright 1996, 1997 David S. Miller (davem@caip.rutgers.edu) @@ -7,11 +7,12 @@ #ifndef _SPARC64_BITOPS_H #define _SPARC64_BITOPS_H +#include #include -extern long ___test_and_set_bit(unsigned long nr, volatile void *addr); -extern long ___test_and_clear_bit(unsigned long nr, volatile void *addr); -extern long ___test_and_change_bit(unsigned long nr, volatile void *addr); +extern long ___test_and_set_bit(unsigned long nr, volatile unsigned long *addr); +extern long ___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr); +extern long ___test_and_change_bit(unsigned long nr, volatile unsigned long *addr); #define test_and_set_bit(nr,addr) ({___test_and_set_bit(nr,addr)!=0;}) #define test_and_clear_bit(nr,addr) ({___test_and_clear_bit(nr,addr)!=0;}) @@ -21,109 +22,132 @@ extern long ___test_and_change_bit(unsig #define change_bit(nr,addr) ((void)___test_and_change_bit(nr,addr)) /* "non-atomic" versions... */ -#define __set_bit(X,Y) \ -do { unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - *__m |= (1UL << (__nr & 63)); \ -} while (0) -#define __clear_bit(X,Y) \ -do { unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - *__m &= ~(1UL << (__nr & 63)); \ -} while (0) -#define __change_bit(X,Y) \ -do { unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - *__m ^= (1UL << (__nr & 63)); \ -} while (0) -#define __test_and_set_bit(X,Y) \ -({ unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - long __old = *__m; \ - long __mask = (1UL << (__nr & 63)); \ - *__m = (__old | __mask); \ - ((__old & __mask) != 0); \ -}) -#define __test_and_clear_bit(X,Y) \ -({ unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - long __old = *__m; \ - long __mask = (1UL << (__nr & 63)); \ - *__m = (__old & ~__mask); \ - ((__old & __mask) != 0); \ -}) -#define __test_and_change_bit(X,Y) \ -({ unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 
6); \ - long __old = *__m; \ - long __mask = (1UL << (__nr & 63)); \ - *__m = (__old ^ __mask); \ - ((__old & __mask) != 0); \ -}) + +static __inline__ void __set_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + + *m |= (1UL << (nr & 63)); +} + +static __inline__ void __clear_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + + *m &= ~(1UL << (nr & 63)); +} + +static __inline__ void __change_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + + *m ^= (1UL << (nr & 63)); +} + +static __inline__ int __test_and_set_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + long old = *m; + long mask = (1UL << (nr & 63)); + + *m = (old | mask); + return ((old & mask) != 0); +} + +static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + long old = *m; + long mask = (1UL << (nr & 63)); + + *m = (old & ~mask); + return ((old & mask) != 0); +} + +static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + long old = *m; + long mask = (1UL << (nr & 63)); + + *m = (old ^ mask); + return ((old & mask) != 0); +} #define smp_mb__before_clear_bit() do { } while(0) #define smp_mb__after_clear_bit() do { } while(0) -extern __inline__ int test_bit(int nr, __const__ void *addr) +static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr) { - return (1UL & (((__const__ long *) addr)[nr >> 6] >> (nr & 63))) != 0UL; + return (1UL & ((addr)[nr >> 6] >> (nr & 63))) != 0UL; } /* The easy/cheese version for now. */ -extern __inline__ unsigned long ffz(unsigned long word) +static __inline__ unsigned long ffz(unsigned long word) { unsigned long result; -#ifdef ULTRA_HAS_POPULATION_COUNT /* Thanks for nothing Sun... 
*/ - __asm__ __volatile__( -" brz,pn %0, 1f\n" -" neg %0, %%g1\n" -" xnor %0, %%g1, %%g2\n" -" popc %%g2, %0\n" -"1: " : "=&r" (result) - : "0" (word) - : "g1", "g2"); -#else -#if 1 /* def EASY_CHEESE_VERSION */ result = 0; while(word & 1) { result++; word >>= 1; } -#else - unsigned long tmp; + return result; +} - result = 0; - tmp = ~word & -~word; - if (!(unsigned)tmp) { - tmp >>= 32; - result = 32; - } - if (!(unsigned short)tmp) { - tmp >>= 16; - result += 16; - } - if (!(unsigned char)tmp) { - tmp >>= 8; - result += 8; +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __inline__ unsigned long __ffs(unsigned long word) +{ + unsigned long result = 0; + + while (!(word & 1UL)) { + result++; + word >>= 1; } - if (tmp & 0xf0) result += 4; - if (tmp & 0xcc) result += 2; - if (tmp & 0xaa) result ++; -#endif -#endif return result; } +/* + * fls: find last bit set. + */ + +#define fls(x) generic_fls(x) + #ifdef __KERNEL__ /* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is set. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(((unsigned int)b[1]))) + return __ffs(b[1]) + 64; + if (b[1] >> 32) + return __ffs(b[1] >> 32) + 96; + return __ffs(b[2]) + 128; +} + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ - -#define ffs(x) generic_ffs(x) +static __inline__ int ffs(int x) +{ + if (!x) + return 0; + return __ffs((unsigned long)x) + 1; +} /* * hweightN: returns the hamming weight (i.e.
the number @@ -132,7 +156,7 @@ extern __inline__ unsigned long ffz(unsi #ifdef ULTRA_HAS_POPULATION_COUNT -extern __inline__ unsigned int hweight32(unsigned int w) +static __inline__ unsigned int hweight32(unsigned int w) { unsigned int res; @@ -140,7 +164,7 @@ extern __inline__ unsigned int hweight32 return res; } -extern __inline__ unsigned int hweight16(unsigned int w) +static __inline__ unsigned int hweight16(unsigned int w) { unsigned int res; @@ -148,7 +172,7 @@ extern __inline__ unsigned int hweight16 return res; } -extern __inline__ unsigned int hweight8(unsigned int w) +static __inline__ unsigned int hweight8(unsigned int w) { unsigned int res; @@ -165,14 +189,69 @@ extern __inline__ unsigned int hweight8( #endif #endif /* __KERNEL__ */ +/** + * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +static __inline__ unsigned long find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = addr + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (64 - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. 
+ */ +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) + /* find_next_zero_bit() finds the first zero bit in a bit string of length * 'size' bits, starting the search at bit 'offset'. This is largely based * on Linus's ALPHA routines, which are pretty portable BTW. */ -extern __inline__ unsigned long find_next_zero_bit(void *addr, unsigned long size, unsigned long offset) +static __inline__ unsigned long find_next_zero_bit(unsigned long *addr, unsigned long size, unsigned long offset) { - unsigned long *p = ((unsigned long *) addr) + (offset >> 6); + unsigned long *p = addr + (offset >> 6); unsigned long result = offset & ~63UL; unsigned long tmp; @@ -211,15 +290,15 @@ found_middle: #define find_first_zero_bit(addr, size) \ find_next_zero_bit((addr), (size), 0) -extern long ___test_and_set_le_bit(int nr, volatile void *addr); -extern long ___test_and_clear_le_bit(int nr, volatile void *addr); +extern long ___test_and_set_le_bit(int nr, volatile unsigned long *addr); +extern long ___test_and_clear_le_bit(int nr, volatile unsigned long *addr); #define test_and_set_le_bit(nr,addr) ({___test_and_set_le_bit(nr,addr)!=0;}) #define test_and_clear_le_bit(nr,addr) ({___test_and_clear_le_bit(nr,addr)!=0;}) #define set_le_bit(nr,addr) ((void)___test_and_set_le_bit(nr,addr)) #define clear_le_bit(nr,addr) ((void)___test_and_clear_le_bit(nr,addr)) -extern __inline__ int test_le_bit(int nr, __const__ void * addr) +static __inline__ int test_le_bit(int nr, __const__ unsigned long * addr) { int mask; __const__ unsigned char *ADDR = (__const__ unsigned char *) addr; @@ -232,9 +311,9 @@ extern __inline__ int test_le_bit(int nr #define find_first_zero_le_bit(addr, size) \ find_next_zero_le_bit((addr), (size), 0) -extern __inline__ unsigned long find_next_zero_le_bit(void *addr, unsigned long size, unsigned long offset) +static __inline__ unsigned long find_next_zero_le_bit(unsigned long *addr, unsigned long size, unsigned long offset) { - unsigned long *p = 
((unsigned long *) addr) + (offset >> 6); + unsigned long *p = addr + (offset >> 6); unsigned long result = offset & ~63UL; unsigned long tmp; @@ -271,18 +350,22 @@ found_middle: #ifdef __KERNEL__ -#define ext2_set_bit test_and_set_le_bit -#define ext2_clear_bit test_and_clear_le_bit -#define ext2_test_bit test_le_bit -#define ext2_find_first_zero_bit find_first_zero_le_bit -#define ext2_find_next_zero_bit find_next_zero_le_bit +#define ext2_set_bit(nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr)) +#define ext2_clear_bit(nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr)) +#define ext2_test_bit(nr,addr) test_le_bit((nr),(unsigned long *)(addr)) +#define ext2_find_first_zero_bit(addr, size) \ + find_first_zero_le_bit((unsigned long *)(addr), (size)) +#define ext2_find_next_zero_bit(addr, size, off) \ + find_next_zero_le_bit((unsigned long *)(addr), (size), (off)) /* Bitmap functions for the minix filesystem. */ -#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr) -#define minix_set_bit(nr,addr) set_bit(nr,addr) -#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr) -#define minix_test_bit(nr,addr) test_bit(nr,addr) -#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) +#define minix_test_and_set_bit(nr,addr) test_and_set_bit((nr),(unsigned long *)(addr)) +#define minix_set_bit(nr,addr) set_bit((nr),(unsigned long *)(addr)) +#define minix_test_and_clear_bit(nr,addr) \ + test_and_clear_bit((nr),(unsigned long *)(addr)) +#define minix_test_bit(nr,addr) test_bit((nr),(unsigned long *)(addr)) +#define minix_find_first_zero_bit(addr,size) \ + find_first_zero_bit((unsigned long *)(addr),(size)) #endif /* __KERNEL__ */ --- linux/include/asm-sparc64/smp.h.orig +++ linux/include/asm-sparc64/smp.h @@ -103,7 +103,7 @@ extern __inline__ int hard_smp_processor } } -#define smp_processor_id() (current->processor) +#define smp_processor_id() (current->cpu) /* This needn't do anything as we do not sleep the cpu 
* inside of the idler task, so an interrupt is not needed --- linux/include/asm-sparc64/system.h.orig +++ linux/include/asm-sparc64/system.h @@ -154,7 +154,18 @@ extern void __flushw_user(void); #define flush_user_windows flushw_user #define flush_register_windows flushw_all -#define prepare_to_switch flushw_all + +#define prepare_arch_schedule(prev) task_lock(prev) +#define finish_arch_schedule(prev) task_unlock(prev) +#define prepare_arch_switch(rq, next) \ +do { spin_lock(&(next)->switch_lock); \ + spin_unlock(&(rq)->lock); \ + flushw_all(); \ +} while (0) + +#define finish_arch_switch(rq, prev) \ +do { spin_unlock_irq(&(prev)->switch_lock); \ +} while (0) #ifndef CONFIG_DEBUG_SPINLOCK #define CHECK_LOCKS(PREV) do { } while(0) --- linux/include/asm-arm/bitops.h.orig +++ linux/include/asm-arm/bitops.h @@ -2,6 +2,8 @@ * Copyright 1995, Russell King. * Various bits and pieces copyrights include: * Linus Torvalds (test_bit). + * Big endian support: Copyright 2001, Nicolas Pitre + * reworked by rmk. * * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). * @@ -17,81 +19,271 @@ #ifdef __KERNEL__ +#include + #define smp_mb__before_clear_bit() do { } while (0) #define smp_mb__after_clear_bit() do { } while (0) /* - * Function prototypes to keep gcc -Wall happy. + * These functions are the basis of our bit ops. + * First, the atomic bitops. + * + * The endian issue for these functions is handled by the macros below. 
*/ -extern void set_bit(int nr, volatile void * addr); +static inline void +____atomic_set_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + + local_irq_save(flags); + *p |= mask; + local_irq_restore(flags); +} + +static inline void +____atomic_clear_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + + local_irq_save(flags); + *p &= ~mask; + local_irq_restore(flags); +} + +static inline void +____atomic_change_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + + local_irq_save(flags); + *p ^= mask; + local_irq_restore(flags); +} -static inline void __set_bit(int nr, volatile void *addr) +static inline int +____atomic_test_and_set_bit_mask(unsigned int mask, volatile unsigned char *p) { - ((unsigned char *) addr)[nr >> 3] |= (1U << (nr & 7)); + unsigned long flags; + unsigned int res; + + local_irq_save(flags); + res = *p; + *p = res | mask; + local_irq_restore(flags); + + return res & mask; } -extern void clear_bit(int nr, volatile void * addr); +static inline int +____atomic_test_and_clear_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + unsigned int res; + + local_irq_save(flags); + res = *p; + *p = res & ~mask; + local_irq_restore(flags); + + return res & mask; +} -static inline void __clear_bit(int nr, volatile void *addr) +static inline int +____atomic_test_and_change_bit_mask(unsigned int mask, volatile unsigned char *p) { - ((unsigned char *) addr)[nr >> 3] &= ~(1U << (nr & 7)); + unsigned long flags; + unsigned int res; + + local_irq_save(flags); + res = *p; + *p = res ^ mask; + local_irq_restore(flags); + + return res & mask; } -extern void change_bit(int nr, volatile void * addr); +/* + * Now the non-atomic variants. We let the compiler handle all optimisations + * for these. 
+ */ +static inline void ____nonatomic_set_bit(int nr, volatile void *p) +{ + ((unsigned char *) p)[nr >> 3] |= (1U << (nr & 7)); +} -static inline void __change_bit(int nr, volatile void *addr) +static inline void ____nonatomic_clear_bit(int nr, volatile void *p) { - ((unsigned char *) addr)[nr >> 3] ^= (1U << (nr & 7)); + ((unsigned char *) p)[nr >> 3] &= ~(1U << (nr & 7)); } -extern int test_and_set_bit(int nr, volatile void * addr); +static inline void ____nonatomic_change_bit(int nr, volatile void *p) +{ + ((unsigned char *) p)[nr >> 3] ^= (1U << (nr & 7)); +} -static inline int __test_and_set_bit(int nr, volatile void *addr) +static inline int ____nonatomic_test_and_set_bit(int nr, volatile void *p) { unsigned int mask = 1 << (nr & 7); unsigned int oldval; - oldval = ((unsigned char *) addr)[nr >> 3]; - ((unsigned char *) addr)[nr >> 3] = oldval | mask; + oldval = ((unsigned char *) p)[nr >> 3]; + ((unsigned char *) p)[nr >> 3] = oldval | mask; return oldval & mask; } -extern int test_and_clear_bit(int nr, volatile void * addr); - -static inline int __test_and_clear_bit(int nr, volatile void *addr) +static inline int ____nonatomic_test_and_clear_bit(int nr, volatile void *p) { unsigned int mask = 1 << (nr & 7); unsigned int oldval; - oldval = ((unsigned char *) addr)[nr >> 3]; - ((unsigned char *) addr)[nr >> 3] = oldval & ~mask; + oldval = ((unsigned char *) p)[nr >> 3]; + ((unsigned char *) p)[nr >> 3] = oldval & ~mask; return oldval & mask; } -extern int test_and_change_bit(int nr, volatile void * addr); - -static inline int __test_and_change_bit(int nr, volatile void *addr) +static inline int ____nonatomic_test_and_change_bit(int nr, volatile void *p) { unsigned int mask = 1 << (nr & 7); unsigned int oldval; - oldval = ((unsigned char *) addr)[nr >> 3]; - ((unsigned char *) addr)[nr >> 3] = oldval ^ mask; + oldval = ((unsigned char *) p)[nr >> 3]; + ((unsigned char *) p)[nr >> 3] = oldval ^ mask; return oldval & mask; } -extern int 
find_first_zero_bit(void * addr, unsigned size); -extern int find_next_zero_bit(void * addr, int size, int offset); - /* * This routine doesn't need to be atomic. */ -static inline int test_bit(int nr, const void * addr) +static inline int ____test_bit(int nr, const void * p) { - return ((unsigned char *) addr)[nr >> 3] & (1U << (nr & 7)); + return ((volatile unsigned char *) p)[nr >> 3] & (1U << (nr & 7)); } /* + * A note about Endian-ness. + * ------------------------- + * + * When the ARM is put into big endian mode via CR15, the processor + * merely swaps the order of bytes within words, thus: + * + * ------------ physical data bus bits ----------- + * D31 ... D24 D23 ... D16 D15 ... D8 D7 ... D0 + * little byte 3 byte 2 byte 1 byte 0 + * big byte 0 byte 1 byte 2 byte 3 + * + * This means that reading a 32-bit word at address 0 returns the same + * value irrespective of the endian mode bit. + * + * Peripheral devices should be connected with the data bus reversed in + * "Big Endian" mode. ARM Application Note 61 is applicable, and is + * available from http://www.arm.com/. + * + * The following assumes that the data bus connectivity for big endian + * mode has been followed. + * + * Note that bit 0 is defined to be 32-bit word bit 0, not byte 0 bit 0. + */ + +/* + * Little endian assembly bitops. nr = 0 -> byte 0 bit 0. + */ +extern void _set_bit_le(int nr, volatile void * p); +extern void _clear_bit_le(int nr, volatile void * p); +extern void _change_bit_le(int nr, volatile void * p); +extern int _test_and_set_bit_le(int nr, volatile void * p); +extern int _test_and_clear_bit_le(int nr, volatile void * p); +extern int _test_and_change_bit_le(int nr, volatile void * p); +extern int _find_first_zero_bit_le(void * p, unsigned size); +extern int _find_next_zero_bit_le(void * p, int size, int offset); + +/* + * Big endian assembly bitops. nr = 0 -> byte 3 bit 0. 
+ */ +extern void _set_bit_be(int nr, volatile void * p); +extern void _clear_bit_be(int nr, volatile void * p); +extern void _change_bit_be(int nr, volatile void * p); +extern int _test_and_set_bit_be(int nr, volatile void * p); +extern int _test_and_clear_bit_be(int nr, volatile void * p); +extern int _test_and_change_bit_be(int nr, volatile void * p); +extern int _find_first_zero_bit_be(void * p, unsigned size); +extern int _find_next_zero_bit_be(void * p, int size, int offset); + + +/* + * The __* form of bitops are non-atomic and may be reordered. + */ +#define ATOMIC_BITOP_LE(name,nr,p) \ + (__builtin_constant_p(nr) ? \ + ____atomic_##name##_mask(1 << ((nr) & 7), \ + ((unsigned char *)(p)) + ((nr) >> 3)) : \ + _##name##_le(nr,p)) + +#define ATOMIC_BITOP_BE(name,nr,p) \ + (__builtin_constant_p(nr) ? \ + ____atomic_##name##_mask(1 << ((nr) & 7), \ + ((unsigned char *)(p)) + (((nr) >> 3) ^ 3)) : \ + _##name##_be(nr,p)) + +#define NONATOMIC_BITOP_LE(name,nr,p) \ + (____nonatomic_##name(nr, p)) + +#define NONATOMIC_BITOP_BE(name,nr,p) \ + (____nonatomic_##name((nr) ^ 0x18, p)) + +#ifndef __ARMEB__ +/* + * These are the little endian, atomic definitions. + */ +#define set_bit(nr,p) ATOMIC_BITOP_LE(set_bit,nr,p) +#define clear_bit(nr,p) ATOMIC_BITOP_LE(clear_bit,nr,p) +#define change_bit(nr,p) ATOMIC_BITOP_LE(change_bit,nr,p) +#define test_and_set_bit(nr,p) ATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define test_and_change_bit(nr,p) ATOMIC_BITOP_LE(test_and_change_bit,nr,p) +#define test_bit(nr,p) ____test_bit(nr,p) +#define find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off) + +/* + * These are the little endian, non-atomic definitions.
+ */ +#define __set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p) +#define __clear_bit(nr,p) NONATOMIC_BITOP_LE(clear_bit,nr,p) +#define __change_bit(nr,p) NONATOMIC_BITOP_LE(change_bit,nr,p) +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_LE(test_and_change_bit,nr,p) +#define __test_bit(nr,p) ____test_bit(nr,p) + +#else + +/* + * These are the big endian, atomic definitions. + */ +#define set_bit(nr,p) ATOMIC_BITOP_BE(set_bit,nr,p) +#define clear_bit(nr,p) ATOMIC_BITOP_BE(clear_bit,nr,p) +#define change_bit(nr,p) ATOMIC_BITOP_BE(change_bit,nr,p) +#define test_and_set_bit(nr,p) ATOMIC_BITOP_BE(test_and_set_bit,nr,p) +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_BE(test_and_clear_bit,nr,p) +#define test_and_change_bit(nr,p) ATOMIC_BITOP_BE(test_and_change_bit,nr,p) +#define test_bit(nr,p) ____test_bit((nr) ^ 0x18, p) +#define find_first_zero_bit(p,sz) _find_first_zero_bit_be(p,sz) +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_be(p,sz,off) + +/* + * These are the big endian, non-atomic definitions. + */ +#define __set_bit(nr,p) NONATOMIC_BITOP_BE(set_bit,nr,p) +#define __clear_bit(nr,p) NONATOMIC_BITOP_BE(clear_bit,nr,p) +#define __change_bit(nr,p) NONATOMIC_BITOP_BE(change_bit,nr,p) +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_BE(test_and_set_bit,nr,p) +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_BE(test_and_clear_bit,nr,p) +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_BE(test_and_change_bit,nr,p) +#define __test_bit(nr,p) ____test_bit((nr) ^ 0x18, p) + +#endif + +/* * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. */ @@ -110,6 +302,29 @@ static inline unsigned long ffz(unsigned } /* + * __ffs = Find First Set bit in word. Undefined if no bit is set, + * so code should check against 0 first..
+ */ +static inline unsigned long __ffs(unsigned long word) +{ + int k; + + k = 31; + if (word & 0x0000ffff) { k -= 16; word <<= 16; } + if (word & 0x00ff0000) { k -= 8; word <<= 8; } + if (word & 0x0f000000) { k -= 4; word <<= 4; } + if (word & 0x30000000) { k -= 2; word <<= 2; } + if (word & 0x40000000) { k -= 1; } + return k; +} + +/* + * fls: find last bit set. + */ + +#define fls(x) generic_fls(x) + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). @@ -118,6 +333,22 @@ static inline unsigned long ffz(unsigned #define ffs(x) generic_ffs(x) /* + * Find first bit set in a 168-bit bitmap, where the first + * 128 bits are unlikely to be set. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + unsigned long v; + unsigned int off; + + for (off = 0; v = b[off], off < 4; off++) { + if (unlikely(v)) + break; + } + return __ffs(v) + off * 32; +} + +/* * hweightN: returns the hamming weight (i.e. the number * of bits set) of a N-bit word */ @@ -126,18 +357,25 @@ static inline unsigned long ffz(unsigned #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) -#define ext2_set_bit test_and_set_bit -#define ext2_clear_bit test_and_clear_bit -#define ext2_test_bit test_bit -#define ext2_find_first_zero_bit find_first_zero_bit -#define ext2_find_next_zero_bit find_next_zero_bit - -/* Bitmap functions for the minix filesystem. */ -#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr) -#define minix_set_bit(nr,addr) set_bit(nr,addr) -#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr) -#define minix_test_bit(nr,addr) test_bit(nr,addr) -#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) +/* + * Ext2 is defined to use little-endian byte ordering. + * These do not need to be atomic. 
+ */ +#define ext2_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define ext2_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define ext2_test_bit(nr,p) __test_bit(nr,p) +#define ext2_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) +#define ext2_find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off) + +/* + * Minix is defined to use little-endian byte ordering. + * These do not need to be atomic. + */ +#define minix_set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p) +#define minix_test_bit(nr,p) __test_bit(nr,p) +#define minix_test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define minix_test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define minix_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) #endif /* __KERNEL__ */ --- linux/include/asm-ia64/bitops.h.orig +++ linux/include/asm-ia64/bitops.h @@ -2,10 +2,15 @@ #define _ASM_IA64_BITOPS_H /* - * Copyright (C) 1998-2001 Hewlett-Packard Co - * Copyright (C) 1998-2001 David Mosberger-Tang + * Copyright (C) 1998-2002 Hewlett-Packard Co + * David Mosberger-Tang + * + * 02/06/02 find_next_bit() and find_first_bit() added from Erich Focht's ia64 O(1) + * scheduler patch */ +#include + #include /** @@ -89,6 +94,17 @@ clear_bit (int nr, volatile void *addr) } /** + * __clear_bit - Clears a bit in memory (non-atomic version) + */ +static __inline__ void +__clear_bit (int nr, volatile void *addr) +{ + volatile __u32 *p = (__u32 *) addr + (nr >> 5); + __u32 m = 1 << (nr & 31); + *p &= ~m; +} + +/** * change_bit - Toggle a bit in memory * @nr: Bit to clear * @addr: Address to start counting from @@ -264,12 +280,11 @@ test_bit (int nr, volatile void *addr) } /** - * ffz - find the first zero bit in a memory region - * @x: The address to start the search at + * ffz - find the first zero bit in a long word + * @x: The long word to find the bit in * - * Returns the bit-number (0..63) of the first (least significant) zero bit, not - * the number of 
the byte containing a bit. Undefined if no zero exists, so - * code should check against ~0UL first... + * Returns the bit-number (0..63) of the first (least significant) zero bit. Undefined if + * no zero exists, so code should check against ~0UL first... */ static inline unsigned long ffz (unsigned long x) @@ -280,6 +295,21 @@ ffz (unsigned long x) return result; } +/** + * __ffs - find first bit in word. + * @x: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __inline__ unsigned long +__ffs (unsigned long x) +{ + unsigned long result; + + __asm__ ("popcnt %0=%1" : "=r" (result) : "r" ((x - 1) & ~x)); + return result; +} + #ifdef __KERNEL__ /* @@ -296,6 +326,12 @@ ia64_fls (unsigned long x) return exp - 0xffff; } +static inline int +fls (int x) +{ + return x ? ia64_fls((unsigned int) x) + 1 : 0; /* 1-based like generic_fls(); fls(0) == 0 */ +} + /* * ffs: find first bit set. This is defined the same way as the libc and compiler builtin * ffs routines, therefore differs in spirit from the above ffz (man ffs): it operates on @@ -368,8 +404,53 @@ found_middle: */ #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) +/* + * Find next bit in a bitmap reasonably efficiently.. + */ +static inline int +find_next_bit (void *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = ((unsigned long *) addr) + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + found_first: + tmp &= ~0UL >> (64-size); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope.
*/ + found_middle: + return result + __ffs(tmp); +} + +#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) + #ifdef __KERNEL__ +#define __clear_bit(nr, addr) clear_bit(nr, addr) + #define ext2_set_bit test_and_set_bit #define ext2_clear_bit test_and_clear_bit #define ext2_test_bit test_bit @@ -383,6 +464,16 @@ found_middle: #define minix_test_bit(nr,addr) test_bit(nr,addr) #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) +static inline int +sched_find_first_bit (unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return 64 + __ffs(b[1]); + return __ffs(b[2]) + 128; +} + #endif /* __KERNEL__ */ #endif /* _ASM_IA64_BITOPS_H */ --- linux/include/asm-mips64/bitops.h.orig +++ linux/include/asm-mips64/bitops.h @@ -19,6 +19,7 @@ #include #include +#include /* * set_bit - Atomically set a bit in memory @@ -30,7 +31,8 @@ * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void set_bit(unsigned long nr, volatile void *addr) +extern __inline__ void +set_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp; @@ -54,7 +56,7 @@ static inline void set_bit(unsigned long * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(int nr, volatile void * addr) +extern __inline__ void __set_bit(int nr, volatile void * addr) { unsigned long * m = ((unsigned long *) addr) + (nr >> 6); @@ -71,7 +73,8 @@ static inline void __set_bit(int nr, vol * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() * in order to ensure changes are visible on other processors. 
*/ -static inline void clear_bit(unsigned long nr, volatile void *addr) +extern __inline__ void +clear_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp; @@ -97,7 +100,8 @@ static inline void clear_bit(unsigned lo * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(unsigned long nr, volatile void *addr) +extern __inline__ void +change_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp; @@ -120,7 +124,7 @@ static inline void change_bit(unsigned l * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(int nr, volatile void * addr) +extern __inline__ void __change_bit(int nr, volatile void * addr) { unsigned long * m = ((unsigned long *) addr) + (nr >> 6); @@ -135,8 +139,8 @@ static inline void __change_bit(int nr, * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline unsigned long test_and_set_bit(unsigned long nr, - volatile void *addr) +extern __inline__ unsigned long +test_and_set_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp, res; @@ -168,7 +172,8 @@ static inline unsigned long test_and_set * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(int nr, volatile void *addr) +extern __inline__ int +__test_and_set_bit(int nr, volatile void * addr) { unsigned long mask, retval; long *a = (unsigned long *) addr; @@ -189,8 +194,8 @@ static inline int __test_and_set_bit(int * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
*/ -static inline unsigned long test_and_clear_bit(unsigned long nr, - volatile void *addr) +extern __inline__ unsigned long +test_and_clear_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp, res; @@ -223,7 +228,8 @@ static inline unsigned long test_and_cle * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_clear_bit(int nr, volatile void * addr) +extern __inline__ int +__test_and_clear_bit(int nr, volatile void * addr) { unsigned long mask, retval; unsigned long *a = (unsigned long *) addr; @@ -244,8 +250,8 @@ static inline int __test_and_clear_bit(i * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline unsigned long test_and_change_bit(unsigned long nr, - volatile void *addr) +extern __inline__ unsigned long +test_and_change_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp, res; @@ -277,7 +283,8 @@ static inline unsigned long test_and_cha * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. 
*/ -static inline int __test_and_change_bit(int nr, volatile void *addr) +extern __inline__ int +__test_and_change_bit(int nr, volatile void * addr) { unsigned long mask, retval; unsigned long *a = (unsigned long *) addr; @@ -294,7 +301,8 @@ static inline int __test_and_change_bit( * @nr: bit number to test * @addr: Address to start counting from */ -static inline unsigned long test_bit(int nr, volatile void * addr) +extern __inline__ unsigned long +test_bit(int nr, volatile void * addr) { return 1UL & (((volatile unsigned long *) addr)[nr >> 6] >> (nr & 0x3f)); } @@ -311,7 +319,8 @@ static inline unsigned long test_bit(int * Returns the bit-number of the first zero bit, not the number of the byte * containing a bit. */ -static inline int find_first_zero_bit (void *addr, unsigned size) +extern __inline__ int +find_first_zero_bit (void *addr, unsigned size) { unsigned long dummy; int res; @@ -347,7 +356,8 @@ static inline int find_first_zero_bit (v "2:" : "=r" (res), "=r" (dummy), "=r" (addr) : "0" ((signed int) 0), "1" ((unsigned int) 0xffffffff), - "2" (addr), "r" (size)); + "2" (addr), "r" (size) + : "$1"); return res; } @@ -358,7 +368,8 @@ static inline int find_first_zero_bit (v * @offset: The bitnumber to start searching at * @size: The maximum size to search */ -static inline int find_next_zero_bit (void * addr, int size, int offset) +extern __inline__ int +find_next_zero_bit (void * addr, int size, int offset) { unsigned int *p = ((unsigned int *) addr) + (offset >> 5); int set = 0, bit = offset & 31, res; @@ -379,7 +390,8 @@ static inline int find_next_zero_bit (vo ".set\treorder\n" "1:" : "=r" (set), "=r" (dummy) - : "0" (0), "1" (1 << bit), "r" (*p)); + : "0" (0), "1" (1 << bit), "r" (*p) + : "$1"); if (set < (32 - bit)) return set + offset; set = 32 - bit; @@ -400,19 +412,20 @@ static inline int find_next_zero_bit (vo * * Undefined if no zero exists, so code should check against ~0UL first. 
*/ -static __inline__ unsigned long ffz(unsigned long word) +extern __inline__ unsigned long ffz(unsigned long word) { - int b = 0, s; + unsigned long k; word = ~word; - s = 32; if (word << 32 != 0) s = 0; b += s; word >>= s; - s = 16; if (word << 48 != 0) s = 0; b += s; word >>= s; - s = 8; if (word << 56 != 0) s = 0; b += s; word >>= s; - s = 4; if (word << 60 != 0) s = 0; b += s; word >>= s; - s = 2; if (word << 62 != 0) s = 0; b += s; word >>= s; - s = 1; if (word << 63 != 0) s = 0; b += s; + k = 63; + if (word & 0x00000000ffffffffUL) { k -= 32; word <<= 32; } + if (word & 0x0000ffff00000000UL) { k -= 16; word <<= 16; } + if (word & 0x00ff000000000000UL) { k -= 8; word <<= 8; } + if (word & 0x0f00000000000000UL) { k -= 4; word <<= 4; } + if (word & 0x3000000000000000UL) { k -= 2; word <<= 2; } + if (word & 0x4000000000000000UL) { k -= 1; } - return b; + return k; } #ifdef __KERNEL__ @@ -450,8 +463,8 @@ static __inline__ unsigned long ffz(unsi * @offset: The bitnumber to start searching at * @size: The maximum size to search */ -static inline unsigned long find_next_zero_bit(void *addr, unsigned long size, - unsigned long offset) +extern __inline__ unsigned long +find_next_zero_bit(void *addr, unsigned long size, unsigned long offset) { unsigned long *p = ((unsigned long *) addr) + (offset >> 6); unsigned long result = offset & ~63UL; @@ -498,7 +511,8 @@ found_middle: #ifdef __MIPSEB__ -static inline int ext2_set_bit(int nr,void * addr) +extern inline int +ext2_set_bit(int nr,void * addr) { int mask, retval, flags; unsigned char *ADDR = (unsigned char *) addr; @@ -512,7 +526,8 @@ static inline int ext2_set_bit(int nr,vo return retval; } -static inline int ext2_clear_bit(int nr, void * addr) +extern inline int +ext2_clear_bit(int nr, void * addr) { int mask, retval, flags; unsigned char *ADDR = (unsigned char *) addr; @@ -526,7 +541,8 @@ static inline int ext2_clear_bit(int nr, return retval; } -static inline int ext2_test_bit(int nr, const void * addr) +extern 
inline int +ext2_test_bit(int nr, const void * addr) { int mask; const unsigned char *ADDR = (const unsigned char *) addr; @@ -539,9 +555,8 @@ static inline int ext2_test_bit(int nr, #define ext2_find_first_zero_bit(addr, size) \ ext2_find_next_zero_bit((addr), (size), 0) -static inline unsigned int ext2_find_next_zero_bit(void *addr, - unsigned long size, - unsigned long offset) +extern inline unsigned int +ext2_find_next_zero_bit(void *addr, unsigned long size, unsigned long offset) { unsigned int *p = ((unsigned int *) addr) + (offset >> 5); unsigned int result = offset & ~31UL; --- linux/include/asm-s390/bitops.h.orig +++ linux/include/asm-s390/bitops.h @@ -47,272 +47,217 @@ extern const char _oi_bitmap[]; extern const char _ni_bitmap[]; extern const char _zb_findmap[]; +extern const char _sb_findmap[]; #ifdef CONFIG_SMP /* * SMP save set_bit routine based on compare and swap (CS) */ -static __inline__ void set_bit_cs(int nr, volatile void * addr) +static inline void set_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" /* make OR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " or %2,%3\n" /* set bit */ - " cs %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make OR mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " or %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save clear_bit routine based on compare and swap (CS) */ -static __inline__ void clear_bit_cs(int nr, volatile void * addr) +static inline void clear_bit_cs(int nr, volatile void *ptr) { - static const int minusone = -1; - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 3) << 3; /* add alignment to bit number */ + addr ^= addr & 3; /* align address to 4 */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" - " x %3,%4\n" /* make AND mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " nr %2,%3\n" /* clear bit */ - " cs %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) - : "m" (minusone) : "cc", "memory" ); + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 31)); /* make AND mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " nr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save change_bit routine based on compare and swap (CS) */ -static __inline__ void change_bit_cs(int nr, volatile void * addr) +static inline void change_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 3) << 3; /* add alignment to bit number */ + addr ^= addr & 3; /* align address to 4 */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" /* make XR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " xr %2,%3\n" /* change bit */ - " cs %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make XOR mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " xr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save test_and_set_bit routine based on compare and swap (CS) */ -static __inline__ int test_and_set_bit_cs(int nr, volatile void * addr) +static inline int test_and_set_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 3) << 3; /* add alignment to bit number */ + addr ^= addr & 3; /* align address to 4 */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" /* make OR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " or %2,%3\n" /* set bit */ - " cs %0,%2,0(%1)\n" - " jl 0b\n" - " nr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make OR/test mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " or %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } /* * SMP save test_and_clear_bit routine based on compare and swap (CS) */ -static __inline__ int test_and_clear_bit_cs(int nr, volatile void * addr) +static inline int test_and_clear_bit_cs(int nr, volatile void *ptr) { - static const int minusone = -1; - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 3) << 3; /* add alignment to bit number */ + addr ^= addr & 3; /* align address to 4 */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" - " l %0,0(%1)\n" - " x %3,%4\n" /* make AND mask */ - "0: lr %2,%0\n" /* CS loop starts here */ - " nr %2,%3\n" /* clear bit */ - " cs %0,%2,0(%1)\n" - " jl 0b\n" - " x %3,%4\n" - " nr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) - : "m" (minusone) : "cc", "memory" ); - return nr; + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 31)); /* make AND mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " nr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old ^ new) != 0; } /* * SMP save test_and_change_bit routine based on compare and swap (CS) */ -static __inline__ int test_and_change_bit_cs(int nr, volatile void * addr) +static inline int test_and_change_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 3) << 3; /* add alignment to bit number */ + addr ^= addr & 3; /* align address to 4 */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" /* make OR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " xr %2,%3\n" /* change bit */ - " cs %0,%2,0(%1)\n" - " jl 0b\n" - " nr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make XOR mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " xr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } #endif /* CONFIG_SMP */ /* * fast, non-SMP set_bit routine */ -static __inline__ void __set_bit(int nr, volatile void * addr) +static inline void __set_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %0,7\n" - " xr %1,%2\n" - " nr %0,%2\n" - " srl %1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " oc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_set_bit(const int nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory"); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x10" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", 
"memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_set_bit(const int nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3); + switch (nr&7) { + case 0: + asm volatile ("oi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("oi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("oi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("oi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("oi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("oi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("oi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("oi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define set_bit_simple(nr,addr) \ @@ -323,76 +268,58 @@ __constant_set_bit(const int nr, volatil /* * fast, non-SMP clear_bit routine */ -static __inline__ void -__clear_bit(int nr, volatile void * addr) +static inline void +__clear_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi 
%0,7\n" - " xr %1,%2\n" - " nr %0,%2\n" - " srl %1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " nc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_clear_bit(const int nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFE" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFD" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFB" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xF7" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xEF" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xDF" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xBF" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0x7F" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_clear_bit(const int nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3); + switch (nr&7) { + case 0: + asm volatile ("ni 0(%1),0xFE" + : "+m" (*(char *) addr) : "a" 
(addr) : "cc" ); + break; + case 1: + asm volatile ("ni 0(%1),0xFD" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("ni 0(%1),0xFB" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("ni 0(%1),0xF7" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("ni 0(%1),0xEF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("ni 0(%1),0xDF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("ni 0(%1),0xBF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("ni 0(%1),0x7F" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define clear_bit_simple(nr,addr) \ @@ -403,75 +330,57 @@ __constant_clear_bit(const int nr, volat /* * fast, non-SMP change_bit routine */ -static __inline__ void __change_bit(int nr, volatile void * addr) +static inline void __change_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %0,7\n" - " xr %1,%2\n" - " nr %0,%2\n" - " srl %1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " xc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_change_bit(const int nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - 
break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x10" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_change_bit(const int nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3); + switch (nr&7) { + case 0: + asm volatile ("xi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("xi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("xi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("xi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("xi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("xi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("xi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("xi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define change_bit_simple(nr,addr) \ @@ -482,74 +391,54 @@ __constant_change_bit(const int nr, vola /* * fast, non-SMP test_and_set_bit routine */ -static __inline__ int test_and_set_bit_simple(int nr, 
volatile void * addr) +static inline int test_and_set_bit_simple(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %2,7\n" - " xr %1,%3\n" - " nr %2,%3\n" - " srl %1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " oc 0(1,%1),0(%2)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + unsigned long addr; + unsigned char ch; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y) /* * fast, non-SMP test_and_clear_bit routine */ -static __inline__ int test_and_clear_bit_simple(int nr, volatile void * addr) +static inline int test_and_clear_bit_simple(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %2,7\n" - " xr %1,%3\n" - " nr %2,%3\n" - " srl %1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " nc 0(1,%1),0(%2)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y) /* * fast, non-SMP test_and_change_bit routine */ -static __inline__ int test_and_change_bit_simple(int nr, volatile void * addr) +static inline int test_and_change_bit_simple(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned 
long addr; + unsigned char ch; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %2,7\n" - " xr %1,%3\n" - " nr %2,%1\n" - " srl %1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " xc 0(1,%1),0(%2)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y) @@ -574,25 +463,17 @@ static __inline__ int test_and_change_bi * This routine doesn't need to be atomic. */ -static __inline__ int __test_bit(int nr, volatile void * addr) +static inline int __test_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lhi %2,24\n" - " lhi %1,7\n" - " xr %2,%3\n" - " nr %1,%3\n" - " srl %2,3\n" - " ic %0,0(%2,%4)\n" - " srl %0,0(%1)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr) : "cc" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + return (ch >> (nr & 7)) & 1; } -static __inline__ int __constant_test_bit(int nr, volatile void * addr) { +static inline int __constant_test_bit(int nr, volatile void * addr) { return (((volatile char *) addr)[(nr>>3)^3] & (1<<(nr&7))) != 0; } @@ -604,7 +485,7 @@ static __inline__ int __constant_test_bi /* * Find-bit routines.. */ -static __inline__ int find_first_zero_bit(void * addr, unsigned size) +static inline int find_first_zero_bit(void * addr, unsigned size) { unsigned long cmp, count; int res; @@ -642,7 +523,45 @@ static __inline__ int find_first_zero_bi return (res < size) ? 
res : size; } -static __inline__ int find_next_zero_bit (void * addr, int size, int offset) +static inline int find_first_bit(void * addr, unsigned size) +{ + unsigned long cmp, count; + int res; + + if (!size) + return 0; + __asm__(" slr %1,%1\n" + " lr %2,%3\n" + " slr %0,%0\n" + " ahi %2,31\n" + " srl %2,5\n" + "0: c %1,0(%0,%4)\n" + " jne 1f\n" + " ahi %0,4\n" + " brct %2,0b\n" + " lr %0,%3\n" + " j 4f\n" + "1: l %2,0(%0,%4)\n" + " sll %0,3\n" + " lhi %1,0xff\n" + " tml %2,0xffff\n" + " jnz 2f\n" + " ahi %0,16\n" + " srl %2,16\n" + "2: tml %2,0x00ff\n" + " jnz 3f\n" + " ahi %0,8\n" + " srl %2,8\n" + "3: nr %2,%1\n" + " ic %2,0(%2,%5)\n" + " alr %0,%2\n" + "4:" + : "=&a" (res), "=&d" (cmp), "=&a" (count) + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" ); + return (res < size) ? res : size; +} + +static inline int find_next_zero_bit (void * addr, int size, int offset) { unsigned long * p = ((unsigned long *) addr) + (offset >> 5); unsigned long bitvec, reg; @@ -680,11 +599,49 @@ static __inline__ int find_next_zero_bit return (offset + res); } +static inline int find_next_bit (void * addr, int size, int offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + unsigned long bitvec, reg; + int set, bit = offset & 31, res; + + if (bit) { + /* + * Look for set bit in first word + */ + bitvec = (*p) >> bit; + __asm__(" slr %0,%0\n" + " lhi %2,0xff\n" + " tml %1,0xffff\n" + " jnz 0f\n" + " ahi %0,16\n" + " srl %1,16\n" + "0: tml %1,0x00ff\n" + " jnz 1f\n" + " ahi %0,8\n" + " srl %1,8\n" + "1: nr %1,%2\n" + " ic %1,0(%1,%3)\n" + " alr %0,%1" + : "=&d" (set), "+a" (bitvec), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + if (set < (32 - bit)) + return set + offset; + offset += 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr)); + return (offset + res); +} + /* * ffz = Find First Zero in word. 
Undefined if no zero exists, * so code should check against ~0UL first.. */ -static __inline__ unsigned long ffz(unsigned long word) +static inline unsigned long ffz(unsigned long word) { unsigned long reg; int result; @@ -708,40 +665,109 @@ static __inline__ unsigned long ffz(unsi } /* + * __ffs = find first bit in word. Undefined if no bit exists, + * so code should check against 0UL first.. + */ +static inline unsigned long __ffs(unsigned long word) +{ + unsigned long reg, result; + + __asm__(" slr %0,%0\n" + " lhi %2,0xff\n" + " tml %1,0xffff\n" + " jnz 0f\n" + " ahi %0,16\n" + " srl %1,16\n" + "0: tml %1,0x00ff\n" + " jnz 1f\n" + " ahi %0,8\n" + " srl %1,8\n" + "1: nr %1,%2\n" + " ic %1,0(%1,%3)\n" + " alr %0,%1" + : "=&d" (result), "+a" (word), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + return result; +} + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + return find_first_bit(b, 140); +} + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ -extern int __inline__ ffs (int x) +extern int inline ffs (int x) { - int r; + int r = 1; if (x == 0) - return 0; - __asm__(" slr %0,%0\n" - " tml %1,0xffff\n" + return 0; + __asm__(" tml %1,0xffff\n" " jnz 0f\n" - " ahi %0,16\n" " srl %1,16\n" + " ahi %0,16\n" "0: tml %1,0x00ff\n" " jnz 1f\n" - " ahi %0,8\n" " srl %1,8\n" + " ahi %0,8\n" "1: tml %1,0x000f\n" " jnz 2f\n" - " ahi %0,4\n" " srl %1,4\n" + " ahi %0,4\n" "2: tml %1,0x0003\n" " jnz 3f\n" - " ahi %0,2\n" " srl %1,2\n" + " ahi %0,2\n" "3: tml %1,0x0001\n" " jnz 4f\n" " ahi %0,1\n" "4:" - : "=&d" (r), "+d" (x) : : "cc" ); - return r+1; + : "+d" (r), "+d" (x) : : "cc" ); + return r; +} + +/* + * fls: find last bit set. 
+ */ +extern __inline__ int fls(int x) +{ + int r = 32; + + if (x == 0) + return 0; + __asm__(" tmh %1,0xffff\n" + " jnz 0f\n" + " sll %1,16\n" + " ahi %0,-16\n" + "0: tmh %1,0xff00\n" + " jnz 1f\n" + " sll %1,8\n" + " ahi %0,-8\n" + "1: tmh %1,0xf000\n" + " jnz 2f\n" + " sll %1,4\n" + " ahi %0,-4\n" + "2: tmh %1,0xc000\n" + " jnz 3f\n" + " sll %1,2\n" + " ahi %0,-2\n" + "3: tmh %1,0x8000\n" + " jnz 4f\n" + " ahi %0,-1\n" + "4:" + : "+d" (r), "+d" (x) : : "cc" ); + return r; } /* @@ -769,7 +795,7 @@ extern int __inline__ ffs (int x) #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^24, addr) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^24, addr) #define ext2_test_bit(nr, addr) test_bit((nr)^24, addr) -static __inline__ int ext2_find_first_zero_bit(void *vaddr, unsigned size) +static inline int ext2_find_first_zero_bit(void *vaddr, unsigned size) { unsigned long cmp, count; int res; @@ -808,7 +834,7 @@ static __inline__ int ext2_find_first_ze return (res < size) ? res : size; } -static __inline__ int +static inline int ext2_find_next_zero_bit(void *vaddr, unsigned size, unsigned offset) { unsigned long *addr = vaddr; --- linux/include/asm-s390x/bitops.h.orig +++ linux/include/asm-s390x/bitops.h @@ -51,271 +51,220 @@ extern const char _oi_bitmap[]; extern const char _ni_bitmap[]; extern const char _zb_findmap[]; +extern const char _sb_findmap[]; #ifdef CONFIG_SMP /* * SMP save set_bit routine based on compare and swap (CS) */ -static __inline__ void set_bit_cs(unsigned long nr, volatile void * addr) +static inline void set_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. 
*/ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 7) << 3; /* add alignment to bit number */ + addr ^= addr & 7; /* align address to 8 */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sllg %3,%3,0(%2)\n" /* make OR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ogr %2,%3\n" /* set bit */ - " csg %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make OR mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ogr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save clear_bit routine based on compare and swap (CS) */ -static __inline__ void clear_bit_cs(unsigned long nr, volatile void * addr) +static inline void clear_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 7) << 3; /* add alignment to bit number */ + addr ^= addr & 7; /* align address to 8 */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,-2\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " lghi %3,-2\n" - " rllg %3,%3,0(%2)\n" /* make AND mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ngr %2,%3\n" /* clear bit */ - " csg %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 63)); /* make AND mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ngr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save change_bit routine based on compare and swap (CS) */ -static __inline__ void change_bit_cs(unsigned long nr, volatile void * addr) +static inline void change_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 7) << 3; /* add alignment to bit number */ + addr ^= addr & 7; /* align address to 8 */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sllg %3,%3,0(%2)\n" /* make XR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " xgr %2,%3\n" /* change bit */ - " csg %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make XOR mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " xgr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save test_and_set_bit routine based on compare and swap (CS) */ -static __inline__ int -test_and_set_bit_cs(unsigned long nr, volatile void * addr) +static inline int +test_and_set_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 7) << 3; /* add alignment to bit number */ + addr ^= addr & 7; /* align address to 8 */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sllg %3,%3,0(%2)\n" /* make OR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ogr %2,%3\n" /* set bit */ - " csg %0,%2,0(%1)\n" - " jl 0b\n" - " ngr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make OR/test mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ogr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } /* * SMP save test_and_clear_bit routine based on compare and swap (CS) */ -static __inline__ int -test_and_clear_bit_cs(unsigned long nr, volatile void * addr) +static inline int +test_and_clear_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 7) << 3; /* add alignment to bit number */ + addr ^= addr & 7; /* align address to 8 */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,-2\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " rllg %3,%3,0(%2)\n" /* make AND mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ngr %2,%3\n" /* clear bit */ - " csg %0,%2,0(%1)\n" - " jl 0b\n" - " xgr %0,%2\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 63)); /* make AND mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ngr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old ^ new) != 0; } /* * SMP save test_and_change_bit routine based on compare and swap (CS) */ -static __inline__ int -test_and_change_bit_cs(unsigned long nr, volatile void * addr) +static inline int +test_and_change_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + nr += (addr & 7) << 3; /* add byte offset to bit number first */ + addr ^= addr & 7; /* then align address to 8 */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sllg %3,%3,0(%2)\n" /* make OR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " xgr %2,%3\n" /* change bit */ - " csg %0,%2,0(%1)\n" - " jl 0b\n" - " ngr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make XOR mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " xgr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } #endif /* CONFIG_SMP */ /* * fast, non-SMP set_bit routine */ -static __inline__ void __set_bit(unsigned long nr, volatile void * addr) +static inline void __set_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %0,7\n" - " xgr %1,%2\n" - " nr %0,%2\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " oc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "a" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_set_bit(const unsigned long nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory"); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x10" - : "=m" 
(*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_set_bit(const unsigned long nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7); + switch (nr&7) { + case 0: + asm volatile ("oi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("oi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("oi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("oi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("oi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("oi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("oi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("oi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define set_bit_simple(nr,addr) \ @@ -326,76 +275,58 @@ __constant_set_bit(const unsigned long n /* * fast, non-SMP clear_bit routine */ -static __inline__ void -__clear_bit(unsigned long nr, volatile void * addr) +static inline void +__clear_bit(unsigned long nr, volatile 
void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %0,7\n" - " xgr %1,%2\n" - " nr %0,%2\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " nc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_clear_bit(const unsigned long nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFE" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFD" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFB" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xF7" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xEF" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xDF" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xBF" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0x7F" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_clear_bit(const unsigned long nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) 
ptr) + ((nr >> 3) ^ 7); + switch (nr&7) { + case 0: + asm volatile ("ni 0(%1),0xFE" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("ni 0(%1),0xFD" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("ni 0(%1),0xFB" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("ni 0(%1),0xF7" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("ni 0(%1),0xEF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("ni 0(%1),0xDF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("ni 0(%1),0xBF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("ni 0(%1),0x7F" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define clear_bit_simple(nr,addr) \ @@ -406,75 +337,57 @@ __constant_clear_bit(const unsigned long /* * fast, non-SMP change_bit routine */ -static __inline__ void __change_bit(unsigned long nr, volatile void * addr) +static inline void __change_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %0,7\n" - " xgr %1,%2\n" - " nr %0,%2\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " xc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_change_bit(const unsigned long nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - 
break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x10" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_change_bit(const unsigned long nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7); + switch (nr&7) { + case 0: + asm volatile ("xi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("xi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("xi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("xi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("xi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("xi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("xi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("xi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define change_bit_simple(nr,addr) \ @@ 
-485,77 +398,57 @@ __constant_change_bit(const unsigned lon /* * fast, non-SMP test_and_set_bit routine */ -static __inline__ int -test_and_set_bit_simple(unsigned long nr, volatile void * addr) +static inline int +test_and_set_bit_simple(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %2,7\n" - " xgr %1,%3\n" - " nr %2,%3\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " oc 0(1,%1),0(%2)" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + unsigned long addr; + unsigned char ch; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y) /* * fast, non-SMP test_and_clear_bit routine */ -static __inline__ int -test_and_clear_bit_simple(unsigned long nr, volatile void * addr) +static inline int +test_and_clear_bit_simple(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %2,7\n" - " xgr %1,%3\n" - " nr %2,%3\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " nc 0(1,%1),0(%2)" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y) /* * fast, non-SMP test_and_change_bit 
routine */ -static __inline__ int -test_and_change_bit_simple(unsigned long nr, volatile void * addr) +static inline int +test_and_change_bit_simple(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %2,7\n" - " xgr %1,%3\n" - " nr %2,%3\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " xc 0(1,%1),0(%2)" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y) @@ -580,26 +473,18 @@ test_and_change_bit_simple(unsigned long * This routine doesn't need to be atomic. */ -static __inline__ int __test_bit(unsigned long nr, volatile void * addr) +static inline int __test_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lghi %2,56\n" - " lghi %1,7\n" - " xgr %2,%3\n" - " nr %1,%3\n" - " srlg %2,%2,3\n" - " ic %0,0(%2,%4)\n" - " srl %0,0(%1)\n" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr) : "cc" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + return (ch >> (nr & 7)) & 1; } -static __inline__ int -__constant_test_bit(unsigned long nr, volatile void * addr) { +static inline int +__constant_test_bit(unsigned long nr, volatile void *addr) { return (((volatile char *) addr)[(nr>>3)^7] & (1<<(nr&7))) != 0; } @@ -611,7 +496,7 @@ __constant_test_bit(unsigned long nr, vo /* * Find-bit routines.. 
*/ -static __inline__ unsigned long +static inline unsigned long find_first_zero_bit(void * addr, unsigned long size) { unsigned long res, cmp, count; @@ -653,7 +538,49 @@ find_first_zero_bit(void * addr, unsigne return (res < size) ? res : size; } -static __inline__ unsigned long +static inline unsigned long +find_first_bit(void * addr, unsigned long size) +{ + unsigned long res, cmp, count; + + if (!size) + return 0; + __asm__(" slgr %1,%1\n" + " lgr %2,%3\n" + " slgr %0,%0\n" + " aghi %2,63\n" + " srlg %2,%2,6\n" + "0: cg %1,0(%0,%4)\n" + " jne 1f\n" + " aghi %0,8\n" + " brct %2,0b\n" + " lgr %0,%3\n" + " j 5f\n" + "1: lg %2,0(%0,%4)\n" + " sllg %0,%0,3\n" + " clr %2,%1\n" + " jne 2f\n" + " aghi %0,32\n" + " srlg %2,%2,32\n" + "2: lghi %1,0xff\n" + " tmll %2,0xffff\n" + " jnz 3f\n" + " aghi %0,16\n" + " srl %2,16\n" + "3: tmll %2,0x00ff\n" + " jnz 4f\n" + " aghi %0,8\n" + " srl %2,8\n" + "4: ngr %2,%1\n" + " ic %2,0(%2,%5)\n" + " algr %0,%2\n" + "5:" + : "=&a" (res), "=&d" (cmp), "=&a" (count) + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" ); + return (res < size) ? 
res : size; +} + +static inline unsigned long find_next_zero_bit (void * addr, unsigned long size, unsigned long offset) { unsigned long * p = ((unsigned long *) addr) + (offset >> 6); @@ -697,14 +624,56 @@ find_next_zero_bit (void * addr, unsigne return (offset + res); } +static inline unsigned long +find_next_bit (void * addr, unsigned long size, unsigned long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 6); + unsigned long bitvec, reg; + unsigned long set, bit = offset & 63, res; + + if (bit) { + /* + * Look for a set bit in the first word + */ + bitvec = (*p) >> bit; + __asm__(" slgr %0,%0\n" + " ltr %1,%1\n" + " jnz 0f\n" + " aghi %0,32\n" + " srlg %1,%1,32\n" + "0: lghi %2,0xff\n" + " tmll %1,0xffff\n" + " jnz 1f\n" + " aghi %0,16\n" + " srlg %1,%1,16\n" + "1: tmll %1,0x00ff\n" + " jnz 2f\n" + " aghi %0,8\n" + " srlg %1,%1,8\n" + "2: ngr %1,%2\n" + " ic %1,0(%1,%3)\n" + " algr %0,%1" + : "=&d" (set), "+a" (bitvec), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + if (set < (64 - bit)) + return set + offset; + offset += 64 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 64 * (p - (unsigned long *) addr)); + return (offset + res); +} + /* * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. */ -static __inline__ unsigned long ffz(unsigned long word) +static inline unsigned long ffz(unsigned long word) { - unsigned long reg; - int result; + unsigned long reg, result; __asm__(" lhi %2,-1\n" " slgr %0,%0\n" @@ -730,40 +699,112 @@ static __inline__ unsigned long ffz(unsi } /* + * __ffs = find first bit in word. Undefined if no bit exists, + * so code should check against 0UL first.. 
+ */ +static inline unsigned long __ffs (unsigned long word) +{ + unsigned long reg, result; + + __asm__(" slgr %0,%0\n" + " ltr %1,%1\n" + " jnz 0f\n" + " aghi %0,32\n" + " srlg %1,%1,32\n" + "0: lghi %2,0xff\n" + " tmll %1,0xffff\n" + " jnz 1f\n" + " aghi %0,16\n" + " srlg %1,%1,16\n" + "1: tmll %1,0x00ff\n" + " jnz 2f\n" + " aghi %0,8\n" + " srlg %1,%1,8\n" + "2: ngr %1,%2\n" + " ic %1,0(%1,%3)\n" + " algr %0,%1" + : "=&d" (result), "+a" (word), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + return result; +} + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + return find_first_bit(b, 140); +} + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ - -extern int __inline__ ffs (int x) +extern int inline ffs (int x) { - int r; + int r = 1; if (x == 0) - return 0; - __asm__(" slr %0,%0\n" - " tml %1,0xffff\n" + return 0; + __asm__(" tml %1,0xffff\n" " jnz 0f\n" - " ahi %0,16\n" " srl %1,16\n" + " ahi %0,16\n" "0: tml %1,0x00ff\n" " jnz 1f\n" - " ahi %0,8\n" " srl %1,8\n" + " ahi %0,8\n" "1: tml %1,0x000f\n" " jnz 2f\n" - " ahi %0,4\n" " srl %1,4\n" + " ahi %0,4\n" "2: tml %1,0x0003\n" " jnz 3f\n" - " ahi %0,2\n" " srl %1,2\n" + " ahi %0,2\n" "3: tml %1,0x0001\n" " jnz 4f\n" " ahi %0,1\n" "4:" - : "=&d" (r), "+d" (x) : : "cc" ); + : "+d" (r), "+d" (x) : : "cc" ); - return r+1; + return r; +} + +/* + * fls: find last bit set. 
+ */ +extern __inline__ int fls(int x) +{ + int r = 32; + + if (x == 0) + return 0; + __asm__(" tmh %1,0xffff\n" + " jnz 0f\n" + " sll %1,16\n" + " ahi %0,-16\n" + "0: tmh %1,0xff00\n" + " jnz 1f\n" + " sll %1,8\n" + " ahi %0,-8\n" + "1: tmh %1,0xf000\n" + " jnz 2f\n" + " sll %1,4\n" + " ahi %0,-4\n" + "2: tmh %1,0xc000\n" + " jnz 3f\n" + " sll %1,2\n" + " ahi %0,-2\n" + "3: tmh %1,0x8000\n" + " jnz 4f\n" + " ahi %0,-1\n" + "4:" + : "+d" (r), "+d" (x) : : "cc" ); + return r; } /* @@ -791,7 +832,7 @@ extern int __inline__ ffs (int x) #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^56, addr) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^56, addr) #define ext2_test_bit(nr, addr) test_bit((nr)^56, addr) -static __inline__ unsigned long +static inline unsigned long ext2_find_first_zero_bit(void *vaddr, unsigned long size) { unsigned long res, cmp, count; @@ -833,7 +874,7 @@ ext2_find_first_zero_bit(void *vaddr, un return (res < size) ? res : size; } -static __inline__ unsigned long +static inline unsigned long ext2_find_next_zero_bit(void *vaddr, unsigned long size, unsigned long offset) { unsigned long *addr = vaddr; --- linux/include/asm-ppc64/bitops.h.orig +++ linux/include/asm-ppc64/bitops.h @@ -33,7 +33,6 @@ #ifdef __KERNEL__ -#include #include /* @@ -42,12 +41,12 @@ #define smp_mb__before_clear_bit() smp_mb() #define smp_mb__after_clear_bit() smp_mb() -static __inline__ int test_bit(unsigned long nr, __const__ volatile void *addr) +static __inline__ int test_bit(unsigned long nr, __const__ volatile unsigned long *addr) { return (1UL & (((__const__ long *) addr)[nr >> 6] >> (nr & 63))); } -static __inline__ void set_bit(unsigned long nr, volatile void *addr) +static __inline__ void set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1UL << (nr & 0x3f); @@ -63,7 +62,7 @@ static __inline__ void set_bit(unsigned : "cc"); } -static __inline__ void clear_bit(unsigned long nr, volatile void *addr) +static 
__inline__ void clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1UL << (nr & 0x3f); @@ -79,7 +78,7 @@ static __inline__ void clear_bit(unsigne : "cc"); } -static __inline__ void change_bit(unsigned long nr, volatile void *addr) +static __inline__ void change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1UL << (nr & 0x3f); @@ -95,7 +94,7 @@ static __inline__ void change_bit(unsign : "cc"); } -static __inline__ int test_and_set_bit(unsigned long nr, volatile void *addr) +static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old, t; unsigned long mask = 1UL << (nr & 0x3f); @@ -115,7 +114,7 @@ static __inline__ int test_and_set_bit(u return (old & mask) != 0; } -static __inline__ int test_and_clear_bit(unsigned long nr, volatile void *addr) +static __inline__ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old, t; unsigned long mask = 1UL << (nr & 0x3f); @@ -135,7 +134,7 @@ static __inline__ int test_and_clear_bit return (old & mask) != 0; } -static __inline__ int test_and_change_bit(unsigned long nr, volatile void *addr) +static __inline__ int test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old, t; unsigned long mask = 1UL << (nr & 0x3f); @@ -158,7 +157,7 @@ static __inline__ int test_and_change_bi /* * non-atomic versions */ -static __inline__ void __set_bit(unsigned long nr, volatile void *addr) +static __inline__ void __set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -166,7 +165,7 @@ static __inline__ void __set_bit(unsigne *p |= mask; } -static __inline__ void __clear_bit(unsigned long nr, volatile void *addr) +static __inline__ void __clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); 
unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -174,7 +173,7 @@ static __inline__ void __clear_bit(unsig *p &= ~mask; } -static __inline__ void __change_bit(unsigned long nr, volatile void *addr) +static __inline__ void __change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -182,7 +181,7 @@ static __inline__ void __change_bit(unsi *p ^= mask; } -static __inline__ int __test_and_set_bit(unsigned long nr, volatile void *addr) +static __inline__ int __test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -192,7 +191,7 @@ static __inline__ int __test_and_set_bit return (old & mask) != 0; } -static __inline__ int __test_and_clear_bit(unsigned long nr, volatile void *addr) +static __inline__ int __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -202,7 +201,7 @@ static __inline__ int __test_and_clear_b return (old & mask) != 0; } -static __inline__ int __test_and_change_bit(unsigned long nr, volatile void *addr) +static __inline__ int __test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -224,54 +223,29 @@ static __inline__ int __ilog2(unsigned l return 63 - lz; } -/* Return the zero-based bit position - * from RIGHT TO LEFT 63 --> 0 - * of the most significant (left-most) 1-bit in an 8-byte area. - */ -static __inline__ long cnt_trailing_zeros(unsigned long mask) -{ - long cnt; - - asm( -" addi %0,%1,-1 \n\ - andc %0,%0,%1 \n\ - cntlzd %0,%0 \n\ - subfic %0,%0,64" - : "=r" (cnt) - : "r" (mask)); - return cnt; -} - - - /* - * ffz = Find First Zero in word. 
Undefined if no zero exists, - * Determines the bit position of the LEAST significant - * (rightmost) 0 bit in the specified DOUBLE-WORD. - * The returned bit position will be zero-based, starting - * from the right side (63 - 0). - * the code should check against ~0UL first.. + * Determines the bit position of the least significant (rightmost) 0 bit + * in the specified double word. The returned bit position will be zero-based, + * starting from the right side (63 - 0). */ static __inline__ unsigned long ffz(unsigned long x) { - u32 tempRC; - - /* Change all of x's 1s to 0s and 0s to 1s in x. - * And insure at least 1 zero exists in the 8 byte area. - */ + /* no zero exists anywhere in the 8 byte area. */ if ((x = ~x) == 0) - /* no zero exists anywhere in the 8 byte area. */ return 64; - /* Calculate the bit position of the least significant '1' bit in x - * (since x has been changed this will actually be the least - * significant '0' bit in the original x). - * Note: (x & -x) gives us a mask that is the LEAST significant - * (RIGHT-most) 1-bit of the value in x. + /* + * Calculate the bit position of the least significant '1' bit in x + * (since x has been changed this will actually be the least significant + * '0' bit in the original x). Note: (x & -x) gives us a mask that + * is the least significant (RIGHT-most) 1-bit of the value in x. */ - tempRC = __ilog2(x & -x); + return __ilog2(x & -x); +} - return tempRC; +static __inline__ int __ffs(unsigned long x) +{ + return __ilog2(x & -x); } /* @@ -281,8 +255,8 @@ static __inline__ unsigned long ffz(unsi */ static __inline__ int ffs(int x) { - int result = ffz(~x); - return x ? 
result+1 : 0; + unsigned long i = (unsigned long)x; + return __ilog2(i & -i) + 1; } /* @@ -293,139 +267,82 @@ static __inline__ int ffs(int x) #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) -extern unsigned long find_next_zero_bit(void * addr, unsigned long size, - unsigned long offset); -/* - * The optimizer actually does good code for this case.. - */ -#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) +extern unsigned long find_next_zero_bit(unsigned long *addr, unsigned long size, unsigned long offset); +#define find_first_zero_bit(addr, size) \ + find_next_zero_bit((addr), (size), 0) + +extern unsigned long find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset); +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) + +extern unsigned long find_next_zero_le_bit(unsigned long *addr, unsigned long size, unsigned long offset); +#define find_first_zero_le_bit(addr, size) \ + find_next_zero_le_bit((addr), (size), 0) -/* Bitmap functions for the ext2 filesystem. */ -#define _EXT2_HAVE_ASM_BITOPS_ - -static __inline__ int ext2_set_bit(int nr, void* addr) +static __inline__ int test_le_bit(unsigned long nr, __const__ unsigned long * addr) { - /* This method needs to take into account the fact that the ext2 file system represents - * it's bitmaps as "little endian" unsigned integers. - * Note: this method is not atomic, but ext2 does not need it to be. - */ - int mask; - int oldbit; - unsigned char* ADDR = (unsigned char*) addr; - - /* Determine the BYTE containing the specified bit - * (nr) - important as if we go to a byte there are no - * little endian concerns. - */ - ADDR += nr >> 3; - mask = 1 << (nr & 0x07); /* Create a mask to the bit within this byte. */ - oldbit = *ADDR & mask; /* Save the bit's previous value. */ - *ADDR |= mask; /* Turn the bit on. */ - return oldbit; /* Return the bit's previous value. 
*/ + __const__ unsigned char *ADDR = (__const__ unsigned char *) addr; + return (ADDR[nr >> 3] >> (nr & 7)) & 1; } -static __inline__ int ext2_clear_bit(int nr, void* addr) +/* + * non-atomic versions + */ +static __inline__ void __set_le_bit(unsigned long nr, unsigned long *addr) { - /* This method needs to take into account the fact that the ext2 file system represents - * | it's bitmaps as "little endian" unsigned integers. - * Note: this method is not atomic, but ext2 does not need it to be. - */ - int mask; - int oldbit; - unsigned char* ADDR = (unsigned char*) addr; - - /* Determine the BYTE containing the specified bit (nr) - * - important as if we go to a byte there are no little endian concerns. - */ - ADDR += nr >> 3; - mask = 1 << (nr & 0x07); /* Create a mask to the bit within this byte. */ - oldbit = *ADDR & mask; /* Save the bit's previous value. */ - *ADDR = *ADDR & ~mask; /* Turn the bit off. */ - return oldbit; /* Return the bit's previous value. */ -} + unsigned char *ADDR = (unsigned char *)addr; -static __inline__ int ext2_test_bit(int nr, __const__ void * addr) -{ - /* This method needs to take into account the fact that the ext2 file system represents - * | it's bitmaps as "little endian" unsigned integers. - * Determine the BYTE containing the specified bit (nr), - * then shift to the right the correct number of bits and return that bit's value. - */ - __const__ unsigned char *ADDR = (__const__ unsigned char *) addr; - return (ADDR[nr >> 3] >> (nr & 7)) & 1; + ADDR += nr >> 3; + *ADDR |= 1 << (nr & 0x07); } -/* Returns the bit position of the most significant 1 bit in a WORD. */ -static __inline__ int ext2_ilog2(unsigned int x) +static __inline__ void __clear_le_bit(unsigned long nr, unsigned long *addr) { - int lz; + unsigned char *ADDR = (unsigned char *)addr; - asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); - return 31 - lz; + ADDR += nr >> 3; + *ADDR &= ~(1 << (nr & 0x07)); } -/* ext2_ffz = ext2's Find First Zero. 
- * Determines the bit position of the LEAST significant (rightmost) 0 bit in the specified WORD. - * The returned bit position will be zero-based, starting from the right side (31 - 0). - */ -static __inline__ int ext2_ffz(unsigned int x) +static __inline__ int __test_and_set_le_bit(unsigned long nr, unsigned long *addr) { - u32 tempRC; - /* Change all of x's 1s to 0s and 0s to 1s in x. And insure at least 1 zero exists in the word. */ - if ((x = ~x) == 0) - /* no zero exists anywhere in the 4 byte area. */ - return 32; - /* Calculate the bit position of the least significant '1' bit in x - * (since x has been changed this will actually be the least - * significant '0' bit in the original x). - * Note: (x & -x) gives us a mask that is the LEAST significant - * (RIGHT-most) 1-bit of the value in x. - */ - tempRC = ext2_ilog2(x & -x); - return tempRC; + int mask, retval; + unsigned char *ADDR = (unsigned char *)addr; + + ADDR += nr >> 3; + mask = 1 << (nr & 0x07); + retval = (mask & *ADDR) != 0; + *ADDR |= mask; + return retval; } -static __inline__ u32 ext2_find_next_zero_bit(void* addr, u32 size, u32 offset) +static __inline__ int __test_and_clear_le_bit(unsigned long nr, unsigned long *addr) { - /* This method needs to take into account the fact that the ext2 file system represents - * | it's bitmaps as "little endian" unsigned integers. - */ - unsigned int *p = ((unsigned int *) addr) + (offset >> 5); - unsigned int result = offset & ~31; - unsigned int tmp; - - if (offset >= size) - return size; - size -= result; - offset &= 31; - if (offset) { - tmp = cpu_to_le32p(p++); - tmp |= ~0U >> (32-offset); /* bug or feature ? 
*/ - if (size < 32) - goto found_first; - if (tmp != ~0) - goto found_middle; - size -= 32; - result += 32; - } - while (size >= 32) { - if ((tmp = cpu_to_le32p(p++)) != ~0) - goto found_middle; - result += 32; - size -= 32; - } - if (!size) - return result; - tmp = cpu_to_le32p(p); -found_first: - tmp |= ~0 << size; - if (tmp == ~0) /* Are any bits zero? */ - return result + size; /* Nope. */ -found_middle: - return result + ext2_ffz(tmp); -} + int mask, retval; + unsigned char *ADDR = (unsigned char *)addr; -#define ext2_find_first_zero_bit(addr, size) ext2_find_next_zero_bit((addr), (size), 0) + ADDR += nr >> 3; + mask = 1 << (nr & 0x07); + retval = (mask & *ADDR) != 0; + *ADDR &= ~mask; + return retval; +} + +#define ext2_set_bit(nr,addr) \ + __test_and_set_le_bit((nr),(unsigned long*)addr) +#define ext2_clear_bit(nr, addr) \ + __test_and_clear_le_bit((nr),(unsigned long*)addr) +#define ext2_test_bit(nr, addr) test_le_bit((nr),(unsigned long*)addr) +#define ext2_find_first_zero_bit(addr, size) \ + find_first_zero_le_bit((unsigned long*)addr, size) +#define ext2_find_next_zero_bit(addr, size, off) \ + find_next_zero_le_bit((unsigned long*)addr, size, off) + +#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr) +#define minix_set_bit(nr,addr) set_bit(nr,addr) +#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr) +#define minix_test_bit(nr,addr) test_bit(nr,addr) +#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) #endif /* __KERNEL__ */ #endif /* _PPC64_BITOPS_H */ --- linux/include/asm-x86_64/processor.h.orig +++ linux/include/asm-x86_64/processor.h @@ -381,11 +381,10 @@ extern unsigned long get_wchan(struct ta /* Note: most of the infrastructure to separate stack and task_struct are already there. When you run out of stack try this first. 
*/ -#define alloc_task_struct() \ - ((struct task_struct *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)) -#define free_task_struct(p) free_pages((unsigned long) (p), 1) -#define get_task_struct(tsk) atomic_inc(&virt_to_page(tsk)->count) +#define __alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1)) +#define __free_task_struct(p) free_pages((unsigned long) (p), 1) + #define init_task (init_task_union.task) #define init_stack (init_task_union.stack) --- linux/include/asm-x86_64/ptrace.h.orig +++ linux/include/asm-x86_64/ptrace.h @@ -32,8 +32,6 @@ /* top of stack page */ #define FRAME_SIZE 168 -#define PTRACE_SETOPTIONS 21 - /* options set using PTRACE_SETOPTIONS */ #define PTRACE_O_TRACESYSGOOD 0x00000001 --- linux/net/ipv4/netfilter/ipt_owner.c.orig +++ linux/net/ipv4/netfilter/ipt_owner.c @@ -14,12 +14,12 @@ static int match_comm(const struct sk_buff *skb, const char *comm) { - struct task_struct *p; + struct task_struct *p, *g; struct files_struct *files; int i; read_lock(&tasklist_lock); - for_each_task(p) { + do_each_thread(g,p) { if(strncmp(p->comm, comm, sizeof(p->comm))) continue; @@ -38,7 +38,7 @@ match_comm(const struct sk_buff *skb, co read_unlock(&files->file_lock); } task_unlock(p); - } + } while_each_thread(g, p); read_unlock(&tasklist_lock); return 0; } @@ -77,12 +77,12 @@ out: static int match_sid(const struct sk_buff *skb, pid_t sid) { - struct task_struct *p; + struct task_struct *p, *g; struct file *file = skb->sk->socket->file; int i, found=0; read_lock(&tasklist_lock); - for_each_task(p) { + do_each_thread(g, p) { struct files_struct *files; if (p->session != sid) continue; @@ -101,8 +101,10 @@ match_sid(const struct sk_buff *skb, pid } task_unlock(p); if(found) - break; - } + goto out; + } while_each_thread(g, p); + +out: read_unlock(&tasklist_lock); return found; --- linux/net/ipv6/netfilter/ip6t_owner.c.orig +++ linux/net/ipv6/netfilter/ip6t_owner.c @@ -49,12 +49,12 @@ out: static int match_sid(const struct sk_buff *skb, 
pid_t sid) { - struct task_struct *p; + struct task_struct *p, *g; struct file *file = skb->sk->socket->file; int i, found=0; read_lock(&tasklist_lock); - for_each_task(p) { + do_each_thread(g, p) { struct files_struct *files; if (p->session != sid) continue; @@ -73,8 +73,9 @@ match_sid(const struct sk_buff *skb, pid } task_unlock(p); if(found) - break; - } + goto out; + } while_each_thread(g, p); +out: read_unlock(&tasklist_lock); return found; --- linux/net/sunrpc/clnt.c.orig +++ linux/net/sunrpc/clnt.c @@ -211,27 +211,27 @@ void rpc_clnt_sigmask(struct rpc_clnt *c /* Turn off various signals */ if (clnt->cl_intr) { - struct k_sigaction *action = current->sig->action; + struct k_sigaction *action = current->sighand->action; if (action[SIGINT-1].sa.sa_handler == SIG_DFL) sigallow |= sigmask(SIGINT); if (action[SIGQUIT-1].sa.sa_handler == SIG_DFL) sigallow |= sigmask(SIGQUIT); } - spin_lock_irqsave(&current->sigmask_lock, irqflags); + spin_lock_irqsave(&current->sighand->siglock, irqflags); *oldset = current->blocked; siginitsetinv(&current->blocked, sigallow & ~oldset->sig[0]); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, irqflags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, irqflags); } void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset) { unsigned long irqflags; - spin_lock_irqsave(&current->sigmask_lock, irqflags); + spin_lock_irqsave(&current->sighand->siglock, irqflags); current->blocked = *oldset; - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, irqflags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, irqflags); } /* --- linux/net/sunrpc/sched.c.orig +++ linux/net/sunrpc/sched.c @@ -992,10 +992,10 @@ rpciod(void *ptr) daemonize(); - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked, sigmask(SIGKILL)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + 
spin_unlock_irq(&current->sighand->siglock); strcpy(current->comm, "rpciod"); @@ -1003,7 +1003,10 @@ rpciod(void *ptr) while (rpciod_users) { if (signalled()) { rpciod_killall(); + spin_lock_irq(&current->sighand->siglock); flush_signals(current); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); } __rpc_schedule(); @@ -1049,9 +1052,9 @@ rpciod_killall(void) } } - spin_lock_irqsave(&current->sigmask_lock, flags); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); } /* @@ -1127,9 +1130,9 @@ rpciod_down(void) } interruptible_sleep_on(&rpciod_killer); } - spin_lock_irqsave(&current->sigmask_lock, flags); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); out: up(&rpciod_sema); MOD_DEC_USE_COUNT; --- linux/net/sunrpc/svc.c.orig +++ linux/net/sunrpc/svc.c @@ -213,9 +213,9 @@ svc_register(struct svc_serv *serv, int } if (!port) { - spin_lock_irqsave(&current->sigmask_lock, flags); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); + spin_lock_irqsave(&current->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); } return error; --- linux/net/khttpd/main.c.orig +++ linux/net/khttpd/main.c @@ -118,11 +118,11 @@ static int MainDaemon(void *cpu_pointer) /* Block all signals except SIGKILL, SIGSTOP and SIGHUP */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); tmpsig = current->blocked; siginitsetinv(&current->blocked, sigmask(SIGKILL) | sigmask(SIGSTOP)| sigmask(SIGHUP)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); if (MainSocket->sk==NULL) @@ -202,11 +202,11 @@ static int ManagementDaemon(void *unused 
daemonize(); /* Block all signals except SIGKILL and SIGSTOP */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); tmpsig = current->blocked; siginitsetinv(&current->blocked, sigmask(SIGKILL) | sigmask(SIGSTOP) ); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); /* main loop */ while (sysctl_khttpd_unload==0) --- linux/net/bluetooth/bnep/core.c.orig +++ linux/net/bluetooth/bnep/core.c @@ -460,8 +460,6 @@ static int bnep_session(void *arg) sigfillset(&current->blocked); flush_signals(current); - current->nice = -15; - set_fs(KERNEL_DS); init_waitqueue_entry(&wait, current); --- linux/net/rxrpc/krxiod.c.orig +++ linux/net/rxrpc/krxiod.c @@ -49,14 +49,10 @@ static int rxrpc_krxiod(void *arg) daemonize(); /* only certain signals are of interest */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked,0); -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,3) recalc_sigpending(); -#else - recalc_sigpending(current); -#endif - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); /* loop around waiting for work to do */ do { @@ -147,9 +143,9 @@ static int rxrpc_krxiod(void *arg) /* discard pending signals */ while (signal_pending(current)) { - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); dequeue_signal(&current->blocked,&sinfo); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); } } while (!rxrpc_krxiod_die); --- linux/net/rxrpc/krxsecd.c.orig +++ linux/net/rxrpc/krxsecd.c @@ -60,14 +60,10 @@ static int rxrpc_krxsecd(void *arg) daemonize(); /* only certain signals are of interest */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked,0); -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,3) recalc_sigpending(); -#else - recalc_sigpending(current); -#endif - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); 
/* loop around waiting for work to do */ do { @@ -121,9 +117,9 @@ static int rxrpc_krxsecd(void *arg) /* discard pending signals */ while (signal_pending(current)) { - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); dequeue_signal(&current->blocked,&sinfo); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); } } while (!die); --- linux/net/rxrpc/krxtimod.c.orig +++ linux/net/rxrpc/krxtimod.c @@ -77,14 +77,10 @@ static int krxtimod(void *arg) complete(&krxtimod_alive); /* only certain signals are of interest */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked,0); -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,3) recalc_sigpending(); -#else - recalc_sigpending(current); -#endif - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); /* loop around looking for things to attend to */ loop: @@ -106,9 +102,9 @@ static int krxtimod(void *arg) while (signal_pending(current)) { siginfo_t sinfo; - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); dequeue_signal(&current->blocked,&sinfo); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); } /* work out the time to elapse before the next event */ --- linux/net/tux/cachemiss.c.orig +++ linux/net/tux/cachemiss.c @@ -113,12 +113,12 @@ static int cachemiss_thread (void *data) sprintf(current->comm, "async IO %d/%d", nr, iot->threads); - spin_lock_irq(&current->sigmask_lock); - ka = current->sig->action + SIGCHLD-1; + spin_lock_irq(&current->sighand->siglock); + ka = current->sighand->action + SIGCHLD-1; ka->sa.sa_handler = SIG_IGN; siginitsetinv(&current->blocked, sigmask(SIGCHLD)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); spin_unlock(&iot->async_lock); #if CONFIG_SMP --- linux/net/tux/cgi.c.orig +++ linux/net/tux/cgi.c @@ -119,10 +119,10 @@ static int exec_usermode(char *program_p /* Allow execve args to 
be in kernel space. */ set_fs(KERNEL_DS); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); flush_signals(current); flush_signal_handlers(current); - spin_unlock_irq(¤t->sigmask_lock); + spin_unlock_irq(¤t->sighand->siglock); for (i = 3; i < current->files->max_fds; i++ ) if (current->files->fd[i]) @@ -206,7 +206,7 @@ pid_t tux_exec_process (char *command, c int ret = 0; struct k_sigaction *ka; - ka = current->sig->action + SIGCHLD-1; + ka = current->sighand->action + SIGCHLD-1; ka->sa.sa_handler = SIG_IGN; if (!param && wait) --- linux/net/tux/extcgi.c.orig +++ linux/net/tux/extcgi.c @@ -226,12 +226,12 @@ static int exec_external_cgi (void *data *envp_p = NULL; - spin_lock_irq(¤t->sigmask_lock); - ka = current->sig->action + SIGPIPE-1; + spin_lock_irq(¤t->sighand->siglock); + ka = current->sighand->action + SIGPIPE-1; ka->sa.sa_handler = SIG_IGN; siginitsetinv(¤t->blocked, sigmask(SIGCHLD)); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); sys_close(0); sys_close(1); sys_close(2); sys_close(3); --- linux/net/tux/logger.c.orig +++ linux/net/tux/logger.c @@ -744,10 +744,10 @@ static int logger_thread (void *data) #endif - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); siginitsetinv(¤t->blocked, 0); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); if (log_buffer) TUX_BUG(); --- linux/net/tux/main.c.orig +++ linux/net/tux/main.c @@ -39,10 +39,10 @@ static void flush_all_requests (threadin void flush_all_signals (void) { - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); flush_signals(current); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); } int nr_requests_used (void) @@ -455,15 +455,15 @@ static int user_req_start_thread (thread for (j = 0; j < 
CONFIG_TUX_NUMSOCKETS; j++) init_waitqueue_entry(ti->wait_event + j, current); - ka = current->sig->action + SIGCHLD-1; + ka = current->sighand->action + SIGCHLD-1; ka->sa.sa_handler = SIG_IGN; /* Block all signals except SIGKILL, SIGSTOP, SIGHUP and SIGCHLD */ - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); siginitsetinv(¤t->blocked, sigmask(SIGKILL) | sigmask(SIGSTOP)| sigmask(SIGHUP) | sigmask(SIGCHLD)); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); if (!tux_listen[cpu][0].proto) { printk(KERN_ERR "no listen socket specified for TUX thread %d, in /proc/net/tux/%d/listen/, aborting.\n", cpu, cpu); --- linux/drivers/net/8139too.c.orig +++ linux/drivers/net/8139too.c @@ -1590,10 +1590,10 @@ static int rtl8139_thread (void *data) daemonize (); reparent_to_init(); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); sigemptyset(¤t->blocked); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); strncpy (current->comm, dev->name, sizeof(current->comm) - 1); current->comm[sizeof(current->comm) - 1] = '\0'; @@ -1605,9 +1605,9 @@ static int rtl8139_thread (void *data) } while (!signal_pending (current) && (timeout > 0)); if (signal_pending (current)) { - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); flush_signals(current); - spin_unlock_irq(¤t->sigmask_lock); + spin_unlock_irq(¤t->sighand->siglock); } if (tp->time_to_die) --- linux/drivers/block/loop.c.orig +++ linux/drivers/block/loop.c @@ -585,10 +585,10 @@ static int loop_thread(void *data) sprintf(current->comm, "loop%d", lo->lo_number); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); sigfillset(¤t->blocked); flush_signals(current); - spin_unlock_irq(¤t->sigmask_lock); + spin_unlock_irq(¤t->sighand->siglock); spin_lock_irq(&lo->lo_lock); lo->lo_state = 
Lo_bound; --- linux/drivers/block/nbd.c.orig +++ linux/drivers/block/nbd.c @@ -105,12 +105,12 @@ static int nbd_xmit(int send, struct soc /* Allow interception of SIGKILL only * Don't allow other signals to interrupt the transmission */ - spin_lock_irqsave(¤t->sigmask_lock, flags); + spin_lock_irqsave(¤t->sighand->siglock, flags); oldset = current->blocked; sigfillset(¤t->blocked); sigdelsetmask(¤t->blocked, sigmask(SIGKILL)); - recalc_sigpending(current); - spin_unlock_irqrestore(¤t->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); do { @@ -133,11 +133,11 @@ static int nbd_xmit(int send, struct soc if (signal_pending(current)) { siginfo_t info; - spin_lock_irqsave(¤t->sigmask_lock, flags); + spin_lock_irqsave(¤t->sighand->siglock, flags); printk(KERN_WARNING "NBD (pid %d: %s) got signal %d\n", current->pid, current->comm, dequeue_signal(¤t->blocked, &info)); - spin_unlock_irqrestore(¤t->sigmask_lock, flags); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); result = -EINTR; break; } @@ -153,10 +153,10 @@ static int nbd_xmit(int send, struct soc buf += result; } while (size > 0); - spin_lock_irqsave(¤t->sigmask_lock, flags); + spin_lock_irqsave(¤t->sighand->siglock, flags); current->blocked = oldset; - recalc_sigpending(current); - spin_unlock_irqrestore(¤t->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); set_fs(oldfs); return result; --- linux/drivers/char/ftape/lowlevel/fdc-io.c.orig 2000-10-16 21:58:51.000000000 +0200 +++ linux/drivers/char/ftape/lowlevel/fdc-io.c @@ -404,11 +404,11 @@ int fdc_interrupt_wait(unsigned int time /* timeout time will be up to USPT microseconds too long ! 
*/ timeout = (1000 * time + FT_USPT - 1) / FT_USPT; - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); old_sigmask = current->blocked; sigfillset(¤t->blocked); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); current->state = TASK_INTERRUPTIBLE; add_wait_queue(&ftape_wait_intr, &wait); @@ -416,10 +416,10 @@ int fdc_interrupt_wait(unsigned int time timeout = schedule_timeout(timeout); } - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); current->blocked = old_sigmask; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); remove_wait_queue(&ftape_wait_intr, &wait); /* the following IS necessary. True: as well --- linux/drivers/char/mwave/mwavedd.c.orig +++ linux/drivers/char/mwave/mwavedd.c @@ -279,7 +279,6 @@ static int mwave_ioctl(struct inode *ino pDrvData->IPCs[ipcnum].bIsHere = FALSE; pDrvData->IPCs[ipcnum].bIsEnabled = TRUE; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) - current->nice = -20; /* boost to provide priority timing */ #else current->priority = 0x28; /* boost to provide priority timing */ #endif --- linux/drivers/char/n_tty.c.orig +++ linux/drivers/char/n_tty.c @@ -810,7 +810,7 @@ static void n_tty_receive_buf(struct tty int is_ignored(int sig) { return (sigismember(¤t->blocked, sig) || - current->sig->action[sig-1].sa.sa_handler == SIG_IGN); + current->sighand->action[sig-1].sa.sa_handler == SIG_IGN); } static void n_tty_set_termios(struct tty_struct *tty, struct termios * old) --- linux/drivers/char/serial_txx927.c.orig +++ linux/drivers/char/serial_txx927.c @@ -1533,7 +1533,6 @@ static void rs_wait_until_sent(struct tt printk("cisr = %d (jiff=%lu)...", cisr, jiffies); #endif current->state = TASK_INTERRUPTIBLE; - current->counter = 0; /* make us low-priority */ schedule_timeout(char_time); if (signal_pending(current)) break; --- 
linux/drivers/char/sysrq.c.orig +++ linux/drivers/char/sysrq.c @@ -282,7 +282,7 @@ static void send_sig_all(int sig) { struct task_struct *p; - for_each_task(p) { + for_each_process(p) { if (p->mm && p->pid != 1) /* Not swapper, init nor kernel thread */ force_sig(sig, p); --- linux/drivers/char/tty_io.c.orig +++ linux/drivers/char/tty_io.c @@ -439,6 +439,7 @@ void do_tty_hangup(void *data) struct task_struct *p; struct list_head *l; int closecount = 0, n; + struct pid *pid; if (!tty) return; @@ -499,19 +500,19 @@ void do_tty_hangup(void *data) "error %d\n", -i); } } - + read_lock(&tasklist_lock); - for_each_task(p) { - if ((tty->session > 0) && (p->session == tty->session) && - p->leader) { - send_sig(SIGHUP,p,1); - send_sig(SIGCONT,p,1); + if (tty->session > 0) + for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) { + if (p->tty == tty) + p->tty = NULL; + if (!p->leader) + continue; + send_sig(SIGHUP, p, 1); + send_sig(SIGCONT, p, 1); if (tty->pgrp > 0) p->tty_old_pgrp = tty->pgrp; } - if (p->tty == tty) - p->tty = NULL; - } read_unlock(&tasklist_lock); tty->flags = 0; @@ -576,6 +577,8 @@ void disassociate_ctty(int on_exit) struct tty_struct *tty = current->tty; struct task_struct *p; int tty_pgrp = -1; + struct list_head *l; + struct pid *pid; if (tty) { tty_pgrp = tty->pgrp; @@ -599,9 +602,8 @@ void disassociate_ctty(int on_exit) tty->pgrp = -1; read_lock(&tasklist_lock); - for_each_task(p) - if (p->session == current->session) - p->tty = NULL; + for_each_task_pid(current->session, PIDTYPE_SID, p, l, pid) + p->tty = NULL; read_unlock(&tasklist_lock); } @@ -1227,12 +1229,15 @@ static void release_dev(struct file * fi */ if (tty_closing || o_tty_closing) { struct task_struct *p; + struct list_head *l; + struct pid *pid; read_lock(&tasklist_lock); - for_each_task(p) { - if (p->tty == tty || (o_tty && p->tty == o_tty)) + for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + p->tty = NULL; + if (o_tty) + for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, 
pid) p->tty = NULL; - } read_unlock(&tasklist_lock); if (redirect == tty || (o_tty && redirect == o_tty)) @@ -1567,11 +1572,12 @@ static int tiocsctty(struct tty_struct * * Steal it away */ struct task_struct *p; + struct list_head *l; + struct pid *pid; read_lock(&tasklist_lock); - for_each_task(p) - if (p->tty == tty) - p->tty = NULL; + for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + p->tty = NULL; read_unlock(&tasklist_lock); } else return -EPERM; @@ -1858,7 +1864,7 @@ static void __do_SAK(void *arg) if (tty->driver.flush_buffer) tty->driver.flush_buffer(tty); read_lock(&tasklist_lock); - for_each_task(p) { + for_each_process(p) { if ((p->tty == tty) || ((session > 0) && (p->session == session))) { send_sig(SIGKILL, p, 1); --- linux/drivers/scsi/cpqfcTSworker.c.orig +++ linux/drivers/scsi/cpqfcTSworker.c @@ -165,8 +165,7 @@ void cpqfcTSWorkerThread(void *host) */ exit_mm(current); - current->session = 1; - current->pgrp = 1; + set_special_pids(1, 1); /* Become as one with the init task */ --- linux/drivers/usb/storage/usb.c.orig +++ linux/drivers/usb/storage/usb.c @@ -324,11 +324,11 @@ static int usb_stor_control_thread(void reparent_to_init(); /* avoid getting signals */ - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); flush_signals(current); sigfillset(¤t->blocked); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); /* set our name for identification purposes */ sprintf(current->comm, "usb-storage-%d", us->host_number); --- linux/drivers/media/video/saa5249.c.orig +++ linux/drivers/media/video/saa5249.c @@ -284,17 +284,17 @@ static void jdelay(unsigned long delay) { sigset_t oldblocked = current->blocked; - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); sigfillset(¤t->blocked); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); current->state = 
TASK_INTERRUPTIBLE; schedule_timeout(delay); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); current->blocked = oldblocked; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); } --- linux/drivers/mtd/devices/blkmtd.c.orig +++ linux/drivers/mtd/devices/blkmtd.c @@ -309,10 +309,10 @@ static int write_queue_task(void *data) daemonize(); strcpy(tsk->comm, "blkmtdd"); tsk->tty = NULL; - spin_lock_irq(&tsk->sigmask_lock); + spin_lock_irq(&tsk->sighand->siglock); sigfillset(&tsk->blocked); - recalc_sigpending(tsk); - spin_unlock_irq(&tsk->sigmask_lock); + recalc_sigpending_tsk(tsk); + spin_unlock_irq(&tsk->sighand->siglock); exit_sighand(tsk); if(alloc_kiovec(1, &iobuf)) { --- linux/drivers/mtd/mtdblock.c.orig +++ linux/drivers/mtd/mtdblock.c @@ -489,10 +489,10 @@ int mtdblock_thread(void *dummy) tsk->flags |= PF_MEMALLOC; strcpy(tsk->comm, "mtdblockd"); tsk->tty = NULL; - spin_lock_irq(&tsk->sigmask_lock); + spin_lock_irq(&tsk->sighand->siglock); sigfillset(&tsk->blocked); - recalc_sigpending(tsk); - spin_unlock_irq(&tsk->sigmask_lock); + recalc_sigpending_tsk(tsk); + spin_unlock_irq(&tsk->sighand->siglock); exit_mm(tsk); exit_files(tsk); exit_sighand(tsk); --- linux/drivers/md/md.c.orig +++ linux/drivers/md/md.c @@ -2930,8 +2930,6 @@ int md_thread(void * arg) * bdflush, otherwise bdflush will deadlock if there are too * many dirty RAID5 blocks. */ - current->policy = SCHED_OTHER; - current->nice = -20; md_unlock_kernel(); complete(thread->event); @@ -3384,11 +3382,6 @@ recheck: "(but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max); - /* - * Resync has low priority. 
- */ - current->nice = 19; - is_mddev_idle(mddev); /* this also initializes IO event counters */ for (m = 0; m < SYNC_MARKS; m++) { mark[m] = jiffies; @@ -3466,16 +3459,13 @@ recheck: currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > sysctl_speed_limit_min) { - current->nice = 19; - if ((currspeed > sysctl_speed_limit_max) || !is_mddev_idle(mddev)) { current->state = TASK_INTERRUPTIBLE; md_schedule_timeout(HZ/4); goto repeat; } - } else - current->nice = -20; + } } printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); err = 0; --- linux/drivers/bluetooth/bt3c_cs.c.orig +++ linux/drivers/bluetooth/bt3c_cs.c @@ -528,19 +528,19 @@ static int bt3c_firmware_load(bt3c_info_ } /* Block signals, everything but SIGKILL/SIGSTOP */ - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); tmpsig = current->blocked; siginitsetinv(¤t->blocked, sigmask(SIGKILL) | sigmask(SIGSTOP)); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); result = waitpid(pid, NULL, __WCLONE); /* Allow signals again */ - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); current->blocked = tmpsig; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); if (result != pid) { printk(KERN_WARNING "bt3c_cs: Waiting for pid %d failed (errno=%d).\n", pid, -result); --- linux/drivers/addon/cipe/device.c.orig +++ linux/drivers/addon/cipe/device.c @@ -223,7 +223,7 @@ static int cipe_isowned(struct cipe *c) tasklist_UNLOCK(); return 1; } - p=p->next_task; + p=next_task(p); } while (p!=current); tasklist_UNLOCK(); return 0; --- linux/drivers/addon/iscsi/iscsi-kernel.h.orig +++ linux/drivers/addon/iscsi/iscsi-kernel.h @@ -65,11 +65,7 @@ typedef struct wait_queue wait_queue_t; # define SIGNAL_IS_PENDING(SIG) sigismember(¤t->signal, (SIG)) # endif -# if ( LINUX_VERSION_CODE >= 
KERNEL_VERSION(2,5,0) ) # define RECALC_PENDING_SIGNALS recalc_sigpending() -# else -# define RECALC_PENDING_SIGNALS recalc_sigpending(current) -# endif #if ( LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,0) ) --- linux/drivers/addon/iscsi/iscsi.c.orig +++ linux/drivers/addon/iscsi/iscsi.c @@ -380,12 +380,8 @@ void iscsi_daemonize(void) reparent_to_init(); /* increase priority like the md driver does for it's kernel threads */ - this_task->policy = SCHED_OTHER; -# ifdef set_user_nice + this_task->policy = SCHED_NORMAL; set_user_nice(this_task, -20); -# else - this_task->nice = -20; -# endif wmb(); # elif ( LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) ) @@ -397,7 +393,7 @@ void iscsi_daemonize(void) */ /* increase priority like the md driver does for it's kernel threads */ - this_task->policy = SCHED_OTHER; + this_task->policy = SCHED_NORMAL; this_task->nice = -20; wmb(); @@ -433,7 +429,7 @@ void iscsi_daemonize(void) */ /* increase priority like the md driver does for it's kernel threads */ - this_task->policy = SCHED_OTHER; + this_task->policy = SCHED_NORMAL; this_task->priority = 40; wmb(); @@ -539,7 +535,7 @@ static int iscsi_handle_signals(iscsi_se * if we got SIGKILL, terminate this session. */ if (signal_pending(current)) { - spin_lock_irq(¤t->sig->siglock); + spin_lock_irq(¤t->sighand->siglock); /* iscsi_drop_session and iscsi_terminate_session signal both * threads, but someone logged in as root may not. 
So, we @@ -578,7 +574,7 @@ static int iscsi_handle_signals(iscsi_se } /* we don't care about any other signals */ flush_signals(current); - spin_unlock_irq(¤t->sig->siglock); + spin_unlock_irq(¤t->sighand->siglock); } return ret; @@ -3475,10 +3471,10 @@ static int iscsi_tx_thread( void *vtaskp goto ThreadExit; /* Block all signals except SIGHUP and SIGKILL */ - spin_lock_irq(¤t->sig->siglock); + spin_lock_irq(¤t->sighand->siglock); siginitsetinv(¤t->blocked, sigmask(SIGKILL) | sigmask(SIGHUP)); RECALC_PENDING_SIGNALS; - spin_unlock_irq(¤t->sig->siglock); + spin_unlock_irq(¤t->sighand->siglock); DEBUG_INIT3("iSCSI: tx thread %d for session %p starting on cpu%d\n", current->pid, session, smp_processor_id()); @@ -6052,10 +6048,10 @@ static int iscsi_rx_thread(void *vtaskp) goto ThreadExit; /* Block all signals except SIGHUP and SIGKILL */ - spin_lock_irq(¤t->sig->siglock); + spin_lock_irq(¤t->sighand->siglock); siginitsetinv(¤t->blocked, sigmask(SIGKILL) | sigmask(SIGHUP)); RECALC_PENDING_SIGNALS; - spin_unlock_irq(¤t->sig->siglock); + spin_unlock_irq(¤t->sighand->siglock); DEBUG_INIT3("iSCSI: rx thread %d for session %p starting on cpu%d\n", current->pid, session, smp_processor_id()); @@ -7047,10 +7043,10 @@ static int iscsi_timer_thread(void *vtas DEBUG_INIT2("iSCSI: timer pid %d starting at %lu\n", iscsi_timer_pid, jiffies); /* Block all signals except SIGKILL */ - spin_lock_irq(¤t->sig->siglock); + spin_lock_irq(¤t->sighand->siglock); siginitsetinv(¤t->blocked, sigmask(SIGKILL)); RECALC_PENDING_SIGNALS; - spin_unlock_irq(¤t->sig->siglock); + spin_unlock_irq(¤t->sighand->siglock); /* wait for the module to initialize */ while (test_bit(0, &init_module_complete) == 0) { --- linux/drivers/addon/iscsi/iscsi_main.c.orig +++ linux/drivers/addon/iscsi/iscsi_main.c @@ -403,9 +403,6 @@ struct proc_dir_entry proc_dir_iscsi = { */ void iscsi_daemonize(void) { - struct task_struct *this_task = current; - -# if ( LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10) ) /* use the 
kernel's daemonize */ daemonize(); @@ -415,54 +412,6 @@ void iscsi_daemonize(void) /* increase priority like the md driver does for it's kernel threads */ wmb(); -# elif ( LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) ) - /* use the kernel's daemonize */ - daemonize(); - - /* We'd like to reparent to init, but don't have a function to do it, and - * symbols like child_reaper aren't exported to modules - */ - - wmb(); - -# else - /* 2.2.18 and later has daemonize(), but it's not always correct, so we do it ourselves. */ - struct fs_struct *fs; - - lock_kernel(); - - /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. - */ - exit_mm(this_task); - - this_task->session = 1; - this_task->pgrp = 1; - this_task->tty = NULL; - - /* Become as one with the init task */ - exit_files(this_task); - this_task->files = init_task.files; - atomic_inc(&this_task->files->count); - - exit_fs(this_task); /* this_task->fs->count--; */ - fs = init_task.fs; - this_task->fs = fs; - atomic_inc(&fs->count); - - /* We'd like to reparent to init, but don't have a function to do it, and - * symbols like child_reaper aren't exported to modules. - */ - - /* increase priority like the md driver does for it's kernel threads */ - this_task->policy = SCHED_OTHER; - this_task->priority = 40; - wmb(); - - unlock_kernel(); -# endif } /* drop an iscsi session */ @@ -509,7 +458,7 @@ static int iscsi_handle_signals(iscsi_se * if we got SIGKILL, terminate this session. */ if (signal_pending(current)) { - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); /* iscsi_drop_session and iscsi_terminate_session signal both * threads, but someone logged in as root may not. 
So, we @@ -548,7 +497,7 @@ static int iscsi_handle_signals(iscsi_se } /* we don't care about any other signals */ flush_signals(current); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sighand->siglock); } return ret; @@ -2873,10 +2822,10 @@ static int iscsi_tx_thread( void *vtaskp mb(); /* Block all signals except SIGHUP and SIGKILL */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked, sigmask(SIGKILL) | sigmask(SIGHUP)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); DEBUG_INIT3("iSCSI: tx thread %d for session %p starting cpu%d\n", current->pid, session, smp_processor_id()); @@ -4342,10 +4291,10 @@ static int iscsi_rx_thread(void *vtaskp) mb(); /* Block all signals except SIGHUP and SIGKILL */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked, sigmask(SIGKILL) | sigmask(SIGHUP)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); DEBUG_INIT3("iSCSI: rx thread %d for session %p, cpu%d\n", current->pid, session, smp_processor_id()); @@ -4971,10 +4920,10 @@ static int iscsi_timer_thread(void *vtas printk("iSCSI: timer thread is pid %d\n", iscsi_timer_pid); /* Block all signals except SIGKILL */ - spin_lock_irq(&current->sigmask_lock); + spin_lock_irq(&current->sighand->siglock); siginitsetinv(&current->blocked, sigmask(SIGKILL)); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); /* wait for the module to initialize */ while (test_bit(0, &init_module_complete) == 0) { --- linux/drivers/sensors/gl518sm.c.orig +++ linux/drivers/sensors/gl518sm.c @@ -679,8 +679,7 @@ int gl518_update_thread(void *c) lock_kernel(); #endif exit_mm(current); - current->session = 1; - current->pgrp = 1; + set_special_pids(1, 1); sigfillset(&current->blocked); 
current->fs->umask = 0; strcpy(current->comm, "gl518sm"); --- linux/arch/i386/boot/setup.S.orig +++ linux/arch/i386/boot/setup.S @@ -1074,9 +1074,14 @@ delay: ret # Descriptor tables +# +# NOTE: if you think the GDT is large, you can make it smaller by just +# defining the KERNEL_CS and KERNEL_DS entries and shifting the gdt +# address down by GDT_ENTRY_KERNEL_CS*8. This puts bogus entries into +# the GDT, but those wont be used so it's not a problem. +# gdt: - .word 0, 0, 0, 0 # dummy - .word 0, 0, 0, 0 # unused + .fill GDT_ENTRY_KERNEL_CS,8,0 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) .word 0 # base address = 0 --- linux/arch/i386/mm/fault.c.orig +++ linux/arch/i386/mm/fault.c @@ -24,6 +24,7 @@ #include #include #include +#include extern void die(const char *,struct pt_regs *,long); @@ -123,7 +124,6 @@ void bust_spinlocks(int yes) } asmlinkage void do_invalid_op(struct pt_regs *, unsigned long); -extern unsigned long idt; /* * This routine handles page faults. It determines the address, @@ -287,7 +287,7 @@ bad_area: if (boot_cpu_data.f00f_bug) { unsigned long nr; - nr = (address - idt) >> 3; + nr = (address - idt_descr.address) >> 3; if (nr == 6) { do_invalid_op(regs, 0); --- linux/arch/i386/mm/init.c.orig +++ linux/arch/i386/mm/init.c @@ -154,7 +154,7 @@ static void __init fixrange_init (unsign for (; (j < PTRS_PER_PMD) && (vaddr != end); pmd++, j++) { if (pmd_none(*pmd)) { pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte))); + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); if (pte != pte_offset_kernel(pmd, 0)) BUG(); } --- linux/arch/i386/kernel/Makefile.orig +++ linux/arch/i386/kernel/Makefile @@ -18,7 +18,7 @@ export-objs := mca.o mtrr.o msr.o cp obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ - pci-dma.o i386_ksyms.o i387.o bluesmoke.o dmi_scan.o + pci-dma.o i386_ksyms.o i387.o bluesmoke.o dmi_scan.o sysenter.o ifdef CONFIG_PCI 
--- linux/arch/i386/kernel/apm.c.orig +++ linux/arch/i386/kernel/apm.c @@ -214,6 +214,7 @@ #include #include #include +#include #include #include @@ -418,6 +419,7 @@ static int broken_psr; static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user * user_list; +static struct desc_struct bad_bios_desc = { 0, 0x00409200 }; static char driver_version[] = "1.16"; /* no spaces */ @@ -603,6 +605,7 @@ static u8 apm_bios_call(u32 func, u32 eb APM_DECL_SEGS unsigned long flags; unsigned long cpus = apm_save_cpus(); + struct desc_struct save_desc_40; __save_flags(flags); APM_DO_CLI; @@ -628,6 +631,8 @@ static u8 apm_bios_call(u32 func, u32 eb apm_restore_cpus(cpus); + cpu_gdt_table[smp_processor_id()][0x40 / 8] = save_desc_40; + return *eax & 0xff; } @@ -649,9 +654,13 @@ static u8 apm_bios_call_simple(u32 func, { u8 error; APM_DECL_SEGS - unsigned long flags; - + unsigned long flags; + int cpu = smp_processor_id(); + struct desc_struct save_desc_40; unsigned long cpus = apm_save_cpus(); + + save_desc_40 = cpu_gdt_table[cpu][0x40 / 8]; + cpu_gdt_table[cpu][0x40 / 8] = bad_bios_desc; __save_flags(flags); APM_DO_CLI; @@ -681,6 +690,8 @@ static u8 apm_bios_call_simple(u32 func, apm_restore_cpus(cpus); + cpu_gdt_table[smp_processor_id()][0x40 / 8] = save_desc_40; + return error; } @@ -1187,6 +1198,11 @@ static void queue_event(apm_event_t even static void set_time(void) { unsigned long flags; + int cpu = smp_processor_id(); + struct desc_struct save_desc_40; + + save_desc_40 = cpu_gdt_table[cpu][0x40 / 8]; + cpu_gdt_table[cpu][0x40 / 8] = bad_bios_desc; if (got_clock_diff) { /* Must know time zone in order to set clock */ save_flags(flags); @@ -1924,6 +1940,8 @@ static struct miscdevice apm_device = { */ static int __init apm_init(void) { + int i; + struct proc_dir_entry *apm_proc; if (apm_info.bios.version == 0) { @@ -1995,37 +2013,39 @@ static int __init apm_init(void) * This is for buggy BIOS's that refer to 
(real mode) segment 0x40 * even though they are called in protected mode. */ - set_base(gdt[APM_40 >> 3], - __va((unsigned long)0x40 << 4)); - _set_limit((char *)&gdt[APM_40 >> 3], 4095 - (0x40 << 4)); + set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); + _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); apm_bios_entry.offset = apm_info.bios.offset; apm_bios_entry.segment = APM_CS; - set_base(gdt[APM_CS >> 3], - __va((unsigned long)apm_info.bios.cseg << 4)); - set_base(gdt[APM_CS_16 >> 3], - __va((unsigned long)apm_info.bios.cseg_16 << 4)); - set_base(gdt[APM_DS >> 3], - __va((unsigned long)apm_info.bios.dseg << 4)); + + for (i = 0; i < NR_CPUS; i++) { + set_base(cpu_gdt_table[i][APM_CS >> 3], + __va((unsigned long)apm_info.bios.cseg << 4)); + set_base(cpu_gdt_table[i][APM_CS_16 >> 3], + __va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_base(cpu_gdt_table[i][APM_DS >> 3], + __va((unsigned long)apm_info.bios.dseg << 4)); #ifndef APM_RELAX_SEGMENTS - if (apm_info.bios.version == 0x100) { + if (apm_info.bios.version == 0x100) { #endif - /* For ASUS motherboard, Award BIOS rev 110 (and others?) */ - _set_limit((char *)&gdt[APM_CS >> 3], 64 * 1024 - 1); - /* For some unknown machine. */ - _set_limit((char *)&gdt[APM_CS_16 >> 3], 64 * 1024 - 1); - /* For the DEC Hinote Ultra CT475 (and others?) */ - _set_limit((char *)&gdt[APM_DS >> 3], 64 * 1024 - 1); + /* For ASUS motherboard, Award BIOS rev 110 (and others?) */ + _set_limit((char *)&cpu_gdt_table[i][APM_CS >> 3], 64 * 1024 - 1); + /* For some unknown machine. */ + _set_limit((char *)&cpu_gdt_table[i][APM_CS_16 >> 3], 64 * 1024 - 1); + /* For the DEC Hinote Ultra CT475 (and others?) 
*/ + _set_limit((char *)&cpu_gdt_table[i][APM_DS >> 3], 64 * 1024 - 1); #ifndef APM_RELAX_SEGMENTS - } else { - _set_limit((char *)&gdt[APM_CS >> 3], - (apm_info.bios.cseg_len - 1) & 0xffff); - _set_limit((char *)&gdt[APM_CS_16 >> 3], - (apm_info.bios.cseg_16_len - 1) & 0xffff); - _set_limit((char *)&gdt[APM_DS >> 3], - (apm_info.bios.dseg_len - 1) & 0xffff); - } + } else { + _set_limit((char *)&cpu_gdt_table[i][APM_CS >> 3], + (apm_info.bios.cseg_len - 1) & 0xffff); + _set_limit((char *)&cpu_gdt_table[i][APM_CS_16 >> 3], + (apm_info.bios.cseg_16_len - 1) & 0xffff); + _set_limit((char *)&cpu_gdt_table[i][APM_DS >> 3], + (apm_info.bios.dseg_len - 1) & 0xffff); + } #endif + } apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info); if (apm_proc) --- linux/arch/i386/kernel/entry.S.orig +++ linux/arch/i386/kernel/entry.S @@ -79,7 +79,7 @@ addr_limit = 12 exec_domain = 16 need_resched = 20 tsk_ptrace = 24 -processor = 52 +cpu = 32 ENOSYS = 38 @@ -652,11 +654,11 @@ ENTRY(sys_call_table) .long SYMBOL_NAME(sys_fremovexattr) .long SYMBOL_NAME(sys_tkill) .long SYMBOL_NAME(sys_sendfile64) /* reserved for sendfile64 */ - .long SYMBOL_NAME(sys_ni_syscall) /* 240 reserved for futex */ - .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_setaffinity */ - .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_getaffinity */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_set_thread_area */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_get_thread_area */ + .long SYMBOL_NAME(sys_futex) /* 240 */ + .long SYMBOL_NAME(sys_sched_setaffinity) + .long SYMBOL_NAME(sys_sched_getaffinity) + .long SYMBOL_NAME(sys_set_thread_area) + .long SYMBOL_NAME(sys_get_thread_area) .long SYMBOL_NAME(sys_ni_syscall) /* 245 sys_io_setup */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_destroy */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_getevents */ @@ -664,8 +666,14 @@ ENTRY(sys_call_table) .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_cancel */ .long SYMBOL_NAME(sys_ni_syscall) /* 250 
sys_alloc_hugepages */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_free_hugepages */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_exit_group */ + .long SYMBOL_NAME(sys_exit_group) .long SYMBOL_NAME(sys_lookup_dcookie) + .long SYMBOL_NAME(sys_ni_syscall) + .long SYMBOL_NAME(sys_ni_syscall) /* 255 sys_epoll_ctl */ + .long SYMBOL_NAME(sys_ni_syscall) /* sys_epoll_wait */ + .long SYMBOL_NAME(sys_ni_syscall) /* sys_remap_file_pages */ + .long SYMBOL_NAME(sys_set_tid_address) + .rept NR_syscalls-(.-sys_call_table)/4 .long SYMBOL_NAME(sys_ni_syscall) --- linux/arch/i386/kernel/head.S.orig +++ linux/arch/i386/kernel/head.S @@ -241,7 +241,7 @@ is386: pushl %ecx # restore original EF 2: movl %eax,%cr0 call check_x87 incb ready - lgdt gdt_descr + lgdt cpu_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers @@ -249,12 +249,7 @@ is386: pushl %ecx # restore original EF movl %eax,%es movl %eax,%fs movl %eax,%gs -#ifdef CONFIG_SMP - movl $(__KERNEL_DS), %eax - movl %eax,%ss # Reload the stack pointer (segment only) -#else - lss stack_start,%esp # Load processor stack -#endif + movl %eax,%ss xorl %eax,%eax lldt %ax cld # gcc2 wants the direction flag cleared at all times @@ -347,30 +342,30 @@ ignore_int: popl %eax iret + /* - * The interrupt descriptor table has room for 256 idt's, - * the global descriptor table is dependent on the number - * of tasks we can have.. + * The IDT and GDT 'descriptors' are a strange 48-bit object + * only used by the lidt and lgdt instructions. 
They are not + * like usual segment descriptors - they consist of a 16-bit + * segment size, and 32-bit linear address value: */ -#define IDT_ENTRIES 256 -#define GDT_ENTRIES (__TSS(NR_CPUS)) - -.globl SYMBOL_NAME(idt) -.globl SYMBOL_NAME(gdt) +.globl SYMBOL_NAME(idt_descr) +.globl SYMBOL_NAME(cpu_gdt_descr) ALIGN - .word 0 -idt_descr: + .word 0 # 32-bit align idt_desc.address + +SYMBOL_NAME(idt_descr): .word IDT_ENTRIES*8-1 # idt contains 256 entries -SYMBOL_NAME(idt): .long SYMBOL_NAME(idt_table) - .word 0 -gdt_descr: +SYMBOL_NAME(cpu_gdt_descr): .word GDT_ENTRIES*8-1 -SYMBOL_NAME(gdt): - .long SYMBOL_NAME(gdt_table) + .long SYMBOL_NAME(cpu_gdt_table) + + .fill NR_CPUS-1,6,0 # space for the other GDT descriptors + /* * This is initialized to create an identity-mapping at 0-8M (for bootup @@ -423,26 +418,42 @@ ENTRY(_stext) ALIGN /* - * This contains typically 140 quadwords, depending on NR_CPUS. - * - * NOTE! Make sure the gdt descriptor in head.S matches this if you - * change anything. + * The Global Descriptor Table contains 28 quadwords, per-CPU. 
*/ -ENTRY(gdt_table) +ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x0000000000000000 /* not used */ - .quad 0x00cf9a000000ffff /* 0x10 kernel 4GB code at 0x00000000 */ - .quad 0x00cf92000000ffff /* 0x18 kernel 4GB data at 0x00000000 */ - .quad 0x00cffa000000ffff /* 0x23 user 4GB code at 0x00000000 */ - .quad 0x00cff2000000ffff /* 0x2b user 4GB data at 0x00000000 */ - .quad 0x0000000000000000 /* not used */ - .quad 0x0000000000000000 /* not used */ + .quad 0x0000000000000000 /* 0x0b reserved */ + .quad 0x0000000000000000 /* 0x13 reserved */ + .quad 0x0000000000000000 /* 0x1b reserved */ + .quad 0x00cffa000000ffff /* 0x23 user 4GB code at 0x00000000 */ + .quad 0x00cff2000000ffff /* 0x2b user 4GB data at 0x00000000 */ + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ + .quad 0x0000000000000000 /* 0x4b reserved */ + .quad 0x0000000000000000 /* 0x53 reserved */ + .quad 0x0000000000000000 /* 0x5b reserved */ + + .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ + .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ + .quad 0x0000000000000000 /* 0x70 TSS descriptor */ + .quad 0x0000000000000000 /* 0x78 LDT descriptor */ + + /* Segments used for calling PnP BIOS */ + .quad 0x00c09a0000000000 /* 0x80 32-bit code */ + .quad 0x00809a0000000000 /* 0x88 16-bit code */ + .quad 0x0080920000000000 /* 0x90 16-bit data */ + .quad 0x0080920000000000 /* 0x98 16-bit data */ + .quad 0x0080920000000000 /* 0xa0 16-bit data */ /* * The APM segments have byte granularity and their bases * and limits are set at run time. 
*/ - .quad 0x0040920000000000 /* 0x40 APM set up for bad BIOS's */ - .quad 0x00409a0000000000 /* 0x48 APM CS code */ - .quad 0x00009a0000000000 /* 0x50 APM CS 16 code (16 bit) */ - .quad 0x0040920000000000 /* 0x58 APM DS data */ - .fill NR_CPUS*4,8,0 /* space for TSS's and LDT's */ + .quad 0x00409a0000000000 /* 0xa8 APM CS code */ + .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */ + .quad 0x0040920000000000 /* 0xb8 APM DS data */ + +#if CONFIG_SMP + .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ +#endif + --- linux/arch/i386/kernel/i386_ksyms.c.orig +++ linux/arch/i386/kernel/i386_ksyms.c @@ -76,7 +76,6 @@ EXPORT_SYMBOL(pm_power_off); EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL_GPL(cpu_khz); EXPORT_SYMBOL(apm_info); -EXPORT_SYMBOL(gdt); EXPORT_SYMBOL(empty_zero_page); #ifdef CONFIG_DEBUG_IOVIRT --- linux/arch/i386/kernel/i387.c.orig +++ linux/arch/i386/kernel/i387.c @@ -554,3 +554,44 @@ int dump_extended_fpu( struct pt_regs *r return fpvalid; } + +int dump_task_fpu( struct task_struct *tsk, struct user_i387_struct *fpu ) +{ + int fpvalid; + + fpvalid = tsk->used_math; + if ( fpvalid ) { + if (tsk == current) unlazy_fpu( tsk ); + if ( cpu_has_fxsr ) { + copy_fpu_fxsave( tsk, fpu ); + } else { + copy_fpu_fsave( tsk, fpu ); + } + } + + return fpvalid; +} + +int dump_task_extended_fpu( struct task_struct *tsk, struct user_fxsr_struct *fpu ) +{ + int fpvalid; + + fpvalid = tsk->used_math && cpu_has_fxsr; + if ( fpvalid ) { + if (tsk == current) unlazy_fpu( tsk ); + memcpy( fpu, &tsk->thread.i387.fxsave, + sizeof(struct user_fxsr_struct) ); + } + + return fpvalid; +} + + +#ifdef CONFIG_SMP +void dump_smp_unlazy_fpu(void) +{ + unlazy_fpu(current); + return; +} +#endif + --- linux/arch/i386/kernel/init_task.c.orig +++ linux/arch/i386/kernel/init_task.c @@ -8,7 +8,8 @@ static struct fs_struct init_fs = INIT_FS; static struct files_struct init_files = INIT_FILES; -static struct signal_struct init_signals = INIT_SIGNALS; +static struct signal_struct 
init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); /* --- linux/arch/i386/kernel/ldt.c.orig +++ linux/arch/i386/kernel/ldt.c @@ -171,7 +171,7 @@ static int write_ldt(void * ptr, unsigne struct mm_struct * mm = current->mm; __u32 entry_1, entry_2, *lp; int error; - struct modify_ldt_ldt_s ldt_info; + struct user_desc ldt_info; error = -EINVAL; if (bytecount != sizeof(ldt_info)) @@ -201,32 +201,17 @@ static int write_ldt(void * ptr, unsigne /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || - (ldt_info.contents == 0 && - ldt_info.read_exec_only == 1 && - ldt_info.seg_32bit == 0 && - ldt_info.limit_in_pages == 0 && - ldt_info.seg_not_present == 1 && - ldt_info.useable == 0 )) { + if (oldmode || LDT_empty(&ldt_info)) { entry_1 = 0; entry_2 = 0; goto install; } } - entry_1 = ((ldt_info.base_addr & 0x0000ffff) << 16) | - (ldt_info.limit & 0x0ffff); - entry_2 = (ldt_info.base_addr & 0xff000000) | - ((ldt_info.base_addr & 0x00ff0000) >> 16) | - (ldt_info.limit & 0xf0000) | - ((ldt_info.read_exec_only ^ 1) << 9) | - (ldt_info.contents << 10) | - ((ldt_info.seg_not_present ^ 1) << 15) | - (ldt_info.seg_32bit << 22) | - (ldt_info.limit_in_pages << 23) | - 0x7000; - if (!oldmode) - entry_2 |= (ldt_info.useable << 20); + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); /* Install the new entry ... 
*/ install: --- linux/arch/i386/kernel/process.c.orig +++ linux/arch/i386/kernel/process.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #include #include @@ -83,7 +85,7 @@ void default_idle(void) { if (current_cpu_data.hlt_works_ok && !hlt_counter) { __cli(); - if (!current->need_resched) + if (!need_resched()) safe_halt(); else __sti(); @@ -131,14 +133,17 @@ static void poll_idle (void) void cpu_idle (void) { /* endless idle loop with no priority at all */ - init_idle(); - current->nice = 20; - current->counter = -100; while (1) { void (*idle)(void) = pm_idle; if (!idle) idle = default_idle; + /* + * We use the last_run timestamp to measure the idleness + * of a CPU. + */ + current->last_run = jiffies; + while (!current->need_resched) idle(); schedule(); @@ -446,11 +451,15 @@ void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; static char buffer[512]; + unsigned int fs = 0, gs = 0; + + savesegment(fs, fs); + savesegment(gs, gs); lookup_symbol(regs->eip,buffer,512); printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); + printk("Pid/TGid: %d/%d, comm: %20s\n", current->pid, current->tgid, current->comm); printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id()); printk("\nEIP is at %s (" UTS_RELEASE ")\n",buffer); if (regs->xcs & 3) @@ -460,8 +469,8 @@ void show_regs(struct pt_regs * regs) regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes); + printk(" DS: %04x ES: %04x FS: %04x GS: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes, fs, gs); __asm__("movl %%cr0, %0": "=r" (cr0)); __asm__("movl %%cr2, %0": "=r" (cr2)); @@ -478,32 +487,42 @@ void show_regs(struct pt_regs * regs) } /* + * This gets run with %ebx containing the + * function to 
call, and %edx containing + * the "args". + */ +extern void kernel_thread_helper(void); +__asm__(".align 4\n" + "kernel_thread_helper:\n\t" + "movl %edx,%eax\n\t" + "pushl %edx\n\t" + "call *%ebx\n\t" + "pushl %eax\n\t" + "call do_exit"); + +/* * Create a kernel thread */ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) { - long retval, d0; + struct task_struct *p; + struct pt_regs regs; - __asm__ __volatile__( - "movl %%esp,%%esi\n\t" - "int $0x80\n\t" /* Linux/i386 system call */ - "cmpl %%esp,%%esi\n\t" /* child or parent? */ - "je 1f\n\t" /* parent - jump */ - /* Load the argument into eax, and push it. That way, it does - * not matter whether the called function is compiled with - * -mregparm or not. */ - "movl %4,%%eax\n\t" - "pushl %%eax\n\t" - "call *%5\n\t" /* call fn */ - "movl %3,%0\n\t" /* exit */ - "int $0x80\n" - "1:\t" - :"=&a" (retval), "=&S" (d0) - :"0" (__NR_clone), "i" (__NR_exit), - "r" (arg), "r" (fn), - "b" (flags | CLONE_VM) - : "memory"); - return retval; + memset(&regs, 0, sizeof(regs)); + + regs.ebx = (unsigned long) fn; + regs.edx = (unsigned long) arg; + + regs.xds = __KERNEL_DS; + regs.xes = __KERNEL_DS; + regs.orig_eax = -1; + regs.eip = (unsigned long) kernel_thread_helper; + regs.xcs = __KERNEL_CS; + regs.eflags = 0x286; + + /* Ok, create the new process.. */ + p = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); + return IS_ERR(p) ? PTR_ERR(p) : p->pid; } /* @@ -519,6 +538,7 @@ void flush_thread(void) struct task_struct *tsk = current; memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. 
*/ @@ -538,7 +558,6 @@ void release_thread(struct task_struct * BUG(); } } - release_x86_irqs(dead_task); } /* @@ -552,11 +571,13 @@ int copy_thread(int nr, unsigned long cl struct task_struct * p, struct pt_regs * regs) { struct pt_regs * childregs; + struct task_struct *tsk; childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1; struct_cpy(childregs, regs); childregs->eax = 0; childregs->esp = esp; + p->set_child_tid = p->clear_child_tid = NULL; p->thread.esp = (unsigned long) childregs; p->thread.esp0 = (unsigned long) (childregs+1); @@ -566,9 +587,31 @@ int copy_thread(int nr, unsigned long cl savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); - unlazy_fpu(current); - struct_cpy(&p->thread.i387, &current->thread.i387); + tsk = current; + unlazy_fpu(tsk); + struct_cpy(&p->thread.i387, &tsk->thread.i387); + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { + struct desc_struct *desc; + struct user_desc info; + int idx; + + if (copy_from_user(&info, (void *)childregs->esi, sizeof(info))) + return -EFAULT; + if (LDT_empty(&info)) + return -EINVAL; + + idx = info.entry_number; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } return 0; } @@ -614,6 +657,25 @@ void dump_thread(struct pt_regs * regs, dump->u_fpvalid = dump_fpu (regs, &dump->i387); } +/* + * Capture the user space registers if the task is not running (in user space) + */ +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) +{ + struct pt_regs ptregs; + + ptregs = *(struct pt_regs *)((unsigned long)tsk + THREAD_SIZE - sizeof(struct pt_regs)); + + ptregs.xcs &= 0xffff; + ptregs.xds &= 0xffff; + ptregs.xes &= 0xffff; + ptregs.xss &= 0xffff; + + elf_core_copy_regs(regs, &ptregs); + + return 1; +} + /* * This special macro can be used to load a debugging register */ @@ -649,7 +711,10 @@ void 
__switch_to(struct task_struct *pre { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - struct tss_struct *tss = init_tss + smp_processor_id(); + int cpu = smp_processor_id(); + struct tss_struct *tss = init_tss + cpu; + + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ unlazy_fpu(prev_p); @@ -659,6 +724,11 @@ void __switch_to(struct task_struct *pre tss->esp0 = next->esp0; /* + * Load the per-thread Thread-Local Storage descriptor. + */ + load_TLS(next, cpu); + + /* * Save away %fs and %gs. No need to save %es and %ds, as * those are always kernel segments while inside the kernel. */ @@ -712,19 +782,27 @@ void __switch_to(struct task_struct *pre asmlinkage int sys_fork(struct pt_regs regs) { - return do_fork(SIGCHLD, regs.esp, &regs, 0); + struct task_struct *p; + + p = do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); + return IS_ERR(p) ? PTR_ERR(p) : p->pid; } asmlinkage int sys_clone(struct pt_regs regs) { + struct task_struct *p; unsigned long clone_flags; unsigned long newsp; + int *parent_tidptr, *child_tidptr; clone_flags = regs.ebx; newsp = regs.ecx; + parent_tidptr = (int *)regs.edx; + child_tidptr = (int *)regs.edi; if (!newsp) newsp = regs.esp; - return do_fork(clone_flags, newsp, &regs, 0); + p = do_fork(clone_flags & ~CLONE_IDLETASK, newsp, &regs, 0, parent_tidptr, child_tidptr); + return IS_ERR(p) ? PTR_ERR(p) : p->pid; } /* @@ -739,7 +817,10 @@ asmlinkage int sys_clone(struct pt_regs */ asmlinkage int sys_vfork(struct pt_regs regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0); + struct task_struct *p; + + p = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); + return IS_ERR(p) ? PTR_ERR(p) : p->pid; } /* @@ -795,3 +876,112 @@ unsigned long get_wchan(struct task_stru } #undef last_sched #undef first_sched + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. 
+ */ +static int get_free_idx(void) +{ + struct thread_struct *t = &current->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(t->tls_array + idx)) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +/* + * Set a given TLS descriptor: + */ +asmlinkage int sys_set_thread_area(struct user_desc *u_info) +{ + struct thread_struct *t = &current->thread; + struct user_desc info; + struct desc_struct *desc; + int cpu, idx; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; + + cpu = smp_processor_id(); + + if (LDT_empty(&info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + load_TLS(t, cpu); + + + return 0; +} + +/* + * Get the current Thread-Local Storage area: + */ + +#define GET_BASE(desc) ( \ + (((desc)->a >> 16) & 0x0000ffff) | \ + (((desc)->b << 16) & 0x00ff0000) | \ + ( (desc)->b & 0xff000000) ) + +#define GET_LIMIT(desc) ( \ + ((desc)->a & 0x0ffff) | \ + ((desc)->b & 0xf0000) ) + +#define GET_32BIT(desc) (((desc)->b >> 23) & 1) +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) + +asmlinkage int sys_get_thread_area(struct user_desc *u_info) +{ + struct user_desc info; + struct desc_struct *desc; + int idx; + + if (get_user(idx, &u_info->entry_number)) + return -EFAULT; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = 
current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + + info.entry_number = idx; + info.base_addr = GET_BASE(desc); + info.limit = GET_LIMIT(desc); + info.seg_32bit = GET_32BIT(desc); + info.contents = GET_CONTENTS(desc); + info.read_exec_only = !GET_WRITABLE(desc); + info.limit_in_pages = GET_LIMIT_PAGES(desc); + info.seg_not_present = !GET_PRESENT(desc); + info.useable = GET_USEABLE(desc); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + --- linux/arch/i386/kernel/ptrace.c.orig +++ linux/arch/i386/kernel/ptrace.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include /* * does not yet catch signals sent when the child dies. @@ -28,7 +30,7 @@ /* determines which flags the user has access to. */ /* 1 = access 0 = no access */ -#define FLAG_MASK 0x00040dd5 +#define FLAG_MASK 0x00044dd5 /* set's the trap flag. */ #define TRAP_FLAG 0x100 @@ -147,6 +149,85 @@ void ptrace_disable(struct task_struct * put_stack_long(child, EFL_OFFSET, tmp); } +/* + * Perform get_thread_area on behalf of the traced child. 
+ */ +static int +ptrace_get_thread_area(struct task_struct *child, + int idx, struct user_desc *user_desc) +{ + struct user_desc info; + struct desc_struct *desc; + +/* + * Get the current Thread-Local Storage area: + */ + +#define GET_BASE(desc) ( \ + (((desc)->a >> 16) & 0x0000ffff) | \ + (((desc)->b << 16) & 0x00ff0000) | \ + ( (desc)->b & 0xff000000) ) + +#define GET_LIMIT(desc) ( \ + ((desc)->a & 0x0ffff) | \ + ((desc)->b & 0xf0000) ) + +#define GET_32BIT(desc) (((desc)->b >> 23) & 1) +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + + info.entry_number = idx; + info.base_addr = GET_BASE(desc); + info.limit = GET_LIMIT(desc); + info.seg_32bit = GET_32BIT(desc); + info.contents = GET_CONTENTS(desc); + info.read_exec_only = !GET_WRITABLE(desc); + info.limit_in_pages = GET_LIMIT_PAGES(desc); + info.seg_not_present = !GET_PRESENT(desc); + info.useable = GET_USEABLE(desc); + + if (copy_to_user(user_desc, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +/* + * Perform set_thread_area on behalf of the traced child. 
+ */ +static int +ptrace_set_thread_area(struct task_struct *child, + int idx, struct user_desc *user_desc) +{ + struct user_desc info; + struct desc_struct *desc; + + if (copy_from_user(&info, user_desc, sizeof(info))) + return -EFAULT; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + if (LDT_empty(&info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + + return 0; +} + asmlinkage int sys_ptrace(long request, long pid, long addr, long data) { struct task_struct *child; @@ -369,7 +450,7 @@ asmlinkage int sys_ptrace(long request, break; } ret = 0; - if ( !child->used_math ) + if (!child->used_math) load_empty_fpu(child); get_fpregs((struct user_i387_struct *)data, child); break; @@ -393,7 +474,7 @@ asmlinkage int sys_ptrace(long request, ret = -EIO; break; } - if ( !child->used_math ) + if (!child->used_math) load_empty_fpu(child); ret = get_fpxregs((struct user_fxsr_struct *)data, child); break; @@ -410,21 +491,22 @@ asmlinkage int sys_ptrace(long request, break; } - case PTRACE_SETOPTIONS: { - if (data & PTRACE_O_TRACESYSGOOD) - child->ptrace |= PT_TRACESYSGOOD; - else - child->ptrace &= ~PT_TRACESYSGOOD; - ret = 0; + case PTRACE_GET_THREAD_AREA: + ret = ptrace_get_thread_area(child, + addr, (struct user_desc *) data); + break; + + case PTRACE_SET_THREAD_AREA: + ret = ptrace_set_thread_area(child, + addr, (struct user_desc *) data); break; - } default: - ret = -EIO; + ret = ptrace_request(child, request, addr, data); break; } out_tsk: - free_task_struct(child); + put_task_struct(child); out: unlock_kernel(); return ret; @@ -433,7 +515,7 @@ out: asmlinkage void syscall_trace(void) { if ((current->ptrace & (PT_PTRACED|PT_TRACESYS)) != - (PT_PTRACED|PT_TRACESYS)) + (PT_PTRACED|PT_TRACESYS)) return; /* the 0x80 provides a way for the tracing parent to distinguish between a syscall stop and SIGTRAP delivery */ @@ 
-451,4 +533,5 @@ asmlinkage void syscall_trace(void) send_sig(current->exit_code, current, 1); current->exit_code = 0; } + recalc_sigpending(); } --- linux/arch/i386/kernel/setup.c.orig +++ linux/arch/i386/kernel/setup.c @@ -750,6 +750,7 @@ static void __init setup_memory_region(v print_memory_map(who); } /* setup_memory_region */ +int allowsysinfo = 1; static void __init parse_cmdline_early (char ** cmdline_p) { @@ -812,6 +813,9 @@ static void __init parse_cmdline_early ( else if (!memcmp(from, "acpismp=force", 13)) enable_acpi_smp_table = 1; + else if (!memcmp(from, "nosysinfo", 9)) + allowsysinfo = 0; + /* * highmem=size forces highmem to be exactly 'size' bytes. * This works even on boxes that have no highmem otherwise. @@ -3058,14 +3062,15 @@ unsigned long cpu_initialized __initdata */ void __init cpu_init (void) { - int nr = smp_processor_id(); - struct tss_struct * t = &init_tss[nr]; + int cpu = smp_processor_id(); + struct tss_struct * t = &init_tss[cpu]; + struct thread_struct *thread = &current->thread; - if (test_and_set_bit(nr, &cpu_initialized)) { - printk(KERN_WARNING "CPU#%d already initialized!\n", nr); + if (test_and_set_bit(cpu, &cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); for (;;) __sti(); } - printk(KERN_INFO "Initializing CPU#%d\n", nr); + printk(KERN_INFO "Initializing CPU#%d\n", cpu); if (cpu_has_vme || cpu_has_tsc || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); @@ -3078,7 +3083,21 @@ void __init cpu_init (void) } #endif - __asm__ __volatile__("lgdt %0": "=m" (gdt_descr)); + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + if (cpu) { + memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); + cpu_gdt_descr[cpu].size = GDT_SIZE-1; + cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; + } + /* + * Set up the per-thread TLS descriptor cache: + */ + memset(thread->tls_array, 0, sizeof(thread->tls_array)); + + __asm__ 
__volatile__("lgdt %0": "=m" (cpu_gdt_descr[cpu])); __asm__ __volatile__("lidt %0": "=m" (idt_descr)); /* @@ -3093,12 +3112,11 @@ void __init cpu_init (void) current->active_mm = &init_mm; if(current->mm) BUG(); - enter_lazy_tlb(&init_mm, current, nr); - - t->esp0 = current->thread.esp0; - set_tss_desc(nr,t); - gdt_table[__TSS(nr)].b &= 0xfffffdff; - load_TR(nr); + enter_lazy_tlb(&init_mm, current, cpu); + t->esp0 = thread->esp0; + set_tss_desc(cpu, t); + cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; + load_TR_desc(); load_LDT(&init_mm.context); /* Clear %fs and %gs. */ --- linux/arch/i386/kernel/signal.c.orig +++ linux/arch/i386/kernel/signal.c @@ -75,11 +75,11 @@ sys_sigsuspend(int history0, int history sigset_t saveset; mask &= _BLOCKABLE; - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); saveset = current->blocked; siginitset(¤t->blocked, mask); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); regs->eax = -EINTR; while (1) { @@ -104,11 +104,11 @@ sys_rt_sigsuspend(sigset_t *unewset, siz return -EFAULT; sigdelsetmask(&newset, ~_BLOCKABLE); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); saveset = current->blocked; current->blocked = newset; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); regs->eax = -EINTR; while (1) { @@ -262,10 +262,10 @@ asmlinkage int sys_sigreturn(unsigned lo goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); current->blocked = set; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); if (restore_sigcontext(regs, &frame->sc, &eax)) goto badframe; @@ -290,10 +290,10 @@ asmlinkage int sys_rt_sigreturn(unsigned goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - 
spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); current->blocked = set; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) goto badframe; @@ -535,9 +535,10 @@ give_sigsegv: */ static void -handle_signal(unsigned long sig, struct k_sigaction *ka, - siginfo_t *info, sigset_t *oldset, struct pt_regs * regs) +handle_signal(unsigned long sig, siginfo_t *info, sigset_t *oldset, struct pt_regs * regs) { + struct k_sigaction *ka = ¤t->sighand->action[sig-1]; + /* Are we from a system call? */ if (regs->orig_eax >= 0) { /* If so, check system call restarting.. */ @@ -568,14 +569,16 @@ handle_signal(unsigned long sig, struct ka->sa.sa_handler = SIG_DFL; if (!(ka->sa.sa_flags & SA_NODEFER)) { - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); sigaddset(¤t->blocked,sig); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); } } +int print_fatal_signals; + /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -584,7 +587,7 @@ handle_signal(unsigned long sig, struct int do_signal(struct pt_regs *regs, sigset_t *oldset) { siginfo_t info; - struct k_sigaction *ka; + int signr; /* * We want the common case to go fast, which @@ -598,98 +601,8 @@ int do_signal(struct pt_regs *regs, sigs if (!oldset) oldset = ¤t->blocked; - for (;;) { - unsigned long signr; - - spin_lock_irq(¤t->sigmask_lock); - signr = dequeue_signal(¤t->blocked, &info); - spin_unlock_irq(¤t->sigmask_lock); - - if (!signr) - break; - - if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { - /* Let the debugger run. 
*/ - current->exit_code = signr; - current->state = TASK_STOPPED; - notify_parent(current, SIGCHLD); - schedule(); - - /* We're back. Did the debugger cancel the sig? */ - if (!(signr = current->exit_code)) - continue; - current->exit_code = 0; - - /* The debugger continued. Ignore SIGSTOP. */ - if (signr == SIGSTOP) - continue; - - /* Update the siginfo structure. Is this good? */ - if (signr != info.si_signo) { - info.si_signo = signr; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = current->p_pptr->pid; - info.si_uid = current->p_pptr->uid; - } - - /* If the (new) signal is now blocked, requeue it. */ - if (sigismember(&current->blocked, signr)) { - send_sig_info(signr, &info, current); - continue; - } - } - - ka = &current->sig->action[signr-1]; - if (ka->sa.sa_handler == SIG_IGN) { - if (signr != SIGCHLD) - continue; - /* Check for SIGCHLD: it's special. */ - while (sys_wait4(-1, NULL, WNOHANG, NULL) > 0) - /* nothing */; - continue; - } - - if (ka->sa.sa_handler == SIG_DFL) { - int exit_code = signr; - - /* Init gets no signals it doesn't want.
*/ - if (current->pid == 1) - continue; - - switch (signr) { - case SIGCONT: case SIGCHLD: case SIGWINCH: case SIGURG: - continue; - - case SIGTSTP: case SIGTTIN: case SIGTTOU: - if (is_orphaned_pgrp(current->pgrp)) - continue; - /* FALLTHRU */ - - case SIGSTOP: { - struct signal_struct *sig; - current->state = TASK_STOPPED; - current->exit_code = signr; - sig = current->p_pptr->sig; - if (sig && !(sig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) - notify_parent(current, SIGCHLD); - schedule(); - continue; - } - - case SIGQUIT: case SIGILL: case SIGTRAP: - case SIGABRT: case SIGFPE: case SIGSEGV: - case SIGBUS: case SIGSYS: case SIGXCPU: case SIGXFSZ: - if (do_coredump(signr, regs)) - exit_code |= 0x80; - /* FALLTHRU */ - - default: - sig_exit(signr, exit_code, &info); - /* NOTREACHED */ - } - } - + signr = get_signal_to_deliver(&info, regs); + if (signr > 0) { /* Reenable any watchpoints before delivering the * signal to user space. The processor register will * have been cleared if the watchpoint triggered @@ -698,7 +611,7 @@ int do_signal(struct pt_regs *regs, sigs __asm__("movl %0,%%db7" : : "r" (current->thread.debugreg[7])); /* Whee! Actually deliver the signal. */ - handle_signal(signr, ka, &info, oldset, regs); + handle_signal(signr, &info, oldset, regs); return 1; } --- linux/arch/i386/kernel/smpboot.c.orig +++ linux/arch/i386/kernel/smpboot.c @@ -308,14 +308,14 @@ static void __init synchronize_tsc_bp (v if (tsc_values[i] < avg) realdelta = -realdelta; - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", - i, realdelta); + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta); } sum += delta; } if (!buggy) printk("passed.\n"); + ; } static void __init synchronize_tsc_ap (void) @@ -365,7 +365,7 @@ void __init smp_callin(void) * (This works even if the APIC is not enabled.) 
*/ phys_id = GET_APIC_ID(apic_read(APIC_ID)); - cpuid = current->processor; + cpuid = cpu(); if (test_and_set_bit(cpuid, &cpu_online_map)) { printk("huh, phys CPU#%d, CPU#%d already present??\n", phys_id, cpuid); @@ -435,6 +435,7 @@ void __init smp_callin(void) */ smp_store_cpu_info(cpuid); + disable_APIC_timer(); /* * Allow the master to continue. */ @@ -465,6 +466,7 @@ int __init start_secondary(void *unused) smp_callin(); while (!atomic_read(&smp_commenced)) rep_nop(); + enable_APIC_timer(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. @@ -499,14 +501,14 @@ extern struct { unsigned short ss; } stack_start; -static int __init fork_by_hand(void) +static struct task_struct * __init fork_by_hand(void) { struct pt_regs regs; /* * don't care about the eip and regs settings since * we'll never reschedule the forked task. */ - return do_fork(CLONE_VM|CLONE_PID, 0, ®s, 0); + return do_fork(CLONE_VM|CLONE_IDLETASK, 0, ®s, 0, NULL, NULL); } /* which physical APIC ID maps to which logical CPU number */ @@ -792,27 +794,17 @@ static void __init do_boot_cpu (int apic * We can't use kernel_thread since we must avoid to * reschedule the child. */ - if (fork_by_hand() < 0) + idle = fork_by_hand(); + if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); - /* - * We remove it from the pidhash and the runqueue - * once we got the process: - */ - idle = init_task.prev_task; - if (!idle) - panic("No idle process for CPU %d", cpu); - - idle->processor = cpu; - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ + init_idle(idle, cpu); map_cpu_to_boot_apicid(cpu, apicid); idle->thread.eip = (unsigned long) start_secondary; - del_from_runqueue(idle); unhash_process(idle); - init_tasks[cpu] = idle; /* start_eip had better be page-aligned! 
*/ start_eip = setup_trampoline(); @@ -925,6 +917,7 @@ static void __init do_boot_cpu (int apic } cycles_t cacheflush_time; +unsigned long cache_decay_ticks; static void smp_tune_scheduling (void) { @@ -958,9 +951,13 @@ static void smp_tune_scheduling (void) cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } + cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", (long)cacheflush_time/(cpu_khz/1000), ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); + printk("task migration cache decay timeout: %ld msecs.\n", + (cache_decay_ticks + 1) * 1000 / HZ); } /* @@ -1026,8 +1023,7 @@ void __init smp_boot_cpus(void) map_cpu_to_boot_apicid(0, boot_cpu_apicid); global_irq_holder = 0; - current->processor = 0; - init_idle(); + current->cpu = 0; smp_tune_scheduling(); /* --- linux/arch/i386/kernel/sys_i386.c.orig 2001-08-21 14:26:08.000000000 +0200 +++ linux/arch/i386/kernel/sys_i386.c @@ -247,10 +247,3 @@ asmlinkage int sys_olduname(struct oldol return error; } -asmlinkage int sys_pause(void) -{ - current->state = TASK_INTERRUPTIBLE; - schedule(); - return -ERESTARTNOHAND; -} - --- linux/arch/i386/kernel/sysenter.c.orig +++ linux/arch/i386/kernel/sysenter.c @@ -0,0 +1,40 @@ +/* + * linux/arch/i386/kernel/sysenter.c + * + * (C) Copyright 2002 Linus Torvalds + * + * This file contains the needed initializations to support AT_SYSINFO + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +extern int allowsysinfo; + +static int __init sysenter_setup(void) +{ + static const char int80[] = { + 0xcd, 0x80, /* int $0x80 */ + 0xc3 /* ret */ + }; + unsigned long page; + + if (!allowsysinfo) + return 0; + + page = get_zeroed_page(GFP_ATOMIC); + + __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY); + memcpy((void *) page, int80, sizeof(int80)); + return 0; +} + +__initcall(sysenter_setup); --- linux/arch/i386/kernel/trampoline.S.orig +++ 
linux/arch/i386/kernel/trampoline.S @@ -61,9 +61,14 @@ idt_48: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L +# +# NOTE: here we actually use CPU#0's GDT - but that is OK, we reload +# the proper GDT shortly after booting up the secondary CPUs. +# + gdt_48: .word 0x0800 # gdt limit = 2048, 256 GDT entries - .long gdt_table-__PAGE_OFFSET # gdt base = gdt (first SMP CPU) + .long cpu_gdt_table-__PAGE_OFFSET # gdt base = gdt (first SMP CPU) .globl SYMBOL_NAME(trampoline_end) SYMBOL_NAME_LABEL(trampoline_end) --- linux/arch/i386/kernel/traps.c.orig +++ linux/arch/i386/kernel/traps.c @@ -489,6 +489,8 @@ static void default_do_nmi(struct pt_reg { unsigned char reason = inb(0x61); + ++nmi_count(smp_processor_id()); + if (!(reason & 0xc0)) { #if CONFIG_X86_LOCAL_APIC /* @@ -528,8 +530,6 @@ asmlinkage void do_nmi(struct pt_regs * { int cpu = smp_processor_id(); - ++nmi_count(cpu); - if (!nmi_callback(regs, cpu)) default_do_nmi(regs); } @@ -828,7 +828,7 @@ void __init trap_init_f00f_bug(void) * update the idt descriptor.. 
*/ __set_fixmap(FIX_F00F, __pa(&idt_table), PAGE_KERNEL_RO); - idt = (struct desc_struct *)__fix_to_virt(FIX_F00F); + idt_descr.address = __fix_to_virt(FIX_F00F); __asm__ __volatile__("lidt %0": "=m" (idt_descr)); } @@ -874,37 +874,6 @@ static void __init set_call_gate(void *a _set_gate(a,12,3,addr); } -#define _set_seg_desc(gate_addr,type,dpl,base,limit) {\ - *((gate_addr)+1) = ((base) & 0xff000000) | \ - (((base) & 0x00ff0000)>>16) | \ - ((limit) & 0xf0000) | \ - ((dpl)<<13) | \ - (0x00408000) | \ - ((type)<<8); \ - *(gate_addr) = (((base) & 0x0000ffff)<<16) | \ - ((limit) & 0x0ffff); } - -#define _set_tssldt_desc(n,addr,limit,type) \ -__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ - "movw %%ax,2(%2)\n\t" \ - "rorl $16,%%eax\n\t" \ - "movb %%al,4(%2)\n\t" \ - "movb %4,5(%2)\n\t" \ - "movb $0,6(%2)\n\t" \ - "movb %%ah,7(%2)\n\t" \ - "rorl $16,%%eax" \ - : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type)) - -void set_tss_desc(unsigned int n, void *addr) -{ - _set_tssldt_desc(gdt_table+__TSS(n), (int)addr, 235, 0x89); -} - -void set_ldt_desc(unsigned int n, void *addr, unsigned int size) -{ - _set_tssldt_desc(gdt_table+__LDT(n), (int)addr, ((size << 3)-1), 0x82); -} - #ifdef CONFIG_X86_VISWS_APIC /* --- linux/arch/i386/kernel/vm86.c.orig +++ linux/arch/i386/kernel/vm86.c @@ -114,6 +114,8 @@ struct pt_regs * save_v86_state(struct k tss = init_tss + smp_processor_id(); tss->esp0 = current->thread.esp0 = current->thread.saved_esp0; current->thread.saved_esp0 = 0; + loadsegment(fs, current->thread.saved_fs); + loadsegment(gs, current->thread.saved_gs); ret = KVM86->regs32; return ret; } @@ -279,6 +281,9 @@ static void do_sys_vm86(struct kernel_vm */ info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; + asm volatile("movl %%fs,%0":"=m" (tsk->thread.saved_fs)); + asm volatile("movl %%gs,%0":"=m" (tsk->thread.saved_gs)); + tss = init_tss + smp_processor_id(); tss->esp0 = tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; @@ -500,10 +505,10 @@ int 
handle_vm86_trap(struct kernel_vm86_ return 1; /* we let this handle by the calling routine */ if (current->ptrace & PT_PTRACED) { unsigned long flags; - spin_lock_irqsave(¤t->sigmask_lock, flags); + spin_lock_irqsave(¤t->sighand->siglock, flags); sigdelset(¤t->blocked, SIGTRAP); - recalc_sigpending(current); - spin_unlock_irqrestore(¤t->sigmask_lock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); } send_sig(SIGTRAP, current, 1); current->thread.trap_no = trapno; @@ -701,16 +706,16 @@ static inline void free_vm86_irq(int irq static inline int task_valid(struct task_struct *tsk) { - struct task_struct *p; + struct task_struct *g, *p; int ret = 0; read_lock(&tasklist_lock); - for_each_task(p) { - if ((p == tsk) && (p->sig)) { + do_each_thread(g, p) + if ((p == tsk) && (p->signal)) { ret = 1; break; } - } + while_each_thread(g, p); read_unlock(&tasklist_lock); return ret; } --- linux/arch/sparc/kernel/process.c.orig +++ linux/arch/sparc/kernel/process.c @@ -74,8 +74,6 @@ int cpu_idle(void) goto out; /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); for (;;) { @@ -128,8 +126,6 @@ out: int cpu_idle(void) { /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); while(1) { --- linux/arch/mips/config-shared.in.orig +++ linux/arch/mips/config-shared.in @@ -618,6 +618,8 @@ fi bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 define_bool CONFIG_KCORE_ELF y define_bool CONFIG_KCORE_AOUT n define_bool CONFIG_BINFMT_AOUT n --- linux/arch/ppc/kernel/idle.c.orig +++ linux/arch/ppc/kernel/idle.c @@ -51,9 +51,8 @@ int idled(void) do_power_save = 1; /* endless loop with no priority at all */ - current->nice = 20; - current->counter 
= -100; init_idle(); + for (;;) { #ifdef CONFIG_SMP if (!do_power_save) { --- linux/arch/ppc/kernel/misc.S.orig +++ linux/arch/ppc/kernel/misc.S @@ -1174,8 +1174,8 @@ _GLOBAL(sys_call_table) .long sys_lremovexattr .long sys_fremovexattr /* 220 */ .long sys_ni_syscall /* reserved for sys_futex */ - .long sys_ni_syscall /* reserved for sys_sched_setaffinity */ - .long sys_ni_syscall /* reserved for sys_sched_getaffinity */ + .long sys_sched_setaffinity + .long sys_sched_getaffinity .long sys_ni_syscall /* reserved for sys_security */ .long sys_ni_syscall /* 225 reserved for Tux */ .long sys_ni_syscall /* reserved for sys_sendfile64 */ --- linux/arch/ppc/8xx_io/uart.c.orig +++ linux/arch/ppc/8xx_io/uart.c @@ -1796,7 +1796,6 @@ static void rs_8xx_wait_until_sent(struc printk("lsr = %d (jiff=%lu)...", lsr, jiffies); #endif current->state = TASK_INTERRUPTIBLE; -/* current->counter = 0; make us low-priority */ schedule_timeout(char_time); if (signal_pending(current)) break; --- linux/arch/ppc/8260_io/uart.c.orig +++ linux/arch/ppc/8260_io/uart.c @@ -1732,7 +1732,6 @@ static void rs_8xx_wait_until_sent(struc printk("lsr = %d (jiff=%lu)...", lsr, jiffies); #endif current->state = TASK_INTERRUPTIBLE; -/* current->counter = 0; make us low-priority */ schedule_timeout(char_time); if (signal_pending(current)) break; --- linux/arch/sparc64/kernel/process.c.orig +++ linux/arch/sparc64/kernel/process.c @@ -53,8 +53,6 @@ int cpu_idle(void) return -EPERM; /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); for (;;) { @@ -83,8 +81,6 @@ int cpu_idle(void) #define unidle_me() (cpu_data[current->processor].idle_volume = 0) int cpu_idle(void) { - current->nice = 20; - current->counter = -100; init_idle(); while(1) { --- linux/arch/sh/kernel/process.c.orig +++ linux/arch/sh/kernel/process.c @@ -40,8 +40,6 @@ void cpu_idle(void *unused) { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - 
current->counter = -100; while (1) { if (hlt_counter) { --- linux/arch/mips64/kernel/process.c.orig +++ linux/arch/mips64/kernel/process.c @@ -35,8 +35,7 @@ ATTRIB_NORET void cpu_idle(void) { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; + while (1) { while (!current->need_resched) if (cpu_wait) --- linux/arch/s390/kernel/process.c.orig +++ linux/arch/s390/kernel/process.c @@ -57,8 +57,7 @@ int cpu_idle(void *unused) /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; + while (1) { if (current->need_resched) { schedule(); --- linux/arch/parisc/kernel/process.c.orig +++ linux/arch/parisc/kernel/process.c @@ -64,8 +64,6 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; while (1) { while (!current->need_resched) { --- linux/arch/s390x/kernel/process.c.orig +++ linux/arch/s390x/kernel/process.c @@ -57,8 +57,7 @@ int cpu_idle(void *unused) /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; + while (1) { if (current->need_resched) { schedule(); --- linux/arch/ppc64/kernel/idle.c.orig +++ linux/arch/ppc64/kernel/idle.c @@ -76,9 +76,6 @@ int idled(void) unsigned long CTRL; #endif - /* endless loop with no priority at all */ - current->nice = 20; - current->counter = -100; #ifdef CONFIG_PPC_ISERIES /* ensure iSeries run light will be out when idle */ current->thread.flags &= ~PPC_FLAG_RUN_LIGHT; @@ -86,6 +83,7 @@ int idled(void) CTRL &= ~RUNLATCH; mtspr(CTRLT, CTRL); #endif + /* endless loop with no priority at all */ init_idle(); lpaca = get_paca(); --- linux/arch/x86_64/ia32/ia32_binfmt.c.orig +++ linux/arch/x86_64/ia32/ia32_binfmt.c @@ -45,6 +45,7 @@ struct elf_phdr; #define _LINUX_ELFCORE_H 1 typedef unsigned int elf_greg_t; + #define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t)) typedef 
elf_greg_t elf_gregset_t[ELF_NGREG]; @@ -95,6 +96,27 @@ struct elf_prpsinfo char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ }; +static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs) +{ +#ifdef ELF_CORE_COPY_REGS + ELF_CORE_COPY_REGS((*elfregs), regs) +#else + BUG_ON(sizeof(*elfregs) != sizeof(*regs)); + *(struct pt_regs *)elfregs = *regs; +#endif +} + +static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) +{ +#ifdef ELF_CORE_COPY_TASK_REGS + + return ELF_CORE_COPY_TASK_REGS(t, elfregs); +#endif + return 0; +} + + + #define __STR(x) #x #define STR(x) __STR(x) --- linux/arch/x86_64/ia32/ia32_signal.c.orig +++ linux/arch/x86_64/ia32/ia32_signal.c @@ -82,11 +82,11 @@ sys32_sigsuspend(int history0, int histo sigset_t saveset; mask &= _BLOCKABLE; - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sig->siglock); saveset = current->blocked; siginitset(¤t->blocked, mask); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sig->siglock); regs.rax = -EINTR; while (1) { @@ -239,10 +239,10 @@ asmlinkage long sys32_sigreturn(struct p goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sig->siglock); current->blocked = set; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sig->siglock); if (ia32_restore_sigcontext(®s, &frame->sc, &eax)) goto badframe; @@ -266,10 +266,10 @@ asmlinkage long sys32_rt_sigreturn(struc goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sig->siglock); current->blocked = set; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sig->siglock); if (ia32_restore_sigcontext(®s, &frame->uc.uc_mcontext, &eax)) goto badframe; --- linux/arch/x86_64/ia32/ptrace32.c.orig +++ 
linux/arch/x86_64/ia32/ptrace32.c @@ -172,13 +172,13 @@ static struct task_struct *find_target(i if (request != PTRACE_KILL) goto out; } - if (child->p_pptr != current) + if (child->parent != current) goto out; return child; } out: - free_task_struct(child); + put_task_struct(child); return NULL; } @@ -324,7 +324,7 @@ asmlinkage long sys32_ptrace(long reques break; } - free_task_struct(child); + put_task_struct(child); return ret; } --- linux/arch/x86_64/kernel/init_task.c.orig +++ linux/arch/x86_64/kernel/init_task.c @@ -9,8 +9,8 @@ static struct fs_struct init_fs = INIT_FS; static struct files_struct init_files = INIT_FILES; -static struct signal_struct init_signals = INIT_SIGNALS; struct mm_struct init_mm = INIT_MM(init_mm); +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); /* * Initial task structure. --- linux/arch/x86_64/kernel/process.c.orig +++ linux/arch/x86_64/kernel/process.c @@ -602,14 +602,14 @@ void set_personality_64bit(void) asmlinkage long sys_fork(struct pt_regs regs) { - return do_fork(SIGCHLD, regs.rsp, ®s, 0); + return do_fork(SIGCHLD, regs.rsp, ®s, 0, NULL); } asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, struct pt_regs regs) { if (!newsp) newsp = regs.rsp; - return do_fork(clone_flags, newsp, ®s, 0); + return do_fork(clone_flags, newsp, ®s, 0, NULL); } /* @@ -624,7 +624,7 @@ asmlinkage long sys_clone(unsigned long */ asmlinkage long sys_vfork(struct pt_regs regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, ®s, 0); + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, ®s, 0, NULL); } /* --- linux/arch/x86_64/kernel/ptrace.c.orig +++ linux/arch/x86_64/kernel/ptrace.c @@ -212,7 +212,7 @@ asmlinkage long sys_ptrace(long request, if (request != PTRACE_KILL) goto out_tsk; } - if (child->p_pptr != current) + if (child->parent != current) goto out_tsk; switch (request) { /* when I and D space are separate, these will need to be fixed. 
*/ @@ -426,7 +426,7 @@ asmlinkage long sys_ptrace(long request, break; } out_tsk: - free_task_struct(child); + put_task_struct(child); out: unlock_kernel(); return ret; --- linux/arch/x86_64/kernel/signal.c.orig +++ linux/arch/x86_64/kernel/signal.c @@ -35,6 +35,9 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) +int print_fatal_signals; + + asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -90,11 +93,11 @@ sys_rt_sigsuspend(sigset_t *unewset, siz return -EFAULT; sigdelsetmask(&newset, ~_BLOCKABLE); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sig->siglock); saveset = current->blocked; current->blocked = newset; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sig->siglock); #if DEBUG_SIG printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", saveset, newset, ®s, regs.rip); @@ -200,10 +203,10 @@ asmlinkage long sys_rt_sigreturn(struct goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sig->siglock); current->blocked = set; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sig->siglock); if (restore_sigcontext(®s, &frame->uc.uc_mcontext, &eax)) goto badframe; @@ -433,11 +436,11 @@ handle_signal(unsigned long sig, struct ka->sa.sa_handler = SIG_DFL; if (!(ka->sa.sa_flags & SA_NODEFER)) { - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sig->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); sigaddset(¤t->blocked,sig); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + recalc_sigpending(); + spin_unlock_irq(¤t->sig->siglock); } } @@ -467,9 +470,9 @@ int do_signal(struct pt_regs *regs, sigs for (;;) { unsigned long signr; - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irq(¤t->sig->siglock); signr = dequeue_signal(¤t->blocked, 
&info); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irq(&current->sig->siglock); if (!signr) { break; @@ -496,8 +499,8 @@ int do_signal(struct pt_regs *regs, sigs info.si_signo = signr; info.si_errno = 0; info.si_code = SI_USER; - info.si_pid = current->p_pptr->pid; - info.si_uid = current->p_pptr->uid; + info.si_pid = current->parent->pid; + info.si_uid = current->parent->uid; } /* If the (new) signal is now blocked, requeue it. */ @@ -537,7 +540,7 @@ int do_signal(struct pt_regs *regs, sigs struct signal_struct *sig; current->state = TASK_STOPPED; current->exit_code = signr; - sig = current->p_pptr->sig; + sig = current->parent->sig; if (sig && !(sig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) notify_parent(current, SIGCHLD); schedule(); --- linux/arch/x86_64/kernel/sys_x86_64.c.orig +++ linux/arch/x86_64/kernel/sys_x86_64.c @@ -116,12 +116,6 @@ asmlinkage long sys_uname(struct new_uts return err?-EFAULT:0; } -asmlinkage long sys_pause(void) -{ - current->state = TASK_INTERRUPTIBLE; - schedule(); - return -ERESTARTNOHAND; -} asmlinkage long wrap_sys_shmat(int shmid, char *shmaddr, int shmflg) { --- linux/Documentation/Configure.help.orig +++ linux/Documentation/Configure.help @@ -4120,6 +4120,38 @@ CONFIG_BINFMT_MISC you have use for it; the module is called binfmt_misc.o. If you don't know what to answer at this point, say Y. +Maximum User Real-Time Priority +CONFIG_MAX_USER_RT_PRIO + The maximum user real-time priority. Tasks with priorities from + zero through one less than this value are scheduled as real-time. + To the application, a higher priority value implies a higher + priority task. + + The minimum allowed value is 100 and the maximum allowed value + is (arbitrary) 1000. Values specified outside this range will + be rounded accordingly during compile-time. The default is 100. + Setting this higher than 100 is safe but will result in slightly + more processing overhead in the scheduler.
+ + Unless you are doing specialized real-time computing and require + a much larger range than usual, the default is fine. + +Maximum Kernel Real-Time Priority +CONFIG_MAX_RT_PRIO + The difference between the maximum real-time priority and the + maximum user real-time priority. Usually this value is zero, + which sets the maximum real-time priority to the same as the + maximum user real-time priority. Setting this higher, + however, will allow kernel threads to set their priority to a + value higher than any user task. This is safe, but will result + in slightly more processing overhead in the scheduler. + + This value can be at most 200. The default is zero, i.e. the + maximum priority and maximum user priority are the same. + + Unless you are doing specialized real-time programming with + kernel threads, the default is fine. + Kernel support for JAVA binaries CONFIG_BINFMT_JAVA If you say Y here, the kernel will load and execute Java J-code --- linux/Documentation/sched-coding.txt.orig +++ linux/Documentation/sched-coding.txt @@ -0,0 +1,126 @@ + Reference for various scheduler-related methods in the O(1) scheduler + Robert Love , MontaVista Software + + +Note most of these methods are local to kernel/sched.c - this is by design. +The scheduler is meant to be self-contained and abstracted away. This document +is primarily for understanding the scheduler, not interfacing to it. Some of +the discussed interfaces, however, are general process/scheduling methods. +They are typically defined in include/linux/sched.h. + + +Main Scheduling Methods +----------------------- + +void load_balance(runqueue_t *this_rq, int idle) + Attempts to pull tasks from one cpu to another to balance cpu usage, + if needed. This method is called explicitly if the runqueues are + imbalanced or periodically by the timer tick. Prior to calling, + the current runqueue must be locked and interrupts disabled. + +void schedule() + The main scheduling function.
Upon return, the highest priority + process will be active. + + +Locking +------- + +Each runqueue has its own lock, rq->lock. When multiple runqueues need +to be locked, lock acquires must be ordered by ascending &runqueue value. + +A specific runqueue is locked via + + task_rq_lock(task_t pid, unsigned long *flags) + +which disables preemption, disables interrupts, and locks the runqueue pid is +running on. Likewise, + + task_rq_unlock(task_t pid, unsigned long *flags) + +unlocks the runqueue pid is running on, restores interrupts to their previous +state, and reenables preemption. + +The routines + + double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) + +and + + double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) + +safely lock and unlock, respectively, the two specified runqueues. They do +not, however, disable and restore interrupts. Users are required to do so +manually before and after calls. + + +Values +------ + +MAX_PRIO + The maximum priority of the system, stored in the task as task->prio. + Lower priorities are higher. Normal (non-RT) priorities range from + MAX_RT_PRIO to (MAX_PRIO - 1). +MAX_RT_PRIO + The maximum real-time priority of the system. Valid RT priorities + range from 0 to (MAX_RT_PRIO - 1). +MAX_USER_RT_PRIO + The maximum real-time priority that is exported to user-space. Should + always be equal to or less than MAX_RT_PRIO. Setting it less allows + kernel threads to have higher priorities than any user-space task. +MIN_TIMESLICE +MAX_TIMESLICE + Respectively, the minimum and maximum timeslices (quanta) of a process. + +Data +---- + +struct runqueue + The main per-CPU runqueue data structure. +struct task_struct + The main per-process data structure. + + +General Methods +--------------- + +cpu_rq(cpu) + Returns the runqueue of the specified cpu. +this_rq() + Returns the runqueue of the current cpu. +task_rq(pid) + Returns the runqueue which holds the specified pid. +cpu_curr(cpu) + Returns the task currently running on the given cpu.
+rt_task(pid) + Returns true if pid is real-time, false if not. + + +Process Control Methods +----------------------- + +void set_user_nice(task_t *p, long nice) + Sets the "nice" value of task p to the given value. +int setscheduler(pid_t pid, int policy, struct sched_param *param) + Sets the scheduling policy and parameters for the given pid. +void set_cpus_allowed(task_t *p, unsigned long new_mask) + Sets a given task's CPU affinity and migrates it to a proper cpu. + Callers must have a valid reference to the task and ensure the + task does not exit prematurely. No locks can be held during the call. +set_task_state(tsk, state_value) + Sets the given task's state to the given value. +set_current_state(state_value) + Sets the current task's state to the given value. +void set_tsk_need_resched(struct task_struct *tsk) + Sets need_resched in the given task. +void clear_tsk_need_resched(struct task_struct *tsk) + Clears need_resched in the given task. +void set_need_resched() + Sets need_resched in the current task. +void clear_need_resched() + Clears need_resched in the current task. +int need_resched() + Returns true if need_resched is set in the current task, false + otherwise. +yield() + Place the current process at the end of the runqueue and call schedule. --- linux/Documentation/sched-design.txt.orig +++ linux/Documentation/sched-design.txt @@ -0,0 +1,165 @@ + Goals, Design and Implementation of the + new ultra-scalable O(1) scheduler + + + This is an edited version of an email Ingo Molnar sent to + lkml on 4 Jan 2002. It describes the goals, design, and + implementation of Ingo's new ultra-scalable O(1) scheduler. + Last Updated: 18 April 2002.
+ + +Goal +==== + +The main goal of the new scheduler is to keep all the good things we know +and love about the current Linux scheduler: + + - good interactive performance even during high load: if the user + types or clicks then the system must react instantly and must execute + the user tasks smoothly, even during considerable background load. + + - good scheduling/wakeup performance with 1-2 runnable processes. + + - fairness: no process should stay without any timeslice for any + unreasonable amount of time. No process should get an unjustly high + amount of CPU time. + + - priorities: less important tasks can be started with lower priority, + more important tasks with higher priority. + + - SMP efficiency: no CPU should stay idle if there is work to do. + + - SMP affinity: processes which run on one CPU should stay affine to + that CPU. Processes should not bounce between CPUs too frequently. + + - plus additional scheduler features: RT scheduling, CPU binding. + +and the goal is also to add a few new things: + + - fully O(1) scheduling. Are you tired of the recalculation loop + blowing the L1 cache away every now and then? Do you think the goodness + loop is taking a bit too long to finish if there are lots of runnable + processes? This new scheduler takes no prisoners: wakeup(), schedule(), + the timer interrupt are all O(1) algorithms. There is no recalculation + loop. There is no goodness loop either. + + - 'perfect' SMP scalability. With the new scheduler there is no 'big' + runqueue_lock anymore - it's all per-CPU runqueues and locks - two + tasks on two separate CPUs can wake up, schedule and context-switch + completely in parallel, without any interlocking. All + scheduling-relevant data is structured for maximum scalability. + + - better SMP affinity. The old scheduler has a particular weakness that + causes the random bouncing of tasks between CPUs if/when higher + priority/interactive tasks, this was observed and reported by many + people. 
The reason is that the timeslice recalculation loop first needs + every currently running task to consume its timeslice. But when this + happens on eg. an 8-way system, then this property starves an + increasing number of CPUs from executing any process. Once the last + task that has a timeslice left has finished using up that timeslice, + the recalculation loop is triggered and other CPUs can start executing + tasks again - after having idled around for a number of timer ticks. + The more CPUs, the worse this effect. + + Furthermore, this same effect causes the bouncing effect as well: + whenever there is such a 'timeslice squeeze' of the global runqueue, + idle processors start executing tasks which are not affine to that CPU. + (because the affine tasks have finished off their timeslices already.) + + The new scheduler solves this problem by distributing timeslices on a + per-CPU basis, without having any global synchronization or + recalculation. + + - batch scheduling. A significant proportion of computing-intensive tasks + benefit from batch-scheduling, where timeslices are long and processes + are roundrobin scheduled. The new scheduler does such batch-scheduling + of the lowest priority tasks - so nice +19 jobs will get + 'batch-scheduled' automatically. With this scheduler, nice +19 jobs are + in essence SCHED_IDLE, from an interactiveness point of view. + + - handle extreme loads more smoothly, without breakdown and scheduling + storms. + + - O(1) RT scheduling. For those RT folks who are paranoid about the + O(nr_running) property of the goodness loop and the recalculation loop. + + - run fork()ed children before the parent. Andrea has pointed out the + advantages of this a few months ago, but patches for this feature + do not work with the old scheduler as well as they should, + because idle processes often steal the new child before the fork()ing + CPU gets to execute it. 
+ + +Design +====== + +the core of the new scheduler are the following mechanisms: + + - *two*, priority-ordered 'priority arrays' per CPU. There is an 'active' + array and an 'expired' array. The active array contains all tasks that + are affine to this CPU and have timeslices left. The expired array + contains all tasks which have used up their timeslices - but this array + is kept sorted as well. The active and expired array is not accessed + directly, it's accessed through two pointers in the per-CPU runqueue + structure. If all active tasks are used up then we 'switch' the two + pointers and from now on the ready-to-go (former-) expired array is the + active array - and the empty active array serves as the new collector + for expired tasks. + + - there is a 64-bit bitmap cache for array indices. Finding the highest + priority task is thus a matter of two x86 BSFL bit-search instructions. + +the split-array solution enables us to have an arbitrary number of active +and expired tasks, and the recalculation of timeslices can be done +immediately when the timeslice expires. Because the arrays are always +accessed through the pointers in the runqueue, switching the two arrays can +be done very quickly. + +this is a hybrid priority-list approach coupled with roundrobin +scheduling and the array-switch method of distributing timeslices. + + - there is a per-task 'load estimator'. + +one of the toughest things to get right is good interactive feel during +heavy system load. While playing with various scheduler variants i found +that the best interactive feel is achieved not by 'boosting' interactive +tasks, but by 'punishing' tasks that want to use more CPU time than there +is available. This method is also much easier to do in an O(1) fashion. + +to establish the actual 'load' the task contributes to the system, a +complex-looking but pretty accurate method is used: there is a 4-entry +'history' ringbuffer of the task's activities during the last 4 seconds.
+This ringbuffer is operated without much overhead. The entries tell the +scheduler a pretty accurate load-history of the task: has it used up more +CPU time or less during the past N seconds. [the size '4' and the interval +of 4x 1 seconds was found by lots of experimentation - this part is +flexible and can be changed in both directions.] + +the penalty a task gets for generating more load than the CPU can handle +is a priority decrease - there is a maximum amount to this penalty +relative to their static priority, so even fully CPU-bound tasks will +observe each other's priorities, and will share the CPU accordingly. + +the SMP load-balancer can be extended/switched with additional parallel +computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs +can be supported easily by changing the load-balancer. Right now it's +tuned for my SMP systems. + +i skipped the prev->mm == next->mm advantage - no workload i know of shows +any sensitivity to this. It can be added back by sacrificing O(1) +schedule() [the current and one-lower priority list can be searched for a +that->mm == current->mm condition], but costs a fair number of cycles +during a number of important workloads, so i wanted to avoid this as much +as possible. + +- the SMP idle-task startup code was still racy and the new scheduler +triggered this. So i streamlined the idle-setup code a bit. We do not call +into schedule() before all processors have started up fully and all idle +threads are in place. + +- the patch also cleans up a number of aspects of sched.c - moves code +into other areas of the kernel where it's appropriate, and simplifies +certain code paths and data constructs. As a result, the new scheduler's +code is smaller than the old one. 
+ + Ingo --- linux-nptl-2.49/drivers/char/sysrq.c +++ linux-nptl-latest/drivers/char/sysrq.c @@ -282,11 +282,13 @@ { struct task_struct *p; + read_lock(&tasklist_lock); for_each_process(p) { if (p->mm && p->pid != 1) /* Not swapper, init nor kernel thread */ force_sig(sig, p); } + read_unlock(&tasklist_lock); } static void sysrq_handle_term(int key, struct pt_regs *pt_regs, --- linux-nptl-2.49/fs/proc/array.c +++ linux-nptl-latest/fs/proc/array.c @@ -347,7 +347,7 @@ read_unlock(&tasklist_lock); res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %ld %ld %ld %ld\n" +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %ld %ld %ld %ld %ld %ld\n" , task->pid, task->comm, @@ -392,6 +392,8 @@ task->cnswap, task->exit_signal, task->cpu, + task->rt_priority, + task->policy, task->group_times.tms_utime, task->group_times.tms_stime, task->group_times.tms_cutime, --- linux-nptl-2.49/kernel/exit.c +++ linux-nptl-latest/kernel/exit.c @@ -74,8 +74,8 @@ p->parent->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime; p->parent->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime; - p->parent->group_leader->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime; - p->parent->group_leader->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime; + p->parent->group_leader->group_times.tms_cutime += p->times.tms_utime + p->times.tms_cutime; + p->parent->group_leader->group_times.tms_cstime += p->times.tms_stime + p->times.tms_cstime; p->parent->cmin_flt += p->min_flt + p->cmin_flt; p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt; --- linux-nptl-2.49/kernel/signal.c +++ linux-nptl-latest/kernel/signal.c @@ -1111,16 +1099,17 @@ { int ret; + read_lock(&tasklist_lock); /* XXX should nix these interfaces and update the kernel */ - if (T(sig, SIG_KERNEL_BROADCAST_MASK)) { - read_lock(&tasklist_lock); + if (T(sig, SIG_KERNEL_BROADCAST_MASK)) ret = 
group_send_sig_info(sig, info, p); - read_unlock(&tasklist_lock); - } else { + else { spin_lock_irq(&p->sighand->siglock); ret = specific_send_sig_info(sig, info, p); spin_unlock_irq(&p->sighand->siglock); } + read_unlock(&tasklist_lock); + return ret; } --- linux/net/tux/main.c.orig 2003-02-20 10:47:45.000000000 +0100 +++ linux/net/tux/main.c 2003-02-20 10:59:25.000000000 +0100 @@ -77,18 +77,6 @@ return 0; } -void reap_kids (void) -{ - int count = 0; - - flush_all_signals(); - __set_task_state(current, TASK_RUNNING); - while (sys_wait4(-1, NULL, WNOHANG, NULL) > 0) - count++; - - Dprintk("reaped %d kids (%p) [signals pending: %08lx].\n", count, __builtin_return_address(0), current->pending.signal.sig[0]); -} - static int event_loop (threadinfo_t *ti) { tux_req_t *req; @@ -507,6 +495,7 @@ ti->listen[k].proto = tux_listen[cpu][k].proto; Dprintk("thread %d got sock %p (%d), proto %s.\n", cpu, ti->listen[k].sock, k, ti->listen[k].proto->name); next_socket: + ; } ti->started = 1; Dprintk("thread %d done initializing sockets.\n", cpu); @@ -1069,6 +1058,7 @@ goto out; default: + ; } userspace_actions: --- linux/net/tux/cgi.c.orig 2003-02-20 10:38:20.000000000 +0100 +++ linux/net/tux/cgi.c 2003-02-20 10:49:58.000000000 +0100 @@ -203,7 +203,6 @@ { exec_param_t param_local; pid_t pid; - int ret = 0; struct k_sigaction *ka; ka = current->sighand->action + SIGCHLD-1; @@ -226,13 +225,5 @@ schedule_timeout(HZ); goto repeat_fork; } - if (wait) { -repeat: - reap_kids(); - ret = sys_wait4(pid, NULL, __WALL, NULL); - Dprintk("sys_wait4 returned %d.\n", ret); - if (ret == -ERESTARTSYS) - goto repeat; - } return pid; } --- linux/net/tux/cachemiss.c.orig 2003-02-20 10:42:37.000000000 +0100 +++ linux/net/tux/cachemiss.c 2003-02-20 10:42:09.000000000 +0100 @@ -143,17 +143,11 @@ continue; } tux_schedule_atom(req, 1); - if (signal_pending(current)) { + if (signal_pending(current)) flush_all_signals(); - while (sys_wait4(-1, NULL, WNOHANG, NULL) > 0) - /* nothing */; - } } - if 
(signal_pending(current)) { + if (signal_pending(current)) flush_all_signals(); - while (sys_wait4(-1, NULL, WNOHANG, NULL) > 0) - /* nothing */; - } if (!list_empty(&iot->async_queue)) continue; if (iot->shutdown) { --- linux/net/tux/logger.c.orig 2003-02-20 10:43:13.000000000 +0100 +++ linux/net/tux/logger.c 2003-02-20 11:00:22.000000000 +0100 @@ -640,6 +640,7 @@ } kfree_req(req); out: + ; } static int warn_once = 1; @@ -777,11 +778,8 @@ } schedule_timeout(HZ); Dprintk("logger back from sleep - stop:%d.\n", stop_logger); - if (signal_pending(current)) { + if (signal_pending(current)) flush_all_signals(); - while (sys_wait4(-1, NULL, WNOHANG, NULL) > 0) - /* nothing */; - } } remove_wait_queue(&log_wait, &wait); @@ -807,7 +805,6 @@ void stop_log_thread (void) { DECLARE_WAITQUEUE(wait, current); - int ret; Dprintk("stopping logger thread %d ...\n", logger_pid); @@ -819,8 +816,5 @@ __set_current_state(TASK_RUNNING); remove_wait_queue(&stop_logger_wait, &wait); - ret = sys_wait4(logger_pid, NULL, __WCLONE, NULL); - if (stop_logger) - TUX_BUG(); Dprintk("logger thread stopped!\n"); } --- linux/net/tux/extcgi.c.orig 2003-02-20 10:42:52.000000000 +0100 +++ linux/net/tux/extcgi.c 2003-02-20 10:51:03.000000000 +0100 @@ -93,7 +93,6 @@ Dprintk("CGI reply: (%d bytes, total %d).\n", len, total); if (len == -ERESTARTSYS) { flush_all_signals(); - reap_kids(); goto repeat_read; } } while (len > 0); @@ -154,7 +153,6 @@ int in_pipe_fds[2], out_pipe_fds[2], err_pipe_fds[2], len; char command [MAX_CGI_COMMAND_LEN]; pid_t pid; - int ret; len = strlen(tux_common_docroot); if (req->objectname_len + len + 12 > MAX_CGI_COMMAND_LEN) @@ -280,14 +278,8 @@ sys_close(1); handle_cgi_reply(req); -repeat: - reap_kids(); - ret = sys_wait4(pid, NULL, __WALL, NULL); - Dprintk("exec_external_cgi() sys_wait4() returned %d.\n", ret); - if (ret == -ERESTARTSYS) - goto repeat; - return ret; + return 0; } void start_external_cgi (tux_req_t *req) --- linux/net/tux/input.c.orig 2003-02-20 
10:44:55.000000000 +0100 +++ linux/net/tux/input.c 2003-02-20 10:46:23.000000000 +0100 @@ -294,7 +294,7 @@ len = 0; goto out; } - reap_kids(); + flush_all_signals(); goto read_again; } out: @@ -338,7 +338,7 @@ len = 0; goto out; } - reap_kids(); + flush_all_signals(); goto read_again; } out: @@ -359,7 +359,7 @@ repeat_trunc: len = sk->prot->recvmsg(sk, NULL, req->parsed_len, 1, MSG_TRUNC, &addr_len); if ((len == -ERESTARTSYS) || (len == -EAGAIN)) { - reap_kids(); + flush_all_signals(); goto repeat_trunc; } Dprintk("truncated (TRUNC) %d bytes at %p. (wanted: %d.)\n", len, __builtin_return_address(0), req->parsed_len); --- linux/net/tux/output.c.orig 2003-02-20 10:47:11.000000000 +0100 +++ linux/net/tux/output.c 2003-02-20 10:47:23.000000000 +0100 @@ -49,7 +49,7 @@ Dprintk("sendmsg ret: %d, written: %d, left: %d.\n", len,written,left); if ((len == -ERESTARTSYS) || (!(flags & MSG_DONTWAIT) && (len == -EAGAIN))) { - reap_kids(); + flush_all_signals(); goto repeat_send; } if (len > 0) { --- linux/net/tux/proc.c.orig 2003-02-20 11:00:58.000000000 +0100 +++ linux/net/tux/proc.c 2003-02-20 11:01:05.000000000 +0100 @@ -1002,6 +1002,7 @@ if (!listen->proto) return sprintf(page, INACTIVE_1); +#undef IP #define IP(n) ((unsigned char *)&listen->ip)[n] return sprintf (page, "%s://%u.%u.%u.%u:%hu\n", listen->proto->name, --- linux/net/tux/directory.c.orig 2003-02-20 11:01:20.000000000 +0100 +++ linux/net/tux/directory.c 2003-02-20 11:01:24.000000000 +0100 @@ -111,6 +111,7 @@ case DT_DIR: case DT_LNK: /* valid entries - fall through. */ + ; } } len = strlen(dirp->d_name); --- linux-2.4.20/kernel/signal.c~ 2003-02-23 13:03:12.000000000 +0100 +++ linux-2.4.20/kernel/signal.c 2003-02-23 13:03:12.000000000 +0100 @@ -1439,8 +1439,11 @@ * Group stop is so we can do a core dump. */ current->signal->group_exit_task = NULL; goto dequeue; } + if (current->signal->group_exit) + goto dequeue; + /* * There is a group stop in progress. We stop * without any associated signal being in our queue. 
diff -purN linux-2.4.20.orig/include/asm-x86_64/unistd.h linux-2.4.20/include/asm-x86_64/unistd.h --- linux-2.4.20.orig/include/asm-x86_64/unistd.h 2003-03-10 14:32:10.000000000 -0500 +++ linux-2.4.20/include/asm-x86_64/unistd.h 2003-03-10 14:44:27.000000000 -0500 @@ -495,6 +495,8 @@ __SYSCALL(__NR_epoll_wait, sys_ni_syscal __SYSCALL(__NR_remap_file_pages, sys_ni_syscall) #define __NR_getdents64 217 __SYSCALL(__NR_getdents64, sys_getdents64) +#define __NR_exit_group 231 +__SYSCALL(__NR_exit_group, sys_exit_group) -#define __NR_syscall_max __NR_getdents64 +#define __NR_syscall_max __NR_exit_group --- linux/kernel/sched.c.orig +++ linux/kernel/sched.c @@ -1048,10 +1048,10 @@ static inline runqueue_t *find_busiest_q if (likely(!busiest)) goto out; - *imbalance = (max_load - nr_running) / 2; + *imbalance = max_load - nr_running; /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && (*imbalance < (max_load + 3)/4)) { + if (!idle && ((*imbalance)*4 < max_load)) { busiest = NULL; goto out; } @@ -1061,10 +1061,15 @@ static inline runqueue_t *find_busiest_q * Make sure nothing changed since we checked the * runqueue length. */ - if (busiest->nr_running <= nr_running + 1) { + if (busiest->nr_running <= nr_running) { spin_unlock(&busiest->lock); busiest = NULL; } + /* + * We only want to steal a number of tasks equal to 1/2 the imbalance, + * otherwise we'll just shift the imbalance to the new queue: + */ + *imbalance /= 2; out: return busiest; } --- linux/kernel/exit.c.orig +++ linux/kernel/exit.c @@ -40,6 +40,7 @@ static void __unhash_process(struct task } REMOVE_LINKS(p); + p->pid = 0; } void release_task(struct task_struct * p) @@ -221,7 +218,7 @@ void reparent_to_init(void) /* rt_priority? */ /* signals? 
*/ memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); - current->user = INIT_USER; + switch_uid(INIT_USER); write_unlock_irq(&tasklist_lock); } --- linux/kernel/user.c.orig +++ linux/kernel/user.c @@ -116,6 +116,22 @@ struct user_struct * alloc_uid(uid_t uid return up; } +void switch_uid(struct user_struct *new_user) +{ + struct user_struct *old_user; + + /* What if a process setreuid()'s and this brings the + * new uid over his NPROC rlimit? We can check this now + * cheaply with the new uid cache, so if it matters + * we should be checking for it. -DaveM + */ + old_user = current->user; + atomic_inc(&new_user->__count); + atomic_inc(&new_user->processes); + atomic_dec(&old_user->processes); + current->user = new_user; + free_uid(old_user); +} static int __init uid_cache_init(void) { --- linux-2.4.20/kernel/signal.c~ 2003-05-10 10:34:02.000000000 +0200 +++ linux-2.4.20/kernel/signal.c 2003-05-10 10:34:02.000000000 +0200 @@ -1112,15 +1112,16 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { int ret; + unsigned long flags; read_lock(&tasklist_lock); /* XXX should nix these interfaces and update the kernel */ if (T(sig, SIG_KERNEL_BROADCAST_MASK)) ret = group_send_sig_info(sig, info, p); else { - spin_lock_irq(&p->sighand->siglock); + spin_lock_irqsave(&p->sighand->siglock, flags); ret = specific_send_sig_info(sig, info, p); - spin_unlock_irq(&p->sighand->siglock); + spin_unlock_irqrestore(&p->sighand->siglock, flags); } read_unlock(&tasklist_lock); --- linux-2.4.20/kernel/fork.c~ 2003-11-26 11:56:37.000000000 +0000 +++ linux-2.4.20/kernel/fork.c 2003-11-26 11:57:21.000000000 +0000 @@ -927,6 +927,7 @@ if (current->signal->group_exit) { spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); + retval = -EAGAIN; goto bad_fork_cleanup_namespace; } p->tgid = current->tgid; --- linux-2.4.21/kernel/exit.c.~1~ 2004-01-14 18:21:08.000000000 -0800 +++ linux-2.4.21/kernel/exit.c 2004-01-14 18:23:00.000000000 -0800 @@ -465,7 
+465,8 @@ static inline void reparent_thread(task_ p->self_exec_id++; if (p->pdeath_signal) - send_sig(p->pdeath_signal, p, 0); + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, (void *) 0, p); /* Move the child from its dying parent to the new one. */ if (unlikely(traced)) {