anticipatory I/O scheduler drivers/block/Makefile | 3 drivers/block/as-iosched.c | 1761 +++++++++++++++++++++++++++++++++++++++++++++ drivers/block/ll_rw_blk.c | 23 include/linux/elevator.h | 5 include/linux/sched.h | 4 kernel/exit.c | 2 kernel/fork.c | 1 7 files changed, 1795 insertions(+), 4 deletions(-) diff -puN /dev/null drivers/block/as-iosched.c --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25-akpm/drivers/block/as-iosched.c 2003-05-25 23:21:11.000000000 -0700 @@ -0,0 +1,1761 @@ +/* + * linux/drivers/block/as-iosched.c + * + * Anticipatory & deadline i/o scheduler. + * + * Copyright (C) 2002 Jens Axboe + * Nick Piggin + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * See Documentation/as-iosched.txt + */ + +/* + * max time before a read is submitted. + */ +static unsigned long read_expire = HZ / 20; + +/* + * ditto for writes, these limits are not hard, even + * if the disk is capable of satisfying them. + */ +static unsigned long write_expire = HZ / 5; + +/* + * read_batch_expire describes how long we will allow a stream of reads to + * persist before looking to see whether it is time to switch over to writes. + */ +static unsigned long read_batch_expire = HZ / 5; + +/* + * write_batch_expire describes how long we will allow a stream of writes to + * persist before looking to see whether it is time to switch over to reads. + */ +static unsigned long write_batch_expire = HZ / 20; + +/* + * max time we may wait to anticipate a read (default around 6ms) + */ +static unsigned long antic_expire = ((HZ / 150) ? HZ / 150 : 1); + +/* + * This is the per-process anticipatory I/O scheduler state. It is refcounted + * and kmalloc'ed. + * + * There is no locking protecting the contents of this structure! Pointers + * to a single as_io_context may appear in multiple queues at once. + */ + +/* + * Keep track of up to 20ms thinktimes. We can go as big as we like here, + * however huge values tend to interfere and not decay fast enough. A program + * might be in a non-io phase of operation. Waiting on user input for example, + * or doing a lengthy computation. A small penalty can be justified there, and + * will still catch out those processes that constantly have large thinktimes. + */ +#define MAX_THINKTIME (HZ/50UL) + +struct as_io_context { + atomic_t refcount; + pid_t pid; + unsigned long state; + atomic_t nr_queued; /* queued reads & sync writes */ + atomic_t nr_dispatched; /* number of requests gone to the drivers */ + + /* IO History tracking */ + /* Thinktime */ + unsigned long last_end_request; + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; + /* Read / write pattern */ + int last_data_dir; + unsigned long dir_after_read[2]; + int mean_dir_after_read; + /* Layout pattern */ + long seek_samples; + sector_t last_request_pos; + sector_t seek_total; + sector_t seek_mean; +}; + +/* Bits in as_io_context.state */ +enum as_io_states { + AS_TASK_RUNNING=0, /* Process has not exitted */ + AS_TASK_IORUNNING, /* Process has completed some IO */ +}; + +struct as_data { + /* + * run time data + */ + + struct request_queue *q; /* the "owner" queue */ + + /* + * requests (as_rq s) are present on both sort_list and fifo_list + */ + struct rb_root sort_list[2]; + struct list_head fifo_list[2]; + + struct as_rq *next_arq[2]; /* next in sort order */ + sector_t last_sector[2]; /* last READ and WRITE sectors */ + struct list_head *dispatch; /* driver dispatch queue */ + struct list_head *hash; /* request hash */ + unsigned long hash_valid_count; /* barrier hash count */ + unsigned long current_batch_expires; + unsigned long last_check_fifo[2]; + int batch_data_dir; /* current/last batch READ or WRITE */ + mempool_t *arq_pool; + + int antic_status; + unsigned long antic_start; /* jiffies: when it started */ + struct timer_list antic_timer; /* anticipatory scheduling timer */ + struct work_struct antic_work; /* Deferred unplugging */ + struct as_io_context *as_io_context;/* Identify the expected process */ + int aic_finished; /* IO associated with as_io_context finished */ + + /* + * settings that change how the i/o scheduler behaves + */ + unsigned long fifo_expire[2]; + unsigned long batch_expire[2]; + unsigned long antic_expire; +}; + +#define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo) + +enum anticipation_states { + ANTIC_OFF=0, /* Not anticipating (normal operation) */ + ANTIC_WAIT_REQ, /* The last read has not yet completed */ + ANTIC_WAIT_NEXT, /* Currently anticipating a request vs + last read (which has completed) */ + ANTIC_FINISHED, /* Anticipating but have found a candidate + * or timed out */ +}; + +/* + * per-request data. + */ +enum arq_state { + AS_RQ_NEW=0, /* New - not referenced and not on any lists */ + AS_RQ_QUEUED, /* In the request queue. It belongs to the + scheduler */ + AS_RQ_DISPATCHED, /* On the dispatch list. It belongs to the + driver now */ +}; + +struct as_rq { + /* + * rbtree index, key is the starting offset + */ + struct rb_node rb_node; + sector_t rb_key; + + struct request *request; + + struct as_io_context *as_io_context; /* The submitting task */ + + /* + * request hash, key is the ending offset (for back merge lookup) + */ + struct list_head hash; + unsigned long hash_valid_count; + + /* + * expire fifo + */ + struct list_head fifo; + unsigned long expires; + + enum arq_state state; /* debug only */ +}; + +#define RQ_DATA(rq) ((struct as_rq *) (rq)->elevator_private) + +static kmem_cache_t *arq_pool; + +/* + * IO Context helper functions + */ +/* Debug */ +static atomic_t nr_as_io_requests = ATOMIC_INIT(0); + +static void put_as_io_context(struct as_io_context **paic) +{ + struct as_io_context *aic = *paic; + + if (aic == NULL) + return; + + BUG_ON(atomic_read(&aic->refcount) == 0); + *paic = NULL; + if (atomic_dec_and_test(&aic->refcount)) { + atomic_dec(&nr_as_io_requests); + kfree(aic); + } +} + +/* Called by the exitting task */ +void exit_as_io_context(void) +{ + unsigned long flags; + struct as_io_context *aic; + + local_irq_save(flags); + aic = current->as_io_context; + if (aic) { + clear_bit(AS_TASK_RUNNING, &aic->state); + put_as_io_context(&aic); + current->as_io_context = NULL; + } + local_irq_restore(flags); +} + +/* + * If the current task has no IO context then create one and initialise it. + * If it does have a context, take a ref on it. + * + * This is always called in the context of the task which submitted the I/O. + * But weird things happen, so we disable local interrupts to ensure exclusive + * access to *current. + */ +static struct as_io_context *get_as_io_context(void) +{ + struct task_struct *tsk = current; + unsigned long flags; + struct as_io_context *ret; + + local_irq_save(flags); + ret = tsk->as_io_context; + if (ret == NULL) { + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (ret) { + atomic_inc(&nr_as_io_requests); + atomic_set(&ret->refcount, 1); + ret->pid = tsk->pid; + ret->state = 1 << AS_TASK_RUNNING; + atomic_set(&ret->nr_queued, 0); + atomic_set(&ret->nr_dispatched, 0); + ret->ttime_total = 0; + ret->ttime_samples = 0; + ret->ttime_mean = 0; + ret->dir_after_read[READ] = 0; + ret->dir_after_read[WRITE] = 0; + ret->mean_dir_after_read = READ; + ret->seek_total = 0; + ret->seek_samples = 0; + ret->seek_mean = 0; + tsk->as_io_context = ret; + } + } + local_irq_restore(flags); + atomic_inc(&ret->refcount); + return ret; +} + +static void +copy_as_io_context(struct as_io_context **pdst, struct as_io_context **psrc) +{ + struct as_io_context *src = *psrc; + + if (src) { + BUG_ON(atomic_read(&src->refcount) == 0); + atomic_inc(&src->refcount); + put_as_io_context(pdst); + *pdst = src; + } +} + +static void +swap_as_io_context(struct as_io_context **aic1, struct as_io_context **aic2) +{ + struct as_io_context *temp; + temp = *aic1; + *aic1 = *aic2; + *aic2 = temp; +} + +/* + * the back merge hash support functions + */ +static const int as_hash_shift = 6; +#define AS_HASH_BLOCK(sec) ((sec) >> 3) +#define AS_HASH_FN(sec) (hash_long(AS_HASH_BLOCK((sec)), as_hash_shift)) +#define AS_HASH_ENTRIES (1 << as_hash_shift) +#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) +#define list_entry_hash(ptr) list_entry((ptr), struct as_rq, hash) +#define ON_HASH(arq) (arq)->hash_valid_count + +#define AS_INVALIDATE_HASH(ad) \ + do { \ + if (!++(ad)->hash_valid_count) \ + (ad)->hash_valid_count = 1; \ + } while (0) + +static inline void __as_del_arq_hash(struct as_rq *arq) +{ + arq->hash_valid_count = 0; + list_del_init(&arq->hash); +} + +static inline void as_del_arq_hash(struct as_rq *arq) +{ + if (ON_HASH(arq)) + __as_del_arq_hash(arq); +} + +static void as_add_arq_hash(struct as_data *ad, struct as_rq *arq) +{ + struct request *rq = arq->request; + + BUG_ON(ON_HASH(arq)); + + arq->hash_valid_count = ad->hash_valid_count; + list_add(&arq->hash, &ad->hash[AS_HASH_FN(rq_hash_key(rq))]); +} + +/* + * move hot entry to front of chain + */ +static inline void as_hot_arq_hash(struct as_data *ad, struct as_rq *arq) +{ + struct request *rq = arq->request; + struct list_head *head = &ad->hash[AS_HASH_FN(rq_hash_key(rq))]; + + if (ON_HASH(arq) && arq->hash.prev != head) { + list_del(&arq->hash); + list_add(&arq->hash, head); + } +} + +static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset) +{ + struct list_head *hash_list = &ad->hash[AS_HASH_FN(offset)]; + struct list_head *entry, *next = hash_list->next; + + while ((entry = next) != hash_list) { + struct as_rq *arq = list_entry_hash(entry); + struct request *__rq = arq->request; + + next = entry->next; + + BUG_ON(!ON_HASH(arq)); + + if (!rq_mergeable(__rq) + || arq->hash_valid_count != ad->hash_valid_count) { + __as_del_arq_hash(arq); + continue; + } + + if (rq_hash_key(__rq) == offset) + return __rq; + } + + return NULL; +} + +/* + * rb tree support functions + */ +#define RB_NONE (2) +#define RB_EMPTY(root) ((root)->rb_node == NULL) +#define ON_RB(node) ((node)->rb_color != RB_NONE) +#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) +#define rb_entry_arq(node) rb_entry((node), struct as_rq, rb_node) +#define ARQ_RB_ROOT(ad, arq) (&(ad)->sort_list[rq_data_dir((arq)->request)]) +#define rq_rb_key(rq) (rq)->sector + +/* + * as_find_first_arq finds the first (lowest sector numbered) request + * for the specified data_dir. Used to sweep back to the start of the disk + * (1-way elevator) after we process the last (highest sector) request. + */ +static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir) +{ + struct rb_node *n = ad->sort_list[data_dir].rb_node; + + if (n == NULL) + return NULL; + + for (;;) { + if (n->rb_left == NULL) + return rb_entry_arq(n); + + n = n->rb_left; + } +} + +static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq) +{ + struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node; + struct rb_node *parent = NULL; + struct as_rq *__arq; + + while (*p) { + parent = *p; + __arq = rb_entry_arq(parent); + + if (arq->rb_key < __arq->rb_key) + p = &(*p)->rb_left; + else if (arq->rb_key > __arq->rb_key) + p = &(*p)->rb_right; + else + return __arq; + } + + rb_link_node(&arq->rb_node, parent, p); + return 0; +} + +static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq); +/* + * Add the request to the rb tree if it is unique. If there is an alias (an + * existing request against the same sector), which can happen when using + * direct IO, then move the alias to the dispatch list and then add the + * request. + */ +static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq) +{ + struct as_rq *alias; + struct request *rq = arq->request; + + arq->rb_key = rq_rb_key(rq); + + /* This can be caused by direct IO */ + while ((alias = __as_add_arq_rb(ad, arq))) + as_move_to_dispatch(ad, alias); + + rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq)); +} + +static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq) +{ + if (ON_RB(&arq->rb_node)) { + rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq)); + RB_CLEAR(&arq->rb_node); + } +} + +static struct request * +as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir) +{ + struct rb_node *n = ad->sort_list[data_dir].rb_node; + struct as_rq *arq; + + while (n) { + arq = rb_entry_arq(n); + + if (sector < arq->rb_key) + n = n->rb_left; + else if (sector > arq->rb_key) + n = n->rb_right; + else + return arq->request; + } + + return NULL; +} + +/* + * IO Scheduler proper + */ + +#define MAXBACK (1024 * 1024) /* + * Maximum distance the disk will go backward + * for a request. + */ + +/* + * as_choose_req selects the preferred one of two requests of the same data_dir + * ignoring time - eg. timeouts, which is the job of as_dispatch_request + */ +static struct as_rq * +as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2) +{ + int data_dir; + sector_t last, s1, s2, d1, d2; + int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ + const sector_t maxback = MAXBACK; + + if (arq1 == NULL || arq1 == arq2) + return arq2; + if (arq2 == NULL) + return arq1; + + data_dir = rq_data_dir(arq1->request); + + last = ad->last_sector[data_dir]; + s1 = arq1->request->sector; + s2 = arq2->request->sector; + + BUG_ON(data_dir != rq_data_dir(arq2->request)); + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. + */ + if (s1 >= last) + d1 = s1 - last; + else if (data_dir == READ + && s1+maxback >= last) + d1 = (last - s1)*2; + else { + r1_wrap = 1; + d1 = 0; /* shut up, gcc */ + } + + if (s2 >= last) + d2 = s2 - last; + else if (data_dir == READ + && s2+maxback >= last) + d2 = (last - s2)*2; + else { + r2_wrap = 1; + d2 = 0; + } + + /* Found required data */ + if (!r1_wrap && r2_wrap) + return arq1; + else if (!r2_wrap && r1_wrap) + return arq2; + else if (r1_wrap && r2_wrap) { + /* both behind the head */ + if (s1 <= s2) + return arq1; + else + return arq2; + } + + /* Both requests in front of the head */ + if (d1 < d2) + return arq1; + else if (d2 < d1) + return arq2; + else { + if (s1 >= s2) + return arq1; + else + return arq2; + } +} + +/* + * as_find_next_arq finds the next request after @prev in elevator order. + * this with as_choose_arq form the basis for how the scheduler chooses + * what request to process next. Anticipation works on top of this. + */ +static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last) +{ + const int data_dir = rq_data_dir(last->request); + struct as_rq *ret; + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct as_rq *arq_next, *arq_prev; + + BUG_ON(!ON_RB(&last->rb_node)); + + if (rbprev) + arq_prev = rb_entry_arq(rbprev); + else + arq_prev = NULL; + + if (rbnext) + arq_next = rb_entry_arq(rbnext); + else { + arq_next = as_find_first_arq(ad, data_dir); + if (arq_next == last) + arq_next = NULL; + } + + ret = as_choose_req(ad, arq_next, arq_prev); + + return ret; +} + +/* + * anticipatory scheduling functions follow + */ + +/* + * as_antic_expired tells us when we have anticipated too long. + * The funny "absolute difference" math on the elapsed time is to handle + * jiffy wraps, and disks which have been idle for 0x80000000 jiffies. + */ +static int as_antic_expired(struct as_data *ad) +{ + long delta_jif; + + delta_jif = jiffies - ad->antic_start; + if (unlikely(delta_jif < 0)) + delta_jif = -delta_jif; + if (delta_jif < ad->antic_expire) + return 0; + + return 1; +} + +/* + * as_antic_waitnext starts anticipating that a nice request will soon be + * submitted. See also as_antic_waitreq + */ +static void as_antic_waitnext(struct as_data *ad) +{ + unsigned long timeout; + + BUG_ON(ad->antic_status != ANTIC_OFF + && ad->antic_status != ANTIC_WAIT_REQ); + + timeout = ad->antic_start + ad->antic_expire; + timeout = min(timeout, ad->current_batch_expires); + + mod_timer(&ad->antic_timer, timeout); + + ad->antic_status = ANTIC_WAIT_NEXT; +} + +/* + * as_antic_waitreq starts anticipating. We don't start timing the anticipation + * until the request that we're anticipating on has finished. This means we + * are timing from when the candidate process wakes up hopefully. + */ +static void as_antic_waitreq(struct as_data *ad) +{ + BUG_ON(ad->antic_status == ANTIC_FINISHED); + if (ad->antic_status == ANTIC_OFF) { + if (!ad->as_io_context || ad->aic_finished) + as_antic_waitnext(ad); + else + ad->antic_status = ANTIC_WAIT_REQ; + } +} + +/* + * This is called directly by the functions in this file to stop anticipation. + * We kill the timer and schedule a call to the request_fn asap. + */ +static void as_antic_stop(struct as_data *ad) +{ + int status = ad->antic_status; + + if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) { + if (status == ANTIC_WAIT_NEXT) + del_timer(&ad->antic_timer); + ad->antic_status = ANTIC_FINISHED; + /* see as_work_handler */ + kblockd_schedule_work(&ad->antic_work); + } +} + +/* + * as_antic_timeout is the timer function set by as_antic_waitnext. + */ +static void as_antic_timeout(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + struct as_data *ad = q->elevator.elevator_data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + if (ad->antic_status == ANTIC_WAIT_REQ + || ad->antic_status == ANTIC_WAIT_NEXT) { + ad->antic_status = ANTIC_FINISHED; + kblockd_schedule_work(&ad->antic_work); + } + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * as_close_req decides if one request is considered "close" to the + * previous one issued. + */ +static int as_close_req(struct as_data *ad, struct as_rq *arq) +{ + unsigned long delay; /* milliseconds */ + sector_t last = ad->last_sector[ad->batch_data_dir]; + sector_t next = arq->request->sector; + sector_t delta; /* acceptable close offset (in sectors) */ + + if (ad->antic_status == ANTIC_OFF || !ad->aic_finished) + delay = 0; + else + delay = ((jiffies - ad->antic_start) * 1000) / HZ; + + if (delay <= 1) + delta = 32; + else if (delay <= 20 && delay <= ad->antic_expire) + delta = 32 << (delay-1); + else + return 1; + + return (last <= next) && (next <= last + delta); +} + +/* + * as_can_break_anticipation returns true if we have been anticipating this + * request. + * + * It also returns true if the process against which we are anticipating + * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to + * dispatch it ASAP, because we know that application will not be submitting + * any new reads. + * + * If the task which has submitted the request has exitted, break anticipation. + * + * If this task has queued some other IO, do not enter enticipation. + */ +static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) +{ + struct as_io_context *aic; + + if (rq_data_dir(arq->request) == READ && as_close_req(ad, arq)) { + /* close request */ + return 1; + } + + if (ad->aic_finished && as_antic_expired(ad)) { + /* + * In this situation status should really be FINISHED, + * however the timer hasn't had the chance to run yet. + */ + return 1; + } + + aic = ad->as_io_context; + if (aic == arq->as_io_context) { + /* request from same process */ + return 1; + } + + if (!aic) + return 0; + + if (!test_bit(AS_TASK_RUNNING, &aic->state)) { + /* process anticipated on has exitted */ + return 1; + } + + if (atomic_read(&aic->nr_queued) > 0) { + /* process has more requests queued */ + return 1; + } + + if (atomic_read(&aic->nr_dispatched) > 0) { + /* process has more requests dispatched */ + return 1; + } + + if (aic->ttime_mean > ad->antic_expire) { + /* the process thinks too much between requests */ + return 1; + } + + if (aic->mean_dir_after_read != READ) { + /* next request from this process will probably be a write */ + return 1; + } + + if (aic->seek_samples) { + sector_t s; + if (ad->last_sector[READ] < arq->request->sector) + s = arq->request->sector - ad->last_sector[READ]; + else + s = ad->last_sector[READ] - arq->request->sector; + + if (aic->seek_mean > s) + /* this request is better than what we're expecting */ + return 1; + } + + return 0; +} + +/* + * as_can_anticipate indicates weather we should either run arq + * or keep anticipating a better request. + */ +static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) +{ + if (!ad->as_io_context) + /* + * Last request submitted was a write + */ + return 0; + + if (ad->antic_status == ANTIC_FINISHED) + /* + * Don't restart if we have just finished. Run the next request + */ + return 0; + + if (arq && as_can_break_anticipation(ad, arq)) + /* + * This request is a good candidate. Don't keep anticipating, + * run it. + */ + return 0; + + /* + * OK from here, we haven't finished, and don't have a decent request! + * Status is either ANTIC_OFF so start waiting, + * ANTIC_WAIT_REQ so continue waiting for request to finish + * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request. + * + */ + + return 1; +} + +/* + * as_update_iohist keeps a decaying histogram of IO thinktimes, and + * updates @aic->ttime_mean based on that. It is called when a new + * request is queued. + */ +static void as_update_iohist(struct as_io_context *aic, struct request *rq) +{ + int data_dir = rq_data_dir(rq); + unsigned long thinktime; + sector_t seek_dist; + + if (aic == NULL) + return; + + if (test_bit(AS_TASK_IORUNNING, &aic->state)) { + if (data_dir == READ) { + /* Calculate read -> read thinktime */ + thinktime = jiffies - aic->last_end_request; + thinktime = min(thinktime, MAX_THINKTIME-1); + /* fixed point: 1.0 == 1<<8 */ + aic->ttime_samples += 256; + aic->ttime_total += 256*thinktime; + if (aic->ttime_samples) + /* fixed point factor is cancelled here */ + aic->ttime_mean = (aic->ttime_total + 128) + / aic->ttime_samples; + aic->ttime_samples = (aic->ttime_samples>>1) + + (aic->ttime_samples>>2); + aic->ttime_total = (aic->ttime_total>>1) + + (aic->ttime_total>>2); + + /* Calculate read -> read seek distance */ + if (aic->last_request_pos < rq->sector) + seek_dist = rq->sector - aic->last_request_pos; + else + seek_dist = aic->last_request_pos - rq->sector; + aic->last_request_pos = rq->sector + rq->nr_sectors; + + aic->seek_samples += 256; + aic->seek_total += 256*seek_dist; + if (aic->seek_samples) { + aic->seek_mean = aic->seek_total + 128; + do_div(aic->seek_mean, aic->seek_samples); + } + aic->seek_samples = (aic->seek_samples>>1) + + (aic->seek_samples>>2); + aic->seek_total = (aic->seek_total>>1) + + (aic->seek_total>>2); + + } + + /* Calculate read/write pattern */ + if (aic->last_data_dir == READ) { + unsigned long rprob, wprob; + aic->dir_after_read[data_dir] += 256; + rprob = aic->dir_after_read[READ]; + wprob = aic->dir_after_read[WRITE]; + + if (rprob*4 >= wprob*5) + aic->mean_dir_after_read = READ; + else + aic->mean_dir_after_read = WRITE; + + aic->dir_after_read[READ] = (rprob>>1) + (rprob>>2); + aic->dir_after_read[WRITE] = (wprob>>1) + (wprob>>2);; + } + } +} + +/* + * as_update_arq must be called whenever a request (arq) is added to + * the sort_list. This function keeps caches up to date, and checks if the + * request might be one we are "anticipating" + */ +static void as_update_arq(struct as_data *ad, struct as_rq *arq) +{ + const int data_dir = rq_data_dir(arq->request); + + /* keep the next_arq cache up to date */ + ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]); + + /* + * have we been anticipating this request? + * or does it come from the same process as the one we are anticipating + * for? + */ + if (ad->batch_data_dir == READ + && ad->antic_status != ANTIC_FINISHED + && as_can_break_anticipation(ad, arq)) + as_antic_stop(ad); +} + +/* + * as_completed_request is to be called when a request has completed and + * returned something to the requesting process, be it an error or data. + */ +static void as_completed_request(request_queue_t *q, struct request *rq) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(rq); + struct as_io_context *aic = arq->as_io_context; + + arq->state = AS_RQ_NEW; + + if (unlikely(!blk_fs_request(rq) || rq->flags & REQ_HARDBARRIER)) { + WARN_ON(aic); + return; + } + + if (!aic) + return; + + if (rq_data_dir(arq->request) == READ) { + set_bit(AS_TASK_IORUNNING, &aic->state); + aic->last_end_request = jiffies; + } + aic->last_data_dir = rq_data_dir(arq->request); + + if (ad->as_io_context == aic) { + ad->antic_start = jiffies; + ad->aic_finished = 1; + if (ad->antic_status == ANTIC_WAIT_REQ) { + /* + * We were waiting on this request, now anticipate + * the next one + */ + as_antic_waitnext(ad); + } + } + + put_as_io_context(&arq->as_io_context); +} + +/* + * as_remove_queued_request removes a request from the pre dispatch queue + * without updating refcounts. It is expected the caller will drop the + * reference unless it replaces the request at somepart of the elevator + * (ie. the dispatch queue) + */ +static void as_remove_queued_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + + if (!arq) + BUG(); + else { + const int data_dir = rq_data_dir(arq->request); + struct as_data *ad = q->elevator.elevator_data; + + WARN_ON(arq->state != AS_RQ_QUEUED); + + if (arq->as_io_context) { + BUG_ON(!atomic_read(&arq->as_io_context->nr_queued)); + atomic_dec(&arq->as_io_context->nr_queued); + } + + /* + * Update the "next_arq" cache if we are about to remove its + * entry + */ + if (ad->next_arq[data_dir] == arq) + ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + + list_del_init(&arq->fifo); + as_del_arq_hash(arq); + as_del_arq_rb(ad, arq); + } + +} + +/* + * as_remove_dispatched_request is called to remove a request which has gone + * to the dispatch list. + */ +static void as_remove_dispatched_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + + if (arq) { + struct as_io_context *aic; + + WARN_ON(arq->state != AS_RQ_DISPATCHED); + WARN_ON(ON_RB(&arq->rb_node)); + aic = arq->as_io_context; + if (aic) { + WARN_ON(!atomic_read(&aic->nr_dispatched)); + atomic_dec(&aic->nr_dispatched); + } + } +} +/* + * as_remove_request is called when a driver has finished with a request. + * This should be only called for dispatched requests, but for some reason + * a POWER4 box running hwscan it does not. + */ +static void as_remove_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + + if (unlikely(!blk_fs_request(rq) || rq->flags & REQ_HARDBARRIER)) + return; + + if (arq) { + if (ON_RB(&arq->rb_node)) + as_remove_queued_request(q, rq); + else + as_remove_dispatched_request(q, rq); + } +} + +/* + * as_fifo_expired returns 0 if there are no expired reads on the fifo, + * 1 otherwise. It is ratelimited so that we only perform the check once per + * `fifo_expire' interval. Otherwise a large number of expired requests + * would create a hopeless seekstorm. + * + * See as_antic_expired comment. + */ +static int as_fifo_expired(struct as_data *ad, int adir) +{ + struct as_rq *arq; + long delta_jif; + + delta_jif = jiffies - ad->last_check_fifo[adir]; + if (unlikely(delta_jif < 0)) + delta_jif = -delta_jif; + if (delta_jif < ad->fifo_expire[adir]) + return 0; + + ad->last_check_fifo[adir] = jiffies; + + if (list_empty(&ad->fifo_list[adir])) + return 0; + + arq = list_entry_fifo(ad->fifo_list[adir].next); + + return time_after(jiffies, arq->expires); +} + +/* + * as_batch_expired returns true if the current batch has expired. A batch + * is a set of reads or a set of writes. + */ +static inline int as_batch_expired(struct as_data *ad) +{ + return time_after(jiffies, ad->current_batch_expires); +} + +/* + * move an entry to dispatch queue + */ +static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) +{ + const int data_dir = rq_data_dir(arq->request); + + BUG_ON(!ON_RB(&arq->rb_node)); + + as_antic_stop(ad); + ad->antic_status = ANTIC_OFF; + + /* + * This has to be set in order to be correctly updated by + * as_find_next_arq + */ + ad->last_sector[data_dir] = arq->request->sector + + arq->request->nr_sectors; + + if (data_dir == READ) { + /* In case we have to anticipate after this */ + copy_as_io_context(&ad->as_io_context, &arq->as_io_context); + } else + put_as_io_context(&ad->as_io_context); + ad->aic_finished = 0; + + ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + + /* + * take it off the sort and fifo list, add to dispatch queue + */ + as_remove_queued_request(ad->q, arq->request); + list_add_tail(&arq->request->queuelist, ad->dispatch); + if (arq->as_io_context) + atomic_inc(&arq->as_io_context->nr_dispatched); + + WARN_ON(arq->state != AS_RQ_QUEUED); + arq->state = AS_RQ_DISPATCHED; +} + +/* + * as_dispatch_request selects the best request according to + * read/write expire, batch expire, etc, and moves it to the dispatch + * queue. Returns 1 if a request was found, 0 otherwise. + */ +static int as_dispatch_request(struct as_data *ad) +{ + struct as_rq *arq; + const int reads = !list_empty(&ad->fifo_list[READ]); + const int writes = !list_empty(&ad->fifo_list[WRITE]); + + if (!(reads || writes)) + return 0; + + if (!(reads && writes && as_batch_expired(ad)) ) { + /* + * batch is still running or no reads or no writes + */ + arq = ad->next_arq[ad->batch_data_dir]; + + if (ad->batch_data_dir == READ && ad->antic_expire) { + if (as_fifo_expired(ad, READ)) + goto fifo_expired; + + if (as_can_anticipate(ad, arq)) { + as_antic_waitreq(ad); + return 0; + } + } + + if (arq) { + /* we have a "next request" */ + if (reads && !writes) + ad->current_batch_expires = + jiffies + ad->batch_expire[READ]; + goto dispatch_request; + } + } + + /* + * at this point we are not running a batch. select the appropriate + * data direction (read / write) + */ + + if (reads) { + BUG_ON(RB_EMPTY(&ad->sort_list[READ])); + + if (writes && ad->batch_data_dir == READ) + /* + * Last batch was a read, switch to writes + */ + goto dispatch_writes; + + ad->batch_data_dir = READ; + arq = ad->next_arq[ad->batch_data_dir]; + ad->current_batch_expires = jiffies + + ad->batch_expire[ad->batch_data_dir]; + goto dispatch_request; + } + + /* + * the last batch was a read + */ + + if (writes) { +dispatch_writes: + BUG_ON(RB_EMPTY(&ad->sort_list[WRITE])); + + ad->batch_data_dir = WRITE; + arq = ad->next_arq[ad->batch_data_dir]; + ad->current_batch_expires = jiffies + + ad->batch_expire[ad->batch_data_dir]; + goto dispatch_request; + } + + BUG(); + return 0; + +dispatch_request: + /* + * If a request has expired, service it. + */ + + if (as_fifo_expired(ad, ad->batch_data_dir)) { +fifo_expired: + arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); + BUG_ON(arq == NULL); + } + + /* + * arq is the selected appropriate request. + */ + as_move_to_dispatch(ad, arq); + + return 1; +} + +static struct request *as_next_request(request_queue_t *q) +{ + struct as_data *ad = q->elevator.elevator_data; + struct request *rq = NULL; + + /* + * if there are still requests on the dispatch queue, grab the first + */ + if (!list_empty(ad->dispatch) || as_dispatch_request(ad)) + rq = list_entry_rq(ad->dispatch->next); + + return rq; +} + +/* + * add arq to rbtree and fifo + */ +static void as_add_request(struct as_data *ad, struct as_rq *arq) +{ + const int data_dir = rq_data_dir(arq->request); + + arq->as_io_context = get_as_io_context(); + + if (arq->as_io_context) { + atomic_inc(&arq->as_io_context->nr_queued); + as_update_iohist(arq->as_io_context, arq->request); + } + + as_add_arq_rb(ad, arq); + + /* + * set expire time (only used for reads) and add to fifo list + */ + arq->expires = jiffies + ad->fifo_expire[data_dir]; + list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]); + arq->state = AS_RQ_QUEUED; + as_update_arq(ad, arq); /* keep state machine up to date */ +} + +static void +as_insert_request(request_queue_t *q, struct request *rq, + struct list_head *insert_here) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(rq); + + if (unlikely(rq->flags & REQ_HARDBARRIER)) { + AS_INVALIDATE_HASH(ad); + q->last_merge = NULL; + + while (ad->next_arq[READ]) + as_move_to_dispatch(ad, ad->next_arq[READ]); + + while (ad->next_arq[WRITE]) + as_move_to_dispatch(ad, ad->next_arq[WRITE]); + + list_add_tail(&rq->queuelist, ad->dispatch); + + /* Stop anticipating - let this request get through */ + if (!list_empty(ad->dispatch) && rq_data_dir(rq) == READ + && (ad->antic_status == ANTIC_WAIT_REQ + || ad->antic_status == ANTIC_WAIT_NEXT)) + as_antic_stop(ad); + + return; + } + + if (unlikely(!blk_fs_request(rq))) { + if (!insert_here) + insert_here = ad->dispatch->prev; + + list_add(&rq->queuelist, insert_here); + + /* Stop anticipating - let this request get through */ + if (!list_empty(ad->dispatch) && rq_data_dir(rq) == READ + && (ad->antic_status == ANTIC_WAIT_REQ + || ad->antic_status == ANTIC_WAIT_NEXT)) + as_antic_stop(ad); + + return; + } + + if (rq_mergeable(rq)) { + as_add_arq_hash(ad, arq); + + if (!q->last_merge) + q->last_merge = &rq->queuelist; + } + + as_add_request(ad, arq); +} + +/* + * as_queue_empty tells us if there are requests left in the device. It may + * not be the case that a driver can get the next request even if the queue + * is not empty - it is used in the block layer to check for plugging and + * merging opportunities + */ +static int as_queue_empty(request_queue_t *q) +{ + struct as_data *ad = q->elevator.elevator_data; + + if (!list_empty(&ad->fifo_list[WRITE]) + || !list_empty(&ad->fifo_list[READ]) + || !list_empty(ad->dispatch)) + return 0; + + return 1; +} + +static struct request * +as_former_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + struct rb_node *rbprev = rb_prev(&arq->rb_node); + struct request *ret = NULL; + + if (rbprev) + ret = rb_entry_arq(rbprev)->request; + + return ret; +} + +static struct request * +as_latter_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + struct rb_node *rbnext = rb_next(&arq->rb_node); + struct request *ret = NULL; + + if (rbnext) + ret = rb_entry_arq(rbnext)->request; + + return ret; +} + +static int +as_merge(request_queue_t *q, struct list_head **insert, struct bio *bio) +{ + struct as_data *ad = q->elevator.elevator_data; + sector_t rb_key = bio->bi_sector + bio_sectors(bio); + struct request *__rq; + int ret; + + /* + * try last_merge to avoid going to hash + */ + ret = elv_try_last_merge(q, bio); + if (ret != ELEVATOR_NO_MERGE) { + __rq = list_entry_rq(q->last_merge); + goto out_insert; + } + + /* + * see if the merge hash can satisfy a back merge + */ + __rq = as_find_arq_hash(ad, bio->bi_sector); + if (__rq) { + BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); + + if (elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_BACK_MERGE; + goto out; + } + } + + /* + * check for front merge + */ + __rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio)); + if (__rq) { + BUG_ON(rb_key != rq_rb_key(__rq)); + + if (elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_FRONT_MERGE; + goto out; + } + } + + return ELEVATOR_NO_MERGE; +out: + q->last_merge = &__rq->queuelist; +out_insert: + if (ret) + as_hot_arq_hash(ad, RQ_DATA(__rq)); + *insert = &__rq->queuelist; + return ret; +} + +static void as_merged_request(request_queue_t *q, struct request *req) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(req); + + /* + * hash always needs to be repositioned, key is end sector + */ + as_del_arq_hash(arq); + as_add_arq_hash(ad, arq); + + /* + * if the merge was a front merge, we need to reposition request + */ + if (rq_rb_key(req) != arq->rb_key) { + as_del_arq_rb(ad, arq); + as_add_arq_rb(ad, arq); + /* + * Note! At this stage of this and the next function, our next + * request may not be optimal - eg the request may have "grown" + * behind the disk head. We currently don't bother adjusting. + */ + } + + q->last_merge = &req->queuelist; +} + +static void +as_merged_requests(request_queue_t *q, struct request *req, + struct request *next) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(req); + struct as_rq *anext = RQ_DATA(next); + + BUG_ON(!arq); + BUG_ON(!anext); + + /* + * reposition arq (this is the merged request) in hash, and in rbtree + * in case of a front merge + */ + as_del_arq_hash(arq); + as_add_arq_hash(ad, arq); + + if (rq_rb_key(req) != arq->rb_key) { + as_del_arq_rb(ad, arq); + as_add_arq_rb(ad, arq); + } + + /* + * if anext expires before arq, assign its expire time to arq + * and move into anext position (anext will be deleted) in fifo + */ + if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) { + if (time_before(anext->expires, arq->expires)) { + list_move(&arq->fifo, &anext->fifo); + arq->expires = anext->expires; + /* + * Don't copy here but swap, because when anext is + * removed below, it must contain the unused context + */ + swap_as_io_context(&arq->as_io_context, + &anext->as_io_context); + } + } + + /* + * kill knowledge of next, this one is a goner + */ + as_remove_queued_request(q, next); +/* put_as_io_context(&anext->as_io_context); no need to do this because as_completed_request takes care of it */ +} + +/* + * This is executed in a "deferred" process context, by kblockd. It calls the + * driver's request_fn so the driver can submit that request. + * + * IMPORTANT! This guy will reenter the elevator, so set up all queue global + * state before calling, and don't rely on any state over calls. + * + * FIXME! dispatch queue is not a queue at all! + */ +static void as_work_handler(void *data) +{ + struct request_queue *q = data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + if (as_next_request(q)) + q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void as_put_request(request_queue_t *q, struct request *rq) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(rq); + + if (arq) { + mempool_free(arq, ad->arq_pool); + rq->elevator_private = NULL; + } +} + +static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); + + if (arq) { + RB_CLEAR(&arq->rb_node); + arq->request = rq; + + arq->as_io_context = NULL; + INIT_LIST_HEAD(&arq->hash); + arq->hash_valid_count = 0; + + INIT_LIST_HEAD(&arq->fifo); + + rq->elevator_private = arq; + return 0; + } + + return 1; +} + +static void as_exit(request_queue_t *q, elevator_t *e) +{ + struct as_data *ad = e->elevator_data; + + del_timer_sync(&ad->antic_timer); + kblockd_flush(); + + BUG_ON(!list_empty(&ad->fifo_list[READ])); + BUG_ON(!list_empty(&ad->fifo_list[WRITE])); + + mempool_destroy(ad->arq_pool); + put_as_io_context(&ad->as_io_context); + kfree(ad->hash); + kfree(ad); +} + +/* + * initialize elevator private data (as_data), and alloc a arq for + * each request on the free lists + */ +static int as_init(request_queue_t *q, elevator_t *e) +{ + struct as_data *ad; + int i; + + if (!arq_pool) + return -ENOMEM; + + ad = kmalloc(sizeof(*ad), GFP_KERNEL); + if (!ad) + return -ENOMEM; + memset(ad, 0, sizeof(*ad)); + + ad->q = q; /* Identify what queue the data belongs to */ + + ad->hash = kmalloc(sizeof(struct list_head)*AS_HASH_ENTRIES,GFP_KERNEL); + if (!ad->hash) { + kfree(ad); + return -ENOMEM; + } + + ad->arq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, arq_pool); + if (!ad->arq_pool) { + kfree(ad->hash); + kfree(ad); + return -ENOMEM; + } + + /* anticipatory scheduling helpers */ + ad->antic_timer.function = as_antic_timeout; + ad->antic_timer.data = (unsigned long)q; + init_timer(&ad->antic_timer); + INIT_WORK(&ad->antic_work, as_work_handler, q); + + for (i = 0; i < AS_HASH_ENTRIES; i++) + INIT_LIST_HEAD(&ad->hash[i]); + + INIT_LIST_HEAD(&ad->fifo_list[READ]); + INIT_LIST_HEAD(&ad->fifo_list[WRITE]); + ad->sort_list[READ] = RB_ROOT; + ad->sort_list[WRITE] = RB_ROOT; + ad->dispatch = &q->queue_head; + ad->fifo_expire[READ] = read_expire; + ad->fifo_expire[WRITE] = write_expire; + ad->hash_valid_count = 1; + ad->antic_expire = antic_expire; + ad->batch_expire[READ] = read_batch_expire; + ad->batch_expire[WRITE] = write_batch_expire; + e->elevator_data = ad; + + ad->current_batch_expires = jiffies + ad->batch_expire[READ]; + return 0; +} + +/* + * sysfs parts below + */ +struct as_fs_entry { + struct attribute attr; + ssize_t (*show)(struct as_data *, char *); + ssize_t (*store)(struct as_data *, const char *, size_t); +}; + +static ssize_t +as_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t +as_var_store(unsigned long *var, const char *page, size_t count) +{ + char *p = (char *) page; + + *var = simple_strtoul(p, &p, 10); + return count; +} + +#define SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t __FUNC(struct as_data *ad, char *page) \ +{ \ + return as_var_show(__VAR, (page)); \ +} +SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[READ]); +SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[WRITE]); +SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire); +SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[READ]); +SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[WRITE]); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \ +{ \ + int ret = as_var_store(__PTR, (page), count); \ + if (*(__PTR) < (MIN)) \ + *(__PTR) = (MIN); \ + else if (*(__PTR) > (MAX)) \ + *(__PTR) = (MAX); \ + return ret; \ +} +STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[READ], 0, INT_MAX); +STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[WRITE], 0, INT_MAX); +STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX); +STORE_FUNCTION(as_read_batchexpire_store, + &ad->batch_expire[READ], 0, INT_MAX); +STORE_FUNCTION(as_write_batchexpire_store, + &ad->batch_expire[WRITE], 0, INT_MAX); +#undef STORE_FUNCTION + +static struct as_fs_entry as_readexpire_entry = { + .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_readexpire_show, + .store = as_readexpire_store, +}; +static struct as_fs_entry as_writeexpire_entry = { + .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_writeexpire_show, + .store = as_writeexpire_store, +}; +static struct as_fs_entry as_anticexpire_entry = { + .attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_anticexpire_show, + .store = as_anticexpire_store, +}; +static struct as_fs_entry as_read_batchexpire_entry = { + .attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_read_batchexpire_show, + .store = as_read_batchexpire_store, +}; +static struct as_fs_entry as_write_batchexpire_entry = { + .attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_write_batchexpire_show, + .store = as_write_batchexpire_store, +}; + +static struct attribute *default_attrs[] = { + &as_readexpire_entry.attr, + &as_writeexpire_entry.attr, + &as_anticexpire_entry.attr, + &as_read_batchexpire_entry.attr, + &as_write_batchexpire_entry.attr, + NULL, +}; + +#define to_as(atr) container_of((atr), struct as_fs_entry, attr) + +static ssize_t +as_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + elevator_t *e = container_of(kobj, elevator_t, kobj); + struct as_fs_entry *entry = to_as(attr); + + if (!entry->show) + return 0; + + return entry->show(e->elevator_data, page); +} + +static ssize_t +as_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + elevator_t *e = container_of(kobj, elevator_t, kobj); + struct as_fs_entry *entry = to_as(attr); + + if (!entry->store) + return -EINVAL; + + return entry->store(e->elevator_data, page, length); +} + +static struct sysfs_ops as_sysfs_ops = { + .show = as_attr_show, + .store = as_attr_store, +}; + +struct kobj_type as_ktype = { + .sysfs_ops = &as_sysfs_ops, + .default_attrs = default_attrs, +}; + +static int __init as_slab_setup(void) +{ + arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq), + 0, 0, NULL, NULL); + + if (!arq_pool) + panic("as: can't init slab pool\n"); + + return 0; +} + +subsys_initcall(as_slab_setup); + +elevator_t iosched_as = { + .elevator_merge_fn = as_merge, + .elevator_merged_fn = as_merged_request, + .elevator_merge_req_fn = as_merged_requests, + .elevator_next_req_fn = as_next_request, + .elevator_add_req_fn = as_insert_request, + .elevator_remove_req_fn = as_remove_request, + .elevator_queue_empty_fn = as_queue_empty, + .elevator_completed_req_fn = as_completed_request, + .elevator_former_req_fn = as_former_request, + .elevator_latter_req_fn = as_latter_request, + .elevator_set_req_fn = as_set_request, + .elevator_put_req_fn = as_put_request, + .elevator_init_fn = as_init, + .elevator_exit_fn = as_exit, + + .elevator_ktype = &as_ktype, +}; + +EXPORT_SYMBOL(iosched_as); diff -puN drivers/block/ll_rw_blk.c~as-iosched drivers/block/ll_rw_blk.c --- 25/drivers/block/ll_rw_blk.c~as-iosched 2003-05-25 23:20:51.000000000 -0700 +++ 25-akpm/drivers/block/ll_rw_blk.c 2003-05-25 23:21:08.000000000 -0700 @@ -983,7 +983,7 @@ static inline void __generic_unplug_devi /* * was plugged, fire request_fn if queue has stuff to do */ - if (!elv_queue_empty(q)) + if (elv_next_request(q)) q->request_fn(q); } @@ -1037,7 +1037,7 @@ void blk_start_queue(request_queue_t *q) unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - if (!elv_queue_empty(q)) + if (elv_next_request(q)) q->request_fn(q); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -1175,6 +1175,18 @@ static int blk_init_free_list(request_qu static int __make_request(request_queue_t *, struct bio *); +static elevator_t *chosen_elevator = &iosched_as; + +static int __init elevator_setup(char *str) +{ + if (!strcmp(str, "deadline")) { + chosen_elevator = &iosched_deadline; + printk("elevator: cfq\n"); + } + return 1; +} +__setup("elevator=", elevator_setup); + /** * blk_init_queue - prepare a request queue for use with a block device * @q: The &request_queue_t to be initialised @@ -1210,7 +1222,12 @@ int blk_init_queue(request_queue_t *q, r if (blk_init_free_list(q)) return -ENOMEM; - if ((ret = elevator_init(q, &iosched_deadline))) { + if (chosen_elevator == &iosched_deadline) + printk("deadline elevator\n"); + else if (chosen_elevator == &iosched_as) + printk("anticipatory scheduling elevator\n"); + + if ((ret = elevator_init(q, chosen_elevator))) { blk_cleanup_queue(q); return ret; } diff -puN drivers/block/Makefile~as-iosched drivers/block/Makefile --- 25/drivers/block/Makefile~as-iosched 2003-05-25 23:20:51.000000000 -0700 +++ 25-akpm/drivers/block/Makefile 2003-05-25 23:21:07.000000000 -0700 @@ -13,7 +13,8 @@ # kblockd threads # -obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o deadline-iosched.o +obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o \ + deadline-iosched.o as-iosched.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o diff -puN include/linux/elevator.h~as-iosched include/linux/elevator.h --- 25/include/linux/elevator.h~as-iosched 2003-05-25 23:20:51.000000000 -0700 +++ 25-akpm/include/linux/elevator.h 2003-05-25 23:21:07.000000000 -0700 @@ -89,6 +89,11 @@ extern elevator_t elevator_noop; */ extern elevator_t iosched_deadline; +/* + * anticipatory I/O scheduler + */ +extern elevator_t iosched_as; + extern int elevator_init(request_queue_t *, elevator_t *); extern void elevator_exit(request_queue_t *); extern inline int elv_rq_merge_ok(struct request *, struct bio *); diff -puN include/linux/sched.h~as-iosched include/linux/sched.h --- 25/include/linux/sched.h~as-iosched 2003-05-25 23:20:51.000000000 -0700 +++ 25-akpm/include/linux/sched.h 2003-05-25 23:21:08.000000000 -0700 @@ -320,6 +320,8 @@ struct k_itimer { }; +struct as_io_context; /* Anticipatory scheduler */ +void exit_as_io_context(void); struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -449,6 +451,8 @@ struct task_struct { struct dentry *proc_dentry; struct backing_dev_info *backing_dev_info; + struct as_io_context *as_io_context; + unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ }; diff -puN kernel/exit.c~as-iosched kernel/exit.c --- 25/kernel/exit.c~as-iosched 2003-05-25 23:20:51.000000000 -0700 +++ 25-akpm/kernel/exit.c 2003-05-25 23:20:51.000000000 -0700 @@ -680,6 +680,8 @@ NORET_TYPE void do_exit(long code) panic("Attempted to kill the idle task!"); if (unlikely(tsk->pid == 1)) panic("Attempted to kill init!"); + if (tsk->as_io_context) + exit_as_io_context(); tsk->flags |= PF_EXITING; del_timer_sync(&tsk->real_timer); diff -puN kernel/fork.c~as-iosched kernel/fork.c --- 25/kernel/fork.c~as-iosched 2003-05-25 23:20:51.000000000 -0700 +++ 25-akpm/kernel/fork.c 2003-05-25 23:20:51.000000000 -0700 @@ -857,6 +857,7 @@ struct task_struct *copy_process(unsigne p->lock_depth = -1; /* -1 = no lock */ p->start_time = get_jiffies_64(); p->security = NULL; + p->as_io_context = NULL; retval = -ENOMEM; if ((retval = security_task_alloc(p))) _