| author | Kent Overstreet <kent.overstreet@linux.dev> | 2024-01-16 17:00:02 -0500 |
|---|---|---|
| committer | Kent Overstreet <kent.overstreet@linux.dev> | 2024-01-16 17:17:23 -0500 |
| commit | b5fd066153c40a70a29caa1ea7987723ab687763 (patch) | |
| tree | 6d43a8b0a90d549a54c65565ac96c92b3e84b594 /linux | |
| parent | 06ff8b55b70fda44d91b31b5511fafd1680a8934 (diff) | |
Move c_src dirs back to toplevel
We just wanted C source files out of the top level, not C source
directories.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'linux')
34 files changed, 7773 insertions, 0 deletions
diff --git a/linux/atomic64.c b/linux/atomic64.c new file mode 100644 index 00000000..4654d092 --- /dev/null +++ b/linux/atomic64.c @@ -0,0 +1,188 @@ +/* + * Generic implementation of 64-bit atomics using spinlocks, + * useful on processors that don't have 64-bit atomic instructions. + * + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/types.h> +#include <linux/cache.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> + +#ifdef ATOMIC64_SPINLOCK + +/* + * We use a hashed array of spinlocks to provide exclusive access + * to each atomic64_t variable. Since this is expected to used on + * systems with small numbers of CPUs (<= 4 or so), we use a + * relatively small array of 16 spinlocks to avoid wasting too much + * memory on the spinlock array. + */ +#define NR_LOCKS 16 + +/* + * Ensure each lock is in a separate cacheline. + */ +static union { + raw_spinlock_t lock; + char pad[L1_CACHE_BYTES]; +} atomic64_lock[NR_LOCKS] ____cacheline_aligned_in_smp = { + [0 ... (NR_LOCKS - 1)] = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(atomic64_lock.lock), + }, +}; + +static inline raw_spinlock_t *lock_addr(const atomic64_t *v) +{ + unsigned long addr = (unsigned long) v; + + addr >>= L1_CACHE_SHIFT; + addr ^= (addr >> 8) ^ (addr >> 16); + return &atomic64_lock[addr & (NR_LOCKS - 1)].lock; +} + +long long atomic64_read(const atomic64_t *v) +{ + unsigned long flags; + raw_spinlock_t *lock = lock_addr(v); + long long val; + + raw_spin_lock_irqsave(lock, flags); + val = v->counter; + raw_spin_unlock_irqrestore(lock, flags); + return val; +} + +void atomic64_set(atomic64_t *v, long long i) +{ + unsigned long flags; + raw_spinlock_t *lock = lock_addr(v); + + raw_spin_lock_irqsave(lock, flags); + v->counter = i; + raw_spin_unlock_irqrestore(lock, flags); +} + +#define ATOMIC64_OP(op, c_op) \ +void atomic64_##op(long long a, atomic64_t *v) \ +{ \ + unsigned long flags; \ + raw_spinlock_t *lock = lock_addr(v); \ + \ + raw_spin_lock_irqsave(lock, flags); \ + v->counter c_op a; \ + raw_spin_unlock_irqrestore(lock, flags); \ +} + +#define ATOMIC64_OP_RETURN(op, c_op) \ +long long atomic64_##op##_return(long long a, atomic64_t *v) \ +{ \ + unsigned long flags; \ + raw_spinlock_t *lock = lock_addr(v); \ + long long val; \ + \ + raw_spin_lock_irqsave(lock, flags); \ + val = (v->counter c_op a); \ + raw_spin_unlock_irqrestore(lock, flags); \ + return val; \ +} + +#define ATOMIC64_FETCH_OP(op, c_op) \ +long long atomic64_fetch_##op(long long a, atomic64_t *v) \ +{ \ + unsigned long flags; \ + raw_spinlock_t *lock = lock_addr(v); \ + long long val; \ + \ + raw_spin_lock_irqsave(lock, flags); \ + val = v->counter; \ + v->counter c_op a; \ + raw_spin_unlock_irqrestore(lock, flags); \ + return val; \ +} + +#define ATOMIC64_OPS(op, c_op) \ + ATOMIC64_OP(op, c_op) \ + ATOMIC64_OP_RETURN(op, c_op) \ + ATOMIC64_FETCH_OP(op, c_op) + +ATOMIC64_OPS(add, +=) +ATOMIC64_OPS(sub, -=) + +#undef ATOMIC64_OPS +#define ATOMIC64_OPS(op, c_op) \ + ATOMIC64_OP(op, c_op) \ + ATOMIC64_OP_RETURN(op, c_op) \ + ATOMIC64_FETCH_OP(op, c_op) + +ATOMIC64_OPS(and, &=) +ATOMIC64_OPS(or, |=) +ATOMIC64_OPS(xor, ^=) + +#undef ATOMIC64_OPS +#undef ATOMIC64_FETCH_OP +#undef ATOMIC64_OP_RETURN +#undef ATOMIC64_OP + +long long atomic64_dec_if_positive(atomic64_t 
*v) +{ + unsigned long flags; + raw_spinlock_t *lock = lock_addr(v); + long long val; + + raw_spin_lock_irqsave(lock, flags); + val = v->counter - 1; + if (val >= 0) + v->counter = val; + raw_spin_unlock_irqrestore(lock, flags); + return val; +} + +long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n) +{ + unsigned long flags; + raw_spinlock_t *lock = lock_addr(v); + long long val; + + raw_spin_lock_irqsave(lock, flags); + val = v->counter; + if (val == o) + v->counter = n; + raw_spin_unlock_irqrestore(lock, flags); + return val; +} + +long long atomic64_xchg(atomic64_t *v, long long new) +{ + unsigned long flags; + raw_spinlock_t *lock = lock_addr(v); + long long val; + + raw_spin_lock_irqsave(lock, flags); + val = v->counter; + v->counter = new; + raw_spin_unlock_irqrestore(lock, flags); + return val; +} + +int atomic64_add_unless(atomic64_t *v, long long a, long long u) +{ + unsigned long flags; + raw_spinlock_t *lock = lock_addr(v); + int ret = 0; + + raw_spin_lock_irqsave(lock, flags); + if (v->counter != u) { + v->counter += a; + ret = 1; + } + raw_spin_unlock_irqrestore(lock, flags); + return ret; +} + +#endif diff --git a/linux/bio.c b/linux/bio.c new file mode 100644 index 00000000..93a791c4 --- /dev/null +++ b/linux/bio.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + * + */ +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/kernel.h> + +static const struct { + int err; + const char *name; +} blk_errors[] = { + [BLK_STS_OK] = { 0, "" }, + [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, + [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, + [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, + [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, + [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, + [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, + [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, + [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, + [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, + [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, + + /* device mapper special case, should not leak out: */ + [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, + + /* everything else not covered above: */ + [BLK_STS_IOERR] = { -EIO, "I/O" }, +}; + +int blk_status_to_errno(blk_status_t status) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return -EIO; + return blk_errors[idx].err; +} + +const char *blk_status_to_str(blk_status_t status) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return "(invalid error)"; + return blk_errors[idx].name; +} + +void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + struct bio *src, struct bvec_iter *src_iter) +{ + struct bio_vec src_bv, dst_bv; + void *src_p, *dst_p; + unsigned bytes; + + while (src_iter->bi_size && dst_iter->bi_size) { + src_bv = bio_iter_iovec(src, *src_iter); + dst_bv = bio_iter_iovec(dst, *dst_iter); + + bytes = min(src_bv.bv_len, dst_bv.bv_len); + + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); + + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, + bytes); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + flush_dcache_page(dst_bv.bv_page); + + bio_advance_iter(src, src_iter, bytes); + bio_advance_iter(dst, dst_iter, bytes); + } +} + +/** + * bio_copy_data - copy contents of data buffers from one bio to another + * @src: source bio + * @dst: destination bio + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). 
+ */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + struct bvec_iter src_iter = src->bi_iter; + struct bvec_iter dst_iter = dst->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); +} + +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) +{ + unsigned long flags; + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, bio, iter, start) { + char *data = bvec_kmap_irq(&bv, &flags); + memset(data, 0, bv.bv_len); + bvec_kunmap_irq(data, &flags); + } +} + +static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) +{ + bio_set_flag(bio, BIO_CLONED); + bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_iter = bio_src->bi_iter; + return 0; +} + +struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, + gfp_t gfp, struct bio_set *bs) +{ + struct bio *bio; + + bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); + if (!bio) + return NULL; + + if (__bio_clone(bio, bio_src, gfp) < 0) { + bio_put(bio); + return NULL; + } + bio->bi_io_vec = bio_src->bi_io_vec; + + return bio; +} + +struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) +{ + struct bio *split = NULL; + + BUG_ON(sectors <= 0); + BUG_ON(sectors >= bio_sectors(bio)); + + split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); + if (!split) + return NULL; + + split->bi_iter.bi_size = sectors << 9; + + bio_advance(bio, split->bi_iter.bi_size); + + return split; +} + +void bio_free_pages(struct bio *bio) +{ + struct bvec_iter_all iter; + struct bio_vec *bvec; + + bio_for_each_segment_all(bvec, bio, iter) + __free_page(bvec->bv_page); +} + +void bio_advance(struct bio *bio, unsigned bytes) +{ + bio_advance_iter(bio, &bio->bi_iter, bytes); +} + +static void bio_free(struct bio *bio) +{ + struct bio_set *bs = bio->bi_pool; + + if (bs) { + if (bio->bi_max_vecs > BIO_INLINE_VECS) + mempool_free(bio->bi_io_vec, &bs->bvec_pool); + + mempool_free((void *) bio - bs->front_pad, &bs->bio_pool); + } else { + kfree(bio); + } +} + +void bio_put(struct bio *bio) +{ + if (!bio_flagged(bio, BIO_REFFED)) + bio_free(bio); + else { + BUG_ON(!atomic_read(&bio->__bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->__bi_cnt)) + bio_free(bio); + } +} + +int bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off) +{ + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; + + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs); + + bv->bv_page = page; + bv->bv_offset = off; + bv->bv_len = len; + + bio->bi_iter.bi_size += len; + bio->bi_vcnt++; + return len; +} + +static inline bool bio_remaining_done(struct bio *bio) +{ + /* + * If we're not chaining, then ->__bi_remaining is always 1 and + * we always end io on the first invocation. 
+ */ + if (!bio_flagged(bio, BIO_CHAIN)) + return true; + + BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); + + if (atomic_dec_and_test(&bio->__bi_remaining)) { + bio_clear_flag(bio, BIO_CHAIN); + return true; + } + + return false; +} + +static struct bio *__bio_chain_endio(struct bio *bio) +{ + struct bio *parent = bio->bi_private; + + if (!parent->bi_status) + parent->bi_status = bio->bi_status; + bio_put(bio); + return parent; +} + +static void bio_chain_endio(struct bio *bio) +{ + bio_endio(__bio_chain_endio(bio)); +} + +void bio_endio(struct bio *bio) +{ +again: + if (!bio_remaining_done(bio)) + return; + + /* + * Need to have a real endio function for chained bios, otherwise + * various corner cases will break (like stacking block devices that + * save/restore bi_end_io) - however, we want to avoid unbounded + * recursion and blowing the stack. Tail call optimization would + * handle this, but compiling with frame pointers also disables + * gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + bio = __bio_chain_endio(bio); + goto again; + } + + if (bio->bi_end_io) + bio->bi_end_io(bio); +} + +void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf) +{ + unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); + + memset(bio, 0, BIO_RESET_BYTES); + bio->bi_bdev = bdev; + bio->bi_opf = opf; + bio->bi_flags = flags; + atomic_set(&bio->__bi_remaining, 1); +} + +struct bio *bio_kmalloc(unsigned int nr_iovecs, gfp_t gfp_mask) +{ + struct bio *bio; + + bio = kmalloc(sizeof(struct bio) + + sizeof(struct bio_vec) * nr_iovecs, gfp_mask); + if (unlikely(!bio)) + return NULL; + bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, 0); + bio->bi_pool = NULL; + return bio; +} + +static struct bio_vec *bvec_alloc(mempool_t *pool, int *nr_vecs, + gfp_t gfp_mask) +{ + *nr_vecs = roundup_pow_of_two(*nr_vecs); + /* + * Try a slab allocation first for all smaller allocations. If that + * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. + * The mempool is sized to handle up to BIO_MAX_VECS entries. 
+ */ + if (*nr_vecs < BIO_MAX_VECS) { + struct bio_vec *bvl; + + bvl = kmalloc(sizeof(*bvl) * *nr_vecs, gfp_mask); + if (likely(bvl)) + return bvl; + *nr_vecs = BIO_MAX_VECS; + } + + return mempool_alloc(pool, gfp_mask); +} + +struct bio *bio_alloc_bioset(struct block_device *bdev, + unsigned nr_iovecs, + unsigned opf, + gfp_t gfp_mask, + struct bio_set *bs) +{ + struct bio *bio; + void *p; + + if (nr_iovecs > BIO_MAX_VECS) + return NULL; + + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (unlikely(!p)) + return NULL; + + bio = p + bs->front_pad; + if (nr_iovecs > BIO_INLINE_VECS) { + struct bio_vec *bvl = NULL; + + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + if (unlikely(!bvl)) + goto err_free; + + bio_init(bio, bdev, bvl, nr_iovecs, opf); + } else if (nr_iovecs) { + bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); + } else { + bio_init(bio, bdev, NULL, 0, opf); + } + + bio->bi_pool = bs; + return bio; + +err_free: + mempool_free(p, &bs->bio_pool); + return NULL; +} + +void bioset_exit(struct bio_set *bs) +{ + mempool_exit(&bs->bio_pool); + mempool_exit(&bs->bvec_pool); +} + +int bioset_init(struct bio_set *bs, + unsigned int pool_size, + unsigned int front_pad, + int flags) +{ + int ret; + + bs->front_pad = front_pad; + if (flags & BIOSET_NEED_BVECS) + bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + else + bs->back_pad = 0; + + ret = mempool_init_kmalloc_pool(&bs->bio_pool, pool_size, bs->front_pad + + sizeof(struct bio) + bs->back_pad) ?: + mempool_init_kmalloc_pool(&bs->bvec_pool, pool_size, + sizeof(struct bio_vec) * BIO_MAX_VECS); + if (ret) + bioset_exit(bs); + return ret; +} diff --git a/linux/blkdev.c b/linux/blkdev.c new file mode 100644 index 00000000..61f23362 --- /dev/null +++ b/linux/blkdev.c @@ -0,0 +1,428 @@ + +#include <alloca.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <unistd.h> + +#include <libaio.h> + +#ifdef CONFIG_VALGRIND +#include <valgrind/memcheck.h> +#endif + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/completion.h> +#include <linux/fs.h> +#include <linux/kthread.h> + +#include "tools-util.h" + +struct fops { + void (*init)(void); + void (*cleanup)(void); + void (*read)(struct bio *bio, struct iovec * iov, unsigned i); + void (*write)(struct bio *bio, struct iovec * iov, unsigned i); +}; + +static struct fops *fops; +static io_context_t aio_ctx; +static atomic_t running_requests; + +void generic_make_request(struct bio *bio) +{ + struct iovec *iov; + struct bvec_iter iter; + struct bio_vec bv; + ssize_t ret; + unsigned i; + + if (bio->bi_opf & REQ_PREFLUSH) { + ret = fdatasync(bio->bi_bdev->bd_fd); + if (ret) { + fprintf(stderr, "fsync error: %m\n"); + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return; + } + } + + i = 0; + bio_for_each_segment(bv, bio, iter) + i++; + + iov = alloca(sizeof(*iov) * i); + + i = 0; + bio_for_each_segment(bv, bio, iter) { + void *start = page_address(bv.bv_page) + bv.bv_offset; + size_t len = bv.bv_len; + + iov[i++] = (struct iovec) { + .iov_base = start, + .iov_len = len, + }; + +#ifdef CONFIG_VALGRIND + /* To be pedantic it should only be on IO completion. 
*/ + if (bio_op(bio) == REQ_OP_READ) + VALGRIND_MAKE_MEM_DEFINED(start, len); +#endif + } + + switch (bio_op(bio)) { + case REQ_OP_READ: + fops->read(bio, iov, i); + break; + case REQ_OP_WRITE: + fops->write(bio, iov, i); + break; + case REQ_OP_FLUSH: + ret = fsync(bio->bi_bdev->bd_fd); + if (ret) + die("fsync error: %m"); + bio_endio(bio); + break; + default: + BUG(); + } +} + +static void submit_bio_wait_endio(struct bio *bio) +{ + complete(bio->bi_private); +} + +int submit_bio_wait(struct bio *bio) +{ + struct completion done; + + init_completion(&done); + bio->bi_private = &done; + bio->bi_end_io = submit_bio_wait_endio; + bio->bi_opf |= REQ_SYNC; + submit_bio(bio); + wait_for_completion(&done); + + return blk_status_to_errno(bio->bi_status); +} + +int blkdev_issue_discard(struct block_device *bdev, + sector_t sector, sector_t nr_sects, + gfp_t gfp_mask) +{ + return 0; +} + +int blkdev_issue_zeroout(struct block_device *bdev, + sector_t sector, sector_t nr_sects, + gfp_t gfp_mask, unsigned flags) +{ + /* Not yet implemented: */ + BUG(); +} + +unsigned bdev_logical_block_size(struct block_device *bdev) +{ + struct stat statbuf; + unsigned blksize; + int ret; + + ret = fstat(bdev->bd_fd, &statbuf); + BUG_ON(ret); + + if (!S_ISBLK(statbuf.st_mode)) + return statbuf.st_blksize; + + xioctl(bdev->bd_fd, BLKPBSZGET, &blksize); + return blksize; +} + +sector_t get_capacity(struct gendisk *disk) +{ + struct block_device *bdev = + container_of(disk, struct block_device, __bd_disk); + struct stat statbuf; + u64 bytes; + int ret; + + ret = fstat(bdev->bd_fd, &statbuf); + BUG_ON(ret); + + if (!S_ISBLK(statbuf.st_mode)) + return statbuf.st_size >> 9; + + ret = ioctl(bdev->bd_fd, BLKGETSIZE64, &bytes); + BUG_ON(ret); + + return bytes >> 9; +} + +void blkdev_put(struct block_device *bdev, void *holder) +{ + fdatasync(bdev->bd_fd); + close(bdev->bd_fd); + free(bdev); +} + +struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hop) +{ + struct block_device *bdev; + int fd, flags = 0; + + if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE)) + flags = O_RDWR; + else if (mode & BLK_OPEN_READ) + flags = O_RDONLY; + else if (mode & BLK_OPEN_WRITE) + flags = O_WRONLY; + + if (!(mode & BLK_OPEN_BUFFERED)) + flags |= O_DIRECT; + + if (mode & BLK_OPEN_EXCL) + flags |= O_EXCL; + + fd = open(path, flags); + if (fd < 0) + return ERR_PTR(-errno); + + bdev = malloc(sizeof(*bdev)); + memset(bdev, 0, sizeof(*bdev)); + + strncpy(bdev->name, path, sizeof(bdev->name)); + bdev->name[sizeof(bdev->name) - 1] = '\0'; + + bdev->bd_dev = xfstat(fd).st_rdev; + bdev->bd_fd = fd; + bdev->bd_holder = holder; + bdev->bd_disk = &bdev->__bd_disk; + bdev->bd_disk->bdi = &bdev->bd_disk->__bdi; + bdev->queue.backing_dev_info = bdev->bd_disk->bdi; + + return bdev; +} + +void bdput(struct block_device *bdev) +{ + BUG(); +} + +int lookup_bdev(const char *path, dev_t *dev) +{ + return -EINVAL; +} + +static void io_fallback(void) +{ + fops++; + if (fops->init == NULL) + die("no fallback possible, something is very wrong"); + fops->init(); +} + +static void sync_check(struct bio *bio, int ret) +{ + if (ret != bio->bi_iter.bi_size) { + die("IO error: %s\n", strerror(-ret)); + } + + if (bio->bi_opf & REQ_FUA) { + ret = fdatasync(bio->bi_bdev->bd_fd); + if (ret) + die("fsync error: %s\n", strerror(-ret)); + } + bio_endio(bio); +} + +static void sync_init(void) {} + +static void sync_cleanup(void) +{ + /* not necessary? 
*/ + sync(); +} + +static void sync_read(struct bio *bio, struct iovec * iov, unsigned i) +{ + + ssize_t ret = preadv(bio->bi_bdev->bd_fd, iov, i, + bio->bi_iter.bi_sector << 9); + sync_check(bio, ret); +} + +static void sync_write(struct bio *bio, struct iovec * iov, unsigned i) +{ + ssize_t ret = pwritev2(bio->bi_bdev->bd_fd, iov, i, + bio->bi_iter.bi_sector << 9, + bio->bi_opf & REQ_FUA ? RWF_SYNC : 0); + sync_check(bio, ret); +} + +static DECLARE_WAIT_QUEUE_HEAD(aio_events_completed); + +static int aio_completion_thread(void *arg) +{ + struct io_event events[8], *ev; + int ret; + bool stop = false; + + while (!stop) { + ret = io_getevents(aio_ctx, 1, ARRAY_SIZE(events), + events, NULL); + + if (ret < 0 && ret == -EINTR) + continue; + if (ret < 0) + die("io_getevents() error: %s", strerror(-ret)); + if (ret) + wake_up(&aio_events_completed); + + for (ev = events; ev < events + ret; ev++) { + struct bio *bio = (struct bio *) ev->data; + + /* This should only happen during blkdev_cleanup() */ + if (!bio) { + BUG_ON(atomic_read(&running_requests) != 0); + stop = true; + continue; + } + + if (ev->res != bio->bi_iter.bi_size) + bio->bi_status = BLK_STS_IOERR; + + bio_endio(bio); + atomic_dec(&running_requests); + } + } + + return 0; +} + +static struct task_struct *aio_task = NULL; + +static void aio_init(void) +{ + struct task_struct *p; + long err = io_setup(256, &aio_ctx); + if (!err) { + p = kthread_run(aio_completion_thread, NULL, "aio_completion"); + BUG_ON(IS_ERR(p)); + + aio_task = p; + + } else if (err == -ENOSYS) { + io_fallback(); + } else { + die("io_setup() error: %s", strerror(err)); + } +} + +static void aio_cleanup(void) +{ + struct task_struct *p = NULL; + swap(aio_task, p); + get_task_struct(p); + + /* I mean, really?! IO_CMD_NOOP is even defined, but not implemented. */ + int fds[2]; + int ret = pipe(fds); + if (ret != 0) + die("pipe err: %s", strerror(ret)); + + /* Wake up the completion thread with spurious work. */ + int junk = 0; + struct iocb iocb = { + .aio_lio_opcode = IO_CMD_PWRITE, + .data = NULL, /* Signal to stop */ + .aio_fildes = fds[1], + .u.c.buf = &junk, + .u.c.nbytes = 1, + }, *iocbp = &iocb; + ret = io_submit(aio_ctx, 1, &iocbp); + if (ret != 1) + die("io_submit cleanup err: %s", strerror(-ret)); + + ret = kthread_stop(p); + BUG_ON(ret); + + put_task_struct(p); + + close(fds[0]); + close(fds[1]); +} + +static void aio_op(struct bio *bio, struct iovec *iov, unsigned i, int opcode) +{ + ssize_t ret; + struct iocb iocb = { + .data = bio, + .aio_fildes = bio->bi_bdev->bd_fd, + .aio_rw_flags = bio->bi_opf & REQ_FUA ? 
RWF_SYNC : 0, + .aio_lio_opcode = opcode, + .u.c.buf = iov, + .u.c.nbytes = i, + .u.c.offset = bio->bi_iter.bi_sector << 9, + + }, *iocbp = &iocb; + + atomic_inc(&running_requests); + + wait_event(aio_events_completed, + (ret = io_submit(aio_ctx, 1, &iocbp)) != -EAGAIN);; + + if (ret != 1) + die("io_submit err: %s", strerror(-ret)); +} + +static void aio_read(struct bio *bio, struct iovec *iov, unsigned i) +{ + aio_op(bio, iov, i, IO_CMD_PREADV); +} + +static void aio_write(struct bio *bio, struct iovec * iov, unsigned i) +{ + aio_op(bio, iov, i, IO_CMD_PWRITEV); +} + + +/* not implemented */ +static void uring_init(void) { + io_fallback(); +} + +struct fops fops_list[] = { + { + .init = uring_init, + }, { + .init = aio_init, + .cleanup = aio_cleanup, + .read = aio_read, + .write = aio_write, + }, { + .init = sync_init, + .cleanup = sync_cleanup, + .read = sync_read, + .write = sync_write, + }, { + /* NULL */ + } +}; + +__attribute__((constructor(102))) +static void blkdev_init(void) +{ + fops = fops_list; + fops->init(); +} + +__attribute__((destructor(102))) +static void blkdev_cleanup(void) +{ + fops->cleanup(); +} diff --git a/linux/closure.c b/linux/closure.c new file mode 100644 index 00000000..c1654055 --- /dev/null +++ b/linux/closure.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Asynchronous refcounty things + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include <linux/closure.h> +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/sched/debug.h> + +static inline void closure_put_after_sub(struct closure *cl, int flags) +{ + int r = flags & CLOSURE_REMAINING_MASK; + + BUG_ON(flags & CLOSURE_GUARD_MASK); + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); + + if (!r) { + smp_acquire__after_ctrl_dep(); + + cl->closure_get_happened = false; + + if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { + atomic_set(&cl->remaining, + CLOSURE_REMAINING_INITIALIZER); + closure_queue(cl); + } else { + struct closure *parent = cl->parent; + closure_fn *destructor = cl->fn; + + closure_debug_destroy(cl); + + if (destructor) + destructor(&cl->work); + + if (parent) + closure_put(parent); + } + } +} + +/* For clearing flags with the same atomic op as a put */ +void closure_sub(struct closure *cl, int v) +{ + closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining)); +} +EXPORT_SYMBOL(closure_sub); + +/* + * closure_put - decrement a closure's refcount + */ +void closure_put(struct closure *cl) +{ + closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining)); +} +EXPORT_SYMBOL(closure_put); + +/* + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ +void __closure_wake_up(struct closure_waitlist *wait_list) +{ + struct llist_node *list; + struct closure *cl, *t; + struct llist_node *reverse = NULL; + + list = llist_del_all(&wait_list->list); + + /* We first reverse the list to preserve FIFO ordering and fairness */ + reverse = llist_reverse_order(list); + + /* Then do the wakeups */ + llist_for_each_entry_safe(cl, t, reverse, list) { + closure_set_waiting(cl, 0); + closure_sub(cl, CLOSURE_WAITING + 1); + } +} +EXPORT_SYMBOL(__closure_wake_up); + +/** + * closure_wait - add a closure to a waitlist + * @waitlist: will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * @cl: closure pointer. 
+ * + */ +bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +{ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + return false; + + cl->closure_get_happened = true; + closure_set_waiting(cl, _RET_IP_); + atomic_add(CLOSURE_WAITING + 1, &cl->remaining); + llist_add(&cl->list, &waitlist->list); + + return true; +} +EXPORT_SYMBOL(closure_wait); + +struct closure_syncer { + struct task_struct *task; + int done; +}; + +static CLOSURE_CALLBACK(closure_sync_fn) +{ + struct closure *cl = container_of(ws, struct closure, work); + struct closure_syncer *s = cl->s; + struct task_struct *p; + + rcu_read_lock(); + p = READ_ONCE(s->task); + s->done = 1; + wake_up_process(p); + rcu_read_unlock(); +} + +void __sched __closure_sync(struct closure *cl) +{ + struct closure_syncer s = { .task = current }; + + cl->s = &s; + continue_at(cl, closure_sync_fn, NULL); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (s.done) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL(__closure_sync); + +#ifdef CONFIG_DEBUG_CLOSURES + +static LIST_HEAD(closure_list); +static DEFINE_SPINLOCK(closure_list_lock); + +void closure_debug_create(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_ALIVE; + + spin_lock_irqsave(&closure_list_lock, flags); + list_add(&cl->all, &closure_list); + spin_unlock_irqrestore(&closure_list_lock, flags); +} +EXPORT_SYMBOL(closure_debug_create); + +void closure_debug_destroy(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_DEAD; + + spin_lock_irqsave(&closure_list_lock, flags); + list_del(&cl->all); + spin_unlock_irqrestore(&closure_list_lock, flags); +} +EXPORT_SYMBOL(closure_debug_destroy); + +static int debug_show(struct seq_file *f, void *data) +{ + struct closure *cl; + + spin_lock_irq(&closure_list_lock); + + list_for_each_entry(cl, &closure_list, all) { + int r = atomic_read(&cl->remaining); + + seq_printf(f, "%p: %pS -> %pS p %p r %i ", + cl, (void *) cl->ip, cl->fn, cl->parent, + r & CLOSURE_REMAINING_MASK); + + seq_printf(f, "%s%s\n", + test_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(&cl->work)) ? "Q" : "", + r & CLOSURE_RUNNING ? "R" : ""); + + if (r & CLOSURE_WAITING) + seq_printf(f, " W %pS\n", + (void *) cl->waiting_on); + + seq_puts(f, "\n"); + } + + spin_unlock_irq(&closure_list_lock); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(debug); + +static int __init closure_debug_init(void) +{ + debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops); + return 0; +} +late_initcall(closure_debug_init) + +#endif diff --git a/linux/crc64.c b/linux/crc64.c new file mode 100644 index 00000000..0ef8ae6a --- /dev/null +++ b/linux/crc64.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Normal 64-bit CRC calculation. + * + * This is a basic crc64 implementation following ECMA-182 specification, + * which can be found from, + * http://www.ecma-international.org/publications/standards/Ecma-182.htm + * + * Dr. Ross N. Williams has a great document to introduce the idea of CRC + * algorithm, here the CRC64 code is also inspired by the table-driven + * algorithm and detail example from this paper. This paper can be found + * from, + * http://www.ross.net/crc/download/crc_v3.txt + * + * crc64table[256] is the lookup table of a table-driven 64-bit CRC + * calculation, which is generated by gen_crc64table.c in kernel build + * time. 
The polynomial of crc64 arithmetic is from ECMA-182 specification + * as well, which is defined as, + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 + * + * Copyright 2018 SUSE Linux. + * Author: Coly Li <colyli@suse.de> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include "crc64table.h" + +MODULE_DESCRIPTION("CRC64 calculations"); +MODULE_LICENSE("GPL v2"); + +/** + * crc64_be - Calculate bitwise big-endian ECMA-182 CRC64 + * @crc: seed value for computation. 0 or (u64)~0 for a new CRC calculation, + or the previous crc64 value if computing incrementally. + * @p: pointer to buffer over which CRC64 is run + * @len: length of buffer @p + */ +u64 __pure crc64_be(u64 crc, const void *p, size_t len) +{ + size_t i, t; + + const unsigned char *_p = p; + + for (i = 0; i < len; i++) { + t = ((crc >> 56) ^ (*_p++)) & 0xFF; + crc = crc64table[t] ^ (crc << 8); + } + + return crc; +} +EXPORT_SYMBOL_GPL(crc64_be); diff --git a/linux/crc64table.h b/linux/crc64table.h new file mode 100644 index 00000000..9964164d --- /dev/null +++ b/linux/crc64table.h @@ -0,0 +1,135 @@ +/* this file is generated - do not edit */ + +#include <linux/types.h> +#include <linux/cache.h> + +static const u64 ____cacheline_aligned crc64table[256] = { + 0x0000000000000000ULL, 0x42f0e1eba9ea3693ULL, + 0x85e1c3d753d46d26ULL, 0xc711223cfa3e5bb5ULL, + 0x493366450e42ecdfULL, 0x0bc387aea7a8da4cULL, + 0xccd2a5925d9681f9ULL, 0x8e224479f47cb76aULL, + 0x9266cc8a1c85d9beULL, 0xd0962d61b56fef2dULL, + 0x17870f5d4f51b498ULL, 0x5577eeb6e6bb820bULL, + 0xdb55aacf12c73561ULL, 0x99a54b24bb2d03f2ULL, + 0x5eb4691841135847ULL, 0x1c4488f3e8f96ed4ULL, + 0x663d78ff90e185efULL, 0x24cd9914390bb37cULL, + 0xe3dcbb28c335e8c9ULL, 0xa12c5ac36adfde5aULL, + 0x2f0e1eba9ea36930ULL, 0x6dfeff5137495fa3ULL, + 0xaaefdd6dcd770416ULL, 0xe81f3c86649d3285ULL, + 0xf45bb4758c645c51ULL, 0xb6ab559e258e6ac2ULL, + 0x71ba77a2dfb03177ULL, 0x334a9649765a07e4ULL, + 0xbd68d2308226b08eULL, 0xff9833db2bcc861dULL, + 0x388911e7d1f2dda8ULL, 0x7a79f00c7818eb3bULL, + 0xcc7af1ff21c30bdeULL, 0x8e8a101488293d4dULL, + 0x499b3228721766f8ULL, 0x0b6bd3c3dbfd506bULL, + 0x854997ba2f81e701ULL, 0xc7b97651866bd192ULL, + 0x00a8546d7c558a27ULL, 0x4258b586d5bfbcb4ULL, + 0x5e1c3d753d46d260ULL, 0x1cecdc9e94ace4f3ULL, + 0xdbfdfea26e92bf46ULL, 0x990d1f49c77889d5ULL, + 0x172f5b3033043ebfULL, 0x55dfbadb9aee082cULL, + 0x92ce98e760d05399ULL, 0xd03e790cc93a650aULL, + 0xaa478900b1228e31ULL, 0xe8b768eb18c8b8a2ULL, + 0x2fa64ad7e2f6e317ULL, 0x6d56ab3c4b1cd584ULL, + 0xe374ef45bf6062eeULL, 0xa1840eae168a547dULL, + 0x66952c92ecb40fc8ULL, 0x2465cd79455e395bULL, + 0x3821458aada7578fULL, 0x7ad1a461044d611cULL, + 0xbdc0865dfe733aa9ULL, 0xff3067b657990c3aULL, + 0x711223cfa3e5bb50ULL, 0x33e2c2240a0f8dc3ULL, + 0xf4f3e018f031d676ULL, 0xb60301f359dbe0e5ULL, + 0xda050215ea6c212fULL, 0x98f5e3fe438617bcULL, + 0x5fe4c1c2b9b84c09ULL, 0x1d14202910527a9aULL, + 0x93366450e42ecdf0ULL, 0xd1c685bb4dc4fb63ULL, + 0x16d7a787b7faa0d6ULL, 0x5427466c1e109645ULL, + 0x4863ce9ff6e9f891ULL, 0x0a932f745f03ce02ULL, + 0xcd820d48a53d95b7ULL, 0x8f72eca30cd7a324ULL, + 0x0150a8daf8ab144eULL, 0x43a04931514122ddULL, + 0x84b16b0dab7f7968ULL, 0xc6418ae602954ffbULL, + 0xbc387aea7a8da4c0ULL, 0xfec89b01d3679253ULL, + 0x39d9b93d2959c9e6ULL, 0x7b2958d680b3ff75ULL, + 0xf50b1caf74cf481fULL, 0xb7fbfd44dd257e8cULL, + 0x70eadf78271b2539ULL, 
0x321a3e938ef113aaULL, + 0x2e5eb66066087d7eULL, 0x6cae578bcfe24bedULL, + 0xabbf75b735dc1058ULL, 0xe94f945c9c3626cbULL, + 0x676dd025684a91a1ULL, 0x259d31cec1a0a732ULL, + 0xe28c13f23b9efc87ULL, 0xa07cf2199274ca14ULL, + 0x167ff3eacbaf2af1ULL, 0x548f120162451c62ULL, + 0x939e303d987b47d7ULL, 0xd16ed1d631917144ULL, + 0x5f4c95afc5edc62eULL, 0x1dbc74446c07f0bdULL, + 0xdaad56789639ab08ULL, 0x985db7933fd39d9bULL, + 0x84193f60d72af34fULL, 0xc6e9de8b7ec0c5dcULL, + 0x01f8fcb784fe9e69ULL, 0x43081d5c2d14a8faULL, + 0xcd2a5925d9681f90ULL, 0x8fdab8ce70822903ULL, + 0x48cb9af28abc72b6ULL, 0x0a3b7b1923564425ULL, + 0x70428b155b4eaf1eULL, 0x32b26afef2a4998dULL, + 0xf5a348c2089ac238ULL, 0xb753a929a170f4abULL, + 0x3971ed50550c43c1ULL, 0x7b810cbbfce67552ULL, + 0xbc902e8706d82ee7ULL, 0xfe60cf6caf321874ULL, + 0xe224479f47cb76a0ULL, 0xa0d4a674ee214033ULL, + 0x67c58448141f1b86ULL, 0x253565a3bdf52d15ULL, + 0xab1721da49899a7fULL, 0xe9e7c031e063acecULL, + 0x2ef6e20d1a5df759ULL, 0x6c0603e6b3b7c1caULL, + 0xf6fae5c07d3274cdULL, 0xb40a042bd4d8425eULL, + 0x731b26172ee619ebULL, 0x31ebc7fc870c2f78ULL, + 0xbfc9838573709812ULL, 0xfd39626eda9aae81ULL, + 0x3a28405220a4f534ULL, 0x78d8a1b9894ec3a7ULL, + 0x649c294a61b7ad73ULL, 0x266cc8a1c85d9be0ULL, + 0xe17dea9d3263c055ULL, 0xa38d0b769b89f6c6ULL, + 0x2daf4f0f6ff541acULL, 0x6f5faee4c61f773fULL, + 0xa84e8cd83c212c8aULL, 0xeabe6d3395cb1a19ULL, + 0x90c79d3fedd3f122ULL, 0xd2377cd44439c7b1ULL, + 0x15265ee8be079c04ULL, 0x57d6bf0317edaa97ULL, + 0xd9f4fb7ae3911dfdULL, 0x9b041a914a7b2b6eULL, + 0x5c1538adb04570dbULL, 0x1ee5d94619af4648ULL, + 0x02a151b5f156289cULL, 0x4051b05e58bc1e0fULL, + 0x87409262a28245baULL, 0xc5b073890b687329ULL, + 0x4b9237f0ff14c443ULL, 0x0962d61b56fef2d0ULL, + 0xce73f427acc0a965ULL, 0x8c8315cc052a9ff6ULL, + 0x3a80143f5cf17f13ULL, 0x7870f5d4f51b4980ULL, + 0xbf61d7e80f251235ULL, 0xfd913603a6cf24a6ULL, + 0x73b3727a52b393ccULL, 0x31439391fb59a55fULL, + 0xf652b1ad0167feeaULL, 0xb4a25046a88dc879ULL, + 0xa8e6d8b54074a6adULL, 0xea16395ee99e903eULL, + 0x2d071b6213a0cb8bULL, 0x6ff7fa89ba4afd18ULL, + 0xe1d5bef04e364a72ULL, 0xa3255f1be7dc7ce1ULL, + 0x64347d271de22754ULL, 0x26c49cccb40811c7ULL, + 0x5cbd6cc0cc10fafcULL, 0x1e4d8d2b65facc6fULL, + 0xd95caf179fc497daULL, 0x9bac4efc362ea149ULL, + 0x158e0a85c2521623ULL, 0x577eeb6e6bb820b0ULL, + 0x906fc95291867b05ULL, 0xd29f28b9386c4d96ULL, + 0xcedba04ad0952342ULL, 0x8c2b41a1797f15d1ULL, + 0x4b3a639d83414e64ULL, 0x09ca82762aab78f7ULL, + 0x87e8c60fded7cf9dULL, 0xc51827e4773df90eULL, + 0x020905d88d03a2bbULL, 0x40f9e43324e99428ULL, + 0x2cffe7d5975e55e2ULL, 0x6e0f063e3eb46371ULL, + 0xa91e2402c48a38c4ULL, 0xebeec5e96d600e57ULL, + 0x65cc8190991cb93dULL, 0x273c607b30f68faeULL, + 0xe02d4247cac8d41bULL, 0xa2dda3ac6322e288ULL, + 0xbe992b5f8bdb8c5cULL, 0xfc69cab42231bacfULL, + 0x3b78e888d80fe17aULL, 0x7988096371e5d7e9ULL, + 0xf7aa4d1a85996083ULL, 0xb55aacf12c735610ULL, + 0x724b8ecdd64d0da5ULL, 0x30bb6f267fa73b36ULL, + 0x4ac29f2a07bfd00dULL, 0x08327ec1ae55e69eULL, + 0xcf235cfd546bbd2bULL, 0x8dd3bd16fd818bb8ULL, + 0x03f1f96f09fd3cd2ULL, 0x41011884a0170a41ULL, + 0x86103ab85a2951f4ULL, 0xc4e0db53f3c36767ULL, + 0xd8a453a01b3a09b3ULL, 0x9a54b24bb2d03f20ULL, + 0x5d45907748ee6495ULL, 0x1fb5719ce1045206ULL, + 0x919735e51578e56cULL, 0xd367d40ebc92d3ffULL, + 0x1476f63246ac884aULL, 0x568617d9ef46bed9ULL, + 0xe085162ab69d5e3cULL, 0xa275f7c11f7768afULL, + 0x6564d5fde549331aULL, 0x279434164ca30589ULL, + 0xa9b6706fb8dfb2e3ULL, 0xeb46918411358470ULL, + 0x2c57b3b8eb0bdfc5ULL, 0x6ea7525342e1e956ULL, + 0x72e3daa0aa188782ULL, 0x30133b4b03f2b111ULL, + 0xf7021977f9cceaa4ULL, 
0xb5f2f89c5026dc37ULL, + 0x3bd0bce5a45a6b5dULL, 0x79205d0e0db05dceULL, + 0xbe317f32f78e067bULL, 0xfcc19ed95e6430e8ULL, + 0x86b86ed5267cdbd3ULL, 0xc4488f3e8f96ed40ULL, + 0x0359ad0275a8b6f5ULL, 0x41a94ce9dc428066ULL, + 0xcf8b0890283e370cULL, 0x8d7be97b81d4019fULL, + 0x4a6acb477bea5a2aULL, 0x089a2aacd2006cb9ULL, + 0x14dea25f3af9026dULL, 0x562e43b4931334feULL, + 0x913f6188692d6f4bULL, 0xd3cf8063c0c759d8ULL, + 0x5dedc41a34bbeeb2ULL, 0x1f1d25f19d51d821ULL, + 0xd80c07cd676f8394ULL, 0x9afce626ce85b507ULL, +}; diff --git a/linux/crypto/api.c b/linux/crypto/api.c new file mode 100644 index 00000000..2f3ab2b5 --- /dev/null +++ b/linux/crypto/api.c @@ -0,0 +1,114 @@ +/* + * Cryptographic API for algorithms (i.e., low-level API). + * + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <linux/device.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include <crypto/algapi.h> + +static LIST_HEAD(crypto_alg_list); +static DECLARE_RWSEM(crypto_alg_sem); + +struct crypto_type { +}; + +int crypto_register_alg(struct crypto_alg *alg) +{ + down_write(&crypto_alg_sem); + list_add(&alg->cra_list, &crypto_alg_list); + up_write(&crypto_alg_sem); + + return 0; +} + +static void *crypto_alloc_tfm(const char *name, + const struct crypto_type *type) +{ + struct crypto_alg *alg; + + down_read(&crypto_alg_sem); + list_for_each_entry(alg, &crypto_alg_list, cra_list) + if (alg->cra_type == type && !strcmp(alg->cra_name, name)) + goto found; + + alg = ERR_PTR(-ENOENT); +found: + up_read(&crypto_alg_sem); + + if (IS_ERR(alg)) + return ERR_CAST(alg); + + return alg->alloc_tfm() ?: ERR_PTR(-ENOMEM); +} + +/* skcipher: */ + +static const struct crypto_type crypto_skcipher_type2 = { +}; + +struct crypto_skcipher *crypto_alloc_skcipher(const char *name, + u32 type, u32 mask) +{ + return crypto_alloc_tfm(name, &crypto_skcipher_type2); +} + +int crypto_register_skcipher(struct skcipher_alg *alg) +{ + alg->base.cra_type = &crypto_skcipher_type2; + + return crypto_register_alg(&alg->base); +} + +/* shash: */ + +#include <crypto/hash.h> + +static int shash_finup(struct shash_desc *desc, const u8 *data, + unsigned len, u8 *out) +{ + return crypto_shash_update(desc, data, len) ?: + crypto_shash_final(desc, out); +} + +static int shash_digest(struct shash_desc *desc, const u8 *data, + unsigned len, u8 *out) +{ + return crypto_shash_init(desc) ?: + crypto_shash_finup(desc, data, len, out); +} + +static const struct crypto_type crypto_shash_type = { +}; + +struct crypto_shash *crypto_alloc_shash(const char *name, + u32 type, u32 mask) +{ + return crypto_alloc_tfm(name, &crypto_shash_type); +} + +int crypto_register_shash(struct shash_alg *alg) +{ + alg->base.cra_type = &crypto_shash_type; + + if (!alg->finup) + alg->finup = shash_finup; + if (!alg->digest) + alg->digest = shash_digest; + + return crypto_register_alg(&alg->base); +} diff --git a/linux/crypto/chacha20_generic.c b/linux/crypto/chacha20_generic.c new file mode 100644 index 00000000..914189e7 --- /dev/null +++ b/linux/crypto/chacha20_generic.c @@ -0,0 +1,111 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539 + * + * Copyright (C) 2015 Martin Willi + * + 
* This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/byteorder.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/scatterlist.h> +#include <asm/unaligned.h> + +#include <linux/crypto.h> +#include <crypto/algapi.h> +#include <crypto/chacha.h> +#include <crypto/skcipher.h> + +#include <sodium/crypto_stream_chacha20.h> + +static struct skcipher_alg alg; + +struct chacha20_tfm { + struct crypto_skcipher tfm; + u32 key[8]; +}; + +static int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize) +{ + struct chacha20_tfm *ctx = + container_of(tfm, struct chacha20_tfm, tfm); + int i; + + if (keysize != CHACHA_KEY_SIZE) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(ctx->key); i++) + ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32)); + + return 0; +} + +static int crypto_chacha20_crypt(struct skcipher_request *req) +{ + struct chacha20_tfm *ctx = + container_of(req->tfm, struct chacha20_tfm, tfm.base); + struct scatterlist *sg = req->src; + unsigned nbytes = req->cryptlen; + u32 iv[4]; + int ret; + + BUG_ON(req->src != req->dst); + + memcpy(iv, req->iv, sizeof(iv)); + + while (1) { + ret = crypto_stream_chacha20_xor_ic(sg_virt(sg), + sg_virt(sg), + sg->length, + (void *) &iv[2], + iv[0] | ((u64) iv[1] << 32), + (void *) ctx->key); + BUG_ON(ret); + + nbytes -= sg->length; + + if (sg_is_last(sg)) + break; + + BUG_ON(sg->length % CHACHA_BLOCK_SIZE); + iv[0] += sg->length / CHACHA_BLOCK_SIZE; + sg = sg_next(sg); + }; + + BUG_ON(nbytes); + + return 0; +} + +static void *crypto_chacha20_alloc_tfm(void) +{ + struct chacha20_tfm *tfm = kzalloc(sizeof(*tfm), GFP_KERNEL); + + if (!tfm) + return NULL; + + tfm->tfm.base.alg = &alg.base; + tfm->tfm.setkey = crypto_chacha20_setkey; + tfm->tfm.encrypt = crypto_chacha20_crypt; + tfm->tfm.decrypt = crypto_chacha20_crypt; + tfm->tfm.ivsize = CHACHA_IV_SIZE; + tfm->tfm.keysize = CHACHA_KEY_SIZE; + + return tfm; +} + +static struct skcipher_alg alg = { + .base.cra_name = "chacha20", + .base.alloc_tfm = crypto_chacha20_alloc_tfm, +}; + +__attribute__((constructor(110))) +static int chacha20_generic_mod_init(void) +{ + return crypto_register_skcipher(&alg); +} diff --git a/linux/crypto/poly1305_generic.c b/linux/crypto/poly1305_generic.c new file mode 100644 index 00000000..acb554c0 --- /dev/null +++ b/linux/crypto/poly1305_generic.c @@ -0,0 +1,88 @@ +/* + * Poly1305 authenticator algorithm, RFC7539 + * + * Copyright (C) 2015 Martin Willi + * + * Based on public domain code by Andrew Moon and Daniel J. Bernstein. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/byteorder.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <asm/unaligned.h> + +#include <linux/crypto.h> +#include <crypto/algapi.h> +#include <crypto/hash.h> +#include <crypto/poly1305.h> + +static struct shash_alg poly1305_alg; + +struct poly1305_desc_ctx { + bool key_done; + crypto_onetimeauth_poly1305_state s; +}; + +static int poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *state = (void *) desc->ctx; + + state->key_done = false; + return 0; +} + +static int poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned len) +{ + struct poly1305_desc_ctx *state = (void *) desc->ctx; + + if (!state->key_done) { + BUG_ON(len != crypto_onetimeauth_poly1305_KEYBYTES); + + state->key_done = true; + return crypto_onetimeauth_poly1305_init(&state->s, src); + } + + return crypto_onetimeauth_poly1305_update(&state->s, src, len); +} + +static int poly1305_final(struct shash_desc *desc, u8 *out) +{ + struct poly1305_desc_ctx *state = (void *) desc->ctx; + + return crypto_onetimeauth_poly1305_final(&state->s, out); +} + +static void *poly1305_alloc_tfm(void) +{ + struct crypto_shash *tfm = kzalloc(sizeof(*tfm), GFP_KERNEL); + + if (!tfm) + return NULL; + + tfm->base.alg = &poly1305_alg.base; + tfm->descsize = sizeof(struct poly1305_desc_ctx); + return tfm; +} + +static struct shash_alg poly1305_alg = { + .digestsize = crypto_onetimeauth_poly1305_BYTES, + .init = poly1305_init, + .update = poly1305_update, + .final = poly1305_final, + .descsize = sizeof(struct poly1305_desc_ctx), + + .base.cra_name = "poly1305", + .base.alloc_tfm = poly1305_alloc_tfm, +}; + +__attribute__((constructor(110))) +static int poly1305_mod_init(void) +{ + return crypto_register_shash(&poly1305_alg); +} diff --git a/linux/crypto/sha256_generic.c b/linux/crypto/sha256_generic.c new file mode 100644 index 00000000..9326bfe7 --- /dev/null +++ b/linux/crypto/sha256_generic.c @@ -0,0 +1,81 @@ +/* + * Cryptographic API. + * + * SHA-256, as specified in + * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf + * + * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#include <linux/bitops.h> +#include <linux/byteorder.h> +#include <linux/types.h> +#include <asm/unaligned.h> + +#include <linux/crypto.h> +#include <crypto/hash.h> + +#include <sodium/crypto_hash_sha256.h> + +static struct shash_alg sha256_alg; + +static int sha256_init(struct shash_desc *desc) +{ + crypto_hash_sha256_state *state = (void *) desc->ctx; + + return crypto_hash_sha256_init(state); +} + +static int sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + crypto_hash_sha256_state *state = (void *) desc->ctx; + + return crypto_hash_sha256_update(state, data, len); +} + +static int sha256_final(struct shash_desc *desc, u8 *out) +{ + crypto_hash_sha256_state *state = (void *) desc->ctx; + + return crypto_hash_sha256_final(state, out); +} + +static void *sha256_alloc_tfm(void) +{ + struct crypto_shash *tfm = kzalloc(sizeof(*tfm), GFP_KERNEL); + + if (!tfm) + return NULL; + + tfm->base.alg = &sha256_alg.base; + tfm->descsize = sizeof(crypto_hash_sha256_state); + return tfm; +} + +static struct shash_alg sha256_alg = { + .digestsize = crypto_hash_sha256_BYTES, + .init = sha256_init, + .update = sha256_update, + .final = sha256_final, + .descsize = sizeof(crypto_hash_sha256_state), + .base.cra_name = "sha256", + .base.alloc_tfm = sha256_alloc_tfm, +}; + +__attribute__((constructor(110))) +static int __init sha256_generic_mod_init(void) +{ + return crypto_register_shash(&sha256_alg); +} diff --git a/linux/fs.c b/linux/fs.c new file mode 100644 index 00000000..623ca266 --- /dev/null +++ b/linux/fs.c @@ -0,0 +1,14 @@ +#include <linux/fs.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +const struct xattr_handler nop_posix_acl_access = { + .name = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = ACL_TYPE_ACCESS, +}; + +const struct xattr_handler nop_posix_acl_default = { + .name = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = ACL_TYPE_DEFAULT, +}; diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c new file mode 100644 index 00000000..41f1bcdc --- /dev/null +++ b/linux/generic-radix-tree.c @@ -0,0 +1,307 @@ + +#include <linux/atomic.h> +#include <linux/export.h> +#include <linux/generic-radix-tree.h> +#include <linux/gfp.h> +#include <linux/kmemleak.h> + +#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *)) +#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) + +struct genradix_node { + union { + /* Interior node: */ + struct genradix_node *children[GENRADIX_ARY]; + + /* Leaf: */ + u8 data[PAGE_SIZE]; + }; +}; + +static inline int genradix_depth_shift(unsigned depth) +{ + return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth; +} + +/* + * Returns size (of data, in bytes) that a tree of a given depth holds: + */ +static inline size_t genradix_depth_size(unsigned depth) +{ + return 1UL << genradix_depth_shift(depth); +} + +/* depth that's needed for a genradix that can address up to ULONG_MAX: */ +#define GENRADIX_MAX_DEPTH \ + DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT) + +#define GENRADIX_DEPTH_MASK \ + ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) + +static inline unsigned genradix_root_to_depth(struct genradix_root *r) +{ + return (unsigned long) r & GENRADIX_DEPTH_MASK; +} + +static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) +{ + return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); +} + +/* + * Returns pointer to the specified byte @offset within @radix, or NULL if not + * allocated + */ +void *__genradix_ptr(struct 
__genradix *radix, size_t offset) +{ + struct genradix_root *r = READ_ONCE(radix->root); + struct genradix_node *n = genradix_root_to_node(r); + unsigned level = genradix_root_to_depth(r); + + if (ilog2(offset) >= genradix_depth_shift(level)) + return NULL; + + while (1) { + if (!n) + return NULL; + if (!level) + break; + + level--; + + n = n->children[offset >> genradix_depth_shift(level)]; + offset &= genradix_depth_size(level) - 1; + } + + return &n->data[offset]; +} +EXPORT_SYMBOL(__genradix_ptr); + +static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) +{ + struct genradix_node *node; + + node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO); + + /* + * We're using pages (not slab allocations) directly for kernel data + * structures, so we need to explicitly inform kmemleak of them in order + * to avoid false positive memory leak reports. + */ + kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask); + return node; +} + +static inline void genradix_free_node(struct genradix_node *node) +{ + kmemleak_free(node); + free_page((unsigned long)node); +} + +/* + * Returns pointer to the specified byte @offset within @radix, allocating it if + * necessary - newly allocated slots are always zeroed out: + */ +void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, + gfp_t gfp_mask) +{ + struct genradix_root *v = READ_ONCE(radix->root); + struct genradix_node *n, *new_node = NULL; + unsigned level; + + /* Increase tree depth if necessary: */ + while (1) { + struct genradix_root *r = v, *new_root; + + n = genradix_root_to_node(r); + level = genradix_root_to_depth(r); + + if (n && ilog2(offset) < genradix_depth_shift(level)) + break; + + if (!new_node) { + new_node = genradix_alloc_node(gfp_mask); + if (!new_node) + return NULL; + } + + new_node->children[0] = n; + new_root = ((struct genradix_root *) + ((unsigned long) new_node | (n ? 
level + 1 : 0))); + + if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) { + v = new_root; + new_node = NULL; + } + } + + while (level--) { + struct genradix_node **p = + &n->children[offset >> genradix_depth_shift(level)]; + offset &= genradix_depth_size(level) - 1; + + n = READ_ONCE(*p); + if (!n) { + if (!new_node) { + new_node = genradix_alloc_node(gfp_mask); + if (!new_node) + return NULL; + } + + if (!(n = cmpxchg_release(p, NULL, new_node))) + swap(n, new_node); + } + } + + if (new_node) + genradix_free_node(new_node); + + return &n->data[offset]; +} +EXPORT_SYMBOL(__genradix_ptr_alloc); + +void *__genradix_iter_peek(struct genradix_iter *iter, + struct __genradix *radix, + size_t objs_per_page) +{ + struct genradix_root *r; + struct genradix_node *n; + unsigned level, i; + + if (iter->offset == SIZE_MAX) + return NULL; + +restart: + r = READ_ONCE(radix->root); + if (!r) + return NULL; + + n = genradix_root_to_node(r); + level = genradix_root_to_depth(r); + + if (ilog2(iter->offset) >= genradix_depth_shift(level)) + return NULL; + + while (level) { + level--; + + i = (iter->offset >> genradix_depth_shift(level)) & + (GENRADIX_ARY - 1); + + while (!n->children[i]) { + size_t objs_per_ptr = genradix_depth_size(level); + + if (iter->offset + objs_per_ptr < iter->offset) { + iter->offset = SIZE_MAX; + iter->pos = SIZE_MAX; + return NULL; + } + + i++; + iter->offset = round_down(iter->offset + objs_per_ptr, + objs_per_ptr); + iter->pos = (iter->offset >> PAGE_SHIFT) * + objs_per_page; + if (i == GENRADIX_ARY) + goto restart; + } + + n = n->children[i]; + } + + return &n->data[iter->offset & (PAGE_SIZE - 1)]; +} +EXPORT_SYMBOL(__genradix_iter_peek); + +void *__genradix_iter_peek_prev(struct genradix_iter *iter, + struct __genradix *radix, + size_t objs_per_page, + size_t obj_size_plus_page_remainder) +{ + struct genradix_root *r; + struct genradix_node *n; + unsigned level, i; + + if (iter->offset == SIZE_MAX) + return NULL; + +restart: + r = READ_ONCE(radix->root); + if (!r) + return NULL; + + n = genradix_root_to_node(r); + level = genradix_root_to_depth(r); + + if (ilog2(iter->offset) >= genradix_depth_shift(level)) { + iter->offset = genradix_depth_size(level); + iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + + iter->offset -= obj_size_plus_page_remainder; + iter->pos--; + } + + while (level) { + level--; + + i = (iter->offset >> genradix_depth_shift(level)) & + (GENRADIX_ARY - 1); + + while (!n->children[i]) { + size_t objs_per_ptr = genradix_depth_size(level); + + iter->offset = round_down(iter->offset, objs_per_ptr); + iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + + if (!iter->offset) + return NULL; + + iter->offset -= obj_size_plus_page_remainder; + iter->pos--; + + if (!i) + goto restart; + --i; + } + + n = n->children[i]; + } + + return &n->data[iter->offset & (PAGE_SIZE - 1)]; +} +EXPORT_SYMBOL(__genradix_iter_peek_prev); + +static void genradix_free_recurse(struct genradix_node *n, unsigned level) +{ + if (level) { + unsigned i; + + for (i = 0; i < GENRADIX_ARY; i++) + if (n->children[i]) + genradix_free_recurse(n->children[i], level - 1); + } + + genradix_free_node(n); +} + +int __genradix_prealloc(struct __genradix *radix, size_t size, + gfp_t gfp_mask) +{ + size_t offset; + + for (offset = 0; offset < size; offset += PAGE_SIZE) + if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(__genradix_prealloc); + +void __genradix_free(struct __genradix *radix) +{ + struct genradix_root *r = 
xchg(&radix->root, NULL); + + genradix_free_recurse(genradix_root_to_node(r), + genradix_root_to_depth(r)); +} +EXPORT_SYMBOL(__genradix_free); diff --git a/linux/int_sqrt.c b/linux/int_sqrt.c new file mode 100644 index 00000000..a8170bb9 --- /dev/null +++ b/linux/int_sqrt.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com> + * + * Based on the shift-and-subtract algorithm for computing integer + * square root from Guy L. Steele. + */ + +#include <linux/export.h> +#include <linux/bitops.h> +#include <linux/limits.h> +#include <linux/math.h> + +/** + * int_sqrt - computes the integer square root + * @x: integer of which to calculate the sqrt + * + * Computes: floor(sqrt(x)) + */ +unsigned long int_sqrt(unsigned long x) +{ + unsigned long b, m, y = 0; + + if (x <= 1) + return x; + + m = 1UL << (__fls(x) & ~1UL); + while (m != 0) { + b = y + m; + y >>= 1; + + if (x >= b) { + x -= b; + y += m; + } + m >>= 2; + } + + return y; +} +EXPORT_SYMBOL(int_sqrt); + +#if BITS_PER_LONG < 64 +/** + * int_sqrt64 - strongly typed int_sqrt function when minimum 64 bit input + * is expected. + * @x: 64bit integer of which to calculate the sqrt + */ +u32 int_sqrt64(u64 x) +{ + u64 b, m, y = 0; + + if (x <= ULONG_MAX) + return int_sqrt((unsigned long) x); + + m = 1ULL << ((fls64(x) - 1) & ~1ULL); + while (m != 0) { + b = y + m; + y >>= 1; + + if (x >= b) { + x -= b; + y += m; + } + m >>= 2; + } + + return y; +} +EXPORT_SYMBOL(int_sqrt64); +#endif diff --git a/linux/kstrtox.c b/linux/kstrtox.c new file mode 100644 index 00000000..bde55808 --- /dev/null +++ b/linux/kstrtox.c @@ -0,0 +1,357 @@ +/* + * Convert integer string representation to an integer. + * If an integer doesn't fit into specified type, -E is returned. + * + * Integer starts with optional sign. + * kstrtou*() functions do not accept sign "-". + * + * Radix 0 means autodetection: leading "0x" implies radix 16, + * leading "0" implies radix 8, otherwise radix is 10. + * Autodetection hints work after optional sign, but not before. + * + * If -E is returned, result is not touched. + */ +#include <errno.h> +#include <ctype.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include "kstrtox.h" + +#define KSTRTOX_OVERFLOW (1U << 31) + +const char *_parse_integer_fixup_radix(const char *s, unsigned int *base) +{ + if (*base == 0) { + if (s[0] == '0') { + if (_tolower(s[1]) == 'x' && isxdigit(s[2])) + *base = 16; + else + *base = 8; + } else + *base = 10; + } + if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x') + s += 2; + return s; +} + +/* + * Convert non-negative integer string representation in explicitly given radix + * to an integer. + * Return number of characters consumed maybe or-ed with overflow bit. + * If overflow occurs, result integer (incorrect) is still returned. + * + * Don't you dare use this function. 
+ */ +unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p) +{ + unsigned long long res; + unsigned int rv; + int overflow; + + res = 0; + rv = 0; + overflow = 0; + while (*s) { + unsigned int val; + + if ('0' <= *s && *s <= '9') + val = *s - '0'; + else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f') + val = _tolower(*s) - 'a' + 10; + else + break; + + if (val >= base) + break; + /* + * Check for overflow only if we are within range of + * it in the max base we support (16) + */ + if (unlikely(res & (~0ull << 60))) { + if (res > ULLONG_MAX - val / base) + overflow = 1; + } + res = res * base + val; + rv++; + s++; + } + *p = res; + if (overflow) + rv |= KSTRTOX_OVERFLOW; + return rv; +} + +static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res) +{ + unsigned long long _res; + unsigned int rv; + + s = _parse_integer_fixup_radix(s, &base); + rv = _parse_integer(s, base, &_res); + if (rv & KSTRTOX_OVERFLOW) + return -ERANGE; + if (rv == 0) + return -EINVAL; + s += rv; + if (*s == '\n') + s++; + if (*s) + return -EINVAL; + *res = _res; + return 0; +} + +/** + * kstrtoull - convert a string to an unsigned long long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ +int kstrtoull(const char *s, unsigned int base, unsigned long long *res) +{ + if (s[0] == '+') + s++; + return _kstrtoull(s, base, res); +} + +/** + * kstrtoll - convert a string to a long long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign or a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ +int kstrtoll(const char *s, unsigned int base, long long *res) +{ + unsigned long long tmp; + int rv; + + if (s[0] == '-') { + rv = _kstrtoull(s + 1, base, &tmp); + if (rv < 0) + return rv; + if ((long long)-tmp > 0) + return -ERANGE; + *res = -tmp; + } else { + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if ((long long)tmp < 0) + return -ERANGE; + *res = tmp; + } + return 0; +} + +/* Internal, do not use. 
*/ +int _kstrtoul(const char *s, unsigned int base, unsigned long *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long long)(unsigned long)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + +/* Internal, do not use. */ +int _kstrtol(const char *s, unsigned int base, long *res) +{ + long long tmp; + int rv; + + rv = kstrtoll(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (long long)(long)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + +/** + * kstrtouint - convert a string to an unsigned int + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ +int kstrtouint(const char *s, unsigned int base, unsigned int *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long long)(unsigned int)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + +/** + * kstrtoint - convert a string to an int + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign or a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. 
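An illustrative sketch (not from the patch) of typical kstrtoint() usage as documented above, including the trailing-newline tolerance and the two error cases. The sample inputs and the parse_count() wrapper are hypothetical.

    static int parse_count(const char *buf, int *count)
    {
            int ret = kstrtoint(buf, 10, count);   /* "123\n" -> 123, ret == 0 */

            if (ret == -ERANGE)     /* e.g. "99999999999": fits in long long, not int */
                    return ret;
            if (ret == -EINVAL)     /* e.g. "12x3": trailing garbage */
                    return ret;
            return 0;
    }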
+ */ +int kstrtoint(const char *s, unsigned int base, int *res) +{ + long long tmp; + int rv; + + rv = kstrtoll(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (long long)(int)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + +int kstrtou16(const char *s, unsigned int base, u16 *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long long)(u16)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + +int kstrtos16(const char *s, unsigned int base, s16 *res) +{ + long long tmp; + int rv; + + rv = kstrtoll(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (long long)(s16)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + +int kstrtou8(const char *s, unsigned int base, u8 *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long long)(u8)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + +int kstrtos8(const char *s, unsigned int base, s8 *res) +{ + long long tmp; + int rv; + + rv = kstrtoll(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (long long)(s8)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + +/** + * kstrtobool - convert common user inputs into boolean values + * @s: input string + * @res: result + * + * This routine returns 0 iff the first character is one of 'Yy1Nn0', or + * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value + * pointed to by res is updated upon finding a match. + */ +int kstrtobool(const char *s, bool *res) +{ + if (!s) + return -EINVAL; + + switch (s[0]) { + case 'y': + case 'Y': + case '1': + *res = true; + return 0; + case 'n': + case 'N': + case '0': + *res = false; + return 0; + case 'o': + case 'O': + switch (s[1]) { + case 'n': + case 'N': + *res = true; + return 0; + case 'f': + case 'F': + *res = false; + return 0; + default: + break; + } + default: + break; + } + + return -EINVAL; +} diff --git a/linux/kstrtox.h b/linux/kstrtox.h new file mode 100644 index 00000000..910b6de8 --- /dev/null +++ b/linux/kstrtox.h @@ -0,0 +1,7 @@ +#ifndef _LIB_KSTRTOX_H +#define _LIB_KSTRTOX_H + +const char *_parse_integer_fixup_radix(const char *s, unsigned int *base); +unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *res); + +#endif diff --git a/linux/kthread.c b/linux/kthread.c new file mode 100644 index 00000000..17830e5f --- /dev/null +++ b/linux/kthread.c @@ -0,0 +1,139 @@ +#include <pthread.h> +#include <stdlib.h> +#include <string.h> + +#include <linux/bitops.h> +#include <linux/kthread.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> + +#include "tools-util.h" + +enum KTHREAD_BITS { + KTHREAD_IS_PER_CPU = 0, + KTHREAD_SHOULD_STOP, + KTHREAD_SHOULD_PARK, + KTHREAD_IS_PARKED, +}; + +static void *kthread_start_fn(void *data) +{ + rcu_register_thread(); + + current = data; + schedule(); + current->thread_fn(current->thread_data); + + complete(¤t->exited); + put_task_struct(current); + rcu_unregister_thread(); + return NULL; +} + +/** + * kthread_create_on_node - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @node: task and thread structures for the thread are allocated on this node + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(). 
The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. + * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn() can either call do_exit() directly if it is a + * standalone thread for which no one will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). + */ +struct task_struct *kthread_create(int (*thread_fn)(void *data), + void *thread_data, + const char namefmt[], ...) +{ + va_list args; + struct task_struct *p = malloc(sizeof(*p)); + int ret; + + memset(p, 0, sizeof(*p)); + + va_start(args, namefmt); + vsnprintf(p->comm, sizeof(p->comm), namefmt, args); + va_end(args); + + p->flags |= PF_KTHREAD; + p->thread_fn = thread_fn; + p->thread_data = thread_data; + p->state = TASK_UNINTERRUPTIBLE; + p->signal = &p->_signal; + atomic_set(&p->usage, 1); + init_completion(&p->exited); + init_rwsem(&p->_signal.exec_update_lock); + + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 32 << 10); + + for (unsigned i = 0; i < 10; i++) { + ret = pthread_create(&p->thread, &attr, kthread_start_fn, p); + if (!ret) + break; + + run_shrinkers(GFP_KERNEL, true); + } + if (ret) + return ERR_PTR(-ret); + pthread_setname_np(p->thread, p->comm); + return p; +} + +/** + * kthread_should_stop - should this kthread return now? + * + * When someone calls kthread_stop() on your kthread, it will be woken + * and this will return true. You should then return, and your return + * value will be passed through to kthread_stop(). + */ +bool kthread_should_stop(void) +{ + return test_bit(KTHREAD_SHOULD_STOP, ¤t->kthread_flags); +} + +bool kthread_freezable_should_stop(bool *was_frozen) +{ + return test_bit(KTHREAD_SHOULD_STOP, ¤t->kthread_flags); +} + +/** + * kthread_stop - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_stop() for @k to return true, wakes it, and + * waits for it to exit. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will exit without + * calling threadfn(). + * + * If threadfn() may call do_exit() itself, the caller must ensure + * task_struct can't go away. + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ +int kthread_stop(struct task_struct *p) +{ + get_task_struct(p); + + set_bit(KTHREAD_SHOULD_STOP, &p->kthread_flags); + wake_up_process(p); + wait_for_completion(&p->exited); + + put_task_struct(p); + + return 0; +} diff --git a/linux/llist.c b/linux/llist.c new file mode 100644 index 00000000..611ce488 --- /dev/null +++ b/linux/llist.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Lock-less NULL terminated single linked list + * + * The basic atomic operation of this list is cmpxchg on long. On + * architectures that don't have NMI-safe cmpxchg implementation, the + * list can NOT be used in NMI handlers. So code that uses the list in + * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. + * + * Copyright 2010,2011 Intel Corp. 
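A minimal lifecycle sketch (not part of the patch) for the kthread helpers above: create a stopped thread, start it with wake_up_process(), and tear it down with kthread_stop(). It assumes the set_current_state()/schedule() shims referenced elsewhere in this tree; names are hypothetical.

    static int demo_thread_fn(void *arg)
    {
            while (!kthread_should_stop()) {
                    /* ... do one unit of background work ... */

                    set_current_state(TASK_INTERRUPTIBLE);
                    if (!kthread_should_stop())
                            schedule();             /* sleep until woken, e.g. by kthread_stop() */
                    __set_current_state(TASK_RUNNING);
            }
            return 0;
    }

    static int demo_start_stop(void)
    {
            struct task_struct *t = kthread_create(demo_thread_fn, NULL, "demo-%d", 0);

            if (IS_ERR(t))
                    return PTR_ERR(t);

            wake_up_process(t);     /* thread begins running demo_thread_fn() */
            /* ... later ... */
            return kthread_stop(t); /* sets should-stop, wakes the thread, waits for exit */
    }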
+ * Author: Huang Ying <ying.huang@intel.com> + */ +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/llist.h> + + +/** + * llist_add_batch - add several linked entries in batch + * @new_first: first entry in batch to be added + * @new_last: last entry in batch to be added + * @head: the head for your lock-less list + * + * Return whether list is empty before adding. + */ +bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, + struct llist_head *head) +{ + struct llist_node *first; + + do { + new_last->next = first = READ_ONCE(head->first); + } while (cmpxchg(&head->first, first, new_first) != first); + + return !first; +} +EXPORT_SYMBOL_GPL(llist_add_batch); + +/** + * llist_del_first - delete the first entry of lock-less list + * @head: the head for your lock-less list + * + * If list is empty, return NULL, otherwise, return the first entry + * deleted, this is the newest added one. + * + * Only one llist_del_first user can be used simultaneously with + * multiple llist_add users without lock. Because otherwise + * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, + * llist_add) sequence in another user may change @head->first->next, + * but keep @head->first. If multiple consumers are needed, please + * use llist_del_all or use lock between consumers. + */ +struct llist_node *llist_del_first(struct llist_head *head) +{ + struct llist_node *entry, *old_entry, *next; + + entry = smp_load_acquire(&head->first); + for (;;) { + if (entry == NULL) + return NULL; + old_entry = entry; + next = READ_ONCE(entry->next); + entry = cmpxchg(&head->first, old_entry, next); + if (entry == old_entry) + break; + } + + return entry; +} +EXPORT_SYMBOL_GPL(llist_del_first); + +/** + * llist_reverse_order - reverse order of a llist chain + * @head: first item of the list to be reversed + * + * Reverse the order of a chain of llist entries and return the + * new first entry. + */ +struct llist_node *llist_reverse_order(struct llist_node *head) +{ + struct llist_node *new_head = NULL; + + while (head) { + struct llist_node *tmp = head; + head = head->next; + tmp->next = new_head; + new_head = tmp; + } + + return new_head; +} +EXPORT_SYMBOL_GPL(llist_reverse_order); diff --git a/linux/mempool.c b/linux/mempool.c new file mode 100644 index 00000000..74d4fbb3 --- /dev/null +++ b/linux/mempool.c @@ -0,0 +1,541 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/mempool.c + * + * memory buffer pool support. Such pools are mostly used + * for guaranteed, deadlock-free memory allocations during + * extreme VM load. + * + * started by Ingo Molnar, Copyright (C) 2001 + * debugging by David Rientjes, Copyright (C) 2015 + */ + +#include <linux/slab.h> +//#include <linux/kasan.h> +//#include <linux/kmemleak.h> +#include <linux/export.h> +#include <linux/jiffies.h> +#include <linux/mempool.h> +#include <linux/mempool.h> +#include <linux/sched.h> + +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +static void poison_error(mempool_t *pool, void *element, size_t size, + size_t byte) +{ + const int nr = pool->curr_nr; + const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); + const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); + int i; + + pr_err("BUG: mempool element poison mismatch\n"); + pr_err("Mempool %p size %zu\n", pool, size); + pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); + for (i = start; i < end; i++) + pr_cont("%x ", *(u8 *)(element + i)); + pr_cont("%s\n", end < size ? "..." 
: ""); + dump_stack(); +} + +static void __check_element(mempool_t *pool, void *element, size_t size) +{ + u8 *obj = element; + size_t i; + + for (i = 0; i < size; i++) { + u8 exp = (i < size - 1) ? POISON_FREE : POISON_END; + + if (obj[i] != exp) { + poison_error(pool, element, size, i); + return; + } + } + memset(obj, POISON_INUSE, size); +} + +static void check_element(mempool_t *pool, void *element) +{ + /* Mempools backed by slab allocator */ + if (pool->free == mempool_free_slab || pool->free == mempool_kfree) { + __check_element(pool, element, ksize(element)); + } else if (pool->free == mempool_free_pages) { + /* Mempools backed by page allocator */ + int order = (int)(long)pool->pool_data; + void *addr = kmap_atomic((struct page *)element); + + __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); + kunmap_atomic(addr); + } +} + +static void __poison_element(void *element, size_t size) +{ + u8 *obj = element; + + memset(obj, POISON_FREE, size - 1); + obj[size - 1] = POISON_END; +} + +static void poison_element(mempool_t *pool, void *element) +{ + /* Mempools backed by slab allocator */ + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) { + __poison_element(element, ksize(element)); + } else if (pool->alloc == mempool_alloc_pages) { + /* Mempools backed by page allocator */ + int order = (int)(long)pool->pool_data; + void *addr = kmap_atomic((struct page *)element); + + __poison_element(addr, 1UL << (PAGE_SHIFT + order)); + kunmap_atomic(addr); + } +} +#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +static inline void check_element(mempool_t *pool, void *element) +{ +} +static inline void poison_element(mempool_t *pool, void *element) +{ +} +#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ + +static __always_inline void kasan_poison_element(mempool_t *pool, void *element) +{ +#if 0 + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + kasan_poison_kfree(element, _RET_IP_); + else if (pool->alloc == mempool_alloc_pages) + kasan_free_pages(element, (unsigned long)pool->pool_data); +#endif +} + +static void kasan_unpoison_element(mempool_t *pool, void *element) +{ +#if 0 + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + kasan_unpoison_slab(element); + else if (pool->alloc == mempool_alloc_pages) + kasan_alloc_pages(element, (unsigned long)pool->pool_data); +#endif +} + +static __always_inline void add_element(mempool_t *pool, void *element) +{ + BUG_ON(pool->curr_nr >= pool->min_nr); + poison_element(pool, element); + kasan_poison_element(pool, element); + pool->elements[pool->curr_nr++] = element; +} + +static void *remove_element(mempool_t *pool) +{ + void *element = pool->elements[--pool->curr_nr]; + + BUG_ON(pool->curr_nr < 0); + kasan_unpoison_element(pool, element); + check_element(pool, element); + return element; +} + +/** + * mempool_exit - exit a mempool initialized with mempool_init() + * @pool: pointer to the memory pool which was initialized with + * mempool_init(). + * + * Free all reserved elements in @pool and @pool itself. This function + * only sleeps if the free_fn() function sleeps. + * + * May be called on a zeroed but uninitialized mempool (i.e. allocated with + * kzalloc()). 
+ */ +void mempool_exit(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + pool->elements = NULL; +} +EXPORT_SYMBOL(mempool_exit); + +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * Free all reserved elements in @pool and @pool itself. This function + * only sleeps if the free_fn() function sleeps. + */ +void mempool_destroy(mempool_t *pool) +{ + if (unlikely(!pool)) + return; + + mempool_exit(pool); + kfree(pool); +} +EXPORT_SYMBOL(mempool_destroy); + +int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) +{ + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + pool->alloc = alloc_fn; + pool->free = free_fn; + init_waitqueue_head(&pool->wait); + + pool->elements = kmalloc_array(min_nr, sizeof(void *), gfp_mask); + if (!pool->elements) + return -ENOMEM; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(gfp_mask, pool->pool_data); + if (unlikely(!element)) { + mempool_exit(pool); + return -ENOMEM; + } + add_element(pool, element); + } + + return 0; +} +EXPORT_SYMBOL(mempool_init_node); + +/** + * mempool_init - initialize a memory pool + * @pool: pointer to the memory pool that should be initialized + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * Like mempool_create(), but initializes the pool in (i.e. embedded in another + * structure). + * + * Return: %0 on success, negative error code otherwise. + */ +int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + return mempool_init_node(pool, min_nr, alloc_fn, free_fn, + pool_data, GFP_KERNEL, 0); + +} +EXPORT_SYMBOL(mempool_init); + +/** + * mempool_create - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * this function creates and allocates a guaranteed size, preallocated + * memory pool. The pool can be used from the mempool_alloc() and mempool_free() + * functions. This function might sleep. Both the alloc_fn() and the free_fn() + * functions might sleep - as long as the mempool_alloc() function is not called + * from IRQ contexts. + * + * Return: pointer to the created memory pool object or %NULL on error. 
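As an illustrative sketch (not part of the patch), here is how mempool_create() is typically paired with the mempool_kmalloc()/mempool_kfree() helpers defined at the end of this file; the element size and pool name are hypothetical.

    static mempool_t *demo_pool;

    static int demo_pool_setup(void)
    {
            /* guarantee at least 4 preallocated 256-byte buffers */
            demo_pool = mempool_create(4, mempool_kmalloc, mempool_kfree,
                                       (void *)(size_t)256);
            return demo_pool ? 0 : -ENOMEM;
    }

    static void demo_pool_teardown(void)
    {
            mempool_destroy(demo_pool);     /* frees the reserved elements and the pool */
    }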
+ */ +mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, + GFP_KERNEL, 0); +} +EXPORT_SYMBOL(mempool_create); + +mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) +{ + mempool_t *pool; + + pool = kzalloc(sizeof(*pool), gfp_mask); + if (!pool) + return NULL; + + if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, + gfp_mask, node_id)) { + kfree(pool); + return NULL; + } + + return pool; +} +EXPORT_SYMBOL(mempool_create_node); + +/** + * mempool_resize - resize an existing memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @new_min_nr: the new minimum number of elements guaranteed to be + * allocated for this pool. + * + * This function shrinks/grows the pool. In the case of growing, + * it cannot be guaranteed that the pool will be grown to the new + * size immediately, but new mempool_free() calls will refill it. + * This function may sleep. + * + * Note, the caller must guarantee that no mempool_destroy is called + * while this function is running. mempool_alloc() & mempool_free() + * might be called (eg. from IRQ contexts) while this function executes. + * + * Return: %0 on success, negative error code otherwise. + */ +int mempool_resize(mempool_t *pool, int new_min_nr) +{ + void *element; + void **new_elements; + unsigned long flags; + + BUG_ON(new_min_nr <= 0); + might_sleep(); + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr <= pool->min_nr) { + while (new_min_nr < pool->curr_nr) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); + spin_lock_irqsave(&pool->lock, flags); + } + pool->min_nr = new_min_nr; + goto out_unlock; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* Grow the pool */ + new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), + GFP_KERNEL); + if (!new_elements) + return -ENOMEM; + + spin_lock_irqsave(&pool->lock, flags); + if (unlikely(new_min_nr <= pool->min_nr)) { + /* Raced, other resize will do our work */ + spin_unlock_irqrestore(&pool->lock, flags); + kfree(new_elements); + goto out; + } + memcpy(new_elements, pool->elements, + pool->curr_nr * sizeof(*new_elements)); + kfree(pool->elements); + pool->elements = new_elements; + pool->min_nr = new_min_nr; + + while (pool->curr_nr < pool->min_nr) { + spin_unlock_irqrestore(&pool->lock, flags); + element = pool->alloc(GFP_KERNEL, pool->pool_data); + if (!element) + goto out; + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + } else { + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); /* Raced */ + goto out; + } + } +out_unlock: + spin_unlock_irqrestore(&pool->lock, flags); +out: + return 0; +} +EXPORT_SYMBOL(mempool_resize); + +/** + * mempool_alloc - allocate an element from a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @gfp_mask: the usual allocation bitmask. + * + * this function only sleeps if the alloc_fn() function sleeps or + * returns NULL. Note that due to preallocation, this function + * *never* fails when called from process contexts. (it might + * fail if called from an IRQ context.) + * Note: using __GFP_ZERO is not supported. 
+ * + * Return: pointer to the allocated element or %NULL on error. + */ +void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) +{ + void *element; + unsigned long flags; + DEFINE_WAIT(wait); + gfp_t gfp_temp; + + WARN_ON_ONCE(gfp_mask & __GFP_ZERO); + + gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ + gfp_mask |= __GFP_NOWARN; /* failures are OK */ + + gfp_temp = gfp_mask & ~(__GFP_IO); + +repeat_alloc: + + element = pool->alloc(gfp_temp, pool->pool_data); + if (likely(element != NULL)) + return element; + + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr)) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + /* paired with rmb in mempool_free(), read comment there */ + smp_wmb(); + return element; + } + + /* + * We use gfp mask w/o direct reclaim or IO for the first round. If + * alloc failed with that and @pool was empty, retry immediately. + */ + if (gfp_temp != gfp_mask) { + spin_unlock_irqrestore(&pool->lock, flags); + gfp_temp = gfp_mask; + goto repeat_alloc; + } + + /* Let's wait for someone else to return an element to @pool */ + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); + + spin_unlock_irqrestore(&pool->lock, flags); + + /* + * FIXME: this should be io_schedule(). The timeout is there as a + * workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5*HZ); + + finish_wait(&pool->wait, &wait); + goto repeat_alloc; +} +EXPORT_SYMBOL(mempool_alloc); + +/** + * mempool_free - return an element to the pool. + * @element: pool element pointer. + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. + */ +void mempool_free(void *element, mempool_t *pool) +{ + unsigned long flags; + + if (unlikely(element == NULL)) + return; + + /* + * Paired with the wmb in mempool_alloc(). The preceding read is + * for @element and the following @pool->curr_nr. This ensures + * that the visible value of @pool->curr_nr is from after the + * allocation of @element. This is necessary for fringe cases + * where @element was passed to this task without going through + * barriers. + * + * For example, assume @p is %NULL at the beginning and one task + * performs "p = mempool_alloc(...);" while another task is doing + * "while (!p) cpu_relax(); mempool_free(p, ...);". This function + * may end up using curr_nr value which is from before allocation + * of @p without the following rmb. + */ + smp_rmb(); + + /* + * For correctness, we need a test which is guaranteed to trigger + * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr + * without locking achieves that and refilling as soon as possible + * is desirable. + * + * Because curr_nr visible here is always a value after the + * allocation of @element, any task which decremented curr_nr below + * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets + * incremented to min_nr afterwards. If curr_nr gets incremented + * to min_nr after the allocation of @element, the elements + * allocated after that are subject to the same guarantee. + * + * Waiters happen iff curr_nr is 0 and the above guarantee also + * ensures that there will be frees which return elements to the + * pool waking up the waiters. 
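A small sketch (not from the patch) of the alloc/free pairing the comments above describe, using the hypothetical pool from the earlier sketch:

    static void demo_pool_roundtrip(mempool_t *pool)
    {
            void *buf = mempool_alloc(pool, GFP_KERNEL);

            /*
             * With a sleepable mask like GFP_KERNEL this does not return NULL:
             * it falls back to the preallocated reserve and, if that is empty,
             * waits for another element to be freed. __GFP_ZERO is not
             * supported, so clear the buffer by hand if needed.
             */

            /* ... use buf ... */

            mempool_free(buf, pool);        /* refills the reserve, wakes any waiter */
    }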
+ */ + if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr < pool->min_nr)) { + add_element(pool, element); + spin_unlock_irqrestore(&pool->lock, flags); + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); +} +EXPORT_SYMBOL(mempool_free); + +/* + * A commonly used alloc and free fn. + */ +void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) +{ + struct kmem_cache *mem = pool_data; + return kmem_cache_alloc(mem, gfp_mask); +} +EXPORT_SYMBOL(mempool_alloc_slab); + +void mempool_free_slab(void *element, void *pool_data) +{ + struct kmem_cache *mem = pool_data; + kmem_cache_free(mem, element); +} +EXPORT_SYMBOL(mempool_free_slab); + +/* + * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory + * specified by pool_data + */ +void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t)pool_data; + return kmalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kmalloc); + +void mempool_kfree(void *element, void *pool_data) +{ + kfree(element); +} +EXPORT_SYMBOL(mempool_kfree); + +/* + * A simple mempool-backed page allocator that allocates pages + * of the order specified by pool_data. + */ +void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) +{ + int order = (int)(long)pool_data; + return alloc_pages(gfp_mask, order); +} +EXPORT_SYMBOL(mempool_alloc_pages); + +void mempool_free_pages(void *element, void *pool_data) +{ + int order = (int)(long)pool_data; + __free_pages(element, order); +} +EXPORT_SYMBOL(mempool_free_pages); diff --git a/linux/preempt.c b/linux/preempt.c new file mode 100644 index 00000000..72eceed3 --- /dev/null +++ b/linux/preempt.c @@ -0,0 +1,37 @@ +#include <pthread.h> + +#include "linux/preempt.h" + +/* + * In userspace, pthreads are preemptible and can migrate CPUs at any time. + * + * In the kernel, preempt_disable() logic essentially guarantees that a marked + * critical section owns its CPU for the relevant block. This is necessary for + * various code paths, critically including the percpu system as it allows for + * non-atomic reads and writes to CPU-local data structures. + * + * The high performance userspace equivalent would be to use thread local + * storage to replace percpu data, but that would be complicated. It should be + * correct to instead guarantee mutual exclusion for the critical sections. + */ + +static pthread_mutex_t preempt_lock; + +__attribute__((constructor)) +static void preempt_init(void) { + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&preempt_lock, &attr); + pthread_mutexattr_destroy(&attr); +} + +void preempt_disable(void) +{ + pthread_mutex_lock(&preempt_lock); +} + +void preempt_enable(void) +{ + pthread_mutex_unlock(&preempt_lock); +} diff --git a/linux/ratelimit.c b/linux/ratelimit.c new file mode 100644 index 00000000..21a6d6c8 --- /dev/null +++ b/linux/ratelimit.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ratelimit.c - Do something with rate limit. + * + * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com> + * + * 2008-05-01 rewrite the function and use a ratelimit_state data struct as + * parameter. Now every user can use their own standalone ratelimit_state. 
+ */ + +#include <linux/ratelimit.h> +#include <linux/jiffies.h> +#include <linux/export.h> + +/* + * __ratelimit - rate limiting + * @rs: ratelimit_state data + * @func: name of calling function + * + * This enforces a rate limit: not more than @rs->burst callbacks + * in every @rs->interval + * + * RETURNS: + * 0 means callbacks will be suppressed. + * 1 means go ahead and do it. + */ +int ___ratelimit(struct ratelimit_state *rs, const char *func) +{ + int ret; + + if (!rs->interval) + return 1; + + /* + * If we contend on this state's lock then almost + * by definition we are too busy to print a message, + * in addition to the one that will be printed by + * the entity that is holding the lock already: + */ + if (!raw_spin_trylock(&rs->lock)) + return 0; + + if (!rs->begin) + rs->begin = jiffies; + + if (time_is_before_jiffies(rs->begin + rs->interval)) { + if (rs->missed) { + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { + printk(KERN_WARNING + "%s: %d callbacks suppressed\n", + func, rs->missed); + rs->missed = 0; + } + } + rs->begin = jiffies; + rs->printed = 0; + } + if (rs->burst && rs->burst > rs->printed) { + rs->printed++; + ret = 1; + } else { + rs->missed++; + ret = 0; + } + raw_spin_unlock(&rs->lock); + + return ret; +} +EXPORT_SYMBOL(___ratelimit); diff --git a/linux/rhashtable.c b/linux/rhashtable.c new file mode 100644 index 00000000..ba2196fc --- /dev/null +++ b/linux/rhashtable.c @@ -0,0 +1,1237 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resizable, Scalable, Concurrent Hash Table + * + * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> + * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> + * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> + * + * Code partially derived from nft_hash + * Rewritten with rehash code from br_multicast plus single list + * pointer as suggested by Josh Triplett + */ + +#include <linux/atomic.h> +#include <linux/kernel.h> +#include <linux/log2.h> +#include <linux/sched.h> +#include <linux/rculist.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/jhash.h> +#include <linux/overflow.h> +#include <linux/random.h> +#include <linux/rhashtable.h> +#include <linux/err.h> +#include <linux/export.h> + +#define HASH_DEFAULT_SIZE 64UL +#define HASH_MIN_SIZE 4U + +union nested_table { + union nested_table __rcu *table; + struct rhash_lock_head __rcu *bucket; +}; + +static u32 head_hashfn(struct rhashtable *ht, + const struct bucket_table *tbl, + const struct rhash_head *he) +{ + return rht_head_hashfn(ht, tbl, he, ht->p); +} + +#ifdef CONFIG_PROVE_LOCKING +#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) + +int lockdep_rht_mutex_is_held(struct rhashtable *ht) +{ + return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); + +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) +{ + if (!debug_locks) + return 1; + if (unlikely(tbl->nest)) + return 1; + return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); +} +EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); +#else +#define ASSERT_RHT_MUTEX(HT) +#endif + +static inline union nested_table *nested_table_top( + const struct bucket_table *tbl) +{ + /* The top-level bucket entry does not need RCU protection + * because it's set at the same time as tbl->nest. 
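For illustration only (not part of the patch): limiting a noisy warning to @burst messages per @interval as described above. This assumes the DEFINE_RATELIMIT_STATE() initializer from the ratelimit.h shim; the message and numbers are hypothetical.

    static DEFINE_RATELIMIT_STATE(demo_rs, 5 * HZ, 10);    /* at most 10 messages per 5s */

    static void demo_warn(int err)
    {
            if (___ratelimit(&demo_rs, __func__))   /* 1: go ahead, 0: suppressed */
                    printk(KERN_WARNING "demo: I/O error %d\n", err);
    }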
+ */ + return (void *)rcu_dereference_protected(tbl->buckets[0], 1); +} + +static void nested_table_free(union nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_protected(ntbl->table, 1); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + } + + kfree(ntbl); +} + +static void nested_bucket_table_free(const struct bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union nested_table *ntbl; + unsigned int i; + + ntbl = nested_table_top(tbl); + + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + + kfree(ntbl); +} + +static void bucket_table_free(struct bucket_table *tbl) +{ + if (tbl->nest) + nested_bucket_table_free(tbl); + + kvfree(tbl); +} + +static void bucket_table_free_rcu(struct rcu_head *head) +{ + bucket_table_free(container_of(head, struct bucket_table, rcu)); +} + +static union nested_table *nested_table_alloc(struct rhashtable *ht, + union nested_table __rcu **prev, + bool leaf) +{ + union nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); + + if (ntbl && leaf) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket); + } + + if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) + return ntbl; + /* Raced with another thread. */ + kfree(ntbl); + return rcu_dereference(*prev); +} + +static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = kzalloc(size, gfp); + if (!tbl) + return NULL; + + if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, + false)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + +static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + struct bucket_table *tbl = NULL; + size_t size; + int i; + + tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp); + + size = nbuckets; + + if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) { + tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; + } + + if (tbl == NULL) + return NULL; + + tbl->size = size; + + rcu_head_init(&tbl->rcu); + INIT_LIST_HEAD(&tbl->walkers); + + tbl->hash_rnd = get_random_u32(); + + for (i = 0; i < nbuckets; i++) + INIT_RHT_NULLS_HEAD(tbl->buckets[i]); + + return tbl; +} + +static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, + struct bucket_table *tbl) +{ + struct bucket_table *new_tbl; + + do { + new_tbl = tbl; + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + } while (tbl); + + return new_tbl; +} + +static int rhashtable_rehash_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + unsigned int old_hash) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); + int err = -EAGAIN; + struct rhash_head *head, *next, *entry; + struct rhash_head __rcu **pprev = NULL; + unsigned int new_hash; + + if (new_tbl->nest) + goto out; + + err = -ENOENT; + + 
rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash), + old_tbl, old_hash) { + err = 0; + next = rht_dereference_bucket(entry->next, old_tbl, old_hash); + + if (rht_is_a_nulls(next)) + break; + + pprev = &entry->next; + } + + if (err) + goto out; + + new_hash = head_hashfn(ht, new_tbl, entry); + + rht_lock(new_tbl, &new_tbl->buckets[new_hash]); + + head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); + + RCU_INIT_POINTER(entry->next, head); + + rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry); + + if (pprev) + rcu_assign_pointer(*pprev, next); + else + /* Need to preserved the bit lock. */ + rht_assign_locked(bkt, next); + +out: + return err; +} + +static int rhashtable_rehash_chain(struct rhashtable *ht, + unsigned int old_hash) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); + int err; + + if (!bkt) + return 0; + rht_lock(old_tbl, bkt); + + while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) + ; + + if (err == -ENOENT) + err = 0; + rht_unlock(old_tbl, bkt); + + return err; +} + +static int rhashtable_rehash_attach(struct rhashtable *ht, + struct bucket_table *old_tbl, + struct bucket_table *new_tbl) +{ + /* Make insertions go into the new, empty table right away. Deletions + * and lookups will be attempted in both tables until we synchronize. + * As cmpxchg() provides strong barriers, we do not need + * rcu_assign_pointer(). + */ + + if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, + new_tbl) != NULL) + return -EEXIST; + + return 0; +} + +static int rhashtable_rehash_table(struct rhashtable *ht) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; + struct rhashtable_walker *walker; + unsigned int old_hash; + int err; + + new_tbl = rht_dereference(old_tbl->future_tbl, ht); + if (!new_tbl) + return 0; + + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + err = rhashtable_rehash_chain(ht, old_hash); + if (err) + return err; + cond_resched(); + } + + /* Publish the new table pointer. */ + rcu_assign_pointer(ht->tbl, new_tbl); + + spin_lock(&ht->lock); + list_for_each_entry(walker, &old_tbl->walkers, list) + walker->tbl = NULL; + + /* Wait for readers. All new readers will see the new + * table, and thus no references to the old table will + * remain. + * We do this inside the locked region so that + * rhashtable_walk_stop() can use rcu_head_after_call_rcu() + * to check if it should not re-link the table. + */ + call_rcu(&old_tbl->rcu, bucket_table_free_rcu); + spin_unlock(&ht->lock); + + return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; +} + +static int rhashtable_rehash_alloc(struct rhashtable *ht, + struct bucket_table *old_tbl, + unsigned int size) +{ + struct bucket_table *new_tbl; + int err; + + ASSERT_RHT_MUTEX(ht); + + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (new_tbl == NULL) + return -ENOMEM; + + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); + + return err; +} + +/** + * rhashtable_shrink - Shrink hash table while allowing concurrent lookups + * @ht: the hash table to shrink + * + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * The caller must ensure that no concurrent table mutations take place. 
+ * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ +static int rhashtable_shrink(struct rhashtable *ht) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + unsigned int nelems = atomic_read(&ht->nelems); + unsigned int size = 0; + + if (nelems) + size = roundup_pow_of_two(nelems * 3 / 2); + if (size < ht->p.min_size) + size = ht->p.min_size; + + if (old_tbl->size <= size) + return 0; + + if (rht_dereference(old_tbl->future_tbl, ht)) + return -EEXIST; + + return rhashtable_rehash_alloc(ht, old_tbl, size); +} + +static void rht_deferred_worker(struct work_struct *work) +{ + struct rhashtable *ht; + struct bucket_table *tbl; + int err = 0; + + ht = container_of(work, struct rhashtable, run_work); + mutex_lock(&ht->mutex); + + tbl = rht_dereference(ht->tbl, ht); + tbl = rhashtable_last_table(ht, tbl); + + if (rht_grow_above_75(ht, tbl)) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); + else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) + err = rhashtable_shrink(ht); + else if (tbl->nest) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size); + + if (!err || err == -EEXIST) { + int nerr; + + nerr = rhashtable_rehash_table(ht); + err = err ?: nerr; + } + + mutex_unlock(&ht->mutex); + + if (err) + schedule_work(&ht->run_work); +} + +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) +{ + struct bucket_table *old_tbl; + struct bucket_table *new_tbl; + unsigned int size; + int err; + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + + size = tbl->size; + + err = -EBUSY; + + if (rht_grow_above_75(ht, tbl)) + size *= 2; + /* Do not schedule more than one rehash */ + else if (old_tbl != tbl) + goto fail; + + err = -ENOMEM; + + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); + if (new_tbl == NULL) + goto fail; + + err = rhashtable_rehash_attach(ht, tbl, new_tbl); + if (err) { + bucket_table_free(new_tbl); + if (err == -EEXIST) + err = 0; + } else + schedule_work(&ht->run_work); + + return err; + +fail: + /* Do not fail the insert if someone else did a rehash. */ + if (likely(rcu_access_pointer(tbl->future_tbl))) + return 0; + + /* Schedule async rehash to retry allocation in process context. */ + if (err == -ENOMEM) + schedule_work(&ht->run_work); + + return err; +} + +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev = NULL; + struct rhash_head *head; + int elasticity; + + elasticity = RHT_ELASTICITY; + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? 
+ ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } + + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + if (pprev) + rcu_assign_pointer(*pprev, obj); + else + /* Need to preserve the bit lock */ + rht_assign_locked(bkt, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one( + struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); + + head = rht_ptr(bkt, tbl, hash); + + RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } + + /* bkt is always the head of the list, so it holds + * the lock, which we need to preserve + */ + rht_assign_locked(bkt, obj); + + atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); + + return NULL; +} + +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + struct rhash_lock_head __rcu **bkt; + unsigned int hash; + void *data; + + new_tbl = rcu_dereference(ht->tbl); + + do { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + if (rcu_access_pointer(tbl->future_tbl)) + /* Failure is OK */ + bkt = rht_bucket_var(tbl, hash); + else + bkt = rht_bucket_insert(ht, tbl, hash); + if (bkt == NULL) { + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + data = ERR_PTR(-EAGAIN); + } else { + rht_lock(tbl, bkt); + data = rhashtable_lookup_one(ht, bkt, tbl, + hash, key, obj); + new_tbl = rhashtable_insert_one(ht, bkt, tbl, + hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + rht_unlock(tbl, bkt); + } + } while (!IS_ERR_OR_NULL(new_tbl)); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; +} +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); + +/** + * rhashtable_walk_enter - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. 
Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptable context, but cannot be called from softirq or + * hardirq context. + * + * You must call rhashtable_walk_exit after this function returns. + */ +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + iter->end_of_table = 0; + + spin_lock(&ht->lock); + iter->walker.tbl = + rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); + spin_unlock(&ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); + +/** + * rhashtable_walk_exit - Free an iterator + * @iter: Hash table Iterator + * + * This function frees resources allocated by rhashtable_walk_enter. + */ +void rhashtable_walk_exit(struct rhashtable_iter *iter) +{ + spin_lock(&iter->ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&iter->ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_exit); + +/** + * rhashtable_walk_start_check - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if resize event occured. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling rhashtable_walk_next. + * + * rhashtable_walk_start is defined as an inline variant that returns + * void. This is preferred in cases where the caller would ignore + * resize events and always continue. + */ +int rhashtable_walk_start_check(struct rhashtable_iter *iter) + __acquires(RCU) +{ + struct rhashtable *ht = iter->ht; + bool rhlist = ht->rhlist; + + rcu_read_lock(); + + spin_lock(&ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&ht->lock); + + if (iter->end_of_table) + return 0; + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); + iter->slot = 0; + iter->skip = 0; + return -EAGAIN; + } + + if (iter->p && !rhlist) { + /* + * We need to validate that 'p' is still in the table, and + * if so, update 'skip' + */ + struct rhash_head *p; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + skip++; + if (p == iter->p) { + iter->skip = skip; + goto found; + } + } + iter->p = NULL; + } else if (iter->p && rhlist) { + /* Need to validate that 'list' is still in the table, and + * if so, update 'skip' and 'p'. + */ + struct rhash_head *p; + struct rhlist_head *list; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + for (list = container_of(p, struct rhlist_head, rhead); + list; + list = rcu_dereference(list->next)) { + skip++; + if (list == iter->list) { + iter->p = p; + iter->skip = skip; + goto found; + } + } + } + iter->p = NULL; + } +found: + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); + +/** + * __rhashtable_walk_find_next - Find the next element in a table (or the first + * one in case of a new walk). + * + * @iter: Hash table iterator + * + * Returns the found object or NULL when the end of the table is reached. 
+ * + * Returns -EAGAIN if resize event occurred. + */ +static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) +{ + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (!tbl) + return NULL; + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } + if (!skip) + break; + skip--; + } + +next: + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + iter->skip = 0; + } + + iter->p = NULL; + + /* Ensure we see any new tables. */ + smp_rmb(); + + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { + iter->slot = 0; + iter->skip = 0; + return ERR_PTR(-EAGAIN); + } else { + iter->end_of_table = true; + } + + return NULL; +} + +/** + * rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. + */ +void *rhashtable_walk_next(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (p) { + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + /* At the end of this slot, switch to next one and then find + * next entry from that point. + */ + iter->skip = 0; + iter->slot++; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_next); + +/** + * rhashtable_walk_peek - Return the next object but don't advance the iterator + * @iter: Hash table iterator + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. + */ +void *rhashtable_walk_peek(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + + if (p) + return rht_obj(ht, ht->rhlist ? &list->rhead : p); + + /* No object found in current iter, find next one in the table. */ + + if (iter->skip) { + /* A nonzero skip value points to the next entry in the table + * beyond that last one that was found. Decrement skip so + * we find the current value. __rhashtable_walk_find_next + * will restore the original value of skip assuming that + * the table hasn't changed. + */ + iter->skip--; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_peek); + +/** + * rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. 
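An illustrative sketch (not part of the patch) of the walk protocol documented above, assuming the void-returning rhashtable_walk_start() inline wrapper mentioned earlier; the demo function is hypothetical.

    static void demo_walk(struct rhashtable *ht)
    {
            struct rhashtable_iter iter;
            void *obj;

            rhashtable_walk_enter(ht, &iter);
            rhashtable_walk_start(&iter);

            while ((obj = rhashtable_walk_next(&iter)) != NULL) {
                    if (IS_ERR(obj)) {
                            if (PTR_ERR(obj) == -EAGAIN)    /* resize: iterator rewound, keep going */
                                    continue;
                            break;
                    }
                    /* inspect obj; restarts may repeat or skip objects, as noted above */
            }

            rhashtable_walk_stop(&iter);
            rhashtable_walk_exit(&iter);
    }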
+ */ +void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) +{ + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker.tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) + /* This bucket table is being freed, don't re-link it. */ + iter->walker.tbl = NULL; + else + list_add(&iter->walker.list, &tbl->walkers); + spin_unlock(&ht->lock); + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_stop); + +static size_t rounded_hashtable_size(const struct rhashtable_params *params) +{ + size_t retsize; + + if (params->nelem_hint) + retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + (unsigned long)params->min_size); + else + retsize = max(HASH_DEFAULT_SIZE, + (unsigned long)params->min_size); + + return retsize; +} + +static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) +{ + return jhash2(key, length, seed); +} + +/** + * rhashtable_init - initialize a new hash table + * @ht: hash table to be initialized + * @params: configuration parameters + * + * Initializes a new hash table based on the provided configuration + * parameters. A table can be configured either with a variable or + * fixed length key: + * + * Configuration Example 1: Fixed length keys + * struct test_obj { + * int key; + * void * my_member; + * struct rhash_head node; + * }; + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .key_offset = offsetof(struct test_obj, key), + * .key_len = sizeof(int), + * .hashfn = jhash, + * }; + * + * Configuration Example 2: Variable length keys + * struct test_obj { + * [...] + * struct rhash_head node; + * }; + * + * u32 my_hash_fn(const void *data, u32 len, u32 seed) + * { + * struct test_obj *obj = data; + * + * return [... hash ...]; + * } + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .hashfn = jhash, + * .obj_hashfn = my_hash_fn, + * }; + */ +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params) +{ + struct bucket_table *tbl; + size_t size; + + if ((!params->key_len && !params->obj_hashfn) || + (params->obj_hashfn && !params->obj_cmpfn)) + return -EINVAL; + + memset(ht, 0, sizeof(*ht)); + mutex_init(&ht->mutex); + spin_lock_init(&ht->lock); + memcpy(&ht->p, params, sizeof(*params)); + + if (params->min_size) + ht->p.min_size = roundup_pow_of_two(params->min_size); + + /* Cap total entries at 2^31 to avoid nelems overflow. */ + ht->max_elems = 1u << 31; + + if (params->max_size) { + ht->p.max_size = rounddown_pow_of_two(params->max_size); + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + } + + ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + + size = rounded_hashtable_size(&ht->p); + + ht->key_len = ht->p.key_len; + if (!params->hashfn) { + ht->p.hashfn = jhash; + + if (!(ht->key_len & (sizeof(u32) - 1))) { + ht->key_len /= sizeof(u32); + ht->p.hashfn = rhashtable_jhash2; + } + } + + /* + * This is api initialization and thus we need to guarantee the + * initial rhashtable allocation. Upon failure, retry with the + * smallest possible size with __GFP_NOFAIL semantics. 
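A minimal end-to-end sketch of "Configuration Example 1" above, assuming the usual rhashtable_insert_fast()/rhashtable_lookup_fast() inlines from the matching header (they are not part of this file); the hash function is left unset so the jhash/jhash2 default chosen by rhashtable_init() applies.

struct test_obj {
	int			key;
	struct rhash_head	node;
};

static const struct rhashtable_params test_params = {
	.head_offset	= offsetof(struct test_obj, node),
	.key_offset	= offsetof(struct test_obj, key),
	.key_len	= sizeof(int),
};

static int demo(void)
{
	struct rhashtable ht;
	struct test_obj *obj;
	int key = 1, ret;

	ret = rhashtable_init(&ht, &test_params);
	if (ret)
		return ret;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj) {
		ret = -ENOMEM;
		goto out;
	}
	obj->key = key;

	ret = rhashtable_insert_fast(&ht, &obj->node, test_params);
	if (ret) {
		kfree(obj);
		goto out;
	}

	obj = rhashtable_lookup_fast(&ht, &key, test_params);
	/* obj now points at the element inserted above */
out:
	rhashtable_free_and_destroy(&ht, NULL, NULL);	/* NULL free_fn: elements are not freed here */
	return ret;
}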
+ */ + tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (unlikely(tbl == NULL)) { + size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); + } + + atomic_set(&ht->nelems, 0); + + RCU_INIT_POINTER(ht->tbl, tbl); + + INIT_WORK(&ht->run_work, rht_deferred_worker); + + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_init); + +/** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) +{ + int err; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + +/** + * rhashtable_free_and_destroy - free elements and destroy hash table + * @ht: the hash table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to releasal resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array. + * + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occurs in parallel. + */ +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct bucket_table *tbl, *next_tbl; + unsigned int i; + + cancel_work_sync(&ht->run_work); + + mutex_lock(&ht->mutex); + tbl = rht_dereference(ht->tbl, ht); +restart: + if (free_fn) { + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos, *next; + + cond_resched(); + for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL; + !rht_is_a_nulls(pos); + pos = next, + next = !rht_is_a_nulls(pos) ? 
+ rht_dereference(pos->next, ht) : NULL) + rhashtable_free_one(ht, pos, free_fn, arg); + } + } + + next_tbl = rht_dereference(tbl->future_tbl, ht); + bucket_table_free(tbl); + if (next_tbl) { + tbl = next_tbl; + goto restart; + } + mutex_unlock(&ht->mutex); +} +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ + return rhashtable_free_and_destroy(ht, NULL, NULL); +} +EXPORT_SYMBOL_GPL(rhashtable_destroy); + +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + unsigned int subhash = hash; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); + subhash >>= tbl->nest; + + while (ntbl && size > (1 << shift)) { + index = subhash & ((1 << shift) - 1); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, + tbl, hash); + size >>= shift; + subhash >>= shift; + } + + if (!ntbl) + return NULL; + + return &ntbl[subhash].bucket; + +} +EXPORT_SYMBOL_GPL(__rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + static struct rhash_lock_head __rcu *rhnull; + + if (!rhnull) + INIT_RHT_NULLS_HEAD(rhnull); + return __rht_bucket_nested(tbl, hash) ?: &rhnull; +} +EXPORT_SYMBOL_GPL(rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + hash >>= tbl->nest; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + + while (ntbl && size > (1 << shift)) { + index = hash & ((1 << shift) - 1); + size >>= shift; + hash >>= shift; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + } + + if (!ntbl) + return NULL; + + return &ntbl[hash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); diff --git a/linux/sched.c b/linux/sched.c new file mode 100644 index 00000000..1c7198d2 --- /dev/null +++ b/linux/sched.c @@ -0,0 +1,133 @@ + +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <linux/futex.h> + +/* hack for mips: */ +#define CONFIG_RCU_HAVE_FUTEX 1 +#include <urcu/futex.h> + +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/timer.h> + +__thread struct task_struct *current; + +void __put_task_struct(struct task_struct *t) +{ + pthread_join(t->thread, NULL); + free(t); +} + +/* returns true if process was woken up, false if it was already running */ +int wake_up_process(struct task_struct *p) +{ + int ret = p->state != TASK_RUNNING; + + p->state = TASK_RUNNING; + futex(&p->state, FUTEX_WAKE|FUTEX_PRIVATE_FLAG, + INT_MAX, NULL, NULL, 0); + return ret; +} + +void schedule(void) +{ + int v; + + rcu_quiescent_state(); + + while ((v = READ_ONCE(current->state)) != TASK_RUNNING) + futex(¤t->state, FUTEX_WAIT|FUTEX_PRIVATE_FLAG, + v, NULL, NULL, 0); +} + +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) +{ + struct process_timer *timeout = + container_of(t, struct process_timer, timer); + + 
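Returning to rhashtable_free_and_destroy() above: when the elements were kmalloc'd, the free_fn callback is typically just a kfree() wrapper. A minimal sketch (teardown() is a hypothetical caller):

static void free_obj(void *ptr, void *arg)
{
	kfree(ptr);		/* ptr is the element, i.e. rht_obj(ht, node) */
}

static void teardown(struct rhashtable *ht)
{
	/* cancels any pending resize, frees every element, then the buckets */
	rhashtable_free_and_destroy(ht, free_obj, NULL);
}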
wake_up_process(timeout->task); +} + +long schedule_timeout(long timeout) +{ + struct process_timer timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) { + fprintf(stderr, "schedule_timeout: wrong timeout " + "value %lx\n", timeout); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + mod_timer(&timer.timer, expire); + schedule(); + del_timer_sync(&timer.timer); + + timeout = expire - jiffies; +out: + return timeout < 0 ? 0 : timeout; +} + +__attribute__((constructor(101))) +static void sched_init(void) +{ + struct task_struct *p = malloc(sizeof(*p)); + + memset(p, 0, sizeof(*p)); + + p->state = TASK_RUNNING; + atomic_set(&p->usage, 1); + init_completion(&p->exited); + + current = p; + + rcu_init(); + rcu_register_thread(); +} + +#ifndef SYS_getrandom +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/types.h> +int urandom_fd; + +__attribute__((constructor(101))) +static void rand_init(void) +{ + urandom_fd = open("/dev/urandom", O_RDONLY); + BUG_ON(urandom_fd < 0); +} +#endif diff --git a/linux/semaphore.c b/linux/semaphore.c new file mode 100644 index 00000000..b7d4b517 --- /dev/null +++ b/linux/semaphore.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2008 Intel Corporation + * Author: Matthew Wilcox <willy@linux.intel.com> + * + * This file implements counting semaphores. + * A counting semaphore may be acquired 'n' times before sleeping. + * See mutex.c for single-acquisition sleeping locks which enforce + * rules which allow code to be debugged more easily. + */ + +/* + * Some notes on the implementation: + * + * The spinlock controls access to the other members of the semaphore. + * down_trylock() and up() can be called from interrupt context, so we + * have to disable interrupts when taking the lock. It turns out various + * parts of the kernel expect to be able to use down() on a semaphore in + * interrupt context when they know it will succeed, so we have to use + * irqsave variants for down(), down_interruptible() and down_killable() + * too. + * + * The ->count variable represents how many more tasks can acquire this + * semaphore. If it's zero, there may be tasks waiting on the wait_list. + */ + +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/spinlock.h> + +static noinline void __down(struct semaphore *sem); +static noinline int __down_timeout(struct semaphore *sem, long timeout); +static noinline void __up(struct semaphore *sem); + +/** + * down - acquire the semaphore + * @sem: the semaphore to be acquired + * + * Acquires the semaphore. 
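The wake_up_process()/schedule() pair above gives this userspace shim the usual kernel sleep/wake protocol, built on a futex over task->state. A minimal sketch of waiting for a condition on top of it; the done flag and both functions are hypothetical:

static int done;

static void wait_for_done(void)
{
	for (;;) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(done))
			break;
		schedule();		/* futex-waits until state is TASK_RUNNING again */
	}
	__set_current_state(TASK_RUNNING);
}

static void signal_done(struct task_struct *sleeper)
{
	WRITE_ONCE(done, 1);
	wake_up_process(sleeper);	/* sets TASK_RUNNING and wakes the futex */
}

The sleeper publishes its non-running state before re-checking the condition, so a wakeup that races with the check is not lost: either the sleeper sees done set, or schedule() sees TASK_RUNNING and returns immediately.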
If no more tasks are allowed to acquire the + * semaphore, calling this function will put the task to sleep until the + * semaphore is released. + * + * Use of this function is deprecated, please use down_interruptible() or + * down_killable() instead. + */ +void down(struct semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + __down(sem); + raw_spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(down); + +/** + * down_trylock - try to acquire the semaphore, without waiting + * @sem: the semaphore to be acquired + * + * Try to acquire the semaphore atomically. Returns 0 if the semaphore has + * been acquired successfully or 1 if it it cannot be acquired. + * + * NOTE: This return value is inverted from both spin_trylock and + * mutex_trylock! Be careful about this when converting code. + * + * Unlike mutex_trylock, this function can be used from interrupt context, + * and the semaphore can be released by any task or interrupt. + */ +int down_trylock(struct semaphore *sem) +{ + unsigned long flags; + int count; + + raw_spin_lock_irqsave(&sem->lock, flags); + count = sem->count - 1; + if (likely(count >= 0)) + sem->count = count; + raw_spin_unlock_irqrestore(&sem->lock, flags); + + return (count < 0); +} +EXPORT_SYMBOL(down_trylock); + +/** + * down_timeout - acquire the semaphore within a specified time + * @sem: the semaphore to be acquired + * @timeout: how long to wait before failing + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the semaphore is not released within the specified number of jiffies, + * this function returns -ETIME. It returns 0 if the semaphore was acquired. + */ +int down_timeout(struct semaphore *sem, long timeout) +{ + unsigned long flags; + int result = 0; + + raw_spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_timeout(sem, timeout); + raw_spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_timeout); + +/** + * up - release the semaphore + * @sem: the semaphore to release + * + * Release the semaphore. Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). + */ +void up(struct semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->lock, flags); + if (likely(list_empty(&sem->wait_list))) + sem->count++; + else + __up(sem); + raw_spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(up); + +/* Functions for the contended case */ + +struct semaphore_waiter { + struct list_head list; + struct task_struct *task; + bool up; +}; + +/* + * Because this function is inlined, the 'state' parameter will be + * constant, and thus optimised away by the compiler. Likewise the + * 'timeout' parameter for the cases without timeouts. 
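A usage sketch for the semaphore API above: a counting semaphore permitting four concurrent holders. sema_init() is assumed to come from the semaphore header; note the inverted down_trylock() return value called out in its comment.

static struct semaphore sem;

static void sem_demo_init(void)
{
	sema_init(&sem, 4);		/* up to four tasks may hold it at once */
}

static void sem_demo(void)
{
	down(&sem);			/* may sleep */
	/* ... section with bounded concurrency ... */
	up(&sem);

	if (!down_trylock(&sem)) {	/* 0 means acquired, unlike spin/mutex_trylock */
		/* ... */
		up(&sem);
	}
}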
+ */ +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + struct semaphore_waiter waiter; + + list_add_tail(&waiter.list, &sem->wait_list); + waiter.task = current; + waiter.up = false; + + for (;;) { + if (unlikely(timeout <= 0)) + goto timed_out; + __set_current_state(state); + raw_spin_unlock_irq(&sem->lock); + timeout = schedule_timeout(timeout); + raw_spin_lock_irq(&sem->lock); + if (waiter.up) + return 0; + } + + timed_out: + list_del(&waiter.list); + return -ETIME; +} + +static noinline void __sched __down(struct semaphore *sem) +{ + __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) +{ + return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); +} + +static noinline void __sched __up(struct semaphore *sem) +{ + struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, + struct semaphore_waiter, list); + list_del(&waiter->list); + waiter->up = true; + wake_up_process(waiter->task); +} diff --git a/linux/seq_buf.c b/linux/seq_buf.c new file mode 100644 index 00000000..cf8709ad --- /dev/null +++ b/linux/seq_buf.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * seq_buf.c + * + * Copyright (C) 2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * The seq_buf is a handy tool that allows you to pass a descriptor around + * to a buffer that other functions can write to. It is similar to the + * seq_file functionality but has some differences. + * + * To use it, the seq_buf must be initialized with seq_buf_init(). + * This will set up the counters within the descriptor. You can call + * seq_buf_init() more than once to reset the seq_buf to start + * from scratch. + */ +#include <linux/seq_buf.h> +#include <stdio.h> + +/** + * seq_buf_can_fit - can the new data fit in the current buffer? + * @s: the seq_buf descriptor + * @len: The length to see if it can fit in the current buffer + * + * Returns true if there's enough unused space in the seq_buf buffer + * to fit the amount of new data according to @len. + */ +static bool seq_buf_can_fit(struct seq_buf *s, size_t len) +{ + return s->len + len <= s->size; +} + +/** + * seq_buf_vprintf - sequence printing of information. + * @s: seq_buf descriptor + * @fmt: printf format string + * @args: va_list of arguments from a printf() type function + * + * Writes a vnprintf() format into the sequencce buffer. + * + * Returns zero on success, -1 on overflow. + */ +int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args) +{ + int len; + + WARN_ON(s->size == 0); + + if (s->len < s->size) { + len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args); + if (s->len + len < s->size) { + s->len += len; + return 0; + } + } + seq_buf_set_overflow(s); + return -1; +} + +/** + * seq_buf_printf - sequence printing of information + * @s: seq_buf descriptor + * @fmt: printf format string + * + * Writes a printf() format into the sequence buffer. + * + * Returns zero on success, -1 on overflow. + */ +int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = seq_buf_vprintf(s, fmt, ap); + va_end(ap); + + return ret; +} + +/** + * seq_buf_puts - sequence printing of simple string + * @s: seq_buf descriptor + * @str: simple string to record + * + * Copy a simple string into the sequence buffer. 
+ * + * Returns zero on success, -1 on overflow + */ +int seq_buf_puts(struct seq_buf *s, const char *str) +{ + size_t len = strlen(str); + + WARN_ON(s->size == 0); + + /* Add 1 to len for the trailing null byte which must be there */ + len += 1; + + if (seq_buf_can_fit(s, len)) { + memcpy(s->buffer + s->len, str, len); + /* Don't count the trailing null byte against the capacity */ + s->len += len - 1; + return 0; + } + seq_buf_set_overflow(s); + return -1; +} + +/** + * seq_buf_putc - sequence printing of simple character + * @s: seq_buf descriptor + * @c: simple character to record + * + * Copy a single character into the sequence buffer. + * + * Returns zero on success, -1 on overflow + */ +int seq_buf_putc(struct seq_buf *s, unsigned char c) +{ + WARN_ON(s->size == 0); + + if (seq_buf_can_fit(s, 1)) { + s->buffer[s->len++] = c; + return 0; + } + seq_buf_set_overflow(s); + return -1; +} + +/** + * seq_buf_putmem - write raw data into the sequenc buffer + * @s: seq_buf descriptor + * @mem: The raw memory to copy into the buffer + * @len: The length of the raw memory to copy (in bytes) + * + * There may be cases where raw memory needs to be written into the + * buffer and a strcpy() would not work. Using this function allows + * for such cases. + * + * Returns zero on success, -1 on overflow + */ +int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len) +{ + WARN_ON(s->size == 0); + + if (seq_buf_can_fit(s, len)) { + memcpy(s->buffer + s->len, mem, len); + s->len += len; + return 0; + } + seq_buf_set_overflow(s); + return -1; +} diff --git a/linux/shrinker.c b/linux/shrinker.c new file mode 100644 index 00000000..ca34ebc7 --- /dev/null +++ b/linux/shrinker.c @@ -0,0 +1,140 @@ + +#include <stdio.h> +#include <unistd.h> + +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/shrinker.h> + +#include "tools-util.h" + +static LIST_HEAD(shrinker_list); +static DEFINE_MUTEX(shrinker_lock); + +void shrinker_free(struct shrinker *s) +{ + if (s->list.next) { + mutex_lock(&shrinker_lock); + list_del(&s->list); + mutex_unlock(&shrinker_lock); + } + free(s); +} + +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) +{ + return calloc(sizeof(struct shrinker), 1); +} + +int shrinker_register(struct shrinker *shrinker) +{ + mutex_lock(&shrinker_lock); + list_add_tail(&shrinker->list, &shrinker_list); + mutex_unlock(&shrinker_lock); + return 0; +} + +static void run_shrinkers_allocation_failed(gfp_t gfp_mask) +{ + struct shrinker *shrinker; + + mutex_lock(&shrinker_lock); + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { .gfp_mask = gfp_mask, }; + + unsigned long have = shrinker->count_objects(shrinker, &sc); + + sc.nr_to_scan = have / 8; + + shrinker->scan_objects(shrinker, &sc); + } + mutex_unlock(&shrinker_lock); +} + +void run_shrinkers(gfp_t gfp_mask, bool allocation_failed) +{ + struct shrinker *shrinker; + struct sysinfo info; + s64 want_shrink; + + if (!(gfp_mask & GFP_KERNEL)) + return; + + /* Fast out if there are no shrinkers to run. 
*/ + if (list_empty(&shrinker_list)) + return; + + if (allocation_failed) { + run_shrinkers_allocation_failed(gfp_mask); + return; + } + + si_meminfo(&info); + + /* Aim for 6% of physical RAM free without anything in swap */ + want_shrink = (info.totalram >> 4) - info.freeram + + info.totalswap - info.freeswap; + if (want_shrink <= 0) + return; + + mutex_lock(&shrinker_lock); + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nr_to_scan = want_shrink >> PAGE_SHIFT + }; + + shrinker->scan_objects(shrinker, &sc); + } + mutex_unlock(&shrinker_lock); +} + +static int shrinker_thread(void *arg) +{ + while (!kthread_should_stop()) { + struct timespec to; + int v; + + clock_gettime(CLOCK_MONOTONIC, &to); + to.tv_sec += 1; + __set_current_state(TASK_INTERRUPTIBLE); + errno = 0; + while ((v = READ_ONCE(current->state)) != TASK_RUNNING && + errno != ETIMEDOUT) + futex(¤t->state, FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG, + v, &to, NULL, (uint32_t)~0); + if (kthread_should_stop()) + break; + if (v != TASK_RUNNING) + __set_current_state(TASK_RUNNING); + run_shrinkers(GFP_KERNEL, false); + } + + return 0; +} + +struct task_struct *shrinker_task; + +__attribute__((constructor(103))) +static void shrinker_thread_init(void) +{ + shrinker_task = kthread_run(shrinker_thread, NULL, "shrinkers"); + BUG_ON(IS_ERR(shrinker_task)); +} + +#if 0 +/* + * We seem to be hitting a rare segfault when shutting down the shrinker thread. + * Disabling this is going to cause some harmless warnings about memory leaks: + */ +__attribute__((destructor(103))) +static void shrinker_thread_exit(void) +{ + int ret = kthread_stop(shrinker_task); + BUG_ON(ret); + + shrinker_task = NULL; +} +#endif diff --git a/linux/siphash.c b/linux/siphash.c new file mode 100644 index 00000000..f8dbecea --- /dev/null +++ b/linux/siphash.c @@ -0,0 +1,552 @@ +/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * + * This file is provided under a dual BSD/GPLv2 license. + * + * SipHash: a fast short-input PRF + * https://131002.net/siphash/ + * + * This implementation is specifically for SipHash2-4 for a secure PRF + * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for + * hashtables. 
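A usage sketch for the SipHash implementation that follows: hash a connection tuple with a per-run random key, e.g. for a hash table index. siphash_2u64() is one of the fixed-width helpers defined in this file; get_random_bytes() is assumed to come from the random header.

static siphash_key_t demo_key;

static void demo_key_init(void)
{
	get_random_bytes(&demo_key, sizeof(demo_key));
}

static u64 flow_hash(u64 src, u64 dst)
{
	return siphash_2u64(src, dst, &demo_key);	/* keyed, so not attacker-predictable */
}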
+ */ + +#include <linux/siphash.h> +#include <linux/bitops.h> +#include <asm/unaligned.h> + +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 +#include <linux/dcache.h> +#include <asm/word-at-a-time.h> +#endif + +#define SIPROUND \ + do { \ + v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \ + v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \ + v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \ + v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \ + } while (0) + +#define PREAMBLE(len) \ + u64 v0 = 0x736f6d6570736575ULL; \ + u64 v1 = 0x646f72616e646f6dULL; \ + u64 v2 = 0x6c7967656e657261ULL; \ + u64 v3 = 0x7465646279746573ULL; \ + u64 b = ((u64)(len)) << 56; \ + v3 ^= key->key[1]; \ + v2 ^= key->key[0]; \ + v1 ^= key->key[1]; \ + v0 ^= key->key[0]; + +#define POSTAMBLE \ + v3 ^= b; \ + SIPROUND; \ + SIPROUND; \ + v0 ^= b; \ + v2 ^= 0xff; \ + SIPROUND; \ + SIPROUND; \ + SIPROUND; \ + SIPROUND; \ + return (v0 ^ v1) ^ (v2 ^ v3); + +u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + PREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = le64_to_cpup(data); + v3 ^= m; + SIPROUND; + SIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; fallthrough; + case 6: b |= ((u64)end[5]) << 40; fallthrough; + case 5: b |= ((u64)end[4]) << 32; fallthrough; + case 4: b |= le32_to_cpup(data); break; + case 3: b |= ((u64)end[2]) << 16; fallthrough; + case 2: b |= le16_to_cpup(data); break; + case 1: b |= end[0]; + } +#endif + POSTAMBLE +} +EXPORT_SYMBOL(__siphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + PREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = get_unaligned_le64(data); + v3 ^= m; + SIPROUND; + SIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; fallthrough; + case 6: b |= ((u64)end[5]) << 40; fallthrough; + case 5: b |= ((u64)end[4]) << 32; fallthrough; + case 4: b |= get_unaligned_le32(end); break; + case 3: b |= ((u64)end[2]) << 16; fallthrough; + case 2: b |= get_unaligned_le16(end); break; + case 1: b |= end[0]; + } +#endif + POSTAMBLE +} +EXPORT_SYMBOL(__siphash_unaligned); +#endif + +/** + * siphash_1u64 - compute 64-bit siphash PRF value of a u64 + * @first: first u64 + * @key: the siphash key + */ +u64 siphash_1u64(const u64 first, const siphash_key_t *key) +{ + PREAMBLE(8) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_1u64); + +/** + * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 + * @first: first u64 + * @second: second u64 + * @key: the siphash key + */ +u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) +{ + PREAMBLE(16) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + v3 ^= second; + SIPROUND; + SIPROUND; + v0 ^= second; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_2u64); + +/** + * siphash_3u64 - compute 64-bit siphash 
PRF value of 3 u64 + * @first: first u64 + * @second: second u64 + * @third: third u64 + * @key: the siphash key + */ +u64 siphash_3u64(const u64 first, const u64 second, const u64 third, + const siphash_key_t *key) +{ + PREAMBLE(24) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + v3 ^= second; + SIPROUND; + SIPROUND; + v0 ^= second; + v3 ^= third; + SIPROUND; + SIPROUND; + v0 ^= third; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_3u64); + +/** + * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 + * @first: first u64 + * @second: second u64 + * @third: third u64 + * @forth: forth u64 + * @key: the siphash key + */ +u64 siphash_4u64(const u64 first, const u64 second, const u64 third, + const u64 forth, const siphash_key_t *key) +{ + PREAMBLE(32) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + v3 ^= second; + SIPROUND; + SIPROUND; + v0 ^= second; + v3 ^= third; + SIPROUND; + SIPROUND; + v0 ^= third; + v3 ^= forth; + SIPROUND; + SIPROUND; + v0 ^= forth; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_4u64); + +u64 siphash_1u32(const u32 first, const siphash_key_t *key) +{ + PREAMBLE(4) + b |= first; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_1u32); + +u64 siphash_3u32(const u32 first, const u32 second, const u32 third, + const siphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + PREAMBLE(12) + v3 ^= combined; + SIPROUND; + SIPROUND; + v0 ^= combined; + b |= third; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_3u32); + +#if BITS_PER_LONG == 64 +/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for + * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. + */ + +#define HSIPROUND SIPROUND +#define HPREAMBLE(len) PREAMBLE(len) +#define HPOSTAMBLE \ + v3 ^= b; \ + HSIPROUND; \ + v0 ^= b; \ + v2 ^= 0xff; \ + HSIPROUND; \ + HSIPROUND; \ + HSIPROUND; \ + return (v0 ^ v1) ^ (v2 ^ v3); + +u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = le64_to_cpup(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; fallthrough; + case 6: b |= ((u64)end[5]) << 40; fallthrough; + case 5: b |= ((u64)end[4]) << 32; fallthrough; + case 4: b |= le32_to_cpup(data); break; + case 3: b |= ((u64)end[2]) << 16; fallthrough; + case 2: b |= le16_to_cpup(data); break; + case 1: b |= end[0]; + } +#endif + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u32 __hsiphash_unaligned(const void *data, size_t len, + const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = get_unaligned_le64(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; fallthrough; + case 6: b |= ((u64)end[5]) << 40; fallthrough; + case 5: b |= ((u64)end[4]) << 32; fallthrough; + case 4: b |= get_unaligned_le32(end); break; + case 3: b |= ((u64)end[2]) << 16; 
fallthrough; + case 2: b |= get_unaligned_le16(end); break; + case 1: b |= end[0]; + } +#endif + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_unaligned); +#endif + +/** + * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 + * @first: first u32 + * @key: the hsiphash key + */ +u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) +{ + HPREAMBLE(4) + b |= first; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_1u32); + +/** + * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 + * @first: first u32 + * @second: second u32 + * @key: the hsiphash key + */ +u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + HPREAMBLE(8) + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_2u32); + +/** + * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @key: the hsiphash key + */ +u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, + const hsiphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + HPREAMBLE(12) + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + b |= third; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_3u32); + +/** + * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @forth: forth u32 + * @key: the hsiphash key + */ +u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, + const u32 forth, const hsiphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + HPREAMBLE(16) + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + combined = (u64)forth << 32 | third; + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_4u32); +#else +#define HSIPROUND \ + do { \ + v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \ + v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \ + v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \ + v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \ + } while (0) + +#define HPREAMBLE(len) \ + u32 v0 = 0; \ + u32 v1 = 0; \ + u32 v2 = 0x6c796765U; \ + u32 v3 = 0x74656462U; \ + u32 b = ((u32)(len)) << 24; \ + v3 ^= key->key[1]; \ + v2 ^= key->key[0]; \ + v1 ^= key->key[1]; \ + v0 ^= key->key[0]; + +#define HPOSTAMBLE \ + v3 ^= b; \ + HSIPROUND; \ + v0 ^= b; \ + v2 ^= 0xff; \ + HSIPROUND; \ + HSIPROUND; \ + HSIPROUND; \ + return v1 ^ v3; + +u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u32)); + const u8 left = len & (sizeof(u32) - 1); + u32 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u32)) { + m = le32_to_cpup(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } + switch (left) { + case 3: b |= ((u32)end[2]) << 16; fallthrough; + case 2: b |= le16_to_cpup(data); break; + case 1: b |= end[0]; + } + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u32 __hsiphash_unaligned(const void *data, size_t len, + const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u32)); + const u8 left = len & (sizeof(u32) - 1); + u32 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u32)) { + m = get_unaligned_le32(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } + switch (left) { + case 3: b |= ((u32)end[2]) << 16; fallthrough; + case 2: b |= get_unaligned_le16(end); break; + case 1: b |= end[0]; + } + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_unaligned); +#endif + +/** + * 
hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 + * @first: first u32 + * @key: the hsiphash key + */ +u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) +{ + HPREAMBLE(4) + v3 ^= first; + HSIPROUND; + v0 ^= first; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_1u32); + +/** + * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 + * @first: first u32 + * @second: second u32 + * @key: the hsiphash key + */ +u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) +{ + HPREAMBLE(8) + v3 ^= first; + HSIPROUND; + v0 ^= first; + v3 ^= second; + HSIPROUND; + v0 ^= second; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_2u32); + +/** + * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @key: the hsiphash key + */ +u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, + const hsiphash_key_t *key) +{ + HPREAMBLE(12) + v3 ^= first; + HSIPROUND; + v0 ^= first; + v3 ^= second; + HSIPROUND; + v0 ^= second; + v3 ^= third; + HSIPROUND; + v0 ^= third; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_3u32); + +/** + * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @forth: forth u32 + * @key: the hsiphash key + */ +u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, + const u32 forth, const hsiphash_key_t *key) +{ + HPREAMBLE(16) + v3 ^= first; + HSIPROUND; + v0 ^= first; + v3 ^= second; + HSIPROUND; + v0 ^= second; + v3 ^= third; + HSIPROUND; + v0 ^= third; + v3 ^= forth; + HSIPROUND; + v0 ^= forth; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_4u32); +#endif diff --git a/linux/string.c b/linux/string.c new file mode 100644 index 00000000..a32a8995 --- /dev/null +++ b/linux/string.c @@ -0,0 +1,112 @@ +/* + * linux/lib/string.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * stupid library routines.. The optimized versions should generally be found + * as inline code in <asm-xx/string.h> + * + * These are buggy as well.. + * + * * Fri Jun 25 1999, Ingo Oeser <ioe@informatik.tu-chemnitz.de> + * - Added strsep() which will replace strtok() soon (because strsep() is + * reentrant and should be faster). Use only strsep() in new code, please. + * + * * Sat Feb 09 2002, Jason Thomas <jason@topic.com.au>, + * Matthew Hawkins <matt@mh.dropbear.id.au> + * - Kissed strtok() goodbye + */ + +#include <ctype.h> +#include <errno.h> +#include <limits.h> +#include <string.h> + +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/string.h> + +static char *skip_spaces(const char *str) +{ + while (isspace(*str)) + ++str; + return (char *)str; +} + +char *strim(char *s) +{ + size_t size; + char *end; + + size = strlen(s); + if (!size) + return s; + + end = s + size - 1; + while (end >= s && isspace(*end)) + end--; + *(end + 1) = '\0'; + + return skip_spaces(s); +} + +size_t strlcpy(char *dest, const char *src, size_t size) +{ + size_t ret = strlen(src); + + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + memcpy(dest, src, len); + dest[len] = '\0'; + } + return ret; +} + +ssize_t strscpy(char *dest, const char *src, size_t count) +{ + long res = 0; + + if (count == 0 || WARN_ON_ONCE(count > INT_MAX)) + return -E2BIG; + + while (count) { + char c; + + c = src[res]; + dest[res] = c; + if (!c) + return res; + res++; + count--; + } + + /* Hit buffer length without finding a NUL; force NUL-termination. 
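To make the difference between the two copy helpers above concrete: strlcpy() always reports the source length, so truncation has to be inferred from the return value, while strscpy() reports the number of bytes copied or -E2BIG on truncation and forces NUL-termination. A small sketch:

static void copy_demo(void)
{
	char small[8];
	ssize_t n;

	n = strscpy(small, "short", sizeof(small));
	/* n == 5, small == "short" */

	n = strscpy(small, "much too long", sizeof(small));
	/* n == -E2BIG, small == "much to" (7 bytes + forced NUL) */

	if (strlcpy(small, "much too long", sizeof(small)) >= sizeof(small)) {
		/* strlcpy() signals truncation only via return >= size */
	}
}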
*/ + if (res) + dest[res-1] = '\0'; + + return -E2BIG; +} + +void memzero_explicit(void *s, size_t count) +{ + memset(s, 0, count); + barrier_data(s); +} + +int match_string(const char * const *array, size_t n, const char *string) +{ + int index; + const char *item; + + for (index = 0; index < n; index++) { + item = array[index]; + if (!item) + break; + if (!strcmp(item, string)) + return index; + } + + return -EINVAL; +} diff --git a/linux/string_helpers.c b/linux/string_helpers.c new file mode 100644 index 00000000..0810ca13 --- /dev/null +++ b/linux/string_helpers.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Helpers for formatting and printing strings + * + * Copyright 31 August 2008 James Bottomley + * Copyright (C) 2013, Intel Corporation + */ +#include <linux/bug.h> +#include <linux/kernel.h> +#include <linux/math64.h> +#include <linux/export.h> +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/limits.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/string_helpers.h> + +/** + * string_get_size - get the size in the specified units + * @size: The size to be converted in blocks + * @blk_size: Size of the block (use 1 for size in bytes) + * @units: units to use (powers of 1000 or 1024) + * @buf: buffer to format to + * @len: length of buffer + * + * This function returns a string formatted to 3 significant figures + * giving the size in the required units. @buf should have room for + * at least 9 bytes and will always be zero terminated. + * + */ +int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, + char *buf, int len) +{ + static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" + }; + static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" + }; + static const char *const *const units_str[] = { + [STRING_UNITS_10] = units_10, + [STRING_UNITS_2] = units_2, + }; + static const unsigned int divisor[] = { + [STRING_UNITS_10] = 1000, + [STRING_UNITS_2] = 1024, + }; + static const unsigned int rounding[] = { 500, 50, 5 }; + int i = 0, j; + u32 remainder = 0, sf_cap; + char tmp[12]; + const char *unit; + + tmp[0] = '\0'; + + if (blk_size == 0) + size = 0; + if (size == 0) + goto out; + + /* This is Napier's algorithm. Reduce the original block size to + * + * coefficient * divisor[units]^i + * + * we do the reduction so both coefficients are just under 32 bits so + * that multiplying them together won't overflow 64 bits and we keep + * as much precision as possible in the numbers. + * + * Note: it's safe to throw away the remainders here because all the + * precision is in the coefficients. + */ + while (blk_size >> 32) { + do_div(blk_size, divisor[units]); + i++; + } + + while (size >> 32) { + do_div(size, divisor[units]); + i++; + } + + /* now perform the actual multiplication keeping i as the sum of the + * two logarithms */ + size *= blk_size; + + /* and logarithmically reduce it until it's just under the divisor */ + while (size >= divisor[units]) { + remainder = do_div(size, divisor[units]); + i++; + } + + /* work out in j how many digits of precision we need from the + * remainder */ + sf_cap = size; + for (j = 0; sf_cap*10 < 1000; j++) + sf_cap *= 10; + + if (units == STRING_UNITS_2) { + /* express the remainder as a decimal. 
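A usage sketch for string_get_size() above: format a device size given in 512-byte sectors to three significant figures. printf() is just the libc function; the output shown follows the rounding rules described in the comment.

static void print_dev_size(u64 nr_sectors)
{
	char buf[16];

	string_get_size(nr_sectors, 512, STRING_UNITS_2, buf, sizeof(buf));
	printf("device size: %s\n", buf);	/* e.g. "1.46 MiB" for nr_sectors == 3000 */
}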
It's currently the + * numerator of a fraction whose denominator is + * divisor[units], which is 1 << 10 for STRING_UNITS_2 */ + remainder *= 1000; + remainder >>= 10; + } + + /* add a 5 to the digit below what will be printed to ensure + * an arithmetical round up and carry it through to size */ + remainder += rounding[j]; + if (remainder >= 1000) { + remainder -= 1000; + size += 1; + } + + if (j) { + snprintf(tmp, sizeof(tmp), ".%03u", remainder); + tmp[j+1] = '\0'; + } + + out: + if (i >= ARRAY_SIZE(units_2)) + unit = "UNK"; + else + unit = units_str[units][i]; + + return snprintf(buf, len, "%u%s %s", (u32)size, tmp, unit); +} +EXPORT_SYMBOL(string_get_size); diff --git a/linux/timer.c b/linux/timer.c new file mode 100644 index 00000000..7d519a4d --- /dev/null +++ b/linux/timer.c @@ -0,0 +1,329 @@ + +#include <pthread.h> +#include <signal.h> +#include <time.h> + +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/timer.h> + +/** + * timespec_add_ns - Adds nanoseconds to a timespec + * @a: pointer to timespec to be incremented + * @ns: unsigned nanoseconds value to be added + * + * This must always be inlined because its used from the x86-64 vdso, + * which cannot call other kernel functions. + */ +static struct timespec timespec_add_ns(struct timespec a, u64 ns) +{ + a.tv_nsec += ns; + a.tv_sec += a.tv_nsec / NSEC_PER_SEC; + a.tv_nsec %= NSEC_PER_SEC; + return a; +} + +#define DECLARE_HEAP(type) \ +struct { \ + size_t size, used; \ + type *data; \ +} + +#define heap_init(heap, _size) \ +({ \ + size_t _bytes; \ + (heap)->used = 0; \ + (heap)->size = (_size); \ + _bytes = (heap)->size * sizeof(*(heap)->data); \ + (heap)->data = malloc(_bytes); \ + (heap)->data; \ +}) + +#define heap_free(heap) \ +do { \ + kvfree((heap)->data); \ + (heap)->data = NULL; \ +} while (0) + +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_del(h, i, cmp) \ +do { \ + size_t _i = (i); \ + \ + BUG_ON(_i >= (h)->used); \ + (h)->used--; \ + if ((_i) < (h)->used) { \ + heap_swap(h, _i, (h)->used); \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ +} while (0) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + heap_del(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) ((h)->used ? 
&(h)->data[0] : NULL) +#define heap_full(h) ((h)->used == (h)->size) +#define heap_empty(h) ((h)->used == 0) + +#define heap_resort(heap, cmp) \ +do { \ + ssize_t _i; \ + for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ + heap_sift(heap, _i, cmp); \ +} while (0) + +struct pending_timer { + struct timer_list *timer; + unsigned long expires; +}; + +static inline bool pending_timer_cmp(struct pending_timer a, + struct pending_timer b) +{ + return a.expires < b.expires; +} + +static DECLARE_HEAP(struct pending_timer) pending_timers; + +static pthread_mutex_t timer_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t timer_cond = PTHREAD_COND_INITIALIZER; +static pthread_cond_t timer_running_cond = PTHREAD_COND_INITIALIZER; +static unsigned long timer_seq; + +static inline bool timer_running(void) +{ + return timer_seq & 1; +} + +static ssize_t timer_idx(struct timer_list *timer) +{ + size_t i; + + for (i = 0; i < pending_timers.used; i++) + if (pending_timers.data[i].timer == timer) + return i; + + return -1; +} + +int del_timer(struct timer_list *timer) +{ + ssize_t idx; + + pthread_mutex_lock(&timer_lock); + idx = timer_idx(timer); + if (idx >= 0) + heap_del(&pending_timers, idx, pending_timer_cmp); + + timer->pending = false; + pthread_mutex_unlock(&timer_lock); + + return idx >= 0; +} + +void flush_timers(void) +{ + unsigned long seq; + + pthread_mutex_lock(&timer_lock); + seq = timer_seq; + while (timer_running() && seq == timer_seq) + pthread_cond_wait(&timer_running_cond, &timer_lock); + + pthread_mutex_unlock(&timer_lock); +} + +int del_timer_sync(struct timer_list *timer) +{ + unsigned long seq; + ssize_t idx; + + pthread_mutex_lock(&timer_lock); + idx = timer_idx(timer); + if (idx >= 0) + heap_del(&pending_timers, idx, pending_timer_cmp); + + timer->pending = false; + + seq = timer_seq; + while (timer_running() && seq == timer_seq) + pthread_cond_wait(&timer_running_cond, &timer_lock); + pthread_mutex_unlock(&timer_lock); + + return idx >= 0; +} + +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + ssize_t idx; + + pthread_mutex_lock(&timer_lock); + timer->expires = expires; + timer->pending = true; + idx = timer_idx(timer); + + if (idx >= 0 && + pending_timers.data[idx].expires == expires) + goto out; + + if (idx >= 0) { + pending_timers.data[idx].expires = expires; + + heap_sift_down(&pending_timers, idx, pending_timer_cmp); + heap_sift(&pending_timers, idx, pending_timer_cmp); + } else { + if (heap_full(&pending_timers)) { + pending_timers.size *= 2; + pending_timers.data = + realloc(pending_timers.data, + pending_timers.size * + sizeof(struct pending_timer)); + + BUG_ON(!pending_timers.data); + } + + heap_add(&pending_timers, + ((struct pending_timer) { + .timer = timer, + .expires = expires, + }), + pending_timer_cmp); + } + + pthread_cond_signal(&timer_cond); +out: + pthread_mutex_unlock(&timer_lock); + + return idx >= 0; +} + +static bool timer_thread_stop = false; + +static int timer_thread(void *arg) +{ + struct pending_timer *p; + struct timespec ts; + unsigned long now; + int ret; + + pthread_mutex_lock(&timer_lock); + + while (!timer_thread_stop) { + now = jiffies; + p = heap_peek(&pending_timers); + + if (!p) { + pthread_cond_wait(&timer_cond, &timer_lock); + continue; + } + + if (time_after_eq(now, p->expires)) { + struct timer_list *timer = p->timer; + + heap_del(&pending_timers, 0, pending_timer_cmp); + BUG_ON(!timer_pending(timer)); + timer->pending = false; + + timer_seq++; + BUG_ON(!timer_running()); + + 
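A usage sketch mirroring what schedule_timeout() does earlier in this series: arm a one-shot timer against the pending-timer heap above, then cancel it. timer_setup_on_stack() comes from the timer header; expiry is expressed in jiffies.

static void demo_timer_fn(struct timer_list *t)
{
	/* runs on the "timers" pthread created by timers_init() below */
}

static void demo_timer(void)
{
	struct timer_list t;

	timer_setup_on_stack(&t, demo_timer_fn, 0);
	mod_timer(&t, jiffies + HZ);	/* fire roughly one second from now */

	/* ... */

	del_timer_sync(&t);		/* cancel, waiting out a concurrently-running callback */
}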
pthread_mutex_unlock(&timer_lock); + timer->function(timer); + pthread_mutex_lock(&timer_lock); + + timer_seq++; + pthread_cond_broadcast(&timer_running_cond); + continue; + } + + + ret = clock_gettime(CLOCK_REALTIME, &ts); + BUG_ON(ret); + + ts = timespec_add_ns(ts, jiffies_to_nsecs(p->expires - now)); + + pthread_cond_timedwait(&timer_cond, &timer_lock, &ts); + } + + pthread_mutex_unlock(&timer_lock); + + return 0; +} + +struct task_struct *timer_task; + +__attribute__((constructor(103))) +static void timers_init(void) +{ + heap_init(&pending_timers, 64); + BUG_ON(!pending_timers.data); + + timer_task = kthread_run(timer_thread, NULL, "timers"); + BUG_ON(IS_ERR(timer_task)); +} + +__attribute__((destructor(103))) +static void timers_cleanup(void) +{ + get_task_struct(timer_task); + + pthread_mutex_lock(&timer_lock); + timer_thread_stop = true; + pthread_cond_signal(&timer_cond); + pthread_mutex_unlock(&timer_lock); + + int ret = kthread_stop(timer_task); + BUG_ON(ret); + + put_task_struct(timer_task); + timer_task = NULL; +} diff --git a/linux/wait.c b/linux/wait.c new file mode 100644 index 00000000..b1f002b9 --- /dev/null +++ b/linux/wait.c @@ -0,0 +1,250 @@ +/* + * Generic waiting primitives. + * + * (C) 2004 Nadia Yvette Chambers, Oracle + */ + +#include <linux/completion.h> +#include <linux/sched.h> +#include <linux/wait.h> + +static inline int waitqueue_active(wait_queue_head_t *q) +{ + return !list_empty(&q->task_list); +} + +static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) +{ + list_add(&new->task_list, &head->task_list); +} + +static inline void __add_wait_queue_tail(wait_queue_head_t *head, + wait_queue_t *new) +{ + list_add_tail(&new->task_list, &head->task_list); +} + +static inline void +__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +{ + wait->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(q, wait); +} + +static inline void +__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) +{ + list_del(&old->task_list); +} + +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + wait_queue_t *curr, *next; + + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + unsigned flags = curr->flags; + + if (curr->func(curr, mode, wake_flags, key) && + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + } +} + +static void __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} + +void wake_up(wait_queue_head_t *q) +{ + __wake_up(q, TASK_NORMAL, 1, NULL); +} + +void wake_up_all(wait_queue_head_t *q) +{ + __wake_up(q, TASK_NORMAL, 0, NULL); +} + +static void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +{ + __wake_up_common(q, mode, nr, 0, NULL); +} + +void +prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} + +static void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + 
set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} + +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area. + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). + */ + if (!list_empty_careful(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} + +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, + void *key) +{ + return wake_up_process(curr->private); +} + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} + +struct wait_bit_key { + void *flags; + int bit_nr; + unsigned long timeout; +}; + +struct wait_bit_queue { + struct wait_bit_key key; + wait_queue_t wait; +}; + +static int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit = + container_of(wait, struct wait_bit_queue, wait); + + return (wait_bit->key.flags == key->flags && + wait_bit->key.bit_nr == key->bit_nr && + !test_bit(key->bit_nr, key->flags)) + ? autoremove_wake_function(wait, mode, sync, key) : 0; +} + +static DECLARE_WAIT_QUEUE_HEAD(bit_wq); + +#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ + { .flags = word, .bit_nr = bit, } + +#define DEFINE_WAIT_BIT(name, word, bit) \ + struct wait_bit_queue name = { \ + .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ + .wait = { \ + .private = current, \ + .func = wake_bit_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wait.task_list), \ + }, \ + } + +void wake_up_bit(void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + + if (waitqueue_active(&bit_wq)) + __wake_up(&bit_wq, TASK_NORMAL, 1, &key); +} + +void __wait_on_bit(void *word, int bit, unsigned mode) +{ + DEFINE_WAIT_BIT(wait, word, bit); + + do { + prepare_to_wait(&bit_wq, &wait.wait, mode); + if (test_bit(wait.key.bit_nr, wait.key.flags)) + schedule(); + } while (test_bit(wait.key.bit_nr, wait.key.flags)); + + finish_wait(&bit_wq, &wait.wait); +} + +void __wait_on_bit_lock(void *word, int bit, unsigned mode) +{ + DEFINE_WAIT_BIT(wait, word, bit); + + do { + prepare_to_wait_exclusive(&bit_wq, &wait.wait, mode); + if (!test_bit(wait.key.bit_nr, wait.key.flags)) + continue; + schedule(); + } while (test_and_set_bit(wait.key.bit_nr, wait.key.flags)); + finish_wait(&bit_wq, &wait.wait); +} + +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_locked(&x->wait, TASK_NORMAL, 1); + spin_unlock_irqrestore(&x->wait.lock, flags); +} + +void wait_for_completion(struct completion *x) +{ + spin_lock_irq(&x->wait.lock); + + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + __add_wait_queue_tail_exclusive(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + 
__remove_wait_queue(&x->wait, &wait); + if (!x->done) + goto out; + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); +} diff --git a/linux/workqueue.c b/linux/workqueue.c new file mode 100644 index 00000000..0d5af3fb --- /dev/null +++ b/linux/workqueue.c @@ -0,0 +1,346 @@ +#include <pthread.h> + +#include <linux/kthread.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +static pthread_mutex_t wq_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t work_finished = PTHREAD_COND_INITIALIZER; +static LIST_HEAD(wq_list); + +struct workqueue_struct { + struct list_head list; + + struct work_struct *current_work; + struct list_head pending_work; + + struct task_struct *worker; + char name[24]; +}; + +enum { + WORK_PENDING_BIT, +}; + +static bool work_pending(struct work_struct *work) +{ + return test_bit(WORK_PENDING_BIT, work_data_bits(work)); +} + +static void clear_work_pending(struct work_struct *work) +{ + clear_bit(WORK_PENDING_BIT, work_data_bits(work)); +} + +static bool set_work_pending(struct work_struct *work) +{ + return !test_and_set_bit(WORK_PENDING_BIT, work_data_bits(work)); +} + +static void __queue_work(struct workqueue_struct *wq, + struct work_struct *work) +{ + BUG_ON(!work_pending(work)); + BUG_ON(!list_empty(&work->entry)); + + list_add_tail(&work->entry, &wq->pending_work); + wake_up_process(wq->worker); +} + +bool queue_work(struct workqueue_struct *wq, struct work_struct *work) +{ + bool ret; + + pthread_mutex_lock(&wq_lock); + if ((ret = set_work_pending(work))) + __queue_work(wq, work); + pthread_mutex_unlock(&wq_lock); + + return ret; +} + +void delayed_work_timer_fn(struct timer_list *timer) +{ + struct delayed_work *dwork = + container_of(timer, struct delayed_work, timer); + + pthread_mutex_lock(&wq_lock); + __queue_work(dwork->wq, &dwork->work); + pthread_mutex_unlock(&wq_lock); +} + +static void __queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; + + BUG_ON(timer->function != delayed_work_timer_fn); + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + + if (!delay) { + __queue_work(wq, &dwork->work); + } else { + dwork->wq = wq; + timer->expires = jiffies + delay; + add_timer(timer); + } +} + +bool queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + struct work_struct *work = &dwork->work; + bool ret; + + pthread_mutex_lock(&wq_lock); + if ((ret = set_work_pending(work))) + __queue_delayed_work(wq, dwork, delay); + pthread_mutex_unlock(&wq_lock); + + return ret; +} + +static bool grab_pending(struct work_struct *work, bool is_dwork) +{ +retry: + if (set_work_pending(work)) { + BUG_ON(!list_empty(&work->entry)); + return false; + } + + if (is_dwork) { + struct delayed_work *dwork = to_delayed_work(work); + + if (likely(del_timer(&dwork->timer))) { + BUG_ON(!list_empty(&work->entry)); + return true; + } + } + + if (!list_empty(&work->entry)) { + list_del_init(&work->entry); + return true; + } + + BUG_ON(!is_dwork); + + pthread_mutex_unlock(&wq_lock); + flush_timers(); + pthread_mutex_lock(&wq_lock); + goto retry; +} + +static bool work_running(struct work_struct *work) +{ + struct workqueue_struct *wq; + + list_for_each_entry(wq, &wq_list, list) + if (wq->current_work == work) + return true; + + return false; +} + +bool flush_work(struct work_struct *work) +{ + bool ret = false; + + pthread_mutex_lock(&wq_lock); + while 
(work_pending(work) || work_running(work)) { + pthread_cond_wait(&work_finished, &wq_lock); + ret = true; + } + pthread_mutex_unlock(&wq_lock); + + return ret; +} + +static bool __flush_work(struct work_struct *work) +{ + bool ret = false; + + while (work_running(work)) { + pthread_cond_wait(&work_finished, &wq_lock); + ret = true; + } + + return ret; +} + +bool cancel_work_sync(struct work_struct *work) +{ + bool ret; + + pthread_mutex_lock(&wq_lock); + ret = grab_pending(work, false); + + __flush_work(work); + clear_work_pending(work); + pthread_mutex_unlock(&wq_lock); + + return ret; +} + +bool mod_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + struct work_struct *work = &dwork->work; + bool ret; + + pthread_mutex_lock(&wq_lock); + ret = grab_pending(work, true); + + __queue_delayed_work(wq, dwork, delay); + pthread_mutex_unlock(&wq_lock); + + return ret; +} + +bool cancel_delayed_work(struct delayed_work *dwork) +{ + struct work_struct *work = &dwork->work; + bool ret; + + pthread_mutex_lock(&wq_lock); + ret = grab_pending(work, true); + + clear_work_pending(&dwork->work); + pthread_mutex_unlock(&wq_lock); + + return ret; +} + +bool cancel_delayed_work_sync(struct delayed_work *dwork) +{ + struct work_struct *work = &dwork->work; + bool ret; + + pthread_mutex_lock(&wq_lock); + ret = grab_pending(work, true); + + __flush_work(work); + clear_work_pending(work); + pthread_mutex_unlock(&wq_lock); + + return ret; +} + +static int worker_thread(void *arg) +{ + struct workqueue_struct *wq = arg; + struct work_struct *work; + + pthread_mutex_lock(&wq_lock); + while (1) { + __set_current_state(TASK_INTERRUPTIBLE); + work = list_first_entry_or_null(&wq->pending_work, + struct work_struct, entry); + wq->current_work = work; + + if (kthread_should_stop()) { + BUG_ON(wq->current_work); + break; + } + + if (!work) { + pthread_mutex_unlock(&wq_lock); + schedule(); + pthread_mutex_lock(&wq_lock); + continue; + } + + BUG_ON(!work_pending(work)); + list_del_init(&work->entry); + clear_work_pending(work); + + pthread_mutex_unlock(&wq_lock); + work->func(work); + pthread_mutex_lock(&wq_lock); + + pthread_cond_broadcast(&work_finished); + } + pthread_mutex_unlock(&wq_lock); + + return 0; +} + +void destroy_workqueue(struct workqueue_struct *wq) +{ + kthread_stop(wq->worker); + + pthread_mutex_lock(&wq_lock); + list_del(&wq->list); + pthread_mutex_unlock(&wq_lock); + + kfree(wq); +} + +struct workqueue_struct *alloc_workqueue(const char *fmt, + unsigned flags, + int max_active, + ...) 
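+/*
+ * Userspace shim: @flags and @max_active are accepted for API
+ * compatibility (max_active also anchors the va_list) but are otherwise
+ * ignored; every workqueue is serviced by a single worker thread.
+ */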
+{ + va_list args; + struct workqueue_struct *wq; + + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return NULL; + + INIT_LIST_HEAD(&wq->list); + INIT_LIST_HEAD(&wq->pending_work); + + va_start(args, max_active); + vsnprintf(wq->name, sizeof(wq->name), fmt, args); + va_end(args); + + wq->worker = kthread_run(worker_thread, wq, "%s", wq->name); + if (IS_ERR(wq->worker)) { + kfree(wq); + return NULL; + } + + pthread_mutex_lock(&wq_lock); + list_add(&wq->list, &wq_list); + pthread_mutex_unlock(&wq_lock); + + return wq; +} + +struct workqueue_struct *system_wq; +struct workqueue_struct *system_highpri_wq; +struct workqueue_struct *system_long_wq; +struct workqueue_struct *system_unbound_wq; +struct workqueue_struct *system_freezable_wq; + +__attribute__((constructor(102))) +static void wq_init(void) +{ + system_wq = alloc_workqueue("events", 0, 0); + system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); + system_long_wq = alloc_workqueue("events_long", 0, 0); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + system_freezable_wq = alloc_workqueue("events_freezable", + WQ_FREEZABLE, 0); + BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || + !system_unbound_wq || !system_freezable_wq); +} + +__attribute__((destructor(102))) +static void wq_cleanup(void) +{ + destroy_workqueue(system_freezable_wq); + destroy_workqueue(system_unbound_wq); + destroy_workqueue(system_long_wq); + destroy_workqueue(system_highpri_wq); + destroy_workqueue(system_wq); + + system_wq = system_highpri_wq = system_long_wq = system_unbound_wq = + system_freezable_wq = NULL; +} diff --git a/linux/xxhash.c b/linux/xxhash.c new file mode 100644 index 00000000..d5bb9ff1 --- /dev/null +++ b/linux/xxhash.c @@ -0,0 +1,500 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. 
This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + * + * You can contact the author at: + * - xxHash homepage: https://cyan4973.github.io/xxHash/ + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +#include <asm/unaligned.h> +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/xxhash.h> + +/*-************************************* + * Macros + **************************************/ +#define xxh_rotl32(x, r) ((x << r) | (x >> (32 - r))) +#define xxh_rotl64(x, r) ((x << r) | (x >> (64 - r))) + +#ifdef __LITTLE_ENDIAN +# define XXH_CPU_LITTLE_ENDIAN 1 +#else +# define XXH_CPU_LITTLE_ENDIAN 0 +#endif + +/*-************************************* + * Constants + **************************************/ +static const uint32_t PRIME32_1 = 2654435761U; +static const uint32_t PRIME32_2 = 2246822519U; +static const uint32_t PRIME32_3 = 3266489917U; +static const uint32_t PRIME32_4 = 668265263U; +static const uint32_t PRIME32_5 = 374761393U; + +static const uint64_t PRIME64_1 = 11400714785074694791ULL; +static const uint64_t PRIME64_2 = 14029467366897019727ULL; +static const uint64_t PRIME64_3 = 1609587929392839161ULL; +static const uint64_t PRIME64_4 = 9650029242287828579ULL; +static const uint64_t PRIME64_5 = 2870177450012600261ULL; + +/*-************************** + * Utils + ***************************/ +void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src) +{ + memcpy(dst, src, sizeof(*dst)); +} +EXPORT_SYMBOL(xxh32_copy_state); + +void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src) +{ + memcpy(dst, src, sizeof(*dst)); +} +EXPORT_SYMBOL(xxh64_copy_state); + +/*-*************************** + * Simple Hash Functions + ****************************/ +static uint32_t xxh32_round(uint32_t seed, const uint32_t input) +{ + seed += input * PRIME32_2; + seed = xxh_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; +} + +uint32_t xxh32(const void *input, const size_t len, const uint32_t seed) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *b_end = p + len; + uint32_t h32; + + if (len >= 16) { + const uint8_t *const limit = b_end - 16; + uint32_t v1 = seed + PRIME32_1 + PRIME32_2; + uint32_t v2 = seed + PRIME32_2; + uint32_t v3 = seed + 0; + uint32_t v4 = seed - PRIME32_1; + + do { + v1 = xxh32_round(v1, get_unaligned_le32(p)); + p += 4; + v2 = xxh32_round(v2, get_unaligned_le32(p)); + p += 4; + v3 = xxh32_round(v3, get_unaligned_le32(p)); + p += 4; + v4 = xxh32_round(v4, get_unaligned_le32(p)); + p += 4; + } while (p <= limit); + + h32 = xxh_rotl32(v1, 1) + xxh_rotl32(v2, 7) + + xxh_rotl32(v3, 12) + xxh_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (uint32_t)len; + + while (p + 4 <= b_end) { + h32 += get_unaligned_le32(p) * PRIME32_3; + h32 = xxh_rotl32(h32, 17) * PRIME32_4; + p += 4; + } + + while (p < b_end) { + h32 += (*p) * PRIME32_5; + h32 = xxh_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} +EXPORT_SYMBOL(xxh32); + +static uint64_t xxh64_round(uint64_t acc, const uint64_t input) +{ + acc += input * PRIME64_2; + acc = xxh_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static uint64_t xxh64_merge_round(uint64_t acc, uint64_t val) +{ + val = xxh64_round(0, val); + acc ^= val; + acc = acc 
* PRIME64_1 + PRIME64_4; + return acc; +} + +uint64_t xxh64(const void *input, const size_t len, const uint64_t seed) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *const b_end = p + len; + uint64_t h64; + + if (len >= 32) { + const uint8_t *const limit = b_end - 32; + uint64_t v1 = seed + PRIME64_1 + PRIME64_2; + uint64_t v2 = seed + PRIME64_2; + uint64_t v3 = seed + 0; + uint64_t v4 = seed - PRIME64_1; + + do { + v1 = xxh64_round(v1, get_unaligned_le64(p)); + p += 8; + v2 = xxh64_round(v2, get_unaligned_le64(p)); + p += 8; + v3 = xxh64_round(v3, get_unaligned_le64(p)); + p += 8; + v4 = xxh64_round(v4, get_unaligned_le64(p)); + p += 8; + } while (p <= limit); + + h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); + h64 = xxh64_merge_round(h64, v1); + h64 = xxh64_merge_round(h64, v2); + h64 = xxh64_merge_round(h64, v3); + h64 = xxh64_merge_round(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (uint64_t)len; + + while (p + 8 <= b_end) { + const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); + + h64 ^= k1; + h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; + p += 8; + } + + if (p + 4 <= b_end) { + h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; + h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p += 4; + } + + while (p < b_end) { + h64 ^= (*p) * PRIME64_5; + h64 = xxh_rotl64(h64, 11) * PRIME64_1; + p++; + } + + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} +EXPORT_SYMBOL(xxh64); + +/*-************************************************** + * Advanced Hash Functions + ***************************************************/ +void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed) +{ + /* use a local state for memcpy() to avoid strict-aliasing warnings */ + struct xxh32_state state; + + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + memcpy(statePtr, &state, sizeof(state)); +} +EXPORT_SYMBOL(xxh32_reset); + +void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) +{ + /* use a local state for memcpy() to avoid strict-aliasing warnings */ + struct xxh64_state state; + + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + memcpy(statePtr, &state, sizeof(state)); +} +EXPORT_SYMBOL(xxh64_reset); + +int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *const b_end = p + len; + + if (input == NULL) + return -EINVAL; + + state->total_len_32 += (uint32_t)len; + state->large_len |= (len >= 16) | (state->total_len_32 >= 16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + memcpy((uint8_t *)(state->mem32) + state->memsize, input, len); + state->memsize += (uint32_t)len; + return 0; + } + + if (state->memsize) { /* some data left from previous update */ + const uint32_t *p32 = state->mem32; + + memcpy((uint8_t *)(state->mem32) + state->memsize, input, + 16 - state->memsize); + + state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32)); + p32++; + state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32)); + p32++; + state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32)); + p32++; + state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32)); + p32++; + + p += 16-state->memsize; + 
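+		/* the 16-byte staging buffer has been hashed; the rest of
+		 * the caller's input is consumed directly below */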
state->memsize = 0; + } + + if (p <= b_end - 16) { + const uint8_t *const limit = b_end - 16; + uint32_t v1 = state->v1; + uint32_t v2 = state->v2; + uint32_t v3 = state->v3; + uint32_t v4 = state->v4; + + do { + v1 = xxh32_round(v1, get_unaligned_le32(p)); + p += 4; + v2 = xxh32_round(v2, get_unaligned_le32(p)); + p += 4; + v3 = xxh32_round(v3, get_unaligned_le32(p)); + p += 4; + v4 = xxh32_round(v4, get_unaligned_le32(p)); + p += 4; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < b_end) { + memcpy(state->mem32, p, (size_t)(b_end-p)); + state->memsize = (uint32_t)(b_end-p); + } + + return 0; +} +EXPORT_SYMBOL(xxh32_update); + +uint32_t xxh32_digest(const struct xxh32_state *state) +{ + const uint8_t *p = (const uint8_t *)state->mem32; + const uint8_t *const b_end = (const uint8_t *)(state->mem32) + + state->memsize; + uint32_t h32; + + if (state->large_len) { + h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) + + xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + while (p + 4 <= b_end) { + h32 += get_unaligned_le32(p) * PRIME32_3; + h32 = xxh_rotl32(h32, 17) * PRIME32_4; + p += 4; + } + + while (p < b_end) { + h32 += (*p) * PRIME32_5; + h32 = xxh_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} +EXPORT_SYMBOL(xxh32_digest); + +int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *const b_end = p + len; + + if (input == NULL) + return -EINVAL; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + memcpy(((uint8_t *)state->mem64) + state->memsize, input, len); + state->memsize += (uint32_t)len; + return 0; + } + + if (state->memsize) { /* tmp buffer is full */ + uint64_t *p64 = state->mem64; + + memcpy(((uint8_t *)p64) + state->memsize, input, + 32 - state->memsize); + + state->v1 = xxh64_round(state->v1, get_unaligned_le64(p64)); + p64++; + state->v2 = xxh64_round(state->v2, get_unaligned_le64(p64)); + p64++; + state->v3 = xxh64_round(state->v3, get_unaligned_le64(p64)); + p64++; + state->v4 = xxh64_round(state->v4, get_unaligned_le64(p64)); + + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p + 32 <= b_end) { + const uint8_t *const limit = b_end - 32; + uint64_t v1 = state->v1; + uint64_t v2 = state->v2; + uint64_t v3 = state->v3; + uint64_t v4 = state->v4; + + do { + v1 = xxh64_round(v1, get_unaligned_le64(p)); + p += 8; + v2 = xxh64_round(v2, get_unaligned_le64(p)); + p += 8; + v3 = xxh64_round(v3, get_unaligned_le64(p)); + p += 8; + v4 = xxh64_round(v4, get_unaligned_le64(p)); + p += 8; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < b_end) { + memcpy(state->mem64, p, (size_t)(b_end-p)); + state->memsize = (uint32_t)(b_end - p); + } + + return 0; +} +EXPORT_SYMBOL(xxh64_update); + +uint64_t xxh64_digest(const struct xxh64_state *state) +{ + const uint8_t *p = (const uint8_t *)state->mem64; + const uint8_t *const b_end = (const uint8_t *)state->mem64 + + state->memsize; + uint64_t h64; + + if (state->total_len >= 32) { + const uint64_t v1 = state->v1; + const uint64_t v2 = state->v2; + const uint64_t v3 = state->v3; + const uint64_t v4 = state->v4; + + h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + + 
xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); + h64 = xxh64_merge_round(h64, v1); + h64 = xxh64_merge_round(h64, v2); + h64 = xxh64_merge_round(h64, v3); + h64 = xxh64_merge_round(h64, v4); + } else { + h64 = state->v3 + PRIME64_5; + } + + h64 += (uint64_t)state->total_len; + + while (p + 8 <= b_end) { + const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); + + h64 ^= k1; + h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; + p += 8; + } + + if (p + 4 <= b_end) { + h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; + h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p += 4; + } + + while (p < b_end) { + h64 ^= (*p) * PRIME64_5; + h64 = xxh_rotl64(h64, 11) * PRIME64_1; + p++; + } + + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} +EXPORT_SYMBOL(xxh64_digest); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("xxHash"); diff --git a/linux/zstd_compress_module.c b/linux/zstd_compress_module.c new file mode 100644 index 00000000..35cc5cba --- /dev/null +++ b/linux/zstd_compress_module.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/zstd.h> + +#define ZSTD_FORWARD_IF_ERR(ret) \ + do { \ + size_t const __ret = (ret); \ + if (ZSTD_isError(__ret)) \ + return __ret; \ + } while (0) + +static size_t zstd_cctx_init(zstd_cctx *cctx, const zstd_parameters *parameters, + unsigned long long pledged_src_size) +{ + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_reset( + cctx, ZSTD_reset_session_and_parameters)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setPledgedSrcSize( + cctx, pledged_src_size)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_windowLog, parameters->cParams.windowLog)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_hashLog, parameters->cParams.hashLog)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_chainLog, parameters->cParams.chainLog)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_searchLog, parameters->cParams.searchLog)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_minMatch, parameters->cParams.minMatch)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_targetLength, parameters->cParams.targetLength)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_strategy, parameters->cParams.strategy)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_contentSizeFlag, parameters->fParams.contentSizeFlag)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_checksumFlag, parameters->fParams.checksumFlag)); + ZSTD_FORWARD_IF_ERR(ZSTD_CCtx_setParameter( + cctx, ZSTD_c_dictIDFlag, !parameters->fParams.noDictIDFlag)); + return 0; +} + +int zstd_min_clevel(void) +{ + return ZSTD_minCLevel(); +} +EXPORT_SYMBOL(zstd_min_clevel); + +int zstd_max_clevel(void) +{ + return ZSTD_maxCLevel(); +} +EXPORT_SYMBOL(zstd_max_clevel); + +size_t zstd_compress_bound(size_t src_size) +{ + return ZSTD_compressBound(src_size); +} +EXPORT_SYMBOL(zstd_compress_bound); + +zstd_parameters zstd_get_params(int level, + unsigned long long estimated_src_size) +{ + 
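+	/* dictSize of 0: parameters tuned for dictionary-less compression */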
return ZSTD_getParams(level, estimated_src_size, 0); +} +EXPORT_SYMBOL(zstd_get_params); + +size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams) +{ + return ZSTD_estimateCCtxSize_usingCParams(*cparams); +} +EXPORT_SYMBOL(zstd_cctx_workspace_bound); + +zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size) +{ + if (workspace == NULL) + return NULL; + return ZSTD_initStaticCCtx(workspace, workspace_size); +} +EXPORT_SYMBOL(zstd_init_cctx); + +size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, + const void *src, size_t src_size, const zstd_parameters *parameters) +{ + ZSTD_FORWARD_IF_ERR(zstd_cctx_init(cctx, parameters, src_size)); + return ZSTD_compress2(cctx, dst, dst_capacity, src, src_size); +} +EXPORT_SYMBOL(zstd_compress_cctx); + +size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams) +{ + return ZSTD_estimateCStreamSize_usingCParams(*cparams); +} +EXPORT_SYMBOL(zstd_cstream_workspace_bound); + +zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters, + unsigned long long pledged_src_size, void *workspace, size_t workspace_size) +{ + zstd_cstream *cstream; + + if (workspace == NULL) + return NULL; + + cstream = ZSTD_initStaticCStream(workspace, workspace_size); + if (cstream == NULL) + return NULL; + + /* 0 means unknown in linux zstd API but means 0 in new zstd API */ + if (pledged_src_size == 0) + pledged_src_size = ZSTD_CONTENTSIZE_UNKNOWN; + + if (ZSTD_isError(zstd_cctx_init(cstream, parameters, pledged_src_size))) + return NULL; + + return cstream; +} +EXPORT_SYMBOL(zstd_init_cstream); + +size_t zstd_reset_cstream(zstd_cstream *cstream, + unsigned long long pledged_src_size) +{ + return ZSTD_resetCStream(cstream, pledged_src_size); +} +EXPORT_SYMBOL(zstd_reset_cstream); + +size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output, + zstd_in_buffer *input) +{ + return ZSTD_compressStream(cstream, output, input); +} +EXPORT_SYMBOL(zstd_compress_stream); + +size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output) +{ + return ZSTD_flushStream(cstream, output); +} +EXPORT_SYMBOL(zstd_flush_stream); + +size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output) +{ + return ZSTD_endStream(cstream, output); +} +EXPORT_SYMBOL(zstd_end_stream); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Zstd Compressor"); diff --git a/linux/zstd_decompress_module.c b/linux/zstd_decompress_module.c new file mode 100644 index 00000000..7e8cd446 --- /dev/null +++ b/linux/zstd_decompress_module.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/zstd.h> + +/* Common symbols. zstd_compress must depend on zstd_decompress. 
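+ * The error-inspection helpers below are defined only in this file, which
+ * is why the compress module reuses them.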
*/ + +unsigned int zstd_is_error(size_t code) +{ + return ZSTD_isError(code); +} +EXPORT_SYMBOL(zstd_is_error); + +zstd_error_code zstd_get_error_code(size_t code) +{ + return ZSTD_getErrorCode(code); +} +EXPORT_SYMBOL(zstd_get_error_code); + +const char *zstd_get_error_name(size_t code) +{ + return ZSTD_getErrorName(code); +} +EXPORT_SYMBOL(zstd_get_error_name); + +/* Decompression symbols. */ + +size_t zstd_dctx_workspace_bound(void) +{ + return ZSTD_estimateDCtxSize(); +} +EXPORT_SYMBOL(zstd_dctx_workspace_bound); + +zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size) +{ + if (workspace == NULL) + return NULL; + return ZSTD_initStaticDCtx(workspace, workspace_size); +} +EXPORT_SYMBOL(zstd_init_dctx); + +size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, + const void *src, size_t src_size) +{ + return ZSTD_decompressDCtx(dctx, dst, dst_capacity, src, src_size); +} +EXPORT_SYMBOL(zstd_decompress_dctx); + +size_t zstd_dstream_workspace_bound(size_t max_window_size) +{ + return ZSTD_estimateDStreamSize(max_window_size); +} +EXPORT_SYMBOL(zstd_dstream_workspace_bound); + +zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace, + size_t workspace_size) +{ + if (workspace == NULL) + return NULL; + (void)max_window_size; + return ZSTD_initStaticDStream(workspace, workspace_size); +} +EXPORT_SYMBOL(zstd_init_dstream); + +size_t zstd_reset_dstream(zstd_dstream *dstream) +{ + return ZSTD_resetDStream(dstream); +} +EXPORT_SYMBOL(zstd_reset_dstream); + +size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output, + zstd_in_buffer *input) +{ + return ZSTD_decompressStream(dstream, output, input); +} +EXPORT_SYMBOL(zstd_decompress_stream); + +size_t zstd_find_frame_compressed_size(const void *src, size_t src_size) +{ + return ZSTD_findFrameCompressedSize(src, src_size); +} +EXPORT_SYMBOL(zstd_find_frame_compressed_size); + +size_t zstd_get_frame_header(zstd_frame_header *header, const void *src, + size_t src_size) +{ + return ZSTD_getFrameHeader(header, src, src_size); +} +EXPORT_SYMBOL(zstd_get_frame_header); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Zstd Decompressor"); |
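A minimal usage sketch of the pthread-backed workqueue shim above, assuming this tree's <linux/workqueue.h> and <linux/kernel.h> provide the usual INIT_WORK() macro and container_of() helper; queue_work(), flush_work() and system_wq are the symbols defined in this patch:

#include <stdio.h>

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct greeter {
	struct work_struct	work;
	int			id;
};

static void greet(struct work_struct *work)
{
	struct greeter *g = container_of(work, struct greeter, work);

	printf("hello from the events worker, id %d\n", g->id);
}

int main(void)
{
	struct greeter g = { .id = 1 };

	INIT_WORK(&g.work, greet);	/* assumed to come from the shim header */

	queue_work(system_wq, &g.work);	/* sets WORK_PENDING_BIT, wakes the worker */
	flush_work(&g.work);		/* sleeps on work_finished until greet() has run */

	return 0;
}

flush_work() returns only once the item is neither pending nor the worker's current_work, so by the time main() returns the handler has executed exactly once.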