logfsprogs/0000755000175000017500000000000012142655300011411 5ustar lukluklogfsprogs/.gitignore0000644000175000017500000000002412142655114013400 0ustar lukluk*.o mklogfs logfsck logfsprogs/lib.c0000644000175000017500000000077312142655114012335 0ustar lukluk#include "kerncompat.h" #include "logfs.h" /* * zlib crc32 differs from the kernel variant. zlib negated both the initial * value and the result bitwise. So for the kernel ~0 is a correct initial * value, for zlib 0 is. * Better check for such funnies instead of generating bad images. */ void check_crc32(void) { u32 c=0; if (logfs_crc32(&c, 4, 0) != cpu_to_be32(0xdebb20e3)) fail("crc32 returns bad results"); } void fail(const char *s) { printf("mklogfs: %s\n", s); exit(EXIT_FAILURE); } logfsprogs/btree.c0000644000175000017500000004560212142655114012670 0ustar lukluk/* * lib/btree.c - Simple In-memory B+Tree * * License: GPLv2 * * Copyright (c) 2007-2008 Joern Engel * * A relatively simple B+Tree implementation. I have written it as a learning * excercise to understand how B+Trees work. Turned out to be useful as well. * * B+Trees can be used similar to Linux radix trees (which don't have anything * in common with textbook radix trees, beware). Prerequisite for them working * well is that access to a random tree node is much faster than a large number * of operations within each node. * * Disks have fulfilled the prerequisite for a long time. More recently DRAM * has gained similar properties, as memory access times, when measured in cpu * cycles, have increased. Cacheline sizes have increased as well, which also * helps B+Trees. * * Compared to radix trees, B+Trees are more efficient when dealing with a * sparsely populated address space. Between 25% and 50% of the memory is * occupied with valid pointers. When densely populated, radix trees contain * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2% * pointers. * * This particular implementation stores pointers identified by a long value. 
* Storing NULL pointers is illegal, lookup will return NULL when no entry * was found. * * One trick was used that are not commonly found in textbooks. The lowest * values are to the right, not to the left. All used slots within a node * are on the left, all unused slots contain NUL values. Most operations * simply loop once over all slots and terminate on the first NUL. */ #include #include "btree.h" /* * Depending on the ratio of lookups vs. insert and removes, it may be * beneficial to spend more work trying to keep the tree as compact as * possible. With roughly 50 lookups for every insert/remove, stealing * from neighbours becomes more effective. If that is the case, please * define AGGRESSIVE_COMPACTION below */ // #define AGGRESSIVE_COMPACTION #ifndef L1_CACHE_BYTES #define L1_CACHE_BYTES 128 #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define NODESIZE MAX(L1_CACHE_BYTES, 128) struct btree_geo btree_geo32 = { .keylen = 1, .no_pairs = NODESIZE / sizeof(long) / 2, }; #define LONG_PER_U64 (64 / BITS_PER_LONG) struct btree_geo btree_geo64 = { .keylen = LONG_PER_U64, .no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64), }; struct btree_geo btree_geo128 = { .keylen = 2 * LONG_PER_U64, .no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64), }; static unsigned long *btree_node_alloc(struct btree_head *head) { return calloc(1, NODESIZE); } static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n) { size_t i; for (i = 0; i < n; i++) { if (l1[i] < l2[i]) return -1; if (l1[i] > l2[i]) return 1; } return 0; } static unsigned long *longcpy(unsigned long *dest, const unsigned long *src, size_t n) { size_t i; for (i = 0; i < n; i++) dest[i] = src[i]; return dest; } static unsigned long *longset(unsigned long *s, unsigned long c, size_t n) { size_t i; for (i = 0; i < n; i++) s[i] = c; return s; } /* * B+Tree node format: * [key0, key1, ..., keyN] [val0, val1, ..., valN] * Each key is an array of unsigned longs, head->keylen in total. 
* Total number of keys and vals (N) is head->no_pairs. */ static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n) { return &node[n * geo->keylen]; } static unsigned long bval(struct btree_geo *geo, unsigned long *node, int n) { return node[geo->no_pairs * geo->keylen + n]; } static void setkey(struct btree_geo *geo, unsigned long *node, unsigned long *key, int n) { longcpy(bkey(geo, node, n), key, geo->keylen); } static void setval(struct btree_geo *geo, unsigned long *node, unsigned long val, int n) { node[geo->no_pairs * geo->keylen + n] = val; } static void clearpair(struct btree_geo *geo, unsigned long *node, int n) { longset(bkey(geo, node, n), 0, geo->keylen); node[geo->no_pairs * geo->keylen + n] = 0; } #if 0 static void dumpkey(struct btree_geo *geo, unsigned long *key) { int k; printf("(%lx", key[0]); for (k = 1; k < geo->keylen; k++) printf(",%lx", key[k]); printf(")"); } static void dumpnode(struct btree_geo *geo, unsigned long *node) { int i; unsigned long *key; printf("%p: ", node); for (i = 0; i < geo->no_pairs; i++) { key = bkey(geo, node, i); dumpkey(geo, key); printf(" %lx ", bval(geo, node, i)); } printf("\n"); } static void __dumptree(struct btree_head *head, struct btree_geo *geo, unsigned long *node, int height) { int i; unsigned long *child; if (!height) return; printf("%2x ", height); dumpnode(geo, node); for (i = 0; i < geo->no_pairs; i++) { child = (void *)bval(geo, node, i); if (!child) return; __dumptree(head, geo, child, height - 1); } } static void dumptree(struct btree_head *head, struct btree_geo *geo) { __dumptree(head, geo, head->node, head->height); } #endif static inline void __btree_init(struct btree_head *head) { head->node = NULL; head->height = 0; } void btree_init(struct btree_head *head) { __btree_init(head); } unsigned long *btree_last(struct btree_head *head, struct btree_geo *geo) { int height = head->height; unsigned long *node = head->node; if (height == 0) return NULL; for ( ; height > 1; 
height--) node = (unsigned long *)bval(geo, node, 0); return bkey(geo, node, 0); } static int keycmp(struct btree_geo *geo, unsigned long *node, int pos, unsigned long *key) { return longcmp(bkey(geo, node, pos), key, geo->keylen); } void *btree_lookup(struct btree_head *head, struct btree_geo *geo, unsigned long *key) { int i, height = head->height; unsigned long *node = head->node; if (height == 0) return NULL; for ( ; height > 1; height--) { for (i = 0; i < geo->no_pairs; i++) if (keycmp(geo, node, i, key) <= 0) break; if (i == geo->no_pairs) return NULL; node = (unsigned long *)bval(geo, node, i); if (!node) return NULL; } if (!node) return NULL; for (i = 0; i < geo->no_pairs; i++) if (keycmp(geo, node, i, key) == 0) return (void *)bval(geo, node, i); return NULL; } static int getpos(struct btree_geo *geo, unsigned long *node, unsigned long *key) { int i; for (i = 0; i < geo->no_pairs; i++) { if (keycmp(geo, node, i, key) <= 0) break; } return i; } static int getfill(struct btree_geo *geo, unsigned long *node, int start) { int i; for (i = start; i < geo->no_pairs; i++) if (bval(geo, node, i) == 0) break; return i; } /* * locate the correct leaf node in the btree */ static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo, unsigned long *key, int level) { unsigned long *node = head->node; int i, height; for (height = head->height; height > level; height--) { for (i = 0; i < geo->no_pairs; i++) if (keycmp(geo, node, i, key) <= 0) break; if ((i == geo->no_pairs) || !bval(geo, node, i)) { /* right-most key is too large, update it */ /* FIXME: If the right-most key on higher levels is * always zero, this wouldn't be necessary. 
*/ i--; setkey(geo, node, key, i); } BUG_ON(i < 0); node = (unsigned long *)bval(geo, node, i); } BUG_ON(!node); return node; } static int btree_grow(struct btree_head *head, struct btree_geo *geo) { unsigned long *node; int fill; node = btree_node_alloc(head); if (!node) return -ENOMEM; if (head->node) { fill = getfill(geo, head->node, 0); setkey(geo, node, bkey(geo, head->node, fill - 1), 0); setval(geo, node, (unsigned long)head->node, 0); } head->node = node; head->height++; return 0; } static void btree_shrink(struct btree_head *head, struct btree_geo *geo, int fill) { unsigned long *node; if ((fill == 0) || ((fill == 1) && (head->height > 1))) { node = head->node; head->node = (unsigned long *)bval(geo, node, 0); head->height--; free(node); } } static void steal_l(struct btree_head *head, struct btree_geo *geo, int level, unsigned long *left, int lfill, unsigned long *right, int rfill, unsigned long *parent, int lpos, int no_entries) { int i; for (i = rfill - 1; i >= 0; i--) { /* Shift entries on the right */ setkey(geo, right, bkey(geo, right, i), i + no_entries); setval(geo, right, bval(geo, right, i), i + no_entries); } for (i = 0; i < no_entries; i++) { /* Move some entries to the right */ setkey(geo, right, bkey(geo, left, lfill - no_entries + i), i); setval(geo, right, bval(geo, left, lfill - no_entries + i), i); } /* Set parent key */ setkey(geo, parent, bkey(geo, left, lfill - no_entries - 1), lpos); for (i = lfill - no_entries; i < lfill; i++) clearpair(geo, left, i); } static void steal_r(struct btree_head *head, struct btree_geo *geo, int level, unsigned long *left, int lfill, unsigned long *right, int rfill, unsigned long *parent, int lpos, int no_entries) { int i; for (i = 0; i < no_entries; i++) { /* Move some entries to the left */ setkey(geo, left, bkey(geo, right, i), lfill + i); setval(geo, left, bval(geo, right, i), lfill + i); } /* Set parent key */ setkey(geo, parent, bkey(geo, right, no_entries - 1), lpos); /* Shift entries on the right 
*/ for ( ; i < rfill; i++) { setkey(geo, right, bkey(geo, right, i), i - no_entries); setval(geo, right, bval(geo, right, i), i - no_entries); } for (i = rfill - no_entries; i < rfill; i++) clearpair(geo, right, i); } static int btree_insert_level(struct btree_head *head, struct btree_geo *geo, unsigned long *key, unsigned long val, int level); static int split(struct btree_head *head, struct btree_geo *geo, unsigned long *node, int level) { unsigned long *new; int i, err, fill = geo->no_pairs; new = btree_node_alloc(head); if (!new) return -ENOMEM; err = btree_insert_level(head, geo, bkey(geo, node, fill / 2 - 1), (unsigned long)new, level + 1); if (err) { free(new); return err; } for (i = 0; i < fill / 2; i++) { setkey(geo, new, bkey(geo, node, i), i); setval(geo, new, bval(geo, node, i), i); setkey(geo, node, bkey(geo, node, i + fill / 2), i); setval(geo, node, bval(geo, node, i + fill / 2), i); clearpair(geo, node, i + fill / 2); } if (fill & 1) { setkey(geo, node, bkey(geo, node, fill - 1), i); setval(geo, node, bval(geo, node, fill - 1), i); clearpair(geo, node, fill - 1); } return 0; } static int rebalance_insert(struct btree_head *head, struct btree_geo *geo, unsigned long *key, unsigned long *child, int level) { #ifdef AGGRESSIVE_COMPACTION unsigned long *parent, *left, *right; int child_no, no_left, no_right, delta; if (level == head->height) goto split; parent = find_level(head, geo, key, level + 1); child_no = getpos(geo, parent, key); BUG_ON(bval(geo, parent, child_no) != (unsigned long)child); if (child_no > 0) { left = (unsigned long *)bval(geo, parent, child_no - 1); no_left = getfill(geo, left, 0); delta = geo->no_pairs - no_left; if (delta >= 2) { steal_r(head, geo, level, left, no_left, child, geo->no_pairs, parent, child_no - 1, delta / 2); return 0; } } if (child_no + 1 < getfill(geo, parent, child_no)) { right = (unsigned long *)bval(geo, parent, child_no + 1); no_right = getfill(geo, right, 0); delta = geo->no_pairs - no_right; if (delta >= 
2) { steal_l(head, geo, level, child, geo->no_pairs, right, no_right, parent, child_no, delta / 2); return 0; } } split: #endif return split(head, geo, child, level); } static int btree_insert_level(struct btree_head *head, struct btree_geo *geo, unsigned long *key, unsigned long val, int level) { unsigned long *node; int i, pos, fill, err; BUG_ON(!val); if (head->height < level) { err = btree_grow(head, geo); if (err) return err; } retry: node = find_level(head, geo, key, level); pos = getpos(geo, node, key); fill = getfill(geo, node, pos); /* two identical keys are not allowed */ BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0); if (fill == geo->no_pairs) { /* need to split node */ err = rebalance_insert(head, geo, key, node, level); if (err) return err; goto retry; } BUG_ON(fill >= geo->no_pairs); /* shift and insert */ for (i = fill; i > pos; i--) { setkey(geo, node, bkey(geo, node, i - 1), i); setval(geo, node, bval(geo, node, i - 1), i); } setkey(geo, node, key, pos); setval(geo, node, val, pos); return 0; } int btree_insert(struct btree_head *head, struct btree_geo *geo, unsigned long *key, void *val) { return btree_insert_level(head, geo, key, (unsigned long)val, 1); } static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo, unsigned long *key, int level); static void merge(struct btree_head *head, struct btree_geo *geo, int level, unsigned long *left, int lfill, unsigned long *right, int rfill, unsigned long *parent, int lpos) { int i; for (i = 0; i < rfill; i++) { /* Move all entries to the left */ setkey(geo, left, bkey(geo, right, i), lfill + i); setval(geo, left, bval(geo, right, i), lfill + i); } /* Exchange left and right child in parent */ setval(geo, parent, (unsigned long)right, lpos); setval(geo, parent, (unsigned long)left, lpos + 1); /* Remove left (formerly right) child from parent */ btree_remove_level(head, geo, bkey(geo, parent, lpos), level + 1); free(right); } static void rebalance(struct btree_head *head, 
struct btree_geo *geo, unsigned long *key, int level, unsigned long *child, int fill) { unsigned long *parent, *left = NULL, *right = NULL; int child_no, no_left, no_right, i; parent = find_level(head, geo, key, level + 1); child_no = getpos(geo, parent, key); BUG_ON(bval(geo, parent, child_no) != (unsigned long)child); if (child_no > 0) { left = (unsigned long *)bval(geo, parent, child_no - 1); no_left = getfill(geo, left, 0); if (fill + no_left <= geo->no_pairs) { /* Merge with left neighbour */ merge(head, geo, level, left, no_left, child, fill, parent, child_no - 1); return; } } if (child_no + 1 < getfill(geo, parent, child_no)) { right = (unsigned long *)bval(geo, parent, child_no + 1); no_right = getfill(geo, right, 0); if (fill + no_right <= geo->no_pairs) { /* Merge with right neighbour */ merge(head, geo, level, child, fill, right, no_right, parent, child_no); return; } } /* * Leaving the btree in a somewhat unbalanced state can improve * performance. Stealing entries from a neighbour is a fairly * expensive operation. In trees where reads completely dominate * writes, the cost will be amortized sooner or later. When the * ratio of writes increases, they may never be amortized. * * So avoid stealing unless the tree would get _really_ unbalanced. 
*/ if (fill > 1) return; if (left) { /* Steal from left neighbour */ i = (no_left - fill) / 2; BUG_ON(i < 1); steal_l(head, geo, level, left, no_left, child, fill, parent, child_no - 1, i); return; } if (right) { /* Steal from right neighbour */ i = (no_right - fill) / 2; BUG_ON(i < 1); steal_r(head, geo, level, child, fill, right, no_right, parent, child_no, i); return; } BUG(); /* We should never get here */ } static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo, unsigned long *key, int level) { unsigned long *node; int i, pos, fill; void *ret; if (level > head->height) { /* we recursed all the way up */ head->height = 0; head->node = NULL; return NULL; } node = find_level(head, geo, key, level); pos = getpos(geo, node, key); fill = getfill(geo, node, pos); if ((level == 1) && (keycmp(geo, node, pos, key) != 0)) return NULL; ret = (void *)bval(geo, node, pos); /* remove and shift */ for (i = pos; i < fill - 1; i++) { setkey(geo, node, bkey(geo, node, i + 1), i); setval(geo, node, bval(geo, node, i + 1), i); } clearpair(geo, node, fill - 1); if (fill - 1 < geo->no_pairs / 2) { if (level < head->height) rebalance(head, geo, key, level, node, fill - 1); else btree_shrink(head, geo, fill - 1); } return ret; } void *btree_remove(struct btree_head *head, struct btree_geo *geo, unsigned long *key) { if (head->height == 0) return NULL; return btree_remove_level(head, geo, key, 1); } int btree_merge(struct btree_head *target, struct btree_head *victim, struct btree_geo *geo, unsigned long *duplicate) { unsigned long *key; void *val; int err; BUG_ON(target == victim); if (!(target->node)) { /* target is empty, just copy fields over */ target->node = victim->node; target->height = victim->height; __btree_init(victim); return 0; } for (;;) { key = btree_last(victim, geo); if (!key) break; val = btree_lookup(victim, geo, key); err = btree_insert(target, geo, key, val); if (err) return err; /* We must make a copy of the key, as the original will get * 
mangled inside btree_remove. */ longcpy(duplicate, key, geo->keylen); btree_remove(victim, geo, duplicate); } return 0; } static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo, unsigned long *node, long opaque, void (*func)(void *elem, long opaque, unsigned long *key, size_t index, void *func2), void *func2, int reap, int height, size_t count) { int i; unsigned long *child; for (i = 0; i < geo->no_pairs; i++) { child = (void *)bval(geo, node, i); if (!child) break; if (height > 1) count = __btree_for_each(head, geo, child, opaque, func, func2, reap, height - 1, count); else func(child, opaque, bkey(geo, node, i), count++, func2); } if (reap) free(node); return count; } static void empty(void *elem, long opaque, unsigned long *key, size_t index, void *func2) { } void visitorl(void *elem, long opaque, unsigned long *key, size_t index, void *__func) { visitorl_t func = __func; func(elem, opaque, *key, index); } void visitor32(void *elem, long opaque, unsigned long *__key, size_t index, void *__func) { visitor32_t func = __func; u32 *key = (void *)__key; func(elem, opaque, *key, index); } void visitor64(void *elem, long opaque, unsigned long *__key, size_t index, void *__func) { visitor64_t func = __func; u64 *key = (void *)__key; func(elem, opaque, *key, index); } void visitor128(void *elem, long opaque, unsigned long *__key, size_t index, void *__func) { visitor128_t func = __func; u64 *key = (void *)__key; func(elem, opaque, key[0], key[1], index); } size_t btree_visitor(struct btree_head *head, struct btree_geo *geo, long opaque, void (*func)(void *elem, long opaque, unsigned long *key, size_t index, void *func2), void *func2) { size_t count = 0; if (!func2) func = empty; if (head->node) count = __btree_for_each(head, geo, head->node, opaque, func, func2, 0, head->height, 0); return count; } size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo, long opaque, void (*func)(void *elem, long opaque, unsigned long *key, size_t 
index, void *func2), void *func2) { size_t count = 0; if (!func2) func = empty; if (head->node) count = __btree_for_each(head, geo, head->node, opaque, func, func2, 1, head->height, 0); __btree_init(head); return count; } logfsprogs/mkfs.c0000644000175000017500000004341212142655114012524 0ustar lukluk/* * LogFS mkfs * * Copyright (c) 2007-2008 Joern Engel * * License: GPL version 2 */ #define _LARGEFILE64_SOURCE #define __USE_FILE_OFFSET64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define __USE_UNIX98 #include #include #include "kerncompat.h" #include #include "logfs_abi.h" #include "logfs.h" enum { OFS_SB = 0, OFS_JOURNAL = 1, OFS_ROOTDIR = 3, OFS_COUNT }; static unsigned user_segshift = -1; static unsigned user_blockshift = -1; static unsigned user_writeshift = -1; static u8 segshift = 18; static u8 blockshift = 12; static u8 writeshift = 0; static u32 no_journal_segs = 4; static u32 bad_seg_reserve = 4; /* journal entries */ static __be64 je_array[64]; static int no_je; /* commandline options */ static int compress_rootdir; static int quick_bad_block_scan; static int interactice_mode = 1; //////////////////////////////////////////////////////////////////////////////// static int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) { struct z_stream_s stream; int err, ret; ret = -EIO; memset(&stream, 0, sizeof(stream)); err = deflateInit(&stream, 3); if (err != Z_OK) goto error; stream.next_in = in; stream.avail_in = inlen; stream.total_in = 0; stream.next_out = out; stream.avail_out = outlen; stream.total_out = 0; err = deflate(&stream, Z_FINISH); if (err != Z_STREAM_END) goto error; err = deflateEnd(&stream); if (err != Z_OK) goto error; if (stream.total_out >= stream.total_in) goto error; ret = stream.total_out; error: return ret; } //////////////////////////////////////////////////////////////////////////////// static int mtd_erase(struct super_block *sb, u64 ofs, 
size_t size) { if (ofs >= 0x100000000ull) { struct erase_info_user64 ei; ei.start = ofs; ei.length = size; return ioctl(sb->fd, MEMERASE64, &ei); } else { struct erase_info_user ei; ei.start = ofs; ei.length = size; return ioctl(sb->fd, MEMERASE, &ei); } } static int mtd_prepare_sb(struct super_block *sb) { u32 segno; int err; /* 1st superblock at the beginning */ segno = get_segment(sb); sb->segment_entry[segno].ec_level = ec_level(1, 0); sb->segment_entry[segno].valid = cpu_to_be32(RESERVED); sb->sb_ofs1 = (u64)segno * sb->segsize; /* 2nd superblock at the end */ for (segno = sb->no_segs - 1; segno > sb->no_segs - 64; segno--) { err = mtd_erase(sb, (u64)segno * sb->segsize, sb->segsize); if (err) continue; sb->segment_entry[segno].ec_level = ec_level(1, 0); sb->segment_entry[segno].valid = cpu_to_be32(RESERVED); sb->sb_ofs2 = (u64)(segno + 1) * sb->segsize - 0x1000; break; } if (segno == sb->no_segs - 64 || sb->sb_ofs2 <= sb->sb_ofs1) return -EIO; return 0; } static int bdev_prepare_sb(struct super_block *sb) { u32 segno; /* 1st superblock at the beginning */ segno = get_segment(sb); sb->segment_entry[segno].ec_level = ec_level(1, 0); sb->segment_entry[segno].valid = cpu_to_be32(RESERVED); sb->sb_ofs1 = (u64)segno * sb->segsize; /* 2nd superblock at the end */ segno = sb->no_segs - 1; sb->segment_entry[segno].ec_level = ec_level(1, 0); sb->segment_entry[segno].valid = cpu_to_be32(RESERVED); sb->sb_ofs2 = (u64)(segno) * sb->segsize - 0x1000; return 0; } int safe_pwrite(int fd, char *buf, size_t size, u64 ofs) { ssize_t ret; size_t remaining; remaining = size; while (remaining > 0) { ret = pwrite(fd, buf, remaining, ofs); if (ret < 0) { if (errno == EINTR) continue; fprintf(stderr, "write failed: %s\n", strerror(errno)); return ret; } remaining -= ret; ofs += ret; buf += ret; } return 0; } static int bdev_write(struct super_block *sb, u64 ofs, size_t size, void *buf) { ssize_t ret; ret = safe_pwrite(sb->fd, buf, size, ofs); if (ret < 0) return -EIO; return 0; } 
static int bdev_erase(struct super_block *sb, u64 ofs, size_t size) { if (!sb->erase_buf) { sb->erase_buf = malloc(sb->segsize); if (!sb->erase_buf) fail("out of memory"); memset(sb->erase_buf, 0xff, sb->segsize); } return bdev_write(sb, ofs, size, sb->erase_buf); } static const struct logfs_device_operations mtd_ops = { .prepare_sb = mtd_prepare_sb, .write = bdev_write, .erase = mtd_erase, }; static const struct logfs_device_operations bdev_ops = { .prepare_sb = bdev_prepare_sb, .write = bdev_write, .erase = bdev_erase, }; //////////////////////////////////////////////////////////////////////////////// /* root inode */ static void set_segment_header(struct logfs_segment_header *sh, u8 type, u8 level, u32 segno) { sh->pad = 0; sh->type = type; sh->level = level; sh->segno = cpu_to_be32(segno); sh->ec = 0; sh->gec = cpu_to_be64(segno); sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); } static int write_inode(struct super_block *sb, u64 ino) { struct inode *inode; inode = find_or_create_inode(sb, ino); if (!inode) return -ENOMEM; return logfs_file_write(sb, LOGFS_INO_MASTER, ino, 0, OBJ_INODE, &inode->di); } static int write_segment_file(struct super_block *sb) { struct inode *inode; struct logfs_disk_inode *di; void *buf; int err; u64 ofs; buf = zalloc(sb->blocksize); if (!buf) return -ENOMEM; inode = find_or_create_inode(sb, LOGFS_INO_SEGFILE); if (!inode) return -ENOMEM; di = &inode->di; di->di_flags = 0; di->di_mode = cpu_to_be16(S_IFREG | 0); di->di_refcount = cpu_to_be32(1); di->di_size = cpu_to_be64(sb->no_segs * 8ull); for (ofs = 0; ofs * sb->blocksize < (u64)sb->no_segs * 8; ofs++) { err = logfs_file_write(sb, LOGFS_INO_SEGFILE, ofs, 0, OBJ_BLOCK, buf); if (err) return err; } err = logfs_file_flush(sb, LOGFS_INO_SEGFILE); if (err) return err; return write_inode(sb, LOGFS_INO_SEGFILE); } static int write_rootdir(struct super_block *sb) { struct inode *inode; struct logfs_disk_inode *di; inode = find_or_create_inode(sb, LOGFS_INO_ROOT); if (!inode) 
return -ENOMEM; di = &inode->di; di->di_flags = 0; if (compress_rootdir) di->di_flags |= cpu_to_be32(LOGFS_IF_COMPRESSED); di->di_mode = cpu_to_be16(S_IFDIR | 0755); di->di_refcount = cpu_to_be32(1); return write_inode(sb, LOGFS_INO_ROOT); } /* journal */ static size_t __write_header(struct logfs_journal_header *jh, size_t len, size_t datalen, u16 type, u8 compr) { jh->h_len = cpu_to_be16(len); jh->h_type = cpu_to_be16(type); jh->h_datalen = cpu_to_be16(datalen); jh->h_compr = compr; jh->h_pad[0] = 'h'; jh->h_pad[1] = 'e'; jh->h_pad[2] = 'a'; jh->h_pad[3] = 'd'; jh->h_pad[4] = 'r'; jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); return ALIGN(len, 16) + sizeof(*jh); } static size_t write_header(struct logfs_journal_header *h, size_t datalen, u16 type) { return __write_header(h, datalen, datalen, type, COMPR_NONE); } static size_t je_anchor(struct super_block *sb, void *_da, u16 *type) { struct inode *inode; struct logfs_je_anchor *da = _da; int i; inode = find_or_create_inode(sb, LOGFS_INO_MASTER); if (!inode) return -ENOMEM; memset(da, 0, sizeof(*da)); da->da_last_ino = cpu_to_be64(LOGFS_RESERVED_INOS); da->da_size = cpu_to_be64(LOGFS_RESERVED_INOS * sb->blocksize); da->da_used_bytes = inode->di.di_used_bytes; for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) da->da_data[i] = inode->di.di_data[i]; *type = JE_ANCHOR; return sizeof(*da); } static size_t je_dynsb(struct super_block *sb, void *_dynsb, u16 *type) { struct logfs_je_dynsb *dynsb = _dynsb; memset(dynsb, 0, sizeof(*dynsb)); dynsb->ds_used_bytes = cpu_to_be64(sb->used_bytes); /* Set ds_gec to something beyond anything mkfs would use */ dynsb->ds_gec = cpu_to_be64(0x1000); *type = JE_DYNSB; return sizeof(*dynsb); } static size_t je_alias(struct super_block *sb, void *_oa, u16 *type) { struct logfs_obj_alias *oa = _oa; struct logfs_segment_entry *se; u64 val; int i, k; int ashift, amask; ashift = blockshift - 3; /* 8 bytes per alias */ amask = (1 << ashift) - 1; memset(oa, 0, sb->blocksize); k = 0; for (i = 0; i 
< sb->no_segs; i++) { se = sb->segment_entry + i; if (se->ec_level || se->valid) { val = (u64)be32_to_cpu(se->ec_level) << 32 | be32_to_cpu(se->valid); oa[k].ino = cpu_to_be64(LOGFS_INO_SEGFILE); oa[k].bix = cpu_to_be64(i >> ashift); oa[k].val = cpu_to_be64(val); oa[k].level = 0; oa[k].child_no = cpu_to_be16(i & amask); k++; } } *type = JE_OBJ_ALIAS; return k * sizeof(*oa); } static size_t je_commit(struct super_block *sb, void *h, u16 *type) { *type = JE_COMMIT; memcpy(h, je_array, no_je * sizeof(__be64)); return no_je * sizeof(__be64); } static size_t write_je(struct super_block *sb, size_t jpos, void *scratch, void *header, u32 segno, size_t (*write)(struct super_block *sb, void *scratch, u16 *type)) { u64 ofs = (u64)segno * sb->segsize; void *data; ssize_t len, max, compr_len, pad_len; u16 type; u8 compr = COMPR_ZLIB; header += jpos; data = header + sizeof(struct logfs_journal_header); len = write(sb, scratch, &type); if (type != JE_COMMIT) je_array[no_je++] = cpu_to_be64(ofs + jpos); if (len == 0) return write_header(header, 0, type); max = sb->blocksize - jpos; compr_len = logfs_compress(scratch, data, len, max); if ((compr_len < 0) || (type == JE_COMMIT)) { BUG_ON(len > max); memcpy(data, scratch, len); compr_len = len; compr = COMPR_NONE; } pad_len = ALIGN(compr_len, 16); memset(data + compr_len, 0, pad_len - compr_len); return __write_header(header, compr_len, len, type, compr); } static int make_journal(struct super_block *sb) { void *journal, *scratch; size_t jpos; u32 seg; seg = sb->journal_seg[0]; /* TODO: add segment to superblock, segfile */ journal = zalloc(sb->segsize); if (!journal) return -ENOMEM; scratch = zalloc(2 * sb->blocksize); if (!scratch) return -ENOMEM; set_segment_header(journal, SEG_JOURNAL, 0, seg); jpos = ALIGN(sizeof(struct logfs_segment_header), 16); /* erasecount is not written - implicitly set to 0 */ /* neither are summary, index, wbuf */ jpos += write_je(sb, jpos, scratch, journal, seg, je_anchor); jpos += write_je(sb, jpos, 
scratch, journal, seg, je_dynsb); jpos += write_je(sb, jpos, scratch, journal, seg, je_alias); jpos += write_je(sb, jpos, scratch, journal, seg, je_commit); return sb->dev_ops->write(sb, seg * sb->segsize, sb->segsize, journal); } /* superblock */ static int make_super(struct super_block *sb) { struct logfs_disk_super _ds, *ds = &_ds; void *sector; int secsize = ALIGN(sizeof(*ds), sb->writesize); int i, ret; sector = zalloc(secsize); if (!sector) return -ENOMEM; memset(ds, 0, sizeof(*ds)); set_segment_header((void *)ds, SEG_SUPER, 0, 0); bad_seg_reserve = max(bad_seg_reserve, no_journal_segs); ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); ds->ds_ifile_levels = 3; /* 2+1, 1GiB */ ds->ds_iblock_levels = 4; /* 3+1, 512GiB */ ds->ds_data_levels = 1; /* old, young, unknown */ ds->ds_feature_incompat = 0; ds->ds_feature_ro_compat= 0; ds->ds_feature_compat = 0; ds->ds_feature_flags = 0; ds->ds_filesystem_size = cpu_to_be64(sb->fssize); ds->ds_segment_shift = segshift; ds->ds_block_shift = blockshift; ds->ds_write_shift = writeshift; ds->ds_bad_seg_reserve = cpu_to_be32(bad_seg_reserve); for (i = 0; i < no_journal_segs; i++) ds->ds_journal_seg[i] = cpu_to_be32(sb->journal_seg[i]); ds->ds_super_ofs[0] = cpu_to_be64(sb->sb_ofs1); ds->ds_super_ofs[1] = cpu_to_be64(sb->sb_ofs2); ds->ds_root_reserve = 0; ds->ds_crc = logfs_crc32(ds, sizeof(*ds), LOGFS_SEGMENT_HEADERSIZE + 12); memcpy(sector, ds, sizeof(*ds)); ret = sb->dev_ops->write(sb, sb->sb_ofs1, secsize, sector); if (!ret) ret = sb->dev_ops->write(sb, sb->sb_ofs2, secsize, sector); free(sector); return ret; } /* main stuff */ static void prepare_journal(struct super_block *sb) { int i; u32 segno; for (i = 0; i < no_journal_segs; i++) { segno = get_segment(sb); sb->journal_seg[i] = segno; sb->segment_entry[segno].ec_level = ec_level(1, 0); sb->segment_entry[segno].valid = cpu_to_be32(RESERVED); } } static void mkfs(struct super_block *sb) { char answer[4096]; /* I don't care about overflows */ int ret; BUG_ON(!sb); if 
(user_segshift + 1) segshift = user_segshift; if (user_blockshift + 1) blockshift = user_blockshift; if (user_writeshift + 1) writeshift = user_writeshift; if (segshift > 30) fail("segment shift too large (max 30)"); if (segshift <= blockshift) fail("segment shift must be larger than block shift"); if (blockshift != 12) fail("blockshift must be 12"); if (writeshift > 16) fail("writeshift too large (max 16)"); if (segshift < writeshift) fail("segment shift must be larger than write shift"); sb->segsize = 1 << segshift; sb->blocksize = 1 << blockshift; sb->blocksize_bits = blockshift; sb->writesize = 1 << writeshift; sb->no_segs = sb->fssize >> segshift; sb->fssize = (u64)sb->no_segs << segshift; printf("Will create filesystem with the following details:\n"); printf(" hex: decimal:\n"); printf("fssize= %8llx %10lld\n", sb->fssize, sb->fssize); printf("segsize= %8x %10d\n", sb->segsize, sb->segsize); printf("blocksize=%8x %10d\n", sb->blocksize, sb->blocksize); printf("writesize=%8x %10d\n", sb->writesize, sb->writesize); printf("\n"); if (interactice_mode) { printf("Do you wish to continue (yes/no)\n"); scanf("%s", answer); if (strcmp(answer, "yes")) fail("aborting..."); } if (quick_bad_block_scan) { printf("mklogfs won't erase filesystem. 
This may oops your kernel.\n"); scanf("%s", answer); if (strcmp(answer, "yes")) fail("aborting..."); } sb->segment_entry = zalloc(sb->no_segs * sizeof(sb->segment_entry[0])); if (!sb->segment_entry) fail("out of memory"); ret = sb->dev_ops->prepare_sb(sb); if (ret) fail("could not erase two superblocks"); prepare_journal(sb); ret = write_segment_file(sb); if (ret) fail("could not write segment file"); ret = write_rootdir(sb); if (ret) fail("could not create root inode"); ret = flush_segments(sb); if (ret) fail("could not write segments"); /* * prepare sb * prepare journal * write segment file (create alias) * write inodes (create alias) * flush segments * write journal (including aliases) * write sb */ ret = make_journal(sb); if (ret) fail("could not create journal"); ret = make_super(sb); if (ret) fail("could not create superblock"); fsync(sb->fd); printf("\nFinished generating LogFS\n"); } static struct super_block *__open_device(const char *name) { struct super_block *sb; const struct logfs_device_operations *ops = &bdev_ops; struct mtd_info_user mtd; struct stat stat; int err; sb = zalloc(sizeof(*sb)); sb->fd = open(name, O_WRONLY | O_EXCL | O_LARGEFILE); if (sb->fd == -1) fail("could not open device"); err = fstat(sb->fd, &stat); if (err) fail("could not stat device"); switch (stat.st_mode & S_IFMT) { case S_IFSOCK: case S_IFLNK: case S_IFDIR: case S_IFIFO: fail("wrong device type"); case S_IFCHR: if (major(stat.st_rdev) != 90) fail("non-mtd character device"); ops = &mtd_ops; err = ioctl(sb->fd, MEMGETINFO, &mtd); if (err) fail("mtd ioctl failed"); sb->erasesize = mtd.erasesize; segshift = ffs(mtd.erasesize) - 1; if (mtd.erasesize != 1 << segshift) fail("device erasesize must be a power of 2"); writeshift = ffs(mtd.writesize) - 1; if (mtd.writesize != 1 << writeshift) fail("device writesize must be a power of 2"); sb->fssize = mtd.size; { /* The new "improved" way of doing things */ char buf[256]; int fd; sprintf(buf, "/sys/class/mtd/%s/size", basename((char 
*)name)); fd = open(buf, O_RDONLY); if (fd >= 0) { read(fd, buf, 256); sb->fssize = strtoull(buf, NULL, 0); close(fd); } } break; case S_IFREG: sb->fssize = stat.st_size; break; case S_IFBLK: err = ioctl(sb->fd, BLKGETSIZE64, &sb->fssize); if (err) fail("block ioctl failed"); break; } sb->dev_ops = ops; return sb; } static void usage(void) { printf( "mklogfs \n" "\n" "Options:\n" " -c --compress turn compression on\n" " -h --help display this help\n" " -s --segshift segment shift in bits\n" " -w --writeshift write shift in bits\n" " --demo-mode skip bad block scan; don't erase device\n" " --non-interactive turn off safety question before writing\n" "\n" "Segment size and write size are powers of two. To specify them, the\n" "appropriate power is specified with the \"-s\" or \"-w\" options, instead\n" "of the actual size. E.g. \"mklogfs -w8\" will set a writesize\n" "of 256 Bytes (2^8).\n\n"); } int main(int argc, char **argv) { struct super_block *sb; check_crc32(); for (;;) { int oi = 1; char short_opts[] = "chs:w:"; static const struct option long_opts[] = { {"bad-segment-reserve", 1, NULL, 'B'}, {"compress", 0, NULL, 'c'}, {"journal-segments", 1, NULL, 'j'}, {"help", 0, NULL, 'h'}, {"non-interactive", 0, NULL, 'n'}, {"demo-mode", 0, NULL, 'q'}, {"segshift", 1, NULL, 's'}, {"writeshift", 1, NULL, 'w'}, { } }; int c = getopt_long(argc, argv, short_opts, long_opts, &oi); if (c == -1) break; switch (c) { case 'b': user_blockshift = strtoul(optarg, NULL, 0); break; case 'B': bad_seg_reserve = strtoul(optarg, NULL, 0); break; case 'c': compress_rootdir = 1; break; case 'j': no_journal_segs = strtoul(optarg, NULL, 0); break; case 'h': usage(); exit(EXIT_SUCCESS); case 'n': interactice_mode = 0; break; case 'q': quick_bad_block_scan = 1; break; case 's': user_segshift = strtoul(optarg, NULL, 0); break; case 'w': user_writeshift = strtoul(optarg, NULL, 0); break; default: usage(); exit(EXIT_FAILURE); } } if (optind != argc - 1) { usage(); exit(EXIT_FAILURE); } sb = 
__open_device(argv[optind]); mkfs(sb); return 0; } logfsprogs/Makefile0000644000175000017500000000220512142655114013053 0ustar lukluk# # Use "make C=1 foo" to enable sparse checking # Use "make S=1 foo" to compile statically # BIN := mklogfs SRC := mkfs.c fsck.c lib.c journal.c segment.c btree.c readwrite.c OBJ := $(SRC:.c=.o) BB := $(SRC:.c=.bb) BBG := $(SRC:.c=.bbg) DA := $(SRC:.c=.da) COV := $(SRC:.c=.c.gcov) ZLIB_O := crc32.o deflate.o adler32.o compress.o trees.o zutil.o CC := gcc CHECK := cgcc CHECKFLAGS := -D__CHECK_ENDIAN__ CFLAGS := -std=gnu99 CFLAGS += -Wall CFLAGS += -Os CFLAGS += -D_FILE_OFFSET_BITS=64 CFLAGS += -g #CFLAGS += -fprofile-arcs -ftest-coverage all: $(BIN) $(ZLIB_O): /usr/lib/libz.a ar -x /usr/lib/libz.a $@ ifdef S EXTRA_OBJ := $(ZLIB_O) CFLAGS += -static else CFLAGS += -lz endif mklogfs: $(EXTRA_OBJ) mklogfs: mkfs.o lib.o btree.o segment.o readwrite.o $(CC) $(CFLAGS) -o $@ $^ logfsck: $(ZLIB_O) logfsck: fsck.o lib.o journal.o super.o $(CC) $(CFLAGS) -o $@ $^ $(OBJ): kerncompat.h logfs.h logfs_abi.h btree.h %.o: %.c ifdef C $(CHECK) $(CFLAGS) $(CHECKFLAGS) -c -o $@ $< endif $(CC) $(CFLAGS) -c -o $@ $< install: all ~/bin cp $(BIN) ~/bin/ distclean: clean $(RM) core clean: $(RM) $(BIN) $(OBJ) $(BB) $(BBG) $(COV) $(DA) $(ZLIB_O) logfsprogs/kerncompat.h0000644000175000017500000001247512142655114013741 0ustar lukluk/* * Copyright (C) 2007 Oracle. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License v2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this program; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 021110-1307, USA. */ #ifndef __KERNCOMPAT #define __KERNCOMPAT #include #include #include #include #include #define gfp_t int #define get_cpu_var(p) (p) #define __get_cpu_var(p) (p) #define BITS_PER_LONG (sizeof(long) * 8) #define __GFP_BITS_SHIFT 20 #define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1)) #define GFP_KERNEL 0 #define GFP_NOFS 0 #define __read_mostly #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define ULONG_MAX (~0UL) #define BUG() abort() #ifdef __CHECKER__ #define __force __attribute__((force)) #define __bitwise__ __attribute__((bitwise)) #else #define __force #define __bitwise__ #endif #ifdef __CHECKER__ typedef unsigned char u8; typedef unsigned short u16; typedef unsigned int u32; typedef unsigned int __u32; typedef unsigned long long u64; typedef char s8; typedef short s16; typedef int s32; typedef int __s32; typedef long long s64; #else #include typedef __u8 u8; typedef __u16 u16; typedef __u32 u32; typedef __u64 u64; typedef __s8 s8; typedef __s16 s16; typedef __s32 s32; typedef __s64 s64; #endif struct vma_shared { int prio_tree_node; }; struct vm_area_struct { unsigned long vm_pgoff; unsigned long vm_start; unsigned long vm_end; struct vma_shared shared; }; struct page { unsigned long index; }; #define preempt_enable() do { } while (0) #define preempt_disable() do { } while (0) #define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) /** * __set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * Unlike set_bit(), this function is non-atomic and may be reordered. * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. 
*/ static inline void __set_bit(int nr, volatile unsigned long *addr) { unsigned long mask = BITOP_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); *p |= mask; } static inline void __clear_bit(int nr, volatile unsigned long *addr) { unsigned long mask = BITOP_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); *p &= ~mask; } /** * test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static inline int test_bit(int nr, const volatile unsigned long *addr) { return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } #define BUG_ON(c) do { if (c) abort(); } while (0) #undef offsetof #ifdef __compiler_offsetof #define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER) #else #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #endif #define container_of(ptr, type, member) ({ \ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) #ifdef __CHECKER__ #define __bitwise __bitwise__ #else #undef __bitwise #define __bitwise #endif typedef u16 __bitwise __le16; typedef u16 __bitwise __be16; typedef u32 __bitwise __le32; typedef u32 __bitwise __be32; typedef u64 __bitwise __le64; typedef u64 __bitwise __be64; #if __BYTE_ORDER == __BIG_ENDIAN #define cpu_to_be64(x) ((__force __be64)(u64)(x)) #define be64_to_cpu(x) ((__force u64)(__be64)(x)) #define cpu_to_be32(x) ((__force __be32)(u32)(x)) #define be32_to_cpu(x) ((__force u32)(__be32)(x)) #define cpu_to_be16(x) ((__force __be16)(u16)(x)) #define be16_to_cpu(x) ((__force u16)(__be16)(x)) #define cpu_to_le64(x) ((__force __le64)(u64)(bswap_64(x))) #define le64_to_cpu(x) (bswap_64((__force u64)(__le64)(x))) #define cpu_to_le32(x) ((__force __le32)(u32)(bswap_32(x))) #define le32_to_cpu(x) (bswap_32((__force u32)(__le32)(x))) #define cpu_to_le16(x) ((__force __le16)(u16)(bswap_16(x))) #define le16_to_cpu(x) (bswap_16((__force u16)(__le16)(x))) #else #define 
cpu_to_be64(x) ((__force __be64)(u64)(bswap_64(x))) #define be64_to_cpu(x) (bswap_64((__force u64)(__be64)(x))) #define cpu_to_be32(x) ((__force __be32)(u32)(bswap_32(x))) #define be32_to_cpu(x) (bswap_32((__force u32)(__be32)(x))) #define cpu_to_be16(x) ((__force __be16)(u16)(bswap_16(x))) #define be16_to_cpu(x) (bswap_16((__force u16)(__be16)(x))) #define cpu_to_le64(x) ((__force __le64)(u64)(x)) #define le64_to_cpu(x) ((__force u64)(__le64)(x)) #define cpu_to_le32(x) ((__force __le32)(u32)(x)) #define le32_to_cpu(x) ((__force u32)(__le32)(x)) #define cpu_to_le16(x) ((__force __le16)(u16)(x)) #define le16_to_cpu(x) ((__force u16)(__le16)(x)) #endif #define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1) #define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) #endif logfsprogs/btree.h0000644000175000017500000001652212142655114012674 0ustar lukluk#ifndef BTREE_H #define BTREE_H #include "kerncompat.h" /* * B+Tree node format: * [key0, key1, ..., keyN] [val0, val1, ..., valN] * Each key is an array of unsigned longs, head->no_longs in total. * Total number of keys and vals (N) is head->no_pairs. */ struct btree_head { unsigned long *node; int height; }; struct btree_geo { int keylen; int no_pairs; }; extern struct btree_geo btree_geo32; extern struct btree_geo btree_geo64; extern struct btree_geo btree_geo128; struct btree_headl { struct btree_head h; }; struct btree_head32 { struct btree_head h; }; struct btree_head64 { struct btree_head h; }; struct btree_head128 { struct btree_head h; }; /* * These couple of functions are all there is to it. The rest of this header * consists only of wrappers that try to add some typesafety, make the code * a little self-documenting and generally be nice to people. 
*/ void btree_free(void *element, void *pool_data); void btree_init(struct btree_head *head); void *btree_lookup(struct btree_head *head, struct btree_geo *geo, unsigned long *key); int btree_insert(struct btree_head *head, struct btree_geo *geo, unsigned long *key, void *val); void *btree_remove(struct btree_head *head, struct btree_geo *geo, unsigned long *key); int btree_merge(struct btree_head *target, struct btree_head *victim, struct btree_geo *geo, unsigned long *duplicate); unsigned long *btree_last(struct btree_head *head, struct btree_geo *geo); size_t btree_visitor(struct btree_head *head, struct btree_geo *geo, long opaque, void (*func)(void *elem, long opaque, unsigned long *key, size_t index, void *func2), void *func2); size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo, long opaque, void (*func)(void *elem, long opaque, unsigned long *key, size_t index, void *func2), void *func2); /* key is unsigned long */ static inline void btree_initl(struct btree_headl *head) { btree_init(&head->h); } static inline void *btree_lookupl(struct btree_headl *head, unsigned long key) { return btree_lookup(&head->h, &btree_geo32, &key); } static inline int btree_insertl(struct btree_headl *head, unsigned long key, void *val) { return btree_insert(&head->h, &btree_geo32, &key, val); } static inline void *btree_removel(struct btree_headl *head, unsigned long key) { return btree_remove(&head->h, &btree_geo32, &key); } static inline int btree_mergel(struct btree_headl *target, struct btree_headl *victim) { unsigned long scratch; return btree_merge(&target->h, &victim->h, &btree_geo32, &scratch); } void visitorl(void *elem, long opaque, unsigned long *key, size_t index, void *__func); typedef void (*visitorl_t)(void *elem, long opaque, unsigned long key, size_t index); static inline size_t btree_visitorl(struct btree_headl *head, long opaque, visitorl_t func2) { return btree_visitor(&head->h, &btree_geo32, opaque, visitorl, func2); } static inline 
size_t btree_grim_visitorl(struct btree_headl *head, long opaque, visitorl_t func2) { return btree_grim_visitor(&head->h, &btree_geo32, opaque, visitorl, func2); } /* key is u32 */ static inline void btree_init32(struct btree_head32 *head) { btree_init(&head->h); } static inline void *btree_lookup32(struct btree_head32 *head, u32 key) { return btree_lookup(&head->h, &btree_geo32, (unsigned long *)&key); } static inline int btree_insert32(struct btree_head32 *head, u32 key, void *val) { return btree_insert(&head->h, &btree_geo32, (unsigned long *)&key, val); } static inline void *btree_remove32(struct btree_head32 *head, u32 key) { return btree_remove(&head->h, &btree_geo32, (unsigned long *)&key); } static inline int btree_merge32(struct btree_head32 *target, struct btree_head32 *victim) { unsigned long scratch; return btree_merge(&target->h, &victim->h, &btree_geo32, &scratch); } void visitor32(void *elem, long opaque, unsigned long *__key, size_t index, void *__func); typedef void (*visitor32_t)(void *elem, long opaque, u32 key, size_t index); static inline size_t btree_visitor32(struct btree_head32 *head, long opaque, visitor32_t func2) { return btree_visitor(&head->h, &btree_geo32, opaque, visitor32, func2); } static inline size_t btree_grim_visitor32(struct btree_head32 *head, long opaque, visitor32_t func2) { return btree_grim_visitor(&head->h, &btree_geo32, opaque, visitor32, func2); } /* key is u64 */ static inline void btree_init64(struct btree_head64 *head) { btree_init(&head->h); } static inline void *btree_lookup64(struct btree_head64 *head, u64 key) { return btree_lookup(&head->h, &btree_geo64, (unsigned long *)&key); } static inline int btree_insert64(struct btree_head64 *head, u64 key, void *val) { return btree_insert(&head->h, &btree_geo64, (unsigned long *)&key, val); } static inline void *btree_remove64(struct btree_head64 *head, u64 key) { return btree_remove(&head->h, &btree_geo64, (unsigned long *)&key); } static inline u64 btree_last64(struct 
btree_head64 *head) { u64 *key; key = (u64 *)btree_last(&head->h, &btree_geo64); if (key) return *key; else return 0; } static inline int btree_merge64(struct btree_head64 *target, struct btree_head64 *victim) { u64 scratch; return btree_merge(&target->h, &victim->h, &btree_geo64, (unsigned long *)&scratch); } void visitor64(void *elem, long opaque, unsigned long *__key, size_t index, void *__func); typedef void (*visitor64_t)(void *elem, long opaque, u64 key, size_t index); static inline size_t btree_visitor64(struct btree_head64 *head, long opaque, visitor64_t func2) { return btree_visitor(&head->h, &btree_geo64, opaque, visitor64, func2); } static inline size_t btree_grim_visitor64(struct btree_head64 *head, long opaque, visitor64_t func2) { return btree_grim_visitor(&head->h, &btree_geo64, opaque, visitor64, func2); } /* key is 128bit (two u64) */ static inline void btree_init128(struct btree_head128 *head) { btree_init(&head->h); } static inline void *btree_lookup128(struct btree_head128 *head, u64 k1, u64 k2) { u64 key[2] = {k1, k2}; return btree_lookup(&head->h, &btree_geo128, (unsigned long *)&key); } static inline int btree_insert128(struct btree_head128 *head, u64 k1, u64 k2, void *val) { u64 key[2] = {k1, k2}; return btree_insert(&head->h, &btree_geo128, (unsigned long *)&key, val); } static inline void *btree_remove128(struct btree_head128 *head, u64 k1, u64 k2) { u64 key[2] = {k1, k2}; return btree_remove(&head->h, &btree_geo128, (unsigned long *)&key); } static inline void btree_last128(struct btree_head128 *head, u64 *k1, u64 *k2) { u64 *key = (u64 *)btree_last(&head->h, &btree_geo128); if (key) { *k1 = key[0]; *k2 = key[1]; } else { *k1 = 0; *k2 = 0; } } static inline int btree_merge128(struct btree_head128 *target, struct btree_head128 *victim) { u64 scratch[2]; return btree_merge(&target->h, &victim->h, &btree_geo128, (unsigned long *)scratch); } void visitor128(void *elem, long opaque, unsigned long *__key, size_t index, void *__func); typedef 
void (*visitor128_t)(void *elem, long opaque, u64 key1, u64 key2, size_t index); static inline size_t btree_visitor128(struct btree_head128 *head, long opaque, visitor128_t func2) { return btree_visitor(&head->h, &btree_geo128, opaque, visitor128, func2); } static inline size_t btree_grim_visitor128(struct btree_head128 *head, long opaque, visitor128_t func2) { return btree_grim_visitor(&head->h, &btree_geo128, opaque, visitor128, func2); } #endif logfsprogs/logfs.h0000644000175000017500000000366712142655114012713 0ustar lukluk#ifndef LOGFS_H #define LOGFS_H #include #include #include "btree.h" #include "kerncompat.h" #include "logfs_abi.h" struct super_block; struct logfs_device_operations { int (*prepare_sb)(struct super_block *sb); int (*write)(struct super_block *sb, u64 ofs, size_t size, void *buf); int (*erase)(struct super_block *sb, u64 ofs, size_t size); s64 (*scan_super)(struct super_block *sb); }; struct logfs_area { u32 segno; u32 used_bytes; void *buf; }; struct super_block { int fd; u64 fssize; u32 segsize; u32 erasesize; u32 blocksize; int blocksize_bits; u32 writesize; u32 no_segs; u32 journal_seg[LOGFS_JOURNAL_SEGS]; u64 used_bytes; u32 lastseg; struct logfs_area area[LOGFS_NO_AREAS]; struct logfs_segment_entry *segment_entry; void *erase_buf; u64 sb_ofs1; u64 sb_ofs2; struct btree_head64 ino_tree; struct btree_head128 block_tree[LOGFS_NO_AREAS]; const struct logfs_device_operations *dev_ops; }; struct inode { struct btree_head64 block_tree[LOGFS_MAX_LEVELS]; struct logfs_disk_inode di; }; void check_crc32(void); void fail(const char *s) __attribute__ ((__noreturn__)); struct super_block *open_device(const char *name); static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) { return cpu_to_be32(~crc32(0, data+skip, len-skip)); } static inline void *zalloc(size_t bytes) { void *p = malloc(bytes); memset(p, 0, bytes); return p; } /* readwrite.c */ struct inode *find_or_create_inode(struct super_block *sb, u64 ino); int 
logfs_file_write(struct super_block *sb, u64 ino, u64 bix, u8 level, u8 type, void *buf); int logfs_file_flush(struct super_block *sb, u64 ino); /* segment.c */ u32 get_segment(struct super_block *sb); s64 logfs_segment_write(struct super_block *sb, void *buf, u8 type, u64 ino, u64 bix, u8 level); int flush_segments(struct super_block *sb); static inline __be32 ec_level(u32 ec, u8 level) { return cpu_to_be32((ec << 4) | (level & 0xf)); } #endif logfsprogs/fsck.c0000644000175000017500000000225312142655114012510 0ustar lukluk/* * LogFS mkfs * * Copyright (c) 2007-2008 Joern Engel * * License: GPL version 2 */ #define _LARGEFILE64_SOURCE #define __USE_FILE_OFFSET64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "kerncompat.h" #include #include "logfs_abi.h" #include "logfs.h" static void usage(void) { printf( "logfsck \n" "\n" "Options:\n" " -h --help display this help\n" "\n"); } int main(int argc, char **argv) { check_crc32(); for (;;) { int oi = 1; char short_opts[] = "h"; static const struct option long_opts[] = { {"help", 0, NULL, 'h'}, { } }; int c = getopt_long(argc, argv, short_opts, long_opts, &oi); if (c == -1) break; switch (c) { case 'h': usage(); exit(EXIT_SUCCESS); default: fail("unknown option\n"); } } if (optind != argc - 1) { usage(); exit(EXIT_FAILURE); } printf("foo\n"); return 0; } logfsprogs/logfs_abi.h0000644000175000017500000004146412142655114013523 0ustar lukluk/* * fs/logfs/logfs_abi.h * * As should be obvious for Linux kernel code, license is GPLv2 * * Copyright (c) 2005-2008 Joern Engel * * Public header for logfs. 
*/ #ifndef FS_LOGFS_LOGFS_ABI_H #define FS_LOGFS_LOGFS_ABI_H /* For out-of-kernel compiles */ #ifndef BUILD_BUG_ON #define BUILD_BUG_ON(condition) /**/ #endif #define SIZE_CHECK(type, size) \ static inline void check_##type(void) \ { \ BUILD_BUG_ON(sizeof(struct type) != (size)); \ } /* * Throughout the logfs code, we're constantly dealing with blocks at * various positions or offsets. To remove confusion, we stricly * distinguish between a "position" - the logical position within a * file and an "offset" - the physical location within the device. * * Any usage of the term offset for a logical location or position for * a physical one is a bug and should get fixed. */ /* * Block are allocated in one of several segments depending on their * level. The following levels are used: * 0 - regular data block * 1 - i1 indirect blocks * 2 - i2 indirect blocks * 3 - i3 indirect blocks * 4 - i4 indirect blocks * 5 - i5 indirect blocks * 6 - ifile data blocks * 7 - ifile i1 indirect blocks * 8 - ifile i2 indirect blocks * 9 - ifile i3 indirect blocks * 10 - ifile i4 indirect blocks * 11 - ifile i5 indirect blocks * Potential levels to be used in the future: * 12 - gc recycled blocks, long-lived data * 13 - replacement blocks, short-lived data * * Levels 1-11 are necessary for robust gc operations and help seperate * short-lived metadata from longer-lived file data. In the future, * file data should get seperated into several segments based on simple * heuristics. Old data recycled during gc operation is expected to be * long-lived. New data is of uncertain life expectancy. New data * used to replace older blocks in existing files is expected to be * short-lived. */ /* Magic numbers. 64bit for superblock, 32bit for statfs f_type */ #define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull #define LOGFS_MAGIC_U32 0xc97e8168u /* * Various blocksize related macros. Blocksize is currently fixed at 4KiB. 
* Sooner or later that should become configurable and the macros replaced * by something superblock-dependent. Pointers in indirect blocks are and * will remain 64bit. * * LOGFS_BLOCKSIZE - self-explaining * LOGFS_BLOCK_FACTOR - number of pointers per indirect block * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts */ #define LOGFS_BLOCKSIZE (4096ull) #define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64)) #define LOGFS_BLOCK_BITS (9) /* * Number of blocks at various levels of indirection. There are 16 direct * block pointers plus a single indirect pointer. */ #define I0_BLOCKS (16) #define I1_BLOCKS LOGFS_BLOCK_FACTOR #define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS) #define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS) #define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS) #define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS) #define INDIRECT_INDEX I0_BLOCKS #define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1) /* * Sizes at which files require another level of indirection. Files smaller * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself, * similar like ext2 fast symlinks. * * Data at a position smaller than LOGFS_I0_SIZE is accessed through the * direct pointers, else through the 1x indirect pointer and so forth. */ #define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64)) #define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE) #define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE) /* * Each indirect block pointer must have this flag set, if all block pointers * behind it are set, i.e. there is no hole hidden in the shadow of this * indirect block pointer. */ #define LOGFS_FULLY_POPULATED (1ULL << 63) #define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) /* * LogFS needs to seperate data into levels. 
Each level is defined as the * maximal possible distance from the master inode (inode of the inode file). * Data blocks reside on level 0, 1x indirect block on level 1, etc. * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. * This effort is necessary to guarantee garbage collection to always make * progress. * * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks, * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is * the maximal number of levels for one file. * LOGFS_NO_AREAS is twice that, as the inode file and regular files are * effectively stacked on top of each other. */ #define LOGFS_MAX_INDIRECT (5) #define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1) #define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS) /* Maximum size of filenames */ #define LOGFS_MAX_NAMELEN (255) /* Number of segments in the primary journal. */ #define LOGFS_JOURNAL_SEGS (16) /* Maximum number of free/erased/etc. segments in journal entries */ #define MAX_CACHED_SEGS (64) /* * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store, * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including * its header, * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for * its segment header and the padded space at the end when no further objects * fit. 
*/ #define LOGFS_OBJECT_HEADERSIZE (0x1c) #define LOGFS_SEGMENT_HEADERSIZE (0x18) #define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE) #define LOGFS_SEGMENT_RESERVE \ (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1) /* * Segment types: * SEG_SUPER - Data or indirect block * SEG_JOURNAL - Inode * SEG_OSTORE - Dentry */ enum { SEG_SUPER = 0x01, SEG_JOURNAL = 0x02, SEG_OSTORE = 0x03, }; /** * struct logfs_segment_header - per-segment header in the ostore * * @crc: crc32 of header (there is no data) * @pad: unused, must be 0 * @type: segment type, see above * @level: GC level for all objects in this segment * @segno: segment number * @ec: erase count for this segment * @gec: global erase count at time of writing */ struct logfs_segment_header { __be32 crc; __be16 pad; __u8 type; __u8 level; __be32 segno; __be32 ec; __be64 gec; }; SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); #define LOGFS_FEATURES_INCOMPAT (0ull) #define LOGFS_FEATURES_RO_COMPAT (0ull) #define LOGFS_FEATURES_COMPAT (0ull) /** * struct logfs_disk_super - on-medium superblock * * @ds_magic: magic number, must equal LOGFS_MAGIC * @ds_crc: crc32 of structure starting with the next field * @ds_ifile_levels: maximum number of levels for ifile * @ds_iblock_levels: maximum number of levels for regular files * @ds_data_levels: number of seperate levels for data * @pad0: reserved, must be 0 * @ds_feature_incompat: incompatible filesystem features * @ds_feature_ro_compat: read-only compatible filesystem features * @ds_feature_compat: compatible filesystem features * @ds_flags: flags * @ds_segment_shift: log2 of segment size * @ds_block_shift: log2 of block size * @ds_write_shift: log2 of write size * @pad1: reserved, must be 0 * @ds_journal_seg: segments used by primary journal * @ds_root_reserve: bytes reserved for the superuser * @ds_speed_reserve: bytes reserved to speed up GC * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks * @pad2: reserved, must 
be 0 * @pad3: reserved, must be 0 * * Contains only read-only fields. Read-write fields like the amount of used * space is tracked in the dynamic superblock, which is stored in the journal. */ struct logfs_disk_super { struct logfs_segment_header ds_sh; __be64 ds_magic; __be32 ds_crc; __u8 ds_ifile_levels; __u8 ds_iblock_levels; __u8 ds_data_levels; __u8 ds_segment_shift; __u8 ds_block_shift; __u8 ds_write_shift; __u8 pad0[6]; __be64 ds_filesystem_size; __be32 ds_segment_size; __be32 ds_bad_seg_reserve; __be64 ds_feature_incompat; __be64 ds_feature_ro_compat; __be64 ds_feature_compat; __be64 ds_feature_flags; __be64 ds_root_reserve; __be64 ds_speed_reserve; __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; __be64 ds_super_ofs[2]; __be64 pad3[8]; }; SIZE_CHECK(logfs_disk_super, 256); /* * Object types: * OBJ_BLOCK - Data or indirect block * OBJ_INODE - Inode * OBJ_DENTRY - Dentry */ enum { OBJ_BLOCK = 0x04, OBJ_INODE = 0x05, OBJ_DENTRY = 0x06, }; /** * struct logfs_object_header - per-object header in the ostore * * @crc: crc32 of header, excluding data_crc * @len: length of data * @type: object type, see above * @compr: compression type * @ino: inode number * @bix: block index * @data_crc: crc32 of payload */ struct logfs_object_header { __be32 crc; __be16 len; __u8 type; __u8 compr; __be64 ino; __be64 bix; __be32 data_crc; } __attribute__((packed)); SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); /* * Reserved inode numbers: * LOGFS_INO_MASTER - master inode (for inode file) * LOGFS_INO_ROOT - root directory * LOGFS_INO_SEGFILE - per-segment used bytes and erase count */ enum { LOGFS_INO_MAPPING = 0x00, LOGFS_INO_MASTER = 0x01, LOGFS_INO_ROOT = 0x02, LOGFS_INO_SEGFILE = 0x03, LOGFS_RESERVED_INOS = 0x10, }; /* * Inode flags. High bits should never be written to the medium. They are * reserved for in-memory usage. * Low bits should either remain in sync with the corresponding FS_*_FL or * reuse slots that obviously don't make sense for logfs. 
* * LOGFS_IF_DIRTY Inode must be written back * LOGFS_IF_ZOMBIE Inode has been deleted * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode */ #define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */ #define LOGFS_IF_DIRTY 0x20000000 #define LOGFS_IF_ZOMBIE 0x40000000 #define LOGFS_IF_STILLBORN 0x80000000 /* Flags available to chattr */ #define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED) #define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED) /* Flags inherited from parent directory on file/directory creation */ #define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED) /** * struct logfs_disk_inode - on-medium inode * * @di_mode: file mode * @di_pad: reserved, must be 0 * @di_flags: inode flags, see above * @di_uid: user id * @di_gid: group id * @di_ctime: change time * @di_mtime: modify time * @di_refcount: reference count (aka nlink or link count) * @di_generation: inode generation, for nfs * @di_used_bytes: number of bytes used * @di_size: file size * @di_data: data pointers */ struct logfs_disk_inode { __be16 di_mode; __u8 di_height; __u8 di_pad; __be32 di_flags; __be32 di_uid; __be32 di_gid; __be64 di_ctime; __be64 di_mtime; __be64 di_atime; __be32 di_refcount; __be32 di_generation; __be64 di_used_bytes; __be64 di_size; __be64 di_data[LOGFS_EMBEDDED_FIELDS]; }; SIZE_CHECK(logfs_disk_inode, 200); #define INODE_POINTER_OFS \ (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64)) #define INODE_USED_OFS \ (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64)) #define INODE_SIZE_OFS \ (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64)) #define INODE_HEIGHT_OFS (0) /** * struct logfs_disk_dentry - on-medium dentry structure * * @ino: inode number * @namelen: length of file name * @type: file type, identical to bits 12..15 of mode * @name: file name */ /* FIXME: add 6 bytes of padding to remove the __packed */ struct logfs_disk_dentry { __be64 ino; __be16 namelen; __u8 type; __u8 name[LOGFS_MAX_NAMELEN]; } __attribute__((packed)); 
SIZE_CHECK(logfs_disk_dentry, 266);

#define RESERVED		0xffffffff
#define BADSEG			0xffffffff
/**
 * struct logfs_segment_entry - segment file entry
 *
 * @ec_level:			erase count and level
 * @valid:			number of valid bytes
 *
 * Segment file contains one entry for every segment.  ec_level contains the
 * erasecount in the upper 28 bits and the level in the lower 4 bits.  An
 * ec_level of BADSEG (-1) identifies bad segments.  valid contains the number
 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
 * superblock or the journal, or when the segment is bad.
 */
struct logfs_segment_entry {
	__be32	ec_level;
	__be32	valid;
};

SIZE_CHECK(logfs_segment_entry, 8);

/**
 * struct logfs_journal_header - header for journal entries (JEs)
 *
 * @h_crc:			crc32 of journal entry
 * @h_len:			length of compressed journal entry,
 *				not including header
 * @h_datalen:			length of uncompressed data
 * @h_type:			JE type
 * @h_compr:			compression type
 * @h_pad:			reserved
 */
struct logfs_journal_header {
	__be32	h_crc;
	__be16	h_len;
	__be16	h_datalen;
	__be16	h_type;
	__u8	h_compr;
	__u8	h_pad[5];
};

SIZE_CHECK(logfs_journal_header, 16);

/*
 * Life expectancy of data.
 * VIM_DEFAULT	- default vim
 * VIM_SEGFILE	- for segment file only - very short-living
 * VIM_GC	- GC'd data - likely long-living
 *		  (NOTE(review): VIM_GC is described but has no enumerator
 *		  below - confirm against the kernel header)
 */
enum logfs_vim {
	VIM_DEFAULT	= 0,
	VIM_SEGFILE	= 1,
};

/**
 * struct logfs_je_area - wbuf header
 *
 * @segno:			segment number of area
 * @used_bytes:			number of bytes already used
 * @gc_level:			GC level
 * @vim:			life expectancy of data
 *
 * "Areas" are segments currently being used for writing.  There is at least
 * one area per GC level.  Several may be used to separate long-living from
 * short-living data.  If an area with unknown vim is encountered, it can
 * simply be closed.
 * The write buffer immediately follows this header.
 */
struct logfs_je_area {
	__be32	segno;
	__be32	used_bytes;
	__u8	gc_level;
	__u8	vim;
} __attribute__((packed));

SIZE_CHECK(logfs_je_area, 10);

#define MAX_JOURNAL_HEADER \
	(sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))

/**
 * struct logfs_je_dynsb - dynamic superblock
 *
 * @ds_gec:			global erase count
 * @ds_sweeper:			current position of GC "sweeper"
 * @ds_rename_dir:		source directory ino (see dir.c documentation)
 * @ds_rename_pos:		position of source dd (see dir.c documentation)
 * @ds_victim_ino:		victims of incomplete dir operation (see dir.c)
 * @ds_victim_parent:		parent inode of victim (see dir.c)
 * @ds_used_bytes:		number of used bytes
 * @ds_generation:		generation counter
 * @pad:			reserved, must be 0
 */
struct logfs_je_dynsb {
	__be64	ds_gec;
	__be64	ds_sweeper;

	__be64	ds_rename_dir;
	__be64	ds_rename_pos;

	__be64	ds_victim_ino;
	__be64	ds_victim_parent; /* XXX */

	__be64	ds_used_bytes;
	__be32	ds_generation;
	__be32	pad;
};

SIZE_CHECK(logfs_je_dynsb, 64);

/**
 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
 *
 * @da_size:			size of inode file
 * @da_last_ino:		last created inode
 * @da_used_bytes:		number of bytes used
 * @da_height:			height of the inode file's block tree
 * @pad:			reserved, must be 0
 * @da_data:			data pointers
 */
struct logfs_je_anchor {
	__be64	da_size;
	__be64	da_last_ino;

	__be64	da_used_bytes;
	u8	da_height;
	u8	pad[7];

	__be64	da_data[LOGFS_EMBEDDED_FIELDS];
};

SIZE_CHECK(logfs_je_anchor, 168);

/**
 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
 *
 * @so_segment:			segments used for 2nd journal
 *
 * Length of the array is given by h_len field in the header.
 */
struct logfs_je_spillout {
	__be64	so_segment[0];
};

SIZE_CHECK(logfs_je_spillout, 0);

/**
 * struct logfs_je_journal_ec - erase counts for all journal segments
 *
 * @ec:				erase count
 *
 * Length of the array is given by h_len field in the header.
 */
struct logfs_je_journal_ec {
	__be32	ec[0];
};

SIZE_CHECK(logfs_je_journal_ec, 0);

/**
 * struct logfs_je_free_segments - list of free segments with erase count
 */
struct logfs_je_free_segments {
	__be32	segno;
	__be32	ec;
};

SIZE_CHECK(logfs_je_free_segments, 8);

/**
 * struct logfs_seg_alias - list of segment aliases
 */
struct logfs_seg_alias {
	__be32	old_segno;
	__be32	new_segno;
};

SIZE_CHECK(logfs_seg_alias, 8);

/**
 * struct logfs_obj_alias - list of object aliases
 */
struct logfs_obj_alias {
	__be64	ino;
	__be64	bix;
	__be64	val;
	u8	level;
	u8	pad[5];
	__be16	child_no;
};

SIZE_CHECK(logfs_obj_alias, 32);

/*
 * Compression types.
 *
 * COMPR_NONE	- uncompressed
 * COMPR_ZLIB	- compressed with zlib
 */
enum {
	COMPR_NONE	= 0,
	COMPR_ZLIB	= 1,
};

/*
 * Journal entries come in groups of 16.  First group contains unique
 * entries, next groups contain one entry per level
 *
 * JE_FIRST	- smallest possible journal entry number
 *
 * JEG_BASE	- base group, containing unique entries
 * JE_COMMIT	- commit entry, validates all previous entries
 * JE_DYNSB	- dynamic superblock, anything that ought to be in the
 *		  superblock but cannot because it is read-write data
 * JE_ANCHOR	- anchor aka master inode aka inode file's inode
 * JE_ERASECOUNT - erasecounts for all journal segments
 * JE_SPILLOUT	- unused
 * JE_OBJ_ALIAS	- object aliases (the original comment said JE_SEG_ALIAS,
 *		  which does not exist in the enum below)
 * JE_AREA	- area description
 *
 * JE_LAST	- largest possible journal entry number
 */
enum {
	JE_FIRST	= 0x01,

	JEG_BASE	= 0x00,
	JE_COMMIT	= 0x02,
	JE_DYNSB	= 0x03,
	JE_ANCHOR	= 0x04,
	JE_ERASECOUNT	= 0x05,
	JE_SPILLOUT	= 0x06,
	JE_OBJ_ALIAS	= 0x0d,
	JE_AREA		= 0x0e,

	JE_LAST		= 0x0e,
};

#endif
logfsprogs/segment.c0000644000175000017500000000772612142655114013236 0ustar lukluk/*
 * segment.c
 *
 * Copyright (c) 2007-2008 Joern Engel
 *
 * License: GPL version 2
 */
/* NOTE(review): the two system header names below were lost in extraction.
 * This file uses printf/memcpy/memset/malloc, so presumably <stdio.h>,
 * <string.h> and <stdlib.h> - verify against the original tree. */
#include
#include
#include "kerncompat.h"
#include "logfs_abi.h"
#include "logfs.h"

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Debug helper: dump @len bytes at @buf in hex, 16 bytes per line. */
static inline void hexdump(const char *prefix, void *buf, size_t len)
{
	unsigned char *c = buf;
	int i;	/* NOTE(review): int index vs. size_t len - sign-compare */

	printf("%s", prefix);
	for (i = 0; i < len; i++) {
		printf("%02x ", c[i]);
		if (i % 8 == 7)
			printf(" ");
		if (i % 16 == 15)
			printf("\n");
	}
	printf("\n");
}

/* Append @len bytes from @buf to the area's write buffer. */
static void copybuf(struct logfs_area *area, void *buf, size_t len)
{
	memcpy(area->buf + area->used_bytes, buf, len);
	area->used_bytes += len;
}

/*
 * Take the next unused segment, erase it and return its number.  A segment
 * whose erase fails is marked bad in the segment file and skipped.  Calls
 * fail() (which exits) when the device runs out of segments.
 */
u32 get_segment(struct super_block *sb)
{
	u64 ofs;
	u32 segno;
	int err;

	do {
		segno = sb->lastseg;
		ofs = (u64)segno * sb->segsize;
		sb->lastseg += 1;
		if (sb->lastseg > sb->no_segs)
			fail("no more free segments");

		err = sb->dev_ops->erase(sb, ofs, sb->segsize);
		if (err) {
			/* bad segment */
			sb->segment_entry[segno].ec_level = cpu_to_be32(BADSEG);
			sb->segment_entry[segno].valid = cpu_to_be32(RESERVED);
			printf("Bad block at 0x%llx\n", ofs);
		}
	} while (err);
	return segno;
}

/* (Re)initialize @area with a fresh segment and fill in its header. */
static void __init_area(struct super_block *sb, struct logfs_area *area,
		u8 level)
{
	struct logfs_segment_header *sh = area->buf;

	memset(area->buf, 0xff, sb->segsize);
	area->segno = get_segment(sb);
	area->used_bytes = sizeof(*sh);
	sh->pad = 0;
	sh->type = SEG_OSTORE;
	sh->level = level;
	sh->segno = cpu_to_be32(area->segno);
	sh->ec = cpu_to_be32(1);
	sh->gec = cpu_to_be64(area->segno);
	sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
}

/* Lazily allocate the area's segment buffer on first use. */
static void init_area(struct super_block *sb, struct logfs_area *area,
		u8 level)
{
	if (area->buf)
		return;
	/* NOTE(review): malloc result unchecked; __init_area would then
	 * write through NULL on OOM */
	area->buf = malloc(sb->segsize);
	__init_area(sb, area, level);
}

/*
 * Write the area's buffer to the medium and record its valid byte count in
 * the segment file.  Unless @final, immediately start a new segment so the
 * area stays usable.
 */
static int finish_area(struct super_block *sb, struct logfs_area *area,
		int final, u8 level)
{
	u64 ofs = (u64)area->segno * sb->segsize;
	int err;

	err = sb->dev_ops->write(sb, ofs, sb->segsize, area->buf);
	if (err)
		return err;
	sb->segment_entry[area->segno].ec_level = ec_level(1, level);
	sb->segment_entry[area->segno].valid =
		cpu_to_be32(area->used_bytes - LOGFS_SEGMENT_HEADERSIZE);
	if (final)
		return 0;

	__init_area(sb, area, level);
	return 0;
}

/* Account @len freshly written bytes to inode @ino and the filesystem. */
static int grow_inode(struct super_block *sb, u64 ino,
size_t len) { struct inode *inode = find_or_create_inode(sb, ino); if (!inode) return -ENOMEM; inode->di.di_used_bytes = cpu_to_be64(len + be64_to_cpu(inode->di.di_used_bytes)); sb->used_bytes += len; return 0; } static int obj_len(struct super_block *sb, int obj_type) { switch (obj_type) { case OBJ_DENTRY: return sizeof(struct logfs_disk_dentry); case OBJ_INODE: return sizeof(struct logfs_disk_inode); case OBJ_BLOCK: return sb->blocksize; default: BUG(); } } s64 logfs_segment_write(struct super_block *sb, void *buf, u8 type, u64 ino, u64 bix, u8 level) { struct logfs_object_header oh; struct logfs_area *area; int err; s64 ofs; u16 len = obj_len(sb, type); if (ino == LOGFS_INO_MASTER) level += LOGFS_MAX_LEVELS; area = sb->area + level; memset(&oh, 0, sizeof(oh)); oh.len = cpu_to_be16(len); oh.type = type; oh.compr = COMPR_NONE; oh.ino = cpu_to_be64(ino); oh.bix = cpu_to_be64(bix); oh.crc = logfs_crc32(&oh, LOGFS_OBJECT_HEADERSIZE - 4, 4); oh.data_crc = logfs_crc32(buf, len, 0); init_area(sb, area, level); if (area->used_bytes + sizeof(oh) + sb->blocksize > sb->segsize) { err = finish_area(sb, area, 0, level); if (err) return err; } ofs = area->segno * sb->segsize + area->used_bytes; copybuf(area, &oh, sizeof(oh)); copybuf(area, buf, len); err = grow_inode(sb, ino, sizeof(oh) + len); if (err) return err; return ofs; } int flush_segments(struct super_block *sb) { struct logfs_area *area; int i, err; for (i = 0; i < LOGFS_NO_AREAS; i++) { area = sb->area + i; if (area->buf) { err = finish_area(sb, area, 1, i); if (err) return err; } } return 0; } logfsprogs/readwrite.c0000644000175000017500000000734312142655114013555 0ustar lukluk/* * readwrite.c * * Copyright (c) 2007-2008 Joern Engel * * License: GPL version 2 */ #include #include #include "btree.h" #include "kerncompat.h" #include "logfs_abi.h" #include "logfs.h" static unsigned long __get_bits(u64 val, int skip, int no) { u64 ret = val; ret >>= skip * no; ret <<= 64 - no; ret >>= 64 - no; return ret; } static 
unsigned long get_bits(struct super_block *sb, u64 bix, u8 level) { return __get_bits(bix, level, sb->blocksize_bits - 3); } static inline int child_no(struct super_block *sb, u64 bix) { return bix & ((sb->blocksize / sizeof(__be64)) - 1); } struct inode *find_or_create_inode(struct super_block *sb, u64 ino) { struct inode *inode; int err; inode = btree_lookup64(&sb->ino_tree, ino); if (!inode) { inode = zalloc(sizeof(*inode) + sb->blocksize); if (!inode) return NULL; err = btree_insert64(&sb->ino_tree, ino, inode); if (err) return NULL; } return inode; } static __be64 *find_or_create_block(struct super_block *sb, struct inode *inode, u64 bix, u8 level) { __be64 *block; struct btree_head64 *tree = &inode->block_tree[level]; int err; block = btree_lookup64(tree, bix); if (!block) { block = zalloc(sb->blocksize); if (!block) return NULL; err = btree_insert64(tree, bix, block); if (err) return NULL; } return block; } static int write_direct(struct super_block *sb, struct inode *inode, u64 ino, u64 bix, u8 type, void *buf) { s64 ofs; ofs = logfs_segment_write(sb, buf, type, ino, bix, 0); if (ofs < 0) return ofs; inode->di.di_data[bix] = cpu_to_be64(ofs); return 0; } static u64 bixmask(struct super_block *sb, u8 level) { if (level == 0) return 0; return (1 << ((sb->blocksize_bits - 3) * level)) - 1; } static int write_loop(struct super_block *sb, struct inode *inode, u64 ino, u64 bix, u8 level, u8 type, void *buf) { u64 parent_bix; __be64 *iblock; s64 ofs; parent_bix = bix | bixmask(sb, level + 1); iblock = find_or_create_block(sb, inode, parent_bix, level + 1); if (!iblock) return -ENOMEM; ofs = logfs_segment_write(sb, buf, type, ino, bix, level); if (ofs < 0) return ofs; iblock[get_bits(sb, bix, level)] = cpu_to_be64(ofs); return 0; } static inline u64 maxbix(u8 height) { return 1ULL << (LOGFS_BLOCK_BITS * height); } static void grow_inode(struct inode *inode, u64 bix, u8 level) { if (level != 0) return; while (bix > maxbix(inode->di.di_height)) inode->di.di_height++; 
}

/*
 * Write one object for file @ino at (@bix, @level).  Level-0 blocks with
 * bix < I0_BLOCKS are referenced directly from the inode; everything else
 * goes through an indirect block, growing the tree height first if needed.
 * Returns 0 or a negative error code.
 */
int logfs_file_write(struct super_block *sb, u64 ino, u64 bix, u8 level,
		u8 type, void *buf)
{
	struct inode *inode;

	inode = find_or_create_inode(sb, ino);
	if (!inode)
		return -ENOMEM;
	if (level == 0 && bix < I0_BLOCKS)
		return write_direct(sb, inode, ino, bix, type, buf);

	grow_inode(inode, bix, level);
	return write_loop(sb, inode, ino, bix, level, type, buf);
}

/*
 * Flush all cached indirect blocks of @ino bottom-up.  Each lower-level
 * block is written via logfs_file_write(), which links it into its parent;
 * the single top-level block is written last and linked into the inode.
 */
int logfs_file_flush(struct super_block *sb, u64 ino)
{
	struct btree_head64 *tree;
	struct inode *inode;
	__be64 *iblock;
	s64 ofs;
	u64 bix;
	u8 level;
	int err;

	inode = find_or_create_inode(sb, ino);
	BUG_ON(!inode);
	if (inode->di.di_height == 0)
		return 0;

	for (level = 1; level < inode->di.di_height; level++) {
		tree = &inode->block_tree[level];
		for (;;) {
			bix = btree_last64(tree);
			iblock = btree_remove64(tree, bix);
			if (!iblock)
				break;
			err = logfs_file_write(sb, ino, bix, level, OBJ_BLOCK,
					iblock);
			if (err)
				return err;
			free(iblock);
		}
	}
	/* The loop exits with level == di_height, so only one block is left */
	BUG_ON(level != inode->di.di_height);
	tree = &inode->block_tree[level];
	bix = btree_last64(tree);
	iblock = btree_remove64(tree, bix);
	BUG_ON(!iblock);
	ofs = logfs_segment_write(sb, iblock, OBJ_BLOCK, ino, bix, level);
	/*
	 * BUGFIX: the top-level block was leaked.  logfs_segment_write()
	 * copies the buffer (copybuf/memcpy), and every lower-level block
	 * above is already freed the same way after writing.
	 */
	free(iblock);
	if (ofs < 0)
		return ofs;
	inode->di.di_data[INDIRECT_INDEX] = cpu_to_be64(ofs);
	return 0;
}