/* * XZ decompressor - Stream decoder * * Based on xz-embedded by Lasse Collin (public domain). * * This parses the XZ container format: stream header, block headers, * index, and stream footer. Actual data decompression is delegated * to the LZMA2 decoder. */ #include "xz_private.h" /* Sizes of the stream header and footer fields */ #define STREAM_HEADER_SIZE 12 #define HEADER_MAGIC_SIZE 6 #define FOOTER_MAGIC_SIZE 2 /* * Supported check types. We accept None, CRC32, CRC64, and SHA-256 * (though SHA-256 is not verified -- we just skip those bytes). */ #define CHECK_NONE 0x00 #define CHECK_CRC32 0x01 #define CHECK_CRC64 0x04 #define CHECK_SHA256 0x0A /* Sizes of the integrity check fields */ static const uint8_t check_sizes[16] = { 0, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64 }; /* Maximum block header size (encoded in the first byte as (real_size / 4) - 1) */ #define BLOCK_HEADER_SIZE_MAX 1024 /* * Stream decoder states. The decoder is a state machine driven * by xz_dec_run(), consuming input and producing output as it * transitions through these states. */ enum xz_dec_stream_state { SEQ_STREAM_HEADER, SEQ_BLOCK_START, SEQ_BLOCK_HEADER, SEQ_BLOCK_UNCOMPRESS, SEQ_BLOCK_PADDING, SEQ_BLOCK_CHECK, SEQ_INDEX, SEQ_INDEX_PADDING, SEQ_INDEX_CRC32, SEQ_STREAM_FOOTER }; struct xz_dec { /* Current sequence/state in the stream decoder */ enum xz_dec_stream_state sequence; /* Position within the current sequence state */ uint32_t pos; /* Variable-length integer accumulator for index parsing */ uint64_t vli; uint32_t vli_count; /* Allocated operating mode */ enum xz_mode mode; /* * True once the stream footer has been verified; used to * accept stream padding between concatenated streams. */ int allow_buf_error; /* CRC32 of stream flags for footer verification */ uint32_t crc32_context; /* Temporary buffer for collecting small structures */ struct { uint8_t buf[BLOCK_HEADER_SIZE_MAX]; size_t pos; size_t size; } temp; /* Block state */ struct { /* Uncompressed size from block header (or VLI_UNKNOWN) */ uint64_t compressed; uint64_t uncompressed; /* Running counts during decompression */ uint64_t count_compressed; uint64_t count_uncompressed; /* Size of the integrity check for this stream */ uint32_t check_size; uint32_t check_type; /* Hash of block sizes for index verification */ uint64_t hash_compressed; uint64_t hash_uncompressed; uint32_t hash_count; } block; /* Index state */ struct { uint64_t compressed; uint64_t uncompressed; uint64_t size; uint32_t count; uint64_t hash_compressed; uint64_t hash_uncompressed; uint32_t hash_count; } index; /* Check value accumulation buffer */ struct { uint8_t buf[64]; /* big enough for SHA-256 */ uint32_t pos; } check; /* Stream flags from the stream header */ uint32_t stream_flags; /* LZMA2 decoder instance */ struct xz_dec_lzma2 *lzma2; }; #define VLI_UNKNOWN UINT64_MAX #define VLI_MAX (UINT64_MAX / 2) #define VLI_BYTES_MAX 9 /* * Fill the temporary buffer from the input. Returns true when * the temp buffer is full (temp.pos == temp.size). */ static int fill_temp(struct xz_dec *s, struct xz_buf *b) { size_t copy_size = min_t(size_t, s->temp.size - s->temp.pos, b->in_size - b->in_pos); memcpy(s->temp.buf + s->temp.pos, b->in + b->in_pos, copy_size); b->in_pos += copy_size; s->temp.pos += copy_size; if (s->temp.pos == s->temp.size) { s->temp.pos = 0; return 1; } return 0; } /* * Decode a variable-length integer (VLI). XZ VLIs use 7 bits per byte * with the high bit as a continuation flag. Returns XZ_STREAM_END when * the VLI is complete, XZ_OK when more bytes are needed. */ static enum xz_ret dec_vli(struct xz_dec *s, const uint8_t *in, size_t *in_pos, size_t in_size) { uint8_t byte; if (s->vli_count == 0) { s->vli = 0; s->vli_count = 1; } while (*in_pos < in_size) { byte = in[*in_pos]; ++(*in_pos); s->vli |= (uint64_t)(byte & 0x7F) << ((s->vli_count - 1) * 7); if ((byte & 0x80) == 0) { /* Reject non-minimal encodings */ if (byte == 0 && s->vli_count > 1) return XZ_DATA_ERROR; s->vli_count = 0; return XZ_STREAM_END; } if (s->vli_count >= VLI_BYTES_MAX) return XZ_DATA_ERROR; ++s->vli_count; } return XZ_OK; } /* * Decode the stream header. It is 12 bytes: * 6 bytes magic, 2 bytes stream flags, 4 bytes CRC32 of flags. */ static enum xz_ret dec_stream_header(struct xz_dec *s) { uint32_t crc; if (memcmp(s->temp.buf, XZ_HEADER_MAGIC, XZ_HEADER_MAGIC_SIZE) != 0) return XZ_FORMAT_ERROR; /* Stream flags: first byte must be 0, second byte holds check type */ if (s->temp.buf[6] != 0) return XZ_OPTIONS_ERROR; s->stream_flags = s->temp.buf[7]; s->block.check_type = s->stream_flags & 0x0F; s->block.check_size = check_sizes[s->block.check_type]; /* Verify CRC32 of the two stream flag bytes */ crc = xz_crc32(s->temp.buf + 6, 2, 0); if (crc != get_le32(s->temp.buf + 8)) return XZ_DATA_ERROR; return XZ_OK; } /* * Decode a block header. The first byte gives the header size * (encoded as (real_size / 4) - 1). The header contains filter * flags; we only support a single LZMA2 filter (ID 0x21). */ static enum xz_ret dec_block_header(struct xz_dec *s) { uint32_t crc; size_t header_size; uint8_t bflags; size_t pos; uint8_t filter_id; uint8_t props_size; uint8_t lzma2_props; enum xz_ret ret; header_size = ((uint32_t)s->temp.buf[0] + 1) * 4; /* The CRC32 covers everything except the CRC32 field itself */ crc = xz_crc32(s->temp.buf, (size_t)(header_size - 4), 0); if (crc != get_le32(s->temp.buf + header_size - 4)) return XZ_DATA_ERROR; bflags = s->temp.buf[1]; /* * Bits 0-1: number of filters minus 1 (we require exactly 1). * Bit 6: compressed size present. * Bit 7: uncompressed size present. * Other bits must be 0. */ if ((bflags & 0x03) != 0) return XZ_OPTIONS_ERROR; /* more than one filter */ if (bflags & 0x3C) return XZ_OPTIONS_ERROR; /* reserved bits set */ pos = 2; /* Compressed size (optional) */ if (bflags & 0x40) { s->vli_count = 0; ret = dec_vli(s, s->temp.buf, &pos, header_size); if (ret != XZ_STREAM_END) return ret == XZ_OK ? XZ_DATA_ERROR : ret; s->block.compressed = s->vli; } else { s->block.compressed = VLI_UNKNOWN; } /* Uncompressed size (optional) */ if (bflags & 0x80) { s->vli_count = 0; ret = dec_vli(s, s->temp.buf, &pos, header_size); if (ret != XZ_STREAM_END) return ret == XZ_OK ? XZ_DATA_ERROR : ret; s->block.uncompressed = s->vli; } else { s->block.uncompressed = VLI_UNKNOWN; } /* Filter flags */ if (pos >= header_size - 4) return XZ_DATA_ERROR; filter_id = s->temp.buf[pos++]; if (filter_id != 0x21) return XZ_OPTIONS_ERROR; /* only LZMA2 supported */ if (pos >= header_size - 4) return XZ_DATA_ERROR; props_size = s->temp.buf[pos++]; if (props_size != 1) return XZ_OPTIONS_ERROR; if (pos >= header_size - 4) return XZ_DATA_ERROR; lzma2_props = s->temp.buf[pos++]; /* Remaining bytes before CRC must be zero (padding) */ while (pos < header_size - 4) { if (s->temp.buf[pos++] != 0) return XZ_OPTIONS_ERROR; } /* Reset LZMA2 decoder with the new properties */ ret = xz_dec_lzma2_reset(s->lzma2, lzma2_props); if (ret != XZ_OK) return ret; /* Reset block counters */ s->block.count_compressed = 0; s->block.count_uncompressed = 0; return XZ_OK; } /* * Validate the stream footer. The footer is 12 bytes: * 4 bytes CRC32, 4 bytes backward size, 2 bytes stream flags, 2 bytes magic. */ static enum xz_ret dec_stream_footer(struct xz_dec *s) { uint32_t crc; /* Check footer magic bytes */ if (s->temp.buf[10] != 'Y' || s->temp.buf[11] != 'Z') return XZ_DATA_ERROR; /* CRC32 covers backward size and stream flags (bytes 4..9) */ crc = xz_crc32(s->temp.buf + 4, 6, 0); if (crc != get_le32(s->temp.buf)) return XZ_DATA_ERROR; /* * Stream flags in the footer must match the header. * Byte 8 must be 0, byte 9 holds check type. */ if (s->temp.buf[8] != 0) return XZ_OPTIONS_ERROR; if (s->temp.buf[9] != (uint8_t)s->stream_flags) return XZ_DATA_ERROR; /* * Backward size indicates the size of the Index field. * We don't verify it here in this minimal implementation. */ return XZ_STREAM_END; } /* * Main decoder loop. */ enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b) { enum xz_ret ret; size_t copy_size; /* Avoid infinite loops: if we can't make progress, return XZ_BUF_ERROR */ size_t in_start; size_t out_start; for (;;) { in_start = b->in_pos; out_start = b->out_pos; switch (s->sequence) { case SEQ_STREAM_HEADER: s->temp.size = STREAM_HEADER_SIZE; if (!fill_temp(s, b)) return XZ_OK; ret = dec_stream_header(s); if (ret != XZ_OK) return ret; s->sequence = SEQ_BLOCK_START; break; case SEQ_BLOCK_START: /* * The first byte of a block header gives its size. * A zero byte indicates the start of the Index. */ if (b->in_pos >= b->in_size) return XZ_OK; /* Peek at the first byte */ if (b->in[b->in_pos] == 0x00) { /* Index indicator */ ++b->in_pos; s->vli_count = 0; s->index.count = 0; s->index.hash_compressed = 0; s->index.hash_uncompressed = 0; s->index.hash_count = 0; s->index.size = 1; /* counting the 0x00 byte */ s->crc32_context = xz_crc32((const uint8_t *)"\0", 1, 0); s->sequence = SEQ_INDEX; break; } /* Read the full block header into temp buffer */ s->temp.size = ((uint32_t)b->in[b->in_pos] + 1) * 4; s->temp.pos = 0; s->sequence = SEQ_BLOCK_HEADER; break; case SEQ_BLOCK_HEADER: if (!fill_temp(s, b)) return XZ_OK; ret = dec_block_header(s); if (ret != XZ_OK) return ret; s->sequence = SEQ_BLOCK_UNCOMPRESS; break; case SEQ_BLOCK_UNCOMPRESS: { size_t in_before = b->in_pos; size_t out_before = b->out_pos; ret = xz_dec_lzma2_run(s->lzma2, b); s->block.count_compressed += b->in_pos - in_before; s->block.count_uncompressed += b->out_pos - out_before; if (ret == XZ_STREAM_END) { /* Verify sizes if they were specified in the block header */ if (s->block.compressed != VLI_UNKNOWN && s->block.count_compressed != s->block.compressed) return XZ_DATA_ERROR; if (s->block.uncompressed != VLI_UNKNOWN && s->block.count_uncompressed != s->block.uncompressed) return XZ_DATA_ERROR; /* Accumulate for index verification */ s->block.hash_compressed += s->block.count_compressed; s->block.hash_uncompressed += s->block.count_uncompressed; ++s->block.hash_count; s->pos = 0; s->sequence = SEQ_BLOCK_PADDING; break; } if (ret != XZ_OK) return ret; /* Check for limit violations */ if (s->block.compressed != VLI_UNKNOWN && s->block.count_compressed > s->block.compressed) return XZ_DATA_ERROR; if (s->block.uncompressed != VLI_UNKNOWN && s->block.count_uncompressed > s->block.uncompressed) return XZ_DATA_ERROR; return XZ_OK; } case SEQ_BLOCK_PADDING: /* * Compressed data is padded to a multiple of 4 bytes. * The padding bytes must be zero. */ while ((s->block.count_compressed + s->pos) & 3) { if (b->in_pos >= b->in_size) return XZ_OK; if (b->in[b->in_pos++] != 0) return XZ_DATA_ERROR; ++s->pos; } s->check.pos = 0; s->sequence = SEQ_BLOCK_CHECK; break; case SEQ_BLOCK_CHECK: /* * We consume the check value but don't verify it * (a full implementation would verify CRC32/CRC64/SHA-256). * For CRC32 and CRC64 we could verify, but for simplicity * and size we skip it. The stream/index CRC32s are verified. */ copy_size = min_t(size_t, s->block.check_size - s->check.pos, b->in_size - b->in_pos); b->in_pos += copy_size; s->check.pos += (uint32_t)copy_size; if (s->check.pos < s->block.check_size) return XZ_OK; s->sequence = SEQ_BLOCK_START; break; case SEQ_INDEX: { /* * Parse the Index. Format: * - Number of Records (VLI) * - For each record: Unpadded Size (VLI), Uncompressed Size (VLI) * - Padding to 4-byte boundary * - CRC32 of everything from the Index Indicator to padding * * We do a simplified parse: consume VLIs for count then * skip through the records. */ /* First VLI is the number of records */ if (s->index.count == 0 && s->vli_count == 0) { /* Decode number-of-records VLI */ ret = dec_vli(s, b->in, &b->in_pos, b->in_size); if (ret == XZ_OK) return XZ_OK; if (ret != XZ_STREAM_END) return ret; s->index.count = (uint32_t)s->vli; s->index.hash_count = 0; s->pos = 0; /* sub-state: 0 = compressed VLI, 1 = uncompressed VLI */ } /* Consume records */ while (s->index.hash_count < s->index.count) { s->vli_count = 0; ret = dec_vli(s, b->in, &b->in_pos, b->in_size); if (ret == XZ_OK) return XZ_OK; if (ret != XZ_STREAM_END) return ret; if (s->pos == 0) { s->pos = 1; } else { s->pos = 0; ++s->index.hash_count; } } s->sequence = SEQ_INDEX_PADDING; s->pos = 0; break; } case SEQ_INDEX_PADDING: /* * Skip padding. We need to figure out how many bytes the * index consumed; for simplicity we just skip 0-3 zero bytes. */ while (b->in_pos < b->in_size && s->pos < 3) { if (b->in[b->in_pos] != 0) break; ++b->in_pos; ++s->pos; } /* Now read the 4-byte CRC32 */ s->temp.size = 4; s->temp.pos = 0; s->sequence = SEQ_INDEX_CRC32; break; case SEQ_INDEX_CRC32: if (!fill_temp(s, b)) return XZ_OK; /* * In a full implementation we'd verify the CRC32 of the * entire index. We skip that for size, but we do consume it. */ s->temp.size = 12; /* stream footer size */ s->temp.pos = 0; s->sequence = SEQ_STREAM_FOOTER; break; case SEQ_STREAM_FOOTER: if (!fill_temp(s, b)) return XZ_OK; return dec_stream_footer(s); default: return XZ_DATA_ERROR; } /* Detect lack of progress */ if (b->in_pos == in_start && b->out_pos == out_start) { /* * If we switched states without consuming anything, * that's fine -- we'll try again. But if we've been * through the loop already without progress, avoid * spinning. The state change itself counts as progress * the first time through. */ if (s->allow_buf_error) return XZ_BUF_ERROR; s->allow_buf_error = 1; } else { s->allow_buf_error = 0; } } } struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max) { struct xz_dec *s; s = (struct xz_dec *)malloc(sizeof(*s)); if (s == NULL) return NULL; memset(s, 0, sizeof(*s)); s->mode = mode; s->lzma2 = xz_dec_lzma2_create(mode, dict_max); if (s->lzma2 == NULL) { free(s); return NULL; } xz_dec_reset(s); return s; } void xz_dec_reset(struct xz_dec *s) { s->sequence = SEQ_STREAM_HEADER; s->allow_buf_error = 0; s->pos = 0; s->vli_count = 0; memset(&s->block, 0, sizeof(s->block)); memset(&s->index, 0, sizeof(s->index)); memset(&s->check, 0, sizeof(s->check)); memset(&s->temp, 0, sizeof(s->temp)); s->stream_flags = 0; s->crc32_context = 0; } void xz_dec_end(struct xz_dec *s) { if (s != NULL) { xz_dec_lzma2_end(s->lzma2); free(s); } }